# syntax=docker/dockerfile:1
# NOTE: Building this image requires Docker version >= 23.0.
#
# For reference:
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel

ARG TAG_VERSION="12.1.1"

# ---------------------------------------------------------------------------
# Stage 1: build Apptainer from source.
# Only /usr/local/apptainer and /usr/local/go are copied out of this stage;
# everything else (compilers, headers, the git checkout) is discarded with it.
# ---------------------------------------------------------------------------
FROM nvidia/cuda:${TAG_VERSION}-cudnn8-devel-ubuntu22.04 AS apptainerbuilder

# Map the upper-case proxy build args onto the lower-case variables that
# apt-get, wget and git read.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}

# Build-time only: an ARG is exported to every RUN of this stage, so there is
# no need to persist it with ENV in a stage that is thrown away.
ARG DEBIAN_FRONTEND="noninteractive"

# Install the packages required to download and build Apptainer.
RUN apt-get update && apt-get install -y \
        bash \
        gcc \
        git \
        libc-dev \
        libseccomp-dev \
        libssl-dev \
        libuuid1 \
        linux-headers-generic \
        make \
        pkg-config \
        uuid-dev \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Install Go. TARGETARCH is populated by BuildKit (amd64, arm64, ...) and
# matches the architecture suffix of the Go release tarballs; fall back to
# amd64 when it is not set.
ARG GO_VERSION="1.21.13"
ARG TARGETARCH
RUN go_tarball="go${GO_VERSION}.linux-${TARGETARCH:-amd64}.tar.gz" \
    && wget "https://golang.org/dl/${go_tarball}" \
    && tar -C /usr/local -xzf "${go_tarball}" \
    && rm "${go_tarball}"

# Put the Go toolchain on PATH.
ENV PATH="/usr/local/go/bin:${PATH}"

# Build Apptainer.
# APPTAINER_COMMITISH defaults to a moving branch; pass a tag or commit SHA
# with --build-arg for a reproducible build.
ARG APPTAINER_COMMITISH="main"
ARG MCONFIG_OPTIONS="--with-suid"
WORKDIR /go/src/github.com/apptainer/apptainer
# MCONFIG_OPTIONS is intentionally unquoted so that several options split into
# separate arguments.
RUN git clone https://github.com/apptainer/apptainer.git . \
    && git checkout "$APPTAINER_COMMITISH" \
    && ./mconfig $MCONFIG_OPTIONS -p /usr/local/apptainer \
    && make -C builddir \
    && make -C builddir install

# No package clean-up step here: this stage never ships, so removing packages
# in an extra layer would only add build time.

# ---------------------------------------------------------------------------
# Stage 2: runtime image.
# ---------------------------------------------------------------------------
FROM nvidia/cuda:${TAG_VERSION}-cudnn8-devel-ubuntu22.04

# Copy Apptainer and Go from the builder stage.
COPY --from=apptainerbuilder /usr/local/apptainer /usr/local/apptainer
COPY --from=apptainerbuilder /usr/local/go /usr/local/go

ENV CUDA_HOME=/usr/local/cuda
ENV GO_PATH="/usr/local/go"
ENV PATH="/usr/local/apptainer/bin:${GO_PATH}/bin:$PATH"
ENV APPTAINER_TMPDIR="/tmp/tmp-apptainer"

ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}

ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}

# NOTE(review): ROOT_PASSWD is persisted in the image environment and is
# visible in the image history; consider a BuildKit secret mount instead.
ARG ROOT_PASSWD="root"
ENV ROOT_PASSWD=${ROOT_PASSWD}

ENV SSH_PORT=2222

WORKDIR /root

SHELL ["/bin/bash", "-c"]

# base tools
RUN <
~/.ssh/config cp /etc/ssh/sshd_config /etc/ssh/sshd_config.bak sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config sed -i 's/^\(\s*\)GSSAPIAuthentication yes/\1GSSAPIAuthentication no/' /etc/ssh/ssh_config sed -i "s/^#Port 22/Port ${SSH_PORT}/" /etc/ssh/sshd_config sudo sed -i "s/# Port 22/Port ${SSH_PORT}/" /etc/ssh/ssh_config ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa -N "" <<< y cat ~/.ssh/id_rsa.pub >> ~/.ssh/auth cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys2 chmod 600 /root/.ssh/authorized_keys chmod 600 /root/.ssh/authorized_keys2 mkdir /var/run/sshd echo "root:${ROOT_PASSWD}" | chpasswd mkdir -p ~/.pip # install pixi curl -fsSL https://pixi.sh/install.sh | bash EOT # install NVIDIA DOCA 2.7 # RUN < -t bw -p -n # 测试 UCX 读取配置 # ucx_read_profile # 检查 UCX 进程 # mpirun -np 2 -mca pml ucx -x UCX_NET_DEVICES=mlx5_0:1 ./your_mpi_program # CUDA support check ucx_info -c ucx_info -d # ompi_info | grep ucx EOT # # mpich install with ucx # # UCX is already embedded in the MPICH tarball, so you do not need to separately download UCX. # ENV MPICH_HOME=/opt/mpich # RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... MAX_JOBS=1 /usr/bin/python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ /usr/bin/python3 -c "import apex.amp; print('Apex is installed and the amp module is available.')" cd .. rm -rf apex EOT # install colossalai ARG COLOSSALAI_VERSION="0.4.2" ENV COLOSSALAI_VERSION=${COLOSSALAI_VERSION} RUN <> ~/.bashrc # . 
/opt/conda/etc/profile.d/conda.sh # conda init bash # conda config --set show_channel_urls true # # 配置 .condarc 文件 # cat < ~/.condarc # channels: # - conda-forge # - bioconda # - pytorch # - pytorch-nightly # - nvidia # - defaults # show_channel_urls: true # EOF # source /opt/conda/etc/profile.d/conda.sh # conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} -y # conda activate ${CONDA_ENV_NAME} # EOT ENV PATH=/usr/local/cuda/bin:$PATH # 添加deepspeed user # RUN <> /etc/sudoers # EOT # # # Change to non-root privilege # USER deepspeed RUN <