# syntax=docker/dockerfile:1
# NOTE: Building this image requires Docker version >= 23.0.
#
# For reference:
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
#
# NOTE(review): this copy of the file was damaged in extraction. Every
# `RUN <<EOT ... EOT` heredoc in the final stage has lost its opener and the
# beginning of its script (text between a `<` and the next `>` appears to have
# been stripped). The surviving fragments are kept verbatim on the `RUN <...`
# lines and each spot is marked "TRUNCATED" below. Line breaks inside the
# surviving script text were reconstructed from command boundaries. The file
# will NOT build until the missing text is restored from the original.

ARG TAG_VERSION="12.1.1"

###############################################################################
# Stage 1: build Apptainer from source (discarded after the build)
###############################################################################
FROM nvidia/cuda:${TAG_VERSION} AS apptainerbuilder

ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}

ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}

# Install the packages required to build Apptainer.
RUN apt-get update && apt-get install -y \
        bash \
        gcc \
        git \
        libc-dev \
        libseccomp-dev \
        libssl-dev \
        libuuid1 \
        linux-headers-generic \
        make \
        pkg-config \
        uuid-dev \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Install Go.
ARG GO_VERSION="1.21.13"
RUN wget https://golang.org/dl/go${GO_VERSION}.linux-amd64.tar.gz && \
    tar -C /usr/local -xzf go${GO_VERSION}.linux-amd64.tar.gz && \
    rm go${GO_VERSION}.linux-amd64.tar.gz

# Put the Go toolchain on PATH.
ENV PATH="/usr/local/go/bin:${PATH}"

# Build Apptainer and install it under /usr/local/apptainer.
ARG APPTAINER_COMMITISH="main"
ARG MCONFIG_OPTIONS="--with-suid"
WORKDIR /go/src/github.com/apptainer
RUN git clone https://github.com/apptainer/apptainer.git \
    && cd apptainer \
    && git checkout "$APPTAINER_COMMITISH" \
    && ./mconfig $MCONFIG_OPTIONS -p /usr/local/apptainer \
    && cd builddir \
    && make \
    && make install

# No package cleanup is done in this stage: it is an intermediate build stage
# and only /usr/local/apptainer and /usr/local/go are copied out of it, so an
# `apt-get remove` layer here would cost build time without shrinking the
# final image.

###############################################################################
# Stage 2: runtime image
###############################################################################
FROM nvidia/cuda:${TAG_VERSION}

# Copy Apptainer and Go from the builder stage.
COPY --from=apptainerbuilder /usr/local/apptainer /usr/local/apptainer
COPY --from=apptainerbuilder /usr/local/go /usr/local/go
ENV GO_PATH="/usr/local/go"
ENV PATH="/usr/local/apptainer/bin:${GO_PATH}/bin:$PATH"
ENV APPTAINER_TMPDIR="/tmp/tmp-apptainer"

ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}

ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}

# NOTE(review): ROOT_PASSWD (default "root") is persisted in the image
# environment and is visible in the build history. Override it at build time
# and change the password before exposing a container built from this image.
ARG ROOT_PASSWD="root"
ENV ROOT_PASSWD=${ROOT_PASSWD}

ARG SSH_PORT=2222
ENV SSH_PORT=${SSH_PORT}

WORKDIR /root

SHELL ["/bin/bash", "-c"]

# SSH server/client configuration, root key pair, root password, pixi.
# TRUNCATED: the heredoc opener and the start of this script (everything up to
# a redirect into ~/.ssh/config) are missing from this copy.
RUN < ~/.ssh/config
cp /etc/ssh/sshd_config /etc/ssh/sshd_config.bak
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/^\(\s*\)GSSAPIAuthentication yes/\1GSSAPIAuthentication no/' /etc/ssh/ssh_config
sed -i "s/^#Port 22/Port ${SSH_PORT}/" /etc/ssh/sshd_config
# Build steps already run as root, so sudo is unnecessary here.
sed -i "s/# Port 22/Port ${SSH_PORT}/" /etc/ssh/ssh_config
ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa -N "" <<< y
# NOTE(review): the target name "auth" looks unintended (authorized_keys is
# appended to on the next line) - confirm against the original file.
cat ~/.ssh/id_rsa.pub >> ~/.ssh/auth
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys2
chmod 600 /root/.ssh/authorized_keys
chmod 600 /root/.ssh/authorized_keys2
# -p: do not fail if the directory already exists.
mkdir -p /var/run/sshd
echo "root:${ROOT_PASSWD}" | chpasswd
mkdir -p ~/.pip
# install pixi
curl -fsSL https://pixi.sh/install.sh | bash
EOT

# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
# PyTorch
ARG PYTHON_VERSION=3.10
ENV PYTHON_VERSION=${PYTHON_VERSION}
ENV PATH=/usr/bin:/opt/conda/bin:$PATH
ENV REF='main'
ENV STAGE_DIR=/tmp
ARG CUDA='cu121'
ENV CUDA=${CUDA}
ARG PYTORCH_VERSION=2.3.1
ENV PYTORCH_VERSION=${PYTORCH_VERSION}
ARG TORCHVISION_VERSION=0.18.1
ENV TORCHVISION_VERSION=${TORCHVISION_VERSION}
ARG TORCHAUDIO_VERSION=2.3.1
ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION}
ARG PYTORCH_CUDA_VERSION=12.1
ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION}
ARG SETUPTOOLS_VERSION=69.5.1
ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
ARG USE_CUDA=1
ENV USE_CUDA=${USE_CUDA}
ARG USE_ROCM=0
ENV USE_ROCM=${USE_ROCM}
ARG USE_XPU=0
ENV USE_XPU=${USE_XPU}
ARG _GLIBCXX_USE_CXX11_ABI=1
ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}

# Build and install NVIDIA Apex with its C++ and CUDA extensions.
# TRUNCATED: the heredoc opener and the start of this script are missing from
# this copy; the text after `RUN <` is the tail of a comment about pip >= 23.1.
RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... 
MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
cd ..
rm -rf ${STAGE_DIR}/apex
EOT

# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
ENV MLNX_OFED_VERSION=23.10-3.2.2.0
# TRUNCATED: the heredoc opener and the start of this script (ending in a
# `2>&1`-style redirect) are missing from this copy.
RUN <&1
# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail
# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
# install deepspeed prepare
# install Mellanox OFED
mkdir -p ${STAGE_DIR}
# Extract inside STAGE_DIR so that the cleanup at the end of this step, which
# removes ${STAGE_DIR}/MLNX_OFED_LINUX-*, actually deletes the unpacked tree.
cd ${STAGE_DIR}
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
./mlnxofedinstall --user-space-only --without-fw-update --all -q
cd ${STAGE_DIR}
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
EOT

ARG NV_PEER_MEM_VERSION="1.2"
ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION}
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
# TRUNCATED: the heredoc opener and essentially the whole script are missing
# from this copy; only trailing usage notes survive.
RUN < -t bw -p -n
# Test that UCX reads its configuration
# ucx_read_profile
# Check UCX processes
# mpirun -np 2 -mca pml ucx -x UCX_NET_DEVICES=mlx5_0:1 ./your_mpi_program
# CUDA support check
# ucx_info -c
# ucx_info -d
# ompi_info | grep ucx
EOT

# install mpi
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
ENV PATH=/usr/local/mpi/bin:${PATH}
ENV MPI_HOME=/usr/local/openmpi
ENV PATH=${MPI_HOME}/bin:/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:${MPI_HOME}/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=/usr/local/cuda/lib64:${LIBRARY_PATH}
ENV CPATH=/usr/local/cuda/include:${MPI_HOME}/include:${CUDA_HOME}/include:$CPATH
# TRUNCATED: the heredoc opener and the start of this script (everything up to
# a redirect into /usr/local/mpi/bin/mpirun) are missing from this copy.
RUN < /usr/local/mpi/bin/mpirun
# Append the wrapper command line that forwards all arguments to mpirun.real.
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun
chmod a+x /usr/local/mpi/bin/mpirun
EOT

# Captured help output of DeepSpeed's install.sh (pasted from a build log):
# 29.78 Usage: install.sh [options...]
# 29.78
# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in
# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally
# 29.78
# 29.78 [optional]
# 29.78 -l, --local_only Install only on local machine
# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo)
# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo)
# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels
# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror)
# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile)
# 29.78 -e, --examples Checkout deepspeed example submodule (no install)
# 29.78 -v, --verbose Verbose logging
# 29.78 -h, --help This help text
# TRUNCATED: the heredoc opener and the whole script except a trailing
# append-redirect into /etc/sudoers are missing from this copy.
RUN <> /etc/sudoers
EOT

# install cutlass https://github.com/NVIDIA/cutlass
# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" )
# A100: architecture is Ampere
# V100: architecture is Volta
# T4: architecture is Turing
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
# 70: NVIDIA Volta architecture (e.g. Tesla V100).
# 75: NVIDIA Turing architecture (e.g. Tesla T4).
# 80: NVIDIA Ampere architecture (e.g. A100).
# 90a: NVIDIA Hopper architecture (e.g. H100).
# 89: GeForce RTX 4090
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
# TRUNCATED: the heredoc opener and the start of this script (everything up to
# a redirect into install_modified.sh) are missing from this copy. Variables
# read below (HOSTFILE_CONTENT, DEEPSPEED_INSTALL_FLAGS) are not declared in
# the surviving text - presumably they were declared in the lost part; confirm.
RUN < install_modified.sh
chmod +x ./install_modified.sh
# Check HOSTFILE_CONTENT and, when set, write it to a hostfile for the installer.
if [ -n "${HOSTFILE_CONTENT}" ]; then
    echo "${HOSTFILE_CONTENT}" > /tmp/hostfile
    INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"
else
    INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"
fi
eval $INSTALL_CMD
# compile deepspeed ops
ds_report
# clean up
# rm -f deepspeed/git_version_info_installed.py
# rm -rf dist build deepspeed.egg-info
# python setup.py bdist_wheel
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt
# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0
EOT

# install transformers and flash-attn
# TRUNCATED: the heredoc opener and the start of this script are missing from
# this copy. The KEY=VALUE lines up to `EOF` look like the body of a nested
# heredoc written to ~/.deepspeed_env - presumably `cat <<EOF > ~/.deepspeed_env`;
# confirm against the original file.
RUN < ~/.deepspeed_env
NCCL_IB_DISABLE=${NCCL_IB_DISABLE}
NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME}
NCCL_DEBUG=INFO
CUTLASS_PATH=${CUTLASS_PATH}
CUDA_HOME=${CUDA_HOME}
EOF
#CUDA_VISIBLE_DEVICES=0,1,2,3
#OMP_NUM_THREADS=8
#MASTER_ADDR=192.168.1.1
#MASTER_PORT=12345
EOT

# Document the port sshd is configured to listen on (see the sed on sshd_config).
EXPOSE ${SSH_PORT}

# Run the SSH daemon in the foreground as the container's main process.
CMD ["/usr/sbin/sshd", "-D"]