# syntax=docker/dockerfile:1
# NOTE: Building this image requires docker version >= 23.0.
#
# For reference:
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
ARG CUDA_VERSION=12.1.0
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04

# Optional build-time proxies.
# NOTE(review): these are baked into the runtime environment via ENV and are
# visible in `docker history` — confirm that is acceptable for this image.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}

ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}

# FIX: Docker does not expand "~" in ENV values; use the absolute path.
ENV MAMBA_ROOT_PREFIX=/root/micromamba

# WARNING: the root password is baked into the image (visible in
# `docker history` and /etc/shadow). Acceptable only for private dev images;
# use a BuildKit secret mount for anything shared.
ARG ROOT_PASSWD="root"
ENV ROOT_PASSWD=${ROOT_PASSWD}

WORKDIR /root

# -o pipefail: several RUN steps pipe wget into tar/apt-key; without pipefail
# a failed download would be silently ignored and the build would "succeed".
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# NOTE(review): for key-based root SSH login this key normally has to end up
# in /root/.ssh/authorized_keys — confirm a later step or the entrypoint
# appends it there.
COPY id_rsa.pub /root/.ssh/id_rsa.pub

# base tools, sshd configuration, and Miniconda
RUN apt-get update && \
    apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags && \
    apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2 && \
    apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev \
        nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop \
        ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c && \
    add-apt-repository ppa:git-core/ppa -y && \
    apt-get install -y git libnuma-dev wget && \
    # Kitware repository for an up-to-date cmake (we run as root: no sudo needed)
    wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc | apt-key add - && \
    apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
    apt-get update && \
    apt-get install -y cmake && \
    # allow root login over SSH with either password or public key
    sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
    sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config && \
    sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config && \
    sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config && \
    mkdir -p /var/run/sshd && \
    echo "root:${ROOT_PASSWD}" | chpasswd && \
    mkdir -p ~/.pip && \
    # FIX: the original mixed "-qO-" (stdout) with "-O file"; use one output target.
    wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && \
    bash /tmp/miniconda.sh -b -p /opt/conda && \
    rm /tmp/miniconda.sh && \
    # FIX: /opt/conda/bin is not on PATH yet inside this RUN; call conda by path.
    /opt/conda/bin/conda init bash && \
    ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
    echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
    rm -rf /var/lib/apt/lists/*

# Default conda channels.
# FIX: the original used "cat < ~/.condarc", which READS a (nonexistent) file
# and then executes the YAML as shell. A COPY heredoc (dockerfile:1 syntax,
# declared at the top of this file) writes the file instead.
COPY <<EOF /root/.condarc
channels:
  - conda-forge
  - bioconda
  - pytorch
  - pytorch-nightly
  - nvidia
  - defaults
show_channel_urls: true
EOF

# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
# PyTorch
ARG CONDA_ENV_NAME="deepspeed"
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
ARG PYTHON_VERSION=3.10
ENV PYTHON_VERSION=${PYTHON_VERSION}
ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH
ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3"
ENV REF='main'
ENV STAGE_DIR=/tmp
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
ARG CUDA='cu121'
ENV CUDA=${CUDA}
ARG PYTORCH_VERSION=2.3.1
ENV PYTORCH_VERSION=${PYTORCH_VERSION}
ARG TORCHVISION_VERSION=0.18.1
ENV TORCHVISION_VERSION=${TORCHVISION_VERSION}
ARG TORCHAUDIO_VERSION=2.3.1
ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION}
ARG PYTORCH_CUDA_VERSION=12.1
ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION}
ARG SETUPTOOLS_VERSION=69.5.1
ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
ARG USE_CUDA=1
ENV USE_CUDA=${USE_CUDA}
ARG USE_ROCM=0
ENV USE_ROCM=${USE_ROCM}
ARG USE_XPU=0
ENV USE_XPU=${USE_XPU}
ARG _GLIBCXX_USE_CXX11_ABI=1
ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}

# conda env, ninja (from source), transformers[deepspeed-testing], pinned torch
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} -c conda-forge -y && \
    echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc && \
    which python > ~/python_path.txt && \
    conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja && \
    cd ${STAGE_DIR}/ninja && \
    git clone https://github.com/google/googletest.git && \
    python ./configure.py --bootstrap && \
    conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest" && \
    ./ninja all && \
    ./ninja_test && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python && \
    conda clean -afy && \
    git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. && \
    python -m pip install setuptools==${SETUPTOOLS_VERSION} && \
    python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] && \
    # replace whatever torch the extras pulled in with the pinned CUDA builds
    python3 -m pip uninstall -y torch torchvision torchaudio && \
    python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA} && \
    python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate && \
    # drop packages that conflict with the DeepSpeed/apex builds below
    python3 -m pip uninstall -y transformer-engine && \
    python3 -m pip uninstall -y torch-tensorrt && \
    python3 -m pip uninstall -y apex

# install apex
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/NVIDIA/apex ${STAGE_DIR}/apex && \
    # FIX: the clone lands in ${STAGE_DIR}/apex; "cd apex" relative to /root fails
    cd ${STAGE_DIR}/apex && \
    MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
        --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \
    python -c "import apex.amp; print('Apex is installed and the amp module is available.')" && \
    cd .. && \
    rm -rf ${STAGE_DIR}/apex

# Mellanox OFED user-space libraries (no kernel modules, no firmware update)
ENV MLNX_OFED_VERSION=5.8-4.1.5.0
RUN mkdir -p ${STAGE_DIR} && \
    # FIX: extract under STAGE_DIR; the original extracted into /root but
    # cleaned up under ${STAGE_DIR}, leaving the tree in the image.
    cd ${STAGE_DIR} && \
    wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - && \
    cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \
    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*

# nvidia-peer-memory (GPUDirect RDMA) .deb built from source
ENV NV_PEER_MEM_VERSION=1.2
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
RUN rm -rf ${STAGE_DIR} && \
    mkdir -p ${STAGE_DIR} && \
    git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
    cd ${STAGE_DIR}/nv_peer_memory && \
    ./build_module.sh && \
    cd ${STAGE_DIR} && \
    tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
    cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
    apt-get update && \
    apt-get --fix-broken install -y && \
    apt-get install -y dkms && \
    # dpkg-buildpackage writes the .deb to the parent dir, i.e. ${STAGE_DIR}
    dpkg-buildpackage -us -uc && \
    dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb

# install mpi
ENV PATH=/usr/local/mpi/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
RUN rm -rf ${STAGE_DIR} && \
    mkdir -p ${STAGE_DIR} && \
    cd ${STAGE_DIR} && \
    wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
    cd openmpi-${OPENMPI_VERSION} && \
    ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
    make -j"$(nproc)" install && \
    ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
    test -f /usr/local/mpi/bin/mpic++ && \
    cd ${STAGE_DIR} && \
    rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} && \
    # wrap mpirun so it always runs with --allow-run-as-root inside the container
    mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
    echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
    echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
    chmod a+x /usr/local/mpi/bin/mpirun

# SSH daemon port inside container cannot conflict with host OS port
# ENV SSH_PORT=2222
# RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
#     sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config

# unprivileged user with passwordless sudo
RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed && \
    usermod -aG sudo deepspeed && \
    echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

# install cutlass https://github.com/NVIDIA/cutlass
# Supported CUDA architectures (CUTLASS_NVCC_ARCHS values):
#   70  - Volta  (e.g. Tesla V100)
#   75  - Turing (e.g. Tesla T4)
#   80  - Ampere (e.g. A100)
#   89  - Ada    (e.g. GeForce RTX 4090)
#   90a - Hopper (e.g. H100; cutlass requires -DCUTLASS_NVCC_ARCHS="90a")
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
RUN git clone https://github.com/NVIDIA/cutlass /opt/cutlass && \
    cd /opt/cutlass && \
    git checkout master && \
    mkdir build && \
    cd build && \
    cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON && \
    make -j"$(nproc)" install && \
    cd ..

# Some Packages from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
# RUN source /opt/conda/etc/profile.d/conda.sh && \
#     conda activate ${CONDA_ENV_NAME} && \
#     apt-get update && \
#     apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev && \
#     python -m pip install pipdeptree psutil yappi cffi ipdb pandas matplotlib py3nvml pyarrow graphviz astor boto3 tqdm sentencepiece msgpack requests pandas sphinx sphinx_rtd_theme scipy numpy scikit-learn nvidia-ml-py3 mpi4py

# install deepspeed step 1: oneCCL
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    /opt/conda/envs/${CONDA_ENV_NAME}/bin/python -m pip install setuptools==${SETUPTOOLS_VERSION} && \
    git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL && \
    cd ${STAGE_DIR}/oneCCL && \
    git checkout master && \
    mkdir build && \
    cd build && \
    cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local && \
    make -j"$(nproc)" install

# install deepspeed step 2: pre-built kernels
ARG CUDA_ARCH_LIST="80;86;89;90"
ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels && \
    cd ${STAGE_DIR}/DeepSpeed-Kernels && \
    CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python -m pip install -v .

ARG DEEPSPEED_VERSION="v0.14.3"
ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
ARG DS_BUILD_SPARSE_ATTN=0
ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
ARG DS_BUILD_FUSED_ADAM=1
ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
ARG DS_BUILD_CPU_ADAM=1
ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
ARG DS_BUILD_OPS=1
ENV DS_BUILD_OPS=${DS_BUILD_OPS}
ARG HOSTFILE_CONTENT=""
ENV HOSTFILE_CONTENT=${HOSTFILE_CONTENT}
ENV CUTLASS_PATH='/opt/cutlass'
ENV CUDA_HOME='/usr/local/cuda'
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${CUDA_HOME}/bin:${PATH}

# install deepspeed step 3
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed && \
    cd ${STAGE_DIR}/DeepSpeed && \
    git checkout ${DEEPSPEED_VERSION} && \
    # install.sh calls bare "pip install"; force it through the env's python
    sed 's/pip install/python -m pip install/' install.sh > install_modified.sh && \
    chmod +x ./install_modified.sh && \
    if [ -n "${HOSTFILE_CONTENT}" ]; then \
        echo "${HOSTFILE_CONTENT}" > /tmp/hostfile && \
        ./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile; \
    else \
        ./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}; \
    fi

# Pre-compile DeepSpeed's JIT ops so containers start without a build step.
# FIX: the original "cat < ~/compile_deepspeed_ops.py" read a nonexistent file
# and then ran the Python source as shell; write the script with a COPY
# heredoc instead. Quoted delimiter prevents any shell expansion of the body.
COPY <<'PYEOF' /root/compile_deepspeed_ops.py
import deepspeed


def compile_ops():
    builders = [
        deepspeed.ops.op_builder.AsyncIOBuilder,
        deepspeed.ops.op_builder.FusedAdamBuilder,
        deepspeed.ops.op_builder.CPUAdamBuilder,
        deepspeed.ops.op_builder.CPUAdagradBuilder,
        deepspeed.ops.op_builder.CPULionBuilder,
        deepspeed.ops.op_builder.EvoformerAttnBuilder,
        deepspeed.ops.op_builder.FPQuantizerBuilder,
        deepspeed.ops.op_builder.FusedLambBuilder,
        deepspeed.ops.op_builder.FusedLionBuilder,
        deepspeed.ops.op_builder.QuantizerBuilder,
        deepspeed.ops.op_builder.RaggedOpsBuilder,
        deepspeed.ops.op_builder.RandomLTDBuilder,
        deepspeed.ops.op_builder.SparseAttnBuilder,
        deepspeed.ops.op_builder.SpatialInferenceBuilder,
        deepspeed.ops.op_builder.TransformerBuilder,
        deepspeed.ops.op_builder.StochasticTransformerBuilder,
    ]
    for builder in builders:
        print(f"Compiling {builder.__name__}")
        builder().load()


if __name__ == "__main__":
    compile_ops()
PYEOF

RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    # FIX: use the script's absolute path — the original ran it from the
    # DeepSpeed checkout cwd where the file did not exist.
    python /root/compile_deepspeed_ops.py && \
    ds_report

# install transformers and flash-attn
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers && \
    cd ${STAGE_DIR}/transformers && \
    python3 ./setup.py develop && \
    python3 -m pip install -U --no-cache-dir "pydantic<2" && \
    pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org

# other packages
# FIX: PyTorch expects dotted compute capabilities here ("8.0", not "80").
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    pip3 install optimum && \
    pip3 install peft tiktoken tqdm matplotlib seaborn numpy pandas scikit-learn diffusers huggingface_hub spacy blobfile pycocotools open_clip_torch zstandard -i https://pypi.org/simple/ --trusted-host pypi.org

ARG DEEPSPEED_TRAIN='/data/train_data'
# FIX: expand the ARG — the original assigned the literal string "DEEPSPEED_TRAIN".
ENV DEEPSPEED_TRAIN=${DEEPSPEED_TRAIN}
ARG DEEPSPEED_VALIDATION='/data/validation_data'
ENV DEEPSPEED_VALIDATION=${DEEPSPEED_VALIDATION}
ARG NCCL_SOCKET_IFNAME='eth0'
# NCCL reads this at runtime; a bare ARG would be dropped from the final image.
ENV NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME}

CMD ["/usr/sbin/sshd", "-D"]
# CMD ["/bin/bash", "-c", "/usr/sbin/sshd -D & while true; do sleep 1000; done"]