# syntax=docker/dockerfile:1
# NOTE: Building this image requires Docker version >= 23.0.
#
# For reference:
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
ARG CUDA_VERSION=12.1.0
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04

# Build-time only: keep apt non-interactive during the build without baking
# the value into the runtime environment.
ARG DEBIAN_FRONTEND=noninteractive

# ENV does not expand "~"; use the explicit root home path.
ENV MAMBA_ROOT_PREFIX=/root/micromamba

# SECURITY: build args are visible in `docker history`; do not pass a real
# password here. Kept as ARG only (consumed once at build time by chpasswd)
# and deliberately NOT re-exported via ENV into the runtime environment.
ARG ROOT_PASSWD="root"

WORKDIR /root
# -o pipefail so a failure on the left side of a pipe fails the RUN step.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
# Base tools, SSH daemon configuration, and conda/micromamba bootstrap.
RUN <<EOT
set -e
apt-get update
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
add-apt-repository ppa:git-core/ppa -y
# add-apt-repository on Ubuntu 20.04 does not refresh the index itself.
apt-get update
apt-get install -y git libnuma-dev wget
# Allow root SSH login with password auth (cluster-internal image).
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
mkdir -p /var/run/sshd
# Double quotes so the ROOT_PASSWD build arg expands (the original
# single-quoted form set the literal string '${ROOT_PASSWD}' as password).
echo "root:${ROOT_PASSWD}" | chpasswd
mkdir -p ~/.pip
# -O <file> writes the download to that file (the original mixed -qO- stdout
# mode with -O file, which conflict).
wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
bash /tmp/miniconda.sh -b -p /opt/conda
rm /tmp/miniconda.sh
# conda is not on PATH yet in this layer; call it by absolute path.
/opt/conda/bin/conda init bash
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
cat <<EOF > ~/.condarc
channels:
  - conda-forge
  - bioconda
  - pytorch
  - pytorch-nightly
  - nvidia
  - defaults
show_channel_urls: true
EOF
# NOTE(review): unpinned `curl | bash` installer — pin a release/checksum if
# reproducibility matters.
echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh)
# Assumes the installer above put micromamba on PATH — TODO confirm.
micromamba shell init -s bash -p ~/micromamba
cat <<'EOF' >> ~/.bashrc
source ~/micromamba/etc/profile.d/micromamba.sh
alias mamba=micromamba
alias mba=mamba
EOF
cat <<EOF > ~/.mambarc
channels:
  - conda-forge
  - bioconda
  - pytorch
  - pytorch-nightly
  - nvidia
  - defaults
show_channel_urls: true
EOF
rm -rf /var/lib/apt/lists/*
EOT
# PyTorch build configuration.
ARG CONDA_ENV_NAME="deepspeed"
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
ARG PYTHON_VERSION=3.10
ENV PYTHON_VERSION=${PYTHON_VERSION}
ENV PATH=/opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH
ARG PYTORCH_VERSION=2.3.0
ENV PYTORCH_VERSION=${PYTORCH_VERSION}
# CUDA compute capabilities the source build targets.
ENV TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
# ENV performs no command substitution, so the original
#   ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
# would have stored the literal string. /opt/conda is the known conda prefix
# (installed above with `miniconda.sh -b -p /opt/conda`).
ENV CMAKE_PREFIX_PATH=/opt/conda
# Two/three digit CUDA tag used for magma-cuda and pip wheel indexes (12.1 -> 121).
ARG CUDA_NUM='121'
ENV CUDA_NUM=${CUDA_NUM}
# Create the conda env and build PyTorch from source, then replace it with
# the official CUDA wheel.
# NOTE(review): building PyTorch from source only to uninstall it and install
# the wheel below wastes hours of build time — confirm the source build is
# actually needed.
RUN <<EOT
set -e
source /opt/conda/etc/profile.d/conda.sh
# STAGE_DIR was never declared in the original file; default it locally.
STAGE_DIR=${STAGE_DIR:-/tmp/build}
mkdir -p ${STAGE_DIR}
# PyTorch wheel CUDA local-version tag, e.g. cu121.
CUDA_TAG=cu${CUDA_NUM}
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} cmake ninja -c conda-forge -y
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
conda activate ${CONDA_ENV_NAME}
python3 -m pip install --no-cache-dir --upgrade pip
conda clean -afy
# Fetch the pinned PyTorch source tree.
git clone --recursive https://github.com/pytorch/pytorch ${STAGE_DIR}/pytorch
cd ${STAGE_DIR}/pytorch
git checkout v${PYTORCH_VERSION}
git submodule sync
git submodule update --init --recursive
# Build dependencies.
conda install -y intel::mkl-static intel::mkl-include
conda install -y -c pytorch magma-cuda${CUDA_NUM}
# Build and install PyTorch.
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
python setup.py install
# Other required dependencies.
python -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python
python -m pip install setuptools==69.5.1
# The original installed "./transformers", a path that does not exist in this
# layer (the repo is cloned in a later layer); install the extras from PyPI.
python3 -m pip install --no-cache-dir "transformers[deepspeed-testing]"
python3 -m pip uninstall -y torch torchvision torchaudio
# ${CUDA} was never defined in the original; derive the tag from CUDA_NUM.
python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA_TAG} torchvision==0.18.0+${CUDA_TAG} torchaudio==2.3.0 --extra-index-url https://download.pytorch.org/whl/${CUDA_TAG}
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
# Remove packages that conflict with the DeepSpeed test environment.
python3 -m pip uninstall -y transformer-engine
python3 -m pip uninstall -y torch-tensorrt
python3 -m pip uninstall -y apex
EOT
# Build NVIDIA Apex with its C++ and CUDA extensions.
RUN <<EOT
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
STAGE_DIR=${STAGE_DIR:-/tmp/build}
mkdir -p ${STAGE_DIR}
git clone https://github.com/NVIDIA/apex ${STAGE_DIR}/apex
# The original `cd apex` pointed at the wrong directory (the clone landed in
# ${STAGE_DIR}/apex, not ./apex).
cd ${STAGE_DIR}/apex
# pip >= 23.1 (https://pip.pypa.io/en/stable/news/#v23-1) supports repeating
# --config-settings with the same key.
MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
    --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# Smoke-test the install before cleaning up.
python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
cd /
rm -rf ${STAGE_DIR}/apex
EOT
# Mellanox OFED userspace + nv_peer_mem (GPUDirect RDMA), prerequisites for
# multi-node DeepSpeed (reference:
# https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile).
# NOTE(review): building kernel packages inside a docker build only works when
# the build host's kernel matches the target hosts — confirm this is intended.
RUN <<EOT
set -e
# Pre-build the **latest** DeepSpeed later so it is ready for testing
# (otherwise the first deepspeed test will time out). It still has to be
# re-built inside the GPU VMs running the tests; TODO: find out why tests
# fail without the rebuild.
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
python3 -m pip uninstall -y deepspeed
# None of these versions were declared anywhere in the original file; the
# defaults below follow DeepSpeed's reference Dockerfile — TODO confirm.
STAGE_DIR=${STAGE_DIR:-/tmp/build}
MLNX_OFED_VERSION=${MLNX_OFED_VERSION:-5.2-2.2.0.0}
NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION:-1.1}
NV_PEER_MEM_TAG=${NV_PEER_MEM_TAG:-1.1-0}
# Install Mellanox OFED (user-space only). Extract inside STAGE_DIR — the
# original extracted into the current directory but never cd'ed there first.
mkdir -p ${STAGE_DIR}
cd ${STAGE_DIR}
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf -
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64
./mlnxofedinstall --user-space-only --without-fw-update --all -q
cd ${STAGE_DIR}
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
# Install nv_peer_mem. Leave STAGE_DIR before wiping it.
cd /
rm -rf ${STAGE_DIR}
mkdir -p ${STAGE_DIR}
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory
cd ${STAGE_DIR}/nv_peer_memory
./build_module.sh
cd ${STAGE_DIR}
# Presumably build_module.sh drops the orig tarball into STAGE_DIR — verify.
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
apt-get update
apt-get install -y dkms
dpkg-buildpackage -us -uc
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
rm -rf /var/lib/apt/lists/*
EOT
# MPI runtime paths (the /usr/local/mpi symlink is created in the next layer).
ENV PATH=/usr/local/mpi/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
# Build OpenMPI from source and wrap mpirun for root usage.
RUN <<EOT
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# OpenMPI versions were never declared in the original file; defaulted here —
# TODO confirm the intended release.
STAGE_DIR=${STAGE_DIR:-/tmp/build}
OPENMPI_BASEVERSION=${OPENMPI_BASEVERSION:-4.1}
OPENMPI_VERSION=${OPENMPI_VERSION:-4.1.6}
rm -rf ${STAGE_DIR}
mkdir -p ${STAGE_DIR}
cd ${STAGE_DIR}
wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf -
cd openmpi-${OPENMPI_VERSION}
./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION}
make -j"$(nproc)" install
ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi
# Sanity check:
test -f /usr/local/mpi/bin/mpic++
cd ${STAGE_DIR}
rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
# Wrap mpirun so it allows running as root by default inside the container.
mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun
chmod a+x /usr/local/mpi/bin/mpirun
EOT
# General-purpose system libraries and Python utility packages.
RUN <<EOT
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
apt-get update
apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev
rm -rf /var/lib/apt/lists/*
python -m pip install https://github.com/mpi4py/mpi4py/tarball/master
# (duplicate "pandas" entry from the original list removed)
python -m pip install psutil \
    yappi \
    cffi \
    ipdb \
    pandas \
    matplotlib \
    py3nvml \
    pyarrow \
    graphviz \
    astor \
    boto3 \
    tqdm \
    sentencepiece \
    msgpack \
    requests \
    sphinx \
    sphinx_rtd_theme \
    scipy \
    numpy \
    scikit-learn \
    nvidia-ml-py3
EOT
# SSH daemon port inside container cannot conflict with host OS port.
ENV SSH_PORT=2222
# Documentation only; sshd binds the port itself.
EXPOSE 2222
RUN <<EOT
set -e
STAGE_DIR=${STAGE_DIR:-/tmp/build}
mkdir -p ${STAGE_DIR}
# Rewrite only the first "Port 22" occurrence to the container port.
cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config
sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
EOT
# Usage reference for DeepSpeed's install.sh (kept for operators):
#   install.sh [options...]
#   By default installs deepspeed and all third-party dependencies across all
#   machines listed in the hostfile (/job/hostfile); with no hostfile it
#   installs locally only.
#   -l, --local_only   Install only on local machine
#   -s, --pip_sudo     Run pip install with sudo (default: no sudo)
#   -r, --allow_sudo   Allow script to be run by root (prefer --pip_sudo)
#   -n, --no_clean     Do not clean prior build state before building wheels
#   -m, --pip_mirror   Use the specified pip mirror (default: default mirror)
#   -H, --hostfile     Path to MPI-style hostfile (default: /job/hostfile)
#   -e, --examples     Checkout deepspeed example submodule (no install)
#   -v, --verbose      Verbose logging
#   -h, --help         This help text
# Create an unprivileged "deepspeed" user with passwordless sudo.
RUN <<EOT
set -e
useradd --create-home --uid 1000 --shell /bin/bash deepspeed
usermod -aG sudo deepspeed
# NOTE(review): prefer a /etc/sudoers.d/ drop-in over appending to sudoers.
echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
EOT
# Install CUTLASS (https://github.com/NVIDIA/cutlass).
# Pick CUTLASS_NVCC_ARCHS by GPU architecture:
#   70  -> Volta  (e.g. Tesla V100)
#   75  -> Turing (e.g. Tesla T4)
#   80  -> Ampere (e.g. A100)
#   89  -> Ada    (e.g. GeForce RTX 4090)
#   90a -> Hopper (e.g. H100)
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
ARG DCUTLASS_NVCC_ARCHS="89"
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
RUN <<EOT
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
git clone https://github.com/NVIDIA/cutlass /opt/cutlass
cd /opt/cutlass
# CUTLASS's default branch is "main"; the original "git checkout master"
# would fail on a fresh clone.
git checkout main
mkdir -p build
cd build
cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON
make -j"$(nproc)" install
cd ..
# make test_unit -j"$(nproc)"
# make test_unit_gemm_warp -j"$(nproc)"
EOT
# DeepSpeed build configuration.
# CUDA_ARCH_LIST covers Ampere (80/86), Ada (89), and Hopper (90).
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --no_clean"
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
ARG CUDA_ARCH_LIST="80;86;89;90"
ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
ARG DS_BUILD_SPARSE_ATTN=0
ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
ARG DS_BUILD_FUSED_ADAM=1
ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
ARG DS_BUILD_CPU_ADAM=0
ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
ARG DS_BUILD_OPS=1
ENV DS_BUILD_OPS=${DS_BUILD_OPS}
ENV CUTLASS_PATH=/opt/cutlass
# Build and install DeepSpeed (plus oneCCL and DeepSpeed-Kernels).
RUN <<EOT
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
STAGE_DIR=${STAGE_DIR:-/tmp/build}
mkdir -p ${STAGE_DIR}
# SETUPTOOLS_VERSION was never declared in the original file; default to the
# pin used earlier in this image — TODO confirm.
python -m pip install setuptools==${SETUPTOOLS_VERSION:-69.5.1}
# oneCCL (oneAPI collective communications) for DeepSpeed's ccl backend.
git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
cd ${STAGE_DIR}/oneCCL
mkdir -p build
cd build
cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
make -j install
# Pre-built DeepSpeed inference kernels.
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
cd ${STAGE_DIR}/DeepSpeed-Kernels
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python setup.py bdist_wheel
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} pip install -v .
# DeepSpeed itself. DS_BUILD_* flags only matter while the wheel is BUILT,
# so they are set on bdist_wheel (the original set DS_BUILD_OPS on the
# `pip install` of an already-built wheel, where it has no effect).
git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
cd ${STAGE_DIR}/DeepSpeed
DS_BUILD_OPS=${DS_BUILD_OPS} DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN} \
DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM} DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} \
    python setup.py bdist_wheel
pip install dist/deepspeed*.whl --force-reinstall
# Alternative: ./install.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /job/hostfile
cd /
# rm -rf ${STAGE_DIR}/DeepSpeed
EOT
# Editable install of transformers from source, plus flash-attn.
RUN <<EOT
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
STAGE_DIR=${STAGE_DIR:-/tmp/build}
mkdir -p ${STAGE_DIR}
git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers
cd ${STAGE_DIR}/transformers
python3 setup.py develop
python3 -m pip install -U --no-cache-dir "pydantic<2"
# flash-attn imports torch in its setup, hence --no-build-isolation.
pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
EOT
# Miscellaneous ML/utility Python packages.
RUN <<EOT
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
pip install optimum
pip install peft tiktoken \
    tqdm matplotlib seaborn numpy pandas scikit-learn diffusers \
    huggingface_hub spacy blobfile pycocotools \
    xformers open_clip_torch \
    zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
EOT
# add vscode server (disabled)
# RUN <<EOT
# #!/bin/bash
# wget -qO- https://update.code.visualstudio.com/commit:${commit_id}/server-linux-x64/stable
# code-server --install-extension ms-python.vscode-pylance
# EOT
# Start the SSH daemon in the foreground as the container's main process.
# CMD ["/bin/bash", "-c", "service ssh start; tail -f /dev/null"]
CMD [ "/usr/sbin/sshd" , "-D" ]