Files
cdc_dockerfile/finetune/Dockerfile

464 lines
18 KiB
Docker
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# NOTE: Building this image require's docker version >= 23.0.
#
# For reference:
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
ARG CUDA_VERSION=12.1.0
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
ENV MAMBA_ROOT_PREFIX=~/micromamba
ARG ROOT_PASSWD="root"
ENV ROOT_PASSWD=${ROOT_PASSWD}
WORKDIR /root
SHELL ["/bin/bash", "-c"]
# base tools
RUN <<EOT
#!/bin/bash
apt-get update
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
add-apt-repository ppa:git-core/ppa -y
apt-get install -y git libnuma-dev wget
# install latest cmake
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc | sudo apt-key add -
sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
sudo apt-get update
sudo apt-get install -y cmake
# Configure SSH for password and public key authentication
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
mkdir /var/run/sshd
echo "root:${ROOT_PASSWD}" | chpasswd
mkdir -p ~/.pip
# install miniconda
wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
bash /tmp/miniconda.sh -b -p /opt/conda
rm /tmp/miniconda.sh
conda init bash
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
# 配置 .condarc 文件
cat <<EOF > ~/.condarc
channels:
- conda-forge
- bioconda
- pytorch
- pytorch-nightly
- nvidia
- defaults
show_channel_urls: true
EOF
# 安装 micromamba
# echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh)
# micromamba shell init -s bash -p ~/micromamba
# cat <<'EOF' >> ~/.bashrc
# source ~/micromamba/etc/profile.d/micromamba.sh
# EOF
# # 配置 .mambarc 文件
# cat <<EOF > ~/.mambarc
# channels:
# - conda-forge
# - bioconda
# - pytorch
# - pytorch-nightly
# - nvidia
# - defaults
# show_channel_urls: true
# EOF
EOT
# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
# PyTorch
ARG CONDA_ENV_NAME="deepspeed"
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
ARG PYTHON_VERSION=3.10
ENV PYTHON_VERSION=${PYTHON_VERSION}
ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH
ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3"
ENV REF='main'
ENV STAGE_DIR=/tmp
ENV NV_PEER_MEM_VERSION=1.2
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
ARG CUDA='cu121'
ENV CUDA=${CUDA}
ARG PYTORCH_VERSION=2.3.1
ENV PYTORCH_VERSION=${PYTORCH_VERSION}
ARG TORCHVISION_VERSION=0.18.1
ENV TORCHVISION_VERSION=${TORCHVISION_VERSION}
ARG TORCHAUDIO_VERSION=2.3.1
ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION}
ARG PYTORCH_CUDA_VERSION=12.1
ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION}
ENV MLNX_OFED_VERSION=4.9-7.1.0.0
ARG SETUPTOOLS_VERSION=69.5.1
ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
ARG USE_CUDA=1
ENV USE_CUDA=${USE_CUDA}
ARG USE_ROCM=0
ENV USE_ROCM=${USE_ROCM}
ARG USE_XPU=0
ENV USE_XPU=${USE_XPU}
ARG _GLIBCXX_USE_CXX11_ABI=1
ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} -c conda-forge -y
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
which python > ~/python_path.txt
conda activate ${CONDA_ENV_NAME}
# 克隆 ninja 源码并编译
git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja
cd ${STAGE_DIR}/ninja
# 克隆 GoogleTest 源码
git clone https://github.com/google/googletest.git
python ./configure.py --bootstrap
# 配置并构建 Ninja 测试,添加 pthread 链接选项
# CXXFLAGS="-pthread" LDFLAGS="-pthread" ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest
conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest"
./ninja all
# 运行 Ninja 单元测试
./ninja_test
python3 -m pip install --no-cache-dir --upgrade pip
python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python
conda clean -afy
git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd ..
python -m pip install setuptools==${SETUPTOOLS_VERSION}
python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
python3 -m pip uninstall -y torch torchvision torchaudio
# # install pytorch create conda env aleay exists
python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA}
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
python3 -m pip uninstall -y transformer-engine
python3 -m pip uninstall -y torch-tensorrt
python3 -m pip uninstall -y apex
EOT
# install apex
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
git clone https://github.com/NVIDIA/apex ${STAGE_DIR}/apex
cd apex
# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
cd ..
rm -rf ${STAGE_DIR}/apex
EOT
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
python3 -m pip uninstall -y deepspeed
# This has to be run (again) inside the GPU VMs running the tests.
# The installation works here, but some tests fail, if we do not pre-build deepspeed again in the VMs running the tests.
# TODO: Find out why test fail. install deepspeed
# DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} DS_BUILD_FUSED_ADAM={DS_BUILD_FUSED_ADAM} python3 -m pip install "deepspeed<=0.14.0" --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail
# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
# install deepspeed prepare
# install Mellanox OFED
mkdir -p ${STAGE_DIR}
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf -
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64
./mlnxofedinstall --user-space-only --without-fw-update --all -q
cd ${STAGE_DIR}
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
cd ..
# install nv_peer_mem
rm -rf ${STAGE_DIR}
mkdir -p ${STAGE_DIR}
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory
cd ${STAGE_DIR}/nv_peer_memory
./build_module.sh
cd ${STAGE_DIR}
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
apt-get update
apt-get install -y dkms
dpkg-buildpackage -us -uc
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
EOT
# install mpi
ENV PATH=/usr/local/mpi/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# OPENMPI
rm -rf ${STAGE_DIR}
mkdir -p ${STAGE_DIR}
cd ${STAGE_DIR}
wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf -
cd openmpi-${OPENMPI_VERSION}
./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION}
make -j"$(nproc)" install
ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi
# Sanity check:
test -f /usr/local/mpi/bin/mpic++
cd ${STAGE_DIR}
rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
# Create a wrapper for OpenMPI to allow running as root by default
mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun
chmod a+x /usr/local/mpi/bin/mpirun
EOT
# SSH daemon port inside container cannot conflict with host OS port
ENV SSH_PORT=2222
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
EOT
# 29.78 Usage: install.sh [options...]
# 29.78
# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in
# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally
# 29.78
# 29.78 [optional]
# 29.78 -l, --local_only Install only on local machine
# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo)
# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo)
# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels
# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror)
# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile)
# 29.78 -e, --examples Checkout deepspeed example submodule (no install)
# 29.78 -v, --verbose Verbose logging
# 29.78 -h, --help This help text
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
useradd --create-home --uid 1000 --shell /bin/bash deepspeed
usermod -aG sudo deepspeed
echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
EOT
# install cutlass https://github.com/NVIDIA/cutlass
# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" )
# A100: architecture is Ampere
# V100: architecture is Volta
# T4: architecture is Turing
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
# 70适用于 NVIDIA Volta 架构(如 Tesla V100
# 75适用于 NVIDIA Turing 架构(如 Tesla T4
# 80适用于 NVIDIA Ampere 架构(如 A100
# 90a适用于 NVIDIA Hopper 架构(如 H100
# 89:GeForce RTX 4090
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
git clone https://github.com/NVIDIA/cutlass /opt/cutlass
cd /opt/cutlass
git checkout .
git checkout master
mkdir build
cd build
cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON # compiles for NVIDIA Hopper GPU architecture, like H100
make -j"$(nproc)" install
cd ..
# make test_unit -j"$(nproc)"
# make test_unit_gemm_warp -j"$(nproc)"
EOT
# Some Packages from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
# RUN <<EOT
# source /opt/conda/etc/profile.d/conda.sh
# conda activate ${CONDA_ENV_NAME}
# apt-get update
# apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev
# python -m pip install pipdeptree \
# psutil \
# yappi \
# cffi \
# ipdb \
# pandas \
# matplotlib \
# py3nvml \
# pyarrow \
# graphviz \
# astor \
# boto3 \
# tqdm \
# sentencepiece \
# msgpack \
# requests \
# pandas \
# sphinx \
# sphinx_rtd_theme \
# scipy \
# numpy \
# scikit-learn \
# nvidia-ml-py3 \
# mpi4py
# EOT
# install deepspeed step 1
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
/opt/conda/envs/${CONDA_ENV_NAME}/bin/python -m pip install setuptools==${SETUPTOOLS_VERSION}
# install oneapi for deepspeed
git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
cd ${STAGE_DIR}/oneCCL
git checkout .
git checkout master
mkdir build
cd build
cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
make -j"$(nproc)" install
EOT
# install deepspeed step 2
ARG CUDA_ARCH_LIST="80;86;89;90"
ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
cd ${STAGE_DIR}/DeepSpeed-Kernels
# CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python setup.py bdist_wheel
# pip install dist/deepspeed_kernels-*.whl
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python -m pip install -v .
EOT
ARG DEEPSPEED_VERSION="v0.14.3"
ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
ARG DS_BUILD_SPARSE_ATTN=0
ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
ARG DS_BUILD_FUSED_ADAM=1
ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
ARG DS_BUILD_CPU_ADAM=1
ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
ARG DS_BUILD_OPS=1
ENV DS_BUILD_OPS=${DS_BUILD_OPS}
ARG HOSTFILE_CONTENT=""
ENV HOSTFILE_CONTENT=${HOSTFILE_CONTENT}
ENV CUTLASS_PATH='/opt/cutlass'
ENV CUDA_HOME='/usr/local/cuda'
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${CUDA_HOME}/bin:${PATH}
# install deepspeed step 3
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
cd ${STAGE_DIR}/DeepSpeed
git checkout ${DEEPSPEED_VERSION}
sed 's/pip install/python -m pip install/' install.sh > install_modified.sh
chmod +x ./install_modified.sh
# 检查 HOSTFILE_CONTENT 并写入文件
if [ -n "${HOSTFILE_CONTENT}" ]; then
echo "${HOSTFILE_CONTENT}" > /tmp/hostfile
INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"
else
INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"
fi
eval $INSTALL_CMD
# compile deepspeed ops
cat <<'EOF' >> ~/.bashrc
source ~/micromamba/etc/profile.d/micromamba.sh
echo "alias mamba=micromamba" >> ~/.bashrc
echo "alias mba=mamba" >> ~/.bashrc
EOF
# 配置 .mambarc 文件
cat <<EOF > ~/compile_deepspeed_ops.py
import deepspeed
def compile_ops():
builders = [
deepspeed.ops.op_builder.AsyncIOBuilder,
deepspeed.ops.op_builder.FusedAdamBuilder,
deepspeed.ops.op_builder.CPUAdamBuilder,
deepspeed.ops.op_builder.CPUAdagradBuilder,
deepspeed.ops.op_builder.CPULionBuilder,
deepspeed.ops.op_builder.EvoformerAttnBuilder,
deepspeed.ops.op_builder.FPQuantizerBuilder,
deepspeed.ops.op_builder.FusedLambBuilder,
deepspeed.ops.op_builder.FusedLionBuilder,
deepspeed.ops.op_builder.QuantizerBuilder,
deepspeed.ops.op_builder.RaggedOpsBuilder,
deepspeed.ops.op_builder.RandomLTDBuilder,
deepspeed.ops.op_builder.SparseAttnBuilder,
deepspeed.ops.op_builder.SpatialInferenceBuilder,
deepspeed.ops.op_builder.TransformerBuilder,
deepspeed.ops.op_builder.StochasticTransformerBuilder,
]
for builder in builders:
print(f"Compiling {builder.__name__}")
builder().load()
if __name__ == "__main__":
compile_ops()
EOF
python compile_deepspeed_ops.py
ds_report
# clean up
# rm -f deepspeed/git_version_info_installed.py
# rm -rf dist build deepspeed.egg-info
# python setup.py bdist_wheel
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt
# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0
EOT
# install transformers and flash-attn
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# install transformers
git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers
cd ${STAGE_DIR}/transformers
python3 ./setup.py develop
python3 -m pip install -U --no-cache-dir "pydantic<2"
# install flash-attn
# pip install packaging -i https://pypi.org/simple/ --trusted-host pypi.org
pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
EOT
# other packages
ENV TORCH_CUDA_ARCH_LIST="80;86;89;90"
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
pip3 install optimum
pip3 install peft tiktoken \
tqdm matplotlib seaborn numpy pandas scikit-learn diffusers \
huggingface_hub spacy blobfile pycocotools \
open_clip_torch \
zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
EOT
CMD ["/usr/sbin/sshd", "-D"]