Files
cdc_dockerfile/finetune/Dockerfile.conda1
2024-06-21 11:58:41 +08:00

391 lines
15 KiB
Docker
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# syntax=docker/dockerfile:1
# The RUN steps in this file use heredocs, which require BuildKit and a
# Dockerfile frontend >= 1.4; the directive above pins a suitable frontend.

# CUDA toolkit version; selects the tag of the NVIDIA base image.
ARG CUDA_VERSION=12.1.0
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04

# Keep apt/dpkg from prompting during the build.
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}

# ENV performs no tilde expansion, so "~/micromamba" would be stored literally
# and later resolve relative to the current directory. Use the absolute path
# of root's home (the same directory the shell steps reach via ~/micromamba).
ENV MAMBA_ROOT_PREFIX=/root/micromamba

# NOTE(review): the root password is baked into the image environment and is
# visible in `docker history`; override the default at build time and prefer
# key-based SSH authentication for anything that is not a throwaway image.
ARG ROOT_PASSWD="root"
ENV ROOT_PASSWD=${ROOT_PASSWD}

WORKDIR /root
SHELL ["/bin/bash", "-c"]
# Base tools: OS packages, SSH server configuration, Miniconda and micromamba.
RUN <<EOT
#!/bin/bash
# Heredoc lines are not implicitly &&-chained: stop at the first failing command
# instead of silently producing a half-configured image.
set -e
apt-get update
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
# Newer git from the git-core PPA; refresh the index so the PPA packages are visible.
add-apt-repository ppa:git-core/ppa -y
apt-get update
apt-get install -y git libnuma-dev wget
# Drop the package index from this layer; later steps run `apt-get update` themselves.
rm -rf /var/lib/apt/lists/*
# # install latest cmake
# wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc | sudo apt-key add -
# sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
# sudo apt-get update
# sudo apt-get install -y cmake
# Configure SSH for password and public key authentication
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
# -p: do not fail if the directory already exists.
mkdir -p /var/run/sshd
# Double quotes are required here: with single quotes the shell does not expand
# the variable and the password becomes the literal text ${ROOT_PASSWD}.
echo "root:${ROOT_PASSWD}" | chpasswd
mkdir -p ~/.pip
# install miniconda
wget -q -O /tmp/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash /tmp/miniconda.sh -b -p /opt/conda
rm /tmp/miniconda.sh
# /opt/conda/bin is not on PATH yet at this point of the build, so call conda by absolute path.
/opt/conda/bin/conda init bash
ln -sf /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
# Write the .condarc file
cat <<EOF > ~/.condarc
channels:
- conda-forge
- bioconda
- pytorch
- pytorch-nightly
- nvidia
- defaults
show_channel_urls: true
EOF
# Install micromamba
# NOTE(review): this pipes an unpinned third-party script (tracking @main) into bash
# and answers its prompt with "1". Pin it to a commit and verify a checksum.
# Its install location cannot be confirmed from this file, so micromamba setup is
# kept best-effort: a failure is reported but does not abort the build.
if ! echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh); then
    echo "WARNING: micromamba installer failed; continuing without micromamba" >&2
fi
micromamba shell init -s bash -p ~/micromamba || echo "WARNING: 'micromamba shell init' failed; micromamba shell integration is not configured" >&2
cat <<'EOF' >> ~/.bashrc
source ~/micromamba/etc/profile.d/micromamba.sh
alias mamba=micromamba
alias mba=mamba
EOF
# Write the .mambarc file
cat <<EOF > ~/.mambarc
channels:
- conda-forge
- bioconda
- pytorch
- pytorch-nightly
- nvidia
- defaults
show_channel_urls: true
EOF
EOT
# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
# PyTorch
# Name and Python version of the conda environment used by the build steps.
ARG CONDA_ENV_NAME="deepspeed"
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
ARG PYTHON_VERSION=3.10
ENV PYTHON_VERSION=${PYTHON_VERSION}
# key=value form; the space-separated `ENV PATH <value>` form is deprecated.
ENV PATH=/opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH
# Git ref of huggingface/transformers that the build checks out.
ENV REF='main'
# Scratch directory for downloads and source builds.
ENV STAGE_DIR=/tmp
# nv_peer_memory release; the tag is "<version>-0".
ENV NV_PEER_MEM_VERSION=1.2
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
# OpenMPI release: <base version>.<patch>.
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
# PyTorch wheel coordinates (CUDA suffix and package versions).
# NOTE(review): in this file these are referenced only by a commented-out
# `pip install torch==...` line; confirm whether they are still needed.
ARG CUDA='cu121'
ENV CUDA=${CUDA}
ARG PYTORCH_VERSION=2.3.0
ENV PYTORCH_VERSION=${PYTORCH_VERSION}
ARG TORCHVISION_VERSION=0.18.0
ENV TORCHVISION_VERSION=${TORCHVISION_VERSION}
ARG TORCHAUDIO_VERSION=2.3.0
ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION}
ARG PYTORCH_CUDA_VERSION=12.1
ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION}
# Mellanox OFED release that is downloaded and installed.
ENV MLNX_OFED_VERSION=4.9-7.1.0.0
# setuptools is pinned to this version before the source builds.
ARG SETUPTOOLS_VERSION=69.5.1
ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
# Exported into the build environment; presumably consumed by the PyTorch
# source build to select the backend and C++ ABI (confirm against PyTorch's setup.py).
ARG USE_CUDA=1
ENV USE_CUDA=${USE_CUDA}
ARG USE_ROCM=0
ENV USE_ROCM=${USE_ROCM}
ARG USE_XPU=0
ENV USE_XPU=${USE_XPU}
ARG _GLIBCXX_USE_CXX11_ABI=1
ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}
# Create the conda environment, install transformers (with its deepspeed-testing
# extras), then replace the pip-installed torch with a build from source.
RUN <<EOT
#!/bin/bash
# Heredoc lines are not implicitly &&-chained: fail the build on the first error.
set -e
source /opt/conda/etc/profile.d/conda.sh
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} pyyaml ipython -c conda-forge -y
# Interactive shells (e.g. SSH logins) land in the environment automatically.
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
conda activate ${CONDA_ENV_NAME}
python3 -m pip install --no-cache-dir --upgrade pip
python -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python
conda clean -afy
git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd ..
python -m pip install setuptools==${SETUPTOOLS_VERSION}
# Quoted so the shell never treats [deepspeed-testing] as a glob pattern.
python3 -m pip install --no-cache-dir "./transformers[deepspeed-testing]"
# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
python3 -m pip uninstall -y torch torchvision torchaudio
# # build PyTorch from source inside the conda env created above
conda install -y cmake ninja
# `setup.py develop` links the environment to this source tree instead of copying
# it, so the tree must outlive the build. ${STAGE_DIR} is scratch space that build
# steps clean up, hence the checkout lives under /opt.
# NOTE(review): this builds the repository's default branch; PYTORCH_VERSION is not
# applied here. Check out a release tag if a reproducible version is required.
PYTORCH_SRC_DIR=/opt/pytorch
git clone --recursive https://github.com/pytorch/pytorch "${PYTORCH_SRC_DIR}"
cd "${PYTORCH_SRC_DIR}"
git submodule sync
git submodule update --init --recursive
pip install -r requirements.txt
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
python setup.py develop
# python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA}
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
# Remove these packages if present (apex is rebuilt from source in a later step).
python3 -m pip uninstall -y transformer-engine
python3 -m pip uninstall -y torch-tensorrt
python3 -m pip uninstall -y apex
EOT
# install apex (NVIDIA/apex built from source with its C++ and CUDA extensions)
RUN <<EOT
#!/bin/bash
# Heredoc lines are not implicitly &&-chained: fail the build on the first error.
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
git clone https://github.com/NVIDIA/apex ${STAGE_DIR}/apex
# The clone lives under ${STAGE_DIR}; a bare `cd apex` would be resolved against
# the WORKDIR and pip would then try to install from the wrong directory.
cd ${STAGE_DIR}/apex
# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# Leave the source tree before the import check so the installed package is
# imported rather than the ./apex directory of the checkout.
cd ${STAGE_DIR}
# Informational only: apex is cloned unpinned, so its module layout may change;
# a failed import is reported without aborting the build.
python -c "import apex.amp; print('Apex is installed and the amp module is available.')" || echo "WARNING: 'import apex.amp' failed; check the apex installation" >&2
rm -rf ${STAGE_DIR}/apex
EOT
# Remove any pre-installed DeepSpeed, then install Mellanox OFED (user space only)
# and the nv_peer_memory package that DeepSpeed's reference image relies on.
RUN <<EOT
#!/bin/bash
# Fail on the first error; pipefail makes a failed download fail the `wget | tar` pipeline.
set -eo pipefail
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
python3 -m pip uninstall -y deepspeed
# This has to be run (again) inside the GPU VMs running the tests.
# The installation works here, but some tests fail, if we do not pre-build deepspeed again in the VMs running the tests.
# TODO: Find out why test fail. install deepspeed
# DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} DS_BUILD_FUSED_ADAM={DS_BUILD_FUSED_ADAM} python3 -m pip install "deepspeed<=0.14.0" --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail
# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
# install deepspeed prepare
# install Mellanox OFED
mkdir -p ${STAGE_DIR}
# Unpack inside ${STAGE_DIR} so the cleanup below actually removes the tree
# (unpacking in the WORKDIR would leave it behind in the image).
cd ${STAGE_DIR}
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf -
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64
./mlnxofedinstall --user-space-only --without-fw-update --all -q
cd ${STAGE_DIR}
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
# install nv_peer_mem
# Remove only this step's own artifacts. Wiping ${STAGE_DIR} wholesale would also
# delete anything else staged there (for example a source tree that was
# installed in develop mode and is still needed at runtime).
rm -rf ${STAGE_DIR}/nv_peer_memory ${STAGE_DIR}/nvidia-peer-memory*
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory
cd ${STAGE_DIR}/nv_peer_memory
./build_module.sh
cd ${STAGE_DIR}
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
apt-get update
apt-get install -y dkms
dpkg-buildpackage -us -uc
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
# Clean up build artifacts and the apt index in the same layer that created them.
cd ${STAGE_DIR}
rm -rf ${STAGE_DIR}/nv_peer_memory ${STAGE_DIR}/nvidia-peer-memory*
rm -rf /var/lib/apt/lists/*
EOT
# install mpi
# OpenMPI is built into /usr/local/openmpi-<version> and exposed through the
# /usr/local/mpi symlink, which these search paths refer to.
ENV PATH=/usr/local/mpi/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
RUN <<EOT
#!/bin/bash
# Fail on the first error (this also makes the sanity check below effective);
# pipefail makes a failed download fail the `wget | tar` pipeline.
set -eo pipefail
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# OPENMPI
# Only this step's own source tree is removed. Wiping ${STAGE_DIR} wholesale would
# also delete anything else staged there (for example a source tree that was
# installed in develop mode and is still needed at runtime).
mkdir -p ${STAGE_DIR}
cd ${STAGE_DIR}
rm -rf ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf -
cd openmpi-${OPENMPI_VERSION}
./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION}
make -j"$(nproc)" install
# -f/-n: replace an existing link instead of failing or nesting a link inside it.
ln -sfn /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi
# Sanity check:
test -f /usr/local/mpi/bin/mpic++
cd ${STAGE_DIR}
rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
# Create a wrapper for OpenMPI to allow running as root by default
mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun
chmod a+x /usr/local/mpi/bin/mpirun
EOT
# Additional system libraries and Python packages, installed into the conda environment.
RUN <<EOT
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# System libraries and tools (one per line, sorted).
apt-get update
apt-get install -y --no-install-recommends \
    libaio-dev \
    libcupti-dev \
    libjpeg-dev \
    libpng-dev \
    libsndfile-dev \
    screen
# mpi4py is installed from the tarball of its master branch.
python -m pip install https://github.com/mpi4py/mpi4py/tarball/master
# Python packages (one per line, sorted; each package listed once).
python -m pip install \
    astor \
    boto3 \
    cffi \
    graphviz \
    ipdb \
    matplotlib \
    msgpack \
    numpy \
    nvidia-ml-py3 \
    pandas \
    psutil \
    py3nvml \
    pyarrow \
    requests \
    scikit-learn \
    scipy \
    sentencepiece \
    sphinx \
    sphinx_rtd_theme \
    tqdm \
    yappi
EOT
# The SSH daemon inside the container must listen on a port that does not clash with the host's.
ENV SSH_PORT=2222
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# Snapshot the current config into the staging directory, then regenerate the live
# file from that snapshot with the first "Port 22" line rewritten to ${SSH_PORT}.
sshd_config_copy=${STAGE_DIR}/sshd_config
cat /etc/ssh/sshd_config > "${sshd_config_copy}" \
    && sed "0,/^Port 22/s//Port ${SSH_PORT}/" "${sshd_config_copy}" > /etc/ssh/sshd_config
EOT
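# For reference: usage text of the `install.sh` helper for deepspeed, pasted from build output
# (the leading "29.78" on each line appears to be the build-log timestamp).
# 29.78 Usage: install.sh [options...]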
# 29.78
# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in
# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally
# 29.78
# 29.78 [optional]
# 29.78 -l, --local_only Install only on local machine
# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo)
# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo)
# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels
# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror)
# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile)
# 29.78 -e, --examples Checkout deepspeed example submodule (no install)
# 29.78 -v, --verbose Verbose logging
# 29.78 -h, --help This help text
# Create the unprivileged "deepspeed" account and grant it passwordless sudo.
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# One step: home directory, UID 1000, bash login shell, supplementary group "sudo".
useradd --create-home --uid 1000 --shell /bin/bash --groups sudo deepspeed
# Append the passwordless-sudo rule for the account to the main sudoers file.
printf '%s\n' "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
EOT
# install cutlass https://github.com/NVIDIA/cutlass
# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" )
# A100: architecture is Ampere
# V100: architecture is Volta
# T4: architecture is Turing
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
# Values for the architecture list:
# 70: NVIDIA Volta architecture (e.g. Tesla V100)
# 75: NVIDIA Turing architecture (e.g. Tesla T4)
# 80: NVIDIA Ampere architecture (e.g. A100)
# 90a: NVIDIA Hopper architecture (e.g. H100)
# 89: GeForce RTX 4090
# Build argument holding the value passed to cmake as -DCUTLASS_NVCC_ARCHS (default: 89).
ARG DCUTLASS_NVCC_ARCHS="89"
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
# Clone cutlass into /opt/cutlass, then configure, build and install it with
# tests disabled and unity builds enabled.
# NOTE(review): this script has no `set -e`, so a failing command (for example the
# `git checkout master` below if that branch does not exist) does not stop the build.
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
git clone https://github.com/NVIDIA/cutlass /opt/cutlass
cd /opt/cutlass
# Discard local modifications (presumably none right after a clone), then select master.
git checkout .
git checkout master
mkdir build
cd build
cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON # compiles for the architecture(s) in DCUTLASS_NVCC_ARCHS (use 90a for Hopper GPUs like H100)
make -j"$(nproc)" install
cd ..
# make test_unit -j"$(nproc)"
# make test_unit_gemm_warp -j"$(nproc)"
EOT
# Build-time switches for the DeepSpeed installation; each ARG is mirrored into
# ENV, so the value is also present in the environment of the final image.
# CUDA_ARCH_LIST="80;86;89;90"
# Flags for DeepSpeed's install.sh. NOTE(review): in this file they are referenced
# only by a commented-out install.sh invocation.
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --no_clean"
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
# Architecture list handed to the DeepSpeed-Kernels build.
ARG CUDA_ARCH_LIST="80;86"
ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
# DS_BUILD_* variables are exported for the DeepSpeed build; presumably they select
# which ops are pre-compiled (confirm against DeepSpeed's advanced-install docs).
ARG DS_BUILD_SPARSE_ATTN=0
ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
ARG DS_BUILD_FUSED_ADAM=1
ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
ARG DS_BUILD_CPU_ADAM=0
ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
ARG DS_BUILD_OPS=1
ENV DS_BUILD_OPS=${DS_BUILD_OPS}
# Directory the cutlass sources are cloned into.
ENV CUTLASS_PATH=/opt/cutlass
# install deepspeed: builds oneCCL, DeepSpeed-Kernels and DeepSpeed from source
RUN <<EOT
#!/bin/bash
# Heredoc lines are not implicitly &&-chained: fail the build on the first error
# rather than continuing without DeepSpeed.
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
python -m pip install setuptools==${SETUPTOOLS_VERSION}
# install oneapi for deepspeed
git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
cd ${STAGE_DIR}/oneCCL
git checkout .
git checkout master
mkdir build
cd build
cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
# Bound the parallelism: a bare `make -j` starts an unlimited number of jobs and
# can exhaust memory on the build host.
make -j"$(nproc)" install
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
cd ${STAGE_DIR}/DeepSpeed-Kernels
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python setup.py bdist_wheel
# pip install dist/deepspeed_kernels-*.whl
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} pip install -v .
git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
cd ${STAGE_DIR}/DeepSpeed
git checkout .
git checkout master
# Build the wheel first, then install exactly that wheel.
python setup.py bdist_wheel
DS_BUILD_OPS=${DS_BUILD_OPS} pip install dist/deepspeed*.whl --force-reinstall
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -r requirements/requirements.txt
# DS_BUILD_OPS=0 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_CPU_ADAM=0 DS_BUILD_FUSED_ADAM=1 pip install -U --no-cache-dir .
# ./install.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /job/hostfile # ./install.sh --allow_sudo --pip_sudo --no_clean --hostfile /path/to/your/hostfile
cd ..
# rm -rf ${STAGE_DIR}/DeepSpeed
EOT
# transformers (development-mode install from a fresh clone) and flash-attn.
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# install transformers
# The clone stays in place afterwards: `setup.py develop` installs from this directory.
transformers_src=${STAGE_DIR}/transformers
git clone https://github.com/huggingface/transformers ${transformers_src}
cd ${transformers_src}
python3 ./setup.py develop
python3 -m pip install -U --no-cache-dir "pydantic<2"
# install flash-attn
# pip install packaging -i https://pypi.org/simple/ --trusted-host pypi.org
pip install flash-attn \
    --no-build-isolation \
    -i https://pypi.org/simple/ \
    --trusted-host pypi.org
EOT
# Extra Python libraries for fine-tuning workloads.
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
pip install optimum
# One package per line, sorted; the index options apply to this command only.
# NOTE(review): some of these packages (e.g. xformers, open_clip_torch) depend on
# torch; verify that pip does not replace the torch build already in the environment.
pip install \
    blobfile \
    diffusers \
    huggingface_hub \
    matplotlib \
    numpy \
    open_clip_torch \
    pandas \
    peft \
    pycocotools \
    scikit-learn \
    seaborn \
    spacy \
    tiktoken \
    tqdm \
    xformers \
    zstandard \
    -i https://pypi.org/simple/ --trusted-host pypi.org
EOT
# add vscode server
# RUN <<EOT
# #!/bin/bash
# wget -qO- https://update.code.visualstudio.com/commit:${commit_id}/server-linux-x64/stable
# code-server --install-extension ms-python.vscode-pylance
# EOT
# Start the SSH service
# CMD ["/bin/bash", "-c", "service ssh start; tail -f /dev/null"]
# Default command: run sshd in the foreground (-D keeps it from detaching) so it
# is the container's main process.
CMD ["/usr/sbin/sshd", "-D"]