Files
cdc_dockerfile/finetune/Dockerfile.o
2024-07-12 18:50:27 +08:00

331 lines
15 KiB
Docker
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# syntax=docker/dockerfile:1
# NOTE: Building this image requires Docker version >= 23.0.
#
# For reference:
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
# Base image: NVIDIA CUDA + cuDNN 8 development image on Ubuntu 22.04.
# Override the toolkit version with --build-arg CUDA_VERSION=<x.y.z>.
ARG CUDA_VERSION=12.1.0
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04
# Optional proxy settings. The ENV lines also persist them into containers
# started from this image.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}
# Keep apt/dpkg from prompting during package installation.
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
# ENV values are not tilde-expanded, so "~/micromamba" would be stored
# literally; use the absolute path under root's home (see WORKDIR below).
ENV MAMBA_ROOT_PREFIX=/root/micromamba
# NOTE(review): the root password is passed as a build arg and exported as ENV,
# so it is visible in the image history and in every container's environment.
# Consider a BuildKit secret mount or key-only SSH login instead.
ARG ROOT_PASSWD="root"
ENV ROOT_PASSWD=${ROOT_PASSWD}
WORKDIR /root
# Use bash for all RUN steps; pipefail makes a failed download in a
# `wget ... | tar ...` style pipeline fail the build instead of being masked
# by the exit status of the last command.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
# Ships the public key only; it is not appended to authorized_keys here.
# NOTE(review): confirm whether key-based login is set up elsewhere.
COPY id_rsa.pub /root/.ssh/id_rsa.pub
# base tools
# Installs RDMA/InfiniBand user-space tools, general build tooling, a newer git
# (git-core PPA) and cmake (Kitware APT repo), configures sshd for root login,
# installs Miniconda into /opt/conda and writes the default conda channel list.
#
# Notes:
# - Everything runs as root, so sudo is not used (sudo would also reset the
#   environment and drop the proxy variables).
# - conda is invoked by absolute path because /opt/conda/bin is not yet on PATH
#   at this point in the build.
# - apt package lists are removed in the same layer; later steps that need apt
#   run `apt-get update` themselves.
RUN apt-get update \
    && apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags \
    && apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2 \
    && apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c \
    && add-apt-repository ppa:git-core/ppa -y \
    && apt-get install -y git libnuma-dev wget \
    && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc | apt-key add - \
    && apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" \
    && apt-get update \
    && apt-get install -y cmake \
    && rm -rf /var/lib/apt/lists/* \
    && sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config \
    && sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config \
    && sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config \
    && sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config \
    && mkdir -p /var/run/sshd \
    && echo "root:${ROOT_PASSWD}" | chpasswd \
    && mkdir -p ~/.pip \
    && wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh \
    && bash /tmp/miniconda.sh -b -p /opt/conda \
    && rm /tmp/miniconda.sh \
    && /opt/conda/bin/conda init bash \
    && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh \
    && echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc \
    && cat <<EOF > ~/.condarc
channels:
- conda-forge
- bioconda
- pytorch
- pytorch-nightly
- nvidia
- defaults
show_channel_urls: true
EOF
# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
# PyTorch
# Build arguments for the Python/PyTorch stack. Each one is re-exported below
# as an environment variable of the same name, so it is visible both to later
# RUN steps and inside the running container.
ARG CONDA_ENV_NAME="deepspeed"
ARG PYTHON_VERSION=3.10
ARG CUDA='cu121'
ARG PYTORCH_VERSION=2.3.1
ARG TORCHVISION_VERSION=0.18.1
ARG TORCHAUDIO_VERSION=2.3.1
ARG PYTORCH_CUDA_VERSION=12.1
ARG SETUPTOOLS_VERSION=69.5.1
ARG USE_CUDA=1
ARG USE_ROCM=0
ARG USE_XPU=0
ARG _GLIBCXX_USE_CXX11_ABI=1
# The conda environment's bin directory is placed first on PATH so that
# python/pip resolve to the environment's interpreter.
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} \
    PYTHON_VERSION=${PYTHON_VERSION} \
    PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH \
    DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3" \
    REF='main' \
    STAGE_DIR=/tmp \
    OPENMPI_BASEVERSION=4.1 \
    CUDA=${CUDA} \
    PYTORCH_VERSION=${PYTORCH_VERSION} \
    TORCHVISION_VERSION=${TORCHVISION_VERSION} \
    TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} \
    PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} \
    SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} \
    USE_CUDA=${USE_CUDA} \
    USE_ROCM=${USE_ROCM} \
    USE_XPU=${USE_XPU} \
    _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}
# Kept as a separate instruction: values set inside one ENV instruction are not
# visible to other assignments in that same instruction.
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
# Creates the conda environment, bootstraps ninja from source (running its
# googletest-based test binary), then installs the Python stack: transformers
# at ${REF} with the deepspeed-testing extra, the pinned torch / torchvision /
# torchaudio wheels for ${CUDA}, xformers and accelerate. Any pre-existing
# transformer-engine, torch-tensorrt and apex packages are removed at the end.
#
# All pip installs use --no-cache-dir so the wheel cache is not stored in the
# layer, and the extras requirement is quoted so the shell cannot glob the
# square brackets.
RUN source /opt/conda/etc/profile.d/conda.sh \
    && conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} -c conda-forge -y \
    && echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc \
    && which python > ~/python_path.txt \
    && conda activate ${CONDA_ENV_NAME} \
    && git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja \
    && cd ${STAGE_DIR}/ninja \
    && git clone https://github.com/google/googletest.git \
    && python ./configure.py --bootstrap \
    && conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest" \
    && ./ninja all \
    && ./ninja_test \
    && python3 -m pip install --no-cache-dir --upgrade pip \
    && python3 -m pip install --no-cache-dir open_clip_torch nvidia-ml-py3 opencv-contrib-python \
    && conda clean -afy \
    && git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. \
    && python -m pip install --no-cache-dir setuptools==${SETUPTOOLS_VERSION} \
    && python3 -m pip install --no-cache-dir "./transformers[deepspeed-testing]" \
    && python3 -m pip uninstall -y torch torchvision torchaudio \
    && python3 -m pip install --no-cache-dir torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA} \
    && python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate \
    && python3 -m pip uninstall -y transformer-engine \
    && python3 -m pip uninstall -y torch-tensorrt \
    && python3 -m pip uninstall -y apex
# install apex
# Builds NVIDIA apex from source with its C++ and CUDA extensions, verifies
# that apex.amp imports, then removes the source tree.
# The clone lands in ${STAGE_DIR}/apex, so the cd must use that path; a bare
# `cd apex` would be resolved against WORKDIR (/root) and fail.
# MAX_JOBS=1 limits the extension build to a single parallel job.
RUN source /opt/conda/etc/profile.d/conda.sh \
    && conda activate ${CONDA_ENV_NAME} \
    && git clone https://github.com/NVIDIA/apex ${STAGE_DIR}/apex \
    && cd ${STAGE_DIR}/apex \
    && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ \
    && python -c "import apex.amp; print('Apex is installed and the amp module is available.')" \
    && cd .. \
    && rm -rf ${STAGE_DIR}/apex
# Mellanox OFED user-space libraries (no kernel modules, no firmware update).
# The archive is downloaded and unpacked inside ${STAGE_DIR} so that the
# cleanup at the end removes the tree that was actually extracted; unpacking
# in WORKDIR would leave it behind in /root.
# NOTE(review): the installer is fetched over plain HTTP without a checksum;
# consider verifying the archive before executing it.
ENV MLNX_OFED_VERSION=5.8-4.1.5.0
RUN source /opt/conda/etc/profile.d/conda.sh \
    && conda activate ${CONDA_ENV_NAME} \
    && mkdir -p ${STAGE_DIR} \
    && cd ${STAGE_DIR} \
    && wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - \
    && cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 \
    && ./mlnxofedinstall --user-space-only --without-fw-update --all -q \
    && cd ${STAGE_DIR} \
    && rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
# nv_peer_memory: builds the Debian package from the tagged upstream source and
# installs it with dpkg (dkms is installed first, before the package build).
# Only this step's own working directories are cleared. ${STAGE_DIR} is /tmp,
# and removing and recreating /tmp itself would leave it without the sticky,
# world-writable mode that non-root users rely on.
ENV NV_PEER_MEM_VERSION=1.2
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
RUN source /opt/conda/etc/profile.d/conda.sh \
    && conda activate ${CONDA_ENV_NAME} \
    && mkdir -p ${STAGE_DIR} \
    && rm -rf ${STAGE_DIR}/nv_peer_memory ${STAGE_DIR}/nvidia-peer-memory* \
    && git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory \
    && cd ${STAGE_DIR}/nv_peer_memory \
    && ./build_module.sh \
    && cd ${STAGE_DIR} \
    && tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz \
    && cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} \
    && apt-get update \
    && apt-get --fix-broken install -y \
    && apt-get install -y dkms \
    && dpkg-buildpackage -us -uc \
    && dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
# install mpi
# Builds Open MPI ${OPENMPI_VERSION} from the release tarball into
# /usr/local/openmpi-${OPENMPI_VERSION}, exposes it as /usr/local/mpi, and
# replaces mpirun with a wrapper that always passes --allow-run-as-root and
# --prefix /usr/local/mpi to the real binary (mpirun.real).
# Only this step's own build directory is cleared. ${STAGE_DIR} is /tmp, and
# removing and recreating /tmp itself would drop its sticky, world-writable
# mode.
ENV PATH=/usr/local/mpi/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
RUN source /opt/conda/etc/profile.d/conda.sh \
    && conda activate ${CONDA_ENV_NAME} \
    && mkdir -p ${STAGE_DIR} \
    && rm -rf ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} \
    && cd ${STAGE_DIR} \
    && wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - \
    && cd openmpi-${OPENMPI_VERSION} \
    && ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} \
    && make -j"$(nproc)" install \
    && ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi \
    && test -f /usr/local/mpi/bin/mpic++ \
    && cd ${STAGE_DIR} \
    && rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} \
    && mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real \
    && echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun \
    && echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun \
    && chmod a+x /usr/local/mpi/bin/mpirun
# SSH daemon port inside container cannot conflict with host OS port
# ENV SSH_PORT=2222
# RUN source /opt/conda/etc/profile.d/conda.sh && \
# conda activate ${CONDA_ENV_NAME} && \
# cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
# sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
# Adds a "deepspeed" account (UID 1000, bash login shell, home directory
# created), puts it in the sudo group and grants it passwordless sudo.
RUN source /opt/conda/etc/profile.d/conda.sh \
    && conda activate ${CONDA_ENV_NAME} \
    && useradd --create-home --uid 1000 --shell /bin/bash deepspeed \
    && usermod -aG sudo deepspeed \
    && echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
# install cutlass https://github.com/NVIDIA/cutlass
# GPU architecture reference:
#   H100: Hopper (cutlass needs: cmake .. -DCUTLASS_NVCC_ARCHS="90a")
#   A100: Ampere
#   V100: Volta
#   T4:   Turing
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
# Values accepted in the architecture list:
#   70  - NVIDIA Volta  (e.g. Tesla V100)
#   75  - NVIDIA Turing (e.g. Tesla T4)
#   80  - NVIDIA Ampere (e.g. A100)
#   90a - NVIDIA Hopper (e.g. H100)
#   89  - GeForce RTX 4090
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
# Clones cutlass into /opt/cutlass, then configures (tests disabled, unity
# build enabled) and installs it for the architectures listed above.
# NOTE(review): the build tracks the "master" branch rather than a release tag;
# confirm that branch still exists upstream and consider pinning a tag.
RUN source /opt/conda/etc/profile.d/conda.sh \
    && conda activate ${CONDA_ENV_NAME} \
    && git clone https://github.com/NVIDIA/cutlass /opt/cutlass \
    && cd /opt/cutlass \
    && git checkout . \
    && git checkout master \
    && mkdir build \
    && cd build \
    && cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON \
    && make -j"$(nproc)" install \
    && cd ..
# Some Packages from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
# RUN source /opt/conda/etc/profile.d/conda.sh && \
# conda activate ${CONDA_ENV_NAME} && \
# apt-get update && \
# apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev && \
# python -m pip install pipdeptree psutil yappi cffi ipdb pandas matplotlib py3nvml pyarrow graphviz astor boto3 tqdm sentencepiece msgpack requests pandas sphinx sphinx_rtd_theme scipy numpy scikit-learn nvidia-ml-py3 mpi4py
# install deepspeed step 1
# Pins setuptools in the conda environment, then builds oneCCL from its master
# branch and installs it under /usr/local.
RUN source /opt/conda/etc/profile.d/conda.sh \
    && conda activate ${CONDA_ENV_NAME} \
    && /opt/conda/envs/${CONDA_ENV_NAME}/bin/python -m pip install setuptools==${SETUPTOOLS_VERSION} \
    && git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL \
    && cd ${STAGE_DIR}/oneCCL \
    && git checkout . \
    && git checkout master \
    && mkdir build \
    && cd build \
    && cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local \
    && make -j"$(nproc)" install
# install deepspeed step 2
# Builds and installs DeepSpeed-Kernels from source. CUDA_ARCH_LIST is both
# exported as ENV and passed inline to the pip build command.
ARG CUDA_ARCH_LIST="80;86;89;90"
ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
RUN source /opt/conda/etc/profile.d/conda.sh \
    && conda activate ${CONDA_ENV_NAME} \
    && git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels \
    && cd ${STAGE_DIR}/DeepSpeed-Kernels \
    && CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python -m pip install -v .
# DeepSpeed build configuration. Every build argument is re-exported as an
# environment variable of the same name so the install step (and the running
# container) can read it.
ARG DEEPSPEED_VERSION="v0.14.3"
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
ARG DS_BUILD_SPARSE_ATTN=0
ARG DS_BUILD_FUSED_ADAM=1
ARG DS_BUILD_CPU_ADAM=1
ARG DS_BUILD_OPS=1
ARG HOSTFILE_CONTENT=""
ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION} \
    DEEPSPEED_INSTALL_FLAGS="${DEEPSPEED_INSTALL_FLAGS}" \
    DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN} \
    DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM} \
    DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} \
    DS_BUILD_OPS=${DS_BUILD_OPS} \
    HOSTFILE_CONTENT=${HOSTFILE_CONTENT} \
    CUTLASS_PATH='/opt/cutlass' \
    CUDA_HOME='/usr/local/cuda'
# Separate instruction: these two values read CUDA_HOME, which is only visible
# once the ENV instruction above has been applied.
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} \
    PATH=${CUDA_HOME}/bin:${PATH}
# install deepspeed step 3
# Checks out DeepSpeed at ${DEEPSPEED_VERSION}, runs a copy of its install.sh
# rewritten to call `python -m pip` (optionally with a hostfile built from
# ${HOSTFILE_CONTENT}), then writes and runs a small script that loads each op
# builder, and finally prints ds_report.
#
# Heredoc notes:
# - The terminator must be the only text on its line, so the follow-up commands
#   are chained on the line that opens the heredoc rather than after "EOF".
# - The delimiter is quoted so the shell writes the Python source verbatim.
# - The script is written to the home directory and executed by that same path.
RUN source /opt/conda/etc/profile.d/conda.sh \
    && conda activate ${CONDA_ENV_NAME} \
    && git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed \
    && cd ${STAGE_DIR}/DeepSpeed \
    && git checkout ${DEEPSPEED_VERSION} \
    && sed 's/pip install/python -m pip install/' install.sh > install_modified.sh \
    && chmod +x ./install_modified.sh \
    && if [ -n "${HOSTFILE_CONTENT}" ]; then \
         echo "${HOSTFILE_CONTENT}" > /tmp/hostfile && \
         INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"; \
       else \
         INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"; \
       fi \
    && eval $INSTALL_CMD \
    && cat <<'EOF' > ~/compile_deepspeed_ops.py && python ~/compile_deepspeed_ops.py && ds_report
import deepspeed


def compile_ops():
    """Instantiate every listed op builder and load (compile) its op."""
    builders = [
        deepspeed.ops.op_builder.AsyncIOBuilder,
        deepspeed.ops.op_builder.FusedAdamBuilder,
        deepspeed.ops.op_builder.CPUAdamBuilder,
        deepspeed.ops.op_builder.CPUAdagradBuilder,
        deepspeed.ops.op_builder.CPULionBuilder,
        deepspeed.ops.op_builder.EvoformerAttnBuilder,
        deepspeed.ops.op_builder.FPQuantizerBuilder,
        deepspeed.ops.op_builder.FusedLambBuilder,
        deepspeed.ops.op_builder.FusedLionBuilder,
        deepspeed.ops.op_builder.QuantizerBuilder,
        deepspeed.ops.op_builder.RaggedOpsBuilder,
        deepspeed.ops.op_builder.RandomLTDBuilder,
        deepspeed.ops.op_builder.SparseAttnBuilder,
        deepspeed.ops.op_builder.SpatialInferenceBuilder,
        deepspeed.ops.op_builder.TransformerBuilder,
        deepspeed.ops.op_builder.StochasticTransformerBuilder,
    ]
    for builder in builders:
        print(f"Compiling {builder.__name__}")
        builder().load()


if __name__ == "__main__":
    compile_ops()
EOF
# install transformers and flash-attn
# Clones transformers into ${STAGE_DIR} and installs it in development mode,
# constrains pydantic to the 1.x series, then installs flash-attn from PyPI
# without build isolation.
RUN source /opt/conda/etc/profile.d/conda.sh \
    && conda activate ${CONDA_ENV_NAME} \
    && git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers \
    && cd ${STAGE_DIR}/transformers \
    && python3 ./setup.py develop \
    && python3 -m pip install -U --no-cache-dir "pydantic<2" \
    && pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
# other packages
# TORCH_CUDA_ARCH_LIST is read by PyTorch's C++/CUDA extension builder, which
# expects dotted compute capabilities ("8.0"), not the compact "80" form used
# by CUDA_ARCH_LIST / CUTLASS_NVCC_ARCHS elsewhere in this file.
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
# Remaining Python packages; --no-cache-dir keeps pip's wheel cache out of the
# image layer.
RUN source /opt/conda/etc/profile.d/conda.sh \
    && conda activate ${CONDA_ENV_NAME} \
    && pip3 install --no-cache-dir optimum \
    && pip3 install --no-cache-dir peft tiktoken tqdm matplotlib seaborn numpy pandas scikit-learn diffusers huggingface_hub spacy blobfile pycocotools open_clip_torch zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
# Default dataset locations, overridable at build time. The ENV lines must
# reference the build arguments with ${...}; a bare name would store the
# literal variable name as the value.
ARG DEEPSPEED_TRAIN='/data/train_data'
ENV DEEPSPEED_TRAIN=${DEEPSPEED_TRAIN}
ARG DEEPSPEED_VALIDATION='/data/validation_data'
ENV DEEPSPEED_VALIDATION=${DEEPSPEED_VALIDATION}
# NOTE(review): declared as a build argument only, so it is not exported into
# the container environment. Add a matching ENV if NCCL should see it at
# runtime.
ARG NCCL_SOCKET_IFNAME='eth0'
# Run the SSH daemon in the foreground as the container's main process.
CMD ["/usr/sbin/sshd", "-D"]
# CMD ["/bin/bash", "-c", "/usr/sbin/sshd -D & while true; do sleep 1000; done"]