From be77975ce2f37f73fee5b40cfeb654ff51a07ccc Mon Sep 17 00:00:00 2001 From: hotwa Date: Fri, 21 Jun 2024 15:12:44 +0800 Subject: [PATCH] update --- finetune/Dockerfile | 22 +- finetune/Dockerfile.conda | 404 ------------------ finetune/Dockerfile.conda1 | 372 ---------------- finetune/docker-compose_pytorch1.13.yml | 2 +- finetune/docker-compose_pytorch1.13_conda.yml | 53 --- finetune/docker-compose_pytorch2.3.yml | 4 +- finetune/docker-compose_pytorch2.3_conda.yml | 63 --- 7 files changed, 20 insertions(+), 900 deletions(-) delete mode 100644 finetune/Dockerfile.conda delete mode 100644 finetune/Dockerfile.conda1 delete mode 100644 finetune/docker-compose_pytorch1.13_conda.yml delete mode 100644 finetune/docker-compose_pytorch2.3_conda.yml diff --git a/finetune/Dockerfile b/finetune/Dockerfile index 4b9ba41..cb3af52 100644 --- a/finetune/Dockerfile +++ b/finetune/Dockerfile @@ -317,7 +317,7 @@ cd .. EOT # CUDA_ARCH_LIST="80;86;89;90" -ARG DEEPSPEED_VERSION="0.8.3" +ARG DEEPSPEED_VERSION="0.14.3" ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION} ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --no_clean" ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS} @@ -339,7 +339,7 @@ source /opt/conda/etc/profile.d/conda.sh conda activate ${CONDA_ENV_NAME} python -m pip install setuptools==${SETUPTOOLS_VERSION} # install oneapi for deepspeed -git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL +git clone https://ghproxy.dockless.eu.org/https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL cd ${STAGE_DIR}/oneCCL git checkout . git checkout master @@ -347,15 +347,27 @@ mkdir build cd build cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local make -j"$(nproc)" install -git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels +EOT + +RUN <= 23.0. -# -# For reference: -# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel -ARG CUDA_VERSION=12.1.0 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 -ARG DEBIAN_FRONTEND="noninteractive" -ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} -ENV MAMBA_ROOT_PREFIX=~/micromamba -ARG ROOT_PASSWD="root" -ENV ROOT_PASSWD=${ROOT_PASSWD} -WORKDIR /root -SHELL ["/bin/bash", "-c"] -# base tools -RUN <> ~/.bashrc -# 配置 .condarc 文件 -cat < ~/.condarc -channels: - - conda-forge - - bioconda - - pytorch - - pytorch-nightly - - nvidia - - defaults -show_channel_urls: true -EOF -# 安装 micromamba -echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) -micromamba shell init -s bash -p ~/micromamba -cat <<'EOF' >> ~/.bashrc -source ~/micromamba/etc/profile.d/micromamba.sh -alias mamba=micromamba -alias mba=mamba -EOF -# 配置 .mambarc 文件 -cat < ~/.mambarc -channels: - - conda-forge - - bioconda - - pytorch - - pytorch-nightly - - nvidia - - defaults -show_channel_urls: true -EOF -EOT - -# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile -# PyTorch -ARG CONDA_ENV_NAME="deepspeed" -ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} -ARG PYTHON_VERSION=3.10 -ENV PYTHON_VERSION=${PYTHON_VERSION} -ENV PATH /opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH -ENV REF='main' -ENV STAGE_DIR=/tmp -ENV NV_PEER_MEM_VERSION=1.2 -ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 -ENV OPENMPI_BASEVERSION=4.1 -ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 -ARG CUDA_NUM='121' -ENV CUDA_NUM=${CUDA_NUM} -ARG CUDA='cu121' -ENV CUDA=${CUDA} -ARG PYTORCH_VERSION=2.3.0 -ENV PYTORCH_VERSION=${PYTORCH_VERSION} -ARG TORCHVISION_VERSION=0.18.0 -ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} -ARG TORCHAUDIO_VERSION=2.3.0 -ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} -ARG PYTORCH_CUDA_VERSION=12.1 -ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} -ENV TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" -ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" -ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" -ENV MLNX_OFED_VERSION=4.9-7.1.0.0 -ARG SETUPTOOLS_VERSION=69.5.1 -ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} -ARG USE_CUDA=1 -ENV USE_CUDA=${USE_CUDA} -ARG USE_ROCM=0 -ENV USE_ROCM=${USE_ROCM} -ARG USE_XPU=0 -ENV USE_XPU=${USE_XPU} -ARG _GLIBCXX_USE_CXX11_ABI=1 -ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI} -RUN <> ~/.bashrc -conda activate ${CONDA_ENV_NAME} -python3 -m pip install --no-cache-dir --upgrade pip -python -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python -conda clean -afy -git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. -python -m pip install setuptools==${SETUPTOOLS_VERSION} -python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] -# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) -# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) -python3 -m pip uninstall -y torch torchvision torchaudio -# # install pytorch create conda env aleay exists -git clone --recursive https://github.com/pytorch/pytorch ${STAGE_DIR}/pytorch -cd ${STAGE_DIR}/pytorch -git checkout v${PYTORCH_VERSION} -git submodule sync -git submodule update --init --recursive -pip install -r requirements.txt -conda install -y intel::mkl-static intel::mkl-include -conda install -y -c pytorch magma-cuda${CUDA_NUM} -export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} -python setup.py install -# python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA} -python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate -python3 -m pip uninstall -y transformer-engine -python3 -m pip uninstall -y torch-tensorrt -python3 -m pip uninstall -y apex -EOT - -# install apex -RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... -MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ -python -c "import apex.amp; print('Apex is installed and the amp module is available.')" -cd .. -rm -rf ${STAGE_DIR}/apex -EOT - -RUN <&1 -# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail -# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile -# install deepspeed prepare -# install Mellanox OFED -mkdir -p ${STAGE_DIR} -wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - -cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 -./mlnxofedinstall --user-space-only --without-fw-update --all -q -cd ${STAGE_DIR} -rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* -cd .. -# install nv_peer_mem -rm -rf ${STAGE_DIR} -mkdir -p ${STAGE_DIR} -git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory -cd ${STAGE_DIR}/nv_peer_memory -./build_module.sh -cd ${STAGE_DIR} -tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz -cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} -apt-get update -apt-get install -y dkms -dpkg-buildpackage -us -uc -dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb -EOT - -# install mpi -ENV PATH=/usr/local/mpi/bin:${PATH} -ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} -RUN < /usr/local/mpi/bin/mpirun -echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun -chmod a+x /usr/local/mpi/bin/mpirun -EOT - -# Some Packages -RUN < ${STAGE_DIR}/sshd_config && \ -sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config -EOT - -# 29.78 Usage: install.sh [options...] -# 29.78 -# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in -# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally -# 29.78 -# 29.78 [optional] -# 29.78 -l, --local_only Install only on local machine -# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) -# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) -# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels -# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) -# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) -# 29.78 -e, --examples Checkout deepspeed example submodule (no install) -# 29.78 -v, --verbose Verbose logging -# 29.78 -h, --help This help text - -RUN <> /etc/sudoers -EOT - -# install cutlass https://github.com/NVIDIA/cutlass -# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) -# A100: architecture is Ampere -# V100: architecture is Volta -# T4: architecture is Turing -# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc -# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 -# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 -# 80:适用于 NVIDIA Ampere 架构(如 A100)。 -# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 -# 89:GeForce RTX 4090 -ARG DCUTLASS_NVCC_ARCHS="89" -ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -RUN <= 23.0. -# -# For reference: -# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel - -ARG CUDA_VERSION=12.1.0 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 -ARG DEBIAN_FRONTEND="noninteractive" -ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} -ENV MAMBA_ROOT_PREFIX=~/micromamba -ARG ROOT_PASSWD="root" -ENV ROOT_PASSWD=${ROOT_PASSWD} -WORKDIR /root -SHELL ["/bin/bash", "-c"] - -# Base tools -RUN <> ~/.bashrc -cat < ~/.condarc -channels: - - conda-forge - - bioconda - - pytorch - - pytorch-nightly - - nvidia - - defaults -show_channel_urls: true -EOF -echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) -micromamba shell init -s bash -p ~/micromamba -cat <<'EOF' >> ~/.bashrc -source ~/micromamba/etc/profile.d/micromamba.sh -alias mamba=micromamba -alias mba=mamba -EOF -cat < ~/.mambarc -channels: - - conda-forge - - bioconda - - pytorch - - pytorch-nightly - - nvidia - - defaults -show_channel_urls: true -EOF -EOT - -# PyTorch -ARG CONDA_ENV_NAME="deepspeed" -ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} -ARG PYTHON_VERSION=3.10 -ENV PYTHON_VERSION=${PYTHON_VERSION} -ENV PATH /opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH -ARG PYTORCH_VERSION=2.3.0 -ENV PYTORCH_VERSION=${PYTORCH_VERSION} -ENV TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" -ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" -ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" -ARG CUDA_NUM='121' -ENV CUDA_NUM=${CUDA_NUM} - -RUN <> ~/.bashrc -conda activate ${CONDA_ENV_NAME} -python3 -m pip install --no-cache-dir --upgrade pip -conda clean -afy - -# 获取指定版本的 PyTorch 源代码 -git clone --recursive https://github.com/pytorch/pytorch ${STAGE_DIR}/pytorch -cd ${STAGE_DIR}/pytorch -git checkout v${PYTORCH_VERSION} -git submodule sync -git submodule update --init --recursive - -# 安装依赖项 -conda install -y intel::mkl-static intel::mkl-include -conda install -y -c pytorch magma-cuda${CUDA_NUM} - -# 构建和安装 PyTorch -export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} -python setup.py install - -# 安装其他必要的依赖项 -python -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python -python -m pip install setuptools==69.5.1 -python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] -python3 -m pip uninstall -y torch torchvision torchaudio -python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==0.18.0+${CUDA} torchaudio==2.3.0 --extra-index-url https://download.pytorch.org/whl/${CUDA} -python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate -python3 -m pip uninstall -y transformer-engine -python3 -m pip uninstall -y torch-tensorrt -python3 -m pip uninstall -y apex -EOT - -# install apex -RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... -MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ -python -c "import apex.amp; print('Apex is installed and the amp module is available.')" -cd .. -rm -rf ${STAGE_DIR}/apex -EOT - -RUN <&1 -# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail -# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile -# install deepspeed prepare -# install Mellanox OFED -mkdir -p ${STAGE_DIR} -wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - -cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 -./mlnxofedinstall --user-space-only --without-fw-update --all -q -cd ${STAGE_DIR} -rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* -cd .. -# install nv_peer_mem -rm -rf ${STAGE_DIR} -mkdir -p ${STAGE_DIR} -git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory -cd ${STAGE_DIR}/nv_peer_memory -./build_module.sh -cd ${STAGE_DIR} -tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz -cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} -apt-get update -apt-get install -y dkms -dpkg-buildpackage -us -uc -dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb -EOT - -# install mpi -ENV PATH=/usr/local/mpi/bin:${PATH} -ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} -RUN < /usr/local/mpi/bin/mpirun -echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun -chmod a+x /usr/local/mpi/bin/mpirun -EOT - -# Some Packages -RUN < ${STAGE_DIR}/sshd_config && \ -sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config -EOT - -# 29.78 Usage: install.sh [options...] -# 29.78 -# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in -# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally -# 29.78 -# 29.78 [optional] -# 29.78 -l, --local_only Install only on local machine -# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) -# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) -# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels -# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) -# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) -# 29.78 -e, --examples Checkout deepspeed example submodule (no install) -# 29.78 -v, --verbose Verbose logging -# 29.78 -h, --help This help text - -RUN <> /etc/sudoers -EOT - -# install cutlass https://github.com/NVIDIA/cutlass -# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) -# A100: architecture is Ampere -# V100: architecture is Volta -# T4: architecture is Turing -# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc -# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 -# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 -# 80:适用于 NVIDIA Ampere 架构(如 A100)。 -# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 -# 89:GeForce RTX 4090 -ARG DCUTLASS_NVCC_ARCHS="89" -ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -RUN <