From 11ccadb0dfebf397d0ac41e6c9c7cc9ce61e6681 Mon Sep 17 00:00:00 2001 From: hotwa Date: Fri, 21 Jun 2024 11:52:34 +0800 Subject: [PATCH] update --- finetune/Dockerfile | 50 ++++++++++++++----------- finetune/docker-compose_pytorch1.13.yml | 5 ++- finetune/docker-compose_pytorch2.3.yml | 18 ++++++++- 3 files changed, 50 insertions(+), 23 deletions(-) diff --git a/finetune/Dockerfile b/finetune/Dockerfile index 21b5b93..fe8a923 100644 --- a/finetune/Dockerfile +++ b/finetune/Dockerfile @@ -3,13 +3,8 @@ FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 ARG DEBIAN_FRONTEND="noninteractive" ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} ENV MAMBA_ROOT_PREFIX=~/micromamba -ARG CONDA_ENV_NAME="deepspeed" -ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} -ARG PYTHON_VERSION=3.10 -ENV PYTHON_VERSION=${PYTHON_VERSION} ARG ROOT_PASSWD="root" ENV ROOT_PASSWD=${ROOT_PASSWD} -ENV PATH /opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH WORKDIR /root SHELL ["/bin/bash", "-c"] # base tools @@ -41,7 +36,6 @@ rm /tmp/miniconda.sh conda init bash ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc -echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc # 配置 .condarc 文件 cat < ~/.condarc channels: @@ -76,6 +70,11 @@ EOT # reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile # PyTorch +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ENV PATH /opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH ENV REF='main' ENV STAGE_DIR=/tmp ENV NV_PEER_MEM_VERSION=1.2 @@ -95,38 +94,47 @@ ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} ENV MLNX_OFED_VERSION=4.9-7.1.0.0 ARG SETUPTOOLS_VERSION=69.5.1 ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +ARG USE_CUDA=1 +ENV USE_CUDA=${USE_CUDA} +ARG USE_ROCM=0 +ENV USE_ROCM=${USE_ROCM} +ARG USE_XPU=0 +ENV USE_XPU=${USE_XPU} +ARG _GLIBCXX_USE_CXX11_ABI=1 +ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI} RUN <> ~/.bashrc conda activate ${CONDA_ENV_NAME} # 克隆 ninja 源码并编译 git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja cd ${STAGE_DIR}/ninja # 克隆 GoogleTest 源码 git clone https://github.com/google/googletest.git -conda run -n ${CONDA_ENV_NAME} python ./configure.py --bootstrap +python ./configure.py --bootstrap # 配置并构建 Ninja 测试,添加 pthread 链接选项 # CXXFLAGS="-pthread" LDFLAGS="-pthread" ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest" ./ninja all # 运行 Ninja 单元测试 ./ninja_test -conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} pyyaml ipython -c conda-forge -y python3 -m pip install --no-cache-dir --upgrade pip -conda run -n ${CONDA_ENV_NAME} python -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python +python -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python conda clean -afy git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. -conda run -n ${CONDA_ENV_NAME} python -m pip install setuptools==${SETUPTOOLS_VERSION} -conda run -n ${CONDA_ENV_NAME} python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +python -m pip install setuptools==${SETUPTOOLS_VERSION} +python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] # # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) # # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) -conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y torch torchvision torchaudio +python3 -m pip uninstall -y torch torchvision torchaudio # # install pytorch create conda env aleay exists -conda run -n ${CONDA_ENV_NAME} python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA} -conda run -n ${CONDA_ENV_NAME} python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate -conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y transformer-engine -conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y torch-tensorrt -conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y apex +python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA} +python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +python3 -m pip uninstall -y transformer-engine +python3 -m pip uninstall -y torch-tensorrt +python3 -m pip uninstall -y apex EOT # install apex @@ -305,11 +313,11 @@ EOT # CUDA_ARCH_LIST="80;86;89;90" ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --no_clean" ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS} -ARG CUDA_ARCH_LIST="80;86" +ARG CUDA_ARCH_LIST="80;86;89;90" ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST} ARG DS_BUILD_SPARSE_ATTN=0 ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN} -ARG DS_BUILD_FUSED_ADAM=0 +ARG DS_BUILD_FUSED_ADAM=1 ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM} ARG DS_BUILD_CPU_ADAM=0 ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} @@ -341,9 +349,9 @@ cd ${STAGE_DIR}/DeepSpeed git checkout . git checkout master python setup.py bdist_wheel -pip install dist/deepspeed*.whl --force-reinstall +DS_BUILD_OPS=${DS_BUILD_OPS} pip install dist/deepspeed*.whl --force-reinstall # DS_BUILD_OPS=${DS_BUILD_OPS} pip install -r requirements/requirements.txt -# DS_BUILD_OPS=0 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_CPU_ADAM=0 DS_BUILD_FUSED_ADAM=0 pip install -U --no-cache-dir . +# DS_BUILD_OPS=0 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_CPU_ADAM=0 DS_BUILD_FUSED_ADAM=1 pip install -U --no-cache-dir . # ./install.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /job/hostfile # ./install.sh --allow_sudo --pip_sudo --no_clean --hostfile /path/to/your/hostfile cd .. # rm -rf ${STAGE_DIR}/DeepSpeed diff --git a/finetune/docker-compose_pytorch1.13.yml b/finetune/docker-compose_pytorch1.13.yml index 7004a47..f52a727 100644 --- a/finetune/docker-compose_pytorch1.13.yml +++ b/finetune/docker-compose_pytorch1.13.yml @@ -13,7 +13,10 @@ services: DS_BUILD_OPS: 1 DS_BUILD_SPARSE_ATTN: 0 DS_BUILD_FUSED_ADAM: 1 - DS_BUILD_CPU_ADAM: 0 + DS_BUILD_CPU_ADAM: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 CUDA: cu117 CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" SETUPTOOLS_VERSION: "69.5.1" diff --git a/finetune/docker-compose_pytorch2.3.yml b/finetune/docker-compose_pytorch2.3.yml index 3dfb32c..73b978e 100644 --- a/finetune/docker-compose_pytorch2.3.yml +++ b/finetune/docker-compose_pytorch2.3.yml @@ -1,5 +1,18 @@ version: '3.8' +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 专门用于Transformer模型的推理优化。 +# CPUAdam - 针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 +# StochasticTransformer - 支持随机Transformer模型的训练和推理。 + services: ubuntu-finetune: build: @@ -13,7 +26,10 @@ services: DS_BUILD_OPS: 1 DS_BUILD_SPARSE_ATTN: 0 DS_BUILD_FUSED_ADAM: 1 - DS_BUILD_CPU_ADAM: 0 + DS_BUILD_CPU_ADAM: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 CUDA: cu121 CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" SETUPTOOLS_VERSION: "69.5.1"