This commit is contained in:
Your Name
2024-06-23 16:46:22 +00:00
parent 8aebb93e89
commit 13ba53eaca
2 changed files with 45 additions and 43 deletions

View File

@@ -138,7 +138,7 @@ python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) # # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
python3 -m pip uninstall -y torch torchvision torchaudio python3 -m pip uninstall -y torch torchvision torchaudio
# # install pytorch (conda env already exists) # # install pytorch (conda env already exists)
python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA} python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA}
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
python3 -m pip uninstall -y transformer-engine python3 -m pip uninstall -y transformer-engine
python3 -m pip uninstall -y torch-tensorrt python3 -m pip uninstall -y torch-tensorrt
@@ -350,7 +350,7 @@ cd ${STAGE_DIR}/DeepSpeed-Kernels
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python -m pip install -v . CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python -m pip install -v .
EOT EOT
ARG DEEPSPEED_VERSION="0.14.3" ARG DEEPSPEED_VERSION="v0.14.3"
ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION} ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose" ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS} ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
@@ -373,51 +373,53 @@ source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME} conda activate ${CONDA_ENV_NAME}
git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
cd ${STAGE_DIR}/DeepSpeed cd ${STAGE_DIR}/DeepSpeed
git checkout . git checkout ${DEEPSPEED_VERSION}
git checkout v${DEEPSPEED_VERSION} sed 's/pip install/python -m pip install/' install.sh > install_modified.sh
# python setup.py bdist_wheel chmod +x ./install_modified.sh
# 修改 install.sh 脚本中的 python 解释器路径
# sed "s|\bpython\b|/opt/conda/envs/${CONDA_ENV_NAME}/bin/python|g" install.sh > install_modified.sh
# chmod +x ./install_modified.sh
# 检查 HOSTFILE_CONTENT 并写入文件 # 检查 HOSTFILE_CONTENT 并写入文件
if [ -n "${HOSTFILE_CONTENT}" ]; then if [ -n "${HOSTFILE_CONTENT}" ]; then
echo "${HOSTFILE_CONTENT}" > /tmp/hostfile echo "${HOSTFILE_CONTENT}" > /tmp/hostfile
INSTALL_CMD="./install.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile" INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"
else else
INSTALL_CMD="./install.sh ${DEEPSPEED_INSTALL_FLAGS}" INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"
fi fi
# eval $INSTALL_CMD eval $INSTALL_CMD
# clean up
# rm -f deepspeed/git_version_info_installed.py
# rm -rf dist build deepspeed.egg-info
# python setup.py bdist_wheel
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl # DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt # DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt
# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0 # pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0
EOT EOT
# install transformers and flash-attn # install transformers and flash-attn
# RUN <<EOT RUN <<EOT
# #!/bin/bash #!/bin/bash
# source /opt/conda/etc/profile.d/conda.sh source /opt/conda/etc/profile.d/conda.sh
# conda activate ${CONDA_ENV_NAME} conda activate ${CONDA_ENV_NAME}
# # install transformers # install transformers
# git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers
# cd ${STAGE_DIR}/transformers cd ${STAGE_DIR}/transformers
# python3 ./setup.py develop python3 ./setup.py develop
# python3 -m pip install -U --no-cache-dir "pydantic<2" python3 -m pip install -U --no-cache-dir "pydantic<2"
# # install flash-attn # install flash-attn
# # pip install packaging -i https://pypi.org/simple/ --trusted-host pypi.org # pip install packaging -i https://pypi.org/simple/ --trusted-host pypi.org
# pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
# EOT EOT
# other packages # other packages
# RUN <<EOT ENV TORCH_CUDA_ARCH_LIST="80;86;89;90"
# #!/bin/bash RUN <<EOT
# source /opt/conda/etc/profile.d/conda.sh #!/bin/bash
# conda activate ${CONDA_ENV_NAME} source /opt/conda/etc/profile.d/conda.sh
# pip install optimum conda activate ${CONDA_ENV_NAME}
# pip install peft tiktoken \ pip3 install optimum
# tqdm matplotlib seaborn numpy pandas scikit-learn diffusers \ pip3 install peft tiktoken \
# huggingface_hub spacy blobfile pycocotools \ tqdm matplotlib seaborn numpy pandas scikit-learn diffusers \
# xformers open_clip_torch \ huggingface_hub spacy blobfile pycocotools \
# zstandard -i https://pypi.org/simple/ --trusted-host pypi.org open_clip_torch \
# EOT zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
EOT
CMD ["/usr/sbin/sshd", "-D"] CMD ["/usr/sbin/sshd", "-D"]

View File

@@ -21,13 +21,13 @@ services:
args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822
PYTHON_VERSION: "3.10" PYTHON_VERSION: "3.10"
CUDA_VERSION: "12.1.0" CUDA_VERSION: "12.1.0"
PYTORCH_VERSION: "2.3.1" PYTORCH_VERSION: "2.3.0"
TORCHVISION_VERSION: "0.18.1" TORCHVISION_VERSION: "0.18.0"
TORCHAUDIO_VERSION: "2.3.1" TORCHAUDIO_VERSION: "2.3.0"
DS_BUILD_OPS: 1 DS_BUILD_OPS: 1
DS_BUILD_SPARSE_ATTN: 0 # DS_BUILD_SPARSE_ATTN: 0
DS_BUILD_FUSED_ADAM: 1 # DS_BUILD_FUSED_ADAM: 1
DS_BUILD_CPU_ADAM: 1 # DS_BUILD_CPU_ADAM: 1
USE_CUDA: 1 USE_CUDA: 1
USE_ROCM: 0 USE_ROCM: 0
USE_XPU: 0 USE_XPU: 0
@@ -35,8 +35,8 @@ services:
CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
SETUPTOOLS_VERSION: "69.5.1" SETUPTOOLS_VERSION: "69.5.1"
DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090
DEEPSPEED_VERSION: "0.14.3" DEEPSPEED_VERSION: "master"
DEEPSPEED_INSTALL_FLAGS: "--allow_sudo --pip_sudo --verbose" DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
volumes: volumes:
- ./src:/bbtft - ./src:/bbtft
container_name: ubuntu-finetune container_name: ubuntu-finetune