update
This commit is contained in:
@@ -138,7 +138,7 @@ python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
|
|||||||
# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
|
# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
|
||||||
python3 -m pip uninstall -y torch torchvision torchaudio
|
python3 -m pip uninstall -y torch torchvision torchaudio
|
||||||
# # install pytorch create conda env already exists
|
# # install pytorch create conda env already exists
|
||||||
python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA}
|
python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA}
|
||||||
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||||
python3 -m pip uninstall -y transformer-engine
|
python3 -m pip uninstall -y transformer-engine
|
||||||
python3 -m pip uninstall -y torch-tensorrt
|
python3 -m pip uninstall -y torch-tensorrt
|
||||||
@@ -350,7 +350,7 @@ cd ${STAGE_DIR}/DeepSpeed-Kernels
|
|||||||
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python -m pip install -v .
|
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python -m pip install -v .
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
ARG DEEPSPEED_VERSION="0.14.3"
|
ARG DEEPSPEED_VERSION="v0.14.3"
|
||||||
ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
|
ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
|
||||||
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
|
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
|
||||||
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
|
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
|
||||||
@@ -373,51 +373,53 @@ source /opt/conda/etc/profile.d/conda.sh
|
|||||||
conda activate ${CONDA_ENV_NAME}
|
conda activate ${CONDA_ENV_NAME}
|
||||||
git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
|
git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
|
||||||
cd ${STAGE_DIR}/DeepSpeed
|
cd ${STAGE_DIR}/DeepSpeed
|
||||||
git checkout .
|
git checkout ${DEEPSPEED_VERSION}
|
||||||
git checkout v${DEEPSPEED_VERSION}
|
sed 's/pip install/python -m pip install/' install.sh > install_modified.sh
|
||||||
# python setup.py bdist_wheel
|
chmod +x ./install_modified.sh
|
||||||
# 修改 install.sh 脚本中的 python 解释器路径
|
|
||||||
# sed "s|\bpython\b|/opt/conda/envs/${CONDA_ENV_NAME}/bin/python|g" install.sh > install_modified.sh
|
|
||||||
# chmod +x ./install_modified.sh
|
|
||||||
# 检查 HOSTFILE_CONTENT 并写入文件
|
# 检查 HOSTFILE_CONTENT 并写入文件
|
||||||
if [ -n "${HOSTFILE_CONTENT}" ]; then
|
if [ -n "${HOSTFILE_CONTENT}" ]; then
|
||||||
echo "${HOSTFILE_CONTENT}" > /tmp/hostfile
|
echo "${HOSTFILE_CONTENT}" > /tmp/hostfile
|
||||||
INSTALL_CMD="./install.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"
|
INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"
|
||||||
else
|
else
|
||||||
INSTALL_CMD="./install.sh ${DEEPSPEED_INSTALL_FLAGS}"
|
INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"
|
||||||
fi
|
fi
|
||||||
# eval $INSTALL_CMD
|
eval $INSTALL_CMD
|
||||||
|
# clean up
|
||||||
|
# rm -f deepspeed/git_version_info_installed.py
|
||||||
|
# rm -rf dist build deepspeed.egg-info
|
||||||
|
# python setup.py bdist_wheel
|
||||||
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl
|
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl
|
||||||
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt
|
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt
|
||||||
# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0
|
# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
# install transformers and flash-attn
|
# install transformers and flash-attn
|
||||||
# RUN <<EOT
|
RUN <<EOT
|
||||||
# #!/bin/bash
|
#!/bin/bash
|
||||||
# source /opt/conda/etc/profile.d/conda.sh
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
# conda activate ${CONDA_ENV_NAME}
|
conda activate ${CONDA_ENV_NAME}
|
||||||
# # install transformers
|
# install transformers
|
||||||
# git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers
|
git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers
|
||||||
# cd ${STAGE_DIR}/transformers
|
cd ${STAGE_DIR}/transformers
|
||||||
# python3 ./setup.py develop
|
python3 ./setup.py develop
|
||||||
# python3 -m pip install -U --no-cache-dir "pydantic<2"
|
python3 -m pip install -U --no-cache-dir "pydantic<2"
|
||||||
# # install flash-attn
|
# install flash-attn
|
||||||
# # pip install packaging -i https://pypi.org/simple/ --trusted-host pypi.org
|
# pip install packaging -i https://pypi.org/simple/ --trusted-host pypi.org
|
||||||
# pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
|
pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
|
||||||
# EOT
|
EOT
|
||||||
|
|
||||||
# other packages
|
# other packages
|
||||||
# RUN <<EOT
|
ENV TORCH_CUDA_ARCH_LIST="80;86;89;90"
|
||||||
# #!/bin/bash
|
RUN <<EOT
|
||||||
# source /opt/conda/etc/profile.d/conda.sh
|
#!/bin/bash
|
||||||
# conda activate ${CONDA_ENV_NAME}
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
# pip install optimum
|
conda activate ${CONDA_ENV_NAME}
|
||||||
# pip install peft tiktoken \
|
pip3 install optimum
|
||||||
# tqdm matplotlib seaborn numpy pandas scikit-learn diffusers \
|
pip3 install peft tiktoken \
|
||||||
# huggingface_hub spacy blobfile pycocotools \
|
tqdm matplotlib seaborn numpy pandas scikit-learn diffusers \
|
||||||
# xformers open_clip_torch \
|
huggingface_hub spacy blobfile pycocotools \
|
||||||
# zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
|
open_clip_torch \
|
||||||
# EOT
|
zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
|
||||||
|
EOT
|
||||||
|
|
||||||
CMD ["/usr/sbin/sshd", "-D"]
|
CMD ["/usr/sbin/sshd", "-D"]
|
||||||
@@ -21,13 +21,13 @@ services:
|
|||||||
args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822
|
args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822
|
||||||
PYTHON_VERSION: "3.10"
|
PYTHON_VERSION: "3.10"
|
||||||
CUDA_VERSION: "12.1.0"
|
CUDA_VERSION: "12.1.0"
|
||||||
PYTORCH_VERSION: "2.3.1"
|
PYTORCH_VERSION: "2.3.0"
|
||||||
TORCHVISION_VERSION: "0.18.1"
|
TORCHVISION_VERSION: "0.18.0"
|
||||||
TORCHAUDIO_VERSION: "2.3.1"
|
TORCHAUDIO_VERSION: "2.3.0"
|
||||||
DS_BUILD_OPS: 1
|
DS_BUILD_OPS: 1
|
||||||
DS_BUILD_SPARSE_ATTN: 0
|
# DS_BUILD_SPARSE_ATTN: 0
|
||||||
DS_BUILD_FUSED_ADAM: 1
|
# DS_BUILD_FUSED_ADAM: 1
|
||||||
DS_BUILD_CPU_ADAM: 1
|
# DS_BUILD_CPU_ADAM: 1
|
||||||
USE_CUDA: 1
|
USE_CUDA: 1
|
||||||
USE_ROCM: 0
|
USE_ROCM: 0
|
||||||
USE_XPU: 0
|
USE_XPU: 0
|
||||||
@@ -35,8 +35,8 @@ services:
|
|||||||
CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
|
CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
|
||||||
SETUPTOOLS_VERSION: "69.5.1"
|
SETUPTOOLS_VERSION: "69.5.1"
|
||||||
DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090
|
DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090
|
||||||
DEEPSPEED_VERSION: "0.14.3"
|
DEEPSPEED_VERSION: "master"
|
||||||
DEEPSPEED_INSTALL_FLAGS: "--allow_sudo --pip_sudo --verbose"
|
DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
|
||||||
volumes:
|
volumes:
|
||||||
- ./src:/bbtft
|
- ./src:/bbtft
|
||||||
container_name: ubuntu-finetune
|
container_name: ubuntu-finetune
|
||||||
|
|||||||
Reference in New Issue
Block a user