From 4a779d06ff71c4c54518f6e31cf075c180537af5 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 22 Jun 2024 07:33:12 +0000 Subject: [PATCH] update --- finetune/Dockerfile | 80 +++++++++++++++----------- finetune/docker-compose_pytorch2.3.yml | 2 +- 2 files changed, 46 insertions(+), 36 deletions(-) diff --git a/finetune/Dockerfile b/finetune/Dockerfile index eb75db6..2097ab3 100644 --- a/finetune/Dockerfile +++ b/finetune/Dockerfile @@ -17,7 +17,7 @@ SHELL ["/bin/bash", "-c"] RUN <> ~/.bashrc source ~/micromamba/etc/profile.d/micromamba.sh -alias mamba=micromamba -alias mba=mamba +echo "alias mamba=micromamba" >> ~/.bashrc +echo "alias mba=mamba" >> ~/.bashrc EOF # 配置 .mambarc 文件 cat < ~/.mambarc @@ -80,7 +80,8 @@ ARG CONDA_ENV_NAME="deepspeed" ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} ARG PYTHON_VERSION=3.10 ENV PYTHON_VERSION=${PYTHON_VERSION} -ENV PATH /opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:$PATH +ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH +ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3" ENV REF='main' ENV STAGE_DIR=/tmp ENV NV_PEER_MEM_VERSION=1.2 @@ -113,6 +114,7 @@ RUN <> ~/.bashrc +which python > ~/python_path.txt conda activate ${CONDA_ENV_NAME} # 克隆 ninja 源码并编译 git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja @@ -297,7 +299,7 @@ EOT # 80:适用于 NVIDIA Ampere 架构(如 A100)。 # 90a:适用于 NVIDIA Hopper 架构(如 H100)。 # 89:GeForce RTX 4090 -ARG DCUTLASS_NVCC_ARCHS="89" +ARG DCUTLASS_NVCC_ARCHS="80;89;90a" ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} RUN < install_modified.sh +# chmod +x ./install_modified.sh +# 检查 HOSTFILE_CONTENT 并写入文件 +if [ -n "${HOSTFILE_CONTENT}" ]; then + echo "${HOSTFILE_CONTENT}" > /tmp/hostfile + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile" +else + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}" +fi +eval $INSTALL_CMD EOT # install transformers and flash-attn @@ -410,6 +422,4 @@ EOT # code-server --install-extension ms-python.vscode-pylance # EOT -# 启动 ssh 服务 -# CMD ["/bin/bash", "-c", "service ssh start; tail -f /dev/null"] CMD ["/usr/sbin/sshd", "-D"] \ No newline at end of file diff --git a/finetune/docker-compose_pytorch2.3.yml b/finetune/docker-compose_pytorch2.3.yml index 327d0d9..e75f75a 100644 --- a/finetune/docker-compose_pytorch2.3.yml +++ b/finetune/docker-compose_pytorch2.3.yml @@ -34,7 +34,7 @@ services: CUDA: cu121 CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" SETUPTOOLS_VERSION: "69.5.1" - DCUTLASS_NVCC_ARCHS: "90a" # 90a for H100 GPU 89:GeForce RTX 4090 + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 volumes: - ./src:/bbtft container_name: ubuntu-finetune