update
This commit is contained in:
@@ -17,7 +17,7 @@ SHELL ["/bin/bash", "-c"]
|
|||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
apt-get update
|
apt-get update
|
||||||
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip
|
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2
|
||||||
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
|
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
|
||||||
add-apt-repository ppa:git-core/ppa -y
|
add-apt-repository ppa:git-core/ppa -y
|
||||||
apt-get install -y git libnuma-dev wget
|
apt-get install -y git libnuma-dev wget
|
||||||
@@ -58,8 +58,8 @@ echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@m
|
|||||||
micromamba shell init -s bash -p ~/micromamba
|
micromamba shell init -s bash -p ~/micromamba
|
||||||
cat <<'EOF' >> ~/.bashrc
|
cat <<'EOF' >> ~/.bashrc
|
||||||
source ~/micromamba/etc/profile.d/micromamba.sh
|
source ~/micromamba/etc/profile.d/micromamba.sh
|
||||||
alias mamba=micromamba
|
echo "alias mamba=micromamba" >> ~/.bashrc
|
||||||
alias mba=mamba
|
echo "alias mba=mamba" >> ~/.bashrc
|
||||||
EOF
|
EOF
|
||||||
# 配置 .mambarc 文件
|
# 配置 .mambarc 文件
|
||||||
cat <<EOF > ~/.mambarc
|
cat <<EOF > ~/.mambarc
|
||||||
@@ -80,7 +80,8 @@ ARG CONDA_ENV_NAME="deepspeed"
|
|||||||
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
|
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
|
||||||
ARG PYTHON_VERSION=3.10
|
ARG PYTHON_VERSION=3.10
|
||||||
ENV PYTHON_VERSION=${PYTHON_VERSION}
|
ENV PYTHON_VERSION=${PYTHON_VERSION}
|
||||||
ENV PATH /opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:$PATH
|
ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH
|
||||||
|
ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3"
|
||||||
ENV REF='main'
|
ENV REF='main'
|
||||||
ENV STAGE_DIR=/tmp
|
ENV STAGE_DIR=/tmp
|
||||||
ENV NV_PEER_MEM_VERSION=1.2
|
ENV NV_PEER_MEM_VERSION=1.2
|
||||||
@@ -113,6 +114,7 @@ RUN <<EOT
|
|||||||
source /opt/conda/etc/profile.d/conda.sh
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} -c conda-forge -y
|
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} -c conda-forge -y
|
||||||
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
|
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
|
||||||
|
which python > ~/python_path.txt
|
||||||
conda activate ${CONDA_ENV_NAME}
|
conda activate ${CONDA_ENV_NAME}
|
||||||
# 克隆 ninja 源码并编译
|
# 克隆 ninja 源码并编译
|
||||||
git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja
|
git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja
|
||||||
@@ -297,7 +299,7 @@ EOT
|
|||||||
# 80:适用于 NVIDIA Ampere 架构(如 A100)。
|
# 80:适用于 NVIDIA Ampere 架构(如 A100)。
|
||||||
# 90a:适用于 NVIDIA Hopper 架构(如 H100)。
|
# 90a:适用于 NVIDIA Hopper 架构(如 H100)。
|
||||||
# 89:GeForce RTX 4090
|
# 89:GeForce RTX 4090
|
||||||
ARG DCUTLASS_NVCC_ARCHS="89"
|
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
|
||||||
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
|
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
|
||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
@@ -316,28 +318,12 @@ cd ..
|
|||||||
# make test_unit_gemm_warp -j"$(nproc)"
|
# make test_unit_gemm_warp -j"$(nproc)"
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
# CUDA_ARCH_LIST="80;86;89;90"
|
# install deepspeed step 1
|
||||||
ARG DEEPSPEED_VERSION="0.14.3"
|
|
||||||
ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
|
|
||||||
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --no_clean"
|
|
||||||
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
|
|
||||||
ARG CUDA_ARCH_LIST="80;86;89;90"
|
|
||||||
ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
|
|
||||||
ARG DS_BUILD_SPARSE_ATTN=0
|
|
||||||
ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
|
|
||||||
ARG DS_BUILD_FUSED_ADAM=1
|
|
||||||
ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
|
|
||||||
ARG DS_BUILD_CPU_ADAM=1
|
|
||||||
ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
|
|
||||||
ARG DS_BUILD_OPS=1
|
|
||||||
ENV DS_BUILD_OPS=${DS_BUILD_OPS}
|
|
||||||
ENV CUTLASS_PATH=/opt/cutlass
|
|
||||||
# install deepspeed
|
|
||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
source /opt/conda/etc/profile.d/conda.sh
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
conda activate ${CONDA_ENV_NAME}
|
conda activate ${CONDA_ENV_NAME}
|
||||||
python -m pip install setuptools==${SETUPTOOLS_VERSION}
|
/opt/conda/envs/${CONDA_ENV_NAME}/bin/python -m pip install setuptools==${SETUPTOOLS_VERSION}
|
||||||
# install oneapi for deepspeed
|
# install oneapi for deepspeed
|
||||||
git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
|
git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
|
||||||
cd ${STAGE_DIR}/oneCCL
|
cd ${STAGE_DIR}/oneCCL
|
||||||
@@ -349,30 +335,56 @@ cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
|
|||||||
make -j"$(nproc)" install
|
make -j"$(nproc)" install
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
|
# install deepspeed step 2
|
||||||
|
ARG CUDA_ARCH_LIST="80;86;89;90"
|
||||||
|
ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
|
||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
source /opt/conda/etc/profile.d/conda.sh
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
conda activate ${CONDA_ENV_NAME}
|
conda activate ${CONDA_ENV_NAME}
|
||||||
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
|
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
|
||||||
cd ${STAGE_DIR}/DeepSpeed-Kernels
|
cd ${STAGE_DIR}/DeepSpeed-Kernels
|
||||||
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python setup.py bdist_wheel
|
# CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python setup.py bdist_wheel
|
||||||
pip install dist/deepspeed_kernels-*.whl
|
# pip install dist/deepspeed_kernels-*.whl
|
||||||
# CUDA_ARCH_LIST=${CUDA_ARCH_LIST} pip install -v .
|
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python -m pip install -v .
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
|
ARG DEEPSPEED_VERSION="0.14.3"
|
||||||
|
ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
|
||||||
|
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --no_clean"
|
||||||
|
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
|
||||||
|
ARG DS_BUILD_SPARSE_ATTN=0
|
||||||
|
ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
|
||||||
|
ARG DS_BUILD_FUSED_ADAM=1
|
||||||
|
ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
|
||||||
|
ARG DS_BUILD_CPU_ADAM=1
|
||||||
|
ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
|
||||||
|
ARG DS_BUILD_OPS=1
|
||||||
|
ENV DS_BUILD_OPS=${DS_BUILD_OPS}
|
||||||
|
ARG HOSTFILE_CONTENT=""
|
||||||
|
ENV HOSTFILE_CONTENT=${HOSTFILE_CONTENT}
|
||||||
|
ENV CUTLASS_PATH=/opt/cutlass
|
||||||
|
|
||||||
|
# install deepspeed step 3
|
||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
source /opt/conda/etc/profile.d/conda.sh
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
conda activate ${CONDA_ENV_NAME}
|
conda activate ${CONDA_ENV_NAME}
|
||||||
git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
|
git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
|
||||||
cd ${STAGE_DIR}/DeepSpeed
|
cd ${STAGE_DIR}/DeepSpeed
|
||||||
git checkout .
|
git checkout .
|
||||||
# git checkout v${DEEPSPEED_VERSION}
|
git checkout ${DEEPSPEED_VERSION}
|
||||||
python setup.py bdist_wheel
|
# 修改 install.sh 脚本中的 python 解释器路径
|
||||||
DS_BUILD_OPS=${DS_BUILD_OPS} pip install dist/deepspeed*.whl --force-reinstall
|
# sed "s|\bpython\b|/opt/conda/envs/${CONDA_ENV_NAME}/bin/python|g" install.sh > install_modified.sh
|
||||||
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -r requirements/requirements.txt
|
# chmod +x ./install_modified.sh
|
||||||
./install.sh ${DEEPSPEED_INSTALL_FLAGS}
|
# 检查 HOSTFILE_CONTENT 并写入文件
|
||||||
# ./install.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /job/hostfile # ./install.sh --allow_sudo --pip_sudo --no_clean --hostfile /path/to/your/hostfile
|
if [ -n "${HOSTFILE_CONTENT}" ]; then
|
||||||
|
echo "${HOSTFILE_CONTENT}" > /tmp/hostfile
|
||||||
|
INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"
|
||||||
|
else
|
||||||
|
INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"
|
||||||
|
fi
|
||||||
|
eval $INSTALL_CMD
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
# install transformers and flash-attn
|
# install transformers and flash-attn
|
||||||
@@ -410,6 +422,4 @@ EOT
|
|||||||
# code-server --install-extension ms-python.vscode-pylance
|
# code-server --install-extension ms-python.vscode-pylance
|
||||||
# EOT
|
# EOT
|
||||||
|
|
||||||
# 启动 ssh 服务
|
|
||||||
# CMD ["/bin/bash", "-c", "service ssh start; tail -f /dev/null"]
|
|
||||||
CMD ["/usr/sbin/sshd", "-D"]
|
CMD ["/usr/sbin/sshd", "-D"]
|
||||||
@@ -34,7 +34,7 @@ services:
|
|||||||
CUDA: cu121
|
CUDA: cu121
|
||||||
CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
|
CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
|
||||||
SETUPTOOLS_VERSION: "69.5.1"
|
SETUPTOOLS_VERSION: "69.5.1"
|
||||||
DCUTLASS_NVCC_ARCHS: "90a" # 90a for H100 GPU 89:GeForce RTX 4090
|
DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090
|
||||||
volumes:
|
volumes:
|
||||||
- ./src:/bbtft
|
- ./src:/bbtft
|
||||||
container_name: ubuntu-finetune
|
container_name: ubuntu-finetune
|
||||||
|
|||||||
Reference in New Issue
Block a user