From 8dc78b7faab20c7eac17493a02a6359b1fff57ef Mon Sep 17 00:00:00 2001
From: hotwa <pylyzeng@gmail.com>
Date: Thu, 20 Jun 2024 16:17:56 +0800
Subject: [PATCH] update

---
 finetune/Dockerfile                     | 387 +++++++++++++++++-------
 finetune/README.md                      |  12 +
 finetune/docker-compose.yml             |   1 +
 finetune/docker-compose_pytorch1.13.yml |  45 +++
 finetune/docker-compose_pytorch2.3.yml  |  45 +++
 5 files changed, 382 insertions(+), 108 deletions(-)
 create mode 100644 finetune/README.md
 create mode 100644 finetune/docker-compose_pytorch1.13.yml
 create mode 100644 finetune/docker-compose_pytorch2.3.yml

diff --git a/finetune/Dockerfile b/finetune/Dockerfile
index d78aa2f..903b0b1 100644
--- a/finetune/Dockerfile
+++ b/finetune/Dockerfile
@@ -1,8 +1,9 @@
-FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
+ARG CUDA_VERSION=12.1.0
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04
 ARG DEBIAN_FRONTEND="noninteractive"
 ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
 ENV MAMBA_ROOT_PREFIX=~/micromamba
-ARG CONDA_ENV_NAME="ldh"
+ARG CONDA_ENV_NAME="deepspeed"
 ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
 ARG PYTHON_VERSION=3.10
 ENV PYTHON_VERSION=${PYTHON_VERSION}
@@ -15,7 +16,7 @@ SHELL ["/bin/bash", "-c"]
 RUN <<EOT
 #!/bin/bash
 apt-get update
-apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server
+apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip
 apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
 add-apt-repository ppa:git-core/ppa -y
 apt-get install -y git libnuma-dev wget
@@ -38,9 +39,6 @@ wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
 bash /tmp/miniconda.sh -b -p /opt/conda 
 rm /tmp/miniconda.sh 
 conda init bash 
-conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} pyyaml ipython -y
-conda run -n ${CONDA_ENV_NAME} python -m pip install open_clip_torch vidia-ml-py3 opencv-contrib-python 
-conda clean -afy 
 ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
 echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc 
 echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
@@ -76,10 +74,29 @@ show_channel_urls: true
 EOF
 EOT
 
-# 安装 ninja 并测试
+# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+# PyTorch
+ENV REF='main'
+ENV STAGE_DIR=/tmp
+ENV NV_PEER_MEM_VERSION=1.2
+ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
+ENV OPENMPI_BASEVERSION=4.1
+ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
+ARG CUDA='cu121'
+ENV CUDA=${CUDA}
+ARG PYTORCH_VERSION=2.3.0
+ENV PYTORCH_VERSION=${PYTORCH_VERSION}
+ARG TORCHVISION_VERSION=0.18.0
+ENV TORCHVISION_VERSION=${TORCHVISION_VERSION}
+ARG TORCHAUDIO_VERSION=2.3.0
+ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION}
+ARG PYTORCH_CUDA_VERSION=12.1
+ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION}
+ENV MLNX_OFED_VERSION=4.9-7.1.0.0
+ARG SETUPTOOLS_VERSION=69.5.1
+ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
 RUN <<EOT
 #!/bin/bash
-# 安装 ninja
 source /opt/conda/etc/profile.d/conda.sh
 conda activate ${CONDA_ENV_NAME}
 # 克隆 ninja 源码并编译
@@ -89,116 +106,270 @@ cd ninja
 git clone https://github.com/google/googletest.git
 conda run -n ${CONDA_ENV_NAME} python ./configure.py --bootstrap
 # 配置并构建 Ninja 测试，添加 pthread 链接选项
-CXXFLAGS="-pthread" LDFLAGS="-pthread" ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest
+# The env must exist before the `conda run -n` calls that follow, so create it first.
+conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} pyyaml ipython -c conda-forge -y
+conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest"
 ./ninja all
 # 运行 Ninja 单元测试
 ./ninja_test
+conda run -n ${CONDA_ENV_NAME} python3 -m pip install --no-cache-dir --upgrade pip
+conda run -n ${CONDA_ENV_NAME} python -m pip install --no-cache-dir open_clip_torch nvidia-ml-py3 opencv-contrib-python
+conda clean -afy
+git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd ..
+conda run -n ${CONDA_ENV_NAME} python -m pip install setuptools==${SETUPTOOLS_VERSION}
+conda run -n ${CONDA_ENV_NAME} python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
+# PyTorch must be installed before pre-compiling any DeepSpeed C++/CUDA ops
+# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops).
+conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y torch torchvision torchaudio
+# Reinstall the pinned PyTorch builds into the env created above.
+conda run -n ${CONDA_ENV_NAME} python3 -m pip install --no-cache-dir torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA}
+conda run -n ${CONDA_ENV_NAME} python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
+conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y transformer-engine
+conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y torch-tensorrt
+conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y apex
 EOT
 
-# # deepspeed
-# ENV STAGE_DIR=/tmp
-# RUN <<EOT
-# #!/bin/bash
-# mkdir -p ${STAGE_DIR}
-# echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config
-# cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config
-# sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
-# EOT
-
-# # Mellanox OFED
-# WORKDIR ${STAGE_DIR}
-# ENV MLNX_OFED_VERSION=4.9-7.1.0.0
-# RUN wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - && \
-#     cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \
-#     ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
-#     cd ${STAGE_DIR} && \
-#     rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
-# # nv_peer_mem
-# ENV NV_PEER_MEM_VERSION=1.2
-# ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
-# RUN mkdir -p ${STAGE_DIR} && \
-#         git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
-#         cd ${STAGE_DIR}/nv_peer_memory && \
-#         ./build_module.sh && \
-#         cd ${STAGE_DIR} && \
-#         tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
-#         cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
-#         apt-get update && \
-#         apt-get install -y dkms && \
-#         dpkg-buildpackage -us -uc && \
-#         dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
-# # OPENMPI
-# ENV OPENMPI_BASEVERSION=4.1
-# ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
-# RUN <<EOT
-# #!/bin/bash
-# cd ${STAGE_DIR}
-# wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - 
-# cd openmpi-${OPENMPI_VERSION}
-# ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION}
-# make -j"$(nproc)" install
-# ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi
-# # Sanity check:
-# test -f /usr/local/mpi/bin/mpic++
-# cd ${STAGE_DIR}
-# rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
-# EOT
-
-# ENV PATH=/usr/local/mpi/bin:${PATH} 
-# ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
-# # Create a wrapper for OpenMPI to allow running as root by default
-# RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
-#     echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
-#     echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
-#     chmod a+x /usr/local/mpi/bin/mpirun
-# # Some Packages
-# RUN <<EOT
-# apt-get update 
-# apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev
-# source /opt/conda/etc/profile.d/conda.sh
-# conda activate ${CONDA_ENV_NAME}
-# conda install -y mpi4py
-# python -m pip install psutil \
-# yappi \
-# cffi \
-# ipdb \
-# pandas \
-# matplotlib \
-# py3nvml \
-# pyarrow \
-# graphviz \
-# astor \
-# boto3 \
-# tqdm \
-# sentencepiece \
-# msgpack \
-# requests \
-# pandas \
-# sphinx \
-# sphinx_rtd_theme \
-# scipy \
-# numpy \
-# scikit-learn \
-# nvidia-ml-py3
-# EOT
-
-# PyTorch
-ARG PYTORCH_VERSION=2.2.1
-ENV PYTORCH_VERSION=${PYTORCH_VERSION}
+# install apex
 RUN <<EOT
 #!/bin/bash
 source /opt/conda/etc/profile.d/conda.sh
 conda activate ${CONDA_ENV_NAME}
-pip install torch==${PYTORCH_VERSION} torchvision==0.17.1 torchaudio==${PYTORCH_VERSION} --index-url https://download.pytorch.org/whl/cu121
-pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
-pip install deepspeed bitsandbytes accelerate transformers optimum
-pip install pydantic transformers datasets accelerate evaluate peft deepspeed tiktoken \
-    sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn diffusers \
-    huggingface_hub spacy Pillow blobfile requests scipy pycocotools protobuf timm \
-    pyyaml ipython xformers opencv-contrib-python open_clip_torch \
-    packaging psutil zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
-python -c "import deepspeed; print(deepspeed.__version__)"
+git clone https://github.com/NVIDIA/apex ${STAGE_DIR}/apex
+cd ${STAGE_DIR}/apex
+# Requires pip >= 23.1 (https://pip.pypa.io/en/stable/news/#v23-1), which accepts repeated `--config-settings` with the same key.
+MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
+cd ${STAGE_DIR}
+python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
+rm -rf ${STAGE_DIR}/apex
 EOT
 
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+# Remove any DeepSpeed pulled in as a dependency so a pre-built one can replace it (otherwise the 1st deepspeed test will time out).
+python3 -m pip uninstall -y deepspeed
+# This has to be run (again) inside the GPU VMs running the tests.
+# The installation works here, but some tests fail if deepspeed is not pre-built again in the VMs running the tests.
+# TODO: find out why the tests fail.
+# DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM} python3 -m pip install "deepspeed<=0.14.0" --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
+# The install recipe from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile failed here,
+# so the prerequisites below follow https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile instead.
+# DeepSpeed prerequisites:
+# 1) Mellanox OFED (user-space only). Download and unpack inside ${STAGE_DIR} so the cleanup below actually removes it.
+mkdir -p ${STAGE_DIR}
+cd ${STAGE_DIR}
+wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf -
+cd ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64
+./mlnxofedinstall --user-space-only --without-fw-update --all -q
+cd ${STAGE_DIR}
+rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
+# 2) nv_peer_mem
+# Remove only stale nv_peer_mem trees: STAGE_DIR is /tmp, and deleting it would recreate /tmp without its 1777 mode.
+rm -rf ${STAGE_DIR}/nv_peer_memory ${STAGE_DIR}/nvidia-peer-memory*
+git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory
+cd ${STAGE_DIR}/nv_peer_memory
+./build_module.sh
+cd ${STAGE_DIR}
+tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
+cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
+apt-get update
+apt-get install -y dkms
+dpkg-buildpackage -us -uc
+dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
+EOT
+
+# install mpi
+ENV PATH=/usr/local/mpi/bin:${PATH} 
+ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+# Build OpenMPI from source. Clear only a stale source tree; STAGE_DIR is /tmp and must not be deleted (it would lose its 1777 mode).
+rm -rf ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
+mkdir -p ${STAGE_DIR}
+cd ${STAGE_DIR}
+wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf -
+cd openmpi-${OPENMPI_VERSION}
+./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION}
+make -j"$(nproc)" install
+ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi
+# Sanity check: the compiler wrapper must be reachable through the /usr/local/mpi symlink.
+test -f /usr/local/mpi/bin/mpic++
+cd ${STAGE_DIR}
+rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
+# Wrap mpirun so it passes --allow-run-as-root by default (the real binary is kept as mpirun.real).
+mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real
+echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun
+echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun
+chmod a+x /usr/local/mpi/bin/mpirun
+EOT
+
+# Some Packages
+RUN <<EOT
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+apt-get update
+apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev
+rm -rf /var/lib/apt/lists/*
+python -m pip install --no-cache-dir https://github.com/mpi4py/mpi4py/tarball/master
+python -m pip install --no-cache-dir psutil \
+yappi \
+cffi \
+ipdb \
+pandas \
+matplotlib \
+py3nvml \
+pyarrow \
+graphviz \
+astor \
+boto3 \
+tqdm \
+sentencepiece \
+msgpack \
+requests \
+sphinx \
+sphinx_rtd_theme \
+scipy \
+numpy \
+scikit-learn \
+nvidia-ml-py3
+EOT
+
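+# SSH daemon port inside the container must not conflict with the host OS port. NOTE(review): the sed below only rewrites an uncommented "Port 22" line; if sshd_config ships it as "#Port 22" nothing changes and sshd stays on 22 (the compose files publish 3227:22) - confirm which port is intended.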
+ENV SSH_PORT=2222
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
+sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
+EOT
+
+# 29.78 Usage: install.sh [options...]
+# 29.78 
+# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in
+# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally
+# 29.78 
+# 29.78 [optional]
+# 29.78     -l, --local_only        Install only on local machine
+# 29.78     -s, --pip_sudo          Run pip install with sudo (default: no sudo)
+# 29.78     -r, --allow_sudo        Allow script to be run by root (probably don't want this, instead use --pip_sudo)
+# 29.78     -n, --no_clean          Do not clean prior build state, by default prior build files are removed before building wheels
+# 29.78     -m, --pip_mirror        Use the specified pip mirror (default: the default pip mirror)
+# 29.78     -H, --hostfile          Path to MPI-style hostfile (default: /job/hostfile)
+# 29.78     -e, --examples          Checkout deepspeed example submodule (no install)
+# 29.78     -v, --verbose           Verbose logging
+# 29.78     -h, --help              This help text
+
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+useradd --create-home --uid 1000 --shell /bin/bash deepspeed
+usermod -aG sudo deepspeed
+echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+EOT
+
+# Build and install CUTLASS (https://github.com/NVIDIA/cutlass) into /opt/cutlass.
+# H100: architecture is Hopper (CUTLASS needs: cmake .. -DCUTLASS_NVCC_ARCHS="90a")
+# A100: architecture is Ampere
+# V100: architecture is Volta
+# T4: architecture is Turing
+# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
+# 70:  NVIDIA Volta architecture (e.g. Tesla V100).
+# 75:  NVIDIA Turing architecture (e.g. Tesla T4).
+# 80:  NVIDIA Ampere architecture (e.g. A100).
+# 90a: NVIDIA Hopper architecture (e.g. H100).
+# 89:  GeForce RTX 4090 (the default below).
+ARG DCUTLASS_NVCC_ARCHS="89"
+ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+git clone https://github.com/NVIDIA/cutlass /opt/cutlass
+cd /opt/cutlass
+git checkout . 
+git checkout master
+mkdir build
+cd build
+cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON            # target archs come from DCUTLASS_NVCC_ARCHS; tests off, unity build on
+make -j"$(nproc)" install
+cd ..
+# Optional unit tests (disabled):
+# make test_unit -j"$(nproc)"
+EOT
+
+# CUDA_ARCH_LIST="80;86;89;90"
+ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --no_clean"
+ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
+ARG CUDA_ARCH_LIST="80;86"
+ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
+ARG DS_BUILD_SPARSE_ATTN=0
+ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
+ARG DS_BUILD_FUSED_ADAM=0
+ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
+ARG DS_BUILD_CPU_ADAM=0
+ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
+ARG DS_BUILD_OPS=1
+ENV DS_BUILD_OPS=${DS_BUILD_OPS}
+ENV CUTLASS_PATH=/opt/cutlass
+# install deepspeed
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+python -m pip install setuptools==${SETUPTOOLS_VERSION}
+git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
+cd ${STAGE_DIR}/DeepSpeed-Kernels
+CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python setup.py bdist_wheel
+# pip install dist/deepspeed_kernels-*.whl
+CUDA_ARCH_LIST=${CUDA_ARCH_LIST} pip install -v .
+git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
+cd ${STAGE_DIR}/DeepSpeed 
+git checkout . 
+git checkout master 
+python setup.py bdist_wheel
+pip install dist/deepspeed*.whl --force-reinstall
+# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -r requirements/requirements.txt
+# DS_BUILD_OPS=0 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_CPU_ADAM=0 DS_BUILD_FUSED_ADAM=0 pip install -U --no-cache-dir .
+# ./install.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /job/hostfile # ./install.sh --allow_sudo --pip_sudo --no_clean --hostfile /path/to/your/hostfile
+cd ..
+# rm -rf ${STAGE_DIR}/DeepSpeed
+EOT
+
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+# install transformers
+git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers
+cd ${STAGE_DIR}/transformers
+python3 ./setup.py develop
+python3 -m pip install -U --no-cache-dir "pydantic<2"
+# install flash-attn
+# pip install packaging -i https://pypi.org/simple/ --trusted-host pypi.org
+pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
+EOT
+
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+pip install optimum peft tiktoken \
+    tqdm matplotlib seaborn numpy pandas scikit-learn diffusers \
+    huggingface_hub spacy blobfile pycocotools xformers open_clip_torch \
+    zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
+EOT
+
+# add vscode server
+# RUN <<EOT
+# #!/bin/bash
+# wget -qO- https://update.code.visualstudio.com/commit:${commit_id}/server-linux-x64/stable
+# code-server --install-extension ms-python.vscode-pylance
+# EOT
+
 # 启动 ssh 服务
 CMD ["/bin/bash", "-c", "service ssh start; tail -f /dev/null"]
\ No newline at end of file
diff --git a/finetune/README.md b/finetune/README.md
new file mode 100644
index 0000000..430d618
--- /dev/null
+++ b/finetune/README.md
@@ -0,0 +1,12 @@
+## deepspeed docker image build
+
+```shell
+docker-compose -f docker-compose_pytorch1.13.yml build
+docker-compose -f docker-compose_pytorch2.3.yml build
+```
+
+## test command
+
+```shell
+docker run -it --gpus all --name deepspeed_test --shm-size=1gb --rm hotwa/deepspeed:test /bin/bash
+```
\ No newline at end of file
diff --git a/finetune/docker-compose.yml b/finetune/docker-compose.yml
index ebcf535..52da7ba 100644
--- a/finetune/docker-compose.yml
+++ b/finetune/docker-compose.yml
@@ -12,6 +12,7 @@ services:
     tty: true
     restart: unless-stopped
     image: hotwa/finetune:test
+    shm_size: '32gb'
     ports:
       - 3227:22
     environment:
diff --git a/finetune/docker-compose_pytorch1.13.yml b/finetune/docker-compose_pytorch1.13.yml
new file mode 100644
index 0000000..167b135
--- /dev/null
+++ b/finetune/docker-compose_pytorch1.13.yml
@@ -0,0 +1,45 @@
+version: '3.8'
+
+services:
+  ubuntu-finetune:
+    build: 
+      context: .
+      dockerfile: Dockerfile
+      args: # Compatibility table for PyTorch / Python / pytorch_lightning versions: https://blog.csdn.net/qq_41813454/article/details/137421822
+        CUDA_VERSION: 11.7.1
+        PYTORCH_VERSION: 1.13.1
+        TORCHVISION_VERSION: 0.14.1
+        TORCHAUDIO_VERSION: 0.13.1
+        DS_BUILD_OPS: 1
+        DS_BUILD_SPARSE_ATTN: 0
+        DS_BUILD_FUSED_ADAM: 0
+        DS_BUILD_CPU_ADAM: 0
+        CUDA: cu117
+        CUDA_ARCH_LIST: "80;86" # RTX 4090 is arch 89; full list would be "80;86;89;90"
+        SETUPTOOLS_VERSION: "69.5.1"
+    volumes:
+      - ./src:/bbtft
+    container_name: ubuntu-finetune
+    pull_policy: if_not_present
+    tty: true
+    restart: unless-stopped
+    image: hotwa/deepspeed:test
+    shm_size: '32gb'
+    ports:
+      - 3227:22
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+    networks:
+      - network_finetune
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+
+networks:
+  network_finetune:
+    name: network_finetune
diff --git a/finetune/docker-compose_pytorch2.3.yml b/finetune/docker-compose_pytorch2.3.yml
new file mode 100644
index 0000000..ec2ee65
--- /dev/null
+++ b/finetune/docker-compose_pytorch2.3.yml
@@ -0,0 +1,45 @@
+version: '3.8'
+
+services:
+  ubuntu-finetune:
+    build: 
+      context: .
+      dockerfile: Dockerfile
+      args: # Compatibility table for PyTorch / Python / pytorch_lightning versions: https://blog.csdn.net/qq_41813454/article/details/137421822
+        CUDA_VERSION: 12.1.0
+        PYTORCH_VERSION: 2.3.0
+        TORCHVISION_VERSION: 0.18.0
+        TORCHAUDIO_VERSION: 2.3.0
+        DS_BUILD_OPS: 1
+        DS_BUILD_SPARSE_ATTN: 0
+        DS_BUILD_FUSED_ADAM: 0
+        DS_BUILD_CPU_ADAM: 0
+        CUDA: cu121
+        CUDA_ARCH_LIST: "80;86;89;90" # full arch list; 89 covers RTX 4090
+        SETUPTOOLS_VERSION: "69.5.1"
+    volumes:
+      - ./src:/bbtft
+    container_name: ubuntu-finetune
+    pull_policy: if_not_present
+    tty: true
+    restart: unless-stopped
+    image: hotwa/deepspeed:test
+    shm_size: '32gb'
+    ports:
+      - 3227:22
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+    networks:
+      - network_finetune
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+
+networks:
+  network_finetune:
+    name: network_finetune