add

2024-08-28 17:18:03 +08:00
parent 77140c1407
commit f735605e9f
15 changed files with 2635 additions and 1 deletions
--- a/pdf_clean/Dockerfile.update
+++ b/pdf_clean/Dockerfile.update
@@ -0,0 +1,543 @@
+# syntax=docker/dockerfile:1
+# NOTE: Building this image requires Docker version >= 23.0.
+#
+# For reference:
+# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
+# CUDA image tag shared by both build stages (declared before the first FROM).
+ARG TAG_VERSION="12.4.1"
+# Stage 1: builder used to compile Apptainer from source.
+# `AS` is upper-cased so its casing matches `FROM`.
+FROM nvidia/cuda:${TAG_VERSION}-cudnn-devel-ubuntu22.04 AS apptainerbuilder
+# Optional proxy settings, forwarded from build args into the build environment.
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ENV http_proxy=${HTTP_PROXY}
+ENV https_proxy=${HTTPS_PROXY}
+# Keep apt from prompting during package installation.
+ARG DEBIAN_FRONTEND="noninteractive"
+ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
+# Install the packages required for the Go download and the Apptainer build.
+RUN apt-get update && apt-get install -y \
+    wget \
+    gcc \
+    git \
+    libc-dev \
+    make \
+    bash \
+    linux-headers-generic \
+    libseccomp-dev \
+    libssl-dev \
+    libuuid1 \
+    uuid-dev \
+    pkg-config \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Go: download the release tarball, unpack it under /usr/local, then delete the tarball.
+ARG GO_VERSION="1.21.13"
+RUN GO_TARBALL="go${GO_VERSION}.linux-amd64.tar.gz" \
+    && wget "https://golang.org/dl/${GO_TARBALL}" \
+    && tar -C /usr/local -xzf "${GO_TARBALL}" \
+    && rm "${GO_TARBALL}"
+
+# Put the Go toolchain on PATH
+ENV PATH="/usr/local/go/bin:${PATH}"
+
+# Build Apptainer from source at the requested commit-ish and install it under /usr/local/apptainer.
+ARG APPTAINER_COMMITISH="main"
+ARG MCONFIG_OPTIONS="--with-suid"
+WORKDIR /go/src/github.com/apptainer
+RUN git clone https://github.com/apptainer/apptainer.git \
+    && git -C apptainer checkout "$APPTAINER_COMMITISH" \
+    && cd apptainer \
+    && ./mconfig $MCONFIG_OPTIONS -p /usr/local/apptainer \
+    && make -C builddir \
+    && make -C builddir install
+
+# Clean up build-only packages in the builder stage.
+# NOTE(review): the only artifact taken from this stage is /usr/local/apptainer
+# (via COPY --from in the next stage), so this step presumably has no effect on
+# the final image — confirm whether it is still needed.
+RUN apt-get remove -y wget gcc git && \
+    apt-get autoremove -y && \
+    apt-get clean
+
+# Stage 2: runtime image. Only the Apptainer install tree is copied from the builder stage.
+FROM nvidia/cuda:${TAG_VERSION}-cudnn-devel-ubuntu22.04
+COPY --from=apptainerbuilder /usr/local/apptainer /usr/local/apptainer
+# Build arguments for this stage (proxy, apt frontend, root password).
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG DEBIAN_FRONTEND="noninteractive"
+ARG ROOT_PASSWD="root"
+# NOTE(review): ROOT_PASSWD is persisted into the image environment below and is
+# also used as the root login password later in this stage.
+ENV PATH="/usr/local/apptainer/bin:$PATH" \
+    APPTAINER_TMPDIR="/tmp/tmp-apptainer" \
+    http_proxy=${HTTP_PROXY} \
+    https_proxy=${HTTPS_PROXY} \
+    DEBIAN_FRONTEND=${DEBIAN_FRONTEND} \
+    ROOT_PASSWD=${ROOT_PASSWD}
+WORKDIR /root
+# All following RUN instructions are executed by bash.
+SHELL ["/bin/bash", "-c"]
+# base tools: OS packages, pipx/nvitop, SSH client+server configuration, miniconda and pixi.
+# SSH_PORT is substituted into sshd_config and ssh_config below. It is declared here
+# (default 22) so the `Port` rewrite always receives a value instead of producing a
+# bare "Port " directive.
+ARG SSH_PORT="22"
+RUN <<EOT
+#!/bin/bash
+apt-get update
+apt-get install -y libgl1-mesa-glx bash-completion wget curl htop jq vim bash libaio-dev build-essential openssh-server openssh-client python3 python3-pip python3-venv bzip2
+apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
+add-apt-repository ppa:git-core/ppa -y
+apt-get install -y git libnuma-dev wget
+pip install pipx
+pipx install nvitop
+pipx ensurepath
+. ~/.bashrc
+# Configure SSH for password and public key authentication
+mkdir -p ~/.ssh
+# Create or overwrite the SSH client config ~/.ssh/config
+# - Host *: settings applied to every host
+# - ForwardAgent yes: enable SSH agent forwarding so the local agent can be used for authentication
+# - StrictHostKeyChecking no: disable host key checking and accept new host keys automatically (for automated environments)
+printf "Host * \n    ForwardAgent yes\nHost *\n    StrictHostKeyChecking no" > ~/.ssh/config
+# keep a backup of the stock server configuration before editing it
+cp /etc/ssh/sshd_config /etc/ssh/sshd_config.bak
+sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
+sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
+sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config
+sed -i 's/^\(\s*\)GSSAPIAuthentication yes/\1GSSAPIAuthentication no/' /etc/ssh/ssh_config
+# set the server and client port; the build already runs as root, so no sudo is needed
+sed -i "s/^#Port 22/Port ${SSH_PORT}/" /etc/ssh/sshd_config
+sed -i "s/#   Port 22/Port ${SSH_PORT}/" /etc/ssh/ssh_config
+# generate a root key pair and authorize it for logins to this image
+ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa -N "" <<< y
+cat ~/.ssh/id_rsa.pub >> ~/.ssh/auth
+cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
+cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys2
+chmod 600 /root/.ssh/authorized_keys
+chmod 600 /root/.ssh/authorized_keys2
+mkdir -p /var/run/sshd
+echo "root:${ROOT_PASSWD}" | chpasswd
+mkdir -p ~/.pip
+# install miniconda
+wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
+bash /tmp/miniconda.sh -b -p /opt/conda
+rm /tmp/miniconda.sh
+ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
+echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
+. /opt/conda/etc/profile.d/conda.sh
+conda init bash
+conda config --set show_channel_urls true
+# write the .condarc file
+cat <<EOF > ~/.condarc
+channels:
+  - conda-forge
+  - bioconda
+  - pytorch
+  - pytorch-nightly
+  - nvidia
+  - defaults
+show_channel_urls: true
+EOF
+# install pixi
+curl -fsSL https://pixi.sh/install.sh | bash
+EOT
+
+# install NVIDIA DOCA 2.7
+# RUN <<EOT
+# #!/bin/bash
+# wget https://www.mellanox.com/downloads/DOCA/DOCA_v2.7.0/host/doca-host_2.7.0-209000-24.04-ubuntu2204_amd64.deb
+# sudo dpkg -i doca-host_2.7.0-209000-24.04-ubuntu2204_amd64.deb
+# sudo apt-get update
+# sudo apt-get -y install doca-all
+# EOT
+ARG NV_DRIVER_VERSION="535"
+# Install build dependencies. CUDA dependencies are provided by libnvidia-compute-*;
+# cuda-compat* from the nvidia/cuda x86_64 image is removed in favour of it.
+# The `||` fallback is grouped so it only covers the `apt-get remove` step: a failing
+# `apt-get update` or `apt-get install` now fails the build instead of being masked.
+# The package globs are quoted so apt-get, not the shell, expands them.
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata && \
+    apt-get install -y \
+        apt-file \
+        automake \
+        default-jdk \
+        dh-make \
+        g++ \
+        git \
+        openjdk-8-jdk \
+        libcap2 \
+        libnuma-dev \
+        libtool \
+        libnvidia-compute-${NV_DRIVER_VERSION} \
+        make \
+        maven \
+        pkg-config \
+        udev \
+        wget \
+        environment-modules \
+    && { apt-get remove -y 'openjdk-11-*' 'cuda-compat*' || apt-get autoremove -y; }
+
+
+# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+# PyTorch / conda environment parameters; each can be overridden with --build-arg.
+ARG CONDA_ENV_NAME="deepspeed"
+ARG PYTHON_VERSION=3.10
+ARG CUDA='cu121'
+ARG PYTORCH_VERSION=2.3.1
+ARG TORCHVISION_VERSION=0.18.1
+ARG TORCHAUDIO_VERSION=2.3.1
+ARG PYTORCH_CUDA_VERSION=12.1
+ARG SETUPTOOLS_VERSION=69.5.1
+ARG USE_CUDA=1
+ARG USE_ROCM=0
+ARG USE_XPU=0
+ARG _GLIBCXX_USE_CXX11_ABI=1
+# Persist the parameters into the image environment. PATH puts the conda
+# environment's bin directory first; REF and STAGE_DIR are fixed values.
+ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} \
+    PYTHON_VERSION=${PYTHON_VERSION} \
+    PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH \
+    DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3" \
+    REF='main' \
+    STAGE_DIR=/tmp \
+    CUDA=${CUDA} \
+    PYTORCH_VERSION=${PYTORCH_VERSION} \
+    TORCHVISION_VERSION=${TORCHVISION_VERSION} \
+    TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} \
+    PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} \
+    SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} \
+    USE_CUDA=${USE_CUDA} \
+    USE_ROCM=${USE_ROCM} \
+    USE_XPU=${USE_XPU} \
+    _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}
+# Create the conda environment, then install transformers, the pinned PyTorch wheels and accelerate into it.
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} ninja cmake  mpich mpi4py ucx ucx-py cuda-cudart cuda-version=12 -y -c rapidsai-nightly -c conda-forge
+echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
+conda activate ${CONDA_ENV_NAME}
+python3 -m pip install --no-cache-dir --upgrade pip
+python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python 
+conda clean -afy 
+git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd ..
+python -m pip install setuptools==${SETUPTOOLS_VERSION}
+python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
+# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
+# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
+python3 -m pip uninstall -y torch torchvision torchaudio 
+# # install PyTorch (the conda env created above already exists)
+# To use the SJTU mirror, replace https://download.pytorch.org/whl from the PyTorch install instructions with https://mirror.sjtu.edu.cn/pytorch-wheels.
+python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA}
+python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
+python3 -m pip uninstall -y transformer-engine
+python3 -m pip uninstall -y torch-tensorrt
+python3 -m pip uninstall -y apex
+EOT
+
+# Install apex with its C++ and CUDA extensions.
+# The full TORCH_CUDA_ARCH_LIST would be "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0"; the default below is a subset of it.
+ARG TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
+ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+git clone https://github.com/NVIDIA/apex ${STAGE_DIR}/apex
+cd ${STAGE_DIR}/apex
+# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... 
+MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
+# import check, run from inside the apex checkout
+python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
+# leave the checkout and delete it
+cd ..
+rm -rf ${STAGE_DIR}/apex
+EOT
+
+# install colossalai from the branch/tag given by VERSION
+ARG VERSION=main
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git
+cd ./ColossalAI
+BUILD_EXT=1 pip install -v .
+# Step out of the checkout and delete the whole clone. Removing `colossalai`
+# from inside the clone only deleted a sub-directory and left the repository
+# in the image.
+cd ..
+rm -rf ./ColossalAI
+EOT
+
+# install tensornvme (needs the libaio development headers)
+RUN <<EOT
+#!/bin/bash
+# use apt-get rather than apt: apt's command-line interface is not meant for scripts
+apt-get update -y
+apt-get install -y libaio-dev
+pip install -v git+https://github.com/hpcaitech/TensorNVMe.git
+EOT
+
+# Mellanox OFED user-space install.
+# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
+ENV MLNX_OFED_VERSION=23.10-3.2.2.0
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+# Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
+python3 -m pip uninstall -y deepspeed
+# This has to be run (again) inside the GPU VMs running the tests.
+# The installation works here, but some tests fail, if we do not pre-build deepspeed again in the VMs running the tests.
+# TODO: Find out why test fail. install deepspeed
+# DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} DS_BUILD_FUSED_ADAM={DS_BUILD_FUSED_ADAM} python3 -m pip install "deepspeed<=0.14.0" --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
+# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail 
+# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
+# install deepspeed prepare
+# install Mellanox OFED
+mkdir -p ${STAGE_DIR}
+# Download and unpack inside STAGE_DIR, so the cleanup at the end of this step
+# (which targets STAGE_DIR) actually removes the extracted installer tree.
+cd ${STAGE_DIR}
+wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
+cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
+./mlnxofedinstall --user-space-only --without-fw-update --skip-distro-check --without-ucx --without-hcoll --without-openmpi --without-sharp --all --force -q
+cd ${STAGE_DIR}
+rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
+EOT
+
+# nv_peer_memory: NV_PEER_MEM_TAG (version plus "-0") selects the git branch/tag to clone and the .deb file name.
+ARG NV_PEER_MEM_VERSION="1.2"
+ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION}
+ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+# install nv_peer_mem
+mkdir -p ${STAGE_DIR}
+git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory
+cd ${STAGE_DIR}/nv_peer_memory
+./build_module.sh
+# unpack the source tarball (presumably produced by build_module.sh — TODO confirm) and build the Debian package from it
+cd ${STAGE_DIR}
+tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
+cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
+apt-get update
+apt --fix-broken install -y
+apt-get install -y dkms
+dpkg-buildpackage -us -uc
+# install the resulting package from STAGE_DIR
+dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
+EOT
+
+# # install mpi
+# ENV OPENMPI_BASEVERSION=4.1
+# ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
+# ENV PATH=/usr/local/mpi/bin:${PATH} 
+# ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
+# RUN <<EOT
+# #!/bin/bash
+# source /opt/conda/etc/profile.d/conda.sh
+# conda activate ${CONDA_ENV_NAME}
+# # OPENMPI
+# rm -rf ${STAGE_DIR}
+# mkdir -p ${STAGE_DIR}
+# cd ${STAGE_DIR}
+# wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - 
+# cd openmpi-${OPENMPI_VERSION} 
+# ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} 
+# make -j"$(nproc)" install 
+# ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi 
+# # Sanity check:
+# test -f /usr/local/mpi/bin/mpic++ 
+# cd ${STAGE_DIR} 
+# rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
+# # Create a wrapper for OpenMPI to allow running as root by default
+# mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real
+# echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun
+# echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun
+# chmod a+x /usr/local/mpi/bin/mpirun
+# EOT
+
+# 29.78 Usage: install.sh [options...]
+# 29.78 
+# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in
+# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally
+# 29.78 
+# 29.78 [optional]
+# 29.78     -l, --local_only        Install only on local machine
+# 29.78     -s, --pip_sudo          Run pip install with sudo (default: no sudo)
+# 29.78     -r, --allow_sudo        Allow script to be run by root (probably don't want this, instead use --pip_sudo)
+# 29.78     -n, --no_clean          Do not clean prior build state, by default prior build files are removed before building wheels
+# 29.78     -m, --pip_mirror        Use the specified pip mirror (default: the default pip mirror)
+# 29.78     -H, --hostfile          Path to MPI-style hostfile (default: /job/hostfile)
+# 29.78     -e, --examples          Checkout deepspeed example submodule (no install)
+# 29.78     -v, --verbose           Verbose logging
+# 29.78     -h, --help              This help text
+
+# Create the `deepspeed` user (uid 1000) as a member of the sudo group and grant it passwordless sudo.
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+useradd --create-home --uid 1000 --shell /bin/bash --groups sudo deepspeed
+printf '%s\n' "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+EOT
+
+# install cutlass https://github.com/NVIDIA/cutlass
+# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" )
+# A100: architecture is Ampere
+# V100: architecture is Volta
+# T4: architecture is Turing
+# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
+# 70:  NVIDIA Volta architecture (e.g. Tesla V100).
+# 75:  NVIDIA Turing architecture (e.g. Tesla T4).
+# 80:  NVIDIA Ampere architecture (e.g. A100).
+# 90a: NVIDIA Hopper architecture (e.g. H100).
+# 89:  GeForce RTX 4090
+# Architecture reference for DCUTLASS_NVCC_ARCHS:
+# https://github.com/NVIDIA/cutlass/blob/main/media/docs/quickstart.md#building-for-multiple-architectures
+ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
+ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+CUTLASS_SRC=/opt/cutlass
+git clone https://github.com/NVIDIA/cutlass ${CUTLASS_SRC}
+git -C ${CUTLASS_SRC} checkout .
+git -C ${CUTLASS_SRC} checkout master
+mkdir ${CUTLASS_SRC}/build
+cd ${CUTLASS_SRC}/build
+# configure for the architectures in DCUTLASS_NVCC_ARCHS (e.g. 90a for Hopper GPUs such as the H100)
+cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON
+make -j"$(nproc)" install
+# make test_unit -j"$(nproc)"
+# make test_unit_gemm_warp -j"$(nproc)"
+EOT
+
+# Some Packages from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
+# RUN <<EOT
+# source /opt/conda/etc/profile.d/conda.sh
+# conda activate ${CONDA_ENV_NAME}
+# apt-get update 
+# apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev
+# python -m pip install pipdeptree \
+# psutil \
+# yappi \
+# cffi \
+# ipdb \
+# pandas \
+# matplotlib \
+# py3nvml \
+# pyarrow \
+# graphviz \
+# astor \
+# boto3 \
+# tqdm \
+# sentencepiece \
+# msgpack \
+# requests \
+# pandas \
+# sphinx \
+# sphinx_rtd_theme \
+# scipy \
+# numpy \
+# scikit-learn \
+# nvidia-ml-py3 \
+# mpi4py
+# EOT
+
+# install deepspeed step 1: pin setuptools, then build oneCCL and install it under /usr/local
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+/opt/conda/envs/${CONDA_ENV_NAME}/bin/python -m pip install setuptools==${SETUPTOOLS_VERSION}
+# build oneCCL (oneAPI) for deepspeed
+ONECCL_SRC=${STAGE_DIR}/oneCCL
+git clone https://github.com/oneapi-src/oneCCL.git ${ONECCL_SRC}
+git -C ${ONECCL_SRC} checkout .
+git -C ${ONECCL_SRC} checkout master
+mkdir ${ONECCL_SRC}/build
+cd ${ONECCL_SRC}/build
+cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
+make -j"$(nproc)" install
+EOT
+
+# install deepspeed step 2: install DeepSpeed-Kernels from a git checkout, with CUDA_ARCH_LIST set for the install
+ARG CUDA_ARCH_LIST="80;86;89;90"
+ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
+cd ${STAGE_DIR}/DeepSpeed-Kernels
+# alternative wheel-based install, kept for reference:
+# CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python setup.py bdist_wheel
+# pip install dist/deepspeed_kernels-*.whl
+CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python -m pip install -v .
+EOT
+
+# DeepSpeed build parameters: version to check out, flags passed to install.sh,
+# DS_BUILD_* switches, and optional hostfile content.
+ARG DEEPSPEED_VERSION="v0.14.3"
+ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
+ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
+ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
+ARG DS_BUILD_SPARSE_ATTN=0
+ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
+ARG DS_BUILD_FUSED_ADAM=1
+ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
+ARG DS_BUILD_CPU_ADAM=1
+ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
+ARG DS_BUILD_OPS=1
+ENV DS_BUILD_OPS=${DS_BUILD_OPS}
+ARG HOSTFILE_CONTENT=""
+ENV HOSTFILE_CONTENT=${HOSTFILE_CONTENT}
+# CUTLASS_PATH is the directory cutlass was cloned into earlier in this file.
+ENV CUTLASS_PATH='/opt/cutlass'
+# CUDA_HOME is set in its own ENV instruction because the next two lines expand it.
+ENV CUDA_HOME='/usr/local/cuda'
+ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+ENV PATH=${CUDA_HOME}/bin:${PATH}
+# install deepspeed step 3: check out DEEPSPEED_VERSION and run its install.sh with pip routed through `python -m pip`
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
+cd ${STAGE_DIR}/DeepSpeed
+git checkout ${DEEPSPEED_VERSION}
+# write a copy of install.sh in which `pip install` is replaced by `python -m pip install`
+sed 's/pip install/python -m pip install/' install.sh > install_modified.sh
+chmod +x ./install_modified.sh
+# If HOSTFILE_CONTENT is set, write it to /tmp/hostfile and pass that file to the installer
+if [ -n "${HOSTFILE_CONTENT}" ]; then
+    echo "${HOSTFILE_CONTENT}" > /tmp/hostfile
+    INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"
+else
+    INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"
+fi
+eval $INSTALL_CMD
+# compile deepspeed ops
+ds_report
+# clean up
+# rm -f deepspeed/git_version_info_installed.py
+# rm -rf dist build deepspeed.egg-info
+# python setup.py bdist_wheel
+# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl
+# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt
+# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0
+EOT
+
+# install transformers and flash-attn
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+# install transformers
+# NOTE(review): `setup.py develop` is run from the checkout under ${STAGE_DIR}; presumably the
+# installed package then depends on that directory staying in place — confirm.
+git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers
+cd ${STAGE_DIR}/transformers
+python3 ./setup.py develop
+python3 -m pip install -U --no-cache-dir "pydantic<2"
+# install flash-attn
+# pip install packaging -i https://pypi.org/simple/ --trusted-host pypi.org
+pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
+EOT
+
+# other packages: optimum first, then the remaining Python libraries from pypi.org
+RUN <<EOT
+#!/bin/bash
+source /opt/conda/etc/profile.d/conda.sh
+conda activate ${CONDA_ENV_NAME}
+pip3 install optimum
+pip3 install \
+    peft \
+    tiktoken \
+    tqdm \
+    matplotlib \
+    seaborn \
+    numpy \
+    pandas \
+    scikit-learn \
+    diffusers \
+    huggingface_hub \
+    spacy \
+    blobfile \
+    pycocotools \
+    open_clip_torch \
+    zstandard \
+    mpi4py \
+    -i https://pypi.org/simple/ --trusted-host pypi.org
+EOT
+
+# NCCL defaults; both are persisted into the image environment and written to ~/.deepspeed_env.
+ARG NCCL_IB_DISABLE="1"
+ARG NCCL_SOCKET_IFNAME="eth0"
+ENV NCCL_IB_DISABLE=${NCCL_IB_DISABLE} \
+    NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME}
+# deepspeed env: one KEY=VALUE pair per line
+RUN <<EOT
+#!/bin/bash
+printf '%s\n' \
+    "NCCL_IB_DISABLE=${NCCL_IB_DISABLE}" \
+    "NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME}" \
+    "NCCL_DEBUG=INFO" \
+    "CUTLASS_PATH=${CUTLASS_PATH}" \
+    "CUDA_HOME=${CUDA_HOME}" \
+    > ~/.deepspeed_env
+# further entries that could be added:
+#CUDA_VISIBLE_DEVICES=0,1,2,3
+#OMP_NUM_THREADS=8
+#MASTER_ADDR=192.168.1.1
+#MASTER_PORT=12345
+EOT
+
+# Run the SSH daemon in the foreground as the container's main process.
+CMD ["/usr/sbin/sshd", "-D"]