Files
dockerfile_dp/pdf_clean/Dockerfile.update
2024-08-28 17:18:03 +08:00

544 lines
20 KiB
Docker
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# syntax=docker/dockerfile:1
# NOTE: Building this image requires Docker version >= 23.0.
#
# For reference:
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
# Global build argument: CUDA release used in the base-image tag of every
# FROM line in this file.
ARG TAG_VERSION="12.4.1"

# ---- Stage 1: builder that compiles Apptainer from source ----
# `AS` is written in uppercase to match `FROM` (BuildKit check FromAsCasing).
FROM nvidia/cuda:${TAG_VERSION}-cudnn-devel-ubuntu22.04 AS apptainerbuilder
# Optional proxy settings: forwarded from --build-arg into the lowercase
# variables exported to the commands run in this stage.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}
# Keep apt/dpkg from prompting during the build.
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
# Install the packages required by the build steps of this stage
# (one package per line, sorted alphabetically), then drop the apt lists.
RUN apt-get update \
    && apt-get install -y \
        bash \
        gcc \
        git \
        libc-dev \
        libseccomp-dev \
        libssl-dev \
        libuuid1 \
        linux-headers-generic \
        make \
        pkg-config \
        uuid-dev \
        wget \
    && rm -rf /var/lib/apt/lists/*
# Install Go: download the release tarball, unpack it into /usr/local and
# delete the archive in the same layer.
# GO_VERSION: Go release to download (linux-amd64 build).
ARG GO_VERSION="1.21.13"
RUN go_tarball="go${GO_VERSION}.linux-amd64.tar.gz" \
    && wget "https://golang.org/dl/${go_tarball}" \
    && tar -C /usr/local -xzf "${go_tarball}" \
    && rm "${go_tarball}"
# Make the Go binaries available on PATH.
ENV PATH="/usr/local/go/bin:${PATH}"
# Build Apptainer from source and install it under /usr/local/apptainer,
# which is the path the final stage copies out of this builder.
# APPTAINER_COMMITISH: branch, tag or commit that is checked out before building.
# MCONFIG_OPTIONS: extra options handed to ./mconfig; left unquoted in the RUN
#                  on purpose so several space-separated options are split.
ARG APPTAINER_COMMITISH="main"
ARG MCONFIG_OPTIONS="--with-suid"
WORKDIR /go/src/github.com/apptainer
RUN git clone https://github.com/apptainer/apptainer.git \
    && cd apptainer \
    && git checkout "$APPTAINER_COMMITISH" \
    && ./mconfig $MCONFIG_OPTIONS -p /usr/local/apptainer \
    && cd builddir \
    && make \
    && make install
# No package cleanup step: this stage is thrown away once the final stage has
# copied /usr/local/apptainer, so removing wget/gcc/git here would only add a
# layer and build time without making any shipped image smaller.
# ---- Final stage: CUDA development image with Apptainer copied in ----
FROM nvidia/cuda:${TAG_VERSION}-cudnn-devel-ubuntu22.04
# Take only the installed Apptainer tree from the builder stage.
COPY --from=apptainerbuilder /usr/local/apptainer /usr/local/apptainer
ENV PATH="/usr/local/apptainer/bin:$PATH"
ENV APPTAINER_TMPDIR="/tmp/tmp-apptainer"
# Optional proxy settings, forwarded from --build-arg.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}
# Keep apt/dpkg from prompting.
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
# Root password consumed by `chpasswd` in a later RUN step of this stage.
# It is deliberately a build ARG only: ARG values are visible to RUN steps,
# whereas an ENV would expose the password in the environment of every
# container started from the image. Override the default with
# --build-arg ROOT_PASSWD=...; note that build args still show up in
# `docker history`.
ARG ROOT_PASSWD="root"
WORKDIR /root
# Run shell-form instructions with bash instead of /bin/sh.
SHELL ["/bin/bash", "-c"]
# Base tools, SSH client/server configuration, Miniconda and pixi.
# SSH_PORT: port written into sshd_config / ssh_config. It has to be declared
#           with a default: with an empty value the sed below would leave a
#           bare "Port " directive in sshd_config.
ARG SSH_PORT="22"
RUN <<EOT
#!/bin/bash
apt-get update
apt-get install -y libgl1-mesa-glx bash-completion wget curl htop jq vim bash libaio-dev build-essential openssh-server openssh-client python3 python3-pip python3-venv bzip2
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
add-apt-repository ppa:git-core/ppa -y
apt-get install -y git libnuma-dev wget
# Drop the package lists in the layer that created them; later steps run
# their own `apt-get update` before installing anything.
rm -rf /var/lib/apt/lists/*
pip install pipx
pipx install nvitop
pipx ensurepath
. ~/.bashrc
# Configure SSH for password and public key authentication
mkdir -p ~/.ssh
# Create or overwrite the SSH client config ~/.ssh/config
# - Host *: settings that apply to every host
# - ForwardAgent yes: enable SSH agent forwarding
# - StrictHostKeyChecking no: accept new host keys without prompting (for automation)
printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config
# Keep a pristine copy of the server config before editing it in place.
cp /etc/ssh/sshd_config /etc/ssh/sshd_config.bak
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/^\(\s*\)GSSAPIAuthentication yes/\1GSSAPIAuthentication no/' /etc/ssh/ssh_config
sed -i "s/^#Port 22/Port ${SSH_PORT}/" /etc/ssh/sshd_config
# The build already runs as root, so no sudo is needed here.
sed -i "s/# Port 22/Port ${SSH_PORT}/" /etc/ssh/ssh_config
# Generate a root key pair (answering "y" to a possible overwrite prompt) and
# authorize it for logins to this same image.
ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa -N "" <<< y
# NOTE(review): ~/.ssh/auth is not a file sshd reads by default - looks like a leftover; confirm before removing.
cat ~/.ssh/id_rsa.pub >> ~/.ssh/auth
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys2
chmod 600 /root/.ssh/authorized_keys
chmod 600 /root/.ssh/authorized_keys2
mkdir /var/run/sshd
echo "root:${ROOT_PASSWD}" | chpasswd
mkdir -p ~/.pip
# install miniconda (single, unambiguous output target for wget)
wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
bash /tmp/miniconda.sh -b -p /opt/conda
rm /tmp/miniconda.sh
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
. /opt/conda/etc/profile.d/conda.sh
conda init bash
conda config --set show_channel_urls true
# Write the .condarc channel configuration
cat <<EOF > ~/.condarc
channels:
- conda-forge
- bioconda
- pytorch
- pytorch-nightly
- nvidia
- defaults
show_channel_urls: true
EOF
# install pixi
curl -fsSL https://pixi.sh/install.sh | bash
EOT
# install NVIDIA DOCA 2.7
# RUN <<EOT
# #!/bin/bash
# wget https://www.mellanox.com/downloads/DOCA/DOCA_v2.7.0/host/doca-host_2.7.0-209000-24.04-ubuntu2204_amd64.deb
# sudo dpkg -i doca-host_2.7.0-209000-24.04-ubuntu2204_amd64.deb
# sudo apt-get update
# sudo apt-get -y install doca-all
# EOT
# Extra build tooling, Java, and the NVIDIA user-space compute libraries.
# NV_DRIVER_VERSION: driver branch used to select the libnvidia-compute-* package.
ARG NV_DRIVER_VERSION="535"
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata && \
    apt-get install -y \
        apt-file \
        automake \
        default-jdk \
        dh-make \
        g++ \
        git \
        openjdk-8-jdk \
        libcap2 \
        libnuma-dev \
        libtool \
        # Provide CUDA dependencies by libnvidia-compute*
        libnvidia-compute-${NV_DRIVER_VERSION} \
        make \
        maven \
        pkg-config \
        udev \
        wget \
        environment-modules && \
    # Remove cuda-compat* from nvidia/cuda:x86_64 images, provide CUDA dependencies by libnvidia-compute* instead.
    # Only the removal itself may fail (a pattern can match nothing); a failed
    # install above must still fail the build instead of being masked by `||`.
    # The patterns are quoted so the shell hands them to apt unexpanded.
    ( apt-get remove -y 'openjdk-11-*' 'cuda-compat*' || true ) && \
    apt-get autoremove -y && \
    rm -rf /var/lib/apt/lists/*
# Adapted from:
# https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
#
# Settings for the conda / PyTorch environment. Each ARG can be overridden
# with --build-arg and is mirrored into an ENV of the same name so that it is
# also visible in later RUN steps and in running containers.

# Name of the conda environment and the Python version it is created with.
ARG CONDA_ENV_NAME="deepspeed"
ENV CONDA_ENV_NAME="${CONDA_ENV_NAME}"
ARG PYTHON_VERSION="3.10"
ENV PYTHON_VERSION="${PYTHON_VERSION}"
# Put the environment's bin directory first on PATH.
ENV PATH="/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:${PATH}"
ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3"
# Git ref of huggingface/transformers checked out during the install step.
ENV REF="main"
# Scratch directory for source checkouts.
ENV STAGE_DIR="/tmp"

# PyTorch wheel selection: CUDA flavour suffix and package versions.
ARG CUDA="cu121"
ENV CUDA="${CUDA}"
ARG PYTORCH_VERSION="2.3.1"
ENV PYTORCH_VERSION="${PYTORCH_VERSION}"
ARG TORCHVISION_VERSION="0.18.1"
ENV TORCHVISION_VERSION="${TORCHVISION_VERSION}"
ARG TORCHAUDIO_VERSION="2.3.1"
ENV TORCHAUDIO_VERSION="${TORCHAUDIO_VERSION}"
ARG PYTORCH_CUDA_VERSION="12.1"
ENV PYTORCH_CUDA_VERSION="${PYTORCH_CUDA_VERSION}"
ARG SETUPTOOLS_VERSION="69.5.1"
ENV SETUPTOOLS_VERSION="${SETUPTOOLS_VERSION}"

# Backend switches exported to the environment of the build steps.
ARG USE_CUDA="1"
ENV USE_CUDA="${USE_CUDA}"
ARG USE_ROCM="0"
ENV USE_ROCM="${USE_ROCM}"
ARG USE_XPU="0"
ENV USE_XPU="${USE_XPU}"
ARG _GLIBCXX_USE_CXX11_ABI="1"
ENV _GLIBCXX_USE_CXX11_ABI="${_GLIBCXX_USE_CXX11_ABI}"
# Create the conda environment, install transformers (with its
# deepspeed-testing extras), then replace torch/torchvision/torchaudio with the
# pinned CUDA wheels and install accelerate from its main branch.
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} ninja cmake mpich mpi4py ucx ucx-py cuda-cudart cuda-version=12 -y -c rapidsai-nightly -c conda-forge
# Activate the environment automatically in interactive root shells.
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
conda activate ${CONDA_ENV_NAME}
python3 -m pip install --no-cache-dir --upgrade pip
python3 -m pip install --no-cache-dir open_clip_torch nvidia-ml-py3 opencv-contrib-python
conda clean -afy
git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd ..
python -m pip install --no-cache-dir setuptools==${SETUPTOOLS_VERSION}
python3 -m pip install --no-cache-dir "./transformers[deepspeed-testing]"
# The install above is a regular (non-editable) one, so the checkout is no
# longer needed; remove it in this layer so it does not ship in the image.
rm -rf transformers
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
# Remove whatever torch packages the dependencies pulled in before pinning our own.
python3 -m pip uninstall -y torch torchvision torchaudio
# To use the SJTU mirror, replace https://download.pytorch.org/whl from the
# PyTorch install instructions with https://mirror.sjtu.edu.cn/pytorch-wheels.
python3 -m pip install --no-cache-dir torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA}
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
# Make sure none of these packages is left installed at this point.
python3 -m pip uninstall -y transformer-engine
python3 -m pip uninstall -y torch-tensorrt
python3 -m pip uninstall -y apex
EOT
# Build NVIDIA Apex from source with its C++ and CUDA extensions.
# TORCH_CUDA_ARCH_LIST: compute capabilities to build for. The complete list
# would be "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0".
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"
RUN <<EOT
#!/bin/bash
. /opt/conda/etc/profile.d/conda.sh
conda activate "${CONDA_ENV_NAME}"
apex_src="${STAGE_DIR}/apex"
git clone https://github.com/NVIDIA/apex "${apex_src}"
cd "${apex_src}"
# pip >= 23.1 accepts several --config-settings that share one key
# (ref: https://pip.pypa.io/en/stable/news/#v23-1).
MAX_JOBS=1 python3 -m pip install \
    -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
    --config-settings "--build-option=--cpp_ext" \
    --config-settings "--build-option=--cuda_ext" \
    ./
# Smoke test: the amp module has to be importable.
python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
# Leave the checkout before deleting it.
cd ..
rm -rf "${apex_src}"
EOT
# Install ColossalAI from a git checkout, with BUILD_EXT=1 set for the build.
# VERSION: branch or tag of hpcaitech/ColossalAI to clone.
ARG VERSION="main"
RUN <<EOT
#!/bin/bash
. /opt/conda/etc/profile.d/conda.sh
conda activate "${CONDA_ENV_NAME}"
git clone --branch "${VERSION}" https://github.com/hpcaitech/ColossalAI.git
cd ./ColossalAI
BUILD_EXT=1 pip install -v .
# NOTE(review): the working directory is the checkout, so this deletes only
# its `colossalai` sub-directory; the rest of the clone stays in the image -
# confirm that this is intended.
rm -rf colossalai
EOT
# Install TensorNVMe from source; libaio-dev is installed first as its build
# prerequisite.
RUN <<EOT
#!/bin/bash
# apt-get rather than apt: the `apt` front end does not guarantee a stable
# command-line interface for use in scripts.
apt-get update -y
apt-get install -y libaio-dev
# Remove the package lists in the layer that fetched them.
rm -rf /var/lib/apt/lists/*
pip install -v --no-cache-dir git+https://github.com/hpcaitech/TensorNVMe.git
EOT
# Mellanox OFED user-space stack.
# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
ENV MLNX_OFED_VERSION=23.10-3.2.2.0
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# Remove any deepspeed that earlier pip installs pulled in as a dependency;
# DeepSpeed is installed from its own checkout in a later step.
python3 -m pip uninstall -y deepspeed
# This has to be run (again) inside the GPU VMs running the tests.
# The installation works here, but some tests fail, if we do not pre-build deepspeed again in the VMs running the tests.
# TODO: Find out why the tests fail.
# The deepspeed install taken from
# https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
# failed here, so the steps follow
# https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile instead.
# install Mellanox OFED
mkdir -p ${STAGE_DIR}
# Unpack inside STAGE_DIR: the cleanup at the end removes the tree from there,
# so extracting into the working directory would leave it behind in the image.
cd ${STAGE_DIR}
wget -q -O - https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
./mlnxofedinstall --user-space-only --without-fw-update --skip-distro-check --without-ucx --without-hcoll --without-openmpi --without-sharp --all --force -q
cd ${STAGE_DIR}
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
EOT
# Build and install the nvidia-peer-memory package from the Mellanox sources.
# NV_PEER_MEM_VERSION: upstream version; NV_PEER_MEM_TAG is the git tag and
# Debian package revision derived from it ("<version>-0").
ARG NV_PEER_MEM_VERSION="1.2"
ENV NV_PEER_MEM_VERSION="${NV_PEER_MEM_VERSION}"
ENV NV_PEER_MEM_TAG="${NV_PEER_MEM_VERSION}-0"
RUN <<EOT
#!/bin/bash
. /opt/conda/etc/profile.d/conda.sh
conda activate "${CONDA_ENV_NAME}"
mkdir -p "${STAGE_DIR}"
peer_src="${STAGE_DIR}/nv_peer_memory"
git clone https://github.com/Mellanox/nv_peer_memory.git --branch "${NV_PEER_MEM_TAG}" "${peer_src}"
cd "${peer_src}"
# Presumably produces the .orig.tar.gz that is unpacked next - verify against upstream.
./build_module.sh
cd "${STAGE_DIR}"
tar xzf "${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz"
cd "${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}"
# dkms has to be present before the package is built and installed.
apt-get update
apt --fix-broken install -y
apt-get install -y dkms
# Build the unsigned .deb and install it.
dpkg-buildpackage -us -uc
dpkg -i "${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb"
EOT
# # install mpi
# ENV OPENMPI_BASEVERSION=4.1
# ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
# ENV PATH=/usr/local/mpi/bin:${PATH}
# ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
# RUN <<EOT
# #!/bin/bash
# source /opt/conda/etc/profile.d/conda.sh
# conda activate ${CONDA_ENV_NAME}
# # OPENMPI
# rm -rf ${STAGE_DIR}
# mkdir -p ${STAGE_DIR}
# cd ${STAGE_DIR}
# wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf -
# cd openmpi-${OPENMPI_VERSION}
# ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION}
# make -j"$(nproc)" install
# ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi
# # Sanity check:
# test -f /usr/local/mpi/bin/mpic++
# cd ${STAGE_DIR}
# rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
# # Create a wrapper for OpenMPI to allow running as root by default
# mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real
# echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun
# echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun
# chmod a+x /usr/local/mpi/bin/mpirun
# EOT
# 29.78 Usage: install.sh [options...]
# 29.78
# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in
# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally
# 29.78
# 29.78 [optional]
# 29.78 -l, --local_only Install only on local machine
# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo)
# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo)
# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels
# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror)
# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile)
# 29.78 -e, --examples Checkout deepspeed example submodule (no install)
# 29.78 -v, --verbose Verbose logging
# 29.78 -h, --help This help text
# Create the `deepspeed` user (uid 1000, bash login shell, home directory),
# add it to the sudo group and allow it passwordless sudo.
RUN <<EOT
#!/bin/bash
. /opt/conda/etc/profile.d/conda.sh
conda activate "${CONDA_ENV_NAME}"
useradd -m -u 1000 -s /bin/bash deepspeed
usermod -a -G sudo deepspeed
echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
EOT
# Install CUTLASS (https://github.com/NVIDIA/cutlass) from source.
#
# GPU -> architecture -> value for CUTLASS_NVCC_ARCHS:
#   Tesla V100       Volta    70
#   Tesla T4         Turing   75
#   A100             Ampere   80
#   GeForce RTX 4090          89
#   H100             Hopper   90a
# (for H100, cmake needs -DCUTLASS_NVCC_ARCHS="90a")
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
# Architecture reference:
# https://github.com/NVIDIA/cutlass/blob/main/media/docs/quickstart.md#building-for-multiple-architectures
#
# DCUTLASS_NVCC_ARCHS: semicolon-separated architectures passed to cmake.
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
ENV DCUTLASS_NVCC_ARCHS="${DCUTLASS_NVCC_ARCHS}"
RUN <<EOT
#!/bin/bash
. /opt/conda/etc/profile.d/conda.sh
conda activate "${CONDA_ENV_NAME}"
git clone https://github.com/NVIDIA/cutlass /opt/cutlass
cd /opt/cutlass
# Discard local modifications, then switch to the master branch.
git checkout .
git checkout master
mkdir build
cd build
# Configure for the requested architectures, without tests, as a unity build.
cmake .. \
    -DCUTLASS_NVCC_ARCHS="${DCUTLASS_NVCC_ARCHS}" \
    -DCUTLASS_ENABLE_TESTS=OFF \
    -DCUTLASS_UNITY_BUILD_ENABLED=ON
make -j"$(nproc)" install
cd ..
# Optional unit tests, left disabled:
# make test_unit -j"$(nproc)"
# make test_unit_gemm_warp -j"$(nproc)"
EOT
# Some Packages from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
# RUN <<EOT
# source /opt/conda/etc/profile.d/conda.sh
# conda activate ${CONDA_ENV_NAME}
# apt-get update
# apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev
# python -m pip install pipdeptree \
# psutil \
# yappi \
# cffi \
# ipdb \
# pandas \
# matplotlib \
# py3nvml \
# pyarrow \
# graphviz \
# astor \
# boto3 \
# tqdm \
# sentencepiece \
# msgpack \
# requests \
# pandas \
# sphinx \
# sphinx_rtd_theme \
# scipy \
# numpy \
# scikit-learn \
# nvidia-ml-py3 \
# mpi4py
# EOT
# DeepSpeed install, step 1: pin setuptools in the conda environment, then
# build oneCCL from source and install it under /usr/local.
RUN <<EOT
#!/bin/bash
. /opt/conda/etc/profile.d/conda.sh
conda activate "${CONDA_ENV_NAME}"
"/opt/conda/envs/${CONDA_ENV_NAME}/bin/python" -m pip install "setuptools==${SETUPTOOLS_VERSION}"
# oneCCL (oneAPI collective communications library) for DeepSpeed
oneccl_src="${STAGE_DIR}/oneCCL"
git clone https://github.com/oneapi-src/oneCCL.git "${oneccl_src}"
cd "${oneccl_src}"
# Discard local modifications, then switch to the master branch.
git checkout .
git checkout master
mkdir build
cd build
cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
make -j"$(nproc)" install
EOT
# DeepSpeed install, step 2: install DeepSpeed-Kernels from source.
# CUDA_ARCH_LIST: semicolon-separated CUDA architectures passed to the build.
ARG CUDA_ARCH_LIST="80;86;89;90"
ENV CUDA_ARCH_LIST="${CUDA_ARCH_LIST}"
RUN <<EOT
#!/bin/bash
. /opt/conda/etc/profile.d/conda.sh
conda activate "${CONDA_ENV_NAME}"
kernels_src="${STAGE_DIR}/DeepSpeed-Kernels"
git clone https://github.com/microsoft/DeepSpeed-Kernels.git "${kernels_src}"
cd "${kernels_src}"
# Alternative kept for reference: build a wheel first, then install it.
# CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python setup.py bdist_wheel
# pip install dist/deepspeed_kernels-*.whl
CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" python -m pip install -v .
EOT
# DeepSpeed build settings. Every ARG is mirrored into an ENV of the same name.
# DEEPSPEED_VERSION:       git ref of microsoft/DeepSpeed that is checked out.
# DEEPSPEED_INSTALL_FLAGS: options handed to DeepSpeed's install.sh.
# DS_BUILD_*:              op-build switches exported to the install environment.
# HOSTFILE_CONTENT:        optional hostfile text; when non-empty it is written
#                          to /tmp/hostfile and passed via --hostfile.
ARG DEEPSPEED_VERSION="v0.14.3"
ENV DEEPSPEED_VERSION="${DEEPSPEED_VERSION}"
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
ENV DEEPSPEED_INSTALL_FLAGS="${DEEPSPEED_INSTALL_FLAGS}"
ARG DS_BUILD_SPARSE_ATTN="0"
ENV DS_BUILD_SPARSE_ATTN="${DS_BUILD_SPARSE_ATTN}"
ARG DS_BUILD_FUSED_ADAM="1"
ENV DS_BUILD_FUSED_ADAM="${DS_BUILD_FUSED_ADAM}"
ARG DS_BUILD_CPU_ADAM="1"
ENV DS_BUILD_CPU_ADAM="${DS_BUILD_CPU_ADAM}"
ARG DS_BUILD_OPS="1"
ENV DS_BUILD_OPS="${DS_BUILD_OPS}"
ARG HOSTFILE_CONTENT=""
ENV HOSTFILE_CONTENT="${HOSTFILE_CONTENT}"
# Locations of the CUTLASS checkout and the CUDA toolkit, plus the matching
# library and binary search paths.
ENV CUTLASS_PATH="/opt/cutlass"
ENV CUDA_HOME="/usr/local/cuda"
ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
ENV PATH="${CUDA_HOME}/bin:${PATH}"
# DeepSpeed install, step 3: run DeepSpeed's own install script from a checkout.
RUN <<EOT
#!/bin/bash
. /opt/conda/etc/profile.d/conda.sh
conda activate "${CONDA_ENV_NAME}"
ds_src="${STAGE_DIR}/DeepSpeed"
git clone https://github.com/microsoft/DeepSpeed.git "${ds_src}"
cd "${ds_src}"
git checkout "${DEEPSPEED_VERSION}"
# Work on a copy of install.sh in which each line's first `pip install`
# is rewritten to `python -m pip install`.
sed 's/pip install/python -m pip install/' install.sh > install_modified.sh
chmod +x ./install_modified.sh
# Assemble the install command; append the hostfile only when content was given.
INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"
if [ -n "${HOSTFILE_CONTENT}" ]; then
    echo "${HOSTFILE_CONTENT}" > /tmp/hostfile
    INSTALL_CMD="${INSTALL_CMD} --hostfile /tmp/hostfile"
fi
eval $INSTALL_CMD
# Print the DeepSpeed environment / op report.
ds_report
# Manual wheel build kept for reference:
# rm -f deepspeed/git_version_info_installed.py
# rm -rf dist build deepspeed.egg-info
# python setup.py bdist_wheel
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt
# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0
EOT
# Install transformers in development mode from a fresh checkout, pin
# pydantic below 2, then install flash-attn.
RUN <<EOT
#!/bin/bash
. /opt/conda/etc/profile.d/conda.sh
conda activate "${CONDA_ENV_NAME}"
# transformers
hf_src="${STAGE_DIR}/transformers"
git clone https://github.com/huggingface/transformers "${hf_src}"
cd "${hf_src}"
python3 ./setup.py develop
python3 -m pip install -U --no-cache-dir "pydantic<2"
# flash-attn, built without build isolation
# pip install packaging -i https://pypi.org/simple/ --trusted-host pypi.org
pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
EOT
# Additional Python packages for the conda environment. The second install
# names pypi.org explicitly as its index.
RUN <<EOT
#!/bin/bash
. /opt/conda/etc/profile.d/conda.sh
conda activate "${CONDA_ENV_NAME}"
pip3 install optimum
pip3 install -i https://pypi.org/simple/ --trusted-host pypi.org \
    peft \
    tiktoken \
    tqdm \
    matplotlib \
    seaborn \
    numpy \
    pandas \
    scikit-learn \
    diffusers \
    huggingface_hub \
    spacy \
    blobfile \
    pycocotools \
    open_clip_torch \
    zstandard \
    mpi4py
EOT
# NCCL settings, overridable at build time and exported to the runtime
# environment.
ARG NCCL_IB_DISABLE="1"
ENV NCCL_IB_DISABLE="${NCCL_IB_DISABLE}"
ARG NCCL_SOCKET_IFNAME="eth0"
ENV NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME}"
# Write ~/.deepspeed_env with one KEY=value entry per line.
RUN <<EOT
#!/bin/bash
printf '%s\n' \
    "NCCL_IB_DISABLE=${NCCL_IB_DISABLE}" \
    "NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME}" \
    "NCCL_DEBUG=INFO" \
    "CUTLASS_PATH=${CUTLASS_PATH}" \
    "CUDA_HOME=${CUDA_HOME}" \
    > ~/.deepspeed_env
# Entries that are intentionally not written:
#CUDA_VISIBLE_DEVICES=0,1,2,3
#OMP_NUM_THREADS=8
#MASTER_ADDR=192.168.1.1
#MASTER_PORT=12345
EOT
# Run the SSH daemon in the foreground as the container's main process.
CMD ["/usr/sbin/sshd", "-D"]