# syntax=docker/dockerfile:1
# NOTE: Building this image requires Docker version >= 23.0.
#
# For reference:
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
# Tag of the nvidia/cuda base image used by the FROM lines of this file.
# Override with: --build-arg TAG_VERSION=<tag>
# NOTE(review): published nvidia/cuda tags usually carry a flavor/OS suffix
# (e.g. 12.1.1-devel-ubuntu22.04) — confirm that this default resolves.
ARG TAG_VERSION="12.1.1"

# ---- Stage: apptainerbuilder (compiles Apptainer from source) ----
# "AS" is upper-case to match the casing of FROM (FromAsCasing build check).
FROM nvidia/cuda:${TAG_VERSION} AS apptainerbuilder

# Optional proxy settings, re-exported under their lowercase names.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}

# Frontend used by debconf during package installation (noninteractive by default).
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}

# Install the packages required by the following build steps
# (one package per line, sorted alphabetically; apt lists are removed in the same layer).
RUN apt-get update && apt-get install -y \
        bash \
        gcc \
        git \
        libc-dev \
        libseccomp-dev \
        libssl-dev \
        libuuid1 \
        linux-headers-generic \
        make \
        pkg-config \
        uuid-dev \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Install Go: download the release tarball, unpack it under /usr/local, delete the tarball.
ARG GO_VERSION="1.21.13"
RUN go_tarball="go${GO_VERSION}.linux-amd64.tar.gz" \
    && wget "https://golang.org/dl/${go_tarball}" \
    && tar -C /usr/local -xzf "${go_tarball}" \
    && rm "${go_tarball}"

# Put the Go toolchain on PATH.
ENV PATH="/usr/local/go/bin:${PATH}"

# Build Apptainer from source and install it under /usr/local/apptainer.
#   APPTAINER_COMMITISH  git ref to check out (branch, tag or commit)
#   MCONFIG_OPTIONS      extra options handed to ./mconfig (intentionally unquoted
#                        below so that several options can be passed)
ARG APPTAINER_COMMITISH="main"
ARG MCONFIG_OPTIONS="--with-suid"
WORKDIR /go/src/github.com/apptainer
RUN git clone https://github.com/apptainer/apptainer.git \
    && git -C apptainer checkout "$APPTAINER_COMMITISH" \
    && cd apptainer \
    && ./mconfig $MCONFIG_OPTIONS -p /usr/local/apptainer \
    && make -C builddir \
    && make -C builddir install

# Clean up: remove the download/compile tools and the apt caches of this stage.
# "set -e" stops at the first failing command, like an && chain would.
RUN <<EOT
set -e
apt-get remove -y wget gcc git
apt-get autoremove -y
apt-get clean
EOT

# ---- Final stage ----
FROM nvidia/cuda:${TAG_VERSION}

# Copy the Go toolchain and the Apptainer install tree from the builder stage.
COPY --from=apptainerbuilder /usr/local/go /usr/local/go
COPY --from=apptainerbuilder /usr/local/apptainer /usr/local/apptainer

# GO_PATH is the root of the copied Go toolchain. It needs its own ENV
# instruction because the PATH definition below expands it.
ENV GO_PATH="/usr/local/go"
ENV PATH="/usr/local/apptainer/bin:${GO_PATH}/bin:$PATH" \
    APPTAINER_TMPDIR="/tmp/tmp-apptainer"

# Optional proxy settings, re-exported under their lowercase names.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY} \
    https_proxy=${HTTPS_PROXY}

ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}

# Root password and SSH port used by the SSH configuration step of this stage.
# NOTE(review): ROOT_PASSWD ends up in the image environment and build
# history — confirm this is acceptable for the intended deployment.
ARG ROOT_PASSWD="root"
ARG SSH_PORT=2222
ENV ROOT_PASSWD=${ROOT_PASSWD} \
    SSH_PORT=${SSH_PORT}

WORKDIR /root
SHELL ["/bin/bash", "-c"]

# Base packages, pipx/nvitop, SSH client+server configuration for root, and pixi.
RUN <<EOT
#!/bin/bash
# Heredoc lines are not implicitly chained with &&: stop at the first failing
# command, and let a failing download fail the "curl | bash" pipeline.
set -e
set -o pipefail

apt-get update
# Installed together with their recommended packages.
apt-get install -y \
    bash bash-completion build-essential bzip2 curl htop jq libaio-dev \
    libgl1-mesa-glx openssh-client openssh-server python3 python3-pip \
    python3-venv vim wget
# Installed without recommended packages. software-properties-common provides
# add-apt-repository, so this has to run before the PPA is added.
apt-get install -y --no-install-recommends \
    autotools-dev ca-certificates cmake emacs g++ gcc iftop iotop \
    iputils-ping less libcupti-dev libjpeg-dev libpng-dev libsndfile-dev \
    llvm-dev net-tools nfs-common pdsh re2c rsync screen \
    software-properties-common sudo tmux unzip
# git is installed after adding the git-core PPA.
add-apt-repository ppa:git-core/ppa -y
apt-get install -y git libnuma-dev wget

pip install pipx
pipx install nvitop
pipx ensurepath
# Best effort: pick up whatever ~/.bashrc exports for non-interactive shells.
. ~/.bashrc || true

# Configure SSH for password and public key authentication
mkdir -p ~/.ssh
# Create or overwrite the SSH client config ~/.ssh/config:
# - Host *: applies to every host
# - ForwardAgent yes: enable SSH agent forwarding
# - StrictHostKeyChecking no: do not verify host keys; new host keys are
#   accepted automatically (meant for automated environments)
printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config
cp /etc/ssh/sshd_config /etc/ssh/sshd_config.bak
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/^\(\s*\)GSSAPIAuthentication yes/\1GSSAPIAuthentication no/' /etc/ssh/ssh_config
# Server and client both use ${SSH_PORT}.
sed -i "s/^#Port 22/Port ${SSH_PORT}/" /etc/ssh/sshd_config
# The stock client config comments the port out as "#   Port 22" (several
# spaces), so allow any amount of whitespace after the "#".
sed -i -E "s/^#[[:space:]]*Port 22[[:space:]]*$/    Port ${SSH_PORT}/" /etc/ssh/ssh_config

# Generate a key pair for root and authorize it for logins to this image.
ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa -N "" <<< y
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys2
chmod 600 /root/.ssh/authorized_keys
chmod 600 /root/.ssh/authorized_keys2
mkdir -p /var/run/sshd
echo "root:${ROOT_PASSWD}" | chpasswd

mkdir -p ~/.pip

# install pixi
curl -fsSL https://pixi.sh/install.sh | bash
EOT

# PyTorch / Transformers build settings.
# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
# Every ARG below can be overridden with --build-arg and is mirrored into an
# ENV variable of the same name.
ARG PYTHON_VERSION=3.10
ARG CUDA='cu121'
ARG PYTORCH_VERSION=2.3.1
ARG TORCHVISION_VERSION=0.18.1
ARG TORCHAUDIO_VERSION=2.3.1
ARG PYTORCH_CUDA_VERSION=12.1
ARG SETUPTOOLS_VERSION=69.5.1
ARG USE_CUDA=1
ARG USE_ROCM=0
ARG USE_XPU=0
ARG _GLIBCXX_USE_CXX11_ABI=1

ENV PYTHON_VERSION=${PYTHON_VERSION} \
    CUDA=${CUDA} \
    PYTORCH_VERSION=${PYTORCH_VERSION} \
    TORCHVISION_VERSION=${TORCHVISION_VERSION} \
    TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} \
    PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} \
    SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} \
    USE_CUDA=${USE_CUDA} \
    USE_ROCM=${USE_ROCM} \
    USE_XPU=${USE_XPU} \
    _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}

# /usr/bin is searched first, then /opt/conda/bin, then the previous PATH.
ENV PATH=/usr/bin:/opt/conda/bin:$PATH

# REF: git ref of huggingface/transformers to check out.
# STAGE_DIR: scratch directory for source checkouts and builds.
ENV REF='main' \
    STAGE_DIR=/tmp

# Python stack: Transformers (checked out at ${REF}), pinned PyTorch CUDA wheels, Accelerate.
RUN <<EOT
#!/bin/bash
# Heredoc lines are not implicitly chained with &&: stop at the first failure.
set -e

python3 -m pip install --no-cache-dir --upgrade pip
python3 -m pip install --no-cache-dir open_clip_torch nvidia-ml-py3 opencv-contrib-python

# Nothing in this file installs conda, so only clean its caches when it exists.
if command -v conda >/dev/null 2>&1; then
    conda clean -afy
fi

git clone https://github.com/huggingface/transformers
git -C transformers checkout "${REF}"
# python3 is the interpreter this file installs; use it consistently.
python3 -m pip install --no-cache-dir "setuptools==${SETUPTOOLS_VERSION}"
# Quoted so that the shell does not treat [...] as a glob pattern.
python3 -m pip install --no-cache-dir "./transformers[deepspeed-testing]"

# PyTorch must be installed before pre-compiling any DeepSpeed C++/CUDA ops
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops),
# so replace any torch packages pulled in so far with the pinned CUDA builds.
python3 -m pip uninstall -y torch torchvision torchaudio
# Mirror hint: in the PyTorch install instructions, https://download.pytorch.org/whl
# can be replaced with https://mirror.sjtu.edu.cn/pytorch-wheels.
python3 -m pip install --no-cache-dir \
    "torch==${PYTORCH_VERSION}+${CUDA}" \
    "torchvision==${TORCHVISION_VERSION}+${CUDA}" \
    "torchaudio==${TORCHAUDIO_VERSION}" \
    xformers \
    --extra-index-url "https://download.pytorch.org/whl/${CUDA}"

python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

# Make sure these packages are not present.
python3 -m pip uninstall -y transformer-engine
python3 -m pip uninstall -y torch-tensorrt
python3 -m pip uninstall -y apex
EOT

# Install NVIDIA Apex with its C++ and CUDA extensions.
# TORCH_CUDA_ARCH_LIST is the list of GPU architectures to build for; the full
# list would be "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0".
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0"
ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
RUN <<EOT
#!/bin/bash
# Stop at the first failure; otherwise only the final "rm" decides the result
# of this step and a broken Apex build goes unnoticed.
set -e
git clone https://github.com/NVIDIA/apex "${STAGE_DIR}/apex"
cd "${STAGE_DIR}/apex"
# pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) supports
# multiple `--config-settings` with the same key.
MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# Import check, run with the same interpreter that performed the install.
python3 -c "import apex.amp; print('Apex is installed and the amp module is available.')"
cd ..
rm -rf "${STAGE_DIR}/apex"
EOT

# Mellanox OFED (user-space only).
# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
ENV MLNX_OFED_VERSION=23.10-3.2.2.0
RUN <<EOT
#!/bin/bash
# Stop at the first failure, including a failed download in the wget | tar pipeline.
set -e
set -o pipefail

# Remove any deepspeed that is already installed.
python3 -m pip uninstall -y deepspeed
# Notes kept from the reference Dockerfiles:
# - Pre-building DeepSpeed has to be run (again) inside the GPU VMs running the
#   tests; the installation works here, but some tests fail otherwise.
#   TODO: Find out why the tests fail.
# - Installing via
#     DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} DS_BUILD_FUSED_ADAM={DS_BUILD_FUSED_ADAM} python3 -m pip install "deepspeed<=0.14.0" --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
#   (from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile)
#   failed, so the DeepSpeed install follows
#   https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile instead.

# install Mellanox OFED
mkdir -p "${STAGE_DIR}"
# Download and unpack inside STAGE_DIR so that the cleanup below removes the
# unpacked installer instead of leaving it in the working directory.
cd "${STAGE_DIR}"
ofed_dir="MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64"
wget -q -O - "http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/${ofed_dir}.tgz" | tar xzf -
cd "${ofed_dir}"
./mlnxofedinstall --user-space-only --without-fw-update --all -q
cd "${STAGE_DIR}"
rm -rf "${STAGE_DIR}/${ofed_dir}"*
EOT

# Mellanox nv_peer_memory: built from the ${NV_PEER_MEM_TAG} tag and installed
# as a Debian package.
ARG NV_PEER_MEM_VERSION="1.2"
ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION}
# The git tag is "<version>-0"; it expands the ENV defined on the previous line.
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
RUN <<EOT
#!/bin/bash
# Stop at the first failure (heredoc lines are not implicitly chained with &&).
set -e

# install nv_peer_mem
mkdir -p "${STAGE_DIR}"
git clone https://github.com/Mellanox/nv_peer_memory.git --branch "${NV_PEER_MEM_TAG}" "${STAGE_DIR}/nv_peer_memory"
cd "${STAGE_DIR}/nv_peer_memory"
# Presumably produces the .orig.tar.gz unpacked below — TODO confirm.
./build_module.sh
cd "${STAGE_DIR}"
tar xzf "${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz"
cd "${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}"
# apt-get rather than apt: apt's command line is not meant for scripts.
apt-get update
apt-get --fix-broken install -y
apt-get install -y dkms
dpkg-buildpackage -us -uc
dpkg -i "${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb"
EOT

# Install prefixes of UCX and CUDA ...
ENV UCX_HOME=/usr/local/ucx \
    CUDA_HOME=/usr/local/cuda
# ... prepended to the executable and shared-library search paths.
ENV PATH=${CUDA_HOME}/bin:${UCX_HOME}/bin:$PATH \
    LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${UCX_HOME}/lib:$LD_LIBRARY_PATH

# Build UCX from source (master branch) and install it into ${UCX_HOME}.
RUN <<EOT
#!/bin/bash
# Stop at the first failure (heredoc lines are not implicitly chained with &&).
set -e

# Debugging tools (gdb, valgrind) and the UCX build prerequisites. The build
# already runs as root, so sudo is not needed; autoconf/automake/libtool are
# listed explicitly because ./autogen.sh needs them.
apt-get update
apt-get install -y \
    autoconf automake build-essential gdb libfuse3-dev libnuma-dev libtool \
    pkg-config valgrind
# Optional extras, kept for reference:
# apt-get install -y openmpi-bin openmpi-common openmpi-doc openmpi-debug libopenmpi-dev
# apt-get install -y libucx0-dbg libucs0-dbg libucm0-dbg libuct0-dbg libibverbs1-dbg librdmacm1-dbg libmlx5-1-dbg

git clone https://github.com/openucx/ucx.git
cd ucx
# Tracks master; a fixed release can be used instead, e.g.: git checkout v1.15.0
git checkout master
./autogen.sh
mkdir build
cd build
../contrib/configure-release --prefix="${UCX_HOME}" \
    --with-cuda="${CUDA_HOME}" \
    --with-mlx5 \
    --with-go="${GO_PATH}" \
    --with-rc \
    --with-ud \
    --with-dc \
    --with-dm \
    --with-verbs
make -j"$(nproc)"
make install

# Handy checks once the image is running:
#   ucx_info -a
#   performance test:
#     ucx_perftest -d <device> -t bw -p <protocol> -n <num_iterations>
#   read the UCX profile:
#     ucx_read_profile
#   run an MPI program over UCX:
#     mpirun -np 2 -mca pml ucx -x UCX_NET_DEVICES=mlx5_0:1 ./your_mpi_program
#   CUDA support check:
#     ucx_info -c
#     ucx_info -d
#     ompi_info | grep ucx
EOT

# install mpi: version, install prefix and search paths for Open MPI.
ENV OPENMPI_BASEVERSION=4.1 \
    MPI_HOME=/usr/local/openmpi
# These expand the variables defined above, so they need their own instruction.
# PATH order: ${MPI_HOME}/bin, /usr/bin, /usr/local/mpi/bin, then the previous PATH.
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 \
    PATH=${MPI_HOME}/bin:/usr/bin:/usr/local/mpi/bin:${PATH} \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:${MPI_HOME}/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH \
    LIBRARY_PATH=/usr/local/cuda/lib64:${LIBRARY_PATH} \
    CPATH=/usr/local/cuda/include:${MPI_HOME}/include:${CUDA_HOME}/include:$CPATH

# Build Open MPI v${OPENMPI_VERSION} against CUDA and UCX, install it into
# ${MPI_HOME}, and wrap mpirun so that it can be run as root.
RUN <<EOT
#!/bin/bash
# Stop at the first failure (heredoc lines are not implicitly chained with &&).
set -e

apt-get update
apt-get install -y autoconf automake libtool flex
/usr/bin/python3 -m pip install cython

# Clone the release tag directly so that the checked-out submodules belong to
# that tag rather than to the default branch.
git clone --recursive --branch "v${OPENMPI_VERSION}" https://github.com/open-mpi/ompi.git
cd ompi
./autogen.pl
mkdir build
cd build
../configure --with-cuda=/usr/local/cuda --without-hcoll --enable-python-bindings --enable-mpirun-prefix-by-default --prefix="${MPI_HOME}" --with-ucx="${UCX_HOME}" --enable-mca-dso=btl-smcuda,rcache-rgpusm,rcache-gpusm,accelerator-cuda --enable-mca-no-build=btl-uct --with-python=/usr/bin/python3
make -j"$(nproc)"
make install

# /usr/local/mpi is an alias for the install prefix.
ln -s "${MPI_HOME}" /usr/local/mpi
test -f /usr/local/mpi/bin/mpic++

# Create a wrapper for OpenMPI to allow running as root by default
mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun
chmod a+x /usr/local/mpi/bin/mpirun
EOT

# Usage text of the DeepSpeed install.sh script, copied from a build log, for reference:
#
#   Usage: install.sh [options...]
#
#   By default will install deepspeed and all third party dependencies across all machines listed in
#   hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally
#
#   [optional]
#     -l, --local_only    Install only on local machine
#     -s, --pip_sudo      Run pip install with sudo (default: no sudo)
#     -r, --allow_sudo    Allow script to be run by root (probably don't want this, instead use --pip_sudo)
#     -n, --no_clean      Do not clean prior build state, by default prior build files are removed before building wheels
#     -m, --pip_mirror    Use the specified pip mirror (default: the default pip mirror)
#     -H, --hostfile      Path to MPI-style hostfile (default: /job/hostfile)
#     -e, --examples      Checkout deepspeed example submodule (no install)
#     -v, --verbose       Verbose logging
#     -h, --help          This help text

# Create the "deepspeed" user (UID 1000, bash login shell, home directory),
# add it to the sudo group and grant it passwordless sudo.
RUN <<EOT
#!/bin/bash
useradd -m -u 1000 -s /bin/bash deepspeed
usermod --append --groups sudo deepspeed
printf '%s\n' 'deepspeed ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers
EOT

# install cutlass https://github.com/NVIDIA/cutlass
#
# GPU architecture notes:
#   H100 -> Hopper (CUTLASS needs: cmake .. -DCUTLASS_NVCC_ARCHS="90a")
#   A100 -> Ampere
#   V100 -> Volta
#   T4   -> Turing
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
#
# Values for the architecture list:
#   70   NVIDIA Volta  (e.g. Tesla V100)
#   75   NVIDIA Turing (e.g. Tesla T4)
#   80   NVIDIA Ampere (e.g. A100)
#   90a  NVIDIA Hopper (e.g. H100)
#   89   GeForce RTX 4090
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
ENV DCUTLASS_NVCC_ARCHS="${DCUTLASS_NVCC_ARCHS}"

# Clone CUTLASS into /opt/cutlass, then configure, build and install it for the
# architectures listed in DCUTLASS_NVCC_ARCHS.
RUN <<EOT
#!/bin/bash
cutlass_src=/opt/cutlass
git clone https://github.com/NVIDIA/cutlass "${cutlass_src}"
cd "${cutlass_src}"
git checkout .
git checkout master
mkdir build
cd build
# Tests are disabled and unity builds enabled.
cmake .. \
    -DCUTLASS_NVCC_ARCHS="${DCUTLASS_NVCC_ARCHS}" \
    -DCUTLASS_ENABLE_TESTS=OFF \
    -DCUTLASS_UNITY_BUILD_ENABLED=ON
make -j"$(nproc)" install
# Optional unit-test targets:
#   make test_unit -j"$(nproc)"
#   make test_unit_gemm_warp -j"$(nproc)"
cd ..
EOT

# install deepspeed step 1: build oneCCL (master) and install it under /usr/local.
RUN <<EOT
#!/bin/bash
# Stop at the first failure (heredoc lines are not implicitly chained with &&).
set -e

# Nothing in this file installs conda, so load its shell hooks only if they exist.
if [ -f /opt/conda/etc/profile.d/conda.sh ]; then
    source /opt/conda/etc/profile.d/conda.sh
fi
/usr/bin/python3 -m pip install "setuptools==${SETUPTOOLS_VERSION}"

# install oneapi for deepspeed
git clone https://github.com/oneapi-src/oneCCL.git "${STAGE_DIR}/oneCCL"
cd "${STAGE_DIR}/oneCCL"
git checkout master
mkdir build
cd build
cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
make -j"$(nproc)" install
EOT

# install deepspeed step 2: DeepSpeed-Kernels, built for the architectures in CUDA_ARCH_LIST.
ARG CUDA_ARCH_LIST="80;86;89;90"
ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
RUN <<EOT
#!/bin/bash
# Stop at the first failure (heredoc lines are not implicitly chained with &&).
set -e
git clone https://github.com/microsoft/DeepSpeed-Kernels.git "${STAGE_DIR}/DeepSpeed-Kernels"
cd "${STAGE_DIR}/DeepSpeed-Kernels"
# Alternative (wheel build), kept for reference:
#   CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python3 setup.py bdist_wheel
#   pip install dist/deepspeed_kernels-*.whl
# python3 is the interpreter this file installs and uses for the other pip steps.
CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" python3 -m pip install -v .
EOT

# DeepSpeed build settings; every ARG is mirrored into an ENV of the same name.
#   DEEPSPEED_VERSION        git ref of microsoft/DeepSpeed to check out
#   DEEPSPEED_INSTALL_FLAGS  flags passed to DeepSpeed's install script
#   DS_BUILD_*               DeepSpeed build switches
#   HOSTFILE_CONTENT         optional hostfile content for the install script
ARG DEEPSPEED_VERSION="v0.14.3"
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
ARG DS_BUILD_SPARSE_ATTN=0
ARG DS_BUILD_FUSED_ADAM=1
ARG DS_BUILD_CPU_ADAM=1
ARG DS_BUILD_OPS=1
ARG HOSTFILE_CONTENT=""

ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN} \
    DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM} \
    DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} \
    DS_BUILD_OPS=${DS_BUILD_OPS}
ENV HOSTFILE_CONTENT=${HOSTFILE_CONTENT}

# Locations of the CUTLASS checkout and the CUDA toolkit.
ENV CUTLASS_PATH='/opt/cutlass' \
    CUDA_HOME='/usr/local/cuda'
# Both expand CUDA_HOME, so they follow in a separate instruction.
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} \
    PATH=${CUDA_HOME}/bin:${PATH}

# install deepspeed step 3: check out DeepSpeed at ${DEEPSPEED_VERSION} and run
# its install script with ${DEEPSPEED_INSTALL_FLAGS}.
RUN <<EOT
#!/bin/bash
# Stop at the first failure (heredoc lines are not implicitly chained with &&).
set -e
git clone https://github.com/microsoft/DeepSpeed.git "${STAGE_DIR}/DeepSpeed"
cd "${STAGE_DIR}/DeepSpeed"
git checkout "${DEEPSPEED_VERSION}"

# Work on a copy of install.sh in which "pip install" is replaced by "python -m pip install".
# NOTE(review): this relies on a "python" executable being on PATH — confirm.
sed 's/pip install/python -m pip install/' install.sh > install_modified.sh
chmod +x ./install_modified.sh

# Split the flag string on whitespace into an argument array. Unlike
# "eval $CMD", nothing inside the build arguments is re-interpreted by the shell.
read -r -a install_args <<< "${DEEPSPEED_INSTALL_FLAGS}"
# If HOSTFILE_CONTENT is set, write it to a file and pass it to the script.
if [ -n "${HOSTFILE_CONTENT}" ]; then
    echo "${HOSTFILE_CONTENT}" > /tmp/hostfile
    install_args+=(--hostfile /tmp/hostfile)
fi
./install_modified.sh "${install_args[@]}"

# ds_report has to succeed for this step to pass.
ds_report

# Alternative manual install, kept for reference:
#   rm -f deepspeed/git_version_info_installed.py
#   rm -rf dist build deepspeed.egg-info
#   python setup.py bdist_wheel
#   DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl
#   DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt
#   pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0
EOT

# install transformers (development install from a fresh clone) and flash-attn
RUN <<EOT
#!/bin/bash
# transformers: clone into the staging directory and install in develop mode
transformers_src="${STAGE_DIR}/transformers"
git clone https://github.com/huggingface/transformers "${transformers_src}"
cd "${transformers_src}"
python3 ./setup.py develop
python3 -m pip install -U --no-cache-dir "pydantic<2"

# flash-attn, from pypi.org, without build isolation
# (optional first: pip install packaging -i https://pypi.org/simple/ --trusted-host pypi.org)
pip install --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org flash-attn
EOT

# other packages: optimum first, then the remaining libraries from pypi.org
RUN <<EOT
#!/bin/bash
pip3 install optimum
pip3 install -i https://pypi.org/simple/ --trusted-host pypi.org \
    peft \
    tiktoken \
    tqdm \
    matplotlib \
    seaborn \
    numpy \
    pandas \
    scikit-learn \
    diffusers \
    huggingface_hub \
    spacy \
    blobfile \
    pycocotools \
    open_clip_torch \
    zstandard \
    mpi4py
EOT

# NCCL settings, overridable at build time and mirrored into the image environment.
ARG NCCL_IB_DISABLE="1"
ARG NCCL_SOCKET_IFNAME="eth0"
ENV NCCL_IB_DISABLE=${NCCL_IB_DISABLE} \
    NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME}

# deepspeed env: write ~/.deepspeed_env, one KEY=value per line.
RUN <<EOT
#!/bin/bash
printf '%s\n' \
    "NCCL_IB_DISABLE=${NCCL_IB_DISABLE}" \
    "NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME}" \
    "NCCL_DEBUG=INFO" \
    "CUTLASS_PATH=${CUTLASS_PATH}" \
    "CUDA_HOME=${CUDA_HOME}" \
    > ~/.deepspeed_env
# Further entries that could be added:
#   CUDA_VISIBLE_DEVICES=0,1,2,3
#   OMP_NUM_THREADS=8
#   MASTER_ADDR=192.168.1.1
#   MASTER_PORT=12345
EOT

# Document the port that sshd_config was set to during the build. EXPOSE is
# informational; the port still has to be published with -p / -P.
EXPOSE ${SSH_PORT}
# Default command: run sshd in the foreground (-D) as the container's main process.
CMD ["/usr/sbin/sshd", "-D"]