diff --git a/evo/Dockerfile b/evo/Dockerfile new file mode 100644 index 0000000..a1ab221 --- /dev/null +++ b/evo/Dockerfile @@ -0,0 +1,382 @@ +ARG CUDA_VERSION=12.1.0 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ENV MAMBA_ROOT_PREFIX=~/micromamba +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +ENV PATH /opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH +WORKDIR /root +SHELL ["/bin/bash", "-c"] +# base tools +RUN <> ~/.bashrc +echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# 安装 micromamba +echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) +micromamba shell init -s bash -p ~/micromamba +cat <<'EOF' >> ~/.bashrc +source ~/micromamba/etc/profile.d/micromamba.sh +alias mamba=micromamba +alias mba=mamba +EOF +# 配置 .mambarc 文件 +cat < ~/.mambarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ENV REF='main' +ENV STAGE_DIR=/tmp +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.0 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.0 +ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.0 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 +ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... +MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ +python -c "import apex.amp; print('Apex is installed and the amp module is available.')" +cd .. +rm -rf ${STAGE_DIR}/apex +EOT + +RUN <&1 +# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail +# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile +# install deepspeed prepare +# install Mellanox OFED +mkdir -p ${STAGE_DIR} +wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - +cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 +./mlnxofedinstall --user-space-only --without-fw-update --all -q +cd ${STAGE_DIR} +rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* +cd .. +# install nv_peer_mem +rm -rf ${STAGE_DIR} +mkdir -p ${STAGE_DIR} +git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory +cd ${STAGE_DIR}/nv_peer_memory +./build_module.sh +cd ${STAGE_DIR} +tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz +cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} +apt-get update +apt-get install -y dkms +dpkg-buildpackage -us -uc +dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb +EOT + +# install mpi +ENV PATH=/usr/local/mpi/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +RUN < /usr/local/mpi/bin/mpirun +echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun +chmod a+x /usr/local/mpi/bin/mpirun +EOT + +# Some Packages +RUN < ${STAGE_DIR}/sshd_config && \ +sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config +EOT + +# 29.78 Usage: install.sh [options...] +# 29.78 +# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in +# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally +# 29.78 +# 29.78 [optional] +# 29.78 -l, --local_only Install only on local machine +# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) +# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) +# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels +# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) +# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) +# 29.78 -e, --examples Checkout deepspeed example submodule (no install) +# 29.78 -v, --verbose Verbose logging +# 29.78 -h, --help This help text + +RUN <> /etc/sudoers +EOT + +# install cutlass https://github.com/NVIDIA/cutlass +# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) +# A100: architecture is Ampere +# V100: architecture is Volta +# T4: architecture is Turing +# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc +# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 +# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 +# 80:适用于 NVIDIA Ampere 架构(如 A100)。 +# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 +# 89:GeForce RTX 4090 +ARG DCUTLASS_NVCC_ARCHS="89" +ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} +RUN <> ~/.bashrc +echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# 安装 micromamba +echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) +micromamba shell init -s bash -p ~/micromamba +cat <<'EOF' >> ~/.bashrc +source ~/micromamba/etc/profile.d/micromamba.sh +alias mamba=micromamba +alias mba=mamba +EOF +# 配置 .mambarc 文件 +cat < ~/.mambarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ENV REF='main' +ENV STAGE_DIR=/tmp +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.0 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.0 +ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.0 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 +ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... +MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ +python -c "import apex.amp; print('Apex is installed and the amp module is available.')" +cd .. +rm -rf ${STAGE_DIR}/apex +EOT + +RUN <&1 +# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail +# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile +# install deepspeed prepare +# install Mellanox OFED +mkdir -p ${STAGE_DIR} +wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - +cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 +./mlnxofedinstall --user-space-only --without-fw-update --all -q +cd ${STAGE_DIR} +rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* +cd .. +# install nv_peer_mem +rm -rf ${STAGE_DIR} +mkdir -p ${STAGE_DIR} +git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory +cd ${STAGE_DIR}/nv_peer_memory +./build_module.sh +cd ${STAGE_DIR} +tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz +cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} +apt-get update +apt-get install -y dkms +dpkg-buildpackage -us -uc +dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb +EOT + +# install mpi +ENV PATH=/usr/local/mpi/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +RUN < /usr/local/mpi/bin/mpirun +echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun +chmod a+x /usr/local/mpi/bin/mpirun +EOT + +# Some Packages +RUN < ${STAGE_DIR}/sshd_config && \ +sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config +EOT + +# 29.78 Usage: install.sh [options...] +# 29.78 +# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in +# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally +# 29.78 +# 29.78 [optional] +# 29.78 -l, --local_only Install only on local machine +# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) +# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) +# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels +# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) +# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) +# 29.78 -e, --examples Checkout deepspeed example submodule (no install) +# 29.78 -v, --verbose Verbose logging +# 29.78 -h, --help This help text + +RUN <> /etc/sudoers +EOT + +# install cutlass https://github.com/NVIDIA/cutlass +# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) +# A100: architecture is Ampere +# V100: architecture is Volta +# T4: architecture is Turing +# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc +# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 +# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 +# 80:适用于 NVIDIA Ampere 架构(如 A100)。 +# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 +# 89:GeForce RTX 4090 +ARG DCUTLASS_NVCC_ARCHS="89" +ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} +RUN <