# --- [PATCH 1/3] "add ldh" (lingyuzeng, 2024-07-12) — new file: finetune/Dockerfile.ldh ---
# (Reconstructed from a whitespace-collapsed git format-patch; diff framing dropped.)

# FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04

# NOTE(review): baking DEBIAN_FRONTEND into ENV leaks it into the runtime
# image; kept for fidelity, but an ARG (or per-RUN inline var) is cleaner.
ENV DEBIAN_FRONTEND="noninteractive"

# Scratch area used by the download/build steps below.
ENV STAGE_DIR="/tmp"
RUN mkdir -p ${STAGE_DIR}

ENV CUTLASS_PATH="/opt/cutlass"
ARG CONDA_ENV_NAME="deepspeed"

ENV CUDA_HOME="/usr/local/cuda"
ENV PATH=${CUDA_HOME}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}

SHELL ["/bin/bash", "-c"]

WORKDIR /root

# Base build/dev tooling.
# FIX: `lsb-release` added — the cmake-repo step below calls `lsb_release -cs`,
# which is not present in the bare nvidia/cuda Ubuntu 22.04 base image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        software-properties-common build-essential autotools-dev \
        nfs-common pdsh \
        cmake g++ gcc \
        curl wget vim tmux emacs less unzip \
        htop iftop iotop ca-certificates openssh-client openssh-server \
        rsync iputils-ping net-tools sudo \
        lsb-release \
        llvm-dev && \
    apt-get install -y git python3 python3-pip && \
    apt-get install -y --no-install-recommends \
        libsndfile-dev \
        libcupti-dev \
        libjpeg-dev \
        libpng-dev \
        screen \
        libaio-dev

# RDMA / InfiniBand userspace stack plus assorted packaging/build deps.
RUN apt-get update && \
    apt-get install -y lsof swig libmnl0 libltdl-dev libfuse2 udev tcl libgfortran5 \
        graphviz ethtool bison libpci3 kmod pciutils dpatch libnl-route-3-200 libusb-1.0-0 \
        tk m4 autoconf debhelper flex gfortran libnl-route-3-dev automake libnl-3-dev chrpath && \
    apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags && \
    apt-get install -y quilt python3-distutils

# install latest cmake from the Kitware apt repository.
# FIX: dropped the pointless `sudo` — every RUN already executes as root.
# NOTE(review): apt-key is deprecated on Ubuntu 22.04 but still functional.
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc | apt-key add - && \
    apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
    apt-get update && \
    apt-get install -y cmake

# Install Miniconda
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
    /bin/bash ~/miniconda.sh -b -p /opt/conda -u && \
    rm ~/miniconda.sh
# Add conda to PATH and initialize conda
ENV PATH=/opt/conda/bin:${PATH}
RUN /opt/conda/bin/conda init bash

# Create and activate a conda environment
RUN /opt/conda/bin/conda create -n ${CONDA_ENV_NAME} python=3.10 -y && \
    echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc && \
    /bin/bash -c "source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME}"
ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:${PATH}

# install cutlass https://github.com/NVIDIA/cutlass
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/NVIDIA/cutlass /opt/cutlass && \
    cd /opt/cutlass && \
    git checkout main && \
    mkdir build && \
    cd build && \
    cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON && \
    make -j"$(nproc)" install

# Mellanox OFED (user-space only)
# FIX: the archive is streamed through tar, so no .tgz ever lands on disk;
# clean up the extracted directory (glob covers it) instead of a
# nonexistent tarball only.
RUN apt-get install -y libnuma-dev libnvidia-compute-515 && \
    cd ${STAGE_DIR} && \
    wget -q -O - https://content.mellanox.com/ofed/MLNX_OFED-5.8-4.1.5.0/MLNX_OFED_LINUX-5.8-4.1.5.0-ubuntu22.04-x86_64.tgz | tar xzf - && \
    cd MLNX_OFED_LINUX-5.8-4.1.5.0-ubuntu22.04-x86_64 && \
    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-5.8-4.1.5.0-ubuntu22.04-x86_64*

# nv_peer_mem (GPUDirect RDMA kernel-module packaging)
ENV NV_PEER_MEM_VERSION=1.2
# ENV NV_PEER_MEM_VERSION=1.3
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
RUN mkdir -p ${STAGE_DIR} && \
    git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
    cd ${STAGE_DIR}/nv_peer_memory && \
    ./build_module.sh && \
    cd ${STAGE_DIR} && \
    tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
    cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
    apt-get update && \
    apt-get install -y dkms && \
    dpkg-buildpackage -us -uc && \
    dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb

# OPENMPI
# ENV OPENMPI_BASEVERSION=4.1
# ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
ENV OPENMPI_BASEVERSION=5.0
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.3
RUN cd ${STAGE_DIR} && \
    wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
    cd openmpi-${OPENMPI_VERSION} && \
    ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
    make -j"$(nproc)" install && \
    ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
    # Sanity check:
    test -f /usr/local/mpi/bin/mpic++ && \
    cd ${STAGE_DIR} && \
    rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
ENV PATH=/usr/local/mpi/bin:${PATH} \
    LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
# Create a wrapper for OpenMPI to allow running as root by default.
# FIX: dropped the pointless conda activation — this step only moves files.
RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
    echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
    echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
    chmod a+x /usr/local/mpi/bin/mpirun

ENV PYTORCH_VERSION=2.3.0
ENV TORCHVISION_VERSION=0.18.0
ENV TORCHAUDIO_VERSION=2.3.0
ENV PYTORCH_CUDA_VERSION='cu121'

# PyTorch stack + common ML/NLP tooling into the conda env.
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    pip install torch==${PYTORCH_VERSION}+${PYTORCH_CUDA_VERSION} torchvision==${TORCHVISION_VERSION}+${PYTORCH_CUDA_VERSION} torchaudio==${TORCHAUDIO_VERSION}+${PYTORCH_CUDA_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} && \
    pip install packaging && \
    pip install flash-attn && \
    pip install deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub && \
    pip install tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy Pillow blobfile requests scipy pycocotools protobuf pyyaml ipython psutil pydantic

# Install apex with CUDA and C++ extensions
# (pip >= 23.1 --config-settings form; the legacy --global-option fallback
# from the original was dead commented-out code and has been dropped.)
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/NVIDIA/apex /tmp/apex && \
    cd /tmp/apex && \
    pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \
    python -c "import apex.amp; print('Apex is installed and the amp module is available.')" && \
    cd / && \
    rm -rf /tmp/apex

# sshd: password root login on custom port 22242.
RUN echo 'root:root' | chpasswd && \
    cp /etc/ssh/sshd_config /tmp/sshd_config && \
    echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config && \
    sed -i "s/#Port 22/Port 22242/" /etc/ssh/sshd_config && \
    sed -i "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
    sed -i "s/#PasswordAuthentication yes/PasswordAuthentication yes/" /etc/ssh/sshd_config && \
    sed -i "s/#PubkeyAuthentication yes/PubkeyAuthentication yes/" /etc/ssh/sshd_config && \
    sed -i "s/UsePAM yes/UsePAM no/" /etc/ssh/sshd_config && \
    chown root:root /etc/ssh/sshd_config && \
    mkdir -p /run/sshd && chmod 0755 /run/sshd

# Prepend the build-time env exports to ~/.bashrc for interactive SSH shells.
RUN bash -c 'echo -e "export CUTLASS_PATH=${CUTLASS_PATH}\nexport CUDA_HOME=${CUDA_HOME}\nexport PATH=${PATH}\nexport LD_LIBRARY_PATH=${LD_LIBRARY_PATH}\n" | cat - ~/.bashrc > temp && mv temp ~/.bashrc'

# --- finetune/docker-compose_ldh.yml (head; file continues in the next chunk) ---
# services:
#   ldh-deepspeed-test:
#     build:
#       context: .
# --- finetune/docker-compose_ldh.yml (continued) ---
      dockerfile: Dockerfile.ldh
    image: ldh/deepspeed:test
    shm_size: '32gb'
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    runtime: nvidia
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    stdin_open: true
    tty: true
    privileged: true
    cap_add:
      - IPC_LOCK
    devices:
      - /dev/infiniband:/dev/infiniband
    volumes:
      - /mnt/local-nvme2:/root/data/local-nvme2
      - /mnt/local-nvme:/root/data/local-nvme
    # ports:
    #   - "22242:22242"
    #   - "5000:5000"
    # networks:
    #   - ldh_overlay_network
    network_mode: host
    # command: ["/usr/sbin/sshd", "-D"]
    # command: ["/usr/sbin/sshd"]

# networks:
#   ldh_overlay_network:
#     external: true

# --- [PATCH 2/3] "update" (lingyuzeng, 2024-07-12) — new file: finetune/docker-compose_pytorch2.34060.yml ---
version: '3.8'

# DeepSpeed ships a number of C++/CUDA extension ops that optimize deep
# learning training and inference. The main ones are:
#   FusedAdam            - fused Adam optimizer for GPU.
#   FusedLamb            - fused LAMB optimizer for large-scale distributed training.
#   SparseAttention      - efficient sparse attention computation.
#   Transformer          - optimized Transformer implementation.
#   TransformerInference - inference-optimized Transformer kernels.
#   CPUAdam              - Adam optimizer optimized for CPU.
#   CPULion              - Lion optimizer for CPU.
#   Quantizer            - quantization support (smaller models, faster inference).
#   RandomLTD            - random layer-token dropping.
#   StochasticTransformer - stochastic Transformer training/inference.

services:
  ubuntu-finetune:
    build:
      context: .
      dockerfile: Dockerfile
      # PyTorch / Python / pytorch_lightning compatibility table:
      # https://blog.csdn.net/qq_41813454/article/details/137421822
      args:
        PYTHON_VERSION: "3.10"
        CUDA_VERSION: "12.1.0"
        PYTORCH_VERSION: "2.3.0"
        TORCHVISION_VERSION: "0.18.0"
        TORCHAUDIO_VERSION: "2.3.0"
        DS_BUILD_OPS: 1
        USE_CUDA: 1
        USE_ROCM: 0
        USE_XPU: 0
        CUDA: cu121
        CUDA_ARCH_LIST: "80;86;89;90"         # for RTX 4090; all: "80;86;89;90"
        SETUPTOOLS_VERSION: "69.5.1"
        DCUTLASS_NVCC_ARCHS: "80;86;89;90"    # 90a for H100 GPU; 89: GeForce RTX 4090
        DEEPSPEED_VERSION: "master"
        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
    volumes:
      - ./src:/bbtft
    container_name: ubuntu-finetune
    pull_policy: if_not_present
    # tty: true
    restart: unless-stopped
    image: hotwa/deepspeed:pt23
    shm_size: '32gb'
    ports:
      - 3228:22
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - TMPDIR=/var/tmp
    networks:
      - network_finetune
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

networks:
  network_finetune:
    name: network_finetune

# --- [PATCH 3/3] "&&" (lingyuzeng, 2024-07-12) — new file: finetune/Dockerfile.o ---
# syntax=docker/dockerfile:1
#
# NOTE: Building this image requires docker version >= 23.0.
#
# For reference:
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
ARG CUDA_VERSION=12.1.0
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
ENV MAMBA_ROOT_PREFIX=~/micromamba
ARG ROOT_PASSWD="root"
ENV ROOT_PASSWD=${ROOT_PASSWD}
WORKDIR /root
SHELL ["/bin/bash", "-c"]
COPY id_rsa.pub /root/.ssh/id_rsa.pub

# base tools, sshd configuration, and Miniconda.
# FIX: `wget -qO- … -O file` mixed stdout and file output flags; use one.
# FIX: `conda init bash` invoked via its absolute path — /opt/conda/bin is not
#      on PATH at this point (the ENV PATH update only comes later).
# FIX: dropped redundant `sudo` (all RUNs execute as root).
RUN apt-get update && \
    apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags && \
    apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2 && \
    apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c && \
    add-apt-repository ppa:git-core/ppa -y && \
    apt-get install -y git libnuma-dev wget && \
    wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc | apt-key add - && \
    apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
    apt-get update && \
    apt-get install -y cmake && \
    sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
    sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config && \
    sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config && \
    sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config && \
    mkdir /var/run/sshd && \
    echo "root:${ROOT_PASSWD}" | chpasswd && \
    mkdir -p ~/.pip && \
    wget -q -O /tmp/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash /tmp/miniconda.sh -b -p /opt/conda && \
    rm /tmp/miniconda.sh && \
    /opt/conda/bin/conda init bash && \
    ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
    echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc

# FIX: the original wrote ~/.condarc with `cat < ~/.condarc` plus a bare
# here-doc body, which cannot work inside a backslash-continued RUN (the body
# lines would parse as unknown Dockerfile instructions). Written as a proper
# BuildKit COPY here-document instead (enabled by the syntax directive).
COPY <<'EOF' /root/.condarc
channels:
  - conda-forge
  - bioconda
  - pytorch
  - pytorch-nightly
  - nvidia
  - defaults
show_channel_urls: true
EOF

# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
# PyTorch
ARG CONDA_ENV_NAME="deepspeed"
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
ARG PYTHON_VERSION=3.10
ENV PYTHON_VERSION=${PYTHON_VERSION}
ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH
ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3"
ENV REF='main'
ENV STAGE_DIR=/tmp
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
ARG CUDA='cu121'
ENV CUDA=${CUDA}
ARG PYTORCH_VERSION=2.3.1
ENV PYTORCH_VERSION=${PYTORCH_VERSION}
ARG TORCHVISION_VERSION=0.18.1
ENV TORCHVISION_VERSION=${TORCHVISION_VERSION}
ARG TORCHAUDIO_VERSION=2.3.1
ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION}
ARG PYTORCH_CUDA_VERSION=12.1
ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION}
ARG SETUPTOOLS_VERSION=69.5.1
ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
ARG USE_CUDA=1
ENV USE_CUDA=${USE_CUDA}
ARG USE_ROCM=0
ENV USE_ROCM=${USE_ROCM}
ARG USE_XPU=0
ENV USE_XPU=${USE_XPU}
ARG _GLIBCXX_USE_CXX11_ABI=1
ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}

# Conda env, ninja (with gtest), transformers stack, pinned PyTorch wheels.
# FIX: dropped the first plain `python ./configure.py --bootstrap` — it was
#      immediately redone by the conda-run invocation with gtest support.
# FIX: torchaudio pinned with +${CUDA} like its sibling packages, so the CUDA
#      wheel is selected instead of whatever the extra index resolves first.
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} -c conda-forge -y && \
    echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc && \
    which python > ~/python_path.txt && \
    conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja && \
    cd ${STAGE_DIR}/ninja && \
    git clone https://github.com/google/googletest.git && \
    conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest" && \
    ./ninja all && \
    ./ninja_test && \
    python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python && \
    conda clean -afy && \
    git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. && \
    python -m pip install setuptools==${SETUPTOOLS_VERSION} && \
    python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] && \
    python3 -m pip uninstall -y torch torchvision torchaudio && \
    python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION}+${CUDA} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA} && \
    python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate && \
    python3 -m pip uninstall -y transformer-engine && \
    python3 -m pip uninstall -y torch-tensorrt && \
    python3 -m pip uninstall -y apex

# install apex
# FIX: the repo is cloned to ${STAGE_DIR}/apex but the original `cd apex` was
# relative to WORKDIR /root and would fail; cd to the clone path instead.
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/NVIDIA/apex ${STAGE_DIR}/apex && \
    cd ${STAGE_DIR}/apex && \
    MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \
    python -c "import apex.amp; print('Apex is installed and the amp module is available.')" && \
    cd / && \
    rm -rf ${STAGE_DIR}/apex

# Mellanox OFED (user-space only)
# FIX: extract under ${STAGE_DIR} — the original streamed the archive into
# WORKDIR /root but cleaned up under ${STAGE_DIR}, leaking the extracted tree.
ENV MLNX_OFED_VERSION=5.8-4.1.5.0
RUN mkdir -p ${STAGE_DIR} && \
    cd ${STAGE_DIR} && \
    wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - && \
    cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \
    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*

# nv_peer_mem (GPUDirect RDMA kernel-module packaging)
# FIX: `rm -rf ${STAGE_DIR}` guarded and scoped to its contents — an unset
# STAGE_DIR (or /tmp itself being removed) would be catastrophic.
ENV NV_PEER_MEM_VERSION=1.2
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
RUN rm -rf "${STAGE_DIR:?}"/* && \
    mkdir -p ${STAGE_DIR} && \
    git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
    cd ${STAGE_DIR}/nv_peer_memory && \
    ./build_module.sh && \
    cd ${STAGE_DIR} && \
    tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
    cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
    apt-get update && \
    apt-get install --fix-broken -y && \
    apt-get install -y dkms && \
    dpkg-buildpackage -us -uc && \
    dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb

# install mpi (plus a wrapper so mpirun may run as root by default)
ENV PATH=/usr/local/mpi/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
RUN rm -rf "${STAGE_DIR:?}"/* && \
    mkdir -p ${STAGE_DIR} && \
    cd ${STAGE_DIR} && \
    wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
    cd openmpi-${OPENMPI_VERSION} && \
    ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
    make -j"$(nproc)" install && \
    ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
    # Sanity check:
    test -f /usr/local/mpi/bin/mpic++ && \
    cd ${STAGE_DIR} && \
    rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} && \
    mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
    echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
    echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
    chmod a+x /usr/local/mpi/bin/mpirun

# SSH daemon port inside container cannot conflict with host OS port
# ENV SSH_PORT=2222
# RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
#     sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config

# Non-root user with passwordless sudo.
RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed && \
    usermod -aG sudo deepspeed && \
    echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

# install cutlass https://github.com/NVIDIA/cutlass
# H100: architecture is Hopper (cutlass needs: cmake .. -DCUTLASS_NVCC_ARCHS="90a")
# A100: architecture is Ampere; V100: Volta; T4: Turing
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
# 70: NVIDIA Volta (e.g. Tesla V100)
# 75: NVIDIA Turing (e.g. Tesla T4)
# 80: NVIDIA Ampere (e.g. A100)
# 90a: NVIDIA Hopper (e.g. H100)
# 89: GeForce RTX 4090
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
# FIX: CUTLASS's default branch is `main` (as Dockerfile.ldh already uses),
# not `master`; the no-op `git checkout .` on a fresh clone was dropped.
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/NVIDIA/cutlass /opt/cutlass && \
    cd /opt/cutlass && \
    git checkout main && \
    mkdir build && \
    cd build && \
    cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON && \
    make -j"$(nproc)" install

# Some Packages from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
# RUN apt-get update && \
#     apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev && \
#     python -m pip install pipdeptree psutil yappi cffi ipdb pandas matplotlib py3nvml pyarrow graphviz astor boto3 tqdm sentencepiece msgpack requests pandas sphinx sphinx_rtd_theme scipy numpy scikit-learn nvidia-ml-py3 mpi4py

# install deepspeed step 1: oneCCL
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    /opt/conda/envs/${CONDA_ENV_NAME}/bin/python -m pip install setuptools==${SETUPTOOLS_VERSION} && \
    git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL && \
    cd ${STAGE_DIR}/oneCCL && \
    git checkout master && \
    mkdir build && \
    cd build && \
    cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local && \
    make -j"$(nproc)" install

# install deepspeed step 2: pre-built inference kernels
ARG CUDA_ARCH_LIST="80;86;89;90"
ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels && \
    cd ${STAGE_DIR}/DeepSpeed-Kernels && \
    CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python -m pip install -v .

ARG DEEPSPEED_VERSION="v0.14.3"
ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
ARG DS_BUILD_SPARSE_ATTN=0
ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
ARG DS_BUILD_FUSED_ADAM=1
ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
ARG DS_BUILD_CPU_ADAM=1
ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
ARG DS_BUILD_OPS=1
ENV DS_BUILD_OPS=${DS_BUILD_OPS}
ARG HOSTFILE_CONTENT=""
ENV HOSTFILE_CONTENT=${HOSTFILE_CONTENT}
ENV CUTLASS_PATH='/opt/cutlass'
ENV CUDA_HOME='/usr/local/cuda'
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${CUDA_HOME}/bin:${PATH}

# Helper that pre-compiles the DeepSpeed JIT ops so first use is fast.
# FIX: originally embedded via a broken `cat < file` pseudo-heredoc inside the
# RUN chain below; emitted as a proper BuildKit COPY here-document instead.
COPY <<'EOF' /root/compile_deepspeed_ops.py
import deepspeed

def compile_ops():
    builders = [
        deepspeed.ops.op_builder.AsyncIOBuilder,
        deepspeed.ops.op_builder.FusedAdamBuilder,
        deepspeed.ops.op_builder.CPUAdamBuilder,
        deepspeed.ops.op_builder.CPUAdagradBuilder,
        deepspeed.ops.op_builder.CPULionBuilder,
        deepspeed.ops.op_builder.EvoformerAttnBuilder,
        deepspeed.ops.op_builder.FPQuantizerBuilder,
        deepspeed.ops.op_builder.FusedLambBuilder,
        deepspeed.ops.op_builder.FusedLionBuilder,
        deepspeed.ops.op_builder.QuantizerBuilder,
        deepspeed.ops.op_builder.RaggedOpsBuilder,
        deepspeed.ops.op_builder.RandomLTDBuilder,
        deepspeed.ops.op_builder.SparseAttnBuilder,
        deepspeed.ops.op_builder.SpatialInferenceBuilder,
        deepspeed.ops.op_builder.TransformerBuilder,
        deepspeed.ops.op_builder.StochasticTransformerBuilder,
    ]

    for builder in builders:
        print(f"Compiling {builder.__name__}")
        builder().load()

if __name__ == "__main__":
    compile_ops()
EOF

# install deepspeed step 3: build from source via its install.sh.
# FIX: dropped the fragile `eval $INSTALL_CMD` indirection — the install
# script is invoked directly in each branch.
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed && \
    cd ${STAGE_DIR}/DeepSpeed && \
    git checkout ${DEEPSPEED_VERSION} && \
    sed 's/pip install/python -m pip install/' install.sh > install_modified.sh && \
    chmod +x ./install_modified.sh && \
    if [ -n "${HOSTFILE_CONTENT}" ]; then \
        echo "${HOSTFILE_CONTENT}" > /tmp/hostfile && \
        ./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile; \
    else \
        ./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}; \
    fi && \
    python /root/compile_deepspeed_ops.py && \
    ds_report

# install transformers (editable) and flash-attn
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers && \
    cd ${STAGE_DIR}/transformers && \
    python3 ./setup.py develop && \
    python3 -m pip install -U --no-cache-dir "pydantic<2" && \
    pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org

# other packages
ENV TORCH_CUDA_ARCH_LIST="80;86;89;90"
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    pip3 install optimum && \
    pip3 install peft tiktoken tqdm matplotlib seaborn numpy pandas scikit-learn diffusers huggingface_hub spacy blobfile pycocotools open_clip_torch zstandard -i https://pypi.org/simple/ --trusted-host pypi.org

# Data-path defaults.
# FIX: the originals assigned the literal strings "DEEPSPEED_TRAIN" /
# "DEEPSPEED_VALIDATION" instead of expanding the build args.
ARG DEEPSPEED_TRAIN='/data/train_data'
ENV DEEPSPEED_TRAIN=${DEEPSPEED_TRAIN}
ARG DEEPSPEED_VALIDATION='/data/validation_data'
ENV DEEPSPEED_VALIDATION=${DEEPSPEED_VALIDATION}
# NOTE(review): declared but never exported to ENV — confirm whether runtime
# NCCL_SOCKET_IFNAME is expected to come from docker-compose instead.
ARG NCCL_SOCKET_IFNAME='eth0'

CMD ["/usr/sbin/sshd", "-D"]
# CMD ["/bin/bash", "-c", "/usr/sbin/sshd -D & while true; do sleep 1000; done"]