# Development image for DeepSpeed-style GPU training on CUDA 12.1 / Ubuntu 22.04.
#
# Build stages, in order:
#   1. OS packages (build tools, RDMA/InfiniBand userspace, sshd, ...)
#   2. CMake from the Kitware apt repository
#   3. Miniconda plus a Python 3.10 environment named by CONDA_ENV_NAME
#   4. CUTLASS, Mellanox OFED (user space), nv_peer_mem, OpenMPI
#   5. PyTorch and the Python ML stack, then NVIDIA Apex
#   6. sshd configuration and ~/.bashrc environment exports
#
# Build args:
#   CONDA_ENV_NAME       name of the conda environment (default "deepspeed")
#   DCUTLASS_NVCC_ARCHS  CUDA architectures passed to the CUTLASS build

# FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04

ENV DEBIAN_FRONTEND="noninteractive"

# Scratch directory used for downloading and building third-party sources.
ENV STAGE_DIR="/tmp"
RUN mkdir -p ${STAGE_DIR}

ENV CUTLASS_PATH="/opt/cutlass"
ARG CONDA_ENV_NAME="deepspeed"
ENV CUDA_HOME="/usr/local/cuda"
ENV PATH=${CUDA_HOME}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}

# bash is required for `source`; pipefail makes a failed download in a
# `wget ... | tar ...` pipeline fail the build instead of being masked by the
# exit status of the last command in the pipe.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

WORKDIR /root

# Base OS packages. The apt package lists are removed at the end of every apt
# layer so they do not persist in the image; each later apt layer therefore
# runs its own `apt-get update`.
RUN \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        software-properties-common build-essential autotools-dev \
        nfs-common pdsh \
        cmake g++ gcc \
        curl wget vim tmux emacs less unzip \
        htop iftop iotop ca-certificates openssh-client openssh-server \
        rsync iputils-ping net-tools sudo \
        llvm-dev && \
    apt-get install -y git python3 python3-pip && \
    apt-get install -y --no-install-recommends \
        libsndfile-dev \
        libcupti-dev \
        libjpeg-dev \
        libpng-dev \
        screen \
        libaio-dev && \
    rm -rf /var/lib/apt/lists/*

# Networking / RDMA userspace and packaging tools used by the builds below.
RUN \
    apt-get update && \
    apt-get install -y lsof swig libmnl0 libltdl-dev libfuse2 udev tcl libgfortran5 \
        graphviz ethtool bison libpci3 kmod pciutils dpatch libnl-route-3-200 libusb-1.0-0 \
        tk m4 autoconf debhelper flex gfortran libnl-route-3-dev automake libnl-3-dev chrpath && \
    apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags && \
    apt-get install -y quilt python3-distutils && \
    rm -rf /var/lib/apt/lists/*

# install latest cmake
# (the build already runs as root, so no sudo is needed here)
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc | apt-key add - && \
    apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
    apt-get update && \
    apt-get install -y cmake && \
    rm -rf /var/lib/apt/lists/*

# Install Miniconda
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
    /bin/bash ~/miniconda.sh -b -p /opt/conda -u && \
    rm ~/miniconda.sh

# Add conda to PATH and initialize conda
ENV PATH=/opt/conda/bin:${PATH}
RUN /opt/conda/bin/conda init bash

# Create and activate a conda environment.
# The trailing `bash -c` activation runs in a subshell, so it does not persist
# to later layers; the environment is made the default through ~/.bashrc (for
# interactive shells) and the ENV PATH line below (for everything else).
RUN /opt/conda/bin/conda create -n ${CONDA_ENV_NAME} python=3.10 -y && \
    echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc && \
    /bin/bash -c "source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME}"
ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:${PATH}

# install cutlass https://github.com/NVIDIA/cutlass
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/NVIDIA/cutlass /opt/cutlass && \
    cd /opt/cutlass && \
    git checkout . && \
    git checkout main && \
    mkdir build && \
    cd build && \
    cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON && \
    make -j"$(nproc)" install

# Mellanox OFED
# ENV MLNX_OFED_VERSION=4.9-7.1.0.0
# The tarball is streamed straight into tar, so no .tgz file is ever written to
# disk; the cleanup glob must therefore cover the extracted directory itself,
# otherwise the unpacked installer tree stays in this layer.
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    apt-get update && \
    apt-get install -y libnuma-dev libnvidia-compute-515 && \
    cd ${STAGE_DIR} && \
    wget -q -O - https://content.mellanox.com/ofed/MLNX_OFED-5.8-4.1.5.0/MLNX_OFED_LINUX-5.8-4.1.5.0-ubuntu22.04-x86_64.tgz | tar xzf - && \
    cd MLNX_OFED_LINUX-5.8-4.1.5.0-ubuntu22.04-x86_64 && \
    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-5.8-4.1.5.0-ubuntu22.04-x86_64* && \
    rm -rf /var/lib/apt/lists/*

# nv_peer_mem
ENV NV_PEER_MEM_VERSION=1.2
# ENV NV_PEER_MEM_VERSION=1.3
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    mkdir -p ${STAGE_DIR} && \
    git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
    cd ${STAGE_DIR}/nv_peer_memory && \
    ./build_module.sh && \
    cd ${STAGE_DIR} && \
    tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
    cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
    apt-get update && \
    apt-get install -y dkms && \
    dpkg-buildpackage -us -uc && \
    dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb && \
    rm -rf /var/lib/apt/lists/*

# OPENMPI
# ENV OPENMPI_BASEVERSION=4.1
# ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
ENV OPENMPI_BASEVERSION=5.0
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.3
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    cd ${STAGE_DIR} && \
    wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
    cd openmpi-${OPENMPI_VERSION} && \
    ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
    make -j"$(nproc)" install && \
    ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
    # Sanity check:
    test -f /usr/local/mpi/bin/mpic++ && \
    cd ${STAGE_DIR} && \
    rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
ENV PATH=/usr/local/mpi/bin:${PATH} \
    LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}

# Create a wrapper for OpenMPI to allow running as root by default
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
    echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
    echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
    chmod a+x /usr/local/mpi/bin/mpirun

# PyTorch and the Python ML stack.
# --no-cache-dir keeps pip's download cache out of the image layer.
ENV PYTORCH_VERSION=2.3.0
ENV TORCHVISION_VERSION=0.18.0
ENV TORCHAUDIO_VERSION=2.3.0
ENV PYTORCH_CUDA_VERSION='cu121'
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    pip install --no-cache-dir torch==${PYTORCH_VERSION}+${PYTORCH_CUDA_VERSION} torchvision==${TORCHVISION_VERSION}+${PYTORCH_CUDA_VERSION} torchaudio==${TORCHAUDIO_VERSION}+${PYTORCH_CUDA_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} && \
    pip install --no-cache-dir packaging && \
    pip install --no-cache-dir flash-attn && \
    pip install --no-cache-dir deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub optimum-benchmark && \
    pip install --no-cache-dir tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy Pillow blobfile requests scipy pycocotools protobuf pyyaml ipython psutil pydantic

# Install apex with CUDA and C++ extensions
# pip --version | grep -q "pip 23.1" && \
# (pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./) || \
# (pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./) && \
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/NVIDIA/apex /tmp/apex && \
    cd /tmp/apex && \
    pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \
    python -c "import apex.amp; print('Apex is installed and the amp module is available.')" && \
    cd / && \
    rm -rf /tmp/apex

# SSH server configuration: listens on port 22242 with root login enabled.
# NOTE(review): the root password is hard-coded to "root" and password login
# for root is allowed. Anyone who can reach port 22242 gets root in the
# container -- confirm this image is only used on trusted networks, or switch
# to key-only authentication.
# The unmodified sshd_config is kept at /tmp/sshd_config for reference.
RUN \
    echo 'root:root' | chpasswd && \
    cp /etc/ssh/sshd_config /tmp/sshd_config && \
    echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config && \
    sed -i "s/#Port 22/Port 22242/" /etc/ssh/sshd_config && \
    sed -i "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
    sed -i "s/#PasswordAuthentication yes/PasswordAuthentication yes/" /etc/ssh/sshd_config && \
    sed -i "s/#PubkeyAuthentication yes/PubkeyAuthentication yes/" /etc/ssh/sshd_config && \
    sed -i "s/UsePAM yes/UsePAM no/" /etc/ssh/sshd_config && \
    chown root:root /etc/ssh/sshd_config && \
    mkdir -p /run/sshd && chmod 0755 /run/sshd

# Prepend the build-time values of CUTLASS_PATH, CUDA_HOME, PATH and
# LD_LIBRARY_PATH to ~/.bashrc as export lines. The single quotes defer
# variable expansion to the inner bash, which reads them from the environment
# set by the ENV instructions above.
RUN \
    bash -c 'echo -e "export CUTLASS_PATH=${CUTLASS_PATH}\nexport CUDA_HOME=${CUDA_HOME}\nexport PATH=${PATH}\nexport LD_LIBRARY_PATH=${LD_LIBRARY_PATH}\n" | cat - ~/.bashrc > temp && mv temp ~/.bashrc'