# syntax=docker/dockerfile:1
#
# Dockerfile.ldh -- state after commit ddd89a9 ("ldh", lingyuzeng, 2024-07-13),
# which replaced the earlier micromamba-based image built on
# nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04.
#
# Builds, on top of the CUDA 12.1 / cuDNN 8 devel image for Ubuntu 22.04:
#   * OS build tools, RDMA/InfiniBand userspace packages and an up-to-date CMake
#   * Miniconda in /opt/conda with a Python 3.10 env (name: CONDA_ENV_NAME)
#   * CUTLASS, Mellanox OFED (user space), nv_peer_memory and OpenMPI from source
#   * PyTorch + common training libraries, flash-attn and NVIDIA apex
#   * an sshd configuration listening on port 22242
#
# Build arguments: CONDA_ENV_NAME, MINICONDA_INSTALLER, CUTLASS_REF,
# DCUTLASS_NVCC_ARCHS, MLNX_OFED_VERSION.
#
# NOTE(review): no CMD/ENTRYPOINT is declared in this file, so the base image's
# defaults apply; sshd is configured below but is not started by this file.

# FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04

# Build-time only: keeps apt/dpkg from prompting without leaking the setting
# into the runtime environment of containers.
ARG DEBIAN_FRONTEND=noninteractive

# Scratch directory for downloaded/compiled sources.
ENV STAGE_DIR="/tmp"
RUN mkdir -p ${STAGE_DIR}

ENV CUTLASS_PATH="/opt/cutlass"
ARG CONDA_ENV_NAME="deepspeed"

ENV CUDA_HOME="/usr/local/cuda"
ENV PATH=${CUDA_HOME}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}

# bash is required for `source`; pipefail makes `wget ... | tar ...` style
# pipelines fail the build when the download fails instead of being masked by
# the exit status of the last command in the pipe.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

WORKDIR /root

# Base tooling. The --no-install-recommends flag is applied to exactly the
# same groups as before so the resulting package set does not change.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        software-properties-common build-essential autotools-dev \
        nfs-common pdsh \
        cmake g++ gcc \
        curl wget vim tmux emacs less unzip \
        htop iftop iotop ca-certificates openssh-client openssh-server \
        rsync iputils-ping net-tools sudo \
        llvm-dev && \
    apt-get install -y git python3 python3-pip && \
    apt-get install -y --no-install-recommends \
        libsndfile-dev \
        libcupti-dev \
        libjpeg-dev \
        libpng-dev \
        screen \
        libaio-dev && \
    rm -rf /var/lib/apt/lists/*

# Packaging/build helpers and RDMA / InfiniBand userspace packages.
RUN apt-get update && \
    apt-get install -y lsof swig libmnl0 libltdl-dev libfuse2 udev tcl libgfortran5 \
        graphviz ethtool bison libpci3 kmod pciutils dpatch libnl-route-3-200 libusb-1.0-0 \
        tk m4 autoconf debhelper flex gfortran libnl-route-3-dev automake libnl-3-dev chrpath && \
    apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags && \
    apt-get install -y quilt python3-distutils && \
    rm -rf /var/lib/apt/lists/*

# Install the latest CMake from the Kitware APT repository.
# The key is stored in a dedicated keyring referenced via `signed-by` instead of
# the deprecated `apt-key add`; `sudo` is not needed because the build runs as root.
RUN apt-get update && \
    apt-get install -y --no-install-recommends gnupg && \
    wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \
        | gpg --dearmor -o /usr/share/keyrings/kitware-archive-keyring.gpg && \
    echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $(. /etc/os-release && echo "${VERSION_CODENAME}") main" \
        > /etc/apt/sources.list.d/kitware.list && \
    apt-get update && \
    apt-get install -y cmake && \
    rm -rf /var/lib/apt/lists/*

# Install Miniconda into /opt/conda.
# MINICONDA_INSTALLER defaults to the moving "latest" installer (previous
# behaviour); pass a versioned installer file name for reproducible builds.
ARG MINICONDA_INSTALLER="Miniconda3-latest-Linux-x86_64.sh"
RUN wget --quiet "https://repo.anaconda.com/miniconda/${MINICONDA_INSTALLER}" -O ~/miniconda.sh && \
    /bin/bash ~/miniconda.sh -b -p /opt/conda -u && \
    rm ~/miniconda.sh

# Add conda to PATH and initialize conda for bash.
ENV PATH=/opt/conda/bin:${PATH}
RUN /opt/conda/bin/conda init bash

# Create the conda environment and activate it in login shells via ~/.bashrc.
# Activating inside a RUN does not persist to later layers, so later build
# steps activate explicitly and the env's bin directory is put on PATH.
RUN /opt/conda/bin/conda create -n ${CONDA_ENV_NAME} python=3.10 -y && \
    echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc && \
    /opt/conda/bin/conda clean -afy
ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:${PATH}

# Install CUTLASS (https://github.com/NVIDIA/cutlass) into CUTLASS_PATH.
# CUTLASS_REF is the branch or tag to build (default: main, as before).
# The arch list is quoted because it contains ';' separators.
ARG CUTLASS_REF="main"
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    git clone --depth 1 --branch "${CUTLASS_REF}" https://github.com/NVIDIA/cutlass "${CUTLASS_PATH}" && \
    mkdir "${CUTLASS_PATH}/build" && \
    cd "${CUTLASS_PATH}/build" && \
    cmake .. "-DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}" -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON && \
    make -j"$(nproc)" install

# Mellanox OFED (user-space components only, no firmware update).
# The archive is streamed straight into tar, so no .tgz is ever written to
# disk; cleanup therefore removes the *extracted installer directory*.
# ENV MLNX_OFED_VERSION=4.9-7.1.0.0
ARG MLNX_OFED_VERSION="5.8-4.1.5.0"
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    apt-get update && \
    apt-get install -y libnuma-dev libnvidia-compute-515 && \
    cd ${STAGE_DIR} && \
    wget -q -O - "https://content.mellanox.com/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz" | tar xzf - && \
    cd "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64" && \
    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
    cd ${STAGE_DIR} && \
    rm -rf "${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64" && \
    rm -rf /var/lib/apt/lists/*

# nv_peer_mem: build the Debian packages from the tagged source and install
# the nvidia-peer-memory package.
# NOTE(review): the clone and the remaining build outputs are left in
# STAGE_DIR, as before -- confirm whether anything still needs them before
# adding cleanup here.
ENV NV_PEER_MEM_VERSION=1.2
# ENV NV_PEER_MEM_VERSION=1.3
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    mkdir -p ${STAGE_DIR} && \
    git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
    cd ${STAGE_DIR}/nv_peer_memory && \
    ./build_module.sh && \
    cd ${STAGE_DIR} && \
    tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
    cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
    apt-get update && \
    apt-get install -y dkms && \
    dpkg-buildpackage -us -uc && \
    dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb && \
    rm -rf /var/lib/apt/lists/*

# OpenMPI, built from source into /usr/local/openmpi-<version> and exposed
# through the stable /usr/local/mpi symlink. `test -f .../mpic++` is a sanity
# check that the install produced the compiler wrapper.
# ENV OPENMPI_BASEVERSION=4.1
# ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
ENV OPENMPI_BASEVERSION=5.0
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.3
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    cd ${STAGE_DIR} && \
    wget -q -O - "https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz" | tar xzf - && \
    cd openmpi-${OPENMPI_VERSION} && \
    ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
    make -j"$(nproc)" install && \
    ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
    test -f /usr/local/mpi/bin/mpic++ && \
    cd ${STAGE_DIR} && \
    rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
ENV PATH=/usr/local/mpi/bin:${PATH} \
    LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}

# Create a wrapper for OpenMPI to allow running as root by default.
# `exec` replaces the wrapper shell with the real mpirun so signals and the
# exit status go directly to/from it.
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
    echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
    echo 'exec mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
    chmod a+x /usr/local/mpi/bin/mpirun

ENV PYTORCH_VERSION=2.3.0
ENV TORCHVISION_VERSION=0.18.0
ENV TORCHAUDIO_VERSION=2.3.0
ENV PYTORCH_CUDA_VERSION='cu121'

# Python stack. --no-cache-dir keeps pip's download cache out of the layer.
# flash-attn is installed with --no-build-isolation so that its build can see
# the torch and packaging installs made just above it.
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    pip install --no-cache-dir \
        torch==${PYTORCH_VERSION}+${PYTORCH_CUDA_VERSION} \
        torchvision==${TORCHVISION_VERSION}+${PYTORCH_CUDA_VERSION} \
        torchaudio==${TORCHAUDIO_VERSION}+${PYTORCH_CUDA_VERSION} \
        xformers \
        --extra-index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} && \
    pip install --no-cache-dir packaging && \
    pip install --no-cache-dir --no-build-isolation flash-attn && \
    pip install --no-cache-dir deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub optimum-benchmark && \
    pip install --no-cache-dir tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy Pillow blobfile requests scipy pycocotools protobuf pyyaml ipython psutil pydantic

# Install apex with CUDA and C++ extensions, then verify that apex.amp imports.
# Alternative invocation kept for reference (pip without --config-settings support):
#   pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
#       --global-option="--cpp_ext" --global-option="--cuda_ext" ./
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    git clone https://github.com/NVIDIA/apex /tmp/apex && \
    cd /tmp/apex && \
    pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \
    python -c "import apex.amp; print('Apex is installed and the amp module is available.')" && \
    cd / && \
    rm -rf /tmp/apex

# SSH daemon configuration: port 22242, root login and password auth enabled,
# PAM disabled, keep-alive every 30 s. The pristine config is copied to
# /tmp/sshd_config before editing.
# NOTE(security): the root password is hard-coded to "root" while password
# root login is enabled. Anyone who can reach port 22242 can log in as root.
# Left unchanged because existing deployments may rely on it; prefer key-based
# auth or override the password at deploy time.
RUN echo 'root:root' | chpasswd && \
    cp /etc/ssh/sshd_config /tmp/sshd_config && \
    echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config && \
    sed -i "s/#Port 22/Port 22242/" /etc/ssh/sshd_config && \
    sed -i "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
    sed -i "s/#PasswordAuthentication yes/PasswordAuthentication yes/" /etc/ssh/sshd_config && \
    sed -i "s/#PubkeyAuthentication yes/PubkeyAuthentication yes/" /etc/ssh/sshd_config && \
    sed -i "s/UsePAM yes/UsePAM no/" /etc/ssh/sshd_config && \
    chown root:root /etc/ssh/sshd_config && \
    mkdir -p /run/sshd && chmod 0755 /run/sshd

# Documents the sshd port configured above (does not publish it).
EXPOSE 22242

# Prepend the build-time CUTLASS/CUDA/PATH/LD_LIBRARY_PATH values to ~/.bashrc.
# They are prepended rather than appended -- presumably so they take effect
# before any early-return guard in the stock ~/.bashrc; confirm.
RUN printf 'export CUTLASS_PATH=%s\nexport CUDA_HOME=%s\nexport PATH=%s\nexport LD_LIBRARY_PATH=%s\n\n' \
        "${CUTLASS_PATH}" "${CUDA_HOME}" "${PATH}" "${LD_LIBRARY_PATH}" \
        | cat - ~/.bashrc > ~/.bashrc.new && \
    mv ~/.bashrc.new ~/.bashrc