This commit is contained in:
2024-07-15 12:38:14 +08:00
parent 47f460bbc0
commit ae03ae6b7b

View File

@@ -7,7 +7,6 @@ ENV STAGE_DIR="/tmp"
# Make sure the staging directory named by STAGE_DIR exists
# (the preceding hunk header shows STAGE_DIR="/tmp").
RUN mkdir -p ${STAGE_DIR}
# Path where CUTLASS is expected to live.
ENV CUTLASS_PATH="/opt/cutlass"
# Build-time default for the conda environment name.
# NOTE(review): the hunk header reports one line fewer after the change and the
# same ARG is declared again further down; this line is probably the removed
# side of the diff. Confirm.
ARG CONDA_ENV_NAME="deepspeed"
# CUDA toolkit root; its bin/ directory is prepended to PATH.
ENV CUDA_HOME="/usr/local/cuda"
ENV PATH=${CUDA_HOME}/bin:${PATH}
@@ -43,14 +42,6 @@ RUN \
apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags && \
apt-get install -y quilt python3-distutils
# Install the latest CMake from Kitware's APT repository:
#   1. fetch Kitware's archive signing key and register it with apt,
#   2. add the repository for the running Ubuntu codename,
#   3. refresh the package index and install cmake.
# sudo is intentionally not used: the apt-get calls in this same chain already
# run unprefixed, so the build user is root and sudo is only an extra dependency.
# NOTE(review): apt-key is deprecated on recent Ubuntu releases; consider a
# signed-by keyring once the base image is confirmed.
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc | apt-key add - && \
    apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
    apt-get update && \
    apt-get install -y cmake
# Install Miniconda
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
/bin/bash ~/miniconda.sh -b -p /opt/conda -u && \
@@ -60,17 +51,22 @@ ENV PATH=/opt/conda/bin:${PATH}
# Run `conda init` for bash using the conda binary installed under /opt/conda.
RUN /opt/conda/bin/conda init bash
# Conda environment name: overridable at build time (ARG) and re-exported as
# ENV so later instructions and the running container see it.
ARG CONDA_ENV_NAME="deepspeed"
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
# Python version requested for the environment, exposed the same way.
ARG PYTHON_VERSION="3.10"
ENV PYTHON_VERSION=${PYTHON_VERSION}
# Put the environment's bin/ directory ahead of everything else on PATH.
ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:${PATH}
# Create and activate a conda environment
# The chain creates the environment, appends the conda.sh source line and a
# `conda activate` line to ~/.bashrc, and finally activates the environment
# inside a one-off `bash -c` subshell.
# NOTE(review): two consecutive lines below start with RUN and the first ends in
# a line continuation; they look like the before/after versions of a single
# instruction from a diff (hard-coded python=3.10 vs ${PYTHON_VERSION} plus
# cmake and ninja). The PATH line repeated after the chain looks like the same
# artifact. Confirm which side is intended before building.
RUN /opt/conda/bin/conda create -n ${CONDA_ENV_NAME} python=3.10 -y && \
RUN /opt/conda/bin/conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} cmake ninja -y && \
echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc && \
/bin/bash -c "source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME}"
ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:${PATH}
# install cutlass https://github.com/NVIDIA/cutlass
# Semicolon-separated architecture list, overridable at build time and
# re-exported as ENV; presumably consumed by the CUTLASS build step that
# follows (its configure command is not visible here).
# NOTE(review): the ARG is declared twice with different defaults (the second
# adds "90"); these look like the before/after sides of a diff. Confirm which
# default is intended.
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
ARG DCUTLASS_NVCC_ARCHS="80;89;90;90a"
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
RUN \
source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
git clone https://github.com/NVIDIA/cutlass /opt/cutlass && \
cd /opt/cutlass && \
git checkout . && \
@@ -83,7 +79,8 @@ RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME}
# Mellanox OFED
# ENV MLNX_OFED_VERSION=4.9-7.1.0.0
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
RUN \
source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
apt-get install -y libnuma-dev libnvidia-compute-515 && \
cd ${STAGE_DIR} && \
wget -q -O - https://content.mellanox.com/ofed/MLNX_OFED-5.8-4.1.5.0/MLNX_OFED_LINUX-5.8-4.1.5.0-ubuntu22.04-x86_64.tgz | tar xzf - && \
@@ -97,7 +94,8 @@ RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME}
# nv_peer_memory release to build; the tag is the version with a "-0" suffix.
# The two variables stay in separate ENV instructions so the tag can expand the
# version set just above it.
ENV NV_PEER_MEM_VERSION="1.2"
# ENV NV_PEER_MEM_VERSION=1.3
ENV NV_PEER_MEM_TAG="${NV_PEER_MEM_VERSION}-0"
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
RUN \
source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
mkdir -p ${STAGE_DIR} && \
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
cd ${STAGE_DIR}/nv_peer_memory && \
@@ -116,7 +114,8 @@ RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME}
# ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
# OpenMPI release selection: the base version names the release series and the
# full version appends the patch number to it.
ENV OPENMPI_BASEVERSION="5.0"
ENV OPENMPI_VERSION="${OPENMPI_BASEVERSION}.3"
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
RUN \
source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
cd ${STAGE_DIR} && \
wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
cd openmpi-${OPENMPI_VERSION} && \
@@ -130,7 +129,8 @@ RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME}
# Expose the MPI install under /usr/local/mpi: its bin/ goes first on PATH, and
# /usr/local/lib plus the MPI lib/ and lib64/ directories are prepended to the
# shared-library search path.
ENV PATH=/usr/local/mpi/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
# Create a wrapper for OpenMPI to allow running as root by default
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
RUN \
source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
@@ -141,18 +141,20 @@ ENV TORCHVISION_VERSION=0.18.0
# torchaudio release and the CUDA build tag used to select matching PyTorch wheels.
ENV TORCHAUDIO_VERSION=2.3.0
ENV PYTORCH_CUDA_VERSION='cu121'
# Inside the conda environment, install torch / torchvision / torchaudio pinned
# through the *_VERSION variables with the +${PYTORCH_CUDA_VERSION} local tag,
# resolving them via the PyTorch wheel index given as --extra-index-url. Then
# install xformers, packaging, flash-attn and the remaining unpinned
# training and utility packages.
# NOTE(review): PYTORCH_VERSION is defined outside this view.
# NOTE(review): the chain below opens with two RUN lines, and two near-identical
# `pip install deepspeed ...` lines follow (the second adds optimum-benchmark);
# these look like the before/after sides of a diff. Confirm which is intended.
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
RUN \
source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
pip install torch==${PYTORCH_VERSION}+${PYTORCH_CUDA_VERSION} torchvision==${TORCHVISION_VERSION}+${PYTORCH_CUDA_VERSION} torchaudio==${TORCHAUDIO_VERSION}+${PYTORCH_CUDA_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} && \
pip install packaging && \
pip install flash-attn && \
pip install deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub && \
pip install deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub optimum-benchmark && \
pip install tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy Pillow blobfile requests scipy pycocotools protobuf pyyaml ipython psutil pydantic
# Install apex with CUDA and C++ extensions
# pip --version | grep -q "pip 23.1" && \
# (pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./) || \
# (pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./) && \
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
RUN \
source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
git clone https://github.com/NVIDIA/apex /tmp/apex && \
cd /tmp/apex && \
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \