Files
cdc_dockerfile/finetune/Dockerfile.ldh
2024-07-18 16:45:59 +08:00

198 lines
9.4 KiB
Docker

# Base image: CUDA 12.1 + cuDNN 8 development image on Ubuntu 22.04.
# Commented-out alternative base images are kept for reference.
# FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
# FROM nvcr.io/nvidia/pytorch:24.02-py3

# Build/runtime environment shared by every later step.
ENV DEBIAN_FRONTEND="noninteractive"
# Scratch directory that later steps download and build sources in.
ENV STAGE_DIR="/tmp"
RUN mkdir -p ${STAGE_DIR}
ENV CUTLASS_PATH="/opt/cutlass"
ENV CUDA_HOME="/usr/local/cuda"
ENV PATH=${CUDA_HOME}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0+PTX"

# Run every following RUN under bash with pipefail, so that a failing
# download in a `wget ... | tar ...` pipeline fails the build step instead
# of being masked by the exit status of the last command in the pipe.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
WORKDIR /root
# Base OS packages, installed by three apt-get invocations in one layer.
# The first and third pass --no-install-recommends; the second (git,
# python3, python3-pip) does not.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        autotools-dev \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        emacs \
        g++ \
        gcc \
        htop \
        iftop \
        iotop \
        iputils-ping \
        less \
        llvm-dev \
        net-tools \
        nfs-common \
        openssh-client \
        openssh-server \
        pdsh \
        rsync \
        software-properties-common \
        sudo \
        tmux \
        unzip \
        vim \
        wget \
 && apt-get install -y \
        git \
        python3 \
        python3-pip \
 && apt-get install -y --no-install-recommends \
        libaio-dev \
        libcupti-dev \
        libjpeg-dev \
        libpng-dev \
        libsndfile-dev \
        screen
# Second apt layer: build/packaging helpers, then the RDMA / InfiniBand
# user-space packages, then quilt and python3-distutils.
RUN apt-get update \
 && apt-get install -y \
        autoconf \
        automake \
        bison \
        chrpath \
        debhelper \
        dpatch \
        ethtool \
        flex \
        gfortran \
        graphviz \
        kmod \
        libfuse2 \
        libgfortran5 \
        libltdl-dev \
        libmnl0 \
        libnl-3-dev \
        libnl-route-3-200 \
        libnl-route-3-dev \
        libpci3 \
        libusb-1.0-0 \
        lsof \
        m4 \
        pciutils \
        swig \
        tcl \
        tk \
        udev \
 && apt-get install -y \
        ibverbs-utils \
        infiniband-diags \
        libibverbs-dev \
        perftest \
        rdma-core \
 && apt-get install -y \
        python3-distutils \
        quilt
# Install Miniconda into /opt/conda.
# The installer release is pinned so that rebuilding the image yields the
# same conda; pass `--build-arg MINICONDA_VERSION=latest` to get the newest
# installer instead.
ARG MINICONDA_VERSION="py310_24.5.0-0"
RUN wget --quiet "https://repo.anaconda.com/miniconda/Miniconda3-${MINICONDA_VERSION}-Linux-x86_64.sh" -O ~/miniconda.sh && \
    /bin/bash ~/miniconda.sh -b -p /opt/conda -u && \
    rm ~/miniconda.sh && \
    # Drop package tarballs and caches in the same layer that created them.
    /opt/conda/bin/conda clean -afy
# Add conda to PATH and initialize conda for bash (edits ~/.bashrc)
ENV PATH=/opt/conda/bin:${PATH}
RUN /opt/conda/bin/conda init bash
ENV CONDA_ENV_NAME="deepspeed"
ENV PYTHON_VERSION="3.10"
# Put the environment's bin directory first on PATH so its python/pip are
# the defaults in every later build step and in the running container.
ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:${PATH}
# Create the conda environment (python, cmake, ninja) and make interactive
# bash shells activate it via ~/.bashrc.
# Activating the environment inside a throw-away `bash -c` has no lasting
# effect on later layers, so the step ends with a real check that the
# environment's interpreter runs, followed by conda cache cleanup.
RUN /opt/conda/bin/conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} cmake ninja -y && \
    echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc && \
    /opt/conda/envs/${CONDA_ENV_NAME}/bin/python --version && \
    /opt/conda/bin/conda clean -afy
# Install CUTLASS (https://github.com/NVIDIA/cutlass) into /opt/cutlass and
# build/install it for the GPU architectures listed in DCUTLASS_NVCC_ARCHS.
ENV DCUTLASS_NVCC_ARCHS="80;89;90;90a"
# Git ref to build. Pinned to a release tag so rebuilds are reproducible;
# pass `--build-arg CUTLASS_REF=main` to track the development branch.
ARG CUTLASS_REF="v3.5.0"
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    # Shallow clone of exactly the requested ref: no history is needed to build.
    git clone --depth 1 --branch "${CUTLASS_REF}" https://github.com/NVIDIA/cutlass /opt/cutlass && \
    mkdir /opt/cutlass/build && \
    cd /opt/cutlass/build && \
    cmake .. "-DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}" -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON && \
    make -j"$(nproc)" install
# Mellanox OFED (user-space components only, no firmware update)
# ENV MLNX_OFED_VERSION=5.8-5.1.1.2
ENV MLNX_OFED_VERSION=23.10-3.2.2.0
# Alternative driver library package that was left commented out:
#   apt-get install -y libnuma-dev libnvidia-compute-535
RUN set -o pipefail && \
    source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    # Refresh the package index in this layer; relying on lists left behind
    # by an earlier (possibly cached) layer can point at packages that no
    # longer exist on the mirror.
    apt-get update && \
    apt-get install -y libnuma-dev libnvidia-compute-515 && \
    cd ${STAGE_DIR} && \
    # Fetch over HTTPS: the archive contains an installer that runs as root.
    # pipefail (set above) makes a failed download fail the step.
    wget -q -O - https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - && \
    cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \
    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
# nv_peer_mem: build the nvidia-peer-memory Debian package from the tagged
# upstream source and install it.
ENV NV_PEER_MEM_VERSION=1.2
# ENV NV_PEER_MEM_VERSION=1.3
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    mkdir -p ${STAGE_DIR} && \
    git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
    cd ${STAGE_DIR}/nv_peer_memory && \
    ./build_module.sh && \
    cd ${STAGE_DIR} && \
    # The tarball unpacked here is expected in STAGE_DIR after build_module.sh.
    tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
    cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
    apt-get update && \
    apt-get install -y dkms && \
    dpkg-buildpackage -us -uc && \
    dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb && \
    # Remove the checkout and all packaging artefacts in the same layer so
    # they do not stay in the image (matches the OFED and OpenMPI steps).
    cd ${STAGE_DIR} && \
    rm -rf ${STAGE_DIR}/nv_peer_memory ${STAGE_DIR}/nvidia-peer-memory*
# OpenMPI: build from the release tarball into a versioned prefix and expose
# it under the stable path /usr/local/mpi.
# ENV OPENMPI_BASEVERSION=4.1
# ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
ENV OPENMPI_BASEVERSION=5.0
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.3
RUN set -o pipefail && \
    source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    cd ${STAGE_DIR} && \
    # pipefail (set above) makes a failed download fail the step instead of
    # leaving the outcome to tar's handling of a truncated stream.
    wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
    cd openmpi-${OPENMPI_VERSION} && \
    ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
    make -j"$(nproc)" install && \
    ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
    # Sanity check: the C++ compiler wrapper must exist behind the symlink.
    test -f /usr/local/mpi/bin/mpic++ && \
    cd ${STAGE_DIR} && \
    rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
ENV PATH=/usr/local/mpi/bin:${PATH} \
    LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
# Create a wrapper for OpenMPI to allow running as root by default.
# The real binary is moved aside to mpirun.real and replaced by a small
# script that adds --allow-run-as-root and --prefix.
# The wrapper calls mpirun.real by absolute path, so it also works when it
# is started from a session whose PATH lacks /usr/local/mpi/bin, and it
# uses `exec` so no extra bash process sits between the caller and mpirun
# (signals and the exit status reach the caller directly).
RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
    printf '%s\n' \
        '#!/bin/bash' \
        'exec /usr/local/mpi/bin/mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' \
        > /usr/local/mpi/bin/mpirun && \
    chmod a+x /usr/local/mpi/bin/mpirun
# PyTorch stack, pinned to matching torch / torchvision / torchaudio builds
# for the CUDA flavour named in PYTORCH_CUDA_VERSION.
ENV PYTORCH_VERSION=2.3.0
ENV TORCHVISION_VERSION=0.18.0
ENV TORCHAUDIO_VERSION=2.3.0
ENV PYTORCH_CUDA_VERSION='cu121'
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    # --no-cache-dir keeps the downloaded wheels out of the image layer.
    # xformers is resolved in the same command as the pinned torch so pip
    # has to pick a release compatible with it.
    pip install --no-cache-dir \
        torch==${PYTORCH_VERSION}+${PYTORCH_CUDA_VERSION} \
        torchvision==${TORCHVISION_VERSION}+${PYTORCH_CUDA_VERSION} \
        torchaudio==${TORCHAUDIO_VERSION}+${PYTORCH_CUDA_VERSION} \
        xformers \
        --extra-index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} && \
    pip install --no-cache-dir packaging && \
    # flash-attn's build imports torch, so it must build against the
    # environment above rather than an isolated build environment
    # (upstream installation instructions: --no-build-isolation).
    pip install --no-cache-dir --no-build-isolation flash-attn
# Install apex with CUDA and C++ extensions.
# Earlier variant, kept for reference, that chose between --config-settings
# and --global-option depending on the pip version:
# pip --version | grep -q "pip 23.1" && \
# (pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./) || \
# (pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./) && \
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    # Shallow clone: only the current tree is needed for the build.
    git clone --depth 1 https://github.com/NVIDIA/apex /tmp/apex && \
    cd /tmp/apex && \
    pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \
    # Leave the checkout before the import check: run from /tmp/apex, python
    # would import the ./apex source directory instead of the installed
    # package, so the check would not prove the installation works.
    cd / && \
    python -c "import apex.amp; print('Apex is installed and the amp module is available.')" && \
    rm -rf /tmp/apex
# RUN \
# source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
# git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed && \
# cd ${STAGE_DIR}/DeepSpeed && \
# git checkout ${DEEPSPEED_VERSION} && \
# sed 's/pip install/python -m pip install/' install.sh > install_modified.sh && \
# chmod +x ./install_modified.sh && \
# if [ -n "${HOSTFILE_CONTENT}" ]; then \
# echo "${HOSTFILE_CONTENT}" > /tmp/hostfile && \
# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"; \
# else \
# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"; \
# fi && \
# eval $INSTALL_CMD && \
# ds_report
# Python libraries: DeepSpeed and the Hugging Face stack first, then general
# data-science / utility packages. --no-cache-dir keeps pip's download cache
# out of the image layer.
RUN source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    pip install --no-cache-dir \
        deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub && \
    pip install --no-cache-dir \
        regex tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy \
        Pillow blobfile requests scipy pycocotools protobuf pyyaml ipython ipdb psutil pydantic
# SSH daemon configuration: set the root password, keep a copy of the stock
# sshd_config in /tmp, then switch the daemon to port 22242 with root login,
# password and public-key authentication enabled and PAM disabled, and
# create the /run/sshd runtime directory.
# NOTE(review): the root password is the literal string "root" and password
# login for root is enabled; confirm this image is only ever reachable from
# a trusted network.
RUN echo 'root:root' | chpasswd \
 && cp /etc/ssh/sshd_config /tmp/sshd_config \
 && echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config \
 && sed -i \
        -e "s/#Port 22/Port 22242/" \
        -e "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/" \
        -e "s/#PasswordAuthentication yes/PasswordAuthentication yes/" \
        -e "s/#PubkeyAuthentication yes/PubkeyAuthentication yes/" \
        -e "s/UsePAM yes/UsePAM no/" \
        /etc/ssh/sshd_config \
 && chown root:root /etc/ssh/sshd_config \
 && mkdir -p /run/sshd \
 && chmod 0755 /run/sshd
# RUN \
# bash -c 'echo -e "export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"\nexport CUTLASS_PATH=${CUTLASS_PATH}\nexport CUDA_HOME=${CUDA_HOME}\nexport PATH=${PATH}\nexport LD_LIBRARY_PATH=${LD_LIBRARY_PATH}\n" | cat - ~/.bashrc > temp && mv temp ~/.bashrc'