update
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
# FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel
|
# FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel
|
||||||
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
|
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
|
||||||
|
# FROM nvcr.io/nvidia/pytorch:24.02-py3
|
||||||
|
|
||||||
ENV DEBIAN_FRONTEND="noninteractive"
|
ENV DEBIAN_FRONTEND="noninteractive"
|
||||||
|
|
||||||
@@ -7,10 +8,10 @@ ENV STAGE_DIR="/tmp"
|
|||||||
RUN mkdir -p ${STAGE_DIR}
|
RUN mkdir -p ${STAGE_DIR}
|
||||||
|
|
||||||
ENV CUTLASS_PATH="/opt/cutlass"
|
ENV CUTLASS_PATH="/opt/cutlass"
|
||||||
|
|
||||||
ENV CUDA_HOME="/usr/local/cuda"
|
ENV CUDA_HOME="/usr/local/cuda"
|
||||||
ENV PATH=${CUDA_HOME}/bin:${PATH}
|
ENV PATH=${CUDA_HOME}/bin:${PATH}
|
||||||
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
|
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
|
||||||
|
ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0+PTX"
|
||||||
|
|
||||||
SHELL ["/bin/bash", "-c"]
|
SHELL ["/bin/bash", "-c"]
|
||||||
|
|
||||||
@@ -28,12 +29,7 @@ RUN \
|
|||||||
llvm-dev && \
|
llvm-dev && \
|
||||||
apt-get install -y git python3 python3-pip && \
|
apt-get install -y git python3 python3-pip && \
|
||||||
apt-get install -y --no-install-recommends \
|
apt-get install -y --no-install-recommends \
|
||||||
libsndfile-dev \
|
libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev
|
||||||
libcupti-dev \
|
|
||||||
libjpeg-dev \
|
|
||||||
libpng-dev \
|
|
||||||
screen \
|
|
||||||
libaio-dev
|
|
||||||
RUN \
|
RUN \
|
||||||
apt-get update && \
|
apt-get update && \
|
||||||
apt-get install -y lsof swig libmnl0 libltdl-dev libfuse2 udev tcl libgfortran5 \
|
apt-get install -y lsof swig libmnl0 libltdl-dev libfuse2 udev tcl libgfortran5 \
|
||||||
@@ -42,6 +38,7 @@ RUN \
|
|||||||
apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags && \
|
apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags && \
|
||||||
apt-get install -y quilt python3-distutils
|
apt-get install -y quilt python3-distutils
|
||||||
|
|
||||||
|
|
||||||
# Install Miniconda
|
# Install Miniconda
|
||||||
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
|
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
|
||||||
/bin/bash ~/miniconda.sh -b -p /opt/conda -u && \
|
/bin/bash ~/miniconda.sh -b -p /opt/conda -u && \
|
||||||
@@ -51,10 +48,8 @@ ENV PATH=/opt/conda/bin:${PATH}
|
|||||||
RUN \
|
RUN \
|
||||||
/opt/conda/bin/conda init bash
|
/opt/conda/bin/conda init bash
|
||||||
|
|
||||||
ARG CONDA_ENV_NAME="deepspeed"
|
ENV CONDA_ENV_NAME="deepspeed"
|
||||||
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
|
ENV PYTHON_VERSION="3.10"
|
||||||
ARG PYTHON_VERSION="3.10"
|
|
||||||
ENV PYTHON_VERSION=${PYTHON_VERSION}
|
|
||||||
ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:${PATH}
|
ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:${PATH}
|
||||||
# Create and activate a conda environment
|
# Create and activate a conda environment
|
||||||
RUN /opt/conda/bin/conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} cmake ninja -y && \
|
RUN /opt/conda/bin/conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} cmake ninja -y && \
|
||||||
@@ -62,9 +57,9 @@ RUN /opt/conda/bin/conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} cm
|
|||||||
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc && \
|
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc && \
|
||||||
/bin/bash -c "source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME}"
|
/bin/bash -c "source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME}"
|
||||||
|
|
||||||
|
|
||||||
# install cutlass https://github.com/NVIDIA/cutlass
|
# install cutlass https://github.com/NVIDIA/cutlass
|
||||||
ARG DCUTLASS_NVCC_ARCHS="80;89;90;90a"
|
ENV DCUTLASS_NVCC_ARCHS="80;89;90;90a"
|
||||||
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
|
|
||||||
RUN \
|
RUN \
|
||||||
source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
|
source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
|
||||||
git clone https://github.com/NVIDIA/cutlass /opt/cutlass && \
|
git clone https://github.com/NVIDIA/cutlass /opt/cutlass && \
|
||||||
@@ -78,10 +73,12 @@ RUN \
|
|||||||
|
|
||||||
|
|
||||||
# Mellanox OFED
|
# Mellanox OFED
|
||||||
|
# ENV MLNX_OFED_VERSION=5.8-5.1.1.2
|
||||||
ENV MLNX_OFED_VERSION=23.10-3.2.2.0
|
ENV MLNX_OFED_VERSION=23.10-3.2.2.0
|
||||||
RUN \
|
RUN \
|
||||||
source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
|
source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
|
||||||
apt-get install -y libnuma-dev libnvidia-compute-515 && \
|
apt-get install -y libnuma-dev libnvidia-compute-515 && \
|
||||||
|
# apt-get install -y libnuma-dev libnvidia-compute-535 && \
|
||||||
cd ${STAGE_DIR} && \
|
cd ${STAGE_DIR} && \
|
||||||
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - && \
|
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - && \
|
||||||
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \
|
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \
|
||||||
@@ -89,6 +86,7 @@ RUN \
|
|||||||
cd ${STAGE_DIR} && \
|
cd ${STAGE_DIR} && \
|
||||||
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
|
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
|
||||||
|
|
||||||
|
|
||||||
# nv_peer_mem
|
# nv_peer_mem
|
||||||
ENV NV_PEER_MEM_VERSION=1.2
|
ENV NV_PEER_MEM_VERSION=1.2
|
||||||
# ENV NV_PEER_MEM_VERSION=1.3
|
# ENV NV_PEER_MEM_VERSION=1.3
|
||||||
@@ -135,6 +133,7 @@ RUN \
|
|||||||
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
|
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
|
||||||
chmod a+x /usr/local/mpi/bin/mpirun
|
chmod a+x /usr/local/mpi/bin/mpirun
|
||||||
|
|
||||||
|
|
||||||
ENV PYTORCH_VERSION=2.3.0
|
ENV PYTORCH_VERSION=2.3.0
|
||||||
ENV TORCHVISION_VERSION=0.18.0
|
ENV TORCHVISION_VERSION=0.18.0
|
||||||
ENV TORCHAUDIO_VERSION=2.3.0
|
ENV TORCHAUDIO_VERSION=2.3.0
|
||||||
@@ -144,9 +143,7 @@ RUN \
|
|||||||
source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
|
source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
|
||||||
pip install torch==${PYTORCH_VERSION}+${PYTORCH_CUDA_VERSION} torchvision==${TORCHVISION_VERSION}+${PYTORCH_CUDA_VERSION} torchaudio==${TORCHAUDIO_VERSION}+${PYTORCH_CUDA_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} && \
|
pip install torch==${PYTORCH_VERSION}+${PYTORCH_CUDA_VERSION} torchvision==${TORCHVISION_VERSION}+${PYTORCH_CUDA_VERSION} torchaudio==${TORCHAUDIO_VERSION}+${PYTORCH_CUDA_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} && \
|
||||||
pip install packaging && \
|
pip install packaging && \
|
||||||
pip install flash-attn && \
|
pip install flash-attn
|
||||||
pip install deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub optimum-benchmark && \
|
|
||||||
pip install tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy Pillow blobfile requests scipy pycocotools protobuf pyyaml ipython psutil pydantic
|
|
||||||
|
|
||||||
# Install apex with CUDA and C++ extensions
|
# Install apex with CUDA and C++ extensions
|
||||||
# pip --version | grep -q "pip 23.1" && \
|
# pip --version | grep -q "pip 23.1" && \
|
||||||
@@ -161,6 +158,28 @@ RUN \
|
|||||||
cd / && \
|
cd / && \
|
||||||
rm -rf /tmp/apex
|
rm -rf /tmp/apex
|
||||||
|
|
||||||
|
# RUN \
|
||||||
|
# source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
|
||||||
|
# git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed && \
|
||||||
|
# cd ${STAGE_DIR}/DeepSpeed && \
|
||||||
|
# git checkout ${DEEPSPEED_VERSION} && \
|
||||||
|
# sed 's/pip install/python -m pip install/' install.sh > install_modified.sh && \
|
||||||
|
# chmod +x ./install_modified.sh && \
|
||||||
|
# if [ -n "${HOSTFILE_CONTENT}" ]; then \
|
||||||
|
# echo "${HOSTFILE_CONTENT}" > /tmp/hostfile && \
|
||||||
|
# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"; \
|
||||||
|
# else \
|
||||||
|
# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"; \
|
||||||
|
# fi && \
|
||||||
|
# eval $INSTALL_CMD && \
|
||||||
|
# ds_report
|
||||||
|
|
||||||
|
# Install Python packages into the conda environment: first DeepSpeed and the
# Hugging Face libraries, then general-purpose utilities.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN \
    source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \
    pip install --no-cache-dir deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub && \
    pip install --no-cache-dir regex tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy Pillow blobfile requests scipy pycocotools protobuf pyyaml ipython ipdb psutil pydantic
|
||||||
|
|
||||||
|
|
||||||
RUN \
|
RUN \
|
||||||
echo 'root:root' | chpasswd && \
|
echo 'root:root' | chpasswd && \
|
||||||
cp /etc/ssh/sshd_config /tmp/sshd_config && \
|
cp /etc/ssh/sshd_config /tmp/sshd_config && \
|
||||||
@@ -173,6 +192,6 @@ RUN \
|
|||||||
chown root:root /etc/ssh/sshd_config && \
|
chown root:root /etc/ssh/sshd_config && \
|
||||||
mkdir -p /run/sshd && chmod 0755 /run/sshd
|
mkdir -p /run/sshd && chmod 0755 /run/sshd
|
||||||
|
|
||||||
RUN \
|
# RUN \
|
||||||
bash -c 'echo -e "export CUTLASS_PATH=${CUTLASS_PATH}\nexport CUDA_HOME=${CUDA_HOME}\nexport PATH=${PATH}\nexport LD_LIBRARY_PATH=${LD_LIBRARY_PATH}\n" | cat - ~/.bashrc > temp && mv temp ~/.bashrc'
|
# bash -c 'echo -e "export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"\nexport CUTLASS_PATH=${CUTLASS_PATH}\nexport CUDA_HOME=${CUDA_HOME}\nexport PATH=${PATH}\nexport LD_LIBRARY_PATH=${LD_LIBRARY_PATH}\n" | cat - ~/.bashrc > temp && mv temp ~/.bashrc'
|
||||||
|
|
||||||
|
|||||||
35
finetune/docker-compose_m_d.yml
Normal file
35
finetune/docker-compose_m_d.yml
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
|
||||||
|
# Single-service stack: runs the hotwa/magadeep image with every NVIDIA GPU
# reserved and sshd as the container's foreground process.
services:
  ldh-megatron-deepspeed-test:
    image: hotwa/magadeep:latest
    shm_size: "128gb"
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all GPUs offered by the nvidia driver.
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    # runtime: nvidia
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    # stdin_open: true
    # tty: true
    privileged: true
    cap_add:
      - IPC_LOCK
    volumes:
      # Host workspace directory, mounted at /root/data in the container.
      - /root/workspace:/root/data
      # InfiniBand device nodes passed through from the host.
      - /dev/infiniband:/dev/infiniband
    # ports:
    #   - "22242:22242"
    #   - "5000:5000"
    # networks:
    #   - ldh_overlay_network
    # The container shares the host's network stack.
    network_mode: host
    # -D keeps sshd attached (no daemonizing) so it stays the main process.
    command:
      - "/usr/sbin/sshd"
      - "-D"

# networks:
#   ldh_overlay_network:
#     external: true
|
||||||
38
finetune/docker-compose_mega.yml
Normal file
38
finetune/docker-compose_mega.yml
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
|
||||||
|
# Single-service stack: runs the NVIDIA PyTorch 24.02 container interactively
# with every NVIDIA GPU reserved and host networking.
services:
  megatron-test:
    image: nvcr.io/nvidia/pytorch:24.02-py3
    shm_size: '560gb'
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all GPUs offered by the nvidia driver.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    #runtime: nvidia
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      #- CUTLASS_PATH="/opt/cutlass"
      #- CUDA_HOME="/usr/local/cuda"
      #- PATH="${CUDA_HOME}/bin:${PATH}"
      #- LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
    # Keep stdin open and allocate a TTY so the container can be used interactively.
    stdin_open: true
    tty: true
    privileged: true
    cap_add:
      - IPC_LOCK
    volumes:
      # Host workspace directory, mounted at /mnt in the container.
      - /root/workspace:/mnt
      # InfiniBand device nodes passed through from the host.
      - /dev/infiniband:/dev/infiniband
      # - /mnt/local-nvme:/root/
    # Port mappings must not be combined with `network_mode: host`: the
    # container already shares the host's network stack, so anything listening
    # on port 5000 is reachable on the host without a mapping. Re-enable this
    # only if `network_mode: host` is removed.
    # ports:
    #   - "5000:5000"
    # networks:
    #   - ldh_overlay_network
    network_mode: host

# networks:
#   ldh_overlay_network:
#     external: true
|
||||||
Reference in New Issue
Block a user