diff --git a/finetune/Dockerfile.ldh b/finetune/Dockerfile.ldh index ecfe16d..7424c7f 100644 --- a/finetune/Dockerfile.ldh +++ b/finetune/Dockerfile.ldh @@ -1,5 +1,6 @@ # FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04 +# FROM nvcr.io/nvidia/pytorch:24.02-py3 ENV DEBIAN_FRONTEND="noninteractive" @@ -7,10 +8,10 @@ ENV STAGE_DIR="/tmp" RUN mkdir -p ${STAGE_DIR} ENV CUTLASS_PATH="/opt/cutlass" - ENV CUDA_HOME="/usr/local/cuda" ENV PATH=${CUDA_HOME}/bin:${PATH} ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} +ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0+PTX" SHELL ["/bin/bash", "-c"] @@ -28,12 +29,7 @@ RUN \ llvm-dev && \ apt-get install -y git python3 python3-pip && \ apt-get install -y --no-install-recommends \ - libsndfile-dev \ - libcupti-dev \ - libjpeg-dev \ - libpng-dev \ - screen \ - libaio-dev + libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev RUN \ apt-get update && \ apt-get install -y lsof swig libmnl0 libltdl-dev libfuse2 udev tcl libgfortran5 \ @@ -42,6 +38,7 @@ RUN \ apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags && \ apt-get install -y quilt python3-distutils + # Install Miniconda RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ /bin/bash ~/miniconda.sh -b -p /opt/conda -u && \ @@ -51,10 +48,8 @@ ENV PATH=/opt/conda/bin:${PATH} RUN \ /opt/conda/bin/conda init bash -ARG CONDA_ENV_NAME="deepspeed" -ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} -ARG PYTHON_VERSION="3.10" -ENV PYTHON_VERSION=${PYTHON_VERSION} +ENV CONDA_ENV_NAME="deepspeed" +ENV PYTHON_VERSION="3.10" ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:${PATH} # Create and activate a conda environment RUN /opt/conda/bin/conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} cmake ninja -y && \ @@ -62,9 +57,9 @@ RUN /opt/conda/bin/conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} cm echo "conda activate ${CONDA_ENV_NAME}" 
>> ~/.bashrc && \ /bin/bash -c "source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME}" + # install cutlass https://github.com/NVIDIA/cutlass -ARG DCUTLASS_NVCC_ARCHS="80;89;90;90a" -ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} +ENV DCUTLASS_NVCC_ARCHS="80;89;90;90a" RUN \ source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ git clone https://github.com/NVIDIA/cutlass /opt/cutlass && \ @@ -78,10 +73,12 @@ RUN \ # Mellanox OFED +# ENV MLNX_OFED_VERSION=5.8-5.1.1.2 ENV MLNX_OFED_VERSION=23.10-3.2.2.0 RUN \ source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ apt-get install -y libnuma-dev libnvidia-compute-515 && \ + # apt-get install -y libnuma-dev libnvidia-compute-535 && \ cd ${STAGE_DIR} && \ wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - && \ cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \ @@ -89,6 +86,7 @@ RUN \ cd ${STAGE_DIR} && \ rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64* + # nv_peer_mem ENV NV_PEER_MEM_VERSION=1.2 # ENV NV_PEER_MEM_VERSION=1.3 @@ -135,6 +133,7 @@ RUN \ echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \ chmod a+x /usr/local/mpi/bin/mpirun + ENV PYTORCH_VERSION=2.3.0 ENV TORCHVISION_VERSION=0.18.0 ENV TORCHAUDIO_VERSION=2.3.0 @@ -144,9 +143,7 @@ RUN \ source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ pip install torch==${PYTORCH_VERSION}+${PYTORCH_CUDA_VERSION} torchvision==${TORCHVISION_VERSION}+${PYTORCH_CUDA_VERSION} torchaudio==${TORCHAUDIO_VERSION}+${PYTORCH_CUDA_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} && \ pip install packaging && \ - pip install flash-attn && \ - pip install deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub 
optimum-benchmark && \ - pip install tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy Pillow blobfile requests scipy pycocotools protobuf pyyaml ipython psutil pydantic + pip install flash-attn # Install apex with CUDA and C++ extensions # pip --version | grep -q "pip 23.1" && \ @@ -161,6 +158,28 @@ RUN \ cd / && \ rm -rf /tmp/apex +# RUN \ +# source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ +# git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed && \ +# cd ${STAGE_DIR}/DeepSpeed && \ +# git checkout ${DEEPSPEED_VERSION} && \ +# sed 's/pip install/python -m pip install/' install.sh > install_modified.sh && \ +# chmod +x ./install_modified.sh && \ +# if [ -n "${HOSTFILE_CONTENT}" ]; then \ +# echo "${HOSTFILE_CONTENT}" > /tmp/hostfile && \ +# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"; \ +# else \ +# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"; \ +# fi && \ +# eval $INSTALL_CMD && \ +# ds_report + +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + pip install deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub && \ + pip install regex tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy Pillow blobfile requests scipy pycocotools protobuf pyyaml ipython ipdb psutil pydantic + + RUN \ echo 'root:root' | chpasswd && \ cp /etc/ssh/sshd_config /tmp/sshd_config && \ @@ -173,6 +192,6 @@ RUN \ chown root:root /etc/ssh/sshd_config && \ mkdir -p /run/sshd && chmod 0755 /run/sshd -RUN \ - bash -c 'echo -e "export CUTLASS_PATH=${CUTLASS_PATH}\nexport CUDA_HOME=${CUDA_HOME}\nexport PATH=${PATH}\nexport LD_LIBRARY_PATH=${LD_LIBRARY_PATH}\n" | cat - ~/.bashrc > temp && mv temp ~/.bashrc' +# RUN \ +# bash -c 'echo -e "export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"\nexport CUTLASS_PATH=${CUTLASS_PATH}\nexport 
CUDA_HOME=${CUDA_HOME}\nexport PATH=${PATH}\nexport LD_LIBRARY_PATH=${LD_LIBRARY_PATH}\n" | cat - ~/.bashrc > temp && mv temp ~/.bashrc' diff --git a/finetune/docker-compose_m_d.yml b/finetune/docker-compose_m_d.yml new file mode 100644 index 0000000..f82ad3f --- /dev/null +++ b/finetune/docker-compose_m_d.yml @@ -0,0 +1,35 @@ + +services: + ldh-megatron-deepspeed-test: + image: hotwa/magadeep:latest + shm_size: '128gb' + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + #runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + # stdin_open: true + # tty: true + privileged: true + cap_add: + - IPC_LOCK + volumes: + - /root/workspace:/root/data + - /dev/infiniband:/dev/infiniband + # ports: + # - "22242:22242" + # - "5000:5000" + # networks: + # - ldh_overlay_network + network_mode: host + command: ["/usr/sbin/sshd", "-D"] + +# networks: +# ldh_overlay_network: +# external: true diff --git a/finetune/docker-compose_mega.yml b/finetune/docker-compose_mega.yml new file mode 100644 index 0000000..adeb72a --- /dev/null +++ b/finetune/docker-compose_mega.yml @@ -0,0 +1,38 @@ + +services: + megatron-test: + image: nvcr.io/nvidia/pytorch:24.02-py3 + shm_size: '560gb' + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + #runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + #- CUTLASS_PATH="/opt/cutlass" + #- CUDA_HOME="/usr/local/cuda" + #- PATH="${CUDA_HOME}/bin:${PATH}" + #- LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}" + stdin_open: true + tty: true + privileged: true + cap_add: + - IPC_LOCK + volumes: + - /root/workspace:/mnt + - /dev/infiniband:/dev/infiniband + # - /mnt/local-nvme:/root/ + # ports incompatible with network_mode: host (container fails to start); host networking already exposes 5000 + # ports: + # - "5000:5000" + # networks: + # - ldh_overlay_network + network_mode: host + +# networks: +# ldh_overlay_network: +# external: true