From 76a0bfc1dfc208fdb08581b96067f8bce0e344c5 Mon Sep 17 00:00:00 2001 From: lingyuzeng Date: Thu, 18 Jul 2024 16:42:26 +0800 Subject: [PATCH] add --- finetune/Dockerfile.o | 330 -------------------------------- finetune/docker-compose_ldh.yml | 37 ++-- 2 files changed, 24 insertions(+), 343 deletions(-) delete mode 100644 finetune/Dockerfile.o diff --git a/finetune/Dockerfile.o b/finetune/Dockerfile.o deleted file mode 100644 index 1a26cc5..0000000 --- a/finetune/Dockerfile.o +++ /dev/null @@ -1,330 +0,0 @@ -# syntax=docker/dockerfile:1 - -# NOTE: Building this image require's docker version >= 23.0. -# -# For reference: -# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel -ARG CUDA_VERSION=12.1.0 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04 -ARG HTTP_PROXY -ARG HTTPS_PROXY -ENV http_proxy=${HTTP_PROXY} -ENV https_proxy=${HTTPS_PROXY} -ARG DEBIAN_FRONTEND="noninteractive" -ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} -ENV MAMBA_ROOT_PREFIX=~/micromamba -ARG ROOT_PASSWD="root" -ENV ROOT_PASSWD=${ROOT_PASSWD} -WORKDIR /root -SHELL ["/bin/bash", "-c"] -COPY id_rsa.pub /root/.ssh/id_rsa.pub - -# base tools -RUN apt-get update && \ - apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags && \ - apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2 && \ - apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c && \ - add-apt-repository ppa:git-core/ppa -y && \ - apt-get install -y git libnuma-dev wget && \ - wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc | sudo apt-key add - && \ - sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ - sudo apt-get update && \ - sudo apt-get install -y cmake && \ - sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ - sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config && \ - sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config && \ - sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config && \ - mkdir /var/run/sshd && \ - echo "root:${ROOT_PASSWD}" | chpasswd && \ - mkdir -p ~/.pip && \ - wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && \ - bash /tmp/miniconda.sh -b -p /opt/conda && \ - rm /tmp/miniconda.sh && \ - conda init bash && \ - ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ - echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - cat < ~/.condarc -channels: - - conda-forge - - bioconda - - pytorch - - pytorch-nightly - - nvidia - - defaults -show_channel_urls: true -EOF - -# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile -# PyTorch -ARG CONDA_ENV_NAME="deepspeed" -ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} -ARG PYTHON_VERSION=3.10 -ENV PYTHON_VERSION=${PYTHON_VERSION} -ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH -ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3" -ENV REF='main' -ENV STAGE_DIR=/tmp -ENV OPENMPI_BASEVERSION=4.1 -ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 -ARG CUDA='cu121' -ENV CUDA=${CUDA} -ARG PYTORCH_VERSION=2.3.1 -ENV PYTORCH_VERSION=${PYTORCH_VERSION} -ARG TORCHVISION_VERSION=0.18.1 -ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} -ARG TORCHAUDIO_VERSION=2.3.1 -ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} -ARG PYTORCH_CUDA_VERSION=12.1 -ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} -ARG SETUPTOOLS_VERSION=69.5.1 -ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} -ARG USE_CUDA=1 -ENV USE_CUDA=${USE_CUDA} -ARG USE_ROCM=0 -ENV USE_ROCM=${USE_ROCM} -ARG USE_XPU=0 -ENV USE_XPU=${USE_XPU} -ARG _GLIBCXX_USE_CXX11_ABI=1 -ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI} -RUN source /opt/conda/etc/profile.d/conda.sh && \ - conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} -c conda-forge -y && \ - echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc && \ - which python > ~/python_path.txt && \ - conda activate ${CONDA_ENV_NAME} && \ - git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja && \ - cd ${STAGE_DIR}/ninja && \ - git clone https://github.com/google/googletest.git && \ - python ./configure.py --bootstrap && \ - conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest" && \ - ./ninja all && \ - ./ninja_test && \ - python3 -m pip install --no-cache-dir --upgrade pip && \ - python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python && \ - conda clean -afy && \ - git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. && \ - python -m pip install setuptools==${SETUPTOOLS_VERSION} && \ - python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] && \ - python3 -m pip uninstall -y torch torchvision torchaudio && \ - python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA} && \ - python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate && \ - python3 -m pip uninstall -y transformer-engine && \ - python3 -m pip uninstall -y torch-tensorrt && \ - python3 -m pip uninstall -y apex - -# install apex -RUN source /opt/conda/etc/profile.d/conda.sh && \ - conda activate ${CONDA_ENV_NAME} && \ - git clone https://github.com/NVIDIA/apex ${STAGE_DIR}/apex && \ - cd apex && \ - MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \ - python -c "import apex.amp; print('Apex is installed and the amp module is available.')" && \ - cd .. && \ - rm -rf ${STAGE_DIR}/apex - -ENV MLNX_OFED_VERSION=5.8-4.1.5.0 -RUN source /opt/conda/etc/profile.d/conda.sh && \ - conda activate ${CONDA_ENV_NAME} && \ - mkdir -p ${STAGE_DIR} && \ - wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - && \ - cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \ - ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \ - cd ${STAGE_DIR} && \ - rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64* - -ENV NV_PEER_MEM_VERSION=1.2 -ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 -RUN source /opt/conda/etc/profile.d/conda.sh && \ - conda activate ${CONDA_ENV_NAME} && \ - rm -rf ${STAGE_DIR} && \ - mkdir -p ${STAGE_DIR} && \ - git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \ - cd ${STAGE_DIR}/nv_peer_memory && \ - ./build_module.sh && \ - cd ${STAGE_DIR} && \ - tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \ - cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \ - apt-get update && \ - apt --fix-broken install -y && \ - apt-get install -y dkms && \ - dpkg-buildpackage -us -uc && \ - dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb - -# install mpi -ENV PATH=/usr/local/mpi/bin:${PATH} -ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} -RUN source /opt/conda/etc/profile.d/conda.sh && \ - conda activate ${CONDA_ENV_NAME} && \ - rm -rf ${STAGE_DIR} && \ - mkdir -p ${STAGE_DIR} && \ - cd ${STAGE_DIR} && \ - wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \ - cd openmpi-${OPENMPI_VERSION} && \ - ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \ - make -j"$(nproc)" install && \ - ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \ - test -f /usr/local/mpi/bin/mpic++ && \ - cd ${STAGE_DIR} && \ - rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} && \ - mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \ - echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \ - echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \ - chmod a+x /usr/local/mpi/bin/mpirun - -# SSH daemon port inside container cannot conflict with host OS port -# ENV SSH_PORT=2222 -# RUN source /opt/conda/etc/profile.d/conda.sh && \ -# conda activate ${CONDA_ENV_NAME} && \ -# cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \ -# sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config - -RUN source /opt/conda/etc/profile.d/conda.sh && \ - conda activate ${CONDA_ENV_NAME} && \ - useradd --create-home --uid 1000 --shell /bin/bash deepspeed && \ - usermod -aG sudo deepspeed && \ - echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers - -# install cutlass https://github.com/NVIDIA/cutlass -# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) -# A100: architecture is Ampere -# V100: architecture is Volta -# T4: architecture is Turing -# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc -# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 -# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 -# 80:适用于 NVIDIA Ampere 架构(如 A100)。 -# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 -# 89:GeForce RTX 4090 -ARG DCUTLASS_NVCC_ARCHS="80;89;90a" -ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -RUN source /opt/conda/etc/profile.d/conda.sh && \ - conda activate ${CONDA_ENV_NAME} && \ - git clone https://github.com/NVIDIA/cutlass /opt/cutlass && \ - cd /opt/cutlass && \ - git checkout . && \ - git checkout master && \ - mkdir build && \ - cd build && \ - cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON && \ - make -j"$(nproc)" install && \ - cd .. - -# Some Packages from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile -# RUN source /opt/conda/etc/profile.d/conda.sh && \ -# conda activate ${CONDA_ENV_NAME} && \ -# apt-get update && \ -# apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev && \ -# python -m pip install pipdeptree psutil yappi cffi ipdb pandas matplotlib py3nvml pyarrow graphviz astor boto3 tqdm sentencepiece msgpack requests pandas sphinx sphinx_rtd_theme scipy numpy scikit-learn nvidia-ml-py3 mpi4py - -# install deepspeed step 1 -RUN source /opt/conda/etc/profile.d/conda.sh && \ - conda activate ${CONDA_ENV_NAME} && \ - /opt/conda/envs/${CONDA_ENV_NAME}/bin/python -m pip install setuptools==${SETUPTOOLS_VERSION} && \ - git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL && \ - cd ${STAGE_DIR}/oneCCL && \ - git checkout . && \ - git checkout master && \ - mkdir build && \ - cd build && \ - cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local && \ - make -j"$(nproc)" install - -# install deepspeed step 2 -ARG CUDA_ARCH_LIST="80;86;89;90" -ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST} -RUN source /opt/conda/etc/profile.d/conda.sh && \ - conda activate ${CONDA_ENV_NAME} && \ - git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels && \ - cd ${STAGE_DIR}/DeepSpeed-Kernels && \ - CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python -m pip install -v . - -ARG DEEPSPEED_VERSION="v0.14.3" -ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION} -ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose" -ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS} -ARG DS_BUILD_SPARSE_ATTN=0 -ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN} -ARG DS_BUILD_FUSED_ADAM=1 -ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM} -ARG DS_BUILD_CPU_ADAM=1 -ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} -ARG DS_BUILD_OPS=1 -ENV DS_BUILD_OPS=${DS_BUILD_OPS} -ARG HOSTFILE_CONTENT="" -ENV HOSTFILE_CONTENT=${HOSTFILE_CONTENT} -ENV CUTLASS_PATH='/opt/cutlass' -ENV CUDA_HOME='/usr/local/cuda' -ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} -ENV PATH=${CUDA_HOME}/bin:${PATH} - -# install deepspeed step 3 -RUN source /opt/conda/etc/profile.d/conda.sh && \ - conda activate ${CONDA_ENV_NAME} && \ - git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed && \ - cd ${STAGE_DIR}/DeepSpeed && \ - git checkout ${DEEPSPEED_VERSION} && \ - sed 's/pip install/python -m pip install/' install.sh > install_modified.sh && \ - chmod +x ./install_modified.sh && \ - if [ -n "${HOSTFILE_CONTENT}" ]; then \ - echo "${HOSTFILE_CONTENT}" > /tmp/hostfile && \ - INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"; \ - else \ - INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"; \ - fi && \ - eval $INSTALL_CMD && \ - cat < ~/compile_deepspeed_ops.py -import deepspeed - -def compile_ops(): - builders = [ - deepspeed.ops.op_builder.AsyncIOBuilder, - deepspeed.ops.op_builder.FusedAdamBuilder, - deepspeed.ops.op_builder.CPUAdamBuilder, - deepspeed.ops.op_builder.CPUAdagradBuilder, - deepspeed.ops.op_builder.CPULionBuilder, - deepspeed.ops.op_builder.EvoformerAttnBuilder, - deepspeed.ops.op_builder.FPQuantizerBuilder, - deepspeed.ops.op_builder.FusedLambBuilder, - deepspeed.ops.op_builder.FusedLionBuilder, - deepspeed.ops.op_builder.QuantizerBuilder, - deepspeed.ops.op_builder.RaggedOpsBuilder, - deepspeed.ops.op_builder.RandomLTDBuilder, - deepspeed.ops.op_builder.SparseAttnBuilder, - deepspeed.ops.op_builder.SpatialInferenceBuilder, - deepspeed.ops.op_builder.TransformerBuilder, - deepspeed.ops.op_builder.StochasticTransformerBuilder, - ] - - for builder in builders: - print(f"Compiling {builder.__name__}") - builder().load() - -if __name__ == "__main__": - compile_ops() -EOF && \ - python compile_deepspeed_ops.py && \ - ds_report - -# install transformers and flash-attn -RUN source /opt/conda/etc/profile.d/conda.sh && \ - conda activate ${CONDA_ENV_NAME} && \ - git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers && \ - cd ${STAGE_DIR}/transformers && \ - python3 ./setup.py develop && \ - python3 -m pip install -U --no-cache-dir "pydantic<2" && \ - pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org - -# other packages -ENV TORCH_CUDA_ARCH_LIST="80;86;89;90" -RUN source /opt/conda/etc/profile.d/conda.sh && \ - conda activate ${CONDA_ENV_NAME} && \ - pip3 install optimum && \ - pip3 install peft tiktoken tqdm matplotlib seaborn numpy pandas scikit-learn diffusers huggingface_hub spacy blobfile pycocotools open_clip_torch zstandard -i https://pypi.org/simple/ --trusted-host pypi.org - -ARG DEEPSPEED_TRAIN='/data/train_data' -ENV DEEPSPEED_TRAIN=DEEPSPEED_TRAIN -ARG DEEPSPEED_VALIDATION='/data/validation_data' -ENV DEEPSPEED_VALIDATION=DEEPSPEED_VALIDATION -ARG NCCL_SOCKET_IFNAME='eth0' - -CMD ["/usr/sbin/sshd", "-D"] -# CMD ["/bin/bash", "-c", "/usr/sbin/sshd -D & while true; do sleep 1000; done"] diff --git a/finetune/docker-compose_ldh.yml b/finetune/docker-compose_ldh.yml index 1baeef1..ffefef1 100644 --- a/finetune/docker-compose_ldh.yml +++ b/finetune/docker-compose_ldh.yml @@ -5,10 +5,26 @@ services: context: . dockerfile: Dockerfile.ldh args: + # PYTHON_VERSION: "3.10" + # CUDA_VERSION: "12.1.0" + # PYTORCH_VERSION: "2.3.0" + # TORCHVISION_VERSION: "0.18.0" + # TORCHAUDIO_VERSION: "2.3.0" + # DS_BUILD_OPS: 1 + # USE_CUDA: 1 + # USE_ROCM: 0 + # USE_XPU: 0 + # CUDA: cu121 + # CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + # SETUPTOOLS_VERSION: "69.5.1" + # DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 + # DEEPSPEED_VERSION: "master" + # DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" HTTP_PROXY: "http://127.0.0.1:15777" HTTPS_PROXY: "http://127.0.0.1:15777" + # cache-from: "type=local" image: ldh/deepspeed:test - shm_size: '32gb' + shm_size: '128gb' deploy: resources: reservations: @@ -20,22 +36,17 @@ services: environment: - NVIDIA_VISIBLE_DEVICES=all - NVIDIA_DRIVER_CAPABILITIES=compute,utility - #- CUTLASS_PATH="/opt/cutlass" - #- CUDA_HOME="/usr/local/cuda" - #- PATH="${CUDA_HOME}/bin:${PATH}" - #- LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}" - stdin_open: true - tty: true + # stdin_open: true + # tty: true privileged: true cap_add: - IPC_LOCK volumes: - # - /mnt/local-nvme2:/root/workspace - # - /dev/infiniband:/dev/infiniband - # - /mnt/local-nvme:/root/ - ports: - - "22242:22242" - - "5000:5000" + - /root/workspace:/root/data + - /dev/infiniband:/dev/infiniband + # ports: + # - "22242:22242" + # - "5000:5000" # networks: # - ldh_overlay_network network_mode: host