first add

This commit is contained in:
2024-08-28 15:18:15 +08:00
commit 873429d4e6
57 changed files with 4892 additions and 0 deletions

6
ldh/.deepspeed_env Normal file
View File

@@ -0,0 +1,6 @@
CUDA_HOME=/usr/local/cuda
CUTLASS_PATH=/opt/cutlass
TORCH_CUDA_ARCH_LIST="8.0 9.0+PTX"
PATH=/opt/openmpi/bin:/usr/lib/jvm/default-java/bin:/usr/local/cuda/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
LD_LIBRARY_PATH=/opt/openmpi/lib:/usr/local/cuda/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs

212
ldh/Dockerfile Normal file
View File

@@ -0,0 +1,212 @@
# CUDA 12.4.1 + cuDNN devel base on Ubuntu 22.04 -- the devel image ships the
# toolchain and headers needed by the from-source builds below (CUTLASS,
# Open MPI, apex, flash-attention, xformers, TransformerEngine).
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

# Build-time only: as an ARG this does not persist into the runtime
# environment the way ENV would.
ARG DEBIAN_FRONTEND=noninteractive

# SECURITY FIX: the original baked a real HuggingFace token into the image via
# ENV, exposing it in `docker history` and in every running container. Supply
# it at build time instead:
#   docker build --build-arg HF_TOKEN=hf_... .
# (A BuildKit secret mount, RUN --mount=type=secret, would be stronger still.)
ARG HF_TOKEN

# Toolchain / build-configuration knobs consumed by the build steps below.
ENV CUDA_HOME="/usr/local/cuda"
ENV JAVA_HOME="/usr/lib/jvm/default-java"
ENV CUTLASS_PATH="/opt/cutlass"
# CUTLASS targets: SM80 (A100) and SM90a (H100).
ENV CUTLASS_NVCC_ARCHS="80;90a"
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
ENV PYTORCH_CUDA_VERSION="cu124"
# Arch list used by torch extension builds (apex, flash-attention, xformers).
ENV TORCH_CUDA_ARCH_LIST="8.0 9.0+PTX"
ENV PATH=/opt/openmpi/bin:${CUDA_HOME}/bin:$JAVA_HOME/bin:${PATH}
ENV LD_LIBRARY_PATH=/opt/openmpi/lib:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
ENV LIBRARY_PATH=${CUDA_HOME}/lib64:${LIBRARY_PATH}

# bash for all RUN steps; -o pipefail so a failing producer in a pipeline
# (e.g. `echo ... | cat - ...`) fails the build instead of being masked.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
WORKDIR /root
# OS packages in ONE update+install layer. The original ran a second
# `apt-get update` + `apt-get install -y` (without --no-install-recommends)
# for git/python3/antlr4, duplicating work and pulling in recommended
# packages. List is alphabetized for easier diffing.
RUN \
apt-get update && \
apt-get install -y --no-install-recommends \
antlr4 autotools-dev build-essential ca-certificates cmake curl \
default-jre emacs g++ gcc git htop iftop iotop iputils-ping less \
libaio-dev libcupti-dev libjpeg-dev libnuma-dev libpng-dev libsndfile-dev \
llvm-dev net-tools nfs-common ninja-build openssh-client openssh-server \
pdsh python3 python3-pip rsync screen software-properties-common sudo \
tmux unzip vim wget && \
python3 -m pip install --no-cache-dir --upgrade pip wheel && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# # DOCA https://developer.nvidia.com/doca-archive
# RUN \
# wget --quiet https://www.mellanox.com/downloads/DOCA/DOCA_v2.5.2/host/doca-host-repo-ubuntu2204_2.5.2-0.0.6.2.5.2003.1.23.10.3.2.2.0_amd64.deb -O /tmp/doca-host-repo-ubuntu2204_2.5.2-0.0.6.2.5.2003.1.23.10.3.2.2.0_amd64.deb && \
# dpkg -i /tmp/doca-host-repo-ubuntu2204_2.5.2-0.0.6.2.5.2003.1.23.10.3.2.2.0_amd64.deb && \
# apt-get update && \
# apt-get -y install doca-runtime doca-sdk doca-tools
# cutlass https://github.com/NVIDIA/cutlass
# Clones CUTLASS into /opt/cutlass (exported as CUTLASS_PATH) and only
# *configures* the build -- no `make` is run here; the commented lines below
# show the full-build/profiler/test variants kept for reference.
# NOTE(review): `git checkout main` is unpinned, so this layer is not
# reproducible -- consider checking out a release tag instead.
RUN \
git clone https://github.com/NVIDIA/cutlass.git /opt/cutlass && \
cd /opt/cutlass && \
git fetch --all --tags && \
git checkout main && \
git submodule update --init --recursive && \
export CUDACXX=${CUDA_HOME}/bin/nvcc && \
mkdir build && \
cd build && \
cmake .. -DCUTLASS_NVCC_ARCHS=${CUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON
# cmake .. -DCUTLASS_NVCC_ARCHS=${CUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=ON -DCUTLASS_LIBRARY_KERNELS=all -DCUTLASS_UNITY_BUILD_ENABLED=ON && \
# make -j"$(nproc)" install
# make cutlass_profiler -j"$(nproc)"
# make test_unit -j"$(nproc)" VERBOSE=1
# OPENMPI https://www.open-mpi.org/software/ompi/v4.1/
# Builds Open MPI ${OPENMPI_VERSION} from source into a versioned prefix and
# publishes it at /opt/openmpi via symlink (PATH / LD_LIBRARY_PATH set in the
# header already point there). Tarball and source tree are removed in this
# same layer, so they never bloat the image.
RUN \
wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz && \
tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \
cd /tmp/openmpi-${OPENMPI_VERSION} && \
./configure --prefix=/opt/openmpi-${OPENMPI_VERSION} && \
# CUDA-aware configure variant kept for reference:
# ./configure --prefix=/opt/openmpi-${OPENMPI_VERSION} --with-cuda=/usr/local/cuda --enable-python-bindings --with-python=/usr/bin/python3 && \
make -j$(nproc) && \
make install && \
ln -s /opt/openmpi-${OPENMPI_VERSION} /opt/openmpi && \
# Sanity check:
test -f /opt/openmpi/bin/mpic++ && \
cd /root && \
rm -rf /tmp/*
# pytorch https://pytorch.org
# Wheels matched to the container's CUDA via the cu124 index.
# --no-cache-dir keeps pip's download cache out of this layer: the later
# `rm -rf /root/.cache/pip` runs in a separate layer and cannot shrink
# layers created here.
RUN \
python3 -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} && \
python3 -m pip install --no-cache-dir packaging pillow requests jinja2 triton networkx numpy tqdm urllib3 certifi setuptools --index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION}
# Install apex with CUDA and C++ extensions https://github.com/NVIDIA/apex
# BUG FIX: the original ran `python3 setup.py develop`, which installs an
# egg-link pointing at the /tmp/apex checkout; the final cleanup layer wipes
# /tmp/*, leaving the apex install dangling at runtime. A regular
# (non-editable) pip install -- the exact command the upstream README
# recommends for pip >= 23.1, which we have after the pip upgrade above --
# copies the built package into site-packages so it survives cleanup.
# # if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
# pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# # otherwise
# pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./
RUN \
git clone https://github.com/NVIDIA/apex.git /tmp/apex && \
cd /tmp/apex && \
git fetch --all --tags && \
git checkout tags/24.04.01 && \
git submodule update --init --recursive && \
python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# flash-attention https://github.com/Dao-AILab/flash-attention
# pip install flash-attn --no-build-isolation
# MAX_JOBS=4 pip install flash-attn --no-build-isolation
# Source build against the torch installed above. `setup.py install` copies
# the built package into site-packages, so purging /tmp later does not break
# it. NOTE(review): no tag is checked out, so the build is unpinned --
# consider `git checkout vX.Y.Z` for reproducibility.
RUN \
git clone https://github.com/Dao-AILab/flash-attention.git /tmp/flash-attention && \
cd /tmp/flash-attention && \
git submodule update --init --recursive && \
python3 setup.py install
# Post-install smoke tests / Hopper (FA3) build kept for reference:
# pytest -q -s tests/test_flash_attn.py
# cd hopper
# python3 setup.py install
# export PYTHONPATH=$PWD
# pytest -q -s test_flash_attn.py
# xformers https://github.com/facebookresearch/xformers
# Source build so the CUDA kernels match TORCH_CUDA_ARCH_LIST. pip builds a
# wheel from /tmp/xformers and installs it into site-packages, so the /tmp
# checkout is safe to delete later. NOTE(review): unpinned (default branch).
RUN \
git clone https://github.com/facebookresearch/xformers.git /tmp/xformers && \
cd /tmp/xformers && \
git submodule update --init --recursive && \
python3 -m pip install -v -U /tmp/xformers
# Diagnostic kept for reference:
# python3 -m xformers.info
# TransformerEngine https://github.com/NVIDIA/TransformerEngine
# Builds the upstream `stable` branch from source; `setup.py install` copies
# the built package into site-packages, so the /tmp checkout can be purged
# later without breaking the install.
RUN \
git clone --branch stable https://github.com/NVIDIA/TransformerEngine.git /tmp/TransformerEngine && \
cd /tmp/TransformerEngine && \
git submodule update --init --recursive && \
python3 setup.py install
# Core Python ML/tooling stack. --no-cache-dir keeps pip's download cache
# out of this layer (the later cache cleanup is a separate layer and cannot
# shrink the image). NOTE(review): versions are unpinned here, so rebuilds
# are not reproducible; the accompanying requirements.txt records one known-
# good resolution.
RUN \
python3 -m pip install --no-cache-dir deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub trl optimum tokenizers && \
python3 -m pip install --no-cache-dir packaging jinja2 triton networkx urllib3 certifi requests protobuf blobfile pytest && \
python3 -m pip install --no-cache-dir regex tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy pillow scipy && \
python3 -m pip install --no-cache-dir pyyaml ipython ipdb pydantic psutil yappi cffi py3nvml pyarrow graphviz astor boto3 msgpack ipykernel cython
# Monitoring/tooling extras. mpi4py compiles against the Open MPI built
# above (mpicc must be on PATH). --no-cache-dir added for layer size.
RUN \
python3 -m pip install --no-cache-dir zstandard nvitop pycocotools tensorboard tensor_parallel && \
# # https://github.com/mpi4py/mpi4py/issues/335
# rm /opt/conda/envs/${CONDA_ENV_NAME}/compiler_compat/ld && \
python3 -m pip install --no-cache-dir mpi4py
# lm-eval https://github.com/EleutherAI/lm-evaluation-harness
# ENV ANTLR_VERSION=4.13.2
# wget -q -O /root/antlr-${ANTLR_VERSION}-complete.jar https://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar
# BUG FIX: the original chain contained a bare `&& \` with no command in
# between ("...punkt_tab && \" followed by "&& \"), which is a shell syntax
# error and fails the build. Also, the HF login is now skipped when no token
# is supplied so the image still builds without one. The harness is cloned
# into /root (not /tmp) because it is installed editable (-e) and must
# survive the /tmp cleanup.
RUN \
python3 -m pip install --no-cache-dir immutabledict langdetect && \
python3 -m nltk.downloader popular punkt punkt_tab && \
python3 -m pip install --no-cache-dir antlr4-python3-runtime==4.11 && \
if [ -n "${HF_TOKEN:-}" ]; then huggingface-cli login --token ${HF_TOKEN}; fi && \
git clone https://github.com/EleutherAI/lm-evaluation-harness.git /root/lm-evaluation-harness && \
cd /root/lm-evaluation-harness && \
python3 -m pip install --no-cache-dir -e ".[dev]"
# Megatron-LM https://github.com/NVIDIA/Megatron-LM
# BUG FIX: the original second line was `/root/Megatron-LM && \` (missing
# `cd`), which attempts to execute the directory itself and fails the build.
# Cloned into /root (not /tmp) because the editable (-e) install must survive
# the /tmp cleanup; pinned to the core_r0.5.0 release branch.
RUN \
git clone https://github.com/NVIDIA/Megatron-LM.git /root/Megatron-LM && \
cd /root/Megatron-LM && \
git checkout core_r0.5.0 && \
pip install --no-use-pep517 -e .
# SSH config
# Runs an in-container sshd on port 22222 with root password + key login and
# a self-trusting keypair so multi-node launchers (deepspeed/pdsh/mpirun) can
# reach every container without prompts.
# SECURITY NOTE(review): root:root with PermitRootLogin yes is only tolerable
# on an isolated cluster network -- never expose this port publicly, and the
# generated host keypair is baked into the image (shared by all containers).
# BUG FIX: `mkdir -p /root/.ssh` added before ssh-keygen -- ssh-keygen does
# not create the target directory and fails on a fresh image where
# /root/.ssh does not yet exist.
RUN \
echo 'root:root' | chpasswd && \
# Keep a pristine copy of the config before editing (debug aid).
cp /etc/ssh/sshd_config /tmp/sshd_config && \
echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config && \
sed -i "s/#Port 22/Port 22222/" /etc/ssh/sshd_config && \
sed -i "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
sed -i "s/#PasswordAuthentication yes/PasswordAuthentication yes/" /etc/ssh/sshd_config && \
sed -i "s/#PubkeyAuthentication yes/PubkeyAuthentication yes/" /etc/ssh/sshd_config && \
sed -i "s/UsePAM yes/UsePAM no/" /etc/ssh/sshd_config && \
sed -i "s/#StrictModes yes/StrictModes no/" /etc/ssh/sshd_config && \
# Client side: skip host-key prompts between cluster nodes.
sed -i "s/# StrictHostKeyChecking ask/ StrictHostKeyChecking no/" /etc/ssh/ssh_config && \
chown root:root /etc/ssh/sshd_config && \
# sshd refuses to start without its privilege-separation directory.
mkdir -p /run/sshd && chmod 0755 /run/sshd && \
mkdir -p /root/.ssh && chmod 0700 /root/.ssh && \
ssh-keygen -t rsa -f /root/.ssh/id_rsa -q -N "" && \
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
# ENV config
# Writes ~/.deepspeed_env (mirrored by ldh/.deepspeed_env in this commit) so
# DeepSpeed's multi-node launcher propagates these variables to remote ranks,
# and prepends matching `export` lines to ~/.bashrc for interactive SSH
# sessions (the `echo ... | cat - ~/.bashrc` idiom inserts at the top).
RUN \
unset https_proxy http_proxy && \
echo "CUDA_HOME=${CUDA_HOME}" > ~/.deepspeed_env && \
echo "CUTLASS_PATH=${CUTLASS_PATH}" >> ~/.deepspeed_env && \
echo "TORCH_CUDA_ARCH_LIST=\"${TORCH_CUDA_ARCH_LIST}\"" >> ~/.deepspeed_env && \
echo "PATH=${PATH}" >> ~/.deepspeed_env && \
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ~/.deepspeed_env && \
echo "LIBRARY_PATH=${LIBRARY_PATH}" >> ~/.deepspeed_env && \
echo "export CUDA_HOME=${CUDA_HOME}" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \
echo "export CUTLASS_PATH=${CUTLASS_PATH}" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \
echo "export TORCH_CUDA_ARCH_LIST=\"${TORCH_CUDA_ARCH_LIST}\"" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \
echo "export PATH=$PATH" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \
echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \
echo "export LIBRARY_PATH=$LIBRARY_PATH" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc
# clean
# NOTE(review): this RUN creates its own layer, so it only hides files from
# the final filesystem view -- it cannot shrink layers created by earlier
# instructions (deleting in the same RUN that created the files, as the
# Open MPI step does, is what actually saves space). It is still useful for
# /tmp and apt state left by the last few steps.
RUN \
cd ~ && \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
rm -rf /tmp/* && \
rm -rf /var/tmp/* && \
rm -rf /root/.cache/pip

40
ldh/compose.yml Normal file
View File

@@ -0,0 +1,40 @@
# Compose service wrapping the image built from ./Dockerfile.
# FIX: indentation was lost in this dump, which makes the YAML invalid;
# restored per the Compose schema with the original keys/values unchanged.
services:
  ldh-deepspeed-test:
    build:
      context: .
      dockerfile: Dockerfile
      # args:
      #   HTTP_PROXY: "http://127.0.0.1:15777"
      #   HTTPS_PROXY: "http://127.0.0.1:15777"
      # cache-from: "type=local"
    image: ldh/deepspeed:test
    container_name: ldh-deepspeed-test
    # Large shared memory segment for NCCL and DataLoader workers.
    shm_size: '1024gb'
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # runtime: nvidia
    # ipc: host
    # Shares the host PID namespace.
    pid: host
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    # stdin_open: true
    # tty: true
    # privileged: true
    # NOTE(review): cap_add ALL is effectively a privileged container --
    # prefer granting only the capabilities actually needed
    # (e.g. SYS_PTRACE, IPC_LOCK for RDMA).
    cap_add:
      - ALL
    volumes:
      - /mnt/beegfs:/root/shared/beegfs
      - /mnt/yrfs:/root/shared/yrfs
    # ports:
    #   - "22242:22242"
    #   - "5000:5000"
    network_mode: host
    # sshd in the foreground as PID 1 (the Dockerfile configures port 22222).
    command: ["/usr/sbin/sshd", "-D"]
    # command: ["/bin/bash", "-c", "while true; do sleep 1000; done"]

238
ldh/requirements.txt Normal file
View File

@@ -0,0 +1,238 @@
absl-py==2.1.0
accelerate==0.33.0
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
annotated-types==0.7.0
antlr4-python3-runtime==4.11.0
apex @ file:///tmp/apex
astor==0.8.1
asttokens==2.4.1
async-timeout==4.0.3
attrs==24.2.0
blinker==1.4
blis==0.7.11
blobfile==2.1.1
boto3==1.35.5
botocore==1.35.5
cachetools==5.5.0
catalogue==2.0.10
certifi==2022.12.7
cffi==1.17.0
cfgv==3.4.0
chardet==5.2.0
charset-normalizer==2.1.1
click==8.0.3
cloudpathlib==0.18.1
colorama==0.4.4
coloredlogs==15.0.1
comm==0.2.2
confection==0.1.5
contourpy==1.2.1
coverage==7.6.1
cryptography==3.4.8
cycler==0.12.1
cymem==2.0.8
Cython==3.0.11
DataProperty==1.0.1
datasets==2.21.0
dbus-python==1.2.18
debugpy==1.8.5
decorator==5.1.1
deepspeed==0.15.0
diffusers==0.30.1
dill==0.3.8
distlib==0.3.8
distro==1.7.0
docstring_parser==0.16
einops==0.8.0
evaluate==0.4.2
exceptiongroup==1.2.2
execnet==2.1.1
executing==2.0.1
filelock==3.13.1
flash-attn==2.6.3
fonttools==4.53.1
frozenlist==1.4.1
fsspec==2024.2.0
graphviz==0.20.3
grpcio==1.66.0
hjson==3.1.0
httplib2==0.20.2
huggingface-hub==0.24.6
humanfriendly==10.0
identify==2.6.0
idna==3.4
immutabledict==4.2.0
importlib-metadata==4.6.4
iniconfig==2.0.0
iotop==0.6
ipdb==0.13.13
ipykernel==6.29.5
ipython==8.26.0
jedi==0.19.1
jeepney==0.7.1
Jinja2==3.1.3
jmespath==1.0.1
joblib==1.4.2
jsonlines==4.0.0
jupyter_client==8.6.2
jupyter_core==5.7.2
keyring==23.5.0
kiwisolver==1.4.5
langcodes==3.4.0
langdetect==1.0.9
language_data==1.2.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
-e git+https://github.com/EleutherAI/lm-evaluation-harness.git@aab42ba836b4af28cc1c5c1e697ea334c6ea7ced#egg=lm_eval
lxml==4.9.4
marisa-trie==1.2.0
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.2
matplotlib-inline==0.1.7
mbstrdecoder==1.1.3
mdurl==0.1.2
meson==0.61.2
more-itertools==8.10.0
mpi4py==4.0.0
mpmath==1.3.0
msgpack==1.0.8
multidict==6.0.5
multiprocess==0.70.16
murmurhash==1.0.10
mypy==1.11.1
mypy-extensions==1.0.0
nest-asyncio==1.6.0
networkx==3.2.1
ninja==1.11.1.1
nltk==3.9.1
nodeenv==1.9.1
numexpr==2.10.1
numpy==1.26.3
nvidia-cublas-cu12==12.4.2.65
nvidia-cuda-cupti-cu12==12.4.99
nvidia-cuda-nvrtc-cu12==12.4.99
nvidia-cuda-runtime-cu12==12.4.99
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.0.44
nvidia-curand-cu12==10.3.5.119
nvidia-cusolver-cu12==11.6.0.99
nvidia-cusparse-cu12==12.3.0.142
nvidia-ml-py==12.535.161
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.4.99
nvidia-nvtx-cu12==12.4.99
nvitop==1.3.2
oauthlib==3.2.0
optimum==1.21.4
packaging==22.0
pandas==2.2.2
parso==0.8.4
pathvalidate==3.2.1
peft==0.12.0
pexpect==4.9.0
pillow==10.2.0
platformdirs==4.2.2
pluggy==1.5.0
portalocker==2.10.1
pre-commit==3.8.0
preshed==3.0.9
prompt_toolkit==3.0.47
protobuf==5.27.3
psutil==6.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
py-cpuinfo==9.0.0
py3nvml==0.2.7
pyarrow==17.0.0
pybind11==2.13.5
pycocotools==2.0.8
pycparser==2.22
pycryptodomex==3.20.0
pydantic==2.8.2
pydantic_core==2.20.1
pyelftools==0.27
Pygments==2.18.0
PyGObject==3.42.1
PyJWT==2.3.0
pyparsing==2.4.7
pytablewriter==1.2.0
pytest==8.3.2
pytest-cov==5.0.0
pytest-xdist==3.6.1
python-apt==2.4.0+ubuntu3
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==5.4.1
pyzmq==26.2.0
regex==2024.7.24
requests==2.32.3
rich==13.7.1
rouge-score==0.1.2
s3transfer==0.10.2
sacrebleu==2.4.3
safetensors==0.4.4
scikit-learn==1.5.1
scipy==1.14.1
seaborn==0.13.2
SecretStorage==3.3.1
sentencepiece==0.2.0
shellingham==1.5.4
shtab==1.7.1
six==1.16.0
smart-open==7.0.4
spacy==3.7.6
spacy-legacy==3.0.12
spacy-loggers==1.0.5
sqlitedict==2.1.0
srsly==2.4.8
stack-data==0.6.3
sympy==1.12
tabledata==1.3.3
tabulate==0.9.0
tcolorpy==0.1.6
tensor-parallel==2.0.0
tensorboard==2.17.1
tensorboard-data-server==0.7.2
termcolor==2.4.0
thinc==8.2.5
threadpoolctl==3.5.0
tiktoken==0.7.0
timm==1.0.9
tokenizers==0.19.1
tomli==2.0.1
torch==2.4.0+cu124
torchaudio==2.4.0+cu124
torchvision==0.19.0+cu124
tornado==6.4.1
tqdm==4.66.5
tqdm-multiprocess==0.0.11
traitlets==5.14.3
transformers==4.43.4
triton==3.0.0
trl==0.9.6
typepy==1.3.2
typer==0.12.4
typing_extensions==4.9.0
tyro==0.8.8
tzdata==2024.1
urllib3==1.26.13
virtualenv==20.26.3
wadllib==1.3.6
wasabi==1.1.3
wcwidth==0.2.13
weasel==0.4.1
Werkzeug==3.0.4
word2number==1.1
wrapt==1.16.0
xformers @ file:///tmp/xformers
xmltodict==0.13.0
xxhash==3.5.0
yappi==1.6.0
yarl==1.9.4
zipp==1.0.0
zstandard==0.23.0