commit 873429d4e66cfc2d390422048a5dc7869e0431da Author: lingyuzeng Date: Wed Aug 28 15:18:15 2024 +0800 first add diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f5ddb54 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +*.tar +build_d/ +*_src +evo_src/ +megaDNA_src/ +evo/huggingface/ +*.zip +finetune/binbbt/ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..eab1248 --- /dev/null +++ b/README.md @@ -0,0 +1,16 @@ + + +## 预训练 + +GitHub - huggingface/transformers: 🤗 Transformers: State-of-the-art Machine Learning for Pytorch, TensorFlow, and JAX. +GitHub - microsoft/DeepSpeed: DeepSpeed is a deep learning optimization library that makes distributed training and inference easy, efficient, and effective. +GitHub - huggingface/peft: 🤗 PEFT: State-of-the-art Parameter-Efficient Fine-Tuning. +GitHub - huggingface/accelerate: 🚀 A simple way to launch, train, and use PyTorch models on almost any device and distributed configuration, automatic mixed precision (including fp8), and easy-to-configure FSDP and DeepSpeed support + +```shell +# torch +https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile +https://github.com/huggingface/transformers/blob/main/docker/transformers-all-latest-gpu/Dockerfile +https://github.com/huggingface/peft/tree/main/docker/peft-gpu-bnb-source +https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +``` \ No newline at end of file diff --git a/bgpt/Dockerfile.bgpt b/bgpt/Dockerfile.bgpt new file mode 100644 index 0000000..e32f214 --- /dev/null +++ b/bgpt/Dockerfile.bgpt @@ -0,0 +1,76 @@ +# syntax=docker/dockerfile:1 +FROM nvidia/cuda:11.6.1-devel-ubuntu20.04 +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ENV MAMBA_ROOT_PREFIX=~/micromamba +WORKDIR /root +SHELL ["/bin/bash", "-c"] +COPY requirements.txt /root/ +RUN <> /etc/ssh/sshd_config +echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config +echo 
"PubkeyAuthentication yes" >> /etc/ssh/sshd_config +echo "Port 22" >> /etc/ssh/sshd_config +mkdir /var/run/sshd +echo 'root:cdcdocker' | chpasswd +mkdir -p ~/.pip +# install miniconda +wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh +bash /tmp/miniconda.sh -b -p /opt/conda +rm /tmp/miniconda.sh +conda init bash +ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh +echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc +echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# 安装 micromamba 并配置 mambarc +echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) +micromamba shell init -s bash -p ~/micromamba +cat <<'EOF' >> ~/.bashrc +source ~/micromamba/etc/profile.d/micromamba.sh +alias mamba=micromamba +alias mba=mamba +EOF +# 配置 .mambarc 文件 +cat < ~/.mambarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia +EOF +mkdir -p ~/.pip +echo " +[global] +index-url = https://mirrors.aliyun.com/pypi/simple/ + +[install] +trusted-host=mirrors.aliyun.com +" >> ~/.pip/pip.conf +micromamba create -n bgpt -c conda-forge python=3.7.9 -y +micromamba run -n bgpt pip install -r requirements.txt +micromamba run -n bgpt pip install ipykernel attrs seaborn +micromamba run -n bgpt python -m ipykernel install --user --name="bgpt" --display-name="bgpt_env" +micromamba run -n bgpt pip install seaborn attrs torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 +echo "micromamba activate bgpt" >> ~/.bashrc +EOT + +# Expose SSH port +EXPOSE 3222 + +# Keep the container running +CMD ["/usr/sbin/sshd", "-D"] diff --git a/bgpt/docker-compose-bgpt.yml b/bgpt/docker-compose-bgpt.yml new file mode 100644 index 0000000..2599b51 --- 
/dev/null +++ b/bgpt/docker-compose-bgpt.yml @@ -0,0 +1,32 @@ +version: '3.8' + +services: + ubuntu-ssh: + build: + context: . + dockerfile: Dockerfile.bgpt + volumes: + - /data:/data + container_name: ubuntu-ssh + pull_policy: if_not_present + tty: true + restart: unless-stopped + image: zly/cuda-bgpt:latest + ports: + - 3222:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + networks: + - network_bgpt + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +networks: + network_bgpt: + name: network_bgpt diff --git a/bgpt/requirements.txt b/bgpt/requirements.txt new file mode 100644 index 0000000..2b819fa Binary files /dev/null and b/bgpt/requirements.txt differ diff --git a/dcgm-exporter/Dockerfile.grafana b/dcgm-exporter/Dockerfile.grafana new file mode 100644 index 0000000..c908baf --- /dev/null +++ b/dcgm-exporter/Dockerfile.grafana @@ -0,0 +1,30 @@ +# syntax=docker/dockerfile:1 + +ARG GRAFANA_VERSION="9.5.2" + +FROM ubuntu:22.04 + +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ARG HTTP_PROXY +ARG HTTPS_PROXY +ENV http_proxy=${HTTP_PROXY} +ENV https_proxy=${HTTPS_PROXY} + +# 替换 sources.list 文件以使用阿里云镜像源 +# RUN sed -i 's|http://archive.ubuntu.com/ubuntu/|http://mirrors.aliyun.com/ubuntu/|g' /etc/apt/sources.list && \ +# sed -i 's|http://security.ubuntu.com/ubuntu|http://mirrors.aliyun.com/ubuntu|g' /etc/apt/sources.list + +# 安装必要的工具和库 +RUN apt-get update && \ + apt-get install -y wget vim bash ca-certificates + +RUN wget https://dl.grafana.com/oss/release/grafana_${GRAFANA_VERSION}_amd64.deb && \ + dpkg -i grafana_${GRAFANA_VERSION}_amd64.deb && \ + rm grafana_${GRAFANA_VERSION}_amd64.deb + +COPY grafana.ini /etc/grafana/grafana.ini + +EXPOSE 3000 + +CMD ["/usr/sbin/grafana-server", "--config=/etc/grafana/grafana.ini"] diff --git a/dcgm-exporter/Dockerfile.prometheus b/dcgm-exporter/Dockerfile.prometheus new file mode 100644 index 
0000000..44a7a70 --- /dev/null +++ b/dcgm-exporter/Dockerfile.prometheus @@ -0,0 +1,32 @@ +# syntax=docker/dockerfile:1 + +ARG PROMETHEUS_VERSION="2.45.6" + +FROM ubuntu:22.04 + +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ARG HTTP_PROXY +ARG HTTPS_PROXY +ENV http_proxy=${HTTP_PROXY} +ENV https_proxy=${HTTPS_PROXY} + +# 替换 sources.list 文件以使用阿里云镜像源 +# RUN sed -i 's|http://archive.ubuntu.com/ubuntu/|http://mirrors.aliyun.com/ubuntu/|g' /etc/apt/sources.list && \ +# sed -i 's|http://security.ubuntu.com/ubuntu|http://mirrors.aliyun.com/ubuntu|g' /etc/apt/sources.list + + +# 安装必要的工具和库 +RUN apt-get update && \ + apt-get install -y wget vim bash ca-certificates + +RUN wget https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz && \ + tar xvfz prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz && \ + mv prometheus-${PROMETHEUS_VERSION}.linux-amd64 /opt/prometheus && \ + rm prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz + +COPY prometheus.yml /opt/prometheus/prometheus.yml + +EXPOSE 9090 + +CMD ["/opt/prometheus/prometheus", "--config.file=/opt/prometheus/prometheus.yml"] diff --git a/dcgm-exporter/README.md b/dcgm-exporter/README.md new file mode 100644 index 0000000..166e59e --- /dev/null +++ b/dcgm-exporter/README.md @@ -0,0 +1,7 @@ + +构建和运行 +使用 Docker Compose 构建和运行容器: + +```shell +docker-compose up --build -d +``` \ No newline at end of file diff --git a/dcgm-exporter/docker-compose.yml b/dcgm-exporter/docker-compose.yml new file mode 100644 index 0000000..b994086 --- /dev/null +++ b/dcgm-exporter/docker-compose.yml @@ -0,0 +1,34 @@ +version: '3.8' + +services: + prometheus: + build: + context: . 
+ dockerfile: Dockerfile.prometheus + args: + PROMETHEUS_VERSION: "2.45.6" + HTTP_PROXY: "http://localhost:15777" + HTTPS_PROXY: "http://localhost:15777" + image: zly/prometheus:latest + container_name: prometheus + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/opt/prometheus/prometheus.yml + restart: unless-stopped + + grafana: + build: + context: . + dockerfile: Dockerfile.grafana + args: + GRAFANA_VERSION: "9.5.2" + HTTP_PROXY: "http://localhost:15777" + HTTPS_PROXY: "http://localhost:15777" + image: zly/grafana:latest + container_name: grafana + ports: + - "3000:3000" + volumes: + - ./grafana.ini:/etc/grafana/grafana.ini + restart: unless-stopped diff --git a/dcgm-exporter/grafana.ini b/dcgm-exporter/grafana.ini new file mode 100644 index 0000000..ce0b534 --- /dev/null +++ b/dcgm-exporter/grafana.ini @@ -0,0 +1,6 @@ +[server] +http_port = 3000 + +[security] +admin_user = admin +admin_password = grafana diff --git a/dcgm-exporter/prometheus.yml b/dcgm-exporter/prometheus.yml new file mode 100644 index 0000000..ab388ea --- /dev/null +++ b/dcgm-exporter/prometheus.yml @@ -0,0 +1,11 @@ +global: + scrape_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'dcgm-exporter' + static_configs: + - targets: ['127.0.0.1:9400'] diff --git a/evo/Dockerfile b/evo/Dockerfile new file mode 100644 index 0000000..4591208 --- /dev/null +++ b/evo/Dockerfile @@ -0,0 +1,139 @@ +ARG CUDA_VERSION=12.1.0 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ENV MAMBA_ROOT_PREFIX=~/micromamba +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +ENV PATH /opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH +WORKDIR /root +SHELL ["/bin/bash", "-c"] +# base tools +RUN <> ~/.bashrc 
+echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# 安装 micromamba +echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) +micromamba shell init -s bash -p ~/micromamba +cat <<'EOF' >> ~/.bashrc +source ~/micromamba/etc/profile.d/micromamba.sh +alias mamba=micromamba +alias mba=mamba +EOF +# 配置 .mambarc 文件 +cat < ~/.mambarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ENV REF='main' +ENV STAGE_DIR=/tmp +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.0 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.0 +ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.0 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 +ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +RUN <= 23.0. 
+# +# For reference: +# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel +ARG CUDA_VERSION=12.1.0 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04 +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ENV MAMBA_ROOT_PREFIX=~/micromamba +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +WORKDIR /root +SHELL ["/bin/bash", "-c"] +# base tools +RUN <> ~/.bashrc +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# 安装 micromamba +# echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) +# micromamba shell init -s bash -p ~/micromamba +# cat <<'EOF' >> ~/.bashrc +# source ~/micromamba/etc/profile.d/micromamba.sh +# EOF +# # 配置 .mambarc 文件 +# cat < ~/.mambarc +# channels: +# - conda-forge +# - bioconda +# - pytorch +# - pytorch-nightly +# - nvidia +# - defaults +# show_channel_urls: true +# EOF +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH +ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3" +ENV REF='main' +ENV STAGE_DIR=/tmp +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.1 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.1 +ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.1 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 
+ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +ARG USE_CUDA=1 +ENV USE_CUDA=${USE_CUDA} +ARG USE_ROCM=0 +ENV USE_ROCM=${USE_ROCM} +ARG USE_XPU=0 +ENV USE_XPU=${USE_XPU} +ARG _GLIBCXX_USE_CXX11_ABI=1 +ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI} +RUN <> ~/.bashrc +which python > ~/python_path.txt +conda activate ${CONDA_ENV_NAME} +# 克隆 ninja 源码并编译 +git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja +cd ${STAGE_DIR}/ninja +# 克隆 GoogleTest 源码 +git clone https://github.com/google/googletest.git +python ./configure.py --bootstrap +# 配置并构建 Ninja 测试,添加 pthread 链接选项 +# CXXFLAGS="-pthread" LDFLAGS="-pthread" ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest +conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest" +./ninja all +# 运行 Ninja 单元测试 +./ninja_test +python3 -m pip install --no-cache-dir --upgrade pip +python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python +conda clean -afy +git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. +python -m pip install setuptools==${SETUPTOOLS_VERSION} +python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) 
+# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) +python3 -m pip uninstall -y torch torchvision torchaudio +# # install pytorch create conda env aleay exists +python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA} +python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +python3 -m pip uninstall -y transformer-engine +python3 -m pip uninstall -y torch-tensorrt +python3 -m pip uninstall -y apex +EOT + +# install apex +RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... +MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ +python -c "import apex.amp; print('Apex is installed and the amp module is available.')" +cd .. +rm -rf ${STAGE_DIR}/apex +EOT + +RUN <&1 +# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail +# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile +# install deepspeed prepare +# install Mellanox OFED +mkdir -p ${STAGE_DIR} +wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - +cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 +./mlnxofedinstall --user-space-only --without-fw-update --all -q +cd ${STAGE_DIR} +rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* +cd .. 
+# install nv_peer_mem +rm -rf ${STAGE_DIR} +mkdir -p ${STAGE_DIR} +git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory +cd ${STAGE_DIR}/nv_peer_memory +./build_module.sh +cd ${STAGE_DIR} +tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz +cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} +apt-get update +apt-get install -y dkms +dpkg-buildpackage -us -uc +dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb +EOT + +# install mpi +ENV PATH=/usr/local/mpi/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +RUN < /usr/local/mpi/bin/mpirun +echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun +chmod a+x /usr/local/mpi/bin/mpirun +EOT + +# SSH daemon port inside container cannot conflict with host OS port +ENV SSH_PORT=2222 +RUN < ${STAGE_DIR}/sshd_config && \ +sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config +EOT + +# 29.78 Usage: install.sh [options...] +# 29.78 +# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in +# 29.78 hostfile (hostfile: /job/hostfile). 
If no hostfile exists, will only install locally +# 29.78 +# 29.78 [optional] +# 29.78 -l, --local_only Install only on local machine +# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) +# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) +# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels +# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) +# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) +# 29.78 -e, --examples Checkout deepspeed example submodule (no install) +# 29.78 -v, --verbose Verbose logging +# 29.78 -h, --help This help text + +RUN <> /etc/sudoers +EOT + +# install cutlass https://github.com/NVIDIA/cutlass +# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) +# A100: architecture is Ampere +# V100: architecture is Volta +# T4: architecture is Turing +# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc +# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 +# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 +# 80:适用于 NVIDIA Ampere 架构(如 A100)。 +# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 +# 89:GeForce RTX 4090 +ARG DCUTLASS_NVCC_ARCHS="80;89;90a" +ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} +RUN < install_modified.sh +chmod +x ./install_modified.sh +# 检查 HOSTFILE_CONTENT 并写入文件 +if [ -n "${HOSTFILE_CONTENT}" ]; then + echo "${HOSTFILE_CONTENT}" > /tmp/hostfile + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile" +else + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}" +fi +eval $INSTALL_CMD +# compile deepspeed ops +cat <<'EOF' >> ~/.bashrc +source ~/micromamba/etc/profile.d/micromamba.sh +echo "alias mamba=micromamba" >> ~/.bashrc +echo "alias mba=mamba" >> ~/.bashrc +EOF +# 配置 .mambarc 文件 +cat < ~/compile_deepspeed_ops.py +import deepspeed + +def compile_ops(): + builders = [ + 
deepspeed.ops.op_builder.AsyncIOBuilder, + deepspeed.ops.op_builder.FusedAdamBuilder, + deepspeed.ops.op_builder.CPUAdamBuilder, + deepspeed.ops.op_builder.CPUAdagradBuilder, + deepspeed.ops.op_builder.CPULionBuilder, + deepspeed.ops.op_builder.EvoformerAttnBuilder, + deepspeed.ops.op_builder.FPQuantizerBuilder, + deepspeed.ops.op_builder.FusedLambBuilder, + deepspeed.ops.op_builder.FusedLionBuilder, + deepspeed.ops.op_builder.QuantizerBuilder, + deepspeed.ops.op_builder.RaggedOpsBuilder, + deepspeed.ops.op_builder.RandomLTDBuilder, + deepspeed.ops.op_builder.SparseAttnBuilder, + deepspeed.ops.op_builder.SpatialInferenceBuilder, + deepspeed.ops.op_builder.TransformerBuilder, + deepspeed.ops.op_builder.StochasticTransformerBuilder, + ] + + for builder in builders: + print(f"Compiling {builder.__name__}") + builder().load() + +if __name__ == "__main__": + compile_ops() +EOF +python compile_deepspeed_ops.py +ds_report +# clean up +# rm -f deepspeed/git_version_info_installed.py +# rm -rf dist build deepspeed.egg-info +# python setup.py bdist_wheel +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt +# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0 +EOT + +# install transformers and flash-attn +RUN <= 23.0. 
+# +# For reference: +# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel +ARG CUDA_VERSION=12.1.0 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ENV MAMBA_ROOT_PREFIX=~/micromamba +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +WORKDIR /root +SHELL ["/bin/bash", "-c"] +# base tools +RUN <> ~/.bashrc +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# 安装 micromamba +# echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) +# micromamba shell init -s bash -p ~/micromamba +# cat <<'EOF' >> ~/.bashrc +# source ~/micromamba/etc/profile.d/micromamba.sh +# EOF +# # 配置 .mambarc 文件 +# cat < ~/.mambarc +# channels: +# - conda-forge +# - bioconda +# - pytorch +# - pytorch-nightly +# - nvidia +# - defaults +# show_channel_urls: true +# EOF +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH +ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3" +ENV REF='main' +ENV STAGE_DIR=/tmp +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.1 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.1 +ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.1 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 
+ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +ARG USE_CUDA=1 +ENV USE_CUDA=${USE_CUDA} +ARG USE_ROCM=0 +ENV USE_ROCM=${USE_ROCM} +ARG USE_XPU=0 +ENV USE_XPU=${USE_XPU} +ARG _GLIBCXX_USE_CXX11_ABI=1 +ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI} +RUN <> ~/.bashrc +which python > ~/python_path.txt +conda activate ${CONDA_ENV_NAME} +# 克隆 ninja 源码并编译 +git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja +cd ${STAGE_DIR}/ninja +# 克隆 GoogleTest 源码 +git clone https://github.com/google/googletest.git +python ./configure.py --bootstrap +# 配置并构建 Ninja 测试,添加 pthread 链接选项 +# CXXFLAGS="-pthread" LDFLAGS="-pthread" ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest +conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest" +./ninja all +# 运行 Ninja 单元测试 +./ninja_test +python3 -m pip install --no-cache-dir --upgrade pip +python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python +conda clean -afy +git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. +python -m pip install setuptools==${SETUPTOOLS_VERSION} +python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) 
+# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) +python3 -m pip uninstall -y torch torchvision torchaudio +# # install pytorch create conda env aleay exists +python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA} +python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +python3 -m pip uninstall -y transformer-engine +python3 -m pip uninstall -y torch-tensorrt +python3 -m pip uninstall -y apex +EOT + +# install apex +RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... +MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ +python -c "import apex.amp; print('Apex is installed and the amp module is available.')" +cd .. +rm -rf ${STAGE_DIR}/apex +EOT + +RUN <&1 +# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail +# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile +# install deepspeed prepare +# install Mellanox OFED +mkdir -p ${STAGE_DIR} +wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - +cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 +./mlnxofedinstall --user-space-only --without-fw-update --all -q +cd ${STAGE_DIR} +rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* +cd .. 
+# install nv_peer_mem +rm -rf ${STAGE_DIR} +mkdir -p ${STAGE_DIR} +git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory +cd ${STAGE_DIR}/nv_peer_memory +./build_module.sh +cd ${STAGE_DIR} +tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz +cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} +apt-get update +apt-get install -y dkms +dpkg-buildpackage -us -uc +dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb +EOT + +# install mpi +ENV PATH=/usr/local/mpi/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +RUN < /usr/local/mpi/bin/mpirun +echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun +chmod a+x /usr/local/mpi/bin/mpirun +EOT + +# SSH daemon port inside container cannot conflict with host OS port +ENV SSH_PORT=2222 +RUN < ${STAGE_DIR}/sshd_config && \ +sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config +EOT + +# 29.78 Usage: install.sh [options...] +# 29.78 +# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in +# 29.78 hostfile (hostfile: /job/hostfile). 
If no hostfile exists, will only install locally +# 29.78 +# 29.78 [optional] +# 29.78 -l, --local_only Install only on local machine +# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) +# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) +# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels +# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) +# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) +# 29.78 -e, --examples Checkout deepspeed example submodule (no install) +# 29.78 -v, --verbose Verbose logging +# 29.78 -h, --help This help text + +RUN <> /etc/sudoers +EOT + +# install cutlass https://github.com/NVIDIA/cutlass +# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) +# A100: architecture is Ampere +# V100: architecture is Volta +# T4: architecture is Turing +# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc +# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 +# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 +# 80:适用于 NVIDIA Ampere 架构(如 A100)。 +# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 +# 89:GeForce RTX 4090 +ARG DCUTLASS_NVCC_ARCHS="80;89;90a" +ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} +RUN < install_modified.sh +chmod +x ./install_modified.sh +# 检查 HOSTFILE_CONTENT 并写入文件 +if [ -n "${HOSTFILE_CONTENT}" ]; then + echo "${HOSTFILE_CONTENT}" > /tmp/hostfile + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile" +else + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}" +fi +eval $INSTALL_CMD +# compile deepspeed ops +cat <<'EOF' >> ~/.bashrc +source ~/micromamba/etc/profile.d/micromamba.sh +echo "alias mamba=micromamba" >> ~/.bashrc +echo "alias mba=mamba" >> ~/.bashrc +EOF +# 配置 .mambarc 文件 +cat < ~/compile_deepspeed_ops.py +import deepspeed + +def compile_ops(): + builders = [ + 
deepspeed.ops.op_builder.AsyncIOBuilder, + deepspeed.ops.op_builder.FusedAdamBuilder, + deepspeed.ops.op_builder.CPUAdamBuilder, + deepspeed.ops.op_builder.CPUAdagradBuilder, + deepspeed.ops.op_builder.CPULionBuilder, + deepspeed.ops.op_builder.EvoformerAttnBuilder, + deepspeed.ops.op_builder.FPQuantizerBuilder, + deepspeed.ops.op_builder.FusedLambBuilder, + deepspeed.ops.op_builder.FusedLionBuilder, + deepspeed.ops.op_builder.QuantizerBuilder, + deepspeed.ops.op_builder.RaggedOpsBuilder, + deepspeed.ops.op_builder.RandomLTDBuilder, + deepspeed.ops.op_builder.SparseAttnBuilder, + deepspeed.ops.op_builder.SpatialInferenceBuilder, + deepspeed.ops.op_builder.TransformerBuilder, + deepspeed.ops.op_builder.StochasticTransformerBuilder, + ] + + for builder in builders: + print(f"Compiling {builder.__name__}") + builder().load() + +if __name__ == "__main__": + compile_ops() +EOF +python compile_deepspeed_ops.py +ds_report +# clean up +# rm -f deepspeed/git_version_info_installed.py +# rm -rf dist build deepspeed.egg-info +# python setup.py bdist_wheel +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt +# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0 +EOT + +# install transformers and flash-attn +RUN <> ~/.bashrc && \ + echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc && \ + /bin/bash -c "source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME}" + + +# install cutlass https://github.com/NVIDIA/cutlass +ENV DCUTLASS_NVCC_ARCHS="80;89;90;90a" +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + git clone https://github.com/NVIDIA/cutlass /opt/cutlass && \ + cd /opt/cutlass && \ + git checkout . && \ + git checkout main && \ + mkdir build && \ + cd build && \ + cmake .. 
-DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON && \ + make -j"$(nproc)" install + + +# Mellanox OFED +# ENV MLNX_OFED_VERSION=5.8-5.1.1.2 +ENV MLNX_OFED_VERSION=23.10-3.2.2.0 +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + apt-get install -y libnuma-dev libnvidia-compute-515 && \ + # apt-get install -y libnuma-dev libnvidia-compute-535 && \ + cd ${STAGE_DIR} && \ + wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - && \ + cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \ + ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \ + cd ${STAGE_DIR} && \ + rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64* + + +# nv_peer_mem +ENV NV_PEER_MEM_VERSION=1.2 +# ENV NV_PEER_MEM_VERSION=1.3 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + mkdir -p ${STAGE_DIR} && \ + git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \ + cd ${STAGE_DIR}/nv_peer_memory && \ + ./build_module.sh && \ + cd ${STAGE_DIR} && \ + tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \ + cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \ + apt-get update && \ + apt-get install -y dkms && \ + dpkg-buildpackage -us -uc && \ + dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb + + +# OPENMPI +# ENV OPENMPI_BASEVERSION=4.1 +# ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ENV OPENMPI_BASEVERSION=5.0 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.3 +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + cd ${STAGE_DIR} && \ + wget -q -O - 
https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \ + cd openmpi-${OPENMPI_VERSION} && \ + ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \ + make -j"$(nproc)" install && \ + ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \ + # Sanity check: + test -f /usr/local/mpi/bin/mpic++ && \ + cd ${STAGE_DIR} && \ + rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} +ENV PATH=/usr/local/mpi/bin:${PATH} \ + LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +# Create a wrapper for OpenMPI to allow running as root by default +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \ + echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \ + echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \ + chmod a+x /usr/local/mpi/bin/mpirun + + +ENV PYTORCH_VERSION=2.3.0 +ENV TORCHVISION_VERSION=0.18.0 +ENV TORCHAUDIO_VERSION=2.3.0 +ENV PYTORCH_CUDA_VERSION='cu121' + +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + pip install torch==${PYTORCH_VERSION}+${PYTORCH_CUDA_VERSION} torchvision==${TORCHVISION_VERSION}+${PYTORCH_CUDA_VERSION} torchaudio==${TORCHAUDIO_VERSION}+${PYTORCH_CUDA_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} && \ + pip install packaging && \ + pip install flash-attn + +# Install apex with CUDA and C++ extensions +# pip --version | grep -q "pip 23.1" && \ +# (pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./) || \ +# (pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./) && \ +RUN \ + source 
/opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + git clone https://github.com/NVIDIA/apex /tmp/apex && \ + cd /tmp/apex && \ + pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \ + python -c "import apex.amp; print('Apex is installed and the amp module is available.')" && \ + cd / && \ + rm -rf /tmp/apex + +# RUN \ +# source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ +# git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed && \ +# cd ${STAGE_DIR}/DeepSpeed && \ +# git checkout ${DEEPSPEED_VERSION} && \ +# sed 's/pip install/python -m pip install/' install.sh > install_modified.sh && \ +# chmod +x ./install_modified.sh && \ +# if [ -n "${HOSTFILE_CONTENT}" ]; then \ +# echo "${HOSTFILE_CONTENT}" > /tmp/hostfile && \ +# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"; \ +# else \ +# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"; \ +# fi && \ +# eval $INSTALL_CMD && \ +# ds_report + +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + pip install deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub && \ + pip install regex tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy Pillow blobfile requests scipy pycocotools protobuf pyyaml ipython ipdb psutil pydantic + + +RUN \ + echo 'root:root' | chpasswd && \ + cp /etc/ssh/sshd_config /tmp/sshd_config && \ + echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config && \ + sed -i "s/#Port 22/Port 22242/" /etc/ssh/sshd_config && \ + sed -i "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ + sed -i "s/#PasswordAuthentication yes/PasswordAuthentication yes/" /etc/ssh/sshd_config && \ + sed -i "s/#PubkeyAuthentication 
yes/PubkeyAuthentication yes/" /etc/ssh/sshd_config && \ + sed -i "s/UsePAM yes/UsePAM no/" /etc/ssh/sshd_config && \ + chown root:root /etc/ssh/sshd_config && \ + mkdir -p /run/sshd && chmod 0755 /run/sshd + +# RUN \ +# bash -c 'echo -e "export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"\nexport CUTLASS_PATH=${CUTLASS_PATH}\nexport CUDA_HOME=${CUDA_HOME}\nexport PATH=${PATH}\nexport LD_LIBRARY_PATH=${LD_LIBRARY_PATH}\n" | cat - ~/.bashrc > temp && mv temp ~/.bashrc' + diff --git a/finetune/Dockerfile.ngc b/finetune/Dockerfile.ngc new file mode 100644 index 0000000..bfd89cb --- /dev/null +++ b/finetune/Dockerfile.ngc @@ -0,0 +1,162 @@ +ARG REGISTRY=quay.io +ARG OWNER=jupyter +ARG LABEL=notebook +ARG VERSION +ARG BASE_CONTAINER=$REGISTRY/$OWNER/$LABEL:$VERSION +FROM $BASE_CONTAINER +ARG HTTP_PROXY +ARG HTTPS_PROXY +ENV http_proxy=${HTTP_PROXY} +ENV https_proxy=${HTTPS_PROXY} +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +WORKDIR /root +SHELL ["/bin/bash", "-c"] + +# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/ +ENV MLNX_OFED_VERSION=23.10-3.2.2.0 +ENV STAGE_DIR=/tmp +RUN < ${STAGE_DIR}/mlnxofedinstall.log 2>&1 +cd ${STAGE_DIR} +rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64* +EOT + +ARG NV_PEER_MEM_VERSION="1.2" +ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION} +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +RUN <=0.17.0 +python -m pip install --no-deps git+https://github.com/huggingface/optimum.git#egg=optimum[diffusers,quality] +python -m pip install evaluate datasets +EOT + +RUN < ~/.deepspeed_env +TORCH_USE_CUDA_DSA=1 +DEEPSPEED_VERBOSE=1 +DEEPSPEED_LOG_LEVEL=DEBUG +CUTLASS_PATH=${CUTLASS_PATH} +TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} +CUDA_HOME=${CUDA_HOME} +LD_LIBRARY_PATH=${LD_LIBRARY_PATH} +EOF +unset https_proxy http_proxy +EOT + +CMD ["/usr/sbin/sshd", "-D"] \ No newline at end of file diff --git 
a/finetune/Dockerfile.update b/finetune/Dockerfile.update new file mode 100644 index 0000000..9d5f049 --- /dev/null +++ b/finetune/Dockerfile.update @@ -0,0 +1,427 @@ + +# NOTE: Building this image require's docker version >= 23.0. +# +# For reference: +# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel +ARG CUDA_VERSION=12.1.0 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04 +ARG HTTP_PROXY +ARG HTTPS_PROXY +ENV http_proxy=${HTTP_PROXY} +ENV https_proxy=${HTTPS_PROXY} +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +WORKDIR /root +SHELL ["/bin/bash", "-c"] +# base tools +RUN <> ~/.bashrc +. /opt/conda/etc/profile.d/conda.sh +conda init bash +conda config --set show_channel_urls true +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# install pixi +curl -fsSL https://pixi.sh/install.sh | bash +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH +ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3" +ENV REF='main' +ENV STAGE_DIR=/tmp +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.1 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.1 +ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.1 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +ARG USE_CUDA=1 +ENV USE_CUDA=${USE_CUDA} +ARG USE_ROCM=0 +ENV USE_ROCM=${USE_ROCM} 
+ARG USE_XPU=0 +ENV USE_XPU=${USE_XPU} +ARG _GLIBCXX_USE_CXX11_ABI=1 +ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI} +RUN <> ~/.bashrc +conda activate ${CONDA_ENV_NAME} +python3 -m pip install --no-cache-dir --upgrade pip +python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python +conda clean -afy +git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. +python -m pip install setuptools==${SETUPTOOLS_VERSION} +python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) +# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) +python3 -m pip uninstall -y torch torchvision torchaudio +# # install pytorch create conda env aleay exists +# 直接将 PyTorch 安装指引 中的 https://download.pytorch.org/whl 替换为 https://mirror.sjtu.edu.cn/pytorch-wheels 即可。 +python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA} +python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +python3 -m pip uninstall -y transformer-engine +python3 -m pip uninstall -y torch-tensorrt +python3 -m pip uninstall -y apex +EOT + +# install apex TORCH_CUDA_ARCH_LIST all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0" +ARG TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" +ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} +RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... +MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ +python -c "import apex.amp; print('Apex is installed and the amp module is available.')" +cd .. 
+rm -rf ${STAGE_DIR}/apex +EOT + +# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/ +ENV MLNX_OFED_VERSION=23.10-3.2.2.0 +RUN <&1 +# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail +# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile +# install deepspeed prepare +# install Mellanox OFED +mkdir -p ${STAGE_DIR} +wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - +cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 +./mlnxofedinstall --user-space-only --without-fw-update --all -q +cd ${STAGE_DIR} +rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64* +EOT + +ARG NV_PEER_MEM_VERSION="1.2" +ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION} +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +RUN < /usr/local/mpi/bin/mpirun +echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun +chmod a+x /usr/local/mpi/bin/mpirun +EOT + +# SSH daemon port inside container cannot conflict with host OS port +ENV SSH_PORT=2222 +RUN < ${STAGE_DIR}/sshd_config && \ +sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config +EOT + +# 29.78 Usage: install.sh [options...] +# 29.78 +# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in +# 29.78 hostfile (hostfile: /job/hostfile). 
If no hostfile exists, will only install locally +# 29.78 +# 29.78 [optional] +# 29.78 -l, --local_only Install only on local machine +# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) +# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) +# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels +# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) +# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) +# 29.78 -e, --examples Checkout deepspeed example submodule (no install) +# 29.78 -v, --verbose Verbose logging +# 29.78 -h, --help This help text + +RUN <> /etc/sudoers +EOT + +# install cutlass https://github.com/NVIDIA/cutlass +# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) +# A100: architecture is Ampere +# V100: architecture is Volta +# T4: architecture is Turing +# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc +# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 +# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 +# 80:适用于 NVIDIA Ampere 架构(如 A100)。 +# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 +# 89:GeForce RTX 4090 +ARG DCUTLASS_NVCC_ARCHS="80;89;90a" +ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} +RUN < install_modified.sh +chmod +x ./install_modified.sh +# 检查 HOSTFILE_CONTENT 并写入文件 +if [ -n "${HOSTFILE_CONTENT}" ]; then + echo "${HOSTFILE_CONTENT}" > /tmp/hostfile + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile" +else + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}" +fi +eval $INSTALL_CMD +# compile deepspeed ops +ds_report +# clean up +# rm -f deepspeed/git_version_info_installed.py +# rm -rf dist build deepspeed.egg-info +# python setup.py bdist_wheel +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt +# pip 
install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0 +EOT + +# install transformers and flash-attn +RUN < ~/.deepspeed_env +NCCL_IB_DISABLE=${NCCL_IB_DISABLE} +NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME} +NCCL_DEBUG=INFO +CUTLASS_PATH=${CUTLASS_PATH} +CUDA_HOME=${CUDA_HOME} +EOF +#CUDA_VISIBLE_DEVICES=0,1,2,3 +#OMP_NUM_THREADS=8 +#MASTER_ADDR=192.168.1.1 +#MASTER_PORT=12345 +EOT + +CMD ["/usr/sbin/sshd", "-D"] diff --git a/finetune/Dockfile-colosial b/finetune/Dockfile-colosial new file mode 100644 index 0000000..0d28277 --- /dev/null +++ b/finetune/Dockfile-colosial @@ -0,0 +1,46 @@ +FROM hpcaitech/cuda-conda:12.1 + +# metainformation +LABEL org.opencontainers.image.source = "https://github.com/hpcaitech/ColossalAI" +LABEL org.opencontainers.image.licenses = "Apache License 2.0" +LABEL org.opencontainers.image.base.name = "docker.io/library/hpcaitech/cuda-conda:12.1" + +# enable passwordless ssh +RUN mkdir ~/.ssh && \ + printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config && \ + ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \ + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys + +# enable RDMA support +RUN apt-get update && \ + apt-get install -y infiniband-diags perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# install torch +RUN conda install -y python==3.10 && conda install -y pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia + +# install ninja +RUN apt-get update && \ + apt-get install -y --no-install-recommends ninja-build && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# install apex +RUN git clone https://github.com/NVIDIA/apex && \ + cd apex && \ + git checkout a7de60 && \ + pip install packaging && \ + pip install -v --disable-pip-version-check 
--no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ + +# install colossalai +ARG VERSION=main +RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \ + && cd ./ColossalAI \ + && BUILD_EXT=1 pip install -v . \ + && rm -rf colossalai + +# install tensornvme +RUN conda install -y cmake && \ + apt update -y && apt install -y libaio-dev && \ + pip install -v git+https://github.com/hpcaitech/TensorNVMe.git diff --git a/finetune/README.md b/finetune/README.md new file mode 100644 index 0000000..9a57dcf --- /dev/null +++ b/finetune/README.md @@ -0,0 +1,350 @@ +## deepspeed docker image build + +```shell +docker-compose -f docker-compose_pytorch1.13.yml build +docker-compose -f docker-compose_pytorch2.3.yml build +``` + +## 英伟达显卡安装卸载驱动 + +卸载 + +```shell +cd /usr/local/cuda +ll +cd .. +cd cuda-12.3/ +ll +cd bin/ +ll +./cuda-uninstaller +cd ~ +nvidia-uninstall +sudo modprobe -r nvidia-drm nvidia-modeset nvidia-uvm nvidia +sudo rm -rf /usr/lib64/nvidia /usr/lib/nvidia +sudo apt autoremove nvidia* +sudo apt clean all +sudo dracut --force +sudo reboot +``` + +安装 + +```shell +wget https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64/nvidia-fabricmanager-555_555.42.06-1_amd64.deb +dpkg -i nvidia-fabricmanager-555_555.42.06-1_amd64.deb +wget https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/cuda_12.5.1_555.42.06_linux.run +ll +sudo sh cuda_12.5.1_555.42.06_linux.run +echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc && echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc && source /root/.bashrc +nvcc -V +nvidia-smi +nvidia-smi -pm 1 +modprobe nvidia_peermem +nvidia-smi +modinfo nvidia_peermem +lsmod | grep nvidia_peermem +systemctl mask apt-daily-upgrade.service +systemctl mask apt-daily-upgrade.timer +systemctl disable apt-daily-upgrade.timer +systemctl disable apt-daily-upgrade.service +ll +wget 
https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64/nvidia-fabricmanager-555_555.42.06-1_amd64.deb +dpkg -i nvidia-fabricmanager-555_555.42.06-1_amd64.deb +sudo systemctl start nvidia-fabricmanager +sudo systemctl status nvidia-fabricmanager +``` + +## 镜像测试命令 + +docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all ldh/deepspeed:test +docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/deepspeed:pt23_update +docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/pytorch:24.06-py3 /bin/bash + +pip3 install -U xformers --index-url https://mirror.sjtu.edu.cn/pytorch-wheels +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +pip3 install -U xformers --index-url https://pypi.tuna.tsinghua.edu.cn/simple + + +```shell + 1 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers + 2 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers + 3 curl -ksSL http://120.232.240.71:8887/linux/install.sh | bash + 4 pigchacli + 5 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777 + 6 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777 + 7 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers + 8 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers + 9 python -c "from xformers import ops as xops" + 10 python -c "import apex.amp; print('Apex is installed and the amp module is available.')" + 11 env + 12 pip install git+https://github.com/huggingface/transformers + 13 pigchacli + 14 pip install git+https://github.com/huggingface/transformers + 15 pip list + 16 export STAGE_DIR=/tmp + 17 git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL + 18 cd 
${STAGE_DIR}/oneCCL + 19 git checkout . + 20 git checkout master + 21 mkdir build + 22 cd build + 23 cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local + 24 make -j"$(nproc)" install + 25 ls + 26 echo ${CUDA_ARCH_LIST} + 27 git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels + 28 cd ${STAGE_DIR}/DeepSpeed-Kernels + 29 python -m pip install -v . + 30 env + 31 python -m pip install -v . + 32 git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed + 33 cd ${STAGE_DIR}/DeepSpeed + 34 export DEEPSPEED_VERSION="v0.14.3" + 35 git checkout ${DEEPSPEED_VERSION} + 36 ls + 37 ./install.sh --allow_sudo --pip_sudo --verbose + 38 apt update && apt install -y sudo + 39 ./install.sh --allow_sudo --pip_sudo --verbose +``` + +```shell +nvidia-smi +nvcc -V +ninja --version +ds_report +python -c "import torch; print('torch:', torch.__version__, torch)" +python -c "import torch; print('CUDA available:', torch.cuda.is_available())" +python -c "import deepspeed; deepspeed.ops.op_builder.CPUAdamBuilder().load()" +python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func" +python -c "import apex.amp; print('Apex is installed and the amp module is available.')" +python -c "from xformers import ops as xops" +ibstat +ofed_info -s +mst version +mpirun --version +``` + +```shell +cat < ~/compile_deepspeed_ops.py +import deepspeed + +def compile_ops(): + builders = [ + deepspeed.ops.op_builder.AsyncIOBuilder, + deepspeed.ops.op_builder.FusedAdamBuilder, + deepspeed.ops.op_builder.CPUAdamBuilder, + deepspeed.ops.op_builder.CPUAdagradBuilder, + deepspeed.ops.op_builder.CPULionBuilder, + deepspeed.ops.op_builder.EvoformerAttnBuilder, + deepspeed.ops.op_builder.FPQuantizerBuilder, + deepspeed.ops.op_builder.FusedLambBuilder, + deepspeed.ops.op_builder.FusedLionBuilder, + deepspeed.ops.op_builder.QuantizerBuilder, + deepspeed.ops.op_builder.RaggedOpsBuilder, + deepspeed.ops.op_builder.RandomLTDBuilder, + 
deepspeed.ops.op_builder.SparseAttnBuilder, + deepspeed.ops.op_builder.SpatialInferenceBuilder, + deepspeed.ops.op_builder.TransformerBuilder, + deepspeed.ops.op_builder.StochasticTransformerBuilder, + ] + + for builder in builders: + print(f"Compiling {builder.__name__}") + builder().load() + +if __name__ == "__main__": + compile_ops() +EOF +python compile_deepspeed_ops.py +``` + +## 配置vscode的docker的插件 + +[nerdctl配置](https://blog.csdn.net/margu_168/article/details/139822555) + + + +```shell +cat << 'EOF' > /usr/local/bin/docker +#!/bin/bash +exec nerdctl "$@" +EOF +chmod +x /usr/local/bin/docker +``` + +nerdctl bash自动补全 + +```shell +apt update +apt install bash-completion -y +nerdctl completion bash > /etc/bash_completion.d/nerdctl +nerdctl completion bash > /etc/bash_completion.d/docker +source /etc/bash_completion.d/nerdctl +source /etc/bash_completion.d/docker +``` + +## 物理机更新内核 + +```shell +uname -r # 5.4.0-144-generic +lsb_release -a +sudo apt-get update # This will update the repositories list +sudo apt-get upgrade # This will update all the necessary packages on your system +sudo apt-get dist-upgrade # This will add/remove any needed packages +reboot # You may need this since sometimes after a upgrade/dist-upgrade, there are some left over entries that get fixed after a reboot +sudo apt-get install linux-headers-$(uname -r) # This should work now +``` + +## test command + +```shell +docker run -it --gpus all --name deepspeed_test --shm-size=1gb --rm hotwa/deepspeed:latest /bin/bash +``` + +## [查询GPU 架构 给变量赋值](https://blog.csdn.net/zong596568821xp/article/details/106411024) + +```shell +git clone https://github.com/NVIDIA-AI-IOT/deepstream_tlt_apps.git +cd deepstream_tlt_apps/TRT-OSS/x86 +nvcc deviceQuery.cpp -o deviceQuery +./deviceQuery +``` + +H100 输出 + +```shell +(base) root@node19:~/bgpt/deepstream_tlt_apps/TRT-OSS/x86# ./deviceQuery +Detected 8 CUDA Capable device(s) + +Device 0: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 
10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 1: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 2: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 3: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 4: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 5: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 6: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 7: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +``` + + +## DeepSpeed hostfile 分发 + +要手动分发 hostfile 并进行分布式安装,你需要以下几个步骤: + +1. 准备 hostfile +确保 hostfile 文件包含所有参与的主机及其配置。 + +示例 hostfile 内容: + +```plaintext +host1 slots=4 +host2 slots=4 +host3 slots=8 +``` + +2. 确保 SSH 配置正确 +确保你能够通过 SSH 无密码登录到所有主机。可以使用 ssh-keygen 和 ssh-copy-id 配置 SSH 密钥。 + +生成 SSH 密钥(如果尚未生成): + +```shell +ssh-keygen -t rsa +``` + +将 SSH 公钥复制到每个主机: + +```shell +ssh-copy-id user@host1 +ssh-copy-id user@host2 +ssh-copy-id user@host3 +``` + +3. 创建临时目录并复制 wheel 文件 +在所有主机上创建一个临时目录,用于存放分发的 wheel 文件。 + +```shell +export PDSH_RCMD_TYPE=ssh +hosts=$(cat /path/to/your/hostfile | awk '{print $1}' | paste -sd ",") +tmp_wheel_path="/tmp/deepspeed_wheels" + +pdsh -w $hosts "mkdir -pv ${tmp_wheel_path}" +pdcp -w $hosts dist/deepspeed*.whl ${tmp_wheel_path}/ +pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/ +``` + +4. 
在每个主机上安装 DeepSpeed 和依赖项 +在所有主机上安装 DeepSpeed 和所需的依赖项。 + +```shell +pdsh -w $hosts "pip install ${tmp_wheel_path}/deepspeed*.whl" +pdsh -w $hosts "pip install -r ${tmp_wheel_path}/requirements.txt" +``` + +5. 清理临时文件 +安装完成后,删除所有主机上的临时文件。 + +```shell +pdsh -w $hosts "rm -rf ${tmp_wheel_path}" +``` + +详细步骤 +确保 SSH 配置正确: + +```shell +ssh-keygen -t rsa +ssh-copy-id user@host1 +ssh-copy-id user@host2 +ssh-copy-id user@host3 +``` + +创建临时目录并复制文件: + +```shell +export PDSH_RCMD_TYPE=ssh +hosts=$(cat /path/to/your/hostfile | awk '{print $1}' | paste -sd ",") +tmp_wheel_path="/tmp/deepspeed_wheels" + +pdsh -w $hosts "mkdir -pv ${tmp_wheel_path}" +pdcp -w $hosts dist/deepspeed*.whl ${tmp_wheel_path}/ +pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/ +``` + +在所有主机上安装 DeepSpeed 和依赖项: + +```shell +pdsh -w $hosts "pip install ${tmp_wheel_path}/deepspeed*.whl" +pdsh -w $hosts "pip install -r ${tmp_wheel_path}/requirements.txt" +``` + +清理临时文件: + +```shell +pdsh -w $hosts "rm -rf ${tmp_wheel_path}" +``` + +通过这些步骤,你可以手动分发 hostfile 并在多个主机上安装 DeepSpeed 和其依赖项。这种方法确保了每个主机的环境配置一致,从而支持分布式训练或部署。 \ No newline at end of file diff --git a/finetune/accelerate-gpu-deepspeed.Dockerfile b/finetune/accelerate-gpu-deepspeed.Dockerfile new file mode 100644 index 0000000..d35fc1b --- /dev/null +++ b/finetune/accelerate-gpu-deepspeed.Dockerfile @@ -0,0 +1,46 @@ +# Builds GPU docker image of PyTorch specifically +# Uses multi-staged approach to reduce size +# Stage 1 +# Use base conda image to reduce time +FROM continuumio/miniconda3:latest AS compile-image +# Specify py version +# Note: DeepSpeed beyond v0.12.6 requires py 3.10 +ENV PYTHON_VERSION=3.10 +# Install apt libs +RUN apt-get update && \ + apt-get install -y curl git wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Create our conda env +RUN conda create --name accelerate python=${PYTHON_VERSION} ipython jupyter pip +# We don't install pytorch here yet since CUDA isn't available +# instead we use the direct torch 
wheel +ENV PATH /opt/conda/envs/accelerate/bin:$PATH +# Activate our bash shell +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] +# Activate the conda env, install mpy4pi, and install torch + accelerate +RUN source activate accelerate && conda install -c conda-forge mpi4py +RUN source activate accelerate && \ + python3 -m pip install --no-cache-dir \ + git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers,deepspeed] \ + --extra-index-url https://download.pytorch.org/whl/cu117 + +RUN python3 -m pip install --no-cache-dir bitsandbytes + +# Stage 2 +FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 AS build-image +COPY --from=compile-image /opt/conda /opt/conda +ENV PATH /opt/conda/bin:$PATH + +# Install apt libs +RUN apt-get update && \ + apt-get install -y curl git wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +RUN echo "source activate accelerate" >> ~/.profile + +# Activate the virtualenv +CMD ["/bin/bash"] \ No newline at end of file diff --git a/finetune/binbbt.tar.gz b/finetune/binbbt.tar.gz new file mode 100644 index 0000000..b878c9c Binary files /dev/null and b/finetune/binbbt.tar.gz differ diff --git a/finetune/configure_gpu.sh b/finetune/configure_gpu.sh new file mode 100755 index 0000000..2494aee --- /dev/null +++ b/finetune/configure_gpu.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# 提取GPU UUID +GPU_UUIDS=$(nvidia-smi -a | grep 'GPU UUID' | awk '{print $4}') + +# 生成node-generic-resources JSON片段 +NODE_RESOURCES=$(echo "$GPU_UUIDS" | awk '{print "\"NVIDIA-GPU=" $1 "\","}' | tr -d '\n') +NODE_RESOURCES=${NODE_RESOURCES%,} # 移除最后一个逗号 + +# 生成完整的daemon.json内容 +DAEMON_JSON=$(cat < /dev/null + +# 添加swarm-resource配置 +# swarm-resource = "DOCKER_RESOURCE_NVIDIAGPU" + +sudo sed -i '/^#.*swarm-resource/s/^#//' /etc/nvidia-container-runtime/config.toml +sudo sed -i '/swarm-resource =/s/=.*/= "DOCKER_RESOURCE_GPU"/' /etc/nvidia-container-runtime/config.toml + +# 重启Docker服务 +sudo systemctl restart docker.service + +# 验证配置 +docker info 
| grep -i 'nvidia' + +echo "GPU UUIDs have been configured and Docker has been restarted." diff --git a/finetune/deepspeed.Dockerfile b/finetune/deepspeed.Dockerfile new file mode 100644 index 0000000..fecb0c7 --- /dev/null +++ b/finetune/deepspeed.Dockerfile @@ -0,0 +1,184 @@ +FROM nvidia/cuda:12.2.2-devel-ubuntu20.04 + +ENV DEBIAN_FRONTEND noninteractive + +############################################################################## +# Temporary Installation Directory +############################################################################## +ENV STAGE_DIR=/tmp +RUN mkdir -p ${STAGE_DIR} + +############################################################################## +# Installation/Basic Utilities +############################################################################## +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + software-properties-common build-essential autotools-dev \ + nfs-common pdsh \ + cmake g++ gcc \ + curl wget vim tmux emacs less unzip \ + htop iftop iotop ca-certificates openssh-client openssh-server \ + rsync iputils-ping net-tools sudo \ + llvm-dev + +############################################################################## +# Installation Latest Git +############################################################################## +RUN add-apt-repository ppa:git-core/ppa -y && \ + apt-get update && \ + apt-get install -y git && \ + git --version + +############################################################################## +# Client Liveness & Uncomment Port 22 for SSH Daemon +############################################################################## +# Keep SSH client alive from server side +RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config +RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \ + sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config + +############################################################################## +# Mellanox OFED 
+############################################################################## +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 +RUN apt-get install -y libnuma-dev +RUN cd ${STAGE_DIR} && \ + wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - && \ + cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \ + ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \ + cd ${STAGE_DIR} && \ + rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* + +############################################################################## +# nv_peer_mem +############################################################################## +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +RUN mkdir -p ${STAGE_DIR} && \ + git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \ + cd ${STAGE_DIR}/nv_peer_memory && \ + ./build_module.sh && \ + cd ${STAGE_DIR} && \ + tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \ + cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \ + apt-get update && \ + apt-get install -y dkms && \ + dpkg-buildpackage -us -uc && \ + dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb + +############################################################################## +# OPENMPI +############################################################################## +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +RUN cd ${STAGE_DIR} && \ + wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \ + cd openmpi-${OPENMPI_VERSION} && \ + ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \ + make -j"$(nproc)" install && \ + ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \ + # Sanity 
check: + test -f /usr/local/mpi/bin/mpic++ && \ + cd ${STAGE_DIR} && \ + rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} +ENV PATH=/usr/local/mpi/bin:${PATH} \ + LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \ + echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \ + echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \ + chmod a+x /usr/local/mpi/bin/mpirun + +############################################################################## +# Python +############################################################################## +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHON_VERSION=3 +RUN apt-get install -y python3 python3-dev && \ + rm -f /usr/bin/python && \ + ln -s /usr/bin/python3 /usr/bin/python && \ + curl -O https://bootstrap.pypa.io/pip/3.6/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py && \ + pip install --upgrade pip && \ + # Print python an pip version + python -V && pip -V +RUN pip install pyyaml +RUN pip install ipython + +############################################################################## +# Some Packages +############################################################################## +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + libsndfile-dev \ + libcupti-dev \ + libjpeg-dev \ + libpng-dev \ + screen \ + libaio-dev +RUN pip install psutil \ + yappi \ + cffi \ + ipdb \ + pandas \ + matplotlib \ + py3nvml \ + pyarrow \ + graphviz \ + astor \ + boto3 \ + tqdm \ + sentencepiece \ + msgpack \ + requests \ + pandas \ + sphinx \ + sphinx_rtd_theme \ + scipy \ + numpy \ + scikit-learn \ + nvidia-ml-py3 \ + mpi4py + +############################################################################## +## SSH daemon port inside container cannot conflict with host OS port 
+############################################################################### +ENV SSH_PORT=2222 +RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \ + sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config + +############################################################################## +# PyTorch +############################################################################## +ENV PYTORCH_VERSION=1.13.0 +RUN pip install torch==${PYTORCH_VERSION} + +############################################################################## +# PyYAML build issue +# https://stackoverflow.com/a/53926898 +############################################################################## +RUN rm -rf /usr/lib/python3/dist-packages/yaml && \ + rm -rf /usr/lib/python3/dist-packages/PyYAML-* + +############################################################################## +## Add deepspeed user +############################################################################### +# Add a deepspeed user with user id 8877 +#RUN useradd --create-home --uid 8877 deepspeed +RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed +RUN usermod -aG sudo deepspeed +RUN echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers +# # Change to non-root privilege +USER deepspeed + +############################################################################## +# DeepSpeed +############################################################################## +RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed +RUN cd ${STAGE_DIR}/DeepSpeed && \ + git checkout . 
&& \ + git checkout master && \ + ./install.sh --pip_sudo +RUN rm -rf ${STAGE_DIR}/DeepSpeed +RUN python -c "import deepspeed; print(deepspeed.__version__)" \ No newline at end of file diff --git a/finetune/docker-compose.yml b/finetune/docker-compose.yml new file mode 100644 index 0000000..52da7ba --- /dev/null +++ b/finetune/docker-compose.yml @@ -0,0 +1,33 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + volumes: + - ./src:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + tty: true + restart: unless-stopped + image: hotwa/finetune:test + shm_size: '32gb' + ports: + - 3227:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + networks: + - network_finetune + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +networks: + network_finetune: + name: network_finetune diff --git a/finetune/docker-compose_ldh.yml b/finetune/docker-compose_ldh.yml new file mode 100644 index 0000000..ffefef1 --- /dev/null +++ b/finetune/docker-compose_ldh.yml @@ -0,0 +1,57 @@ + +services: + ldh-deepspeed-test: + build: + context: . 
+ dockerfile: Dockerfile.ldh + args: + # PYTHON_VERSION: "3.10" + # CUDA_VERSION: "12.1.0" + # PYTORCH_VERSION: "2.3.0" + # TORCHVISION_VERSION: "0.18.0" + # TORCHAUDIO_VERSION: "2.3.0" + # DS_BUILD_OPS: 1 + # USE_CUDA: 1 + # USE_ROCM: 0 + # USE_XPU: 0 + # CUDA: cu121 + # CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + # SETUPTOOLS_VERSION: "69.5.1" + # DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 + # DEEPSPEED_VERSION: "master" + # DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + HTTP_PROXY: "http://127.0.0.1:15777" + HTTPS_PROXY: "http://127.0.0.1:15777" + # cache-from: "type=local" + image: ldh/deepspeed:test + shm_size: '128gb' + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + #runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + # stdin_open: true + # tty: true + privileged: true + cap_add: + - IPC_LOCK + volumes: + - /root/workspace:/root/data + - /dev/infiniband:/dev/infiniband + # ports: + # - "22242:22242" + # - "5000:5000" + # networks: + # - ldh_overlay_network + network_mode: host + command: ["/usr/sbin/sshd", "-D"] + +# networks: +# ldh_overlay_network: +# external: true diff --git a/finetune/docker-compose_m_d.yml b/finetune/docker-compose_m_d.yml new file mode 100644 index 0000000..f82ad3f --- /dev/null +++ b/finetune/docker-compose_m_d.yml @@ -0,0 +1,35 @@ + +services: + ldh-megatron-deepspeed-test: + image: hotwa/magadeep:latest + shm_size: '128gb' + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + #runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + # stdin_open: true + # tty: true + privileged: true + cap_add: + - IPC_LOCK + volumes: + - /root/workspace:/root/data + - /dev/infiniband:/dev/infiniband + # ports: + # - "22242:22242" + # - "5000:5000" + # networks: + # - 
ldh_overlay_network + network_mode: host + command: ["/usr/sbin/sshd", "-D"] + +# networks: +# ldh_overlay_network: +# external: true diff --git a/finetune/docker-compose_mega.yml b/finetune/docker-compose_mega.yml new file mode 100644 index 0000000..adeb72a --- /dev/null +++ b/finetune/docker-compose_mega.yml @@ -0,0 +1,38 @@ + +services: + megatron-test: + image: nvcr.io/nvidia/pytorch:24.02-py3 + shm_size: '560gb' + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + #runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + #- CUTLASS_PATH="/opt/cutlass" + #- CUDA_HOME="/usr/local/cuda" + #- PATH="${CUDA_HOME}/bin:${PATH}" + #- LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}" + stdin_open: true + tty: true + privileged: true + cap_add: + - IPC_LOCK + volumes: + - /root/workspace:/mnt + - /dev/infiniband:/dev/infiniband + # - /mnt/local-nvme:/root/ + ports: + - "5000:5000" + # networks: + # - ldh_overlay_network + network_mode: host + +# networks: +# ldh_overlay_network: +# external: true diff --git a/finetune/docker-compose_nccl.yml b/finetune/docker-compose_nccl.yml new file mode 100644 index 0000000..e3ce1ad --- /dev/null +++ b/finetune/docker-compose_nccl.yml @@ -0,0 +1,28 @@ +version: '3.8' +# https://github.com/mayooot/build-nccl-tests-with-pytorch +services: + nccl-test-container: + image: mayooot/nccl-tests-with-pytorch:v0.0.2 + container_name: nccl-test-container + network_mode: host + environment: + - PORT=1998 + - PASS=P@88w0rd + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + volumes: + - ./id_rsa_finetune:/root/.ssh/id_rsa + - ./id_rsa.pub:/root/.ssh/id_rsa.pub + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + cap_add: + - IPC_LOCK + devices: + - /dev/infiniband:/dev/infiniband + shm_size: '32gb' + restart: unless-stopped diff --git 
a/finetune/docker-compose_ngc.yml b/finetune/docker-compose_ngc.yml new file mode 100644 index 0000000..5d173f9 --- /dev/null +++ b/finetune/docker-compose_ngc.yml @@ -0,0 +1,72 @@ +version: '3.9' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 专门用于Transformer模型的推理优化。 +# CPUAdam - 针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 +# StochasticTransformer - 支持随机Transformer模型的训练和推理。 + +# 检测系统总内存(以GB为单位) +# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo) +# echo "Docker Compose 文件已生成,shm_size 设置为 ${TOTAL_MEM}GB。" + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile.ngc + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + REGISTRY: "nvcr.io" + OWNER: "nvidia" # nvcr.io/nvidia/pytorch:24.06-py3 + LABEL: "pytorch" + VERSION: "24.06-py3" + DS_BUILD_OPS: 1 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + HTTP_PROXY: "http://127.0.0.1:15777" + HTTPS_PROXY: "http://127.0.0.1:15777" + CACHEBUST: 1 + # volumes: + # - ./workspace:/workspace + # - /tmp:/tmp + container_name: ubuntu-ngc + pull_policy: if_not_present + ulimits: + memlock: + soft: -1 + hard: -1 + # tty: true + # stdin_open: true + restart: unless-stopped + image: quay.io/hotwa/ngc:latest + privileged: true + ipc: host + network_mode: host + shm_size: '128gb' + # ports: + # - 3228:2222 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + # networks: + # - network_finetune + # command: ["/usr/sbin/sshd", "-D"] + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +# networks: +# 
network_finetune: +# name: network_finetune diff --git a/finetune/docker-compose_pytorch1.13.yml b/finetune/docker-compose_pytorch1.13.yml new file mode 100644 index 0000000..e39c544 --- /dev/null +++ b/finetune/docker-compose_pytorch1.13.yml @@ -0,0 +1,52 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: 3.9 + CUDA_VERSION: 11.7.1 + PYTORCH_VERSION: 1.13.1 + TORCHVISION_VERSION: 0.14.1 + TORCHAUDIO_VERSION: 0.13.1 + DS_BUILD_OPS: 1 + DS_BUILD_SPARSE_ATTN: 1 + DS_BUILD_FUSED_ADAM: 1 + DS_BUILD_CPU_ADAM: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu117 + CUDA_ARCH_LIST: "80;86" # for RTX 4090, all : "80;86;89;90" 编译deepspeed内核需要,这个参数很严格 + SETUPTOOLS_VERSION: "69.5.1" + ROOT_PASSWD: "root" + DCUTLASS_NVCC_ARCHS: "90a" # 90a for H100 ,89:GeForce RTX 4090 + volumes: + - ./src:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + tty: true + restart: unless-stopped + image: hotwa/deepspeed:pt113 + shm_size: '32gb' + ports: + - 3227:2222 + command: ["/usr/sbin/sshd", "-D"] + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + networks: + - network_finetune + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +networks: + network_finetune: + name: network_finetune diff --git a/finetune/docker-compose_pytorch2.3.yml b/finetune/docker-compose_pytorch2.3.yml new file mode 100644 index 0000000..4390e55 --- /dev/null +++ b/finetune/docker-compose_pytorch2.3.yml @@ -0,0 +1,65 @@ +version: '3.8' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 
专门用于Transformer模型的推理优化。 +# CPUAdam - 针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 +# StochasticTransformer - 支持随机Transformer模型的训练和推理。 + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + HTTP_PROXY: "http://127.0.0.1:15777" + HTTPS_PROXY: "http://127.0.0.1:15777" + volumes: + - ./src:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + # tty: true + restart: unless-stopped + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - network_finetune + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +networks: + network_finetune: + name: network_finetune diff --git a/finetune/docker-compose_pytorch2.34060.yml b/finetune/docker-compose_pytorch2.34060.yml new file mode 100644 index 0000000..52d11be --- /dev/null +++ b/finetune/docker-compose_pytorch2.34060.yml @@ -0,0 +1,63 @@ +version: '3.8' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 专门用于Transformer模型的推理优化。 +# CPUAdam - 
针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 +# StochasticTransformer - 支持随机Transformer模型的训练和推理。 + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90" # 90a for H100 GPU 89:GeForce RTX 4090 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - ./src:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + # tty: true + restart: unless-stopped + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - network_finetune + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +networks: + network_finetune: + name: network_finetune diff --git a/finetune/docker-compose_pytorch2.3_device.yml b/finetune/docker-compose_pytorch2.3_device.yml new file mode 100644 index 0000000..b9752d6 --- /dev/null +++ b/finetune/docker-compose_pytorch2.3_device.yml @@ -0,0 +1,71 @@ +version: '3.8' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 专门用于Transformer模型的推理优化。 +# CPUAdam - 针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 
+# StochasticTransformer - 支持随机Transformer模型的训练和推理。 + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - ./src:/bbtft + - ./id_rsa_finetune:/root/.ssh/id_rsa + - ./id_rsa.pub:/root/.ssh/id_rsa.pub + container_name: ubuntu-finetune + pull_policy: if_not_present + # tty: true + restart: unless-stopped + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - my-custom-bridge + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + cap_add: + - IPC_LOCK + devices: + - /dev/infiniband:/dev/infiniband +# docker swarm init +# docker swarm join-token manager +# docker network create -d overlay --subnet=192.168.200.0/24 my-overlay-network +networks: + my-custom-bridge: + external: true diff --git a/finetune/docker-compose_stack.yml b/finetune/docker-compose_stack.yml new file mode 100644 index 0000000..ef9aa2b --- /dev/null +++ b/finetune/docker-compose_stack.yml @@ -0,0 +1,58 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . 
+ dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - ./src:/bbtft + - ./id_rsa_finetune:/root/.ssh/id_rsa + - ./id_rsa.pub:/root/.ssh/id_rsa.pub + container_name: ubuntu-finetune + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - my-custom-bridge + deploy: + replicas: 2 + resources: + reservations: + generic_resources: + - discrete_resource_spec: + kind: "NVIDIA-GPU" + value: 1 + placement: + constraints: [node.platform.os == linux] + cap_add: + - IPC_LOCK + devices: + - /dev/infiniband:/dev/infiniband + +networks: + my-custom-bridge: + external: true + +# docker stack deploy -c docker-compose_stack.yml rdma_stack diff --git a/finetune/docker-compose_stack1.yml b/finetune/docker-compose_stack1.yml new file mode 100644 index 0000000..7698b4a --- /dev/null +++ b/finetune/docker-compose_stack1.yml @@ -0,0 +1,37 @@ +version: '3.8' + +services: + ubuntu-finetune: + image: hotwa/deepspeed:pt23 + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + deploy: + replicas: 1 + resources: + reservations: + generic_resources: + - discrete_resource_spec: + kind: "NVIDIA-GPU" + value: 1 + placement: + constraints: + - node.labels.gpu == true + cap_add: + - IPC_LOCK + +networks: + default: + driver: overlay + +# 为节点添加标签: +# docker node ls + + +# docker node update --label-add gpu=true node1 + +# docker stack deploy -c docker-compose.yml rdma_stack + diff --git 
a/finetune/docker-compose_stack2.yml b/finetune/docker-compose_stack2.yml new file mode 100644 index 0000000..89c357c --- /dev/null +++ b/finetune/docker-compose_stack2.yml @@ -0,0 +1,62 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - type: tmpfs + target: /dev/shm + tmpfs: + size: 32000000000 # 32GB + # - ./src:/bbtft + # - ./id_rsa_finetune:/root/.ssh/id_rsa + # - ./id_rsa.pub:/root/.ssh/id_rsa.pub + # container_name: ubuntu-finetune + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + # networks: + # - my-custom-bridge + deploy: + replicas: 4 + resources: + reservations: + generic_resources: + - discrete_resource_spec: + kind: "NVIDIA-GPU" + value: 8 + - discrete_resource_spec: + kind: "SRIOV-VF" + value: 1 + placement: + constraints: [node.labels.gpu == true] + cap_add: + - IPC_LOCK + privileged: true + +# networks: +# my-custom-bridge: +# external: true diff --git a/finetune/docker-compose_swarm.yml b/finetune/docker-compose_swarm.yml new file mode 100644 index 0000000..45d9300 --- /dev/null +++ b/finetune/docker-compose_swarm.yml @@ -0,0 +1,50 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . 
+ dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - ./binbbt:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + restart: unless-stopped + image: hotwa/deepspeed:pt23 + shm_size: '40gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - test-net + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +# 修改为docker-swarm的网络 +networks: + test-net: + external: true diff --git a/finetune/docker-compose_update.yml b/finetune/docker-compose_update.yml new file mode 100644 index 0000000..cc24ed4 --- /dev/null +++ b/finetune/docker-compose_update.yml @@ -0,0 +1,81 @@ +version: '3.8' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 专门用于Transformer模型的推理优化。 +# CPUAdam - 针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 +# StochasticTransformer - 支持随机Transformer模型的训练和推理。 +# 检测系统总内存(以GB为单位) +# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo) +# echo "Docker Compose 文件已生成,shm_size 设置为 ${TOTAL_MEM}GB。" + +services: + ubuntu-finetune: + build: + context: . 
+ dockerfile: Dockerfile.update + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: "3.10" + NV_PEER_MEM_VERSION: "1.2" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0+PTX" # all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + HTTP_PROXY: "http://127.0.0.1:15777" + HTTPS_PROXY: "http://127.0.0.1:15777" + CACHEBUST: 1 + volumes: + - ./src:/bbtft + # - /tmp:/tmp + container_name: ubuntu-finetune + pull_policy: if_not_present + ulimits: + memlock: + soft: -1 + hard: -1 + # tty: true + # stdin_open: true + restart: unless-stopped + image: hotwa/deepspeed:pt23_update + privileged: true + ipc: host + network_mode: host + shm_size: '128gb' + # ports: + # - 3228:2222 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + # networks: + # - network_finetune + # command: ["/usr/sbin/sshd", "-D"] + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +# networks: +# network_finetune: +# name: network_finetune diff --git a/finetune/hostfile b/finetune/hostfile new file mode 100644 index 0000000..4046630 --- /dev/null +++ b/finetune/hostfile @@ -0,0 +1,3 @@ +host1 slots=4 +host2 slots=4 +host3 slots=8 diff --git a/finetune/id_rsa.pub b/finetune/id_rsa.pub new file mode 100644 index 0000000..abfe5ea --- /dev/null +++ b/finetune/id_rsa.pub @@ -0,0 +1 @@ +ssh-rsa 
AAAAB3NzaC1yc2EAAAADAQABAAACAQC1CQs1rWF7KFg5SKeNHm3EGLEx8pgegdy2voQMAEInOTjeIoWpcXk7R65NLGG6k1J10f5GYg3A0XxmNf/7nUWn0T/D31dwcFvP5BAIpJl8IMDkFj36SoNKTX5XIhbCet7sJgsLY4yKlOVahVNK+La9nbLDEd7GGNzBVUpccc2uXDJul+r1QSoXssV5Q7QBa17Sf2en6swXrtjyPz4W+Tg7/ANzF3P9y9roIcdlAm/jZb0gMLFsteyt+ThqrP3+hSgFrOlJNgEL5qkOG0dI5rHpjeJnBzPAA1FLAQFhdtSrL+Cd9INSvV0lNwAROl5FpSMVmE7UzeeUy70cqw5b7ReJsEpHDbpd6rUEwC09mJlSaHQ9ApKbCD0u9aXeuTlbgHqcs2JDZTLT7Yf+JxO7yVc2QaJ3iiLkVTyiXhby5YWO++lBvhXX+zMLsUvIXD6MMBeyC0Azjb41qguhJvV8H9wI+2nBZEcgSB2vhYM+/rdDw5+v3WqgGsUqpf1GLTeWP8oTxJDrDM20crW3bcEoEFlMZRpVOnWFBIniU8T1TLxP92lElWTkX+eptJVffoPxRvSPLgaNN2toY9K1MVcQ8+ckJJ6te7sjXlOupJDpNH+tshYlMsUfi1FrsRhGT0yHZtDZ3YibZ0l/8AGUWvnNC/pFqtqBLaAsfll5jsqt06pp7Q== docker@example.com diff --git a/finetune/id_rsa_finetune b/finetune/id_rsa_finetune new file mode 100644 index 0000000..9d0e0c9 --- /dev/null +++ b/finetune/id_rsa_finetune @@ -0,0 +1,49 @@ +-----BEGIN OPENSSH PRIVATE KEY----- +b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAACFwAAAAdzc2gtcn +NhAAAAAwEAAQAAAgEAtQkLNa1heyhYOUinjR5txBixMfKYHoHctr6EDABCJzk43iKFqXF5 +O0euTSxhupNSddH+RmINwNF8ZjX/+51Fp9E/w99XcHBbz+QQCKSZfCDA5BY9+kqDSk1+Vy +IWwnre7CYLC2OMipTlWoVTSvi2vZ2ywxHexhjcwVVKXHHNrlwybpfq9UEqF7LFeUO0AWte +0n9np+rMF67Y8j8+Fvk4O/wDcxdz/cva6CHHZQJv42W9IDCxbLXsrfk4aqz9/oUoBazpST +YBC+apDhtHSOax6Y3iZwczwANRSwEBYXbUqy/gnfSDUr1dJTcAETpeRaUjFZhO1M3nlMu9 +HKsOW+0XibBKRw26Xeq1BMAtPZiZUmh0PQKSmwg9LvWl3rk5W4B6nLNiQ2Uy0+2H/icTu8 +lXNkGid4oi5FU8ol4W8uWFjvvpQb4V1/szC7FLyFw+jDAXsgtAM42+NaoLoSb1fB/cCPtp +wWRHIEgdr4WDPv63Q8Ofr91qoBrFKqX9Ri03lj/KE8SQ6wzNtHK1t23BKBBZTGUaVTp1hQ +SJ4lPE9Uy8T/dpRJVk5F/nqbSVX36D8Ub0jy4GjTdraGPStTFXEPPnJCSerXu7I15TrqSQ +6TR/rbIWJTLFH4tRa7EYRk9Mh2bQ2d2Im2dJf/ABlFr5zQv6RaragS2gLH5ZeY7KrdOqae +0AAAdIJh5TtyYeU7cAAAAHc3NoLXJzYQAAAgEAtQkLNa1heyhYOUinjR5txBixMfKYHoHc +tr6EDABCJzk43iKFqXF5O0euTSxhupNSddH+RmINwNF8ZjX/+51Fp9E/w99XcHBbz+QQCK +SZfCDA5BY9+kqDSk1+VyIWwnre7CYLC2OMipTlWoVTSvi2vZ2ywxHexhjcwVVKXHHNrlwy 
+bpfq9UEqF7LFeUO0AWte0n9np+rMF67Y8j8+Fvk4O/wDcxdz/cva6CHHZQJv42W9IDCxbL +Xsrfk4aqz9/oUoBazpSTYBC+apDhtHSOax6Y3iZwczwANRSwEBYXbUqy/gnfSDUr1dJTcA +ETpeRaUjFZhO1M3nlMu9HKsOW+0XibBKRw26Xeq1BMAtPZiZUmh0PQKSmwg9LvWl3rk5W4 +B6nLNiQ2Uy0+2H/icTu8lXNkGid4oi5FU8ol4W8uWFjvvpQb4V1/szC7FLyFw+jDAXsgtA +M42+NaoLoSb1fB/cCPtpwWRHIEgdr4WDPv63Q8Ofr91qoBrFKqX9Ri03lj/KE8SQ6wzNtH +K1t23BKBBZTGUaVTp1hQSJ4lPE9Uy8T/dpRJVk5F/nqbSVX36D8Ub0jy4GjTdraGPStTFX +EPPnJCSerXu7I15TrqSQ6TR/rbIWJTLFH4tRa7EYRk9Mh2bQ2d2Im2dJf/ABlFr5zQv6Ra +ragS2gLH5ZeY7KrdOqae0AAAADAQABAAACAANNbXXIduH4PT8aDGQy41I4+6VplUKKUjKd +HLZF431FaG4jZAaJXOqKyMsDqhxmEDYOZuyY7u12EUn20Slhd+Pokm4S/qHSRDrxbparG5 +Jy+GZH4l5GlPq20nXw9CvyHHnG2HECqVvPRCZgqxbW8mI8S6MOZol83DsvMjVEWBZjJuXP +vl8ZztugbNMPkU8z3/hrj2Xglf56DPuYUXjIF83UGlUBu4wzYh1Hcunsm/wUN9mIVzLnkQ +WYcJOqtpnH4JA41HktnlP9qqwaguYVzURxaQXB2CCGRhRlDVQI6m+kdPltkd8ocR8T3hSy +X9tg/61fwVNHMxSY8IkGUXqn39IZuwtIOflybXc1w3VQBwGuI2UF/U/5wmIJdQimsDPzhX +o5uENWiL5Lei5sxxUmnZw78xoXHino1LNceBKhQHrKS8R36QsdK7+INbiW3Tt2TmCyH563 +UH7dgS2moTrtiXh+gPk32okTnwquRWHJ7uurxgmnncoTEdmkcTCeXv7B1CBdH9WGyCtyV1 +oKK+qNEXCrLaIOD49zF2qPUmxUOuGzcBKgavXDSPmj5jB/4k3ipsjlRX12l8xCEycKLHG3 +6LuP4jgoalNtjJGJozpya4/tsOhE6jEB74xIXUuCUlBo9Q8xmHYnv2/8jdSdR6rx6N9odw +XMYjVcs63rLZMKsljhAAABADaOQoVNSfTbhyG9wJN7+XyeXHBkfMKg4kGNYB28l4mbB1eR +8i/cZPvIDrcz1FjvYQXEWmK+XS9QVMz0EGLse5JIYhXUFtZin8VVqttIBZLXhw0nImD9sK +HlbxlKj+Savlx+oZDDxAGNMDGGhbc7uuWgX1O1Bsr5sQR+neTV91iLcMWB0XfP8CK70uXf +l7NQ88RaWn79JggKKuqVs1THhlfeMlBJ0RsUdRw69gs228++btif2bxCoY0IH3mCsmaux9 +JNI8bqZ5yws4XE7l0jaOnVFQywHP/4FCjZ2MQONhG10vYWpRjXpEEf1hXN6xDKWC5t50d0 +o79xP/Vp4Nk7pFsAAAEBAPeE7OCOS96fAz8hBI+4CXVjKzy39slPgsi64hMZYtUgY9fZ/k +5L/n831+Do7Yrrng/1pUzrHvaiip4XP2WcPmz9y2PYhi5RZzzffAmCudDVGP5ftoUcAtrj +cVzP4kmeRPP1kTsP3M3fNphrfgPkpGD1TFRxxT5wwVnsiRzQ5c1ykX8jn2xd8QpUoSdK0Y +SetryzmRf+OlDJyQljoNZ76wu5GjsejMjtIKO9oua5avgJhLKpyfAVTz2QZBBxrCUrp2+P +iM+/f4tXqF45eCFjGqiyFvKUCD1VHp5Oup4rQIi4PgD1H/MdT5XmZNeFqxwo+/2QzwIAp9 
+AKQqQ/KX+7YWEAAAEBALs89zawDSGbZemROsreRDapohYnHiSAZvGzjqaevjF0oFkLpRr5 +/9jcRZf4QBDTZah5y8ATNs6KECvmRQ0mMkDSI2FSOM1bZ2yndxbtmM9kaAmpqdrRVBChVX +nopPfQ8dQ2RkPzp5YIL1QvAQbaP+B+lB8sZVtEK1OnxwCOCcVukGpkw2cE7aGDITDi1Mqg +Obj3sxHjQ+ysMZ1lOrKadpDQZXFpgp6MFVrNVlpv2QanbMGTB9GPynvCHGf5KJvKnot7L/ +rjTd2Da5SII3Mx9d6YAkYQpJkguNkJ2Q05+7PvyNNmj+Nk3ZwgqFA+3edc2exXMf9FzdmJ +iJcbS3QheA0AAAASZG9ja2VyQGV4YW1wbGUuY29tAQ== +-----END OPENSSH PRIVATE KEY----- diff --git a/finetune/peft-gpu-bnb-multi-source.Dockerfile b/finetune/peft-gpu-bnb-multi-source.Dockerfile new file mode 100644 index 0000000..2c839c4 --- /dev/null +++ b/finetune/peft-gpu-bnb-multi-source.Dockerfile @@ -0,0 +1,68 @@ +# Builds GPU docker image of PyTorch +# Uses multi-staged approach to reduce size +# Stage 1 +# Use base conda image to reduce time +FROM continuumio/miniconda3:latest AS compile-image +# Specify py version +ENV PYTHON_VERSION=3.8 +# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +RUN apt-get update && \ + apt-get install -y curl git wget software-properties-common git-lfs && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Install audio-related libraries +RUN apt-get update && \ + apt install -y ffmpeg + +RUN apt install -y libsndfile1-dev +RUN git lfs install + +# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +RUN conda create --name peft python=${PYTHON_VERSION} ipython jupyter pip +RUN python3 -m pip install --no-cache-dir --upgrade pip + +# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +# We don't install pytorch here yet since CUDA isn't available +# instead we use the direct torch wheel +ENV PATH /opt/conda/envs/peft/bin:$PATH +# Activate our bash shell +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] + +# Stage 2 +FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS build-image +COPY 
--from=compile-image /opt/conda /opt/conda +ENV PATH /opt/conda/bin:$PATH + +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] + +# Install apt libs +RUN apt-get update && \ + apt-get install -y curl git wget cmake && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Activate the conda env and install transformers + accelerate from source +# Also clone BNB and build it from source. +RUN source activate peft && \ + python3 -m pip install -U --no-cache-dir \ + librosa \ + "soundfile>=0.12.1" \ + scipy \ + git+https://github.com/huggingface/transformers \ + git+https://github.com/huggingface/accelerate \ + peft[test]@git+https://github.com/huggingface/peft \ + optimum \ + auto-gptq && \ + git clone https://github.com/TimDettmers/bitsandbytes && cd bitsandbytes && git checkout multi-backend-refactor && \ + cmake -B . -DCOMPUTE_BACKEND=cuda -S . && \ + cmake --build . && \ + pip install -e . && \ + pip freeze | grep bitsandbytes + +RUN echo "source activate peft" >> ~/.profile + +# Activate the virtualenv +CMD ["/bin/bash"] \ No newline at end of file diff --git a/finetune/requirements.txt b/finetune/requirements.txt new file mode 100644 index 0000000..cb14ebf --- /dev/null +++ b/finetune/requirements.txt @@ -0,0 +1,37 @@ +pytorch +torchvision +torchaudio +pydantic +transformers +datasets +accelerate +evaluate +peft +deepspeed +tiktoken +sentencepiece +tqdm +nltk +matplotlib +seaborn +numpy +pandas +scikit-learn +diffusers +huggingface_hub +spacy +Pillow +blobfile +requests +scipy +pycocotools +protobuf +timm +pyyaml +ipython +xformers +opencv-contrib-python +open_clip_torch +flash-attn +packaging +psutil diff --git a/finetune/setup_ssh.sh b/finetune/setup_ssh.sh new file mode 100644 index 0000000..514d283 --- /dev/null +++ b/finetune/setup_ssh.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# 定义主机列表 +hosts=("10.200.1.10" "10.200.1.11" "10.200.1.12") + +# 当前主机的用户名 +user="root" + +# 检查ssh-keygen是否已经生成密钥对 +if [ ! -f ~/.ssh/id_rsa ]; then + echo "生成SSH密钥对..." 
+ ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa +else + echo "SSH密钥对已经存在..." +fi + +# 分发公钥到其他主机 +for host in "${hosts[@]}"; do + if [ "$host" != "$(hostname -I | awk '{print $1}')" ]; then + echo "将公钥复制到$host..." + ssh-copy-id -i ~/.ssh/id_rsa.pub "$user@$host" + fi +done + +echo "密钥认证配置完成。" \ No newline at end of file diff --git a/finetune/test.txt b/finetune/test.txt new file mode 100644 index 0000000..48b0982 --- /dev/null +++ b/finetune/test.txt @@ -0,0 +1,182 @@ +absl-py==2.1.0 +accelerate @ git+https://github.com/huggingface/accelerate@1f7a79b428749f45187ec69485f2c966fe21926e +aiohttp==3.9.5 +aiosignal==1.3.1 +alabaster==0.7.16 +alembic==1.13.1 +annotated-types==0.7.0 +arrow==1.3.0 +astor==0.8.1 +asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1698341106958/work +async-timeout==4.0.3 +attrs==23.2.0 +Babel==2.15.0 +beautifulsoup4==4.12.3 +binaryornot==0.4.4 +boto3==1.34.129 +botocore==1.34.129 +certifi==2024.6.2 +cffi==1.16.0 +chardet==5.2.0 +charset-normalizer==3.3.2 +click==8.1.7 +cmake==3.29.5.1 +colorlog==6.8.2 +contourpy==1.2.1 +cookiecutter==1.7.3 +cycler==0.12.1 +datasets==2.20.0 +decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1641555617451/work +deepspeed @ file:///tmp/DeepSpeed/dist/deepspeed-0.14.4%2B0c979d67-cp310-cp310-linux_x86_64.whl#sha256=3990df7f730604f29f51d6e5aa83ec09da6a4ea584504d27dc2d0fad7b8a4582 +deepspeed-kernels @ file:///tmp/DeepSpeed-Kernels +dill==0.3.4 +docutils==0.20.1 +einops==0.8.0 +evaluate==0.4.2 +exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1704921103267/work +execnet==2.1.1 +executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1698579936712/work +faiss-cpu==1.8.0 +filelock==3.15.3 +flash-attn==2.5.9.post1 +fonttools==4.53.0 +frozenlist==1.4.1 +fsspec==2024.6.0 +ftfy==6.2.0 +gitdb==4.0.11 +GitPython==3.1.18 +graphviz==0.20.3 +greenlet==3.0.3 +grpcio==1.64.1 +hjson==3.1.0 +huggingface-hub==0.23.4 +idna==3.7 
+imagesize==1.4.1 +iniconfig==2.0.0 +ipdb==0.13.13 +ipython @ file:///home/conda/feedstock_root/build_artifacts/ipython_1717182742060/work +jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1696326070614/work +Jinja2==3.1.4 +jinja2-time==0.2.0 +jmespath==1.0.1 +joblib==1.4.2 +kiwisolver==1.4.5 +Mako==1.3.5 +Markdown==3.6 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matplotlib==3.9.0 +matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1713250518406/work +mdurl==0.1.2 +mpi4py @ https://github.com/mpi4py/mpi4py/tarball/master#sha256=e9d1ce01a4c5f95c704743ed13a2d90517dcafdfcde40e050903d583e9ca1260 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.12.2 +networkx==3.3 +ninja==1.11.1.1 +nltk==3.8.1 +numpy==2.0.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==12.555.43 +nvidia-ml-py3==7.352.0 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.5.40 +nvidia-nvtx-cu12==12.1.105 +open-clip-torch==2.24.0 +opencv-contrib-python==4.10.0.84 +optuna==3.6.1 +packaging==24.1 +pandas==2.2.2 +parameterized==0.9.0 +parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1712320355065/work +pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1706113125309/work +pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work +pillow==10.3.0 +pluggy==1.5.0 +portalocker==2.0.0 +poyo==0.5.0 +prompt_toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1718047967974/work +protobuf==4.25.3 +psutil==6.0.0 +ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1609419310487/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl +pure-eval @ 
file:///home/conda/feedstock_root/build_artifacts/pure_eval_1642875951954/work +py-cpuinfo==9.0.0 +py3nvml==0.2.7 +pyarrow==16.1.0 +pyarrow-hotfix==0.6 +pycparser==2.22 +pydantic==1.10.16 +pydantic_core==2.18.4 +Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1714846767233/work +pyparsing==3.1.2 +pytest==7.4.4 +pytest-rich==0.1.1 +pytest-timeout==2.3.1 +pytest-xdist==3.6.1 +python-dateutil==2.9.0.post0 +python-slugify==8.0.4 +pytz==2024.1 +PyYAML @ file:///home/conda/feedstock_root/build_artifacts/pyyaml_1695373428874/work +regex==2024.5.15 +requests==2.32.3 +rich==13.7.1 +rjieba==0.1.11 +rouge_score==0.1.2 +ruff==0.4.4 +s3transfer==0.10.1 +sacrebleu==1.5.1 +sacremoses==0.1.1 +safetensors==0.4.3 +scikit-learn==1.5.0 +scipy==1.13.1 +sentencepiece==0.2.0 +six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work +smmap==5.0.1 +snowballstemmer==2.2.0 +soupsieve==2.5 +Sphinx==7.3.7 +sphinx-rtd-theme==2.0.0 +sphinxcontrib-applehelp==1.0.8 +sphinxcontrib-devhelp==1.0.6 +sphinxcontrib-htmlhelp==2.0.5 +sphinxcontrib-jquery==4.1 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.7 +sphinxcontrib-serializinghtml==1.1.10 +SQLAlchemy==2.0.31 +stack-data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1669632077133/work +sympy==1.12.1 +tensorboard==2.17.0 +tensorboard-data-server==0.7.2 +text-unidecode==1.3 +threadpoolctl==3.5.0 +timeout-decorator==0.5.0 +timm==1.0.7 +tokenizers==0.19.1 +tomli==2.0.1 +torch==2.3.1 +torchaudio==0.13.1+cu117 +torchvision==0.14.1+cu117 +tqdm==4.66.4 +traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1713535121073/work +transformers @ file:///root/ninja/transformers +triton==2.3.1 +types-python-dateutil==2.9.0.20240316 +typing_extensions==4.12.2 +tzdata==2024.1 +urllib3==2.2.2 +wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1704731205417/work +Werkzeug==3.0.3 +xmltodict==0.13.0 +xxhash==3.4.1 +yappi==1.6.0 +yarl==1.9.4 \ No newline at end of 
file diff --git a/finetune/transformer.Dockerfile b/finetune/transformer.Dockerfile new file mode 100644 index 0000000..e38170e --- /dev/null +++ b/finetune/transformer.Dockerfile @@ -0,0 +1,70 @@ +FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 +LABEL maintainer="Hugging Face" + +ARG DEBIAN_FRONTEND=noninteractive + +# Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands) +SHELL ["sh", "-lc"] + +# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant +# to be used as arguments for docker build (so far). + +ARG PYTORCH='2.3.0' +# (not always a valid torch version) +ARG INTEL_TORCH_EXT='2.3.0' +# Example: `cu102`, `cu113`, etc. +ARG CUDA='cu121' + +RUN apt update +RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs +RUN git lfs install +RUN python3 -m pip install --no-cache-dir --upgrade pip + +ARG REF=main +RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF + +# 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future. +# 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`. +# Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions). 
+RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 tensorflow_text tensorflow_probability && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA + +RUN python3 -m pip uninstall -y flax jax + +RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT -f https://developer.intel.com/ipex-whl-stable-cpu + +RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract +RUN python3 -m pip install -U "itsdangerous<2.1.0" + +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft + +# For bettertransformer +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum + +# For video model testing +RUN python3 -m pip install --no-cache-dir decord av==9.2.0 + +# Some slow tests require bnb +RUN python3 -m pip install --no-cache-dir bitsandbytes + +# Some tests require quanto +RUN python3 -m pip install --no-cache-dir quanto + +# `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests +# (`deformable_detr`, `rwkv`, `mra`) +RUN python3 -m pip uninstall -y ninja + +# For `dinat` model +# The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent) +RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f 
https://shi-labs.com/natten/wheels + +# For `nougat` tokenizer +RUN python3 -m pip install --no-cache-dir python-Levenshtein + +# For `FastSpeech2ConformerTokenizer` tokenizer +RUN python3 -m pip install --no-cache-dir g2p-en + +# When installing in editable mode, `transformers` is not recognized as a package. +# this line must be added in order for python to be aware of transformers. +RUN cd transformers && python3 setup.py develop \ No newline at end of file diff --git a/finetune/update_sriov_vf.sh b/finetune/update_sriov_vf.sh new file mode 100755 index 0000000..677fe93 --- /dev/null +++ b/finetune/update_sriov_vf.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# 提取 Port GUID 并格式化为 SRIOV-VF 配置 +generate_sriov_vf_config() { + GUIDS=($(ibstat | grep "Port GUID" | awk '{print $3}')) + for i in "${!GUIDS[@]}"; do + echo "SRIOV-VF=${GUIDS[$i]}" + done +} + +# 更新 Docker 配置文件 +update_docker_config() { + local GUIDS=("$@") + local DAEMON_JSON="/etc/docker/daemon.json" + local TMP_JSON="/tmp/daemon.json" + + if [ ! -f "$DAEMON_JSON" ]; then + echo "$DAEMON_JSON 文件不存在" + exit 1 + fi + + local NODE_GENERIC_RESOURCES=$(jq -c '.["node-generic-resources"]' "$DAEMON_JSON") + + if [ "$NODE_GENERIC_RESOURCES" == "null" ]; then + NODE_GENERIC_RESOURCES="[]" + fi + + for GUID in "${GUIDS[@]}"; do + if [[ ! $NODE_GENERIC_RESOURCES == *"$GUID"* ]]; then + NODE_GENERIC_RESOURCES=$(echo "$NODE_GENERIC_RESOURCES" | jq --arg vf "$GUID" '. 
+= [$vf]')
        fi
    done

    # Write the updated array back atomically via a temp file.
    jq '.["node-generic-resources"] = '"$NODE_GENERIC_RESOURCES" "$DAEMON_JSON" > "$TMP_JSON"
    mv "$TMP_JSON" "$DAEMON_JSON"
}

# 主函数 — must run as root; collects SR-IOV VF GUIDs, merges them into the
# Docker daemon config, then restarts the daemon.
main() {
    # BUGFIX: DAEMON_JSON was only declared as a `local` inside
    # update_docker_config, so the success message below interpolated an
    # empty string; define the path here as well.
    local DAEMON_JSON="/etc/docker/daemon.json"

    if [[ $EUID -ne 0 ]]; then
        echo "此脚本必须以 root 用户运行"
        exit 1
    fi

    GUIDS=($(generate_sriov_vf_config))
    if [ ${#GUIDS[@]} -eq 0 ]; then
        echo "未找到 SR-IOV VF 设备"
        exit 1
    fi

    update_docker_config "${GUIDS[@]}"
    echo "成功更新 $DAEMON_JSON 文件"
    systemctl restart docker
}

main "$@"
diff --git a/ldh/.deepspeed_env b/ldh/.deepspeed_env
new file mode 100644
index 0000000..be22bac
--- /dev/null
+++ b/ldh/.deepspeed_env
@@ -0,0 +1,6 @@
CUDA_HOME=/usr/local/cuda
CUTLASS_PATH=/opt/cutlass
TORCH_CUDA_ARCH_LIST="8.0 9.0+PTX"
PATH=/opt/openmpi/bin:/usr/lib/jvm/default-java/bin:/usr/local/cuda/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
LD_LIBRARY_PATH=/opt/openmpi/lib:/usr/local/cuda/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs
diff --git a/ldh/Dockerfile b/ldh/Dockerfile
new file mode 100644
index 0000000..bf1a48f
--- /dev/null
+++ b/ldh/Dockerfile
@@ -0,0 +1,212 @@
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

ENV DEBIAN_FRONTEND="noninteractive"
ENV CUDA_HOME="/usr/local/cuda"
ENV JAVA_HOME="/usr/lib/jvm/default-java"
ENV CUTLASS_PATH="/opt/cutlass"
ENV CUTLASS_NVCC_ARCHS="80;90a"
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
ENV PYTORCH_CUDA_VERSION="cu124"
ENV TORCH_CUDA_ARCH_LIST="8.0 9.0+PTX"

ENV PATH=/opt/openmpi/bin:${CUDA_HOME}/bin:$JAVA_HOME/bin:${PATH}
ENV LD_LIBRARY_PATH=/opt/openmpi/lib:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
ENV LIBRARY_PATH=${CUDA_HOME}/lib64:${LIBRARY_PATH}

# SECURITY(review): this Hugging Face token is committed in plain text and is
# baked into the image (recoverable via `docker history` / any layer) — treat
# it as leaked and REVOKE it. Made overridable via build-arg as a stopgap so
# later `${HF_TOKEN}` consumers keep working; the proper fix is a BuildKit
# secret mount (`RUN --mount=type=secret,id=hf_token ...`) with no default.
ARG HF_TOKEN=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
ENV HF_TOKEN=${HF_TOKEN}

SHELL ["/bin/bash", "-c"]

WORKDIR /root

RUN \
    apt-get update && \
    apt-get install -y --no-install-recommends \
    software-properties-common build-essential 
autotools-dev \ + nfs-common pdsh \ + cmake g++ gcc \ + curl wget vim tmux emacs less unzip screen \ + htop iftop iotop ca-certificates openssh-client openssh-server \ + rsync iputils-ping net-tools sudo \ + llvm-dev \ + libsndfile-dev libcupti-dev libjpeg-dev libpng-dev \ + libaio-dev libnuma-dev && \ + apt-get update && \ + apt-get install -y \ + git python3 python3-pip ninja-build default-jre && \ + python3 -m pip install --upgrade pip wheel && \ + apt-get -y install antlr4 && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + +# # DOCA https://developer.nvidia.com/doca-archive +# RUN \ +# wget --quiet https://www.mellanox.com/downloads/DOCA/DOCA_v2.5.2/host/doca-host-repo-ubuntu2204_2.5.2-0.0.6.2.5.2003.1.23.10.3.2.2.0_amd64.deb -O /tmp/doca-host-repo-ubuntu2204_2.5.2-0.0.6.2.5.2003.1.23.10.3.2.2.0_amd64.deb && \ +# dpkg -i /tmp/doca-host-repo-ubuntu2204_2.5.2-0.0.6.2.5.2003.1.23.10.3.2.2.0_amd64.deb && \ +# apt-get update && \ +# apt-get -y install doca-runtime doca-sdk doca-tools + + +# cutlass https://github.com/NVIDIA/cutlass +RUN \ + git clone https://github.com/NVIDIA/cutlass.git /opt/cutlass && \ + cd /opt/cutlass && \ + git fetch --all --tags && \ + git checkout main && \ + git submodule update --init --recursive && \ + export CUDACXX=${CUDA_HOME}/bin/nvcc && \ + mkdir build && \ + cd build && \ + cmake .. -DCUTLASS_NVCC_ARCHS=${CUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON + # cmake .. 
-DCUTLASS_NVCC_ARCHS=${CUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=ON -DCUTLASS_LIBRARY_KERNELS=all -DCUTLASS_UNITY_BUILD_ENABLED=ON && \ + # make -j"$(nproc)" install + # make cutlass_profiler -j"$(nproc)" + # make test_unit -j"$(nproc)" VERBOSE=1 + + +# OPENMPI https://www.open-mpi.org/software/ompi/v4.1/ +RUN \ + wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz && \ + tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ + cd /tmp/openmpi-${OPENMPI_VERSION} && \ + ./configure --prefix=/opt/openmpi-${OPENMPI_VERSION} && \ + # ./configure --prefix=/opt/openmpi-${OPENMPI_VERSION} --with-cuda=/usr/local/cuda --enable-python-bindings --with-python=/usr/bin/python3 && \ + make -j$(nproc) && \ + make install && \ + ln -s /opt/openmpi-${OPENMPI_VERSION} /opt/openmpi && \ + # Sanity check: + test -f /opt/openmpi/bin/mpic++ && \ + cd /root && \ + rm -rf /tmp/* + + +# pytorch https://pytorch.org +RUN \ + python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} && \ + python3 -m pip install packaging pillow requests jinja2 triton networkx numpy tqdm urllib3 certifi setuptools --index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} + + +# Install apex with CUDA and C++ extensions https://github.com/NVIDIA/apex +# # if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... 
# pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# # otherwise
# pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./

# Build NVIDIA apex from source at a pinned tag so the CUDA/C++ extensions
# match the torch build installed above.
RUN \
    git clone https://github.com/NVIDIA/apex.git /tmp/apex && \
    cd /tmp/apex && \
    git fetch --all --tags && \
    git checkout tags/24.04.01 && \
    git submodule update --init --recursive && \
    python3 setup.py develop --cpp_ext --cuda_ext


# flash-attention https://github.com/Dao-AILab/flash-attention
# pip install flash-attn --no-build-isolation
# MAX_JOBS=4 pip install flash-attn --no-build-isolation
RUN \
    git clone https://github.com/Dao-AILab/flash-attention.git /tmp/flash-attention && \
    cd /tmp/flash-attention && \
    git submodule update --init --recursive && \
    python3 setup.py install
    # pytest -q -s tests/test_flash_attn.py
    # cd hopper
    # python3 setup.py install
    # export PYTHONPATH=$PWD
    # pytest -q -s test_flash_attn.py


# xformers https://github.com/facebookresearch/xformers
RUN \
    git clone https://github.com/facebookresearch/xformers.git /tmp/xformers && \
    cd /tmp/xformers && \
    git submodule update --init --recursive && \
    python3 -m pip install -v -U /tmp/xformers
    # python3 -m xformers.info


# TransformerEngine https://github.com/NVIDIA/TransformerEngine
RUN \
    git clone --branch stable https://github.com/NVIDIA/TransformerEngine.git /tmp/TransformerEngine && \
    cd /tmp/TransformerEngine && \
    git submodule update --init --recursive && \
    python3 setup.py install


# General Python ML stack (deliberately unpinned here; ldh/requirements.txt
# records the frozen versions of the resulting environment).
RUN \
    python3 -m pip install deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub trl optimum tokenizers && \
    python3 -m pip install packaging jinja2 triton networkx urllib3 certifi requests protobuf blobfile pytest && \
    python3 -m pip install regex tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy pillow scipy && \
    python3 -m pip install pyyaml ipython ipdb pydantic psutil yappi cffi py3nvml pyarrow graphviz astor boto3 msgpack ipykernel cython
RUN \
    python3 -m pip install zstandard nvitop pycocotools tensorboard tensor_parallel && \
# # https://github.com/mpi4py/mpi4py/issues/335
# rm /opt/conda/envs/${CONDA_ENV_NAME}/compiler_compat/ld && \
    python3 -m pip install mpi4py


# lm-eval https://github.com/EleutherAI/lm-evaluation-harness
# ENV ANTLR_VERSION=4.13.2
# wget -q -O /root/antlr-${ANTLR_VERSION}-complete.jar https://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar
RUN \
    python3 -m pip install immutabledict langdetect && \
    # BUGFIX: a stray empty continuation (`... punkt_tab && \` followed by a
    # bare `&& \`) made this RUN a shell syntax error and failed the build;
    # the dangling `&&` has been removed.
    python3 -m nltk.downloader popular punkt punkt_tab && \
    python3 -m pip install antlr4-python3-runtime==4.11 && \
    # NOTE(review): ${HF_TOKEN} comes from an ENV baked into the image; the
    # credential should be revoked and supplied via a BuildKit secret mount
    # (`RUN --mount=type=secret`) instead of living in a layer.
    huggingface-cli login --token ${HF_TOKEN} && \
    git clone https://github.com/EleutherAI/lm-evaluation-harness.git /root/lm-evaluation-harness && \
    cd /root/lm-evaluation-harness && \
    python3 -m pip install -e ".[dev]"


# Megatron-LM https://github.com/NVIDIA/Megatron-LM
RUN \
    git clone https://github.com/NVIDIA/Megatron-LM.git /root/Megatron-LM && \
    # BUGFIX: `cd` was missing, so the shell attempted to *execute* the
    # directory /root/Megatron-LM; checkout and editable install then ran in
    # the wrong working directory.
    cd /root/Megatron-LM && \
    git checkout core_r0.5.0 && \
    pip install --no-use-pep517 -e .
+ + +# SSH config +RUN \ + echo 'root:root' | chpasswd && \ + cp /etc/ssh/sshd_config /tmp/sshd_config && \ + echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config && \ + sed -i "s/#Port 22/Port 22222/" /etc/ssh/sshd_config && \ + sed -i "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ + sed -i "s/#PasswordAuthentication yes/PasswordAuthentication yes/" /etc/ssh/sshd_config && \ + sed -i "s/#PubkeyAuthentication yes/PubkeyAuthentication yes/" /etc/ssh/sshd_config && \ + sed -i "s/UsePAM yes/UsePAM no/" /etc/ssh/sshd_config && \ + sed -i "s/#StrictModes yes/StrictModes no/" /etc/ssh/sshd_config && \ + sed -i "s/# StrictHostKeyChecking ask/ StrictHostKeyChecking no/" /etc/ssh/ssh_config && \ + chown root:root /etc/ssh/sshd_config && \ + mkdir -p /run/sshd && chmod 0755 /run/sshd && \ + ssh-keygen -t rsa -f /root/.ssh/id_rsa -q -N "" && \ + cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys +# ENV config +RUN \ + unset https_proxy http_proxy && \ + echo "CUDA_HOME=${CUDA_HOME}" > ~/.deepspeed_env && \ + echo "CUTLASS_PATH=${CUTLASS_PATH}" >> ~/.deepspeed_env && \ + echo "TORCH_CUDA_ARCH_LIST=\"${TORCH_CUDA_ARCH_LIST}\"" >> ~/.deepspeed_env && \ + echo "PATH=${PATH}" >> ~/.deepspeed_env && \ + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ~/.deepspeed_env && \ + echo "LIBRARY_PATH=${LIBRARY_PATH}" >> ~/.deepspeed_env && \ + echo "export CUDA_HOME=${CUDA_HOME}" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \ + echo "export CUTLASS_PATH=${CUTLASS_PATH}" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \ + echo "export TORCH_CUDA_ARCH_LIST=\"${TORCH_CUDA_ARCH_LIST}\"" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \ + echo "export PATH=$PATH" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \ + echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \ + echo "export LIBRARY_PATH=$LIBRARY_PATH" | cat - 
~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc +# clean +RUN \ + cd ~ && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* && \ + rm -rf /tmp/* && \ + rm -rf /var/tmp/* && \ + rm -rf /root/.cache/pip diff --git a/ldh/compose.yml b/ldh/compose.yml new file mode 100644 index 0000000..1e09496 --- /dev/null +++ b/ldh/compose.yml @@ -0,0 +1,40 @@ + +services: + ldh-deepspeed-test: + build: + context: . + dockerfile: Dockerfile + # args: + # HTTP_PROXY: "http://127.0.0.1:15777" + # HTTPS_PROXY: "http://127.0.0.1:15777" + # cache-from: "type=local" + image: ldh/deepspeed:test + container_name: ldh-deepspeed-test + shm_size: '1024gb' + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + # runtime: nvidia + # ipc: host + pid: host + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + # stdin_open: true + # tty: true + # privileged: true + cap_add: + - ALL + volumes: + - /mnt/beegfs:/root/shared/beegfs + - /mnt/yrfs:/root/shared/yrfs + # ports: + # - "22242:22242" + # - "5000:5000" + network_mode: host + command: ["/usr/sbin/sshd", "-D"] + # command: ["/bin/bash", "-c", "while true; do sleep 1000; done"] diff --git a/ldh/requirements.txt b/ldh/requirements.txt new file mode 100644 index 0000000..ade6dea --- /dev/null +++ b/ldh/requirements.txt @@ -0,0 +1,238 @@ +absl-py==2.1.0 +accelerate==0.33.0 +aiohappyeyeballs==2.4.0 +aiohttp==3.10.5 +aiosignal==1.3.1 +annotated-types==0.7.0 +antlr4-python3-runtime==4.11.0 +apex @ file:///tmp/apex +astor==0.8.1 +asttokens==2.4.1 +async-timeout==4.0.3 +attrs==24.2.0 +blinker==1.4 +blis==0.7.11 +blobfile==2.1.1 +boto3==1.35.5 +botocore==1.35.5 +cachetools==5.5.0 +catalogue==2.0.10 +certifi==2022.12.7 +cffi==1.17.0 +cfgv==3.4.0 +chardet==5.2.0 +charset-normalizer==2.1.1 +click==8.0.3 +cloudpathlib==0.18.1 +colorama==0.4.4 +coloredlogs==15.0.1 +comm==0.2.2 +confection==0.1.5 +contourpy==1.2.1 +coverage==7.6.1 
+cryptography==3.4.8 +cycler==0.12.1 +cymem==2.0.8 +Cython==3.0.11 +DataProperty==1.0.1 +datasets==2.21.0 +dbus-python==1.2.18 +debugpy==1.8.5 +decorator==5.1.1 +deepspeed==0.15.0 +diffusers==0.30.1 +dill==0.3.8 +distlib==0.3.8 +distro==1.7.0 +docstring_parser==0.16 +einops==0.8.0 +evaluate==0.4.2 +exceptiongroup==1.2.2 +execnet==2.1.1 +executing==2.0.1 +filelock==3.13.1 +flash-attn==2.6.3 +fonttools==4.53.1 +frozenlist==1.4.1 +fsspec==2024.2.0 +graphviz==0.20.3 +grpcio==1.66.0 +hjson==3.1.0 +httplib2==0.20.2 +huggingface-hub==0.24.6 +humanfriendly==10.0 +identify==2.6.0 +idna==3.4 +immutabledict==4.2.0 +importlib-metadata==4.6.4 +iniconfig==2.0.0 +iotop==0.6 +ipdb==0.13.13 +ipykernel==6.29.5 +ipython==8.26.0 +jedi==0.19.1 +jeepney==0.7.1 +Jinja2==3.1.3 +jmespath==1.0.1 +joblib==1.4.2 +jsonlines==4.0.0 +jupyter_client==8.6.2 +jupyter_core==5.7.2 +keyring==23.5.0 +kiwisolver==1.4.5 +langcodes==3.4.0 +langdetect==1.0.9 +language_data==1.2.0 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +-e git+https://github.com/EleutherAI/lm-evaluation-harness.git@aab42ba836b4af28cc1c5c1e697ea334c6ea7ced#egg=lm_eval +lxml==4.9.4 +marisa-trie==1.2.0 +Markdown==3.7 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matplotlib==3.9.2 +matplotlib-inline==0.1.7 +mbstrdecoder==1.1.3 +mdurl==0.1.2 +meson==0.61.2 +more-itertools==8.10.0 +mpi4py==4.0.0 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.16 +murmurhash==1.0.10 +mypy==1.11.1 +mypy-extensions==1.0.0 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +nltk==3.9.1 +nodeenv==1.9.1 +numexpr==2.10.1 +numpy==1.26.3 +nvidia-cublas-cu12==12.4.2.65 +nvidia-cuda-cupti-cu12==12.4.99 +nvidia-cuda-nvrtc-cu12==12.4.99 +nvidia-cuda-runtime-cu12==12.4.99 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.0.44 +nvidia-curand-cu12==10.3.5.119 +nvidia-cusolver-cu12==11.6.0.99 +nvidia-cusparse-cu12==12.3.0.142 +nvidia-ml-py==12.535.161 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.4.99 
+nvidia-nvtx-cu12==12.4.99 +nvitop==1.3.2 +oauthlib==3.2.0 +optimum==1.21.4 +packaging==22.0 +pandas==2.2.2 +parso==0.8.4 +pathvalidate==3.2.1 +peft==0.12.0 +pexpect==4.9.0 +pillow==10.2.0 +platformdirs==4.2.2 +pluggy==1.5.0 +portalocker==2.10.1 +pre-commit==3.8.0 +preshed==3.0.9 +prompt_toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +py-cpuinfo==9.0.0 +py3nvml==0.2.7 +pyarrow==17.0.0 +pybind11==2.13.5 +pycocotools==2.0.8 +pycparser==2.22 +pycryptodomex==3.20.0 +pydantic==2.8.2 +pydantic_core==2.20.1 +pyelftools==0.27 +Pygments==2.18.0 +PyGObject==3.42.1 +PyJWT==2.3.0 +pyparsing==2.4.7 +pytablewriter==1.2.0 +pytest==8.3.2 +pytest-cov==5.0.0 +pytest-xdist==3.6.1 +python-apt==2.4.0+ubuntu3 +python-dateutil==2.9.0.post0 +pytz==2024.1 +PyYAML==5.4.1 +pyzmq==26.2.0 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rouge-score==0.1.2 +s3transfer==0.10.2 +sacrebleu==2.4.3 +safetensors==0.4.4 +scikit-learn==1.5.1 +scipy==1.14.1 +seaborn==0.13.2 +SecretStorage==3.3.1 +sentencepiece==0.2.0 +shellingham==1.5.4 +shtab==1.7.1 +six==1.16.0 +smart-open==7.0.4 +spacy==3.7.6 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +sqlitedict==2.1.0 +srsly==2.4.8 +stack-data==0.6.3 +sympy==1.12 +tabledata==1.3.3 +tabulate==0.9.0 +tcolorpy==0.1.6 +tensor-parallel==2.0.0 +tensorboard==2.17.1 +tensorboard-data-server==0.7.2 +termcolor==2.4.0 +thinc==8.2.5 +threadpoolctl==3.5.0 +tiktoken==0.7.0 +timm==1.0.9 +tokenizers==0.19.1 +tomli==2.0.1 +torch==2.4.0+cu124 +torchaudio==2.4.0+cu124 +torchvision==0.19.0+cu124 +tornado==6.4.1 +tqdm==4.66.5 +tqdm-multiprocess==0.0.11 +traitlets==5.14.3 +transformers==4.43.4 +triton==3.0.0 +trl==0.9.6 +typepy==1.3.2 +typer==0.12.4 +typing_extensions==4.9.0 +tyro==0.8.8 +tzdata==2024.1 +urllib3==1.26.13 +virtualenv==20.26.3 +wadllib==1.3.6 +wasabi==1.1.3 +wcwidth==0.2.13 +weasel==0.4.1 +Werkzeug==3.0.4 +word2number==1.1 +wrapt==1.16.0 +xformers @ file:///tmp/xformers +xmltodict==0.13.0 +xxhash==3.5.0 +yappi==1.6.0 +yarl==1.9.4 
+zipp==1.0.0 +zstandard==0.23.0 \ No newline at end of file diff --git a/megadna/Dockerfile b/megadna/Dockerfile new file mode 100644 index 0000000..ed7639e --- /dev/null +++ b/megadna/Dockerfile @@ -0,0 +1,137 @@ +ARG CUDA_VERSION=12.1.0 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ENV MAMBA_ROOT_PREFIX=~/micromamba +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +ENV PATH /opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH +WORKDIR /root +SHELL ["/bin/bash", "-c"] +# base tools +RUN <> ~/.bashrc +echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# 安装 micromamba +echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) +micromamba shell init -s bash -p ~/micromamba +cat <<'EOF' >> ~/.bashrc +source ~/micromamba/etc/profile.d/micromamba.sh +alias mamba=micromamba +alias mba=mamba +EOF +# 配置 .mambarc 文件 +cat < ~/.mambarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ENV REF='main' +ENV STAGE_DIR=/tmp +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.0 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.0 +ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.0 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG 
PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 +ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +RUN <