commit 873429d4e66cfc2d390422048a5dc7869e0431da Author: lingyuzeng Date: Wed Aug 28 15:18:15 2024 +0800 first add diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f5ddb54 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +*.tar +build_d/ +*_src +evo_src/ +megaDNA_src/ +evo/huggingface/ +*.zip +finetune/binbbt/ \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..eab1248 --- /dev/null +++ b/README.md @@ -0,0 +1,16 @@ + + +## 预训练 + +GitHub - huggingface/transformers: 🤗 Transformers: State-of-the-art Machine Learning for Pytorch, TensorFlow, and JAX. +GitHub - microsoft/DeepSpeed: DeepSpeed is a deep learning optimization library that makes distributed training and inference easy, efficient, and effective. +GitHub - huggingface/peft: 🤗 PEFT: State-of-the-art Parameter-Efficient Fine-Tuning. +GitHub - huggingface/accelerate: 🚀 A simple way to launch, train, and use PyTorch models on almost any device and distributed configuration, automatic mixed precision (including fp8), and easy-to-configure FSDP and DeepSpeed support + +```shell +# torch +https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile +https://github.com/huggingface/transformers/blob/main/docker/transformers-all-latest-gpu/Dockerfile +https://github.com/huggingface/peft/tree/main/docker/peft-gpu-bnb-source +https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +``` \ No newline at end of file diff --git a/bgpt/Dockerfile.bgpt b/bgpt/Dockerfile.bgpt new file mode 100644 index 0000000..e32f214 --- /dev/null +++ b/bgpt/Dockerfile.bgpt @@ -0,0 +1,76 @@ +# syntax=docker/dockerfile:1 +FROM nvidia/cuda:11.6.1-devel-ubuntu20.04 +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ENV MAMBA_ROOT_PREFIX=~/micromamba +WORKDIR /root +SHELL ["/bin/bash", "-c"] +COPY requirements.txt /root/ +RUN <> /etc/ssh/sshd_config +echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config +echo 
"PubkeyAuthentication yes" >> /etc/ssh/sshd_config +echo "Port 22" >> /etc/ssh/sshd_config +mkdir /var/run/sshd +echo 'root:cdcdocker' | chpasswd +mkdir -p ~/.pip +# install miniconda +wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh +bash /tmp/miniconda.sh -b -p /opt/conda +rm /tmp/miniconda.sh +conda init bash +ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh +echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc +echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# 安装 micromamba 并配置 mambarc +echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) +micromamba shell init -s bash -p ~/micromamba +cat <<'EOF' >> ~/.bashrc +source ~/micromamba/etc/profile.d/micromamba.sh +alias mamba=micromamba +alias mba=mamba +EOF +# 配置 .mambarc 文件 +cat < ~/.mambarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia +EOF +mkdir -p ~/.pip +echo " +[global] +index-url = https://mirrors.aliyun.com/pypi/simple/ + +[install] +trusted-host=mirrors.aliyun.com +" >> ~/.pip/pip.conf +micromamba create -n bgpt -c conda-forge python=3.7.9 -y +micromamba run -n bgpt pip install -r requirements.txt +micromamba run -n bgpt pip install ipykernel attrs seaborn +micromamba run -n bgpt python -m ipykernel install --user --name="bgpt" --display-name="bgpt_env" +micromamba run -n bgpt pip install seaborn attrs torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 +echo "micromamba activate bgpt" >> ~/.bashrc +EOT + +# Expose SSH port +EXPOSE 3222 + +# Keep the container running +CMD ["/usr/sbin/sshd", "-D"] diff --git a/bgpt/docker-compose-bgpt.yml b/bgpt/docker-compose-bgpt.yml new file mode 100644 index 0000000..2599b51 --- 
/dev/null +++ b/bgpt/docker-compose-bgpt.yml @@ -0,0 +1,32 @@ +version: '3.8' + +services: + ubuntu-ssh: + build: + context: . + dockerfile: Dockerfile.bgpt + volumes: + - /data:/data + container_name: ubuntu-ssh + pull_policy: if_not_present + tty: true + restart: unless-stopped + image: zly/cuda-bgpt:latest + ports: + - 3222:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + networks: + - network_bgpt + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +networks: + network_bgpt: + name: network_bgpt diff --git a/bgpt/requirements.txt b/bgpt/requirements.txt new file mode 100644 index 0000000..2b819fa Binary files /dev/null and b/bgpt/requirements.txt differ diff --git a/dcgm-exporter/Dockerfile.grafana b/dcgm-exporter/Dockerfile.grafana new file mode 100644 index 0000000..c908baf --- /dev/null +++ b/dcgm-exporter/Dockerfile.grafana @@ -0,0 +1,30 @@ +# syntax=docker/dockerfile:1 + +ARG GRAFANA_VERSION="9.5.2" + +FROM ubuntu:22.04 + +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ARG HTTP_PROXY +ARG HTTPS_PROXY +ENV http_proxy=${HTTP_PROXY} +ENV https_proxy=${HTTPS_PROXY} + +# 替换 sources.list 文件以使用阿里云镜像源 +# RUN sed -i 's|http://archive.ubuntu.com/ubuntu/|http://mirrors.aliyun.com/ubuntu/|g' /etc/apt/sources.list && \ +# sed -i 's|http://security.ubuntu.com/ubuntu|http://mirrors.aliyun.com/ubuntu|g' /etc/apt/sources.list + +# 安装必要的工具和库 +RUN apt-get update && \ + apt-get install -y wget vim bash ca-certificates + +RUN wget https://dl.grafana.com/oss/release/grafana_${GRAFANA_VERSION}_amd64.deb && \ + dpkg -i grafana_${GRAFANA_VERSION}_amd64.deb && \ + rm grafana_${GRAFANA_VERSION}_amd64.deb + +COPY grafana.ini /etc/grafana/grafana.ini + +EXPOSE 3000 + +CMD ["/usr/sbin/grafana-server", "--config=/etc/grafana/grafana.ini"] diff --git a/dcgm-exporter/Dockerfile.prometheus b/dcgm-exporter/Dockerfile.prometheus new file mode 100644 index 
0000000..44a7a70 --- /dev/null +++ b/dcgm-exporter/Dockerfile.prometheus @@ -0,0 +1,32 @@ +# syntax=docker/dockerfile:1 + +ARG PROMETHEUS_VERSION="2.45.6" + +FROM ubuntu:22.04 + +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ARG HTTP_PROXY +ARG HTTPS_PROXY +ENV http_proxy=${HTTP_PROXY} +ENV https_proxy=${HTTPS_PROXY} + +# 替换 sources.list 文件以使用阿里云镜像源 +# RUN sed -i 's|http://archive.ubuntu.com/ubuntu/|http://mirrors.aliyun.com/ubuntu/|g' /etc/apt/sources.list && \ +# sed -i 's|http://security.ubuntu.com/ubuntu|http://mirrors.aliyun.com/ubuntu|g' /etc/apt/sources.list + + +# 安装必要的工具和库 +RUN apt-get update && \ + apt-get install -y wget vim bash ca-certificates + +RUN wget https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz && \ + tar xvfz prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz && \ + mv prometheus-${PROMETHEUS_VERSION}.linux-amd64 /opt/prometheus && \ + rm prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz + +COPY prometheus.yml /opt/prometheus/prometheus.yml + +EXPOSE 9090 + +CMD ["/opt/prometheus/prometheus", "--config.file=/opt/prometheus/prometheus.yml"] diff --git a/dcgm-exporter/README.md b/dcgm-exporter/README.md new file mode 100644 index 0000000..166e59e --- /dev/null +++ b/dcgm-exporter/README.md @@ -0,0 +1,7 @@ + +构建和运行 +使用 Docker Compose 构建和运行容器: + +```shell +docker-compose up --build -d +``` \ No newline at end of file diff --git a/dcgm-exporter/docker-compose.yml b/dcgm-exporter/docker-compose.yml new file mode 100644 index 0000000..b994086 --- /dev/null +++ b/dcgm-exporter/docker-compose.yml @@ -0,0 +1,34 @@ +version: '3.8' + +services: + prometheus: + build: + context: . 
+ dockerfile: Dockerfile.prometheus + args: + PROMETHEUS_VERSION: "2.45.6" + HTTP_PROXY: "http://localhost:15777" + HTTPS_PROXY: "http://localhost:15777" + image: zly/prometheus:latest + container_name: prometheus + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/opt/prometheus/prometheus.yml + restart: unless-stopped + + grafana: + build: + context: . + dockerfile: Dockerfile.grafana + args: + GRAFANA_VERSION: "9.5.2" + HTTP_PROXY: "http://localhost:15777" + HTTPS_PROXY: "http://localhost:15777" + image: zly/grafana:latest + container_name: grafana + ports: + - "3000:3000" + volumes: + - ./grafana.ini:/etc/grafana/grafana.ini + restart: unless-stopped diff --git a/dcgm-exporter/grafana.ini b/dcgm-exporter/grafana.ini new file mode 100644 index 0000000..ce0b534 --- /dev/null +++ b/dcgm-exporter/grafana.ini @@ -0,0 +1,6 @@ +[server] +http_port = 3000 + +[security] +admin_user = admin +admin_password = grafana diff --git a/dcgm-exporter/prometheus.yml b/dcgm-exporter/prometheus.yml new file mode 100644 index 0000000..ab388ea --- /dev/null +++ b/dcgm-exporter/prometheus.yml @@ -0,0 +1,11 @@ +global: + scrape_interval: 15s + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'dcgm-exporter' + static_configs: + - targets: ['127.0.0.1:9400'] diff --git a/evo/Dockerfile b/evo/Dockerfile new file mode 100644 index 0000000..4591208 --- /dev/null +++ b/evo/Dockerfile @@ -0,0 +1,139 @@ +ARG CUDA_VERSION=12.1.0 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ENV MAMBA_ROOT_PREFIX=~/micromamba +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +ENV PATH /opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH +WORKDIR /root +SHELL ["/bin/bash", "-c"] +# base tools +RUN <> ~/.bashrc 
+echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# 安装 micromamba +echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) +micromamba shell init -s bash -p ~/micromamba +cat <<'EOF' >> ~/.bashrc +source ~/micromamba/etc/profile.d/micromamba.sh +alias mamba=micromamba +alias mba=mamba +EOF +# 配置 .mambarc 文件 +cat < ~/.mambarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ENV REF='main' +ENV STAGE_DIR=/tmp +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.0 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.0 +ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.0 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 +ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +RUN <= 23.0. 
+# +# For reference: +# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel +ARG CUDA_VERSION=12.1.0 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04 +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ENV MAMBA_ROOT_PREFIX=~/micromamba +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +WORKDIR /root +SHELL ["/bin/bash", "-c"] +# base tools +RUN <> ~/.bashrc +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# 安装 micromamba +# echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) +# micromamba shell init -s bash -p ~/micromamba +# cat <<'EOF' >> ~/.bashrc +# source ~/micromamba/etc/profile.d/micromamba.sh +# EOF +# # 配置 .mambarc 文件 +# cat < ~/.mambarc +# channels: +# - conda-forge +# - bioconda +# - pytorch +# - pytorch-nightly +# - nvidia +# - defaults +# show_channel_urls: true +# EOF +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH +ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3" +ENV REF='main' +ENV STAGE_DIR=/tmp +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.1 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.1 +ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.1 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 
+ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +ARG USE_CUDA=1 +ENV USE_CUDA=${USE_CUDA} +ARG USE_ROCM=0 +ENV USE_ROCM=${USE_ROCM} +ARG USE_XPU=0 +ENV USE_XPU=${USE_XPU} +ARG _GLIBCXX_USE_CXX11_ABI=1 +ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI} +RUN <> ~/.bashrc +which python > ~/python_path.txt +conda activate ${CONDA_ENV_NAME} +# 克隆 ninja 源码并编译 +git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja +cd ${STAGE_DIR}/ninja +# 克隆 GoogleTest 源码 +git clone https://github.com/google/googletest.git +python ./configure.py --bootstrap +# 配置并构建 Ninja 测试,添加 pthread 链接选项 +# CXXFLAGS="-pthread" LDFLAGS="-pthread" ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest +conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest" +./ninja all +# 运行 Ninja 单元测试 +./ninja_test +python3 -m pip install --no-cache-dir --upgrade pip +python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python +conda clean -afy +git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. +python -m pip install setuptools==${SETUPTOOLS_VERSION} +python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) 
+# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) +python3 -m pip uninstall -y torch torchvision torchaudio +# # install pytorch create conda env aleay exists +python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA} +python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +python3 -m pip uninstall -y transformer-engine +python3 -m pip uninstall -y torch-tensorrt +python3 -m pip uninstall -y apex +EOT + +# install apex +RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... +MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ +python -c "import apex.amp; print('Apex is installed and the amp module is available.')" +cd .. +rm -rf ${STAGE_DIR}/apex +EOT + +RUN <&1 +# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail +# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile +# install deepspeed prepare +# install Mellanox OFED +mkdir -p ${STAGE_DIR} +wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - +cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 +./mlnxofedinstall --user-space-only --without-fw-update --all -q +cd ${STAGE_DIR} +rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* +cd .. 
+# install nv_peer_mem +rm -rf ${STAGE_DIR} +mkdir -p ${STAGE_DIR} +git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory +cd ${STAGE_DIR}/nv_peer_memory +./build_module.sh +cd ${STAGE_DIR} +tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz +cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} +apt-get update +apt-get install -y dkms +dpkg-buildpackage -us -uc +dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb +EOT + +# install mpi +ENV PATH=/usr/local/mpi/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +RUN < /usr/local/mpi/bin/mpirun +echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun +chmod a+x /usr/local/mpi/bin/mpirun +EOT + +# SSH daemon port inside container cannot conflict with host OS port +ENV SSH_PORT=2222 +RUN < ${STAGE_DIR}/sshd_config && \ +sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config +EOT + +# 29.78 Usage: install.sh [options...] +# 29.78 +# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in +# 29.78 hostfile (hostfile: /job/hostfile). 
If no hostfile exists, will only install locally +# 29.78 +# 29.78 [optional] +# 29.78 -l, --local_only Install only on local machine +# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) +# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) +# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels +# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) +# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) +# 29.78 -e, --examples Checkout deepspeed example submodule (no install) +# 29.78 -v, --verbose Verbose logging +# 29.78 -h, --help This help text + +RUN <> /etc/sudoers +EOT + +# install cutlass https://github.com/NVIDIA/cutlass +# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) +# A100: architecture is Ampere +# V100: architecture is Volta +# T4: architecture is Turing +# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc +# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 +# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 +# 80:适用于 NVIDIA Ampere 架构(如 A100)。 +# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 +# 89:GeForce RTX 4090 +ARG DCUTLASS_NVCC_ARCHS="80;89;90a" +ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} +RUN < install_modified.sh +chmod +x ./install_modified.sh +# 检查 HOSTFILE_CONTENT 并写入文件 +if [ -n "${HOSTFILE_CONTENT}" ]; then + echo "${HOSTFILE_CONTENT}" > /tmp/hostfile + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile" +else + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}" +fi +eval $INSTALL_CMD +# compile deepspeed ops +cat <<'EOF' >> ~/.bashrc +source ~/micromamba/etc/profile.d/micromamba.sh +echo "alias mamba=micromamba" >> ~/.bashrc +echo "alias mba=mamba" >> ~/.bashrc +EOF +# 配置 .mambarc 文件 +cat < ~/compile_deepspeed_ops.py +import deepspeed + +def compile_ops(): + builders = [ + 
deepspeed.ops.op_builder.AsyncIOBuilder, + deepspeed.ops.op_builder.FusedAdamBuilder, + deepspeed.ops.op_builder.CPUAdamBuilder, + deepspeed.ops.op_builder.CPUAdagradBuilder, + deepspeed.ops.op_builder.CPULionBuilder, + deepspeed.ops.op_builder.EvoformerAttnBuilder, + deepspeed.ops.op_builder.FPQuantizerBuilder, + deepspeed.ops.op_builder.FusedLambBuilder, + deepspeed.ops.op_builder.FusedLionBuilder, + deepspeed.ops.op_builder.QuantizerBuilder, + deepspeed.ops.op_builder.RaggedOpsBuilder, + deepspeed.ops.op_builder.RandomLTDBuilder, + deepspeed.ops.op_builder.SparseAttnBuilder, + deepspeed.ops.op_builder.SpatialInferenceBuilder, + deepspeed.ops.op_builder.TransformerBuilder, + deepspeed.ops.op_builder.StochasticTransformerBuilder, + ] + + for builder in builders: + print(f"Compiling {builder.__name__}") + builder().load() + +if __name__ == "__main__": + compile_ops() +EOF +python compile_deepspeed_ops.py +ds_report +# clean up +# rm -f deepspeed/git_version_info_installed.py +# rm -rf dist build deepspeed.egg-info +# python setup.py bdist_wheel +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt +# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0 +EOT + +# install transformers and flash-attn +RUN <= 23.0. 
+# +# For reference: +# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel +ARG CUDA_VERSION=12.1.0 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ENV MAMBA_ROOT_PREFIX=~/micromamba +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +WORKDIR /root +SHELL ["/bin/bash", "-c"] +# base tools +RUN <> ~/.bashrc +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# 安装 micromamba +# echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) +# micromamba shell init -s bash -p ~/micromamba +# cat <<'EOF' >> ~/.bashrc +# source ~/micromamba/etc/profile.d/micromamba.sh +# EOF +# # 配置 .mambarc 文件 +# cat < ~/.mambarc +# channels: +# - conda-forge +# - bioconda +# - pytorch +# - pytorch-nightly +# - nvidia +# - defaults +# show_channel_urls: true +# EOF +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH +ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3" +ENV REF='main' +ENV STAGE_DIR=/tmp +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.1 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.1 +ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.1 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 
+ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +ARG USE_CUDA=1 +ENV USE_CUDA=${USE_CUDA} +ARG USE_ROCM=0 +ENV USE_ROCM=${USE_ROCM} +ARG USE_XPU=0 +ENV USE_XPU=${USE_XPU} +ARG _GLIBCXX_USE_CXX11_ABI=1 +ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI} +RUN <> ~/.bashrc +which python > ~/python_path.txt +conda activate ${CONDA_ENV_NAME} +# 克隆 ninja 源码并编译 +git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja +cd ${STAGE_DIR}/ninja +# 克隆 GoogleTest 源码 +git clone https://github.com/google/googletest.git +python ./configure.py --bootstrap +# 配置并构建 Ninja 测试,添加 pthread 链接选项 +# CXXFLAGS="-pthread" LDFLAGS="-pthread" ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest +conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest" +./ninja all +# 运行 Ninja 单元测试 +./ninja_test +python3 -m pip install --no-cache-dir --upgrade pip +python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python +conda clean -afy +git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. +python -m pip install setuptools==${SETUPTOOLS_VERSION} +python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) 
+# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) +python3 -m pip uninstall -y torch torchvision torchaudio +# # install pytorch create conda env aleay exists +python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA} +python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +python3 -m pip uninstall -y transformer-engine +python3 -m pip uninstall -y torch-tensorrt +python3 -m pip uninstall -y apex +EOT + +# install apex +RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... +MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ +python -c "import apex.amp; print('Apex is installed and the amp module is available.')" +cd .. +rm -rf ${STAGE_DIR}/apex +EOT + +RUN <&1 +# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail +# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile +# install deepspeed prepare +# install Mellanox OFED +mkdir -p ${STAGE_DIR} +wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - +cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 +./mlnxofedinstall --user-space-only --without-fw-update --all -q +cd ${STAGE_DIR} +rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* +cd .. 
+# install nv_peer_mem +rm -rf ${STAGE_DIR} +mkdir -p ${STAGE_DIR} +git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory +cd ${STAGE_DIR}/nv_peer_memory +./build_module.sh +cd ${STAGE_DIR} +tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz +cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} +apt-get update +apt-get install -y dkms +dpkg-buildpackage -us -uc +dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb +EOT + +# install mpi +ENV PATH=/usr/local/mpi/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +RUN < /usr/local/mpi/bin/mpirun +echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun +chmod a+x /usr/local/mpi/bin/mpirun +EOT + +# SSH daemon port inside container cannot conflict with host OS port +ENV SSH_PORT=2222 +RUN < ${STAGE_DIR}/sshd_config && \ +sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config +EOT + +# 29.78 Usage: install.sh [options...] +# 29.78 +# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in +# 29.78 hostfile (hostfile: /job/hostfile). 
If no hostfile exists, will only install locally +# 29.78 +# 29.78 [optional] +# 29.78 -l, --local_only Install only on local machine +# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) +# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) +# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels +# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) +# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) +# 29.78 -e, --examples Checkout deepspeed example submodule (no install) +# 29.78 -v, --verbose Verbose logging +# 29.78 -h, --help This help text + +RUN <> /etc/sudoers +EOT + +# install cutlass https://github.com/NVIDIA/cutlass +# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) +# A100: architecture is Ampere +# V100: architecture is Volta +# T4: architecture is Turing +# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc +# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 +# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 +# 80:适用于 NVIDIA Ampere 架构(如 A100)。 +# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 +# 89:GeForce RTX 4090 +ARG DCUTLASS_NVCC_ARCHS="80;89;90a" +ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} +RUN < install_modified.sh +chmod +x ./install_modified.sh +# 检查 HOSTFILE_CONTENT 并写入文件 +if [ -n "${HOSTFILE_CONTENT}" ]; then + echo "${HOSTFILE_CONTENT}" > /tmp/hostfile + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile" +else + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}" +fi +eval $INSTALL_CMD +# compile deepspeed ops +cat <<'EOF' >> ~/.bashrc +source ~/micromamba/etc/profile.d/micromamba.sh +echo "alias mamba=micromamba" >> ~/.bashrc +echo "alias mba=mamba" >> ~/.bashrc +EOF +# 配置 .mambarc 文件 +cat < ~/compile_deepspeed_ops.py +import deepspeed + +def compile_ops(): + builders = [ + 
deepspeed.ops.op_builder.AsyncIOBuilder, + deepspeed.ops.op_builder.FusedAdamBuilder, + deepspeed.ops.op_builder.CPUAdamBuilder, + deepspeed.ops.op_builder.CPUAdagradBuilder, + deepspeed.ops.op_builder.CPULionBuilder, + deepspeed.ops.op_builder.EvoformerAttnBuilder, + deepspeed.ops.op_builder.FPQuantizerBuilder, + deepspeed.ops.op_builder.FusedLambBuilder, + deepspeed.ops.op_builder.FusedLionBuilder, + deepspeed.ops.op_builder.QuantizerBuilder, + deepspeed.ops.op_builder.RaggedOpsBuilder, + deepspeed.ops.op_builder.RandomLTDBuilder, + deepspeed.ops.op_builder.SparseAttnBuilder, + deepspeed.ops.op_builder.SpatialInferenceBuilder, + deepspeed.ops.op_builder.TransformerBuilder, + deepspeed.ops.op_builder.StochasticTransformerBuilder, + ] + + for builder in builders: + print(f"Compiling {builder.__name__}") + builder().load() + +if __name__ == "__main__": + compile_ops() +EOF +python compile_deepspeed_ops.py +ds_report +# clean up +# rm -f deepspeed/git_version_info_installed.py +# rm -rf dist build deepspeed.egg-info +# python setup.py bdist_wheel +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt +# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0 +EOT + +# install transformers and flash-attn +RUN <> ~/.bashrc && \ + echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc && \ + /bin/bash -c "source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME}" + + +# install cutlass https://github.com/NVIDIA/cutlass +ENV DCUTLASS_NVCC_ARCHS="80;89;90;90a" +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + git clone https://github.com/NVIDIA/cutlass /opt/cutlass && \ + cd /opt/cutlass && \ + git checkout . && \ + git checkout main && \ + mkdir build && \ + cd build && \ + cmake .. 
-DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON && \ + make -j"$(nproc)" install + + +# Mellanox OFED +# ENV MLNX_OFED_VERSION=5.8-5.1.1.2 +ENV MLNX_OFED_VERSION=23.10-3.2.2.0 +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + apt-get install -y libnuma-dev libnvidia-compute-515 && \ + # apt-get install -y libnuma-dev libnvidia-compute-535 && \ + cd ${STAGE_DIR} && \ + wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - && \ + cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \ + ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \ + cd ${STAGE_DIR} && \ + rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64* + + +# nv_peer_mem +ENV NV_PEER_MEM_VERSION=1.2 +# ENV NV_PEER_MEM_VERSION=1.3 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + mkdir -p ${STAGE_DIR} && \ + git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \ + cd ${STAGE_DIR}/nv_peer_memory && \ + ./build_module.sh && \ + cd ${STAGE_DIR} && \ + tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \ + cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \ + apt-get update && \ + apt-get install -y dkms && \ + dpkg-buildpackage -us -uc && \ + dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb + + +# OPENMPI +# ENV OPENMPI_BASEVERSION=4.1 +# ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ENV OPENMPI_BASEVERSION=5.0 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.3 +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + cd ${STAGE_DIR} && \ + wget -q -O - 
https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \ + cd openmpi-${OPENMPI_VERSION} && \ + ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \ + make -j"$(nproc)" install && \ + ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \ + # Sanity check: + test -f /usr/local/mpi/bin/mpic++ && \ + cd ${STAGE_DIR} && \ + rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} +ENV PATH=/usr/local/mpi/bin:${PATH} \ + LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +# Create a wrapper for OpenMPI to allow running as root by default +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \ + echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \ + echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \ + chmod a+x /usr/local/mpi/bin/mpirun + + +ENV PYTORCH_VERSION=2.3.0 +ENV TORCHVISION_VERSION=0.18.0 +ENV TORCHAUDIO_VERSION=2.3.0 +ENV PYTORCH_CUDA_VERSION='cu121' + +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + pip install torch==${PYTORCH_VERSION}+${PYTORCH_CUDA_VERSION} torchvision==${TORCHVISION_VERSION}+${PYTORCH_CUDA_VERSION} torchaudio==${TORCHAUDIO_VERSION}+${PYTORCH_CUDA_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} && \ + pip install packaging && \ + pip install flash-attn + +# Install apex with CUDA and C++ extensions +# pip --version | grep -q "pip 23.1" && \ +# (pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./) || \ +# (pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./) && \ +RUN \ + source 
/opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + git clone https://github.com/NVIDIA/apex /tmp/apex && \ + cd /tmp/apex && \ + pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \ + python -c "import apex.amp; print('Apex is installed and the amp module is available.')" && \ + cd / && \ + rm -rf /tmp/apex + +# RUN \ +# source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ +# git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed && \ +# cd ${STAGE_DIR}/DeepSpeed && \ +# git checkout ${DEEPSPEED_VERSION} && \ +# sed 's/pip install/python -m pip install/' install.sh > install_modified.sh && \ +# chmod +x ./install_modified.sh && \ +# if [ -n "${HOSTFILE_CONTENT}" ]; then \ +# echo "${HOSTFILE_CONTENT}" > /tmp/hostfile && \ +# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"; \ +# else \ +# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"; \ +# fi && \ +# eval $INSTALL_CMD && \ +# ds_report + +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + pip install deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub && \ + pip install regex tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy Pillow blobfile requests scipy pycocotools protobuf pyyaml ipython ipdb psutil pydantic + + +RUN \ + echo 'root:root' | chpasswd && \ + cp /etc/ssh/sshd_config /tmp/sshd_config && \ + echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config && \ + sed -i "s/#Port 22/Port 22242/" /etc/ssh/sshd_config && \ + sed -i "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ + sed -i "s/#PasswordAuthentication yes/PasswordAuthentication yes/" /etc/ssh/sshd_config && \ + sed -i "s/#PubkeyAuthentication 
yes/PubkeyAuthentication yes/" /etc/ssh/sshd_config && \ + sed -i "s/UsePAM yes/UsePAM no/" /etc/ssh/sshd_config && \ + chown root:root /etc/ssh/sshd_config && \ + mkdir -p /run/sshd && chmod 0755 /run/sshd + +# RUN \ +# bash -c 'echo -e "export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"\nexport CUTLASS_PATH=${CUTLASS_PATH}\nexport CUDA_HOME=${CUDA_HOME}\nexport PATH=${PATH}\nexport LD_LIBRARY_PATH=${LD_LIBRARY_PATH}\n" | cat - ~/.bashrc > temp && mv temp ~/.bashrc' + diff --git a/finetune/Dockerfile.ngc b/finetune/Dockerfile.ngc new file mode 100644 index 0000000..bfd89cb --- /dev/null +++ b/finetune/Dockerfile.ngc @@ -0,0 +1,162 @@ +ARG REGISTRY=quay.io +ARG OWNER=jupyter +ARG LABEL=notebook +ARG VERSION +ARG BASE_CONTAINER=$REGISTRY/$OWNER/$LABEL:$VERSION +FROM $BASE_CONTAINER +ARG HTTP_PROXY +ARG HTTPS_PROXY +ENV http_proxy=${HTTP_PROXY} +ENV https_proxy=${HTTPS_PROXY} +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +WORKDIR /root +SHELL ["/bin/bash", "-c"] + +# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/ +ENV MLNX_OFED_VERSION=23.10-3.2.2.0 +ENV STAGE_DIR=/tmp +RUN < ${STAGE_DIR}/mlnxofedinstall.log 2>&1 +cd ${STAGE_DIR} +rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64* +EOT + +ARG NV_PEER_MEM_VERSION="1.2" +ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION} +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +RUN <=0.17.0 +python -m pip install --no-deps git+https://github.com/huggingface/optimum.git#egg=optimum[diffusers,quality] +python -m pip install evaluate datasets +EOT + +RUN < ~/.deepspeed_env +TORCH_USE_CUDA_DSA=1 +DEEPSPEED_VERBOSE=1 +DEEPSPEED_LOG_LEVEL=DEBUG +CUTLASS_PATH=${CUTLASS_PATH} +TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} +CUDA_HOME=${CUDA_HOME} +LD_LIBRARY_PATH=${LD_LIBRARY_PATH} +EOF +unset https_proxy http_proxy +EOT + +CMD ["/usr/sbin/sshd", "-D"] \ No newline at end of file diff --git 
a/finetune/Dockerfile.update b/finetune/Dockerfile.update new file mode 100644 index 0000000..9d5f049 --- /dev/null +++ b/finetune/Dockerfile.update @@ -0,0 +1,427 @@ + +# NOTE: Building this image require's docker version >= 23.0. +# +# For reference: +# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel +ARG CUDA_VERSION=12.1.0 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04 +ARG HTTP_PROXY +ARG HTTPS_PROXY +ENV http_proxy=${HTTP_PROXY} +ENV https_proxy=${HTTPS_PROXY} +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +WORKDIR /root +SHELL ["/bin/bash", "-c"] +# base tools +RUN <> ~/.bashrc +. /opt/conda/etc/profile.d/conda.sh +conda init bash +conda config --set show_channel_urls true +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# install pixi +curl -fsSL https://pixi.sh/install.sh | bash +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH +ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3" +ENV REF='main' +ENV STAGE_DIR=/tmp +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.1 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.1 +ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.1 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +ARG USE_CUDA=1 +ENV USE_CUDA=${USE_CUDA} +ARG USE_ROCM=0 +ENV USE_ROCM=${USE_ROCM} 
+ARG USE_XPU=0 +ENV USE_XPU=${USE_XPU} +ARG _GLIBCXX_USE_CXX11_ABI=1 +ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI} +RUN <> ~/.bashrc +conda activate ${CONDA_ENV_NAME} +python3 -m pip install --no-cache-dir --upgrade pip +python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python +conda clean -afy +git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. +python -m pip install setuptools==${SETUPTOOLS_VERSION} +python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) +# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) +python3 -m pip uninstall -y torch torchvision torchaudio +# # install pytorch create conda env aleay exists +# 直接将 PyTorch 安装指引 中的 https://download.pytorch.org/whl 替换为 https://mirror.sjtu.edu.cn/pytorch-wheels 即可。 +python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA} +python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +python3 -m pip uninstall -y transformer-engine +python3 -m pip uninstall -y torch-tensorrt +python3 -m pip uninstall -y apex +EOT + +# install apex TORCH_CUDA_ARCH_LIST all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0" +ARG TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" +ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} +RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... +MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ +python -c "import apex.amp; print('Apex is installed and the amp module is available.')" +cd .. 
+rm -rf ${STAGE_DIR}/apex +EOT + +# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/ +ENV MLNX_OFED_VERSION=23.10-3.2.2.0 +RUN <&1 +# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail +# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile +# install deepspeed prepare +# install Mellanox OFED +mkdir -p ${STAGE_DIR} +wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - +cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 +./mlnxofedinstall --user-space-only --without-fw-update --all -q +cd ${STAGE_DIR} +rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64* +EOT + +ARG NV_PEER_MEM_VERSION="1.2" +ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION} +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +RUN < /usr/local/mpi/bin/mpirun +echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun +chmod a+x /usr/local/mpi/bin/mpirun +EOT + +# SSH daemon port inside container cannot conflict with host OS port +ENV SSH_PORT=2222 +RUN < ${STAGE_DIR}/sshd_config && \ +sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config +EOT + +# 29.78 Usage: install.sh [options...] +# 29.78 +# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in +# 29.78 hostfile (hostfile: /job/hostfile). 
If no hostfile exists, will only install locally +# 29.78 +# 29.78 [optional] +# 29.78 -l, --local_only Install only on local machine +# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) +# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) +# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels +# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) +# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) +# 29.78 -e, --examples Checkout deepspeed example submodule (no install) +# 29.78 -v, --verbose Verbose logging +# 29.78 -h, --help This help text + +RUN <> /etc/sudoers +EOT + +# install cutlass https://github.com/NVIDIA/cutlass +# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) +# A100: architecture is Ampere +# V100: architecture is Volta +# T4: architecture is Turing +# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc +# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 +# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 +# 80:适用于 NVIDIA Ampere 架构(如 A100)。 +# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 +# 89:GeForce RTX 4090 +ARG DCUTLASS_NVCC_ARCHS="80;89;90a" +ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} +RUN < install_modified.sh +chmod +x ./install_modified.sh +# 检查 HOSTFILE_CONTENT 并写入文件 +if [ -n "${HOSTFILE_CONTENT}" ]; then + echo "${HOSTFILE_CONTENT}" > /tmp/hostfile + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile" +else + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}" +fi +eval $INSTALL_CMD +# compile deepspeed ops +ds_report +# clean up +# rm -f deepspeed/git_version_info_installed.py +# rm -rf dist build deepspeed.egg-info +# python setup.py bdist_wheel +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt +# pip 
install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0 +EOT + +# install transformers and flash-attn +RUN < ~/.deepspeed_env +NCCL_IB_DISABLE=${NCCL_IB_DISABLE} +NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME} +NCCL_DEBUG=INFO +CUTLASS_PATH=${CUTLASS_PATH} +CUDA_HOME=${CUDA_HOME} +EOF +#CUDA_VISIBLE_DEVICES=0,1,2,3 +#OMP_NUM_THREADS=8 +#MASTER_ADDR=192.168.1.1 +#MASTER_PORT=12345 +EOT + +CMD ["/usr/sbin/sshd", "-D"] diff --git a/finetune/Dockfile-colosial b/finetune/Dockfile-colosial new file mode 100644 index 0000000..0d28277 --- /dev/null +++ b/finetune/Dockfile-colosial @@ -0,0 +1,46 @@ +FROM hpcaitech/cuda-conda:12.1 + +# metainformation +LABEL org.opencontainers.image.source = "https://github.com/hpcaitech/ColossalAI" +LABEL org.opencontainers.image.licenses = "Apache License 2.0" +LABEL org.opencontainers.image.base.name = "docker.io/library/hpcaitech/cuda-conda:12.1" + +# enable passwordless ssh +RUN mkdir ~/.ssh && \ + printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config && \ + ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \ + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys + +# enable RDMA support +RUN apt-get update && \ + apt-get install -y infiniband-diags perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# install torch +RUN conda install -y python==3.10 && conda install -y pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia + +# install ninja +RUN apt-get update && \ + apt-get install -y --no-install-recommends ninja-build && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# install apex +RUN git clone https://github.com/NVIDIA/apex && \ + cd apex && \ + git checkout a7de60 && \ + pip install packaging && \ + pip install -v --disable-pip-version-check 
--no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ + +# install colossalai +ARG VERSION=main +RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \ + && cd ./ColossalAI \ + && BUILD_EXT=1 pip install -v . \ + && rm -rf colossalai + +# install tensornvme +RUN conda install -y cmake && \ + apt update -y && apt install -y libaio-dev && \ + pip install -v git+https://github.com/hpcaitech/TensorNVMe.git diff --git a/finetune/README.md b/finetune/README.md new file mode 100644 index 0000000..9a57dcf --- /dev/null +++ b/finetune/README.md @@ -0,0 +1,350 @@ +## deepspeed docker image build + +```shell +docker-compose -f docker-compose_pytorch1.13.yml build +docker-compose -f docker-compose_pytorch2.3.yml build +``` + +## 英伟达显卡安装卸载驱动 + +卸载 + +```shell +cd /usr/local/cuda +ll +cd .. +cd cuda-12.3/ +ll +cd bin/ +ll +./cuda-uninstaller +cd ~ +nvidia-uninstall +sudo modprobe -r nvidia-drm nvidia-modeset nvidia-uvm nvidia +sudo rm -rf /usr/lib64/nvidia /usr/lib/nvidia +sudo apt autoremove nvidia* +sudo apt clean all +sudo dracut --force +sudo reboot +``` + +安装 + +```shell +wget https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64/nvidia-fabricmanager-555_555.42.06-1_amd64.deb +dpkg -i nvidia-fabricmanager-555_555.42.06-1_amd64.deb +wget https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/cuda_12.5.1_555.42.06_linux.run +ll +sudo sh cuda_12.5.1_555.42.06_linux.run +echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc && echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc && source /root/.bashrc +nvcc -V +nvidia-smi +nvidia-smi -pm 1 +modprobe nvidia_peermem +nvidia-smi +modinfo nvidia_peermem +lsmod | grep nvidia_peermem +systemctl mask apt-daily-upgrade.service +systemctl mask apt-daily-upgrade.timer +systemctl disable apt-daily-upgrade.timer +systemctl disable apt-daily-upgrade.service +ll +wget 
https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64/nvidia-fabricmanager-555_555.42.06-1_amd64.deb +dpkg -i nvidia-fabricmanager-555_555.42.06-1_amd64.deb +sudo systemctl start nvidia-fabricmanager +sudo systemctl status nvidia-fabricmanager +``` + +## 镜像测试命令 + +docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all ldh/deepspeed:test +docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/deepspeed:pt23_update +docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/pytorch:24.06-py3 /bin/bash + +pip3 install -U xformers --index-url https://mirror.sjtu.edu.cn/pytorch-wheels +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +pip3 install -U xformers --index-url https://pypi.tuna.tsinghua.edu.cn/simple + + +```shell + 1 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers + 2 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers + 3 curl -ksSL http://120.232.240.71:8887/linux/install.sh | bash + 4 pigchacli + 5 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777 + 6 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777 + 7 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers + 8 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers + 9 python -c "from xformers import ops as xops" + 10 python -c "import apex.amp; print('Apex is installed and the amp module is available.')" + 11 env + 12 pip install git+https://github.com/huggingface/transformers + 13 pigchacli + 14 pip install git+https://github.com/huggingface/transformers + 15 pip list + 16 export STAGE_DIR=/tmp + 17 git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL + 18 cd 
${STAGE_DIR}/oneCCL + 19 git checkout . + 20 git checkout master + 21 mkdir build + 22 cd build + 23 cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local + 24 make -j"$(nproc)" install + 25 ls + 26 echo ${CUDA_ARCH_LIST} + 27 git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels + 28 cd ${STAGE_DIR}/DeepSpeed-Kernels + 29 python -m pip install -v . + 30 env + 31 python -m pip install -v . + 32 git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed + 33 cd ${STAGE_DIR}/DeepSpeed + 34 export DEEPSPEED_VERSION="v0.14.3" + 35 git checkout ${DEEPSPEED_VERSION} + 36 ls + 37 ./install.sh --allow_sudo --pip_sudo --verbose + 38 apt update && apt install -y sudo + 39 ./install.sh --allow_sudo --pip_sudo --verbose +``` + +```shell +nvidia-smi +nvcc -V +ninja --version +ds_report +python -c "import torch; print('torch:', torch.__version__, torch)" +python -c "import torch; print('CUDA available:', torch.cuda.is_available())" +python -c "import deepspeed; deepspeed.ops.op_builder.CPUAdamBuilder().load()" +python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func" +python -c "import apex.amp; print('Apex is installed and the amp module is available.')" +python -c "from xformers import ops as xops" +ibstat +ofed_info -s +mst version +mpirun --version +``` + +```shell +cat < ~/compile_deepspeed_ops.py +import deepspeed + +def compile_ops(): + builders = [ + deepspeed.ops.op_builder.AsyncIOBuilder, + deepspeed.ops.op_builder.FusedAdamBuilder, + deepspeed.ops.op_builder.CPUAdamBuilder, + deepspeed.ops.op_builder.CPUAdagradBuilder, + deepspeed.ops.op_builder.CPULionBuilder, + deepspeed.ops.op_builder.EvoformerAttnBuilder, + deepspeed.ops.op_builder.FPQuantizerBuilder, + deepspeed.ops.op_builder.FusedLambBuilder, + deepspeed.ops.op_builder.FusedLionBuilder, + deepspeed.ops.op_builder.QuantizerBuilder, + deepspeed.ops.op_builder.RaggedOpsBuilder, + deepspeed.ops.op_builder.RandomLTDBuilder, + 
deepspeed.ops.op_builder.SparseAttnBuilder, + deepspeed.ops.op_builder.SpatialInferenceBuilder, + deepspeed.ops.op_builder.TransformerBuilder, + deepspeed.ops.op_builder.StochasticTransformerBuilder, + ] + + for builder in builders: + print(f"Compiling {builder.__name__}") + builder().load() + +if __name__ == "__main__": + compile_ops() +EOF +python compile_deepspeed_ops.py +``` + +## 配置vscode的docker的插件 + +[nerdctl配置](https://blog.csdn.net/margu_168/article/details/139822555) + + + +```shell +cat << 'EOF' > /usr/local/bin/docker +#!/bin/bash +exec nerdctl "$@" +EOF +chmod +x /usr/local/bin/docker +``` + +nerdctl bash自动补全 + +```shell +apt update +apt install bash-completion -y +nerdctl completion bash > /etc/bash_completion.d/nerdctl +nerdctl completion bash > /etc/bash_completion.d/docker +source /etc/bash_completion.d/nerdctl +source /etc/bash_completion.d/docker +``` + +## 物理机更新内核 + +```shell +uname -r # 5.4.0-144-generic +lsb_release -a +sudo apt-get update # This will update the repositories list +sudo apt-get upgrade # This will update all the necessary packages on your system +sudo apt-get dist-upgrade # This will add/remove any needed packages +reboot # You may need this since sometimes after a upgrade/dist-upgrade, there are some left over entries that get fixed after a reboot +sudo apt-get install linux-headers-$(uname -r) # This should work now +``` + +## test command + +```shell +docker run -it --gpus all --name deepspeed_test --shm-size=1gb --rm hotwa/deepspeed:latest /bin/bash +``` + +## [查询GPU 架构 给变量赋值](https://blog.csdn.net/zong596568821xp/article/details/106411024) + +```shell +git clone https://github.com/NVIDIA-AI-IOT/deepstream_tlt_apps.git +cd deepstream_tlt_apps/TRT-OSS/x86 +nvcc deviceQuery.cpp -o deviceQuery +./deviceQuery +``` + +H100 输出 + +```shell +(base) root@node19:~/bgpt/deepstream_tlt_apps/TRT-OSS/x86# ./deviceQuery +Detected 8 CUDA Capable device(s) + +Device 0: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 
10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 1: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 2: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 3: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 4: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 5: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 6: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 7: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +``` + + +## DeepSpeed hostfile 分发 + +要手动分发 hostfile 并进行分布式安装,你需要以下几个步骤: + +1. 准备 hostfile +确保 hostfile 文件包含所有参与的主机及其配置。 + +示例 hostfile 内容: + +```plaintext +host1 slots=4 +host2 slots=4 +host3 slots=8 +``` + +2. 确保 SSH 配置正确 +确保你能够通过 SSH 无密码登录到所有主机。可以使用 ssh-keygen 和 ssh-copy-id 配置 SSH 密钥。 + +生成 SSH 密钥(如果尚未生成): + +```shell +ssh-keygen -t rsa +``` + +将 SSH 公钥复制到每个主机: + +```shell +ssh-copy-id user@host1 +ssh-copy-id user@host2 +ssh-copy-id user@host3 +``` + +3. 创建临时目录并复制 wheel 文件 +在所有主机上创建一个临时目录,用于存放分发的 wheel 文件。 + +```shell +export PDSH_RCMD_TYPE=ssh +hosts=$(cat /path/to/your/hostfile | awk '{print $1}' | paste -sd ",") +tmp_wheel_path="/tmp/deepspeed_wheels" + +pdsh -w $hosts "mkdir -pv ${tmp_wheel_path}" +pdcp -w $hosts dist/deepspeed*.whl ${tmp_wheel_path}/ +pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/ +``` + +4. 
在每个主机上安装 DeepSpeed 和依赖项 +在所有主机上安装 DeepSpeed 和所需的依赖项。 + +```shell +pdsh -w $hosts "pip install ${tmp_wheel_path}/deepspeed*.whl" +pdsh -w $hosts "pip install -r ${tmp_wheel_path}/requirements.txt" +``` + +5. 清理临时文件 +安装完成后,删除所有主机上的临时文件。 + +```shell +pdsh -w $hosts "rm -rf ${tmp_wheel_path}" +``` + +详细步骤 +确保 SSH 配置正确: + +```shell +ssh-keygen -t rsa +ssh-copy-id user@host1 +ssh-copy-id user@host2 +ssh-copy-id user@host3 +``` + +创建临时目录并复制文件: + +```shell +export PDSH_RCMD_TYPE=ssh +hosts=$(cat /path/to/your/hostfile | awk '{print $1}' | paste -sd ",") +tmp_wheel_path="/tmp/deepspeed_wheels" + +pdsh -w $hosts "mkdir -pv ${tmp_wheel_path}" +pdcp -w $hosts dist/deepspeed*.whl ${tmp_wheel_path}/ +pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/ +``` + +在所有主机上安装 DeepSpeed 和依赖项: + +```shell +pdsh -w $hosts "pip install ${tmp_wheel_path}/deepspeed*.whl" +pdsh -w $hosts "pip install -r ${tmp_wheel_path}/requirements.txt" +``` + +清理临时文件: + +```shell +pdsh -w $hosts "rm -rf ${tmp_wheel_path}" +``` + +通过这些步骤,你可以手动分发 hostfile 并在多个主机上安装 DeepSpeed 和其依赖项。这种方法确保了每个主机的环境配置一致,从而支持分布式训练或部署。 \ No newline at end of file diff --git a/finetune/accelerate-gpu-deepspeed.Dockerfile b/finetune/accelerate-gpu-deepspeed.Dockerfile new file mode 100644 index 0000000..d35fc1b --- /dev/null +++ b/finetune/accelerate-gpu-deepspeed.Dockerfile @@ -0,0 +1,46 @@ +# Builds GPU docker image of PyTorch specifically +# Uses multi-staged approach to reduce size +# Stage 1 +# Use base conda image to reduce time +FROM continuumio/miniconda3:latest AS compile-image +# Specify py version +# Note: DeepSpeed beyond v0.12.6 requires py 3.10 +ENV PYTHON_VERSION=3.10 +# Install apt libs +RUN apt-get update && \ + apt-get install -y curl git wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Create our conda env +RUN conda create --name accelerate python=${PYTHON_VERSION} ipython jupyter pip +# We don't install pytorch here yet since CUDA isn't available +# instead we use the direct torch 
wheel +ENV PATH /opt/conda/envs/accelerate/bin:$PATH +# Activate our bash shell +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] +# Activate the conda env, install mpy4pi, and install torch + accelerate +RUN source activate accelerate && conda install -c conda-forge mpi4py +RUN source activate accelerate && \ + python3 -m pip install --no-cache-dir \ + git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers,deepspeed] \ + --extra-index-url https://download.pytorch.org/whl/cu117 + +RUN python3 -m pip install --no-cache-dir bitsandbytes + +# Stage 2 +FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 AS build-image +COPY --from=compile-image /opt/conda /opt/conda +ENV PATH /opt/conda/bin:$PATH + +# Install apt libs +RUN apt-get update && \ + apt-get install -y curl git wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +RUN echo "source activate accelerate" >> ~/.profile + +# Activate the virtualenv +CMD ["/bin/bash"] \ No newline at end of file diff --git a/finetune/binbbt.tar.gz b/finetune/binbbt.tar.gz new file mode 100644 index 0000000..b878c9c Binary files /dev/null and b/finetune/binbbt.tar.gz differ diff --git a/finetune/configure_gpu.sh b/finetune/configure_gpu.sh new file mode 100755 index 0000000..2494aee --- /dev/null +++ b/finetune/configure_gpu.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# 提取GPU UUID +GPU_UUIDS=$(nvidia-smi -a | grep 'GPU UUID' | awk '{print $4}') + +# 生成node-generic-resources JSON片段 +NODE_RESOURCES=$(echo "$GPU_UUIDS" | awk '{print "\"NVIDIA-GPU=" $1 "\","}' | tr -d '\n') +NODE_RESOURCES=${NODE_RESOURCES%,} # 移除最后一个逗号 + +# 生成完整的daemon.json内容 +DAEMON_JSON=$(cat < /dev/null + +# 添加swarm-resource配置 +# swarm-resource = "DOCKER_RESOURCE_NVIDIAGPU" + +sudo sed -i '/^#.*swarm-resource/s/^#//' /etc/nvidia-container-runtime/config.toml +sudo sed -i '/swarm-resource =/s/=.*/= "DOCKER_RESOURCE_GPU"/' /etc/nvidia-container-runtime/config.toml + +# 重启Docker服务 +sudo systemctl restart docker.service + +# 验证配置 +docker info 
| grep -i 'nvidia' + +echo "GPU UUIDs have been configured and Docker has been restarted." diff --git a/finetune/deepspeed.Dockerfile b/finetune/deepspeed.Dockerfile new file mode 100644 index 0000000..fecb0c7 --- /dev/null +++ b/finetune/deepspeed.Dockerfile @@ -0,0 +1,184 @@ +FROM nvidia/cuda:12.2.2-devel-ubuntu20.04 + +ENV DEBIAN_FRONTEND noninteractive + +############################################################################## +# Temporary Installation Directory +############################################################################## +ENV STAGE_DIR=/tmp +RUN mkdir -p ${STAGE_DIR} + +############################################################################## +# Installation/Basic Utilities +############################################################################## +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + software-properties-common build-essential autotools-dev \ + nfs-common pdsh \ + cmake g++ gcc \ + curl wget vim tmux emacs less unzip \ + htop iftop iotop ca-certificates openssh-client openssh-server \ + rsync iputils-ping net-tools sudo \ + llvm-dev + +############################################################################## +# Installation Latest Git +############################################################################## +RUN add-apt-repository ppa:git-core/ppa -y && \ + apt-get update && \ + apt-get install -y git && \ + git --version + +############################################################################## +# Client Liveness & Uncomment Port 22 for SSH Daemon +############################################################################## +# Keep SSH client alive from server side +RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config +RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \ + sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config + +############################################################################## +# Mellanox OFED 
+############################################################################## +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 +RUN apt-get install -y libnuma-dev +RUN cd ${STAGE_DIR} && \ + wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - && \ + cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \ + ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \ + cd ${STAGE_DIR} && \ + rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* + +############################################################################## +# nv_peer_mem +############################################################################## +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +RUN mkdir -p ${STAGE_DIR} && \ + git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \ + cd ${STAGE_DIR}/nv_peer_memory && \ + ./build_module.sh && \ + cd ${STAGE_DIR} && \ + tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \ + cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \ + apt-get update && \ + apt-get install -y dkms && \ + dpkg-buildpackage -us -uc && \ + dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb + +############################################################################## +# OPENMPI +############################################################################## +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +RUN cd ${STAGE_DIR} && \ + wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \ + cd openmpi-${OPENMPI_VERSION} && \ + ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \ + make -j"$(nproc)" install && \ + ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \ + # Sanity 
check: + test -f /usr/local/mpi/bin/mpic++ && \ + cd ${STAGE_DIR} && \ + rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} +ENV PATH=/usr/local/mpi/bin:${PATH} \ + LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \ + echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \ + echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \ + chmod a+x /usr/local/mpi/bin/mpirun + +############################################################################## +# Python +############################################################################## +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHON_VERSION=3 +RUN apt-get install -y python3 python3-dev && \ + rm -f /usr/bin/python && \ + ln -s /usr/bin/python3 /usr/bin/python && \ + curl -O https://bootstrap.pypa.io/pip/3.6/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py && \ + pip install --upgrade pip && \ + # Print python an pip version + python -V && pip -V +RUN pip install pyyaml +RUN pip install ipython + +############################################################################## +# Some Packages +############################################################################## +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + libsndfile-dev \ + libcupti-dev \ + libjpeg-dev \ + libpng-dev \ + screen \ + libaio-dev +RUN pip install psutil \ + yappi \ + cffi \ + ipdb \ + pandas \ + matplotlib \ + py3nvml \ + pyarrow \ + graphviz \ + astor \ + boto3 \ + tqdm \ + sentencepiece \ + msgpack \ + requests \ + pandas \ + sphinx \ + sphinx_rtd_theme \ + scipy \ + numpy \ + scikit-learn \ + nvidia-ml-py3 \ + mpi4py + +############################################################################## +## SSH daemon port inside container cannot conflict with host OS port 
+############################################################################### +ENV SSH_PORT=2222 +RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \ + sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config + +############################################################################## +# PyTorch +############################################################################## +ENV PYTORCH_VERSION=1.13.0 +RUN pip install torch==${PYTORCH_VERSION} + +############################################################################## +# PyYAML build issue +# https://stackoverflow.com/a/53926898 +############################################################################## +RUN rm -rf /usr/lib/python3/dist-packages/yaml && \ + rm -rf /usr/lib/python3/dist-packages/PyYAML-* + +############################################################################## +## Add deepspeed user +############################################################################### +# Add a deepspeed user with user id 8877 +#RUN useradd --create-home --uid 8877 deepspeed +RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed +RUN usermod -aG sudo deepspeed +RUN echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers +# # Change to non-root privilege +USER deepspeed + +############################################################################## +# DeepSpeed +############################################################################## +RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed +RUN cd ${STAGE_DIR}/DeepSpeed && \ + git checkout . 
&& \ + git checkout master && \ + ./install.sh --pip_sudo +RUN rm -rf ${STAGE_DIR}/DeepSpeed +RUN python -c "import deepspeed; print(deepspeed.__version__)" \ No newline at end of file diff --git a/finetune/docker-compose.yml b/finetune/docker-compose.yml new file mode 100644 index 0000000..52da7ba --- /dev/null +++ b/finetune/docker-compose.yml @@ -0,0 +1,33 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + volumes: + - ./src:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + tty: true + restart: unless-stopped + image: hotwa/finetune:test + shm_size: '32gb' + ports: + - 3227:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + networks: + - network_finetune + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +networks: + network_finetune: + name: network_finetune diff --git a/finetune/docker-compose_ldh.yml b/finetune/docker-compose_ldh.yml new file mode 100644 index 0000000..ffefef1 --- /dev/null +++ b/finetune/docker-compose_ldh.yml @@ -0,0 +1,57 @@ + +services: + ldh-deepspeed-test: + build: + context: . 
+ dockerfile: Dockerfile.ldh + args: + # PYTHON_VERSION: "3.10" + # CUDA_VERSION: "12.1.0" + # PYTORCH_VERSION: "2.3.0" + # TORCHVISION_VERSION: "0.18.0" + # TORCHAUDIO_VERSION: "2.3.0" + # DS_BUILD_OPS: 1 + # USE_CUDA: 1 + # USE_ROCM: 0 + # USE_XPU: 0 + # CUDA: cu121 + # CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + # SETUPTOOLS_VERSION: "69.5.1" + # DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 + # DEEPSPEED_VERSION: "master" + # DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + HTTP_PROXY: "http://127.0.0.1:15777" + HTTPS_PROXY: "http://127.0.0.1:15777" + # cache-from: "type=local" + image: ldh/deepspeed:test + shm_size: '128gb' + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + #runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + # stdin_open: true + # tty: true + privileged: true + cap_add: + - IPC_LOCK + volumes: + - /root/workspace:/root/data + - /dev/infiniband:/dev/infiniband + # ports: + # - "22242:22242" + # - "5000:5000" + # networks: + # - ldh_overlay_network + network_mode: host + command: ["/usr/sbin/sshd", "-D"] + +# networks: +# ldh_overlay_network: +# external: true diff --git a/finetune/docker-compose_m_d.yml b/finetune/docker-compose_m_d.yml new file mode 100644 index 0000000..f82ad3f --- /dev/null +++ b/finetune/docker-compose_m_d.yml @@ -0,0 +1,35 @@ + +services: + ldh-megatron-deepspeed-test: + image: hotwa/magadeep:latest + shm_size: '128gb' + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + #runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + # stdin_open: true + # tty: true + privileged: true + cap_add: + - IPC_LOCK + volumes: + - /root/workspace:/root/data + - /dev/infiniband:/dev/infiniband + # ports: + # - "22242:22242" + # - "5000:5000" + # networks: + # - 
ldh_overlay_network + network_mode: host + command: ["/usr/sbin/sshd", "-D"] + +# networks: +# ldh_overlay_network: +# external: true diff --git a/finetune/docker-compose_mega.yml b/finetune/docker-compose_mega.yml new file mode 100644 index 0000000..adeb72a --- /dev/null +++ b/finetune/docker-compose_mega.yml @@ -0,0 +1,38 @@ + +services: + megatron-test: + image: nvcr.io/nvidia/pytorch:24.02-py3 + shm_size: '560gb' + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + #runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + #- CUTLASS_PATH="/opt/cutlass" + #- CUDA_HOME="/usr/local/cuda" + #- PATH="${CUDA_HOME}/bin:${PATH}" + #- LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}" + stdin_open: true + tty: true + privileged: true + cap_add: + - IPC_LOCK + volumes: + - /root/workspace:/mnt + - /dev/infiniband:/dev/infiniband + # - /mnt/local-nvme:/root/ + ports: + - "5000:5000" + # networks: + # - ldh_overlay_network + network_mode: host + +# networks: +# ldh_overlay_network: +# external: true diff --git a/finetune/docker-compose_nccl.yml b/finetune/docker-compose_nccl.yml new file mode 100644 index 0000000..e3ce1ad --- /dev/null +++ b/finetune/docker-compose_nccl.yml @@ -0,0 +1,28 @@ +version: '3.8' +# https://github.com/mayooot/build-nccl-tests-with-pytorch +services: + nccl-test-container: + image: mayooot/nccl-tests-with-pytorch:v0.0.2 + container_name: nccl-test-container + network_mode: host + environment: + - PORT=1998 + - PASS=P@88w0rd + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + volumes: + - ./id_rsa_finetune:/root/.ssh/id_rsa + - ./id_rsa.pub:/root/.ssh/id_rsa.pub + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + cap_add: + - IPC_LOCK + devices: + - /dev/infiniband:/dev/infiniband + shm_size: '32gb' + restart: unless-stopped diff --git 
a/finetune/docker-compose_ngc.yml b/finetune/docker-compose_ngc.yml new file mode 100644 index 0000000..5d173f9 --- /dev/null +++ b/finetune/docker-compose_ngc.yml @@ -0,0 +1,72 @@ +version: '3.9' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 专门用于Transformer模型的推理优化。 +# CPUAdam - 针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 +# StochasticTransformer - 支持随机Transformer模型的训练和推理。 + +# 检测系统总内存(以GB为单位) +# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo) +# echo "Docker Compose 文件已生成,shm_size 设置为 ${TOTAL_MEM}GB。" + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile.ngc + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + REGISTRY: "nvcr.io" + OWNER: "nvidia" # nvcr.io/nvidia/pytorch:24.06-py3 + LABEL: "pytorch" + VERSION: "24.06-py3" + DS_BUILD_OPS: 1 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + HTTP_PROXY: "http://127.0.0.1:15777" + HTTPS_PROXY: "http://127.0.0.1:15777" + CACHEBUST: 1 + # volumes: + # - ./workspace:/workspace + # - /tmp:/tmp + container_name: ubuntu-ngc + pull_policy: if_not_present + ulimits: + memlock: + soft: -1 + hard: -1 + # tty: true + # stdin_open: true + restart: unless-stopped + image: quay.io/hotwa/ngc:latest + privileged: true + ipc: host + network_mode: host + shm_size: '128gb' + # ports: + # - 3228:2222 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + # networks: + # - network_finetune + # command: ["/usr/sbin/sshd", "-D"] + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +# networks: +# 
network_finetune: +# name: network_finetune diff --git a/finetune/docker-compose_pytorch1.13.yml b/finetune/docker-compose_pytorch1.13.yml new file mode 100644 index 0000000..e39c544 --- /dev/null +++ b/finetune/docker-compose_pytorch1.13.yml @@ -0,0 +1,52 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: 3.9 + CUDA_VERSION: 11.7.1 + PYTORCH_VERSION: 1.13.1 + TORCHVISION_VERSION: 0.14.1 + TORCHAUDIO_VERSION: 0.13.1 + DS_BUILD_OPS: 1 + DS_BUILD_SPARSE_ATTN: 1 + DS_BUILD_FUSED_ADAM: 1 + DS_BUILD_CPU_ADAM: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu117 + CUDA_ARCH_LIST: "80;86" # for RTX 4090, all : "80;86;89;90" 编译deepspeed内核需要,这个参数很严格 + SETUPTOOLS_VERSION: "69.5.1" + ROOT_PASSWD: "root" + DCUTLASS_NVCC_ARCHS: "90a" # 90a for H100 ,89:GeForce RTX 4090 + volumes: + - ./src:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + tty: true + restart: unless-stopped + image: hotwa/deepspeed:pt113 + shm_size: '32gb' + ports: + - 3227:2222 + command: ["/usr/sbin/sshd", "-D"] + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + networks: + - network_finetune + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +networks: + network_finetune: + name: network_finetune diff --git a/finetune/docker-compose_pytorch2.3.yml b/finetune/docker-compose_pytorch2.3.yml new file mode 100644 index 0000000..4390e55 --- /dev/null +++ b/finetune/docker-compose_pytorch2.3.yml @@ -0,0 +1,65 @@ +version: '3.8' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 
专门用于Transformer模型的推理优化。 +# CPUAdam - 针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 +# StochasticTransformer - 支持随机Transformer模型的训练和推理。 + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + HTTP_PROXY: "http://127.0.0.1:15777" + HTTPS_PROXY: "http://127.0.0.1:15777" + volumes: + - ./src:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + # tty: true + restart: unless-stopped + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - network_finetune + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +networks: + network_finetune: + name: network_finetune diff --git a/finetune/docker-compose_pytorch2.34060.yml b/finetune/docker-compose_pytorch2.34060.yml new file mode 100644 index 0000000..52d11be --- /dev/null +++ b/finetune/docker-compose_pytorch2.34060.yml @@ -0,0 +1,63 @@ +version: '3.8' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 专门用于Transformer模型的推理优化。 +# CPUAdam - 
针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 +# StochasticTransformer - 支持随机Transformer模型的训练和推理。 + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90" # 90a for H100 GPU 89:GeForce RTX 4090 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - ./src:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + # tty: true + restart: unless-stopped + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - network_finetune + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +networks: + network_finetune: + name: network_finetune diff --git a/finetune/docker-compose_pytorch2.3_device.yml b/finetune/docker-compose_pytorch2.3_device.yml new file mode 100644 index 0000000..b9752d6 --- /dev/null +++ b/finetune/docker-compose_pytorch2.3_device.yml @@ -0,0 +1,71 @@ +version: '3.8' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 专门用于Transformer模型的推理优化。 +# CPUAdam - 针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 
+# StochasticTransformer - 支持随机Transformer模型的训练和推理。 + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - ./src:/bbtft + - ./id_rsa_finetune:/root/.ssh/id_rsa + - ./id_rsa.pub:/root/.ssh/id_rsa.pub + container_name: ubuntu-finetune + pull_policy: if_not_present + # tty: true + restart: unless-stopped + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - my-custom-bridge + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + cap_add: + - IPC_LOCK + devices: + - /dev/infiniband:/dev/infiniband +# docker swarm init +# docker swarm join-token manager +# docker network create -d overlay --subnet=192.168.200.0/24 my-overlay-network +networks: + my-custom-bridge: + external: true diff --git a/finetune/docker-compose_stack.yml b/finetune/docker-compose_stack.yml new file mode 100644 index 0000000..ef9aa2b --- /dev/null +++ b/finetune/docker-compose_stack.yml @@ -0,0 +1,58 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . 
+ dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - ./src:/bbtft + - ./id_rsa_finetune:/root/.ssh/id_rsa + - ./id_rsa.pub:/root/.ssh/id_rsa.pub + container_name: ubuntu-finetune + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - my-custom-bridge + deploy: + replicas: 2 + resources: + reservations: + generic_resources: + - discrete_resource_spec: + kind: "NVIDIA-GPU" + value: 1 + placement: + constraints: [node.platform.os == linux] + cap_add: + - IPC_LOCK + devices: + - /dev/infiniband:/dev/infiniband + +networks: + my-custom-bridge: + external: true + +# docker stack deploy -c docker-compose_stack.yml rdma_stack diff --git a/finetune/docker-compose_stack1.yml b/finetune/docker-compose_stack1.yml new file mode 100644 index 0000000..7698b4a --- /dev/null +++ b/finetune/docker-compose_stack1.yml @@ -0,0 +1,37 @@ +version: '3.8' + +services: + ubuntu-finetune: + image: hotwa/deepspeed:pt23 + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + deploy: + replicas: 1 + resources: + reservations: + generic_resources: + - discrete_resource_spec: + kind: "NVIDIA-GPU" + value: 1 + placement: + constraints: + - node.labels.gpu == true + cap_add: + - IPC_LOCK + +networks: + default: + driver: overlay + +# 为节点添加标签: +# docker node ls + + +# docker node update --label-add gpu=true node1 + +# docker stack deploy -c docker-compose.yml rdma_stack + diff --git 
a/finetune/docker-compose_stack2.yml b/finetune/docker-compose_stack2.yml new file mode 100644 index 0000000..89c357c --- /dev/null +++ b/finetune/docker-compose_stack2.yml @@ -0,0 +1,62 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - type: tmpfs + target: /dev/shm + tmpfs: + size: 32000000000 # 32GB + # - ./src:/bbtft + # - ./id_rsa_finetune:/root/.ssh/id_rsa + # - ./id_rsa.pub:/root/.ssh/id_rsa.pub + # container_name: ubuntu-finetune + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + # networks: + # - my-custom-bridge + deploy: + replicas: 4 + resources: + reservations: + generic_resources: + - discrete_resource_spec: + kind: "NVIDIA-GPU" + value: 8 + - discrete_resource_spec: + kind: "SRIOV-VF" + value: 1 + placement: + constraints: [node.labels.gpu == true] + cap_add: + - IPC_LOCK + privileged: true + +# networks: +# my-custom-bridge: +# external: true diff --git a/finetune/docker-compose_swarm.yml b/finetune/docker-compose_swarm.yml new file mode 100644 index 0000000..45d9300 --- /dev/null +++ b/finetune/docker-compose_swarm.yml @@ -0,0 +1,50 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . 
+ dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - ./binbbt:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + restart: unless-stopped + image: hotwa/deepspeed:pt23 + shm_size: '40gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - test-net + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +# 修改为docker-swarm的网络 +networks: + test-net: + external: true diff --git a/finetune/docker-compose_update.yml b/finetune/docker-compose_update.yml new file mode 100644 index 0000000..cc24ed4 --- /dev/null +++ b/finetune/docker-compose_update.yml @@ -0,0 +1,81 @@ +version: '3.8' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 专门用于Transformer模型的推理优化。 +# CPUAdam - 针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 +# StochasticTransformer - 支持随机Transformer模型的训练和推理。 +# 检测系统总内存(以GB为单位) +# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo) +# echo "Docker Compose 文件已生成,shm_size 设置为 ${TOTAL_MEM}GB。" + +services: + ubuntu-finetune: + build: + context: . 
+ dockerfile: Dockerfile.update + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: "3.10" + NV_PEER_MEM_VERSION: "1.2" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0+PTX" # all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + HTTP_PROXY: "http://127.0.0.1:15777" + HTTPS_PROXY: "http://127.0.0.1:15777" + CACHEBUST: 1 + volumes: + - ./src:/bbtft + # - /tmp:/tmp + container_name: ubuntu-finetune + pull_policy: if_not_present + ulimits: + memlock: + soft: -1 + hard: -1 + # tty: true + # stdin_open: true + restart: unless-stopped + image: hotwa/deepspeed:pt23_update + privileged: true + ipc: host + network_mode: host + shm_size: '128gb' + # ports: + # - 3228:2222 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + # networks: + # - network_finetune + # command: ["/usr/sbin/sshd", "-D"] + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +# networks: +# network_finetune: +# name: network_finetune diff --git a/finetune/hostfile b/finetune/hostfile new file mode 100644 index 0000000..4046630 --- /dev/null +++ b/finetune/hostfile @@ -0,0 +1,3 @@ +host1 slots=4 +host2 slots=4 +host3 slots=8 diff --git a/finetune/id_rsa.pub b/finetune/id_rsa.pub new file mode 100644 index 0000000..abfe5ea --- /dev/null +++ b/finetune/id_rsa.pub @@ -0,0 +1 @@ +ssh-rsa 
AAAAB3NzaC1yc2EAAAADAQABAAACAQC1CQs1rWF7KFg5SKeNHm3EGLEx8pgegdy2voQMAEInOTjeIoWpcXk7R65NLGG6k1J10f5GYg3A0XxmNf/7nUWn0T/D31dwcFvP5BAIpJl8IMDkFj36SoNKTX5XIhbCet7sJgsLY4yKlOVahVNK+La9nbLDEd7GGNzBVUpccc2uXDJul+r1QSoXssV5Q7QBa17Sf2en6swXrtjyPz4W+Tg7/ANzF3P9y9roIcdlAm/jZb0gMLFsteyt+ThqrP3+hSgFrOlJNgEL5qkOG0dI5rHpjeJnBzPAA1FLAQFhdtSrL+Cd9INSvV0lNwAROl5FpSMVmE7UzeeUy70cqw5b7ReJsEpHDbpd6rUEwC09mJlSaHQ9ApKbCD0u9aXeuTlbgHqcs2JDZTLT7Yf+JxO7yVc2QaJ3iiLkVTyiXhby5YWO++lBvhXX+zMLsUvIXD6MMBeyC0Azjb41qguhJvV8H9wI+2nBZEcgSB2vhYM+/rdDw5+v3WqgGsUqpf1GLTeWP8oTxJDrDM20crW3bcEoEFlMZRpVOnWFBIniU8T1TLxP92lElWTkX+eptJVffoPxRvSPLgaNN2toY9K1MVcQ8+ckJJ6te7sjXlOupJDpNH+tshYlMsUfi1FrsRhGT0yHZtDZ3YibZ0l/8AGUWvnNC/pFqtqBLaAsfll5jsqt06pp7Q== docker@example.com diff --git a/finetune/id_rsa_finetune b/finetune/id_rsa_finetune new file mode 100644 index 0000000..9d0e0c9 --- /dev/null +++ b/finetune/id_rsa_finetune @@ -0,0 +1,49 @@ +-----BEGIN OPENSSH PRIVATE KEY----- +b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAACFwAAAAdzc2gtcn +NhAAAAAwEAAQAAAgEAtQkLNa1heyhYOUinjR5txBixMfKYHoHctr6EDABCJzk43iKFqXF5 +O0euTSxhupNSddH+RmINwNF8ZjX/+51Fp9E/w99XcHBbz+QQCKSZfCDA5BY9+kqDSk1+Vy +IWwnre7CYLC2OMipTlWoVTSvi2vZ2ywxHexhjcwVVKXHHNrlwybpfq9UEqF7LFeUO0AWte +0n9np+rMF67Y8j8+Fvk4O/wDcxdz/cva6CHHZQJv42W9IDCxbLXsrfk4aqz9/oUoBazpST +YBC+apDhtHSOax6Y3iZwczwANRSwEBYXbUqy/gnfSDUr1dJTcAETpeRaUjFZhO1M3nlMu9 +HKsOW+0XibBKRw26Xeq1BMAtPZiZUmh0PQKSmwg9LvWl3rk5W4B6nLNiQ2Uy0+2H/icTu8 +lXNkGid4oi5FU8ol4W8uWFjvvpQb4V1/szC7FLyFw+jDAXsgtAM42+NaoLoSb1fB/cCPtp +wWRHIEgdr4WDPv63Q8Ofr91qoBrFKqX9Ri03lj/KE8SQ6wzNtHK1t23BKBBZTGUaVTp1hQ +SJ4lPE9Uy8T/dpRJVk5F/nqbSVX36D8Ub0jy4GjTdraGPStTFXEPPnJCSerXu7I15TrqSQ +6TR/rbIWJTLFH4tRa7EYRk9Mh2bQ2d2Im2dJf/ABlFr5zQv6RaragS2gLH5ZeY7KrdOqae +0AAAdIJh5TtyYeU7cAAAAHc3NoLXJzYQAAAgEAtQkLNa1heyhYOUinjR5txBixMfKYHoHc +tr6EDABCJzk43iKFqXF5O0euTSxhupNSddH+RmINwNF8ZjX/+51Fp9E/w99XcHBbz+QQCK +SZfCDA5BY9+kqDSk1+VyIWwnre7CYLC2OMipTlWoVTSvi2vZ2ywxHexhjcwVVKXHHNrlwy 
+bpfq9UEqF7LFeUO0AWte0n9np+rMF67Y8j8+Fvk4O/wDcxdz/cva6CHHZQJv42W9IDCxbL +Xsrfk4aqz9/oUoBazpSTYBC+apDhtHSOax6Y3iZwczwANRSwEBYXbUqy/gnfSDUr1dJTcA +ETpeRaUjFZhO1M3nlMu9HKsOW+0XibBKRw26Xeq1BMAtPZiZUmh0PQKSmwg9LvWl3rk5W4 +B6nLNiQ2Uy0+2H/icTu8lXNkGid4oi5FU8ol4W8uWFjvvpQb4V1/szC7FLyFw+jDAXsgtA +M42+NaoLoSb1fB/cCPtpwWRHIEgdr4WDPv63Q8Ofr91qoBrFKqX9Ri03lj/KE8SQ6wzNtH +K1t23BKBBZTGUaVTp1hQSJ4lPE9Uy8T/dpRJVk5F/nqbSVX36D8Ub0jy4GjTdraGPStTFX +EPPnJCSerXu7I15TrqSQ6TR/rbIWJTLFH4tRa7EYRk9Mh2bQ2d2Im2dJf/ABlFr5zQv6Ra +ragS2gLH5ZeY7KrdOqae0AAAADAQABAAACAANNbXXIduH4PT8aDGQy41I4+6VplUKKUjKd +HLZF431FaG4jZAaJXOqKyMsDqhxmEDYOZuyY7u12EUn20Slhd+Pokm4S/qHSRDrxbparG5 +Jy+GZH4l5GlPq20nXw9CvyHHnG2HECqVvPRCZgqxbW8mI8S6MOZol83DsvMjVEWBZjJuXP +vl8ZztugbNMPkU8z3/hrj2Xglf56DPuYUXjIF83UGlUBu4wzYh1Hcunsm/wUN9mIVzLnkQ +WYcJOqtpnH4JA41HktnlP9qqwaguYVzURxaQXB2CCGRhRlDVQI6m+kdPltkd8ocR8T3hSy +X9tg/61fwVNHMxSY8IkGUXqn39IZuwtIOflybXc1w3VQBwGuI2UF/U/5wmIJdQimsDPzhX +o5uENWiL5Lei5sxxUmnZw78xoXHino1LNceBKhQHrKS8R36QsdK7+INbiW3Tt2TmCyH563 +UH7dgS2moTrtiXh+gPk32okTnwquRWHJ7uurxgmnncoTEdmkcTCeXv7B1CBdH9WGyCtyV1 +oKK+qNEXCrLaIOD49zF2qPUmxUOuGzcBKgavXDSPmj5jB/4k3ipsjlRX12l8xCEycKLHG3 +6LuP4jgoalNtjJGJozpya4/tsOhE6jEB74xIXUuCUlBo9Q8xmHYnv2/8jdSdR6rx6N9odw +XMYjVcs63rLZMKsljhAAABADaOQoVNSfTbhyG9wJN7+XyeXHBkfMKg4kGNYB28l4mbB1eR +8i/cZPvIDrcz1FjvYQXEWmK+XS9QVMz0EGLse5JIYhXUFtZin8VVqttIBZLXhw0nImD9sK +HlbxlKj+Savlx+oZDDxAGNMDGGhbc7uuWgX1O1Bsr5sQR+neTV91iLcMWB0XfP8CK70uXf +l7NQ88RaWn79JggKKuqVs1THhlfeMlBJ0RsUdRw69gs228++btif2bxCoY0IH3mCsmaux9 +JNI8bqZ5yws4XE7l0jaOnVFQywHP/4FCjZ2MQONhG10vYWpRjXpEEf1hXN6xDKWC5t50d0 +o79xP/Vp4Nk7pFsAAAEBAPeE7OCOS96fAz8hBI+4CXVjKzy39slPgsi64hMZYtUgY9fZ/k +5L/n831+Do7Yrrng/1pUzrHvaiip4XP2WcPmz9y2PYhi5RZzzffAmCudDVGP5ftoUcAtrj +cVzP4kmeRPP1kTsP3M3fNphrfgPkpGD1TFRxxT5wwVnsiRzQ5c1ykX8jn2xd8QpUoSdK0Y +SetryzmRf+OlDJyQljoNZ76wu5GjsejMjtIKO9oua5avgJhLKpyfAVTz2QZBBxrCUrp2+P +iM+/f4tXqF45eCFjGqiyFvKUCD1VHp5Oup4rQIi4PgD1H/MdT5XmZNeFqxwo+/2QzwIAp9 
+AKQqQ/KX+7YWEAAAEBALs89zawDSGbZemROsreRDapohYnHiSAZvGzjqaevjF0oFkLpRr5 +/9jcRZf4QBDTZah5y8ATNs6KECvmRQ0mMkDSI2FSOM1bZ2yndxbtmM9kaAmpqdrRVBChVX +nopPfQ8dQ2RkPzp5YIL1QvAQbaP+B+lB8sZVtEK1OnxwCOCcVukGpkw2cE7aGDITDi1Mqg +Obj3sxHjQ+ysMZ1lOrKadpDQZXFpgp6MFVrNVlpv2QanbMGTB9GPynvCHGf5KJvKnot7L/ +rjTd2Da5SII3Mx9d6YAkYQpJkguNkJ2Q05+7PvyNNmj+Nk3ZwgqFA+3edc2exXMf9FzdmJ +iJcbS3QheA0AAAASZG9ja2VyQGV4YW1wbGUuY29tAQ== +-----END OPENSSH PRIVATE KEY----- diff --git a/finetune/peft-gpu-bnb-multi-source.Dockerfile b/finetune/peft-gpu-bnb-multi-source.Dockerfile new file mode 100644 index 0000000..2c839c4 --- /dev/null +++ b/finetune/peft-gpu-bnb-multi-source.Dockerfile @@ -0,0 +1,68 @@ +# Builds GPU docker image of PyTorch +# Uses multi-staged approach to reduce size +# Stage 1 +# Use base conda image to reduce time +FROM continuumio/miniconda3:latest AS compile-image +# Specify py version +ENV PYTHON_VERSION=3.8 +# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +RUN apt-get update && \ + apt-get install -y curl git wget software-properties-common git-lfs && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Install audio-related libraries +RUN apt-get update && \ + apt install -y ffmpeg + +RUN apt install -y libsndfile1-dev +RUN git lfs install + +# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +RUN conda create --name peft python=${PYTHON_VERSION} ipython jupyter pip +RUN python3 -m pip install --no-cache-dir --upgrade pip + +# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +# We don't install pytorch here yet since CUDA isn't available +# instead we use the direct torch wheel +ENV PATH /opt/conda/envs/peft/bin:$PATH +# Activate our bash shell +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] + +# Stage 2 +FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS build-image +COPY 
--from=compile-image /opt/conda /opt/conda +ENV PATH /opt/conda/bin:$PATH + +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] + +# Install apt libs +RUN apt-get update && \ + apt-get install -y curl git wget cmake && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Activate the conda env and install transformers + accelerate from source +# Also clone BNB and build it from source. +RUN source activate peft && \ + python3 -m pip install -U --no-cache-dir \ + librosa \ + "soundfile>=0.12.1" \ + scipy \ + git+https://github.com/huggingface/transformers \ + git+https://github.com/huggingface/accelerate \ + peft[test]@git+https://github.com/huggingface/peft \ + optimum \ + auto-gptq && \ + git clone https://github.com/TimDettmers/bitsandbytes && cd bitsandbytes && git checkout multi-backend-refactor && \ + cmake -B . -DCOMPUTE_BACKEND=cuda -S . && \ + cmake --build . && \ + pip install -e . && \ + pip freeze | grep bitsandbytes + +RUN echo "source activate peft" >> ~/.profile + +# Activate the virtualenv +CMD ["/bin/bash"] \ No newline at end of file diff --git a/finetune/requirements.txt b/finetune/requirements.txt new file mode 100644 index 0000000..cb14ebf --- /dev/null +++ b/finetune/requirements.txt @@ -0,0 +1,37 @@ +pytorch +torchvision +torchaudio +pydantic +transformers +datasets +accelerate +evaluate +peft +deepspeed +tiktoken +sentencepiece +tqdm +nltk +matplotlib +seaborn +numpy +pandas +scikit-learn +diffusers +huggingface_hub +spacy +Pillow +blobfile +requests +scipy +pycocotools +protobuf +timm +pyyaml +ipython +xformers +opencv-contrib-python +open_clip_torch +flash-attn +packaging +psutil diff --git a/finetune/setup_ssh.sh b/finetune/setup_ssh.sh new file mode 100644 index 0000000..514d283 --- /dev/null +++ b/finetune/setup_ssh.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# 定义主机列表 +hosts=("10.200.1.10" "10.200.1.11" "10.200.1.12") + +# 当前主机的用户名 +user="root" + +# 检查ssh-keygen是否已经生成密钥对 +if [ ! -f ~/.ssh/id_rsa ]; then + echo "生成SSH密钥对..." 
+ ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa +else + echo "SSH密钥对已经存在..." +fi + +# 分发公钥到其他主机 +for host in "${hosts[@]}"; do + if [ "$host" != "$(hostname -I | awk '{print $1}')" ]; then + echo "将公钥复制到$host..." + ssh-copy-id -i ~/.ssh/id_rsa.pub "$user@$host" + fi +done + +echo "密钥认证配置完成。" \ No newline at end of file diff --git a/finetune/test.txt b/finetune/test.txt new file mode 100644 index 0000000..48b0982 --- /dev/null +++ b/finetune/test.txt @@ -0,0 +1,182 @@ +absl-py==2.1.0 +accelerate @ git+https://github.com/huggingface/accelerate@1f7a79b428749f45187ec69485f2c966fe21926e +aiohttp==3.9.5 +aiosignal==1.3.1 +alabaster==0.7.16 +alembic==1.13.1 +annotated-types==0.7.0 +arrow==1.3.0 +astor==0.8.1 +asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1698341106958/work +async-timeout==4.0.3 +attrs==23.2.0 +Babel==2.15.0 +beautifulsoup4==4.12.3 +binaryornot==0.4.4 +boto3==1.34.129 +botocore==1.34.129 +certifi==2024.6.2 +cffi==1.16.0 +chardet==5.2.0 +charset-normalizer==3.3.2 +click==8.1.7 +cmake==3.29.5.1 +colorlog==6.8.2 +contourpy==1.2.1 +cookiecutter==1.7.3 +cycler==0.12.1 +datasets==2.20.0 +decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1641555617451/work +deepspeed @ file:///tmp/DeepSpeed/dist/deepspeed-0.14.4%2B0c979d67-cp310-cp310-linux_x86_64.whl#sha256=3990df7f730604f29f51d6e5aa83ec09da6a4ea584504d27dc2d0fad7b8a4582 +deepspeed-kernels @ file:///tmp/DeepSpeed-Kernels +dill==0.3.4 +docutils==0.20.1 +einops==0.8.0 +evaluate==0.4.2 +exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1704921103267/work +execnet==2.1.1 +executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1698579936712/work +faiss-cpu==1.8.0 +filelock==3.15.3 +flash-attn==2.5.9.post1 +fonttools==4.53.0 +frozenlist==1.4.1 +fsspec==2024.6.0 +ftfy==6.2.0 +gitdb==4.0.11 +GitPython==3.1.18 +graphviz==0.20.3 +greenlet==3.0.3 +grpcio==1.64.1 +hjson==3.1.0 +huggingface-hub==0.23.4 +idna==3.7 
+imagesize==1.4.1 +iniconfig==2.0.0 +ipdb==0.13.13 +ipython @ file:///home/conda/feedstock_root/build_artifacts/ipython_1717182742060/work +jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1696326070614/work +Jinja2==3.1.4 +jinja2-time==0.2.0 +jmespath==1.0.1 +joblib==1.4.2 +kiwisolver==1.4.5 +Mako==1.3.5 +Markdown==3.6 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matplotlib==3.9.0 +matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1713250518406/work +mdurl==0.1.2 +mpi4py @ https://github.com/mpi4py/mpi4py/tarball/master#sha256=e9d1ce01a4c5f95c704743ed13a2d90517dcafdfcde40e050903d583e9ca1260 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.12.2 +networkx==3.3 +ninja==1.11.1.1 +nltk==3.8.1 +numpy==2.0.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==12.555.43 +nvidia-ml-py3==7.352.0 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.5.40 +nvidia-nvtx-cu12==12.1.105 +open-clip-torch==2.24.0 +opencv-contrib-python==4.10.0.84 +optuna==3.6.1 +packaging==24.1 +pandas==2.2.2 +parameterized==0.9.0 +parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1712320355065/work +pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1706113125309/work +pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work +pillow==10.3.0 +pluggy==1.5.0 +portalocker==2.0.0 +poyo==0.5.0 +prompt_toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1718047967974/work +protobuf==4.25.3 +psutil==6.0.0 +ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1609419310487/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl +pure-eval @ 
file:///home/conda/feedstock_root/build_artifacts/pure_eval_1642875951954/work +py-cpuinfo==9.0.0 +py3nvml==0.2.7 +pyarrow==16.1.0 +pyarrow-hotfix==0.6 +pycparser==2.22 +pydantic==1.10.16 +pydantic_core==2.18.4 +Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1714846767233/work +pyparsing==3.1.2 +pytest==7.4.4 +pytest-rich==0.1.1 +pytest-timeout==2.3.1 +pytest-xdist==3.6.1 +python-dateutil==2.9.0.post0 +python-slugify==8.0.4 +pytz==2024.1 +PyYAML @ file:///home/conda/feedstock_root/build_artifacts/pyyaml_1695373428874/work +regex==2024.5.15 +requests==2.32.3 +rich==13.7.1 +rjieba==0.1.11 +rouge_score==0.1.2 +ruff==0.4.4 +s3transfer==0.10.1 +sacrebleu==1.5.1 +sacremoses==0.1.1 +safetensors==0.4.3 +scikit-learn==1.5.0 +scipy==1.13.1 +sentencepiece==0.2.0 +six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work +smmap==5.0.1 +snowballstemmer==2.2.0 +soupsieve==2.5 +Sphinx==7.3.7 +sphinx-rtd-theme==2.0.0 +sphinxcontrib-applehelp==1.0.8 +sphinxcontrib-devhelp==1.0.6 +sphinxcontrib-htmlhelp==2.0.5 +sphinxcontrib-jquery==4.1 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.7 +sphinxcontrib-serializinghtml==1.1.10 +SQLAlchemy==2.0.31 +stack-data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1669632077133/work +sympy==1.12.1 +tensorboard==2.17.0 +tensorboard-data-server==0.7.2 +text-unidecode==1.3 +threadpoolctl==3.5.0 +timeout-decorator==0.5.0 +timm==1.0.7 +tokenizers==0.19.1 +tomli==2.0.1 +torch==2.3.1 +torchaudio==0.13.1+cu117 +torchvision==0.14.1+cu117 +tqdm==4.66.4 +traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1713535121073/work +transformers @ file:///root/ninja/transformers +triton==2.3.1 +types-python-dateutil==2.9.0.20240316 +typing_extensions==4.12.2 +tzdata==2024.1 +urllib3==2.2.2 +wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1704731205417/work +Werkzeug==3.0.3 +xmltodict==0.13.0 +xxhash==3.4.1 +yappi==1.6.0 +yarl==1.9.4 \ No newline at end of 
file diff --git a/finetune/transformer.Dockerfile b/finetune/transformer.Dockerfile new file mode 100644 index 0000000..e38170e --- /dev/null +++ b/finetune/transformer.Dockerfile @@ -0,0 +1,70 @@ +FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 +LABEL maintainer="Hugging Face" + +ARG DEBIAN_FRONTEND=noninteractive + +# Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands) +SHELL ["sh", "-lc"] + +# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant +# to be used as arguments for docker build (so far). + +ARG PYTORCH='2.3.0' +# (not always a valid torch version) +ARG INTEL_TORCH_EXT='2.3.0' +# Example: `cu102`, `cu113`, etc. +ARG CUDA='cu121' + +RUN apt update +RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs +RUN git lfs install +RUN python3 -m pip install --no-cache-dir --upgrade pip + +ARG REF=main +RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF + +# 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future. +# 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`. +# Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions). 
+RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 tensorflow_text tensorflow_probability && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA + +RUN python3 -m pip uninstall -y flax jax + +RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT -f https://developer.intel.com/ipex-whl-stable-cpu + +RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract +RUN python3 -m pip install -U "itsdangerous<2.1.0" + +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft + +# For bettertransformer +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum + +# For video model testing +RUN python3 -m pip install --no-cache-dir decord av==9.2.0 + +# Some slow tests require bnb +RUN python3 -m pip install --no-cache-dir bitsandbytes + +# Some tests require quanto +RUN python3 -m pip install --no-cache-dir quanto + +# `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests +# (`deformable_detr`, `rwkv`, `mra`) +RUN python3 -m pip uninstall -y ninja + +# For `dinat` model +# The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent) +RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f 
https://shi-labs.com/natten/wheels + +# For `nougat` tokenizer +RUN python3 -m pip install --no-cache-dir python-Levenshtein + +# For `FastSpeech2ConformerTokenizer` tokenizer +RUN python3 -m pip install --no-cache-dir g2p-en + +# When installing in editable mode, `transformers` is not recognized as a package. +# this line must be added in order for python to be aware of transformers. +RUN cd transformers && python3 setup.py develop \ No newline at end of file diff --git a/finetune/update_sriov_vf.sh b/finetune/update_sriov_vf.sh new file mode 100755 index 0000000..677fe93 --- /dev/null +++ b/finetune/update_sriov_vf.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# 提取 Port GUID 并格式化为 SRIOV-VF 配置 +generate_sriov_vf_config() { + GUIDS=($(ibstat | grep "Port GUID" | awk '{print $3}')) + for i in "${!GUIDS[@]}"; do + echo "SRIOV-VF=${GUIDS[$i]}" + done +} + +# 更新 Docker 配置文件 +update_docker_config() { + local GUIDS=("$@") + local DAEMON_JSON="/etc/docker/daemon.json" + local TMP_JSON="/tmp/daemon.json" + + if [ ! -f "$DAEMON_JSON" ]; then + echo "$DAEMON_JSON 文件不存在" + exit 1 + fi + + local NODE_GENERIC_RESOURCES=$(jq -c '.["node-generic-resources"]' "$DAEMON_JSON") + + if [ "$NODE_GENERIC_RESOURCES" == "null" ]; then + NODE_GENERIC_RESOURCES="[]" + fi + + for GUID in "${GUIDS[@]}"; do + if [[ ! $NODE_GENERIC_RESOURCES == *"$GUID"* ]]; then + NODE_GENERIC_RESOURCES=$(echo "$NODE_GENERIC_RESOURCES" | jq --arg vf "$GUID" '. 
+= [$vf]')
        fi
    done

    # Write the updated array back atomically via a temp file.
    jq '.["node-generic-resources"] = '"$NODE_GENERIC_RESOURCES" "$DAEMON_JSON" > "$TMP_JSON"
    mv "$TMP_JSON" "$DAEMON_JSON"
}

# 主函数 — must run as root; collects SR-IOV VF GUIDs, merges them into the
# Docker daemon config, then restarts the daemon.
main() {
    # BUGFIX: DAEMON_JSON was only declared as a `local` inside
    # update_docker_config, so the success message below interpolated an
    # empty string; define the path here as well.
    local DAEMON_JSON="/etc/docker/daemon.json"

    if [[ $EUID -ne 0 ]]; then
        echo "此脚本必须以 root 用户运行"
        exit 1
    fi

    GUIDS=($(generate_sriov_vf_config))
    if [ ${#GUIDS[@]} -eq 0 ]; then
        echo "未找到 SR-IOV VF 设备"
        exit 1
    fi

    update_docker_config "${GUIDS[@]}"
    echo "成功更新 $DAEMON_JSON 文件"
    systemctl restart docker
}

main "$@"
diff --git a/ldh/.deepspeed_env b/ldh/.deepspeed_env
new file mode 100644
index 0000000..be22bac
--- /dev/null
+++ b/ldh/.deepspeed_env
@@ -0,0 +1,6 @@
CUDA_HOME=/usr/local/cuda
CUTLASS_PATH=/opt/cutlass
TORCH_CUDA_ARCH_LIST="8.0 9.0+PTX"
PATH=/opt/openmpi/bin:/usr/lib/jvm/default-java/bin:/usr/local/cuda/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
LD_LIBRARY_PATH=/opt/openmpi/lib:/usr/local/cuda/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs
diff --git a/ldh/Dockerfile b/ldh/Dockerfile
new file mode 100644
index 0000000..bf1a48f
--- /dev/null
+++ b/ldh/Dockerfile
@@ -0,0 +1,212 @@
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

ENV DEBIAN_FRONTEND="noninteractive"
ENV CUDA_HOME="/usr/local/cuda"
ENV JAVA_HOME="/usr/lib/jvm/default-java"
ENV CUTLASS_PATH="/opt/cutlass"
ENV CUTLASS_NVCC_ARCHS="80;90a"
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
ENV PYTORCH_CUDA_VERSION="cu124"
ENV TORCH_CUDA_ARCH_LIST="8.0 9.0+PTX"

ENV PATH=/opt/openmpi/bin:${CUDA_HOME}/bin:$JAVA_HOME/bin:${PATH}
ENV LD_LIBRARY_PATH=/opt/openmpi/lib:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
ENV LIBRARY_PATH=${CUDA_HOME}/lib64:${LIBRARY_PATH}

# SECURITY(review): this Hugging Face token is committed in plain text and is
# baked into the image (recoverable via `docker history` / any layer) — treat
# it as leaked and REVOKE it. Made overridable via build-arg as a stopgap so
# later `${HF_TOKEN}` consumers keep working; the proper fix is a BuildKit
# secret mount (`RUN --mount=type=secret,id=hf_token ...`) with no default.
ARG HF_TOKEN=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
ENV HF_TOKEN=${HF_TOKEN}

SHELL ["/bin/bash", "-c"]

WORKDIR /root

RUN \
    apt-get update && \
    apt-get install -y --no-install-recommends \
    software-properties-common build-essential 
autotools-dev \ + nfs-common pdsh \ + cmake g++ gcc \ + curl wget vim tmux emacs less unzip screen \ + htop iftop iotop ca-certificates openssh-client openssh-server \ + rsync iputils-ping net-tools sudo \ + llvm-dev \ + libsndfile-dev libcupti-dev libjpeg-dev libpng-dev \ + libaio-dev libnuma-dev && \ + apt-get update && \ + apt-get install -y \ + git python3 python3-pip ninja-build default-jre && \ + python3 -m pip install --upgrade pip wheel && \ + apt-get -y install antlr4 && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + +# # DOCA https://developer.nvidia.com/doca-archive +# RUN \ +# wget --quiet https://www.mellanox.com/downloads/DOCA/DOCA_v2.5.2/host/doca-host-repo-ubuntu2204_2.5.2-0.0.6.2.5.2003.1.23.10.3.2.2.0_amd64.deb -O /tmp/doca-host-repo-ubuntu2204_2.5.2-0.0.6.2.5.2003.1.23.10.3.2.2.0_amd64.deb && \ +# dpkg -i /tmp/doca-host-repo-ubuntu2204_2.5.2-0.0.6.2.5.2003.1.23.10.3.2.2.0_amd64.deb && \ +# apt-get update && \ +# apt-get -y install doca-runtime doca-sdk doca-tools + + +# cutlass https://github.com/NVIDIA/cutlass +RUN \ + git clone https://github.com/NVIDIA/cutlass.git /opt/cutlass && \ + cd /opt/cutlass && \ + git fetch --all --tags && \ + git checkout main && \ + git submodule update --init --recursive && \ + export CUDACXX=${CUDA_HOME}/bin/nvcc && \ + mkdir build && \ + cd build && \ + cmake .. -DCUTLASS_NVCC_ARCHS=${CUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON + # cmake .. 
-DCUTLASS_NVCC_ARCHS=${CUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=ON -DCUTLASS_LIBRARY_KERNELS=all -DCUTLASS_UNITY_BUILD_ENABLED=ON && \ + # make -j"$(nproc)" install + # make cutlass_profiler -j"$(nproc)" + # make test_unit -j"$(nproc)" VERBOSE=1 + + +# OPENMPI https://www.open-mpi.org/software/ompi/v4.1/ +RUN \ + wget -q -O /tmp/openmpi-${OPENMPI_VERSION}.tar.gz https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz && \ + tar -xzf /tmp/openmpi-${OPENMPI_VERSION}.tar.gz -C /tmp && \ + cd /tmp/openmpi-${OPENMPI_VERSION} && \ + ./configure --prefix=/opt/openmpi-${OPENMPI_VERSION} && \ + # ./configure --prefix=/opt/openmpi-${OPENMPI_VERSION} --with-cuda=/usr/local/cuda --enable-python-bindings --with-python=/usr/bin/python3 && \ + make -j$(nproc) && \ + make install && \ + ln -s /opt/openmpi-${OPENMPI_VERSION} /opt/openmpi && \ + # Sanity check: + test -f /opt/openmpi/bin/mpic++ && \ + cd /root && \ + rm -rf /tmp/* + + +# pytorch https://pytorch.org +RUN \ + python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} && \ + python3 -m pip install packaging pillow requests jinja2 triton networkx numpy tqdm urllib3 certifi setuptools --index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} + + +# Install apex with CUDA and C++ extensions https://github.com/NVIDIA/apex +# # if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... 
# pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# # otherwise
# pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./

# Build NVIDIA apex from source at a pinned tag so the CUDA/C++ extensions
# match the torch build installed above.
RUN \
    git clone https://github.com/NVIDIA/apex.git /tmp/apex && \
    cd /tmp/apex && \
    git fetch --all --tags && \
    git checkout tags/24.04.01 && \
    git submodule update --init --recursive && \
    python3 setup.py develop --cpp_ext --cuda_ext


# flash-attention https://github.com/Dao-AILab/flash-attention
# pip install flash-attn --no-build-isolation
# MAX_JOBS=4 pip install flash-attn --no-build-isolation
RUN \
    git clone https://github.com/Dao-AILab/flash-attention.git /tmp/flash-attention && \
    cd /tmp/flash-attention && \
    git submodule update --init --recursive && \
    python3 setup.py install
    # pytest -q -s tests/test_flash_attn.py
    # cd hopper
    # python3 setup.py install
    # export PYTHONPATH=$PWD
    # pytest -q -s test_flash_attn.py


# xformers https://github.com/facebookresearch/xformers
RUN \
    git clone https://github.com/facebookresearch/xformers.git /tmp/xformers && \
    cd /tmp/xformers && \
    git submodule update --init --recursive && \
    python3 -m pip install -v -U /tmp/xformers
    # python3 -m xformers.info


# TransformerEngine https://github.com/NVIDIA/TransformerEngine
RUN \
    git clone --branch stable https://github.com/NVIDIA/TransformerEngine.git /tmp/TransformerEngine && \
    cd /tmp/TransformerEngine && \
    git submodule update --init --recursive && \
    python3 setup.py install


# General Python ML stack (deliberately unpinned here; ldh/requirements.txt
# records the frozen versions of the resulting environment).
RUN \
    python3 -m pip install deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub trl optimum tokenizers && \
    python3 -m pip install packaging jinja2 triton networkx urllib3 certifi requests protobuf blobfile pytest && \
    python3 -m pip install regex tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy pillow scipy && \
    python3 -m pip install pyyaml ipython ipdb pydantic psutil yappi cffi py3nvml pyarrow graphviz astor boto3 msgpack ipykernel cython
RUN \
    python3 -m pip install zstandard nvitop pycocotools tensorboard tensor_parallel && \
# # https://github.com/mpi4py/mpi4py/issues/335
# rm /opt/conda/envs/${CONDA_ENV_NAME}/compiler_compat/ld && \
    python3 -m pip install mpi4py


# lm-eval https://github.com/EleutherAI/lm-evaluation-harness
# ENV ANTLR_VERSION=4.13.2
# wget -q -O /root/antlr-${ANTLR_VERSION}-complete.jar https://www.antlr.org/download/antlr-${ANTLR_VERSION}-complete.jar
RUN \
    python3 -m pip install immutabledict langdetect && \
    # BUGFIX: a stray empty continuation (`... punkt_tab && \` followed by a
    # bare `&& \`) made this RUN a shell syntax error and failed the build;
    # the dangling `&&` has been removed.
    python3 -m nltk.downloader popular punkt punkt_tab && \
    python3 -m pip install antlr4-python3-runtime==4.11 && \
    # NOTE(review): ${HF_TOKEN} comes from an ENV baked into the image; the
    # credential should be revoked and supplied via a BuildKit secret mount
    # (`RUN --mount=type=secret`) instead of living in a layer.
    huggingface-cli login --token ${HF_TOKEN} && \
    git clone https://github.com/EleutherAI/lm-evaluation-harness.git /root/lm-evaluation-harness && \
    cd /root/lm-evaluation-harness && \
    python3 -m pip install -e ".[dev]"


# Megatron-LM https://github.com/NVIDIA/Megatron-LM
RUN \
    git clone https://github.com/NVIDIA/Megatron-LM.git /root/Megatron-LM && \
    # BUGFIX: `cd` was missing, so the shell attempted to *execute* the
    # directory /root/Megatron-LM; checkout and editable install then ran in
    # the wrong working directory.
    cd /root/Megatron-LM && \
    git checkout core_r0.5.0 && \
    pip install --no-use-pep517 -e .
+ + +# SSH config +RUN \ + echo 'root:root' | chpasswd && \ + cp /etc/ssh/sshd_config /tmp/sshd_config && \ + echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config && \ + sed -i "s/#Port 22/Port 22222/" /etc/ssh/sshd_config && \ + sed -i "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ + sed -i "s/#PasswordAuthentication yes/PasswordAuthentication yes/" /etc/ssh/sshd_config && \ + sed -i "s/#PubkeyAuthentication yes/PubkeyAuthentication yes/" /etc/ssh/sshd_config && \ + sed -i "s/UsePAM yes/UsePAM no/" /etc/ssh/sshd_config && \ + sed -i "s/#StrictModes yes/StrictModes no/" /etc/ssh/sshd_config && \ + sed -i "s/# StrictHostKeyChecking ask/ StrictHostKeyChecking no/" /etc/ssh/ssh_config && \ + chown root:root /etc/ssh/sshd_config && \ + mkdir -p /run/sshd && chmod 0755 /run/sshd && \ + ssh-keygen -t rsa -f /root/.ssh/id_rsa -q -N "" && \ + cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys +# ENV config +RUN \ + unset https_proxy http_proxy && \ + echo "CUDA_HOME=${CUDA_HOME}" > ~/.deepspeed_env && \ + echo "CUTLASS_PATH=${CUTLASS_PATH}" >> ~/.deepspeed_env && \ + echo "TORCH_CUDA_ARCH_LIST=\"${TORCH_CUDA_ARCH_LIST}\"" >> ~/.deepspeed_env && \ + echo "PATH=${PATH}" >> ~/.deepspeed_env && \ + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ~/.deepspeed_env && \ + echo "LIBRARY_PATH=${LIBRARY_PATH}" >> ~/.deepspeed_env && \ + echo "export CUDA_HOME=${CUDA_HOME}" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \ + echo "export CUTLASS_PATH=${CUTLASS_PATH}" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \ + echo "export TORCH_CUDA_ARCH_LIST=\"${TORCH_CUDA_ARCH_LIST}\"" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \ + echo "export PATH=$PATH" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \ + echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH" | cat - ~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc && \ + echo "export LIBRARY_PATH=$LIBRARY_PATH" | cat - 
~/.bashrc > /tmp/.bashrc && mv /tmp/.bashrc ~/.bashrc +# clean +RUN \ + cd ~ && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* && \ + rm -rf /tmp/* && \ + rm -rf /var/tmp/* && \ + rm -rf /root/.cache/pip diff --git a/ldh/compose.yml b/ldh/compose.yml new file mode 100644 index 0000000..1e09496 --- /dev/null +++ b/ldh/compose.yml @@ -0,0 +1,40 @@ + +services: + ldh-deepspeed-test: + build: + context: . + dockerfile: Dockerfile + # args: + # HTTP_PROXY: "http://127.0.0.1:15777" + # HTTPS_PROXY: "http://127.0.0.1:15777" + # cache-from: "type=local" + image: ldh/deepspeed:test + container_name: ldh-deepspeed-test + shm_size: '1024gb' + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + # runtime: nvidia + # ipc: host + pid: host + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + # stdin_open: true + # tty: true + # privileged: true + cap_add: + - ALL + volumes: + - /mnt/beegfs:/root/shared/beegfs + - /mnt/yrfs:/root/shared/yrfs + # ports: + # - "22242:22242" + # - "5000:5000" + network_mode: host + command: ["/usr/sbin/sshd", "-D"] + # command: ["/bin/bash", "-c", "while true; do sleep 1000; done"] diff --git a/ldh/requirements.txt b/ldh/requirements.txt new file mode 100644 index 0000000..ade6dea --- /dev/null +++ b/ldh/requirements.txt @@ -0,0 +1,238 @@ +absl-py==2.1.0 +accelerate==0.33.0 +aiohappyeyeballs==2.4.0 +aiohttp==3.10.5 +aiosignal==1.3.1 +annotated-types==0.7.0 +antlr4-python3-runtime==4.11.0 +apex @ file:///tmp/apex +astor==0.8.1 +asttokens==2.4.1 +async-timeout==4.0.3 +attrs==24.2.0 +blinker==1.4 +blis==0.7.11 +blobfile==2.1.1 +boto3==1.35.5 +botocore==1.35.5 +cachetools==5.5.0 +catalogue==2.0.10 +certifi==2022.12.7 +cffi==1.17.0 +cfgv==3.4.0 +chardet==5.2.0 +charset-normalizer==2.1.1 +click==8.0.3 +cloudpathlib==0.18.1 +colorama==0.4.4 +coloredlogs==15.0.1 +comm==0.2.2 +confection==0.1.5 +contourpy==1.2.1 +coverage==7.6.1 
+cryptography==3.4.8 +cycler==0.12.1 +cymem==2.0.8 +Cython==3.0.11 +DataProperty==1.0.1 +datasets==2.21.0 +dbus-python==1.2.18 +debugpy==1.8.5 +decorator==5.1.1 +deepspeed==0.15.0 +diffusers==0.30.1 +dill==0.3.8 +distlib==0.3.8 +distro==1.7.0 +docstring_parser==0.16 +einops==0.8.0 +evaluate==0.4.2 +exceptiongroup==1.2.2 +execnet==2.1.1 +executing==2.0.1 +filelock==3.13.1 +flash-attn==2.6.3 +fonttools==4.53.1 +frozenlist==1.4.1 +fsspec==2024.2.0 +graphviz==0.20.3 +grpcio==1.66.0 +hjson==3.1.0 +httplib2==0.20.2 +huggingface-hub==0.24.6 +humanfriendly==10.0 +identify==2.6.0 +idna==3.4 +immutabledict==4.2.0 +importlib-metadata==4.6.4 +iniconfig==2.0.0 +iotop==0.6 +ipdb==0.13.13 +ipykernel==6.29.5 +ipython==8.26.0 +jedi==0.19.1 +jeepney==0.7.1 +Jinja2==3.1.3 +jmespath==1.0.1 +joblib==1.4.2 +jsonlines==4.0.0 +jupyter_client==8.6.2 +jupyter_core==5.7.2 +keyring==23.5.0 +kiwisolver==1.4.5 +langcodes==3.4.0 +langdetect==1.0.9 +language_data==1.2.0 +launchpadlib==1.10.16 +lazr.restfulclient==0.14.4 +lazr.uri==1.0.6 +-e git+https://github.com/EleutherAI/lm-evaluation-harness.git@aab42ba836b4af28cc1c5c1e697ea334c6ea7ced#egg=lm_eval +lxml==4.9.4 +marisa-trie==1.2.0 +Markdown==3.7 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matplotlib==3.9.2 +matplotlib-inline==0.1.7 +mbstrdecoder==1.1.3 +mdurl==0.1.2 +meson==0.61.2 +more-itertools==8.10.0 +mpi4py==4.0.0 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.16 +murmurhash==1.0.10 +mypy==1.11.1 +mypy-extensions==1.0.0 +nest-asyncio==1.6.0 +networkx==3.2.1 +ninja==1.11.1.1 +nltk==3.9.1 +nodeenv==1.9.1 +numexpr==2.10.1 +numpy==1.26.3 +nvidia-cublas-cu12==12.4.2.65 +nvidia-cuda-cupti-cu12==12.4.99 +nvidia-cuda-nvrtc-cu12==12.4.99 +nvidia-cuda-runtime-cu12==12.4.99 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.0.44 +nvidia-curand-cu12==10.3.5.119 +nvidia-cusolver-cu12==11.6.0.99 +nvidia-cusparse-cu12==12.3.0.142 +nvidia-ml-py==12.535.161 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.4.99 
+nvidia-nvtx-cu12==12.4.99 +nvitop==1.3.2 +oauthlib==3.2.0 +optimum==1.21.4 +packaging==22.0 +pandas==2.2.2 +parso==0.8.4 +pathvalidate==3.2.1 +peft==0.12.0 +pexpect==4.9.0 +pillow==10.2.0 +platformdirs==4.2.2 +pluggy==1.5.0 +portalocker==2.10.1 +pre-commit==3.8.0 +preshed==3.0.9 +prompt_toolkit==3.0.47 +protobuf==5.27.3 +psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +py-cpuinfo==9.0.0 +py3nvml==0.2.7 +pyarrow==17.0.0 +pybind11==2.13.5 +pycocotools==2.0.8 +pycparser==2.22 +pycryptodomex==3.20.0 +pydantic==2.8.2 +pydantic_core==2.20.1 +pyelftools==0.27 +Pygments==2.18.0 +PyGObject==3.42.1 +PyJWT==2.3.0 +pyparsing==2.4.7 +pytablewriter==1.2.0 +pytest==8.3.2 +pytest-cov==5.0.0 +pytest-xdist==3.6.1 +python-apt==2.4.0+ubuntu3 +python-dateutil==2.9.0.post0 +pytz==2024.1 +PyYAML==5.4.1 +pyzmq==26.2.0 +regex==2024.7.24 +requests==2.32.3 +rich==13.7.1 +rouge-score==0.1.2 +s3transfer==0.10.2 +sacrebleu==2.4.3 +safetensors==0.4.4 +scikit-learn==1.5.1 +scipy==1.14.1 +seaborn==0.13.2 +SecretStorage==3.3.1 +sentencepiece==0.2.0 +shellingham==1.5.4 +shtab==1.7.1 +six==1.16.0 +smart-open==7.0.4 +spacy==3.7.6 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +sqlitedict==2.1.0 +srsly==2.4.8 +stack-data==0.6.3 +sympy==1.12 +tabledata==1.3.3 +tabulate==0.9.0 +tcolorpy==0.1.6 +tensor-parallel==2.0.0 +tensorboard==2.17.1 +tensorboard-data-server==0.7.2 +termcolor==2.4.0 +thinc==8.2.5 +threadpoolctl==3.5.0 +tiktoken==0.7.0 +timm==1.0.9 +tokenizers==0.19.1 +tomli==2.0.1 +torch==2.4.0+cu124 +torchaudio==2.4.0+cu124 +torchvision==0.19.0+cu124 +tornado==6.4.1 +tqdm==4.66.5 +tqdm-multiprocess==0.0.11 +traitlets==5.14.3 +transformers==4.43.4 +triton==3.0.0 +trl==0.9.6 +typepy==1.3.2 +typer==0.12.4 +typing_extensions==4.9.0 +tyro==0.8.8 +tzdata==2024.1 +urllib3==1.26.13 +virtualenv==20.26.3 +wadllib==1.3.6 +wasabi==1.1.3 +wcwidth==0.2.13 +weasel==0.4.1 +Werkzeug==3.0.4 +word2number==1.1 +wrapt==1.16.0 +xformers @ file:///tmp/xformers +xmltodict==0.13.0 +xxhash==3.5.0 +yappi==1.6.0 +yarl==1.9.4 
+zipp==1.0.0 +zstandard==0.23.0 \ No newline at end of file diff --git a/megadna/Dockerfile b/megadna/Dockerfile new file mode 100644 index 0000000..ed7639e --- /dev/null +++ b/megadna/Dockerfile @@ -0,0 +1,137 @@ +ARG CUDA_VERSION=12.1.0 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ENV MAMBA_ROOT_PREFIX=~/micromamba +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +ENV PATH /opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH +WORKDIR /root +SHELL ["/bin/bash", "-c"] +# base tools +RUN <> ~/.bashrc +echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# 安装 micromamba +echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) +micromamba shell init -s bash -p ~/micromamba +cat <<'EOF' >> ~/.bashrc +source ~/micromamba/etc/profile.d/micromamba.sh +alias mamba=micromamba +alias mba=mamba +EOF +# 配置 .mambarc 文件 +cat < ~/.mambarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ENV REF='main' +ENV STAGE_DIR=/tmp +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.0 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.0 +ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.0 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG 
PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 +ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +RUN <