diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3e8735e --- /dev/null +++ b/Dockerfile @@ -0,0 +1,463 @@ + +# NOTE: Building this image require's docker version >= 23.0. +# +# For reference: +# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel +ARG CUDA_VERSION=12.1.0 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04 +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ENV MAMBA_ROOT_PREFIX=~/micromamba +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +WORKDIR /root +SHELL ["/bin/bash", "-c"] +# base tools +RUN <> ~/.bashrc +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# 安装 micromamba +# echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) +# micromamba shell init -s bash -p ~/micromamba +# cat <<'EOF' >> ~/.bashrc +# source ~/micromamba/etc/profile.d/micromamba.sh +# EOF +# # 配置 .mambarc 文件 +# cat < ~/.mambarc +# channels: +# - conda-forge +# - bioconda +# - pytorch +# - pytorch-nightly +# - nvidia +# - defaults +# show_channel_urls: true +# EOF +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH +ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3" +ENV REF='main' +ENV STAGE_DIR=/tmp +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.1 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.1 +ENV 
TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.1 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 +ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +ARG USE_CUDA=1 +ENV USE_CUDA=${USE_CUDA} +ARG USE_ROCM=0 +ENV USE_ROCM=${USE_ROCM} +ARG USE_XPU=0 +ENV USE_XPU=${USE_XPU} +ARG _GLIBCXX_USE_CXX11_ABI=1 +ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI} +RUN <> ~/.bashrc +which python > ~/python_path.txt +conda activate ${CONDA_ENV_NAME} +# 克隆 ninja 源码并编译 +git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja +cd ${STAGE_DIR}/ninja +# 克隆 GoogleTest 源码 +git clone https://github.com/google/googletest.git +python ./configure.py --bootstrap +# 配置并构建 Ninja 测试,添加 pthread 链接选项 +# CXXFLAGS="-pthread" LDFLAGS="-pthread" ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest +conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest" +./ninja all +# 运行 Ninja 单元测试 +./ninja_test +python3 -m pip install --no-cache-dir --upgrade pip +python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python +conda clean -afy +git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. +python -m pip install setuptools==${SETUPTOOLS_VERSION} +python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) 
+# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) +python3 -m pip uninstall -y torch torchvision torchaudio +# # install pytorch create conda env aleay exists +python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA} +python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +python3 -m pip uninstall -y transformer-engine +python3 -m pip uninstall -y torch-tensorrt +python3 -m pip uninstall -y apex +EOT + +# install apex +RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... +MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ +python -c "import apex.amp; print('Apex is installed and the amp module is available.')" +cd .. +rm -rf ${STAGE_DIR}/apex +EOT + +RUN <&1 +# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail +# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile +# install deepspeed prepare +# install Mellanox OFED +mkdir -p ${STAGE_DIR} +wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - +cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 +./mlnxofedinstall --user-space-only --without-fw-update --all -q +cd ${STAGE_DIR} +rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* +cd .. 
+# install nv_peer_mem +rm -rf ${STAGE_DIR} +mkdir -p ${STAGE_DIR} +git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory +cd ${STAGE_DIR}/nv_peer_memory +./build_module.sh +cd ${STAGE_DIR} +tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz +cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} +apt-get update +apt-get install -y dkms +dpkg-buildpackage -us -uc +dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb +EOT + +# install mpi +ENV PATH=/usr/local/mpi/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +RUN < /usr/local/mpi/bin/mpirun +echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun +chmod a+x /usr/local/mpi/bin/mpirun +EOT + +# SSH daemon port inside container cannot conflict with host OS port +ENV SSH_PORT=2222 +RUN < ${STAGE_DIR}/sshd_config && \ +sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config +EOT + +# 29.78 Usage: install.sh [options...] +# 29.78 +# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in +# 29.78 hostfile (hostfile: /job/hostfile). 
If no hostfile exists, will only install locally +# 29.78 +# 29.78 [optional] +# 29.78 -l, --local_only Install only on local machine +# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) +# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) +# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels +# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) +# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) +# 29.78 -e, --examples Checkout deepspeed example submodule (no install) +# 29.78 -v, --verbose Verbose logging +# 29.78 -h, --help This help text + +RUN <> /etc/sudoers +EOT + +# install cutlass https://github.com/NVIDIA/cutlass +# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) +# A100: architecture is Ampere +# V100: architecture is Volta +# T4: architecture is Turing +# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc +# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 +# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 +# 80:适用于 NVIDIA Ampere 架构(如 A100)。 +# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 +# 89:GeForce RTX 4090 +ARG DCUTLASS_NVCC_ARCHS="80;89;90a" +ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} +RUN < install_modified.sh +chmod +x ./install_modified.sh +# 检查 HOSTFILE_CONTENT 并写入文件 +if [ -n "${HOSTFILE_CONTENT}" ]; then + echo "${HOSTFILE_CONTENT}" > /tmp/hostfile + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile" +else + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}" +fi +eval $INSTALL_CMD +# compile deepspeed ops +cat <<'EOF' >> ~/.bashrc +source ~/micromamba/etc/profile.d/micromamba.sh +echo "alias mamba=micromamba" >> ~/.bashrc +echo "alias mba=mamba" >> ~/.bashrc +EOF +# 配置 .mambarc 文件 +cat < ~/compile_deepspeed_ops.py +import deepspeed + +def compile_ops(): + builders = [ + 
deepspeed.ops.op_builder.AsyncIOBuilder, + deepspeed.ops.op_builder.FusedAdamBuilder, + deepspeed.ops.op_builder.CPUAdamBuilder, + deepspeed.ops.op_builder.CPUAdagradBuilder, + deepspeed.ops.op_builder.CPULionBuilder, + deepspeed.ops.op_builder.EvoformerAttnBuilder, + deepspeed.ops.op_builder.FPQuantizerBuilder, + deepspeed.ops.op_builder.FusedLambBuilder, + deepspeed.ops.op_builder.FusedLionBuilder, + deepspeed.ops.op_builder.QuantizerBuilder, + deepspeed.ops.op_builder.RaggedOpsBuilder, + deepspeed.ops.op_builder.RandomLTDBuilder, + deepspeed.ops.op_builder.SparseAttnBuilder, + deepspeed.ops.op_builder.SpatialInferenceBuilder, + deepspeed.ops.op_builder.TransformerBuilder, + deepspeed.ops.op_builder.StochasticTransformerBuilder, + ] + + for builder in builders: + print(f"Compiling {builder.__name__}") + builder().load() + +if __name__ == "__main__": + compile_ops() +EOF +python compile_deepspeed_ops.py +ds_report +# clean up +# rm -f deepspeed/git_version_info_installed.py +# rm -rf dist build deepspeed.egg-info +# python setup.py bdist_wheel +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt +# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0 +EOT + +# install transformers and flash-attn +RUN <= 23.0. 
+# +# For reference: +# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel +ARG CUDA_VERSION=12.1.0 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04 +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ENV MAMBA_ROOT_PREFIX=~/micromamba +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +WORKDIR /root +SHELL ["/bin/bash", "-c"] +# base tools +RUN <> ~/.bashrc +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# 安装 micromamba +# echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) +# micromamba shell init -s bash -p ~/micromamba +# cat <<'EOF' >> ~/.bashrc +# source ~/micromamba/etc/profile.d/micromamba.sh +# EOF +# # 配置 .mambarc 文件 +# cat < ~/.mambarc +# channels: +# - conda-forge +# - bioconda +# - pytorch +# - pytorch-nightly +# - nvidia +# - defaults +# show_channel_urls: true +# EOF +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH +ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3" +ENV REF='main' +ENV STAGE_DIR=/tmp +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.1 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.1 +ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.1 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 
+ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +ARG USE_CUDA=1 +ENV USE_CUDA=${USE_CUDA} +ARG USE_ROCM=0 +ENV USE_ROCM=${USE_ROCM} +ARG USE_XPU=0 +ENV USE_XPU=${USE_XPU} +ARG _GLIBCXX_USE_CXX11_ABI=1 +ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI} +RUN <> ~/.bashrc +which python > ~/python_path.txt +conda activate ${CONDA_ENV_NAME} +# 克隆 ninja 源码并编译 +git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja +cd ${STAGE_DIR}/ninja +# 克隆 GoogleTest 源码 +git clone https://github.com/google/googletest.git +python ./configure.py --bootstrap +# 配置并构建 Ninja 测试,添加 pthread 链接选项 +# CXXFLAGS="-pthread" LDFLAGS="-pthread" ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest +conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest" +./ninja all +# 运行 Ninja 单元测试 +./ninja_test +python3 -m pip install --no-cache-dir --upgrade pip +python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python +conda clean -afy +git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. +python -m pip install setuptools==${SETUPTOOLS_VERSION} +python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) 
+# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) +python3 -m pip uninstall -y torch torchvision torchaudio +# # install pytorch create conda env aleay exists +python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA} +python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +python3 -m pip uninstall -y transformer-engine +python3 -m pip uninstall -y torch-tensorrt +python3 -m pip uninstall -y apex +EOT + +# install apex +RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... +MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ +python -c "import apex.amp; print('Apex is installed and the amp module is available.')" +cd .. +rm -rf ${STAGE_DIR}/apex +EOT + +RUN <&1 +# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail +# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile +# install deepspeed prepare +# install Mellanox OFED +mkdir -p ${STAGE_DIR} +wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - +cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 +./mlnxofedinstall --user-space-only --without-fw-update --all -q +cd ${STAGE_DIR} +rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* +cd .. 
+# install nv_peer_mem +rm -rf ${STAGE_DIR} +mkdir -p ${STAGE_DIR} +git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory +cd ${STAGE_DIR}/nv_peer_memory +./build_module.sh +cd ${STAGE_DIR} +tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz +cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} +apt-get update +apt-get install -y dkms +dpkg-buildpackage -us -uc +dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb +EOT + +# install mpi +ENV PATH=/usr/local/mpi/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +RUN < /usr/local/mpi/bin/mpirun +echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun +chmod a+x /usr/local/mpi/bin/mpirun +EOT + +# SSH daemon port inside container cannot conflict with host OS port +ENV SSH_PORT=2222 +RUN < ${STAGE_DIR}/sshd_config && \ +sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config +EOT + +# 29.78 Usage: install.sh [options...] +# 29.78 +# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in +# 29.78 hostfile (hostfile: /job/hostfile). 
If no hostfile exists, will only install locally +# 29.78 +# 29.78 [optional] +# 29.78 -l, --local_only Install only on local machine +# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) +# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) +# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels +# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) +# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) +# 29.78 -e, --examples Checkout deepspeed example submodule (no install) +# 29.78 -v, --verbose Verbose logging +# 29.78 -h, --help This help text + +RUN <> /etc/sudoers +EOT + +# install cutlass https://github.com/NVIDIA/cutlass +# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) +# A100: architecture is Ampere +# V100: architecture is Volta +# T4: architecture is Turing +# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc +# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 +# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 +# 80:适用于 NVIDIA Ampere 架构(如 A100)。 +# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 +# 89:GeForce RTX 4090 +ARG DCUTLASS_NVCC_ARCHS="80;89;90a" +ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} +RUN < install_modified.sh +chmod +x ./install_modified.sh +# 检查 HOSTFILE_CONTENT 并写入文件 +if [ -n "${HOSTFILE_CONTENT}" ]; then + echo "${HOSTFILE_CONTENT}" > /tmp/hostfile + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile" +else + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}" +fi +eval $INSTALL_CMD +# compile deepspeed ops +cat <<'EOF' >> ~/.bashrc +source ~/micromamba/etc/profile.d/micromamba.sh +echo "alias mamba=micromamba" >> ~/.bashrc +echo "alias mba=mamba" >> ~/.bashrc +EOF +# 配置 .mambarc 文件 +cat < ~/compile_deepspeed_ops.py +import deepspeed + +def compile_ops(): + builders = [ + 
deepspeed.ops.op_builder.AsyncIOBuilder, + deepspeed.ops.op_builder.FusedAdamBuilder, + deepspeed.ops.op_builder.CPUAdamBuilder, + deepspeed.ops.op_builder.CPUAdagradBuilder, + deepspeed.ops.op_builder.CPULionBuilder, + deepspeed.ops.op_builder.EvoformerAttnBuilder, + deepspeed.ops.op_builder.FPQuantizerBuilder, + deepspeed.ops.op_builder.FusedLambBuilder, + deepspeed.ops.op_builder.FusedLionBuilder, + deepspeed.ops.op_builder.QuantizerBuilder, + deepspeed.ops.op_builder.RaggedOpsBuilder, + deepspeed.ops.op_builder.RandomLTDBuilder, + deepspeed.ops.op_builder.SparseAttnBuilder, + deepspeed.ops.op_builder.SpatialInferenceBuilder, + deepspeed.ops.op_builder.TransformerBuilder, + deepspeed.ops.op_builder.StochasticTransformerBuilder, + ] + + for builder in builders: + print(f"Compiling {builder.__name__}") + builder().load() + +if __name__ == "__main__": + compile_ops() +EOF +python compile_deepspeed_ops.py +ds_report +# clean up +# rm -f deepspeed/git_version_info_installed.py +# rm -rf dist build deepspeed.egg-info +# python setup.py bdist_wheel +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt +# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0 +EOT + +# install transformers and flash-attn +RUN <> ~/.bashrc && \ + echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc && \ + /bin/bash -c "source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME}" + + +# install cutlass https://github.com/NVIDIA/cutlass +ENV DCUTLASS_NVCC_ARCHS="80;89;90;90a" +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + git clone https://github.com/NVIDIA/cutlass /opt/cutlass && \ + cd /opt/cutlass && \ + git checkout . && \ + git checkout main && \ + mkdir build && \ + cd build && \ + cmake .. 
-DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON && \ + make -j"$(nproc)" install + + +# Mellanox OFED +# ENV MLNX_OFED_VERSION=5.8-5.1.1.2 +ENV MLNX_OFED_VERSION=23.10-3.2.2.0 +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + apt-get install -y libnuma-dev libnvidia-compute-515 && \ + # apt-get install -y libnuma-dev libnvidia-compute-535 && \ + cd ${STAGE_DIR} && \ + wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - && \ + cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \ + ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \ + cd ${STAGE_DIR} && \ + rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64* + + +# nv_peer_mem +ENV NV_PEER_MEM_VERSION=1.2 +# ENV NV_PEER_MEM_VERSION=1.3 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + mkdir -p ${STAGE_DIR} && \ + git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \ + cd ${STAGE_DIR}/nv_peer_memory && \ + ./build_module.sh && \ + cd ${STAGE_DIR} && \ + tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \ + cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \ + apt-get update && \ + apt-get install -y dkms && \ + dpkg-buildpackage -us -uc && \ + dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb + + +# OPENMPI +# ENV OPENMPI_BASEVERSION=4.1 +# ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +ENV OPENMPI_BASEVERSION=5.0 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.3 +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + cd ${STAGE_DIR} && \ + wget -q -O - 
https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \ + cd openmpi-${OPENMPI_VERSION} && \ + ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \ + make -j"$(nproc)" install && \ + ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \ + # Sanity check: + test -f /usr/local/mpi/bin/mpic++ && \ + cd ${STAGE_DIR} && \ + rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} +ENV PATH=/usr/local/mpi/bin:${PATH} \ + LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +# Create a wrapper for OpenMPI to allow running as root by default +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \ + echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \ + echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \ + chmod a+x /usr/local/mpi/bin/mpirun + + +ENV PYTORCH_VERSION=2.3.0 +ENV TORCHVISION_VERSION=0.18.0 +ENV TORCHAUDIO_VERSION=2.3.0 +ENV PYTORCH_CUDA_VERSION='cu121' + +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + pip install torch==${PYTORCH_VERSION}+${PYTORCH_CUDA_VERSION} torchvision==${TORCHVISION_VERSION}+${PYTORCH_CUDA_VERSION} torchaudio==${TORCHAUDIO_VERSION}+${PYTORCH_CUDA_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${PYTORCH_CUDA_VERSION} && \ + pip install packaging && \ + pip install flash-attn + +# Install apex with CUDA and C++ extensions +# pip --version | grep -q "pip 23.1" && \ +# (pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./) || \ +# (pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --global-option="--cpp_ext" --global-option="--cuda_ext" ./) && \ +RUN \ + source 
/opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + git clone https://github.com/NVIDIA/apex /tmp/apex && \ + cd /tmp/apex && \ + pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \ + python -c "import apex.amp; print('Apex is installed and the amp module is available.')" && \ + cd / && \ + rm -rf /tmp/apex + +# RUN \ +# source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ +# git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed && \ +# cd ${STAGE_DIR}/DeepSpeed && \ +# git checkout ${DEEPSPEED_VERSION} && \ +# sed 's/pip install/python -m pip install/' install.sh > install_modified.sh && \ +# chmod +x ./install_modified.sh && \ +# if [ -n "${HOSTFILE_CONTENT}" ]; then \ +# echo "${HOSTFILE_CONTENT}" > /tmp/hostfile && \ +# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"; \ +# else \ +# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"; \ +# fi && \ +# eval $INSTALL_CMD && \ +# ds_report + +RUN \ + source /opt/conda/etc/profile.d/conda.sh && conda activate ${CONDA_ENV_NAME} && \ + pip install deepspeed transformers datasets accelerate evaluate peft timm diffusers huggingface_hub && \ + pip install regex tiktoken sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn spacy Pillow blobfile requests scipy pycocotools protobuf pyyaml ipython ipdb psutil pydantic + + +RUN \ + echo 'root:root' | chpasswd && \ + cp /etc/ssh/sshd_config /tmp/sshd_config && \ + echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config && \ + sed -i "s/#Port 22/Port 22242/" /etc/ssh/sshd_config && \ + sed -i "s/#PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ + sed -i "s/#PasswordAuthentication yes/PasswordAuthentication yes/" /etc/ssh/sshd_config && \ + sed -i "s/#PubkeyAuthentication 
yes/PubkeyAuthentication yes/" /etc/ssh/sshd_config && \ + sed -i "s/UsePAM yes/UsePAM no/" /etc/ssh/sshd_config && \ + chown root:root /etc/ssh/sshd_config && \ + mkdir -p /run/sshd && chmod 0755 /run/sshd + +# RUN \ +# bash -c 'echo -e "export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"\nexport CUTLASS_PATH=${CUTLASS_PATH}\nexport CUDA_HOME=${CUDA_HOME}\nexport PATH=${PATH}\nexport LD_LIBRARY_PATH=${LD_LIBRARY_PATH}\n" | cat - ~/.bashrc > temp && mv temp ~/.bashrc' + diff --git a/Dockerfile.ngc b/Dockerfile.ngc new file mode 100644 index 0000000..0cd410d --- /dev/null +++ b/Dockerfile.ngc @@ -0,0 +1,192 @@ +ARG REGISTRY=quay.io +ARG OWNER=jupyter +ARG LABEL=notebook +ARG VERSION +ARG BASE_CONTAINER=$REGISTRY/$OWNER/$LABEL:$VERSION +FROM $BASE_CONTAINER +ARG HTTP_PROXY +ARG HTTPS_PROXY +ENV http_proxy=${HTTP_PROXY} +ENV https_proxy=${HTTPS_PROXY} +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +WORKDIR /root +SHELL ["/bin/bash", "-c"] + +# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/ +ENV MLNX_OFED_VERSION=23.10-3.2.2.0 +ENV STAGE_DIR=/tmp +RUN < ${STAGE_DIR}/mlnxofedinstall.log 2>&1 +cd ${STAGE_DIR} +rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64* +EOT + +ARG NV_PEER_MEM_VERSION="1.2" +ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION} +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +RUN <> /root/.ssh/authorized_keys +cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys2 +chmod 600 /root/.ssh/authorized_keys +chmod 600 /root/.ssh/authorized_keys2 +mkdir /var/run/sshd +echo "root:${ROOT_PASSWD}" | chpasswd +mkdir -p ~/.pip +# install miniconda +wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh +bash /tmp/miniconda.sh -b -p /opt/conda +rm /tmp/miniconda.sh +ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh +echo "source /opt/conda/etc/profile.d/conda.sh" >> 
~/.bashrc +. /opt/conda/etc/profile.d/conda.sh +conda init bash +conda config --set show_channel_urls true +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# install pixi +curl -fsSL https://pixi.sh/install.sh | bash +EOT + +RUN <=0.17.0 +python -m pip install --no-deps git+https://github.com/huggingface/optimum.git#egg=optimum[diffusers,quality] +python -m pip install evaluate datasets +EOT + +RUN < ~/.deepspeed_env +TORCH_USE_CUDA_DSA=1 +DEEPSPEED_VERBOSE=1 +DEEPSPEED_LOG_LEVEL=DEBUG +CUTLASS_PATH=${CUTLASS_PATH} +TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} +CUDA_HOME=${CUDA_HOME} +LD_LIBRARY_PATH=${LD_LIBRARY_PATH} +EOF +unset https_proxy http_proxy +EOT + +CMD ["/usr/sbin/sshd", "-D"] \ No newline at end of file diff --git a/Dockerfile.update b/Dockerfile.update new file mode 100644 index 0000000..fa82ee7 --- /dev/null +++ b/Dockerfile.update @@ -0,0 +1,439 @@ + +# NOTE: Building this image require's docker version >= 23.0. 
+# +# For reference: +# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel +ARG CUDA_VERSION=12.1.0 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04 +ARG HTTP_PROXY +ARG HTTPS_PROXY +ENV http_proxy=${HTTP_PROXY} +ENV https_proxy=${HTTPS_PROXY} +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +ENV SSH_PORT=2222 +ENV STAGE_DIR=/tmp +WORKDIR /root +SHELL ["/bin/bash", "-c"] +# base tools +RUN <> /root/.ssh/authorized_keys +cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys2 +chmod 600 /root/.ssh/authorized_keys +chmod 600 /root/.ssh/authorized_keys2 +mkdir /var/run/sshd +echo "root:${ROOT_PASSWD}" | chpasswd +mkdir -p ~/.pip +# install miniconda +wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh +bash /tmp/miniconda.sh -b -p /opt/conda +rm /tmp/miniconda.sh +ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh +echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc +. 
/opt/conda/etc/profile.d/conda.sh +conda init bash +conda config --set show_channel_urls true +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# install pixi +curl -fsSL https://pixi.sh/install.sh | bash +EOT + +# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +# PyTorch +ARG CONDA_ENV_NAME="deepspeed" +ENV CONDA_ENV_NAME=${CONDA_ENV_NAME} +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} +ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH +ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3" +ENV REF='main' +ENV STAGE_DIR=/tmp +ARG CUDA='cu121' +ENV CUDA=${CUDA} +ARG PYTORCH_VERSION=2.3.1 +ENV PYTORCH_VERSION=${PYTORCH_VERSION} +ARG TORCHVISION_VERSION=0.18.1 +ENV TORCHVISION_VERSION=${TORCHVISION_VERSION} +ARG TORCHAUDIO_VERSION=2.3.1 +ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} +ARG PYTORCH_CUDA_VERSION=12.1 +ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} +ARG SETUPTOOLS_VERSION=69.5.1 +ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION} +ARG USE_CUDA=1 +ENV USE_CUDA=${USE_CUDA} +ARG USE_ROCM=0 +ENV USE_ROCM=${USE_ROCM} +ARG USE_XPU=0 +ENV USE_XPU=${USE_XPU} +ARG _GLIBCXX_USE_CXX11_ABI=1 +ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI} +RUN <> ~/.bashrc +conda activate ${CONDA_ENV_NAME} +python3 -m pip install --no-cache-dir --upgrade pip +python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python +conda clean -afy +git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. +python -m pip install setuptools==${SETUPTOOLS_VERSION} +python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) 
+# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) +python3 -m pip uninstall -y torch torchvision torchaudio +# # install pytorch create conda env already exists +# 直接将 PyTorch 安装指引 中的 https://download.pytorch.org/whl 替换为 https://mirror.sjtu.edu.cn/pytorch-wheels 即可。 +python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA} +python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +python3 -m pip uninstall -y transformer-engine +python3 -m pip uninstall -y torch-tensorrt +python3 -m pip uninstall -y apex +EOT + +# install apex TORCH_CUDA_ARCH_LIST all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0" +ARG TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0" +ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} +RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... +MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ +python -c "import apex.amp; print('Apex is installed and the amp module is available.')" +cd .. 
+rm -rf ${STAGE_DIR}/apex +EOT + +# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/ +ENV MLNX_OFED_VERSION=23.10-3.2.2.0 +RUN <&1 +# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail +# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile +# install deepspeed prepare +# install Mellanox OFED +mkdir -p ${STAGE_DIR} +wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - +cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 +./mlnxofedinstall --user-space-only --without-fw-update --all -q +cd ${STAGE_DIR} +rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64* +EOT + +ARG NV_PEER_MEM_VERSION="1.2" +ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION} +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +RUN < /usr/local/mpi/bin/mpirun +echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun +chmod a+x /usr/local/mpi/bin/mpirun +EOT + +# SSH daemon port inside container cannot conflict with host OS port +# ENV SSH_PORT=2222 +# RUN < ${STAGE_DIR}/sshd_config && \ +# sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config +# EOT + +# 29.78 Usage: install.sh [options...] +# 29.78 +# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in +# 29.78 hostfile (hostfile: /job/hostfile). 
If no hostfile exists, will only install locally +# 29.78 +# 29.78 [optional] +# 29.78 -l, --local_only Install only on local machine +# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) +# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) +# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels +# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) +# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) +# 29.78 -e, --examples Checkout deepspeed example submodule (no install) +# 29.78 -v, --verbose Verbose logging +# 29.78 -h, --help This help text + +RUN <> /etc/sudoers +EOT + +# install cutlass https://github.com/NVIDIA/cutlass +# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) +# A100: architecture is Ampere +# V100: architecture is Volta +# T4: architecture is Turing +# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc +# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 +# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 +# 80:适用于 NVIDIA Ampere 架构(如 A100)。 +# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 +# 89:GeForce RTX 4090 +ARG DCUTLASS_NVCC_ARCHS="80;89;90a" +ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} +RUN < install_modified.sh +chmod +x ./install_modified.sh +# 检查 HOSTFILE_CONTENT 并写入文件 +if [ -n "${HOSTFILE_CONTENT}" ]; then + echo "${HOSTFILE_CONTENT}" > /tmp/hostfile + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile" +else + INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}" +fi +eval $INSTALL_CMD +# compile deepspeed ops +ds_report +# clean up +# rm -f deepspeed/git_version_info_installed.py +# rm -rf dist build deepspeed.egg-info +# python setup.py bdist_wheel +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl +# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt +# pip 
install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0 +EOT + +# install transformers and flash-attn +RUN < ~/.deepspeed_env +NCCL_IB_DISABLE=${NCCL_IB_DISABLE} +NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME} +NCCL_DEBUG=INFO +CUTLASS_PATH=${CUTLASS_PATH} +CUDA_HOME=${CUDA_HOME} +EOF +#CUDA_VISIBLE_DEVICES=0,1,2,3 +#OMP_NUM_THREADS=8 +#MASTER_ADDR=192.168.1.1 +#MASTER_PORT=12345 +EOT + +CMD ["/usr/sbin/sshd", "-D"] diff --git a/Dockfile-colosial b/Dockfile-colosial new file mode 100644 index 0000000..0d28277 --- /dev/null +++ b/Dockfile-colosial @@ -0,0 +1,46 @@ +FROM hpcaitech/cuda-conda:12.1 + +# metainformation +LABEL org.opencontainers.image.source = "https://github.com/hpcaitech/ColossalAI" +LABEL org.opencontainers.image.licenses = "Apache License 2.0" +LABEL org.opencontainers.image.base.name = "docker.io/library/hpcaitech/cuda-conda:12.1" + +# enable passwordless ssh +RUN mkdir ~/.ssh && \ + printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config && \ + ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \ + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys + +# enable RDMA support +RUN apt-get update && \ + apt-get install -y infiniband-diags perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# install torch +RUN conda install -y python==3.10 && conda install -y pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia + +# install ninja +RUN apt-get update && \ + apt-get install -y --no-install-recommends ninja-build && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# install apex +RUN git clone https://github.com/NVIDIA/apex && \ + cd apex && \ + git checkout a7de60 && \ + pip install packaging && \ + pip install -v --disable-pip-version-check --no-cache-dir 
--no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ + +# install colossalai +ARG VERSION=main +RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \ + && cd ./ColossalAI \ + && BUILD_EXT=1 pip install -v . \ + && rm -rf colossalai + +# install tensornvme +RUN conda install -y cmake && \ + apt update -y && apt install -y libaio-dev && \ + pip install -v git+https://github.com/hpcaitech/TensorNVMe.git diff --git a/README.md b/README.md index 167e7f7..9a57dcf 100644 --- a/README.md +++ b/README.md @@ -1,93 +1,350 @@ -# nvidia_docker +## deepspeed docker image build - - -## Getting started - -To make it easy for you to get started with GitLab, here's a list of recommended next steps. - -Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)! - -## Add your files - -- [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files -- [ ] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command: - -``` -cd existing_repo -git remote add origin http://gitlab.dockless.eu.org/lingyuzeng/nvidia_docker.git -git branch -M main -git push -uf origin main +```shell +docker-compose -f docker-compose_pytorch1.13.yml build +docker-compose -f docker-compose_pytorch2.3.yml build ``` -## Integrate with your tools +## 英伟达显卡安装卸载驱动 -- [ ] [Set up project integrations](http://gitlab.dockless.eu.org/lingyuzeng/nvidia_docker/-/settings/integrations) +卸载 -## Collaborate with your team +```shell +cd /usr/local/cuda +ll +cd .. 
+cd cuda-12.3/ +ll +cd bin/ +ll +./cuda-uninstaller +cd ~ +nvidia-uninstall +sudo modprobe -r nvidia-drm nvidia-modeset nvidia-uvm nvidia +sudo rm -rf /usr/lib64/nvidia /usr/lib/nvidia +sudo apt autoremove nvidia* +sudo apt clean all +sudo dracut --force +sudo reboot +``` -- [ ] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/) -- [ ] [Create a new merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html) -- [ ] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically) -- [ ] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/) -- [ ] [Set auto-merge](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html) +安装 -## Test and Deploy +```shell +wget https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64/nvidia-fabricmanager-555_555.42.06-1_amd64.deb +dpkg -i nvidia-fabricmanager-555_555.42.06-1_amd64.deb +wget https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/cuda_12.5.1_555.42.06_linux.run +ll +sudo sh cuda_12.5.1_555.42.06_linux.run +echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc && echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc && source /root/.bashrc +nvcc -V +nvidia-smi +nvidia-smi -pm 1 +modprobe nvidia_peermem +nvidia-smi +modinfo nvidia_peermem +lsmod | grep nvidia_peermem +systemctl mask apt-daily-upgrade.service +systemctl mask apt-daily-upgrade.timer +systemctl disable apt-daily-upgrade.timer +systemctl disable apt-daily-upgrade.service +ll +wget https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64/nvidia-fabricmanager-555_555.42.06-1_amd64.deb +dpkg -i nvidia-fabricmanager-555_555.42.06-1_amd64.deb +sudo systemctl start nvidia-fabricmanager +sudo systemctl status nvidia-fabricmanager +``` -Use the 
built-in continuous integration in GitLab. +## 镜像测试命令 -- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/index.html) -- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing (SAST)](https://docs.gitlab.com/ee/user/application_security/sast/) -- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html) -- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/) -- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html) +docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all ldh/deepspeed:test +docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/deepspeed:pt23_update +docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/pytorch:24.06-py3 /bin/bash -*** +pip3 install -U xformers --index-url https://mirror.sjtu.edu.cn/pytorch-wheels +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +pip3 install -U xformers --index-url https://pypi.tuna.tsinghua.edu.cn/simple -# Editing this README -When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thanks to [makeareadme.com](https://www.makeareadme.com/) for this template. 
+```shell + 1 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers + 2 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers + 3 curl -ksSL http://120.232.240.71:8887/linux/install.sh | bash + 4 pigchacli + 5 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777 + 6 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777 + 7 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers + 8 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers + 9 python -c "from xformers import ops as xops" + 10 python -c "import apex.amp; print('Apex is installed and the amp module is available.')" + 11 env + 12 pip install git+https://github.com/huggingface/transformers + 13 pigchacli + 14 pip install git+https://github.com/huggingface/transformers + 15 pip list + 16 export STAGE_DIR=/tmp + 17 git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL + 18 cd ${STAGE_DIR}/oneCCL + 19 git checkout . + 20 git checkout master + 21 mkdir build + 22 cd build + 23 cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local + 24 make -j"$(nproc)" install + 25 ls + 26 echo ${CUDA_ARCH_LIST} + 27 git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels + 28 cd ${STAGE_DIR}/DeepSpeed-Kernels + 29 python -m pip install -v . + 30 env + 31 python -m pip install -v . 
+ 32 git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed + 33 cd ${STAGE_DIR}/DeepSpeed + 34 export DEEPSPEED_VERSION="v0.14.3" + 35 git checkout ${DEEPSPEED_VERSION} + 36 ls + 37 ./install.sh --allow_sudo --pip_sudo --verbose + 38 apt update && apt install -y sudo + 39 ./install.sh --allow_sudo --pip_sudo --verbose +``` -## Suggestions for a good README +```shell +nvidia-smi +nvcc -V +ninja --version +ds_report +python -c "import torch; print('torch:', torch.__version__, torch)" +python -c "import torch; print('CUDA available:', torch.cuda.is_available())" +python -c "import deepspeed; deepspeed.ops.op_builder.CPUAdamBuilder().load()" +python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func" +python -c "import apex.amp; print('Apex is installed and the amp module is available.')" +python -c "from xformers import ops as xops" +ibstat +ofed_info -s +mst version +mpirun --version +``` -Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information. +```shell +cat < ~/compile_deepspeed_ops.py +import deepspeed -## Name -Choose a self-explaining name for your project. 
+def compile_ops(): + builders = [ + deepspeed.ops.op_builder.AsyncIOBuilder, + deepspeed.ops.op_builder.FusedAdamBuilder, + deepspeed.ops.op_builder.CPUAdamBuilder, + deepspeed.ops.op_builder.CPUAdagradBuilder, + deepspeed.ops.op_builder.CPULionBuilder, + deepspeed.ops.op_builder.EvoformerAttnBuilder, + deepspeed.ops.op_builder.FPQuantizerBuilder, + deepspeed.ops.op_builder.FusedLambBuilder, + deepspeed.ops.op_builder.FusedLionBuilder, + deepspeed.ops.op_builder.QuantizerBuilder, + deepspeed.ops.op_builder.RaggedOpsBuilder, + deepspeed.ops.op_builder.RandomLTDBuilder, + deepspeed.ops.op_builder.SparseAttnBuilder, + deepspeed.ops.op_builder.SpatialInferenceBuilder, + deepspeed.ops.op_builder.TransformerBuilder, + deepspeed.ops.op_builder.StochasticTransformerBuilder, + ] + + for builder in builders: + print(f"Compiling {builder.__name__}") + builder().load() -## Description -Let people know what your project can do specifically. Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. If there are alternatives to your project, this is a good place to list differentiating factors. +if __name__ == "__main__": + compile_ops() +EOF +python compile_deepspeed_ops.py +``` -## Badges -On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge. +## 配置vscode的docker的插件 -## Visuals -Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method. 
+[nerdctl配置](https://blog.csdn.net/margu_168/article/details/139822555) -## Installation -Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection. -## Usage -Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README. -## Support -Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc. +```shell +cat << 'EOF' > /usr/local/bin/docker +#!/bin/bash +exec nerdctl "$@" +EOF +chmod +x /usr/local/bin/docker +``` -## Roadmap -If you have ideas for releases in the future, it is a good idea to list them in the README. +nerdctl bash自动补全 -## Contributing -State if you are open to contributions and what your requirements are for accepting them. +```shell +apt update +apt install bash-completion -y +nerdctl completion bash > /etc/bash_completion.d/nerdctl +nerdctl completion bash > /etc/bash_completion.d/docker +source /etc/bash_completion.d/nerdctl +source /etc/bash_completion.d/docker +``` -For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self. 
+## 物理机更新内核 -You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser. +```shell +uname -r # 5.4.0-144-generic +lsb_release -a +sudo apt-get update # This will update the repositories list +sudo apt-get upgrade # This will update all the necessary packages on your system +sudo apt-get dist-upgrade # This will add/remove any needed packages +reboot # You may need this since sometimes after a upgrade/dist-upgrade, there are some left over entries that get fixed after a reboot +sudo apt-get install linux-headers-$(uname -r) # This should work now +``` -## Authors and acknowledgment -Show your appreciation to those who have contributed to the project. +## test command -## License -For open source projects, say how it is licensed. +```shell +docker run -it --gpus all --name deepspeed_test --shm-size=1gb --rm hotwa/deepspeed:latest /bin/bash +``` -## Project status -If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers. 
+## [查询GPU 架构 给变量赋值](https://blog.csdn.net/zong596568821xp/article/details/106411024) + +```shell +git clone https://github.com/NVIDIA-AI-IOT/deepstream_tlt_apps.git +cd deepstream_tlt_apps/TRT-OSS/x86 +nvcc deviceQuery.cpp -o deviceQuery +./deviceQuery +``` + +H100 输出 + +```shell +(base) root@node19:~/bgpt/deepstream_tlt_apps/TRT-OSS/x86# ./deviceQuery +Detected 8 CUDA Capable device(s) + +Device 0: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 1: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 2: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 3: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 4: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 5: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 6: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +Device 7: "NVIDIA H100 80GB HBM3" + CUDA Driver Version / Runtime Version 12.4 / 10.1 + CUDA Capability Major/Minor version number: 9.0 + +``` + + +## DeepSpeed hostfile 分发 + +要手动分发 hostfile 并进行分布式安装,你需要以下几个步骤: + +1. 准备 hostfile +确保 hostfile 文件包含所有参与的主机及其配置。 + +示例 hostfile 内容: + +```plaintext +host1 slots=4 +host2 slots=4 +host3 slots=8 +``` + +2. 确保 SSH 配置正确 +确保你能够通过 SSH 无密码登录到所有主机。可以使用 ssh-keygen 和 ssh-copy-id 配置 SSH 密钥。 + +生成 SSH 密钥(如果尚未生成): + +```shell +ssh-keygen -t rsa +``` + +将 SSH 公钥复制到每个主机: + +```shell +ssh-copy-id user@host1 +ssh-copy-id user@host2 +ssh-copy-id user@host3 +``` + +3. 
创建临时目录并复制 wheel 文件 +在所有主机上创建一个临时目录,用于存放分发的 wheel 文件。 + +```shell +export PDSH_RCMD_TYPE=ssh +hosts=$(cat /path/to/your/hostfile | awk '{print $1}' | paste -sd ",") +tmp_wheel_path="/tmp/deepspeed_wheels" + +pdsh -w $hosts "mkdir -pv ${tmp_wheel_path}" +pdcp -w $hosts dist/deepspeed*.whl ${tmp_wheel_path}/ +pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/ +``` + +4. 在每个主机上安装 DeepSpeed 和依赖项 +在所有主机上安装 DeepSpeed 和所需的依赖项。 + +```shell +pdsh -w $hosts "pip install ${tmp_wheel_path}/deepspeed*.whl" +pdsh -w $hosts "pip install -r ${tmp_wheel_path}/requirements.txt" +``` + +5. 清理临时文件 +安装完成后,删除所有主机上的临时文件。 + +```shell +pdsh -w $hosts "rm -rf ${tmp_wheel_path}" +``` + +详细步骤 +确保 SSH 配置正确: + +```shell +ssh-keygen -t rsa +ssh-copy-id user@host1 +ssh-copy-id user@host2 +ssh-copy-id user@host3 +``` + +创建临时目录并复制文件: + +```shell +export PDSH_RCMD_TYPE=ssh +hosts=$(cat /path/to/your/hostfile | awk '{print $1}' | paste -sd ",") +tmp_wheel_path="/tmp/deepspeed_wheels" + +pdsh -w $hosts "mkdir -pv ${tmp_wheel_path}" +pdcp -w $hosts dist/deepspeed*.whl ${tmp_wheel_path}/ +pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/ +``` + +在所有主机上安装 DeepSpeed 和依赖项: + +```shell +pdsh -w $hosts "pip install ${tmp_wheel_path}/deepspeed*.whl" +pdsh -w $hosts "pip install -r ${tmp_wheel_path}/requirements.txt" +``` + +清理临时文件: + +```shell +pdsh -w $hosts "rm -rf ${tmp_wheel_path}" +``` + +通过这些步骤,你可以手动分发 hostfile 并在多个主机上安装 DeepSpeed 和其依赖项。这种方法确保了每个主机的环境配置一致,从而支持分布式训练或部署。 \ No newline at end of file diff --git a/accelerate-gpu-deepspeed.Dockerfile b/accelerate-gpu-deepspeed.Dockerfile new file mode 100644 index 0000000..d35fc1b --- /dev/null +++ b/accelerate-gpu-deepspeed.Dockerfile @@ -0,0 +1,46 @@ +# Builds GPU docker image of PyTorch specifically +# Uses multi-staged approach to reduce size +# Stage 1 +# Use base conda image to reduce time +FROM continuumio/miniconda3:latest AS compile-image +# Specify py version +# Note: DeepSpeed beyond v0.12.6 requires py 3.10 +ENV 
PYTHON_VERSION=3.10 +# Install apt libs +RUN apt-get update && \ + apt-get install -y curl git wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Create our conda env +RUN conda create --name accelerate python=${PYTHON_VERSION} ipython jupyter pip +# We don't install pytorch here yet since CUDA isn't available +# instead we use the direct torch wheel +ENV PATH /opt/conda/envs/accelerate/bin:$PATH +# Activate our bash shell +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] +# Activate the conda env, install mpy4pi, and install torch + accelerate +RUN source activate accelerate && conda install -c conda-forge mpi4py +RUN source activate accelerate && \ + python3 -m pip install --no-cache-dir \ + git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers,deepspeed] \ + --extra-index-url https://download.pytorch.org/whl/cu117 + +RUN python3 -m pip install --no-cache-dir bitsandbytes + +# Stage 2 +FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 AS build-image +COPY --from=compile-image /opt/conda /opt/conda +ENV PATH /opt/conda/bin:$PATH + +# Install apt libs +RUN apt-get update && \ + apt-get install -y curl git wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +RUN echo "source activate accelerate" >> ~/.profile + +# Activate the virtualenv +CMD ["/bin/bash"] \ No newline at end of file diff --git a/binbbt.tar.gz b/binbbt.tar.gz new file mode 100644 index 0000000..b878c9c Binary files /dev/null and b/binbbt.tar.gz differ diff --git a/configure_gpu.sh b/configure_gpu.sh new file mode 100755 index 0000000..2494aee --- /dev/null +++ b/configure_gpu.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# 提取GPU UUID +GPU_UUIDS=$(nvidia-smi -a | grep 'GPU UUID' | awk '{print $4}') + +# 生成node-generic-resources JSON片段 +NODE_RESOURCES=$(echo "$GPU_UUIDS" | awk '{print "\"NVIDIA-GPU=" $1 "\","}' | tr -d '\n') +NODE_RESOURCES=${NODE_RESOURCES%,} # 移除最后一个逗号 + +# 生成完整的daemon.json内容 +DAEMON_JSON=$(cat < /dev/null + +# 添加swarm-resource配置 +# 
swarm-resource = "DOCKER_RESOURCE_NVIDIAGPU" + +sudo sed -i '/^#.*swarm-resource/s/^#//' /etc/nvidia-container-runtime/config.toml +sudo sed -i '/swarm-resource =/s/=.*/= "DOCKER_RESOURCE_GPU"/' /etc/nvidia-container-runtime/config.toml + +# 重启Docker服务 +sudo systemctl restart docker.service + +# 验证配置 +docker info | grep -i 'nvidia' + +echo "GPU UUIDs have been configured and Docker has been restarted." diff --git a/deepspeed.Dockerfile b/deepspeed.Dockerfile new file mode 100644 index 0000000..fecb0c7 --- /dev/null +++ b/deepspeed.Dockerfile @@ -0,0 +1,184 @@ +FROM nvidia/cuda:12.2.2-devel-ubuntu20.04 + +ENV DEBIAN_FRONTEND noninteractive + +############################################################################## +# Temporary Installation Directory +############################################################################## +ENV STAGE_DIR=/tmp +RUN mkdir -p ${STAGE_DIR} + +############################################################################## +# Installation/Basic Utilities +############################################################################## +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + software-properties-common build-essential autotools-dev \ + nfs-common pdsh \ + cmake g++ gcc \ + curl wget vim tmux emacs less unzip \ + htop iftop iotop ca-certificates openssh-client openssh-server \ + rsync iputils-ping net-tools sudo \ + llvm-dev + +############################################################################## +# Installation Latest Git +############################################################################## +RUN add-apt-repository ppa:git-core/ppa -y && \ + apt-get update && \ + apt-get install -y git && \ + git --version + +############################################################################## +# Client Liveness & Uncomment Port 22 for SSH Daemon +############################################################################## +# Keep SSH client alive from server side +RUN echo 
"ClientAliveInterval 30" >> /etc/ssh/sshd_config +RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \ + sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config + +############################################################################## +# Mellanox OFED +############################################################################## +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 +RUN apt-get install -y libnuma-dev +RUN cd ${STAGE_DIR} && \ + wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - && \ + cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \ + ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \ + cd ${STAGE_DIR} && \ + rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* + +############################################################################## +# nv_peer_mem +############################################################################## +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 +RUN mkdir -p ${STAGE_DIR} && \ + git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \ + cd ${STAGE_DIR}/nv_peer_memory && \ + ./build_module.sh && \ + cd ${STAGE_DIR} && \ + tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \ + cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \ + apt-get update && \ + apt-get install -y dkms && \ + dpkg-buildpackage -us -uc && \ + dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb + +############################################################################## +# OPENMPI +############################################################################## +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 +RUN cd ${STAGE_DIR} && \ + wget -q -O - 
https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \ + cd openmpi-${OPENMPI_VERSION} && \ + ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \ + make -j"$(nproc)" install && \ + ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \ + # Sanity check: + test -f /usr/local/mpi/bin/mpic++ && \ + cd ${STAGE_DIR} && \ + rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} +ENV PATH=/usr/local/mpi/bin:${PATH} \ + LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} +# Create a wrapper for OpenMPI to allow running as root by default +RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \ + echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \ + echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \ + chmod a+x /usr/local/mpi/bin/mpirun + +############################################################################## +# Python +############################################################################## +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHON_VERSION=3 +RUN apt-get install -y python3 python3-dev && \ + rm -f /usr/bin/python && \ + ln -s /usr/bin/python3 /usr/bin/python && \ + curl -O https://bootstrap.pypa.io/pip/3.6/get-pip.py && \ + python get-pip.py && \ + rm get-pip.py && \ + pip install --upgrade pip && \ + # Print python an pip version + python -V && pip -V +RUN pip install pyyaml +RUN pip install ipython + +############################################################################## +# Some Packages +############################################################################## +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + libsndfile-dev \ + libcupti-dev \ + libjpeg-dev \ + libpng-dev \ + screen \ + libaio-dev +RUN pip install psutil \ + yappi \ + cffi \ + ipdb \ + pandas \ + matplotlib \ + py3nvml \ + pyarrow \ + graphviz \ + astor \ + boto3 \ + 
tqdm \ + sentencepiece \ + msgpack \ + requests \ + pandas \ + sphinx \ + sphinx_rtd_theme \ + scipy \ + numpy \ + scikit-learn \ + nvidia-ml-py3 \ + mpi4py + +############################################################################## +## SSH daemon port inside container cannot conflict with host OS port +############################################################################### +ENV SSH_PORT=2222 +RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \ + sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config + +############################################################################## +# PyTorch +############################################################################## +ENV PYTORCH_VERSION=1.13.0 +RUN pip install torch==${PYTORCH_VERSION} + +############################################################################## +# PyYAML build issue +# https://stackoverflow.com/a/53926898 +############################################################################## +RUN rm -rf /usr/lib/python3/dist-packages/yaml && \ + rm -rf /usr/lib/python3/dist-packages/PyYAML-* + +############################################################################## +## Add deepspeed user +############################################################################### +# Add a deepspeed user with user id 8877 +#RUN useradd --create-home --uid 8877 deepspeed +RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed +RUN usermod -aG sudo deepspeed +RUN echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers +# # Change to non-root privilege +USER deepspeed + +############################################################################## +# DeepSpeed +############################################################################## +RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed +RUN cd ${STAGE_DIR}/DeepSpeed && \ + git checkout . 
&& \ + git checkout master && \ + ./install.sh --pip_sudo +RUN rm -rf ${STAGE_DIR}/DeepSpeed +RUN python -c "import deepspeed; print(deepspeed.__version__)" \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..52da7ba --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,33 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + volumes: + - ./src:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + tty: true + restart: unless-stopped + image: hotwa/finetune:test + shm_size: '32gb' + ports: + - 3227:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + networks: + - network_finetune + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +networks: + network_finetune: + name: network_finetune diff --git a/docker-compose_ldh.yml b/docker-compose_ldh.yml new file mode 100644 index 0000000..ffefef1 --- /dev/null +++ b/docker-compose_ldh.yml @@ -0,0 +1,57 @@ + +services: + ldh-deepspeed-test: + build: + context: . 
+ dockerfile: Dockerfile.ldh + args: + # PYTHON_VERSION: "3.10" + # CUDA_VERSION: "12.1.0" + # PYTORCH_VERSION: "2.3.0" + # TORCHVISION_VERSION: "0.18.0" + # TORCHAUDIO_VERSION: "2.3.0" + # DS_BUILD_OPS: 1 + # USE_CUDA: 1 + # USE_ROCM: 0 + # USE_XPU: 0 + # CUDA: cu121 + # CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + # SETUPTOOLS_VERSION: "69.5.1" + # DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 + # DEEPSPEED_VERSION: "master" + # DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + HTTP_PROXY: "http://127.0.0.1:15777" + HTTPS_PROXY: "http://127.0.0.1:15777" + # cache-from: "type=local" + image: ldh/deepspeed:test + shm_size: '128gb' + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + #runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + # stdin_open: true + # tty: true + privileged: true + cap_add: + - IPC_LOCK + volumes: + - /root/workspace:/root/data + - /dev/infiniband:/dev/infiniband + # ports: + # - "22242:22242" + # - "5000:5000" + # networks: + # - ldh_overlay_network + network_mode: host + command: ["/usr/sbin/sshd", "-D"] + +# networks: +# ldh_overlay_network: +# external: true diff --git a/docker-compose_m_d.yml b/docker-compose_m_d.yml new file mode 100644 index 0000000..f82ad3f --- /dev/null +++ b/docker-compose_m_d.yml @@ -0,0 +1,35 @@ + +services: + ldh-megatron-deepspeed-test: + image: hotwa/magadeep:latest + shm_size: '128gb' + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + #runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + # stdin_open: true + # tty: true + privileged: true + cap_add: + - IPC_LOCK + volumes: + - /root/workspace:/root/data + - /dev/infiniband:/dev/infiniband + # ports: + # - "22242:22242" + # - "5000:5000" + # networks: + # - ldh_overlay_network + network_mode: 
host + command: ["/usr/sbin/sshd", "-D"] + +# networks: +# ldh_overlay_network: +# external: true diff --git a/docker-compose_mega.yml b/docker-compose_mega.yml new file mode 100644 index 0000000..adeb72a --- /dev/null +++ b/docker-compose_mega.yml @@ -0,0 +1,38 @@ + +services: + megatron-test: + image: nvcr.io/nvidia/pytorch:24.02-py3 + shm_size: '560gb' + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + #runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + #- CUTLASS_PATH="/opt/cutlass" + #- CUDA_HOME="/usr/local/cuda" + #- PATH="${CUDA_HOME}/bin:${PATH}" + #- LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}" + stdin_open: true + tty: true + privileged: true + cap_add: + - IPC_LOCK + volumes: + - /root/workspace:/mnt + - /dev/infiniband:/dev/infiniband + # - /mnt/local-nvme:/root/ + ports: + - "5000:5000" + # networks: + # - ldh_overlay_network + network_mode: host + +# networks: +# ldh_overlay_network: +# external: true diff --git a/docker-compose_nccl.yml b/docker-compose_nccl.yml new file mode 100644 index 0000000..e3ce1ad --- /dev/null +++ b/docker-compose_nccl.yml @@ -0,0 +1,28 @@ +version: '3.8' +# https://github.com/mayooot/build-nccl-tests-with-pytorch +services: + nccl-test-container: + image: mayooot/nccl-tests-with-pytorch:v0.0.2 + container_name: nccl-test-container + network_mode: host + environment: + - PORT=1998 + - PASS=P@88w0rd + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + volumes: + - ./id_rsa_finetune:/root/.ssh/id_rsa + - ./id_rsa.pub:/root/.ssh/id_rsa.pub + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + cap_add: + - IPC_LOCK + devices: + - /dev/infiniband:/dev/infiniband + shm_size: '32gb' + restart: unless-stopped diff --git a/docker-compose_ngc.yml b/docker-compose_ngc.yml new file mode 100644 index 0000000..06b55a8 --- /dev/null +++ 
b/docker-compose_ngc.yml @@ -0,0 +1,72 @@ +version: '3.9' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 专门用于Transformer模型的推理优化。 +# CPUAdam - 针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 +# StochasticTransformer - 支持随机Transformer模型的训练和推理。 + +# 检测系统总内存(以GB为单位) +# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo) +# echo "Docker Compose 文件已生成,shm_size 设置为 ${TOTAL_MEM}GB。" + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile.ngc + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + REGISTRY: "nvcr.io" + OWNER: "nvidia" # nvcr.io/nvidia/pytorch:24.06-py3 + LABEL: "pytorch" + VERSION: "24.07-py3" + DS_BUILD_OPS: 1 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + HTTP_PROXY: "http://127.0.0.1:15777" + HTTPS_PROXY: "http://127.0.0.1:15777" + CACHEBUST: 1 + # volumes: + # - ./workspace:/workspace + # - /tmp:/tmp + container_name: ubuntu-ngc + pull_policy: if_not_present + ulimits: + memlock: + soft: -1 + hard: -1 + # tty: true + # stdin_open: true + restart: unless-stopped + image: quay.io/hotwa/ngc:latest + privileged: true + ipc: host + network_mode: host + shm_size: '128gb' + # ports: + # - 3228:2222 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + # networks: + # - network_finetune + # command: ["/usr/sbin/sshd", "-D"] + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +# networks: +# network_finetune: +# name: network_finetune diff --git a/docker-compose_pytorch1.13.yml b/docker-compose_pytorch1.13.yml new file mode 100644 
index 0000000..e39c544 --- /dev/null +++ b/docker-compose_pytorch1.13.yml @@ -0,0 +1,52 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: 3.9 + CUDA_VERSION: 11.7.1 + PYTORCH_VERSION: 1.13.1 + TORCHVISION_VERSION: 0.14.1 + TORCHAUDIO_VERSION: 0.13.1 + DS_BUILD_OPS: 1 + DS_BUILD_SPARSE_ATTN: 1 + DS_BUILD_FUSED_ADAM: 1 + DS_BUILD_CPU_ADAM: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu117 + CUDA_ARCH_LIST: "80;86" # for RTX 4090, all : "80;86;89;90" 编译deepspeed内核需要,这个参数很严格 + SETUPTOOLS_VERSION: "69.5.1" + ROOT_PASSWD: "root" + DCUTLASS_NVCC_ARCHS: "90a" # 90a for H100 ,89:GeForce RTX 4090 + volumes: + - ./src:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + tty: true + restart: unless-stopped + image: hotwa/deepspeed:pt113 + shm_size: '32gb' + ports: + - 3227:2222 + command: ["/usr/sbin/sshd", "-D"] + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + networks: + - network_finetune + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +networks: + network_finetune: + name: network_finetune diff --git a/docker-compose_pytorch2.3.yml b/docker-compose_pytorch2.3.yml new file mode 100644 index 0000000..4390e55 --- /dev/null +++ b/docker-compose_pytorch2.3.yml @@ -0,0 +1,65 @@ +version: '3.8' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 专门用于Transformer模型的推理优化。 +# CPUAdam - 针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 +# StochasticTransformer - 支持随机Transformer模型的训练和推理。 + +services: + 
ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + HTTP_PROXY: "http://127.0.0.1:15777" + HTTPS_PROXY: "http://127.0.0.1:15777" + volumes: + - ./src:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + # tty: true + restart: unless-stopped + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - network_finetune + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +networks: + network_finetune: + name: network_finetune diff --git a/docker-compose_pytorch2.34060.yml b/docker-compose_pytorch2.34060.yml new file mode 100644 index 0000000..52d11be --- /dev/null +++ b/docker-compose_pytorch2.34060.yml @@ -0,0 +1,63 @@ +version: '3.8' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 专门用于Transformer模型的推理优化。 +# CPUAdam - 针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 +# StochasticTransformer - 支持随机Transformer模型的训练和推理。 + +services: + ubuntu-finetune: + build: + context: . 
+ dockerfile: Dockerfile + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90" # 90a for H100 GPU 89:GeForce RTX 4090 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - ./src:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + # tty: true + restart: unless-stopped + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - network_finetune + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +networks: + network_finetune: + name: network_finetune diff --git a/docker-compose_pytorch2.3_device.yml b/docker-compose_pytorch2.3_device.yml new file mode 100644 index 0000000..b9752d6 --- /dev/null +++ b/docker-compose_pytorch2.3_device.yml @@ -0,0 +1,71 @@ +version: '3.8' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 专门用于Transformer模型的推理优化。 +# CPUAdam - 针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 +# StochasticTransformer - 支持随机Transformer模型的训练和推理。 + +services: + ubuntu-finetune: + build: + context: . 
+ dockerfile: Dockerfile + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - ./src:/bbtft + - ./id_rsa_finetune:/root/.ssh/id_rsa + - ./id_rsa.pub:/root/.ssh/id_rsa.pub + container_name: ubuntu-finetune + pull_policy: if_not_present + # tty: true + restart: unless-stopped + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - my-custom-bridge + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + cap_add: + - IPC_LOCK + devices: + - /dev/infiniband:/dev/infiniband +# docker swarm init +# docker swarm join-token manager +# docker network create -d overlay --subnet=192.168.200.0/24 my-overlay-network +networks: + my-custom-bridge: + external: true diff --git a/docker-compose_stack.yml b/docker-compose_stack.yml new file mode 100644 index 0000000..ef9aa2b --- /dev/null +++ b/docker-compose_stack.yml @@ -0,0 +1,58 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . 
+ dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - ./src:/bbtft + - ./id_rsa_finetune:/root/.ssh/id_rsa + - ./id_rsa.pub:/root/.ssh/id_rsa.pub + container_name: ubuntu-finetune + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - my-custom-bridge + deploy: + replicas: 2 + resources: + reservations: + generic_resources: + - discrete_resource_spec: + kind: "NVIDIA-GPU" + value: 1 + placement: + constraints: [node.platform.os == linux] + cap_add: + - IPC_LOCK + devices: + - /dev/infiniband:/dev/infiniband + +networks: + my-custom-bridge: + external: true + +# docker stack deploy -c docker-compose_stack.yml rdma_stack diff --git a/docker-compose_stack1.yml b/docker-compose_stack1.yml new file mode 100644 index 0000000..7698b4a --- /dev/null +++ b/docker-compose_stack1.yml @@ -0,0 +1,37 @@ +version: '3.8' + +services: + ubuntu-finetune: + image: hotwa/deepspeed:pt23 + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + deploy: + replicas: 1 + resources: + reservations: + generic_resources: + - discrete_resource_spec: + kind: "NVIDIA-GPU" + value: 1 + placement: + constraints: + - node.labels.gpu == true + cap_add: + - IPC_LOCK + +networks: + default: + driver: overlay + +# 为节点添加标签: +# docker node ls + + +# docker node update --label-add gpu=true node1 + +# docker stack deploy -c docker-compose.yml rdma_stack + diff --git a/docker-compose_stack2.yml 
b/docker-compose_stack2.yml new file mode 100644 index 0000000..89c357c --- /dev/null +++ b/docker-compose_stack2.yml @@ -0,0 +1,62 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . + dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - type: tmpfs + target: /dev/shm + tmpfs: + size: 32000000000 # 32GB + # - ./src:/bbtft + # - ./id_rsa_finetune:/root/.ssh/id_rsa + # - ./id_rsa.pub:/root/.ssh/id_rsa.pub + # container_name: ubuntu-finetune + image: hotwa/deepspeed:pt23 + shm_size: '32gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + # networks: + # - my-custom-bridge + deploy: + replicas: 4 + resources: + reservations: + generic_resources: + - discrete_resource_spec: + kind: "NVIDIA-GPU" + value: 8 + - discrete_resource_spec: + kind: "SRIOV-VF" + value: 1 + placement: + constraints: [node.labels.gpu == true] + cap_add: + - IPC_LOCK + privileged: true + +# networks: +# my-custom-bridge: +# external: true diff --git a/docker-compose_swarm.yml b/docker-compose_swarm.yml new file mode 100644 index 0000000..45d9300 --- /dev/null +++ b/docker-compose_swarm.yml @@ -0,0 +1,50 @@ +version: '3.8' + +services: + ubuntu-finetune: + build: + context: . 
+ dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + volumes: + - ./binbbt:/bbtft + container_name: ubuntu-finetune + pull_policy: if_not_present + restart: unless-stopped + image: hotwa/deepspeed:pt23 + shm_size: '40gb' + ports: + - 3228:22 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + networks: + - test-net + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +# 修改为docker-swarm的网络 +networks: + test-net: + external: true diff --git a/docker-compose_update.yml b/docker-compose_update.yml new file mode 100644 index 0000000..64aa0ff --- /dev/null +++ b/docker-compose_update.yml @@ -0,0 +1,81 @@ +version: '3.8' + +# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: + +# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 +# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 +# SparseAttention - 用于高效计算稀疏注意力机制。 +# Transformer - 提供Transformer模型的高效实现。 +# TransformerInference - 专门用于Transformer模型的推理优化。 +# CPUAdam - 针对CPU优化的Adam优化器。 +# CPULion - 针对CPU的Lion优化器。 +# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 +# RandomLTD - 用于随机层裁剪的优化器。 +# StochasticTransformer - 支持随机Transformer模型的训练和推理。 +# 检测系统总内存(以GB为单位) +# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo) +# echo "Docker Compose 文件已生成,shm_size 设置为 ${TOTAL_MEM}GB。" + +services: + ubuntu-finetune: + build: + context: . 
+ dockerfile: Dockerfile.update + args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 + PYTHON_VERSION: "3.10" + NV_PEER_MEM_VERSION: "1.2" + CUDA_VERSION: "12.1.0" + PYTORCH_VERSION: "2.3.0" + TORCHVISION_VERSION: "0.18.0" + TORCHAUDIO_VERSION: "2.3.0" + DS_BUILD_OPS: 1 + USE_CUDA: 1 + USE_ROCM: 0 + USE_XPU: 0 + CUDA: cu121 + CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" + TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0+PTX" # all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0" + SETUPTOOLS_VERSION: "69.5.1" + DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 + DEEPSPEED_VERSION: "master" + DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" + # HTTP_PROXY: "http://127.0.0.1:15777" + # HTTPS_PROXY: "http://127.0.0.1:15777" + CACHEBUST: 1 + volumes: + - ./src:/bbtft + # - /tmp:/tmp + container_name: ubuntu-finetune + pull_policy: if_not_present + ulimits: + memlock: + soft: -1 + hard: -1 + # tty: true + # stdin_open: true + restart: unless-stopped + image: hotwa/deepspeed:pt23_update + privileged: true + ipc: host + network_mode: host + shm_size: '128gb' + # ports: + # - 3228:2222 + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=compute,utility + - TMPDIR=/var/tmp + # networks: + # - network_finetune + # command: ["/usr/sbin/sshd", "-D"] + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + +# networks: +# network_finetune: +# name: network_finetune diff --git a/hostfile b/hostfile new file mode 100644 index 0000000..4046630 --- /dev/null +++ b/hostfile @@ -0,0 +1,3 @@ +host1 slots=4 +host2 slots=4 +host3 slots=8 diff --git a/id_rsa.pub b/id_rsa.pub new file mode 100644 index 0000000..abfe5ea --- /dev/null +++ b/id_rsa.pub @@ -0,0 +1 @@ +ssh-rsa 
AAAAB3NzaC1yc2EAAAADAQABAAACAQC1CQs1rWF7KFg5SKeNHm3EGLEx8pgegdy2voQMAEInOTjeIoWpcXk7R65NLGG6k1J10f5GYg3A0XxmNf/7nUWn0T/D31dwcFvP5BAIpJl8IMDkFj36SoNKTX5XIhbCet7sJgsLY4yKlOVahVNK+La9nbLDEd7GGNzBVUpccc2uXDJul+r1QSoXssV5Q7QBa17Sf2en6swXrtjyPz4W+Tg7/ANzF3P9y9roIcdlAm/jZb0gMLFsteyt+ThqrP3+hSgFrOlJNgEL5qkOG0dI5rHpjeJnBzPAA1FLAQFhdtSrL+Cd9INSvV0lNwAROl5FpSMVmE7UzeeUy70cqw5b7ReJsEpHDbpd6rUEwC09mJlSaHQ9ApKbCD0u9aXeuTlbgHqcs2JDZTLT7Yf+JxO7yVc2QaJ3iiLkVTyiXhby5YWO++lBvhXX+zMLsUvIXD6MMBeyC0Azjb41qguhJvV8H9wI+2nBZEcgSB2vhYM+/rdDw5+v3WqgGsUqpf1GLTeWP8oTxJDrDM20crW3bcEoEFlMZRpVOnWFBIniU8T1TLxP92lElWTkX+eptJVffoPxRvSPLgaNN2toY9K1MVcQ8+ckJJ6te7sjXlOupJDpNH+tshYlMsUfi1FrsRhGT0yHZtDZ3YibZ0l/8AGUWvnNC/pFqtqBLaAsfll5jsqt06pp7Q== docker@example.com diff --git a/id_rsa_finetune b/id_rsa_finetune new file mode 100644 index 0000000..9d0e0c9 --- /dev/null +++ b/id_rsa_finetune @@ -0,0 +1,49 @@ +-----BEGIN OPENSSH PRIVATE KEY----- +b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAACFwAAAAdzc2gtcn +NhAAAAAwEAAQAAAgEAtQkLNa1heyhYOUinjR5txBixMfKYHoHctr6EDABCJzk43iKFqXF5 +O0euTSxhupNSddH+RmINwNF8ZjX/+51Fp9E/w99XcHBbz+QQCKSZfCDA5BY9+kqDSk1+Vy +IWwnre7CYLC2OMipTlWoVTSvi2vZ2ywxHexhjcwVVKXHHNrlwybpfq9UEqF7LFeUO0AWte +0n9np+rMF67Y8j8+Fvk4O/wDcxdz/cva6CHHZQJv42W9IDCxbLXsrfk4aqz9/oUoBazpST +YBC+apDhtHSOax6Y3iZwczwANRSwEBYXbUqy/gnfSDUr1dJTcAETpeRaUjFZhO1M3nlMu9 +HKsOW+0XibBKRw26Xeq1BMAtPZiZUmh0PQKSmwg9LvWl3rk5W4B6nLNiQ2Uy0+2H/icTu8 +lXNkGid4oi5FU8ol4W8uWFjvvpQb4V1/szC7FLyFw+jDAXsgtAM42+NaoLoSb1fB/cCPtp +wWRHIEgdr4WDPv63Q8Ofr91qoBrFKqX9Ri03lj/KE8SQ6wzNtHK1t23BKBBZTGUaVTp1hQ +SJ4lPE9Uy8T/dpRJVk5F/nqbSVX36D8Ub0jy4GjTdraGPStTFXEPPnJCSerXu7I15TrqSQ +6TR/rbIWJTLFH4tRa7EYRk9Mh2bQ2d2Im2dJf/ABlFr5zQv6RaragS2gLH5ZeY7KrdOqae +0AAAdIJh5TtyYeU7cAAAAHc3NoLXJzYQAAAgEAtQkLNa1heyhYOUinjR5txBixMfKYHoHc +tr6EDABCJzk43iKFqXF5O0euTSxhupNSddH+RmINwNF8ZjX/+51Fp9E/w99XcHBbz+QQCK +SZfCDA5BY9+kqDSk1+VyIWwnre7CYLC2OMipTlWoVTSvi2vZ2ywxHexhjcwVVKXHHNrlwy +bpfq9UEqF7LFeUO0AWte0n9np+rMF67Y8j8+Fvk4O/wDcxdz/cva6CHHZQJv42W9IDCxbL 
+Xsrfk4aqz9/oUoBazpSTYBC+apDhtHSOax6Y3iZwczwANRSwEBYXbUqy/gnfSDUr1dJTcA +ETpeRaUjFZhO1M3nlMu9HKsOW+0XibBKRw26Xeq1BMAtPZiZUmh0PQKSmwg9LvWl3rk5W4 +B6nLNiQ2Uy0+2H/icTu8lXNkGid4oi5FU8ol4W8uWFjvvpQb4V1/szC7FLyFw+jDAXsgtA +M42+NaoLoSb1fB/cCPtpwWRHIEgdr4WDPv63Q8Ofr91qoBrFKqX9Ri03lj/KE8SQ6wzNtH +K1t23BKBBZTGUaVTp1hQSJ4lPE9Uy8T/dpRJVk5F/nqbSVX36D8Ub0jy4GjTdraGPStTFX +EPPnJCSerXu7I15TrqSQ6TR/rbIWJTLFH4tRa7EYRk9Mh2bQ2d2Im2dJf/ABlFr5zQv6Ra +ragS2gLH5ZeY7KrdOqae0AAAADAQABAAACAANNbXXIduH4PT8aDGQy41I4+6VplUKKUjKd +HLZF431FaG4jZAaJXOqKyMsDqhxmEDYOZuyY7u12EUn20Slhd+Pokm4S/qHSRDrxbparG5 +Jy+GZH4l5GlPq20nXw9CvyHHnG2HECqVvPRCZgqxbW8mI8S6MOZol83DsvMjVEWBZjJuXP +vl8ZztugbNMPkU8z3/hrj2Xglf56DPuYUXjIF83UGlUBu4wzYh1Hcunsm/wUN9mIVzLnkQ +WYcJOqtpnH4JA41HktnlP9qqwaguYVzURxaQXB2CCGRhRlDVQI6m+kdPltkd8ocR8T3hSy +X9tg/61fwVNHMxSY8IkGUXqn39IZuwtIOflybXc1w3VQBwGuI2UF/U/5wmIJdQimsDPzhX +o5uENWiL5Lei5sxxUmnZw78xoXHino1LNceBKhQHrKS8R36QsdK7+INbiW3Tt2TmCyH563 +UH7dgS2moTrtiXh+gPk32okTnwquRWHJ7uurxgmnncoTEdmkcTCeXv7B1CBdH9WGyCtyV1 +oKK+qNEXCrLaIOD49zF2qPUmxUOuGzcBKgavXDSPmj5jB/4k3ipsjlRX12l8xCEycKLHG3 +6LuP4jgoalNtjJGJozpya4/tsOhE6jEB74xIXUuCUlBo9Q8xmHYnv2/8jdSdR6rx6N9odw +XMYjVcs63rLZMKsljhAAABADaOQoVNSfTbhyG9wJN7+XyeXHBkfMKg4kGNYB28l4mbB1eR +8i/cZPvIDrcz1FjvYQXEWmK+XS9QVMz0EGLse5JIYhXUFtZin8VVqttIBZLXhw0nImD9sK +HlbxlKj+Savlx+oZDDxAGNMDGGhbc7uuWgX1O1Bsr5sQR+neTV91iLcMWB0XfP8CK70uXf +l7NQ88RaWn79JggKKuqVs1THhlfeMlBJ0RsUdRw69gs228++btif2bxCoY0IH3mCsmaux9 +JNI8bqZ5yws4XE7l0jaOnVFQywHP/4FCjZ2MQONhG10vYWpRjXpEEf1hXN6xDKWC5t50d0 +o79xP/Vp4Nk7pFsAAAEBAPeE7OCOS96fAz8hBI+4CXVjKzy39slPgsi64hMZYtUgY9fZ/k +5L/n831+Do7Yrrng/1pUzrHvaiip4XP2WcPmz9y2PYhi5RZzzffAmCudDVGP5ftoUcAtrj +cVzP4kmeRPP1kTsP3M3fNphrfgPkpGD1TFRxxT5wwVnsiRzQ5c1ykX8jn2xd8QpUoSdK0Y +SetryzmRf+OlDJyQljoNZ76wu5GjsejMjtIKO9oua5avgJhLKpyfAVTz2QZBBxrCUrp2+P +iM+/f4tXqF45eCFjGqiyFvKUCD1VHp5Oup4rQIi4PgD1H/MdT5XmZNeFqxwo+/2QzwIAp9 +AKQqQ/KX+7YWEAAAEBALs89zawDSGbZemROsreRDapohYnHiSAZvGzjqaevjF0oFkLpRr5 
+/9jcRZf4QBDTZah5y8ATNs6KECvmRQ0mMkDSI2FSOM1bZ2yndxbtmM9kaAmpqdrRVBChVX +nopPfQ8dQ2RkPzp5YIL1QvAQbaP+B+lB8sZVtEK1OnxwCOCcVukGpkw2cE7aGDITDi1Mqg +Obj3sxHjQ+ysMZ1lOrKadpDQZXFpgp6MFVrNVlpv2QanbMGTB9GPynvCHGf5KJvKnot7L/ +rjTd2Da5SII3Mx9d6YAkYQpJkguNkJ2Q05+7PvyNNmj+Nk3ZwgqFA+3edc2exXMf9FzdmJ +iJcbS3QheA0AAAASZG9ja2VyQGV4YW1wbGUuY29tAQ== +-----END OPENSSH PRIVATE KEY----- diff --git a/peft-gpu-bnb-multi-source.Dockerfile b/peft-gpu-bnb-multi-source.Dockerfile new file mode 100644 index 0000000..2c839c4 --- /dev/null +++ b/peft-gpu-bnb-multi-source.Dockerfile @@ -0,0 +1,68 @@ +# Builds GPU docker image of PyTorch +# Uses multi-staged approach to reduce size +# Stage 1 +# Use base conda image to reduce time +FROM continuumio/miniconda3:latest AS compile-image +# Specify py version +ENV PYTHON_VERSION=3.8 +# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +RUN apt-get update && \ + apt-get install -y curl git wget software-properties-common git-lfs && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Install audio-related libraries +RUN apt-get update && \ + apt install -y ffmpeg + +RUN apt install -y libsndfile1-dev +RUN git lfs install + +# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +RUN conda create --name peft python=${PYTHON_VERSION} ipython jupyter pip +RUN python3 -m pip install --no-cache-dir --upgrade pip + +# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +# We don't install pytorch here yet since CUDA isn't available +# instead we use the direct torch wheel +ENV PATH /opt/conda/envs/peft/bin:$PATH +# Activate our bash shell +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] + +# Stage 2 +FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS build-image +COPY --from=compile-image /opt/conda /opt/conda +ENV PATH /opt/conda/bin:$PATH + +RUN chsh -s /bin/bash +SHELL 
["/bin/bash", "-c"] + +# Install apt libs +RUN apt-get update && \ + apt-get install -y curl git wget cmake && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Activate the conda env and install transformers + accelerate from source +# Also clone BNB and build it from source. +RUN source activate peft && \ + python3 -m pip install -U --no-cache-dir \ + librosa \ + "soundfile>=0.12.1" \ + scipy \ + git+https://github.com/huggingface/transformers \ + git+https://github.com/huggingface/accelerate \ + peft[test]@git+https://github.com/huggingface/peft \ + optimum \ + auto-gptq && \ + git clone https://github.com/TimDettmers/bitsandbytes && cd bitsandbytes && git checkout multi-backend-refactor && \ + cmake -B . -DCOMPUTE_BACKEND=cuda -S . && \ + cmake --build . && \ + pip install -e . && \ + pip freeze | grep bitsandbytes + +RUN echo "source activate peft" >> ~/.profile + +# Activate the virtualenv +CMD ["/bin/bash"] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..cb14ebf --- /dev/null +++ b/requirements.txt @@ -0,0 +1,37 @@ +torch +torchvision +torchaudio +pydantic +transformers +datasets +accelerate +evaluate +peft +deepspeed +tiktoken +sentencepiece +tqdm +nltk +matplotlib +seaborn +numpy +pandas +scikit-learn +diffusers +huggingface_hub +spacy +Pillow +blobfile +requests +scipy +pycocotools +protobuf +timm +pyyaml +ipython +xformers +opencv-contrib-python +open_clip_torch +flash-attn +packaging +psutil diff --git a/setup_ssh.sh b/setup_ssh.sh new file mode 100644 index 0000000..514d283 --- /dev/null +++ b/setup_ssh.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# 定义主机列表 +hosts=("10.200.1.10" "10.200.1.11" "10.200.1.12") + +# 当前主机的用户名 +user="root" + +# 检查ssh-keygen是否已经生成密钥对 +if [ ! -f ~/.ssh/id_rsa ]; then + echo "生成SSH密钥对..." + ssh-keygen -t rsa -b 4096 -N "" -f ~/.ssh/id_rsa +else + echo "SSH密钥对已经存在..."
+fi + +# 分发公钥到其他主机 +for host in "${hosts[@]}"; do + if [ "$host" != "$(hostname -I | awk '{print $1}')" ]; then + echo "将公钥复制到$host..." + ssh-copy-id -i ~/.ssh/id_rsa.pub "$user@$host" + fi +done + +echo "密钥认证配置完成。" \ No newline at end of file diff --git a/test.txt b/test.txt new file mode 100644 index 0000000..48b0982 --- /dev/null +++ b/test.txt @@ -0,0 +1,182 @@ +absl-py==2.1.0 +accelerate @ git+https://github.com/huggingface/accelerate@1f7a79b428749f45187ec69485f2c966fe21926e +aiohttp==3.9.5 +aiosignal==1.3.1 +alabaster==0.7.16 +alembic==1.13.1 +annotated-types==0.7.0 +arrow==1.3.0 +astor==0.8.1 +asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1698341106958/work +async-timeout==4.0.3 +attrs==23.2.0 +Babel==2.15.0 +beautifulsoup4==4.12.3 +binaryornot==0.4.4 +boto3==1.34.129 +botocore==1.34.129 +certifi==2024.6.2 +cffi==1.16.0 +chardet==5.2.0 +charset-normalizer==3.3.2 +click==8.1.7 +cmake==3.29.5.1 +colorlog==6.8.2 +contourpy==1.2.1 +cookiecutter==1.7.3 +cycler==0.12.1 +datasets==2.20.0 +decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1641555617451/work +deepspeed @ file:///tmp/DeepSpeed/dist/deepspeed-0.14.4%2B0c979d67-cp310-cp310-linux_x86_64.whl#sha256=3990df7f730604f29f51d6e5aa83ec09da6a4ea584504d27dc2d0fad7b8a4582 +deepspeed-kernels @ file:///tmp/DeepSpeed-Kernels +dill==0.3.4 +docutils==0.20.1 +einops==0.8.0 +evaluate==0.4.2 +exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1704921103267/work +execnet==2.1.1 +executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1698579936712/work +faiss-cpu==1.8.0 +filelock==3.15.3 +flash-attn==2.5.9.post1 +fonttools==4.53.0 +frozenlist==1.4.1 +fsspec==2024.6.0 +ftfy==6.2.0 +gitdb==4.0.11 +GitPython==3.1.18 +graphviz==0.20.3 +greenlet==3.0.3 +grpcio==1.64.1 +hjson==3.1.0 +huggingface-hub==0.23.4 +idna==3.7 +imagesize==1.4.1 +iniconfig==2.0.0 +ipdb==0.13.13 +ipython @ 
file:///home/conda/feedstock_root/build_artifacts/ipython_1717182742060/work +jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1696326070614/work +Jinja2==3.1.4 +jinja2-time==0.2.0 +jmespath==1.0.1 +joblib==1.4.2 +kiwisolver==1.4.5 +Mako==1.3.5 +Markdown==3.6 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matplotlib==3.9.0 +matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1713250518406/work +mdurl==0.1.2 +mpi4py @ https://github.com/mpi4py/mpi4py/tarball/master#sha256=e9d1ce01a4c5f95c704743ed13a2d90517dcafdfcde40e050903d583e9ca1260 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +multiprocess==0.70.12.2 +networkx==3.3 +ninja==1.11.1.1 +nltk==3.8.1 +numpy==2.0.0 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-ml-py==12.555.43 +nvidia-ml-py3==7.352.0 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.5.40 +nvidia-nvtx-cu12==12.1.105 +open-clip-torch==2.24.0 +opencv-contrib-python==4.10.0.84 +optuna==3.6.1 +packaging==24.1 +pandas==2.2.2 +parameterized==0.9.0 +parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1712320355065/work +pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1706113125309/work +pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work +pillow==10.3.0 +pluggy==1.5.0 +portalocker==2.0.0 +poyo==0.5.0 +prompt_toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1718047967974/work +protobuf==4.25.3 +psutil==6.0.0 +ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1609419310487/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl +pure-eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1642875951954/work +py-cpuinfo==9.0.0 
+py3nvml==0.2.7 +pyarrow==16.1.0 +pyarrow-hotfix==0.6 +pycparser==2.22 +pydantic==1.10.16 +pydantic_core==2.18.4 +Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1714846767233/work +pyparsing==3.1.2 +pytest==7.4.4 +pytest-rich==0.1.1 +pytest-timeout==2.3.1 +pytest-xdist==3.6.1 +python-dateutil==2.9.0.post0 +python-slugify==8.0.4 +pytz==2024.1 +PyYAML @ file:///home/conda/feedstock_root/build_artifacts/pyyaml_1695373428874/work +regex==2024.5.15 +requests==2.32.3 +rich==13.7.1 +rjieba==0.1.11 +rouge_score==0.1.2 +ruff==0.4.4 +s3transfer==0.10.1 +sacrebleu==1.5.1 +sacremoses==0.1.1 +safetensors==0.4.3 +scikit-learn==1.5.0 +scipy==1.13.1 +sentencepiece==0.2.0 +six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work +smmap==5.0.1 +snowballstemmer==2.2.0 +soupsieve==2.5 +Sphinx==7.3.7 +sphinx-rtd-theme==2.0.0 +sphinxcontrib-applehelp==1.0.8 +sphinxcontrib-devhelp==1.0.6 +sphinxcontrib-htmlhelp==2.0.5 +sphinxcontrib-jquery==4.1 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.7 +sphinxcontrib-serializinghtml==1.1.10 +SQLAlchemy==2.0.31 +stack-data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1669632077133/work +sympy==1.12.1 +tensorboard==2.17.0 +tensorboard-data-server==0.7.2 +text-unidecode==1.3 +threadpoolctl==3.5.0 +timeout-decorator==0.5.0 +timm==1.0.7 +tokenizers==0.19.1 +tomli==2.0.1 +torch==2.3.1 +torchaudio==0.13.1+cu117 +torchvision==0.14.1+cu117 +tqdm==4.66.4 +traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1713535121073/work +transformers @ file:///root/ninja/transformers +triton==2.3.1 +types-python-dateutil==2.9.0.20240316 +typing_extensions==4.12.2 +tzdata==2024.1 +urllib3==2.2.2 +wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1704731205417/work +Werkzeug==3.0.3 +xmltodict==0.13.0 +xxhash==3.4.1 +yappi==1.6.0 +yarl==1.9.4 \ No newline at end of file diff --git a/transformer.Dockerfile b/transformer.Dockerfile new file mode 100644 index 
0000000..e38170e --- /dev/null +++ b/transformer.Dockerfile @@ -0,0 +1,70 @@ +FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 +LABEL maintainer="Hugging Face" + +ARG DEBIAN_FRONTEND=noninteractive + +# Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands) +SHELL ["sh", "-lc"] + +# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant +# to be used as arguments for docker build (so far). + +ARG PYTORCH='2.3.0' +# (not always a valid torch version) +ARG INTEL_TORCH_EXT='2.3.0' +# Example: `cu102`, `cu113`, etc. +ARG CUDA='cu121' + +RUN apt update +RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs +RUN git lfs install +RUN python3 -m pip install --no-cache-dir --upgrade pip + +ARG REF=main +RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF + +# 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future. +# 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`. +# Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions). 
+RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 tensorflow_text tensorflow_probability && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA + +RUN python3 -m pip uninstall -y flax jax + +RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT -f https://developer.intel.com/ipex-whl-stable-cpu + +RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract +RUN python3 -m pip install -U "itsdangerous<2.1.0" + +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft + +# For bettertransformer +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum + +# For video model testing +RUN python3 -m pip install --no-cache-dir decord av==9.2.0 + +# Some slow tests require bnb +RUN python3 -m pip install --no-cache-dir bitsandbytes + +# Some tests require quanto +RUN python3 -m pip install --no-cache-dir quanto + +# `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests +# (`deformable_detr`, `rwkv`, `mra`) +RUN python3 -m pip uninstall -y ninja + +# For `dinat` model +# The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent) +RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f 
https://shi-labs.com/natten/wheels + +# For `nougat` tokenizer +RUN python3 -m pip install --no-cache-dir python-Levenshtein + +# For `FastSpeech2ConformerTokenizer` tokenizer +RUN python3 -m pip install --no-cache-dir g2p-en + +# When installing in editable mode, `transformers` is not recognized as a package. +# this line must be added in order for python to be aware of transformers. +RUN cd transformers && python3 setup.py develop \ No newline at end of file diff --git a/update_sriov_vf.sh b/update_sriov_vf.sh new file mode 100755 index 0000000..677fe93 --- /dev/null +++ b/update_sriov_vf.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# 提取 Port GUID 并格式化为 SRIOV-VF 配置 +generate_sriov_vf_config() { + GUIDS=($(ibstat | grep "Port GUID" | awk '{print $3}')) + for i in "${!GUIDS[@]}"; do + echo "SRIOV-VF=${GUIDS[$i]}" + done +} + +# 更新 Docker 配置文件 +update_docker_config() { + local GUIDS=("$@") + local DAEMON_JSON="/etc/docker/daemon.json" + local TMP_JSON="/tmp/daemon.json" + + if [ ! -f "$DAEMON_JSON" ]; then + echo "$DAEMON_JSON 文件不存在" + exit 1 + fi + + local NODE_GENERIC_RESOURCES=$(jq -c '.["node-generic-resources"]' "$DAEMON_JSON") + + if [ "$NODE_GENERIC_RESOURCES" == "null" ]; then + NODE_GENERIC_RESOURCES="[]" + fi + + for GUID in "${GUIDS[@]}"; do + if [[ ! $NODE_GENERIC_RESOURCES == *"$GUID"* ]]; then + NODE_GENERIC_RESOURCES=$(echo "$NODE_GENERIC_RESOURCES" | jq --arg vf "$GUID" '. += [$vf]') + fi + done + + jq '.["node-generic-resources"] = '"$NODE_GENERIC_RESOURCES" "$DAEMON_JSON" > "$TMP_JSON" + mv "$TMP_JSON" "$DAEMON_JSON" +} + +# 主函数 +main() { + if [[ $EUID -ne 0 ]]; then + echo "此脚本必须以 root 用户运行" + exit 1 + fi + + GUIDS=($(generate_sriov_vf_config)) + if [ ${#GUIDS[@]} -eq 0 ]; then + echo "未找到 SR-IOV VF 设备" + exit 1 + fi + + update_docker_config "${GUIDS[@]}" + echo "成功更新 $DAEMON_JSON 文件" + systemctl restart docker +} + +main "$@"