From 2e67dfea5054878fd8fe8e66726efe3d667b75a5 Mon Sep 17 00:00:00 2001
From: lingyuzeng <pylyzeng@gmail.com>
Date: Wed, 17 Jul 2024 13:03:27 +0800
Subject: [PATCH 1/3] add install nvidia driver command

---
 finetune/README.md | 47 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/finetune/README.md b/finetune/README.md
index c83161f..9a57dcf 100644
--- a/finetune/README.md
+++ b/finetune/README.md
@@ -59,7 +59,54 @@ sudo systemctl status nvidia-fabricmanager
 
 docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all ldh/deepspeed:test
 docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/deepspeed:pt23_update
+docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/pytorch:24.06-py3 /bin/bash
 
+pip3 install -U xformers --index-url https://mirror.sjtu.edu.cn/pytorch-wheels
+pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+pip3 install -U xformers --index-url https://pypi.tuna.tsinghua.edu.cn/simple
+
+
+```shell
+    1  pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
+    2  pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers
+    3  curl -ksSL http://120.232.240.71:8887/linux/install.sh | bash
+    4  pigchacli 
+    5  export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777
+    6  export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777
+    7  pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers
+    8  pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
+    9  python -c "from xformers import ops as xops"
+   10  python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
+   11  env
+   12  pip install git+https://github.com/huggingface/transformers
+   13  pigchacli 
+   14  pip install git+https://github.com/huggingface/transformers
+   15  pip list
+   16  export STAGE_DIR=/tmp
+   17  git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
+   18  cd ${STAGE_DIR}/oneCCL
+   19  git checkout . 
+   20  git checkout master
+   21  mkdir build
+   22  cd build 
+   23  cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
+   24  make -j"$(nproc)" install
+   25  ls
+   26  echo ${CUDA_ARCH_LIST}
+   27  git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
+   28  cd ${STAGE_DIR}/DeepSpeed-Kernels
+   29  python -m pip install -v .
+   30  env
+   31  python -m pip install -v .
+   32  git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
+   33  cd ${STAGE_DIR}/DeepSpeed
+   34  export DEEPSPEED_VERSION="v0.14.3"
+   35  git checkout ${DEEPSPEED_VERSION}
+   36  ls
+   37  ./install.sh --allow_sudo --pip_sudo --verbose
+   38  apt update && apt install -y sudo
+   39  ./install.sh --allow_sudo --pip_sudo --verbose
+```
 
 ```shell
 nvidia-smi

From adb3a10bdbb1ee6c4cec564c042c3d06eb08ff5e Mon Sep 17 00:00:00 2001
From: lingyuzeng <pylyzeng@gmail.com>
Date: Wed, 17 Jul 2024 13:03:54 +0800
Subject: [PATCH 2/3] add +PTX to TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0+PTX"

---
 finetune/docker-compose_update.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finetune/docker-compose_update.yml b/finetune/docker-compose_update.yml
index 252bb96..cc24ed4 100644
--- a/finetune/docker-compose_update.yml
+++ b/finetune/docker-compose_update.yml
@@ -34,7 +34,7 @@ services:
         USE_XPU: 0
         CUDA: cu121
         CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
-        TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0" # all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0"
+        TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0+PTX" # +PTX also embeds PTX for 9.0 so GPUs newer than the listed archs can JIT-compile the kernels; all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0"
         SETUPTOOLS_VERSION: "69.5.1"
         DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 
         DEEPSPEED_VERSION: "master"

From f55f3daba09fb127c8bef97402f06e3bc8fa6aca Mon Sep 17 00:00:00 2001
From: lingyuzeng <pylyzeng@gmail.com>
Date: Wed, 17 Jul 2024 13:04:38 +0800
Subject: [PATCH 3/3] add colossalai Dockerfile

---
 finetune/Dockfile-colosial | 46 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 finetune/Dockfile-colosial

diff --git a/finetune/Dockfile-colosial b/finetune/Dockfile-colosial
new file mode 100644
index 0000000..0d28277
--- /dev/null
+++ b/finetune/Dockfile-colosial
@@ -0,0 +1,46 @@
+FROM hpcaitech/cuda-conda:12.1
+
+# OCI image metadata. Keep key=value free of spaces: with `key = "v"` Docker falls back to the legacy `LABEL key value` form and stores `= "v"` as the value.
+LABEL org.opencontainers.image.source="https://github.com/hpcaitech/ColossalAI"
+LABEL org.opencontainers.image.licenses="Apache-2.0"
+LABEL org.opencontainers.image.base.name="docker.io/hpcaitech/cuda-conda:12.1"
+
+# enable passwordless ssh: the key pair is generated at build time (so it is baked into the image and shared by every container) and host-key checking is turned off
+RUN mkdir -p ~/.ssh && chmod 700 ~/.ssh && \
+    printf "Host * \n    ForwardAgent yes\nHost *\n    StrictHostKeyChecking no" > ~/.ssh/config && \
+    ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \
+    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
+
+# RDMA support: install the InfiniBand/verbs packages listed below, then drop the apt caches and package lists in the same layer
+RUN apt-get update \
+    && apt-get install -y \
+        ibverbs-providers infiniband-diags libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 perftest \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# install python 3.10 and the pinned torch/torchvision/torchaudio builds for CUDA 12.1; the conda package cache is purged in the same layer so it does not bloat the image
+RUN conda install -y python==3.10 && conda install -y pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia && conda clean --all --yes
+
+# ninja-build from apt (without recommended extras); the apt cache and package lists are removed in the same layer
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends ninja-build \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# build NVIDIA apex at commit a7de60 with the --cpp_ext and --cuda_ext build options (packaging is installed first, presumably as a build-time requirement)
+RUN git clone https://github.com/NVIDIA/apex \
+    && cd apex \
+    && git checkout a7de60 \
+    && pip install packaging \
+    && pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
+
+# build ColossalAI from the ref given by --build-arg VERSION (default: main) with BUILD_EXT=1; the final rm runs inside the checkout, so it only removes ./ColossalAI/colossalai (NOTE(review): confirm that is intended)
+ARG VERSION=main
+RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git && \
+    cd ./ColossalAI && \
+    BUILD_EXT=1 pip install -v . && \
+    rm -rf colossalai
+
+# install tensornvme from git; cmake and libaio-dev are installed first (presumably build prerequisites), using apt-get and purging the apt lists in the same layer like the other apt steps
+RUN conda install -y cmake && \
+    apt-get update && apt-get install -y libaio-dev && apt-get clean && rm -rf /var/lib/apt/lists/* && \
+    pip install -v git+https://github.com/hpcaitech/TensorNVMe.git