From 2e67dfea5054878fd8fe8e66726efe3d667b75a5 Mon Sep 17 00:00:00 2001 From: lingyuzeng Date: Wed, 17 Jul 2024 13:03:27 +0800 Subject: [PATCH 1/3] add install nvida drive command --- finetune/README.md | 47 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/finetune/README.md b/finetune/README.md index c83161f..9a57dcf 100644 --- a/finetune/README.md +++ b/finetune/README.md @@ -59,7 +59,54 @@ sudo systemctl status nvidia-fabricmanager docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all ldh/deepspeed:test docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/deepspeed:pt23_update +docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/pytorch:24.06-py3 /bin/bash +pip3 install -U xformers --index-url https://mirror.sjtu.edu.cn/pytorch-wheels +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +pip3 install -U xformers --index-url https://pypi.tuna.tsinghua.edu.cn/simple + + +```shell + 1 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers + 2 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers + 3 curl -ksSL http://120.232.240.71:8887/linux/install.sh | bash + 4 pigchacli + 5 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777 + 6 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777 + 7 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers + 8 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers + 9 python -c "from xformers import ops as xops" + 10 python -c "import apex.amp; print('Apex is installed and the amp module is available.')" + 11 env + 12 pip install git+https://github.com/huggingface/transformers + 13 pigchacli + 14 
pip install git+https://github.com/huggingface/transformers + 15 pip list + 16 export STAGE_DIR=/tmp + 17 git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL + 18 cd ${STAGE_DIR}/oneCCL + 19 git checkout . + 20 git checkout master + 21 mkdir build + 22 cd build + 23 cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local + 24 make -j"$(nproc)" install + 25 ls + 26 echo ${CUDA_ARCH_LIST} + 27 git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels + 28 cd ${STAGE_DIR}/DeepSpeed-Kernels + 29 python -m pip install -v . + 30 env + 31 python -m pip install -v . + 32 git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed + 33 cd ${STAGE_DIR}/DeepSpeed + 34 export DEEPSPEED_VERSION="v0.14.3" + 35 git checkout ${DEEPSPEED_VERSION} + 36 ls + 37 ./install.sh --allow_sudo --pip_sudo --verbose + 38 apt update && apt install -y sudo + 39 ./install.sh --allow_sudo --pip_sudo --verbose +``` ```shell nvidia-smi From adb3a10bdbb1ee6c4cec564c042c3d06eb08ff5e Mon Sep 17 00:00:00 2001 From: lingyuzeng Date: Wed, 17 Jul 2024 13:03:54 +0800 Subject: [PATCH 2/3] TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0+PTX" add --- finetune/docker-compose_update.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finetune/docker-compose_update.yml b/finetune/docker-compose_update.yml index 252bb96..cc24ed4 100644 --- a/finetune/docker-compose_update.yml +++ b/finetune/docker-compose_update.yml @@ -34,7 +34,7 @@ services: USE_XPU: 0 CUDA: cu121 CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" - TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0" # all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0" + TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0+PTX" # all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0" SETUPTOOLS_VERSION: "69.5.1" DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 DEEPSPEED_VERSION: "master" From f55f3daba09fb127c8bef97402f06e3bc8fa6aca Mon Sep 17 00:00:00 2001 From: lingyuzeng Date: Wed, 17 Jul 2024 
13:04:38 +0800
Subject: [PATCH 3/3] add colossalai Dockerfile

---
 finetune/Dockfile-colosial | 69 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 finetune/Dockfile-colosial

diff --git a/finetune/Dockfile-colosial b/finetune/Dockfile-colosial
new file mode 100644
index 0000000..0d28277
--- /dev/null
+++ b/finetune/Dockfile-colosial
@@ -0,0 +1,69 @@
+FROM hpcaitech/cuda-conda:12.1
+
+# Image metadata (OCI annotations). LABEL must use key="value" with no spaces
+# around "=": with spaces Docker falls back to the legacy "LABEL key value"
+# form and stores the literal text `= "..."` as the label value.
+LABEL org.opencontainers.image.source="https://github.com/hpcaitech/ColossalAI"
+# SPDX licence identifier, as the OCI annotation spec asks for.
+LABEL org.opencontainers.image.licenses="Apache-2.0"
+# The base image lives in the hpcaitech namespace, not Docker Hub's "library/".
+LABEL org.opencontainers.image.base.name="docker.io/hpcaitech/cuda-conda:12.1"
+
+# Enable passwordless SSH: turn on agent forwarding, disable host-key
+# checking, and authorise a freshly generated key for this same account.
+# `mkdir -p` keeps the step from failing if the base image already has ~/.ssh.
+# NOTE(review): the private key is created at build time and stored in an image
+# layer, so everyone who can pull the image shares it -- confirm this is only
+# used on a trusted cluster network.
+RUN mkdir -p ~/.ssh && \
+    printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config && \
+    ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \
+    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
+
+# RDMA / InfiniBand user-space libraries and diagnostic tools. The apt lists
+# are deleted in the same layer so they are not kept in the image.
+RUN apt-get update && \
+    apt-get install -y infiniband-diags perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Python 3.10, then PyTorch 2.1.2 with its CUDA 12.1 runtime from the pytorch
+# and nvidia conda channels.
+RUN conda install -y python==3.10 && conda install -y pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia
+
+# ninja build tool (presumably used by the extension builds below -- confirm).
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends ninja-build && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Build NVIDIA apex at a pinned commit with its C++ and CUDA extensions.
+# --no-build-isolation makes pip build against the packages already installed
+# in the environment instead of a fresh isolated one.
+RUN git clone https://github.com/NVIDIA/apex && \
+    cd apex && \
+    git checkout a7de60 && \
+    pip install packaging && \
+    pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
+
+# Build ColossalAI from source. VERSION is the git branch or tag to clone;
+# override it with `--build-arg VERSION=<ref>`.
+# BUILD_EXT=1 is exported to the pip build (presumably to build ColossalAI's
+# extensions -- confirm). The final `rm -rf colossalai` deletes the
+# `colossalai` directory inside the checkout after installation.
+ARG VERSION=main
+RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \
+    && cd ./ColossalAI \
+    && BUILD_EXT=1 pip install -v . \
+    && rm -rf colossalai
+
+# Install TensorNVMe from source; cmake and libaio-dev are installed first
+# (presumably its build requirements -- confirm). Uses apt-get rather than
+# apt (apt's CLI is not meant for scripts) and removes the apt lists in the
+# same layer, matching the other apt steps in this file.
+RUN conda install -y cmake && \
+    apt-get update && \
+    apt-get install -y libaio-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip install -v git+https://github.com/hpcaitech/TensorNVMe.git