Merge branch 'main' of https://wtrr1.jmsu.top:8543/lingyuzeng/cdc_dockerfile
This commit is contained in:
46
finetune/Dockfile-colosial
Normal file
46
finetune/Dockfile-colosial
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
# Base image: HPC-AI Tech CUDA 12.1 + conda environment (explicitly tagged, not :latest)
FROM hpcaitech/cuda-conda:12.1
|
||||||
|
|
||||||
|
# metainformation (OCI image labels)
# NOTE: the LABEL instruction requires key=value with no surrounding spaces;
# the legacy spaced form (`key = "value"`) is mis-parsed into bogus label keys.
LABEL org.opencontainers.image.source="https://github.com/hpcaitech/ColossalAI" \
      org.opencontainers.image.licenses="Apache License 2.0" \
      org.opencontainers.image.base.name="docker.io/library/hpcaitech/cuda-conda:12.1"
|
||||||
|
|
||||||
|
# enable passwordless ssh (required by multi-node launchers that ssh between workers)
# -p makes mkdir idempotent; sshd refuses keys when ~/.ssh or authorized_keys is
# group/world-readable, so tighten permissions in the same layer.
RUN mkdir -p ~/.ssh && \
    chmod 700 ~/.ssh && \
    printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config && \
    ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
    chmod 600 ~/.ssh/authorized_keys
|
||||||
|
|
||||||
|
# enable RDMA support: InfiniBand userspace verbs libraries + diagnostics/benchmark tools
# --no-install-recommends keeps the layer minimal; package list sorted for diffability;
# apt lists removed in the same layer so they never persist in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ibverbs-providers \
        infiniband-diags \
        libibumad3 \
        libibverbs1 \
        libnl-3-200 \
        libnl-route-3-200 \
        librdmacm1 \
        perftest && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# install torch: Python 3.10 plus the PyTorch 2.1.2 stack built against CUDA 12.1,
# pulled from the official pytorch and nvidia conda channels (versions fully pinned)
RUN conda install -y python==3.10 && \
    conda install -y \
        pytorch==2.1.2 \
        torchvision==0.16.2 \
        torchaudio==2.1.2 \
        pytorch-cuda=12.1 \
        -c pytorch -c nvidia
|
||||||
|
|
||||||
|
# install ninja: build backend used when compiling the CUDA/C++ extensions below
RUN apt-get update \
    && apt-get install -y --no-install-recommends ninja-build \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# install apex (NVIDIA fused kernels / mixed precision), pinned to commit a7de60
# to keep the build reproducible. The clone is removed in the same layer: the
# original left the full source tree (including compiled objects) baked into the image.
RUN git clone https://github.com/NVIDIA/apex && \
    cd apex && \
    git checkout a7de60 && \
    pip install packaging && \
    pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
        --config-settings "--build-option=--cpp_ext" \
        --config-settings "--build-option=--cuda_ext" ./ && \
    cd .. && \
    rm -rf apex
|
||||||
|
|
||||||
|
# install colossalai from source at the given branch/tag, building C++/CUDA extensions.
# BUG FIX: the original ran `rm -rf colossalai` while still inside ./ColossalAI, which
# deleted only the package *source subdirectory* (wrong path, wrong case) and left the
# entire clone in the layer. Return to the parent directory and remove the clone itself.
ARG VERSION=main
RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \
    && cd ./ColossalAI \
    && BUILD_EXT=1 pip install -v --no-cache-dir . \
    && cd .. \
    && rm -rf ColossalAI
|
||||||
|
|
||||||
|
# install tensornvme (NVMe offloading backend): needs cmake and the libaio headers
# to compile. Use apt-get (not apt, whose CLI is not stable for scripts — DL3027),
# skip recommended packages, and clean the apt lists in the same layer.
RUN conda install -y cmake && \
    apt-get update && \
    apt-get install -y --no-install-recommends libaio-dev && \
    pip install -v --no-cache-dir git+https://github.com/hpcaitech/TensorNVMe.git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
|
||||||
@@ -59,7 +59,54 @@ sudo systemctl status nvidia-fabricmanager
|
|||||||
|
|
||||||
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all ldh/deepspeed:test
|
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all ldh/deepspeed:test
|
||||||
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/deepspeed:pt23_update
|
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/deepspeed:pt23_update
|
||||||
|
docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/pytorch:24.06-py3 /bin/bash
|
||||||
|
|
||||||
|
pip3 install -U xformers --index-url https://mirror.sjtu.edu.cn/pytorch-wheels
|
||||||
|
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||||
|
pip3 install -U xformers --index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||||
|
|
||||||
|
|
||||||
|
```shell
|
||||||
|
1 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
|
||||||
|
2 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers
|
||||||
|
3 curl -ksSL http://120.232.240.71:8887/linux/install.sh | bash
|
||||||
|
4 pigchacli
|
||||||
|
5 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777
|
||||||
|
6 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777
|
||||||
|
7 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers
|
||||||
|
8 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
|
||||||
|
9 python -c "from xformers import ops as xops"
|
||||||
|
10 python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
|
||||||
|
11 env
|
||||||
|
12 pip install git+https://github.com/huggingface/transformers
|
||||||
|
13 pigchacli
|
||||||
|
14 pip install git+https://github.com/huggingface/transformers
|
||||||
|
15 pip list
|
||||||
|
16 export STAGE_DIR=/tmp
|
||||||
|
17 git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
|
||||||
|
18 cd ${STAGE_DIR}/oneCCL
|
||||||
|
19 git checkout .
|
||||||
|
20 git checkout master
|
||||||
|
21 mkdir build
|
||||||
|
22 cd build
|
||||||
|
23 cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
|
||||||
|
24 make -j"$(nproc)" install
|
||||||
|
25 ls
|
||||||
|
26 echo ${CUDA_ARCH_LIST}
|
||||||
|
27 git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
|
||||||
|
28 cd ${STAGE_DIR}/DeepSpeed-Kernels
|
||||||
|
29 python -m pip install -v .
|
||||||
|
30 env
|
||||||
|
31 python -m pip install -v .
|
||||||
|
32 git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
|
||||||
|
33 cd ${STAGE_DIR}/DeepSpeed
|
||||||
|
34 export DEEPSPEED_VERSION="v0.14.3"
|
||||||
|
35 git checkout ${DEEPSPEED_VERSION}
|
||||||
|
36 ls
|
||||||
|
37 ./install.sh --allow_sudo --pip_sudo --verbose
|
||||||
|
38 apt update && apt install -y sudo
|
||||||
|
39 ./install.sh --allow_sudo --pip_sudo --verbose
|
||||||
|
```
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
nvidia-smi
|
nvidia-smi
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ services:
|
|||||||
USE_XPU: 0
|
USE_XPU: 0
|
||||||
CUDA: cu121
|
CUDA: cu121
|
||||||
CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
|
CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
|
||||||
TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0" # all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0"
|
TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0+PTX" # all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0"
|
||||||
SETUPTOOLS_VERSION: "69.5.1"
|
SETUPTOOLS_VERSION: "69.5.1"
|
||||||
DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090
|
DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090
|
||||||
DEEPSPEED_VERSION: "master"
|
DEEPSPEED_VERSION: "master"
|
||||||
|
|||||||
Reference in New Issue
Block a user