This commit is contained in:
Your Name
2024-07-18 03:08:43 +00:00
3 changed files with 94 additions and 1 deletions

View File

@@ -0,0 +1,46 @@
FROM hpcaitech/cuda-conda:12.1
# Image metadata, using the OCI pre-defined annotation keys.
# Each label is written as key="value" with no spaces around '='. With spaces,
# Docker falls back to the legacy `LABEL key value` form and the stored value
# becomes the literal text `= "..."`.
LABEL org.opencontainers.image.source="https://github.com/hpcaitech/ColossalAI"
# The licenses annotation is expected to be an SPDX licence expression.
LABEL org.opencontainers.image.licenses="Apache-2.0"
# Fully-qualified reference of the base image named in FROM above.
LABEL org.opencontainers.image.base.name="docker.io/hpcaitech/cuda-conda:12.1"
# Enable passwordless SSH for the build user: the client config turns on agent
# forwarding and turns off host-key checking for every host, and a key pair
# generated at build time is appended to authorized_keys.
# `mkdir -p` keeps the step from failing when ~/.ssh already exists in the base
# image; the chmod calls set the permissions sshd's StrictModes check expects.
# NOTE(review): the private key is stored in an image layer, so every container
# started from this image shares it - treat the image as private.
RUN mkdir -p ~/.ssh && \
    chmod 700 ~/.ssh && \
    printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config && \
    ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
    chmod 600 ~/.ssh/authorized_keys
# Enable RDMA support: install the user-space InfiniBand / RDMA libraries and
# diagnostic tools. --no-install-recommends limits the layer to the listed
# packages and their hard dependencies; the apt cache and package lists are
# removed in the same layer so they do not add to the image size.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ibverbs-providers \
        infiniband-diags \
        libibumad3 \
        libibverbs1 \
        libnl-3-200 \
        libnl-route-3-200 \
        librdmacm1 \
        perftest && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Install Python 3.10, then PyTorch 2.1.2 (with torchvision 0.16.2 and
# torchaudio 2.1.2) built for CUDA 12.1 from the pytorch and nvidia channels.
# `conda clean -ay` runs in the same layer so the index cache, downloaded
# tarballs and unused packages are not kept in the image.
RUN conda install -y python==3.10 && \
    conda install -y pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=12.1 -c pytorch -c nvidia && \
    conda clean -ay
# Ninja build tool (Debian/Ubuntu package `ninja-build`), installed without
# recommended extras; the apt cache and package lists are dropped afterwards.
RUN apt-get update \
    && apt-get install --yes --no-install-recommends ninja-build \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Build and install NVIDIA Apex at commit a7de60 with the --cpp_ext and
# --cuda_ext build options. --no-build-isolation makes pip build against the
# packages already installed in the environment, so `packaging` is installed
# first.
# The clone is deleted in the same layer once the (non-editable) install has
# finished, so the source tree and build artefacts do not stay in the image.
RUN git clone https://github.com/NVIDIA/apex && \
    cd apex && \
    git checkout a7de60 && \
    pip install --no-cache-dir packaging && \
    pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \
    cd .. && \
    rm -rf apex
# Install ColossalAI from source.
# VERSION is the branch or tag to clone (default: main); override it with
# `--build-arg VERSION=<ref>`. The expansion is quoted so the shell cannot
# word-split or glob an unusual value.
# BUILD_EXT=1 is set in the environment of the pip build - presumably to
# compile the project's extensions at install time; confirm against its
# setup.py.
# NOTE(review): the final `rm -rf colossalai` runs inside ./ColossalAI, so it
# removes only the `colossalai` sub-directory of the clone; the rest of the
# checkout stays in the image. Confirm that this is the intended clean-up.
ARG VERSION=main
RUN git clone -b "${VERSION}" https://github.com/hpcaitech/ColossalAI.git \
    && cd ./ColossalAI \
    && BUILD_EXT=1 pip install -v --no-cache-dir . \
    && rm -rf colossalai
# Install TensorNVMe from its Git repository. cmake (via conda) and libaio-dev
# (via apt) are installed first - presumably as build requirements; confirm
# against the TensorNVMe README.
# apt-get is used instead of apt, whose command-line interface is not meant
# for scripts, and the conda, apt and pip caches are cleared in the same layer
# so they do not add to the image size.
RUN conda install -y cmake && \
    conda clean -ay && \
    apt-get update && \
    apt-get install -y --no-install-recommends libaio-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    pip install -v --no-cache-dir git+https://github.com/hpcaitech/TensorNVMe.git

View File

@@ -59,7 +59,54 @@ sudo systemctl status nvidia-fabricmanager
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all ldh/deepspeed:test
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/deepspeed:pt23_update
docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/pytorch:24.06-py3 /bin/bash
pip3 install -U xformers --index-url https://mirror.sjtu.edu.cn/pytorch-wheels
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install -U xformers --index-url https://pypi.tuna.tsinghua.edu.cn/simple
```
```shell
1 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
2 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers
3 curl -ksSL http://120.232.240.71:8887/linux/install.sh | bash
4 pigchacli
5 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777
6 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777
7 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers
8 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
9 python -c "from xformers import ops as xops"
10 python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
11 env
12 pip install git+https://github.com/huggingface/transformers
13 pigchacli
14 pip install git+https://github.com/huggingface/transformers
15 pip list
16 export STAGE_DIR=/tmp
17 git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
18 cd ${STAGE_DIR}/oneCCL
19 git checkout .
20 git checkout master
21 mkdir build
22 cd build
23 cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
24 make -j"$(nproc)" install
25 ls
26 echo ${CUDA_ARCH_LIST}
27 git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
28 cd ${STAGE_DIR}/DeepSpeed-Kernels
29 python -m pip install -v .
30 env
31 python -m pip install -v .
32 git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
33 cd ${STAGE_DIR}/DeepSpeed
34 export DEEPSPEED_VERSION="v0.14.3"
35 git checkout ${DEEPSPEED_VERSION}
36 ls
37 ./install.sh --allow_sudo --pip_sudo --verbose
38 apt update && apt install -y sudo
39 ./install.sh --allow_sudo --pip_sudo --verbose
```
```shell
nvidia-smi

View File

@@ -34,7 +34,7 @@ services:
USE_XPU: 0
CUDA: cu121
CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0" # all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0"
TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0+PTX" # all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0"
SETUPTOOLS_VERSION: "69.5.1"
DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090
DEEPSPEED_VERSION: "master"