Merge branch 'main' of https://wtrr1.jmsu.top:8543/lingyuzeng/cdc_dockerfile
This commit is contained in:
46
finetune/Dockfile-colosial
Normal file
46
finetune/Dockfile-colosial
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
# Base image: HPC-AI Tech CUDA 12.1 + conda environment (explicitly tagged, not :latest)
FROM hpcaitech/cuda-conda:12.1
|
||||||
|
|
||||||
|
# metainformation (OCI image labels)
# NOTE: the LABEL instruction requires key=value with no surrounding spaces;
# the legacy spaced form (`key = "value"`) is mis-parsed into bogus label keys.
LABEL org.opencontainers.image.source="https://github.com/hpcaitech/ColossalAI" \
      org.opencontainers.image.licenses="Apache License 2.0" \
      org.opencontainers.image.base.name="docker.io/library/hpcaitech/cuda-conda:12.1"
|
||||||
|
|
||||||
|
# enable passwordless ssh (required by multi-node launchers that ssh between workers)
# -p makes mkdir idempotent; sshd refuses keys when ~/.ssh or authorized_keys is
# group/world-readable, so tighten permissions in the same layer.
RUN mkdir -p ~/.ssh && \
    chmod 700 ~/.ssh && \
    printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config && \
    ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
    chmod 600 ~/.ssh/authorized_keys
|
||||||
|
|
||||||
|
# enable RDMA support: InfiniBand userspace verbs libraries + diagnostics/benchmark tools
# --no-install-recommends keeps the layer minimal; package list sorted for diffability;
# apt lists removed in the same layer so they never persist in the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ibverbs-providers \
        infiniband-diags \
        libibumad3 \
        libibverbs1 \
        libnl-3-200 \
        libnl-route-3-200 \
        librdmacm1 \
        perftest && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# install torch: Python 3.10 plus the PyTorch 2.1.2 stack built against CUDA 12.1,
# pulled from the official pytorch and nvidia conda channels (versions fully pinned)
RUN conda install -y python==3.10 && \
    conda install -y \
        pytorch==2.1.2 \
        torchvision==0.16.2 \
        torchaudio==2.1.2 \
        pytorch-cuda=12.1 \
        -c pytorch -c nvidia
|
||||||
|
|
||||||
|
# install ninja: build backend used when compiling the CUDA/C++ extensions below
RUN apt-get update \
    && apt-get install -y --no-install-recommends ninja-build \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# install apex (NVIDIA fused kernels / mixed precision), pinned to commit a7de60
# to keep the build reproducible. The clone is removed in the same layer: the
# original left the full source tree (including compiled objects) baked into the image.
RUN git clone https://github.com/NVIDIA/apex && \
    cd apex && \
    git checkout a7de60 && \
    pip install packaging && \
    pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
        --config-settings "--build-option=--cpp_ext" \
        --config-settings "--build-option=--cuda_ext" ./ && \
    cd .. && \
    rm -rf apex
|
||||||
|
|
||||||
|
# install colossalai from source at the given branch/tag, building C++/CUDA extensions.
# BUG FIX: the original ran `rm -rf colossalai` while still inside ./ColossalAI, which
# deleted only the package *source subdirectory* (wrong path, wrong case) and left the
# entire clone in the layer. Return to the parent directory and remove the clone itself.
ARG VERSION=main
RUN git clone -b ${VERSION} https://github.com/hpcaitech/ColossalAI.git \
    && cd ./ColossalAI \
    && BUILD_EXT=1 pip install -v --no-cache-dir . \
    && cd .. \
    && rm -rf ColossalAI
|
||||||
|
|
||||||
|
# install tensornvme (NVMe offloading backend): needs cmake and the libaio headers
# to compile. Use apt-get (not apt, whose CLI is not stable for scripts — DL3027),
# skip recommended packages, and clean the apt lists in the same layer.
RUN conda install -y cmake && \
    apt-get update && \
    apt-get install -y --no-install-recommends libaio-dev && \
    pip install -v --no-cache-dir git+https://github.com/hpcaitech/TensorNVMe.git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
|
||||||
@@ -59,7 +59,54 @@ sudo systemctl status nvidia-fabricmanager
|
|||||||
|
|
||||||
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all ldh/deepspeed:test
|
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all ldh/deepspeed:test
|
||||||
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/deepspeed:pt23_update
|
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/deepspeed:pt23_update
|
||||||
|
docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/pytorch:24.06-py3 /bin/bash
|
||||||
|
|
||||||
|
pip3 install -U xformers --index-url https://mirror.sjtu.edu.cn/pytorch-wheels
|
||||||
|
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||||
|
pip3 install -U xformers --index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||||
|
|
||||||
|
|
||||||
|
```shell
|
||||||
|
1 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
|
||||||
|
2 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers
|
||||||
|
3 curl -ksSL http://120.232.240.71:8887/linux/install.sh | bash
|
||||||
|
4 pigchacli
|
||||||
|
5 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777
|
||||||
|
6 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777
|
||||||
|
7 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers
|
||||||
|
8 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
|
||||||
|
9 python -c "from xformers import ops as xops"
|
||||||
|
10 python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
|
||||||
|
11 env
|
||||||
|
12 pip install git+https://github.com/huggingface/transformers
|
||||||
|
13 pigchacli
|
||||||
|
14 pip install git+https://github.com/huggingface/transformers
|
||||||
|
15 pip list
|
||||||
|
16 export STAGE_DIR=/tmp
|
||||||
|
17 git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
|
||||||
|
18 cd ${STAGE_DIR}/oneCCL
|
||||||
|
19 git checkout .
|
||||||
|
20 git checkout master
|
||||||
|
21 mkdir build
|
||||||
|
22 cd build
|
||||||
|
23 cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
|
||||||
|
24 make -j"$(nproc)" install
|
||||||
|
25 ls
|
||||||
|
26 echo ${CUDA_ARCH_LIST}
|
||||||
|
27 git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
|
||||||
|
28 cd ${STAGE_DIR}/DeepSpeed-Kernels
|
||||||
|
29 python -m pip install -v .
|
||||||
|
30 env
|
||||||
|
31 python -m pip install -v .
|
||||||
|
32 git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
|
||||||
|
33 cd ${STAGE_DIR}/DeepSpeed
|
||||||
|
34 export DEEPSPEED_VERSION="v0.14.3"
|
||||||
|
35 git checkout ${DEEPSPEED_VERSION}
|
||||||
|
36 ls
|
||||||
|
37 ./install.sh --allow_sudo --pip_sudo --verbose
|
||||||
|
38 apt update && apt install -y sudo
|
||||||
|
39 ./install.sh --allow_sudo --pip_sudo --verbose
|
||||||
|
```
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
nvidia-smi
|
nvidia-smi
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ services:
|
|||||||
USE_XPU: 0
|
USE_XPU: 0
|
||||||
CUDA: cu121
|
CUDA: cu121
|
||||||
CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
|
CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
|
||||||
TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0" # all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0"
|
TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0+PTX" # all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0"
|
||||||
SETUPTOOLS_VERSION: "69.5.1"
|
SETUPTOOLS_VERSION: "69.5.1"
|
||||||
DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090
|
DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090
|
||||||
DEEPSPEED_VERSION: "master"
|
DEEPSPEED_VERSION: "master"
|
||||||
|
|||||||
Reference in New Issue
Block a user