This commit is contained in:
2024-06-20 16:17:56 +08:00
parent 2be8ab5de0
commit 8dc78b7faa
5 changed files with 382 additions and 108 deletions

View File

@@ -1,8 +1,9 @@
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
ARG CUDA_VERSION=12.1.0
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
ENV MAMBA_ROOT_PREFIX=~/micromamba
ARG CONDA_ENV_NAME="ldh"
ARG CONDA_ENV_NAME="deepspeed"
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
ARG PYTHON_VERSION=3.10
ENV PYTHON_VERSION=${PYTHON_VERSION}
@@ -15,7 +16,7 @@ SHELL ["/bin/bash", "-c"]
RUN <<EOT
#!/bin/bash
apt-get update
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
add-apt-repository ppa:git-core/ppa -y
apt-get install -y git libnuma-dev wget
@@ -38,9 +39,6 @@ wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash /tmp/miniconda.sh -b -p /opt/conda
rm /tmp/miniconda.sh
conda init bash
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} pyyaml ipython -y
conda run -n ${CONDA_ENV_NAME} python -m pip install open_clip_torch vidia-ml-py3 opencv-contrib-python
conda clean -afy
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
@@ -76,10 +74,29 @@ show_channel_urls: true
EOF
EOT
# Install ninja and run its tests
# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
# PyTorch
# Git ref of huggingface/transformers to build from, and scratch dir for all source builds.
ENV REF='main'
ENV STAGE_DIR=/tmp
# nv_peer_mem (GPUDirect RDMA) release/tag to build below.
ENV NV_PEER_MEM_VERSION=1.2
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
# OpenMPI release to build from source below.
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
# PyTorch wheel index selector (cu121 = CUDA 12.1 builds) and pinned torch stack.
ARG CUDA='cu121'
ENV CUDA=${CUDA}
ARG PYTORCH_VERSION=2.3.0
ENV PYTORCH_VERSION=${PYTORCH_VERSION}
ARG TORCHVISION_VERSION=0.18.0
ENV TORCHVISION_VERSION=${TORCHVISION_VERSION}
ARG TORCHAUDIO_VERSION=2.3.0
ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION}
ARG PYTORCH_CUDA_VERSION=12.1
ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION}
# Mellanox OFED release used for the RDMA user-space install.
ENV MLNX_OFED_VERSION=4.9-7.1.0.0
# NOTE(review): setuptools is pinned, presumably for the setup.py-based source
# builds further down -- confirm before bumping.
ARG SETUPTOOLS_VERSION=69.5.1
ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
RUN <<EOT
#!/bin/bash
# 安装 ninja
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# 克隆 ninja 源码并编译
@@ -89,116 +106,270 @@ cd ninja
git clone https://github.com/google/googletest.git
conda run -n ${CONDA_ENV_NAME} python ./configure.py --bootstrap
# 配置并构建 Ninja 测试,添加 pthread 链接选项
CXXFLAGS="-pthread" LDFLAGS="-pthread" ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest
# CXXFLAGS="-pthread" LDFLAGS="-pthread" ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest
conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest"
./ninja all
# 运行 Ninja 单元测试
./ninja_test
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} pyyaml ipython -c conda-forge -y
python3 -m pip install --no-cache-dir --upgrade pip
conda run -n ${CONDA_ENV_NAME} python -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python
conda clean -afy
git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd ..
conda run -n ${CONDA_ENV_NAME} python -m pip install setuptools==${SETUPTOOLS_VERSION}
conda run -n ${CONDA_ENV_NAME} python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y torch torchvision torchaudio
# # install pytorch create conda env aleay exists
conda run -n ${CONDA_ENV_NAME} python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA}
conda run -n ${CONDA_ENV_NAME} python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y transformer-engine
conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y torch-tensorrt
conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y apex
EOT
# # deepspeed
# ENV STAGE_DIR=/tmp
# RUN <<EOT
# #!/bin/bash
# mkdir -p ${STAGE_DIR}
# echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config
# cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config
# sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
# EOT
# # Mellanox OFED
# WORKDIR ${STAGE_DIR}
# ENV MLNX_OFED_VERSION=4.9-7.1.0.0
# RUN wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - && \
# cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \
# ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
# cd ${STAGE_DIR} && \
# rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
# # nv_peer_mem
# ENV NV_PEER_MEM_VERSION=1.2
# ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
# RUN mkdir -p ${STAGE_DIR} && \
# git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
# cd ${STAGE_DIR}/nv_peer_memory && \
# ./build_module.sh && \
# cd ${STAGE_DIR} && \
# tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
# cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
# apt-get update && \
# apt-get install -y dkms && \
# dpkg-buildpackage -us -uc && \
# dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
# # OPENMPI
# ENV OPENMPI_BASEVERSION=4.1
# ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
# RUN <<EOT
# #!/bin/bash
# cd ${STAGE_DIR}
# wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf -
# cd openmpi-${OPENMPI_VERSION}
# ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION}
# make -j"$(nproc)" install
# ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi
# # Sanity check:
# test -f /usr/local/mpi/bin/mpic++
# cd ${STAGE_DIR}
# rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
# EOT
# ENV PATH=/usr/local/mpi/bin:${PATH}
# ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
# # Create a wrapper for OpenMPI to allow running as root by default
# RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
# echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
# echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
# chmod a+x /usr/local/mpi/bin/mpirun
# # Some Packages
# RUN <<EOT
# apt-get update
# apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev
# source /opt/conda/etc/profile.d/conda.sh
# conda activate ${CONDA_ENV_NAME}
# conda install -y mpi4py
# python -m pip install psutil \
# yappi \
# cffi \
# ipdb \
# pandas \
# matplotlib \
# py3nvml \
# pyarrow \
# graphviz \
# astor \
# boto3 \
# tqdm \
# sentencepiece \
# msgpack \
# requests \
# pandas \
# sphinx \
# sphinx_rtd_theme \
# scipy \
# numpy \
# scikit-learn \
# nvidia-ml-py3
# EOT
# PyTorch
# NOTE(review): this re-declares PYTORCH_VERSION with default 2.2.1, shadowing
# the 2.3.0 default declared earlier in this file -- confirm which default is
# intended (a --build-arg value still overrides both declarations).
ARG PYTORCH_VERSION=2.2.1
ENV PYTORCH_VERSION=${PYTORCH_VERSION}
# install apex
# Installs the pinned torch/cu121 stack plus common training libraries, then
# builds NVIDIA Apex (fused-kernel / mixed-precision extensions) from source.
RUN <<EOT
#!/bin/bash
# Fail the image build on the first failing command instead of ignoring errors.
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# NOTE(review): torchvision is hard-coded to 0.17.1 (the torch 2.2.1 pairing)
# rather than using ${TORCHVISION_VERSION} -- confirm the intended stack.
pip install torch==${PYTORCH_VERSION} torchvision==0.17.1 torchaudio==${PYTORCH_VERSION} --index-url https://download.pytorch.org/whl/cu121
pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
pip install deepspeed bitsandbytes accelerate transformers optimum
pip install pydantic transformers datasets accelerate evaluate peft deepspeed tiktoken \
sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn diffusers \
huggingface_hub spacy Pillow blobfile requests scipy pycocotools protobuf timm \
pyyaml ipython xformers opencv-contrib-python open_clip_torch \
packaging psutil zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
# Smoke test: DeepSpeed must be importable before building Apex.
python -c "import deepspeed; print(deepspeed.__version__)"
git clone https://github.com/NVIDIA/apex ${STAGE_DIR}/apex
# BUG FIX: the clone lands in ${STAGE_DIR}/apex, not ./apex -- cd to the real path.
cd ${STAGE_DIR}/apex
# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# Verify the CUDA/amp bits actually installed.
python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
cd ${STAGE_DIR}
rm -rf ${STAGE_DIR}/apex
EOT
RUN <<EOT
#!/bin/bash
# Fail fast so a broken RDMA-stack install fails the image build.
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# Remove any previously installed DeepSpeed; it is rebuilt from source later.
# (Pre-building ops: https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
# (reference install: https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile)
python3 -m pip uninstall -y deepspeed
# --- Mellanox OFED user-space drivers (InfiniBand/RDMA for multi-node training) ---
mkdir -p ${STAGE_DIR}
# BUG FIX: extract inside ${STAGE_DIR}; previously the tarball unpacked into the
# current WORKDIR while the cleanup below removed ${STAGE_DIR}/MLNX_* (a no-op).
cd ${STAGE_DIR}
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf -
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64
./mlnxofedinstall --user-space-only --without-fw-update --all -q
cd ${STAGE_DIR}
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
# --- nv_peer_mem: GPUDirect RDMA (NIC <-> GPU memory) kernel-module packages ---
# Leave the directory before wiping it so the shell's cwd stays valid.
cd /
rm -rf ${STAGE_DIR}
mkdir -p ${STAGE_DIR}
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory
cd ${STAGE_DIR}/nv_peer_memory
./build_module.sh
cd ${STAGE_DIR}
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
apt-get update
apt-get install -y dkms
dpkg-buildpackage -us -uc
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
EOT
# install mpi
# Expose the (symlinked) OpenMPI install to subsequent layers and the runtime.
ENV PATH=/usr/local/mpi/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
RUN <<EOT
#!/bin/bash
# FIX: fail fast -- without set -e the "test -f" sanity check below could fail
# without ever failing the build.
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# OPENMPI: build from source into a versioned prefix, symlinked as /usr/local/mpi.
cd /
rm -rf ${STAGE_DIR}
mkdir -p ${STAGE_DIR}
cd ${STAGE_DIR}
wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf -
cd openmpi-${OPENMPI_VERSION}
./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION}
make -j"$(nproc)" install
ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi
# Sanity check: the C++ compiler wrapper must exist, otherwise abort the build.
test -f /usr/local/mpi/bin/mpic++
cd ${STAGE_DIR}
rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
# Create a wrapper for OpenMPI to allow running as root by default.
mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun
chmod a+x /usr/local/mpi/bin/mpirun
EOT
# Some Packages
RUN <<EOT
#!/bin/bash
# FIX: this heredoc was the only one without a shebang; add it plus fail-fast.
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# System libraries (audio, CUDA profiling, image codecs, async I/O) for the
# Python packages below.
apt-get update
apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev
# mpi4py from the development tarball so it compiles against the OpenMPI built above.
python -m pip install https://github.com/mpi4py/mpi4py/tarball/master
# General scientific / utility stack (FIX: duplicate "pandas" entry removed).
python -m pip install psutil \
yappi \
cffi \
ipdb \
pandas \
matplotlib \
py3nvml \
pyarrow \
graphviz \
astor \
boto3 \
tqdm \
sentencepiece \
msgpack \
requests \
sphinx \
sphinx_rtd_theme \
scipy \
numpy \
scikit-learn \
nvidia-ml-py3
EOT
# SSH daemon port inside container cannot conflict with host OS port
ENV SSH_PORT=2222
RUN <<EOT
#!/bin/bash
set -e
# Rewrite the first Port directive so sshd listens on ${SSH_PORT}.
# BUG FIX: stock Ubuntu ships the directive commented out ("#Port 22"); the old
# pattern "^Port 22" never matched, so sshd kept listening on 22. Match the
# optional leading "#" and replace the whole directive.
cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config
sed -E "0,/^#?Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
EOT
# 29.78 Usage: install.sh [options...]
# 29.78
# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in
# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally
# 29.78
# 29.78 [optional]
# 29.78 -l, --local_only Install only on local machine
# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo)
# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo)
# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels
# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror)
# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile)
# 29.78 -e, --examples Checkout deepspeed example submodule (no install)
# 29.78 -v, --verbose Verbose logging
# 29.78 -h, --help This help text
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# Create the unprivileged "deepspeed" account (UID 1000, bash login shell),
# already a member of the sudo group, and grant it passwordless sudo.
useradd --create-home --uid 1000 --shell /bin/bash --groups sudo deepspeed
echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
EOT
# install cutlass https://github.com/NVIDIA/cutlass
# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" )
# A100: architecture is Ampere
# V100: architecture is Volta
# T4: architecture is Turing
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
# SM capability -> GPU mapping for CUTLASS_NVCC_ARCHS:
# 70:  NVIDIA Volta architecture (e.g. Tesla V100)
# 75:  NVIDIA Turing architecture (e.g. Tesla T4)
# 80:  NVIDIA Ampere architecture (e.g. A100)
# 90a: NVIDIA Hopper architecture (e.g. H100)
# 89:  GeForce RTX 4090 (Ada Lovelace)
ARG DCUTLASS_NVCC_ARCHS="89"
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
RUN <<EOT
#!/bin/bash
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# Build and install CUTLASS for the architectures in ${DCUTLASS_NVCC_ARCHS}.
git clone https://github.com/NVIDIA/cutlass /opt/cutlass
cd /opt/cutlass
# BUG FIX: removed "git checkout ." / "git checkout master" -- a fresh clone is
# already on the default branch, and NVIDIA/cutlass's default branch is "main",
# so "checkout master" failed (silently, before set -e) on every build.
mkdir build
cd build
# Unity build + tests off keeps compile time manageable for a single target arch.
cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON
make -j"$(nproc)" install
# Unit tests intentionally skipped (CUTLASS_ENABLE_TESTS=OFF):
# make test_unit -j"$(nproc)"
# make test_unit_gemm_warp -j"$(nproc)"
EOT
# Build-time knobs for the DeepSpeed source build below.
# Full arch set: CUDA_ARCH_LIST="80;86;89;90"
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --no_clean"
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
# Target SM architectures for DeepSpeed-Kernels (80/86 = Ampere).
ARG CUDA_ARCH_LIST="80;86"
ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
# DS_BUILD_* toggle pre-compilation of individual DeepSpeed C++/CUDA ops.
ARG DS_BUILD_SPARSE_ATTN=0
ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
ARG DS_BUILD_FUSED_ADAM=0
ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
ARG DS_BUILD_CPU_ADAM=0
ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
# DS_BUILD_OPS=1 pre-builds all compatible ops at install time.
ARG DS_BUILD_OPS=1
ENV DS_BUILD_OPS=${DS_BUILD_OPS}
# Let dependent builds locate the CUTLASS checkout installed above.
ENV CUTLASS_PATH=/opt/cutlass
# install deepspeed
RUN <<EOT
#!/bin/bash
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
python -m pip install setuptools==${SETUPTOOLS_VERSION}
# DeepSpeed-Kernels: custom CUDA kernels for the arches in ${CUDA_ARCH_LIST}.
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
cd ${STAGE_DIR}/DeepSpeed-Kernels
# FIX: install straight from source; the previous extra "setup.py bdist_wheel"
# step compiled everything a second time and its wheel was never used.
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} pip install -v .
# DeepSpeed itself: build a wheel (DS_BUILD_* env vars above control which
# C++/CUDA ops are pre-compiled) and force-reinstall over any pip version.
git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
cd ${STAGE_DIR}/DeepSpeed
python setup.py bdist_wheel
pip install dist/deepspeed*.whl --force-reinstall
cd ${STAGE_DIR}
# Source tree kept for in-container rebuilds:
# rm -rf ${STAGE_DIR}/DeepSpeed
EOT
RUN <<EOT
#!/bin/bash
# FIX: fail fast -- previously a failed develop-install was silently ignored.
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# install transformers from source as a develop (editable) install
git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers
cd ${STAGE_DIR}/transformers
python3 ./setup.py develop
# NOTE(review): pydantic pinned below 2.x -- presumably for compatibility with
# the deepspeed/transformers stack above; confirm before unpinning.
python3 -m pip install -U --no-cache-dir "pydantic<2"
# install flash-attn (a no-op if an earlier layer already satisfied it)
pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
EOT
RUN <<EOT
#!/bin/bash
# FIX: added missing shebang and fail-fast, matching the other RUN heredocs.
set -e
# Extra fine-tuning / vision-language utility packages.
# NOTE(review): several of these (peft, tiktoken, xformers, ...) were already
# installed by earlier layers; pip skips requirements that are satisfied.
pip install optimum
pip install peft tiktoken \
tqdm matplotlib seaborn numpy pandas scikit-learn diffusers \
huggingface_hub spacy blobfile pycocotools \
xformers open_clip_torch \
zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
EOT
# add vscode server
# RUN <<EOT
# #!/bin/bash
# wget -qO- https://update.code.visualstudio.com/commit:${commit_id}/server-linux-x64/stable
# code-server --install-extension ms-python.vscode-pylance
# EOT
# Start the SSH service, then keep the container alive.
# FIX: "&&" surfaces an sshd startup failure (the old ";" ignored it), and
# "exec" makes tail PID 1 so it receives SIGTERM from "docker stop".
CMD ["/bin/bash", "-c", "service ssh start && exec tail -f /dev/null"]

12
finetune/README.md Normal file
View File

@@ -0,0 +1,12 @@
## deepspeed docker image build
```shell
docker-compose -f docker-compose_pytorch1.13.yml build
docker-compose -f docker-compose_pytorch2.3.yml build
```
## test command
```shell
docker run -it --gpus all --name deepspeed_test --shm-size=1gb --rm hotwa/deepspeed:latest /bin/bash
```

View File

@@ -12,6 +12,7 @@ services:
tty: true
restart: unless-stopped
image: hotwa/finetune:test
shm_size: '32gb'
ports:
- 3227:22
environment:

View File

@@ -0,0 +1,45 @@
version: '3.8'
services:
  ubuntu-finetune:
    build:
      context: .
      dockerfile: Dockerfile
      # PyTorch / Python / pytorch_lightning version compatibility table:
      # https://blog.csdn.net/qq_41813454/article/details/137421822
      args:
        CUDA_VERSION: 11.7.1
        PYTORCH_VERSION: 1.13.1
        TORCHVISION_VERSION: 0.14.1
        TORCHAUDIO_VERSION: 0.13.1
        DS_BUILD_OPS: 1
        DS_BUILD_SPARSE_ATTN: 0
        DS_BUILD_FUSED_ADAM: 0
        DS_BUILD_CPU_ADAM: 0
        CUDA: cu117
        # Ampere only. NOTE: RTX 4090 needs arch 89; full set: "80;86;89;90"
        CUDA_ARCH_LIST: "80;86"
        SETUPTOOLS_VERSION: "69.5.1"
    volumes:
      - ./src:/bbtft
    container_name: ubuntu-finetune
    pull_policy: if_not_present
    tty: true
    restart: unless-stopped
    image: hotwa/deepspeed:test
    shm_size: '32gb'
    ports:
      - 3227:22
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    networks:
      - network_finetune
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
networks:
  network_finetune:
    name: network_finetune

View File

@@ -0,0 +1,45 @@
version: '3.8'
services:
  ubuntu-finetune:
    build:
      context: .
      dockerfile: Dockerfile
      # PyTorch / Python / pytorch_lightning version compatibility table:
      # https://blog.csdn.net/qq_41813454/article/details/137421822
      args:
        CUDA_VERSION: 12.1.0
        PYTORCH_VERSION: 2.3.0
        TORCHVISION_VERSION: 0.18.0
        TORCHAUDIO_VERSION: 2.3.0
        DS_BUILD_OPS: 1
        DS_BUILD_SPARSE_ATTN: 0
        DS_BUILD_FUSED_ADAM: 0
        DS_BUILD_CPU_ADAM: 0
        CUDA: cu121
        # Full arch set incl. Ada (89, RTX 4090) and Hopper (90).
        CUDA_ARCH_LIST: "80;86;89;90"
        SETUPTOOLS_VERSION: "69.5.1"
    volumes:
      - ./src:/bbtft
    container_name: ubuntu-finetune
    pull_policy: if_not_present
    tty: true
    restart: unless-stopped
    image: hotwa/deepspeed:test
    shm_size: '32gb'
    ports:
      - 3227:22
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    networks:
      - network_finetune
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
networks:
  network_finetune:
    name: network_finetune