update
This commit is contained in:
@@ -1,3 +1,9 @@
|
|||||||
|
# syntax=docker/dockerfile:1
|
||||||
|
|
||||||
|
# NOTE: Building this image requires Docker version >= 23.0.
|
||||||
|
#
|
||||||
|
# For reference:
|
||||||
|
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
|
||||||
ARG CUDA_VERSION=12.1.0
|
ARG CUDA_VERSION=12.1.0
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04
|
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04
|
||||||
ARG DEBIAN_FRONTEND="noninteractive"
|
ARG DEBIAN_FRONTEND="noninteractive"
|
||||||
@@ -105,7 +111,7 @@ ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}
|
|||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
source /opt/conda/etc/profile.d/conda.sh
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} pyyaml ipython -c conda-forge -y
|
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} -c conda-forge -y
|
||||||
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
|
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
|
||||||
conda activate ${CONDA_ENV_NAME}
|
conda activate ${CONDA_ENV_NAME}
|
||||||
# 克隆 ninja 源码并编译
|
# 克隆 ninja 源码并编译
|
||||||
@@ -121,7 +127,7 @@ conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' p
|
|||||||
# 运行 Ninja 单元测试
|
# 运行 Ninja 单元测试
|
||||||
./ninja_test
|
./ninja_test
|
||||||
python3 -m pip install --no-cache-dir --upgrade pip
|
python3 -m pip install --no-cache-dir --upgrade pip
|
||||||
python -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python
|
python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python
|
||||||
conda clean -afy
|
conda clean -afy
|
||||||
git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd ..
|
git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd ..
|
||||||
python -m pip install setuptools==${SETUPTOOLS_VERSION}
|
python -m pip install setuptools==${SETUPTOOLS_VERSION}
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
# syntax=docker/dockerfile:1
|
||||||
|
|
||||||
|
# NOTE: Building this image requires Docker version >= 23.0.
|
||||||
|
#
|
||||||
|
# For reference:
|
||||||
|
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
|
||||||
ARG CUDA_VERSION=12.1.0
|
ARG CUDA_VERSION=12.1.0
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04
|
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04
|
||||||
ARG DEBIAN_FRONTEND="noninteractive"
|
ARG DEBIAN_FRONTEND="noninteractive"
|
||||||
@@ -81,6 +87,8 @@ ENV NV_PEER_MEM_VERSION=1.2
|
|||||||
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
|
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
|
||||||
ENV OPENMPI_BASEVERSION=4.1
|
ENV OPENMPI_BASEVERSION=4.1
|
||||||
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
|
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
|
||||||
|
ARG CUDA_NUM='121'
|
||||||
|
ENV CUDA_NUM=${CUDA_NUM}
|
||||||
ARG CUDA='cu121'
|
ARG CUDA='cu121'
|
||||||
ENV CUDA=${CUDA}
|
ENV CUDA=${CUDA}
|
||||||
ARG PYTORCH_VERSION=2.3.0
|
ARG PYTORCH_VERSION=2.3.0
|
||||||
@@ -91,6 +99,9 @@ ARG TORCHAUDIO_VERSION=2.3.0
|
|||||||
ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION}
|
ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION}
|
||||||
ARG PYTORCH_CUDA_VERSION=12.1
|
ARG PYTORCH_CUDA_VERSION=12.1
|
||||||
ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION}
|
ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION}
|
||||||
|
ENV TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a"
|
||||||
|
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
|
||||||
|
# Dockerfile ENV performs no shell command substitution, so the previous value
# "$(dirname $(which conda))/../" was stored as literal text and never evaluated.
# The RUN steps in this file source conda.sh from /opt/conda, which is the prefix
# that expression was meant to resolve to.
ENV CMAKE_PREFIX_PATH="/opt/conda"
|
||||||
ENV MLNX_OFED_VERSION=4.9-7.1.0.0
|
ENV MLNX_OFED_VERSION=4.9-7.1.0.0
|
||||||
ARG SETUPTOOLS_VERSION=69.5.1
|
ARG SETUPTOOLS_VERSION=69.5.1
|
||||||
ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
|
ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
|
||||||
@@ -105,7 +116,7 @@ ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}
|
|||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
source /opt/conda/etc/profile.d/conda.sh
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} pyyaml ipython cmake ninja -c conda-forge -y
|
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} cmake ninja -c conda-forge -y
|
||||||
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
|
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
|
||||||
conda activate ${CONDA_ENV_NAME}
|
conda activate ${CONDA_ENV_NAME}
|
||||||
python3 -m pip install --no-cache-dir --upgrade pip
|
python3 -m pip install --no-cache-dir --upgrade pip
|
||||||
@@ -120,11 +131,14 @@ python3 -m pip uninstall -y torch torchvision torchaudio
|
|||||||
# # install pytorch create conda env aleay exists
|
# # install pytorch create conda env aleay exists
|
||||||
git clone --recursive https://github.com/pytorch/pytorch ${STAGE_DIR}/pytorch
|
git clone --recursive https://github.com/pytorch/pytorch ${STAGE_DIR}/pytorch
|
||||||
cd ${STAGE_DIR}/pytorch
|
cd ${STAGE_DIR}/pytorch
|
||||||
|
git checkout v${PYTORCH_VERSION}
|
||||||
git submodule sync
|
git submodule sync
|
||||||
git submodule update --init --recursive
|
git submodule update --init --recursive
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
|
conda install -y intel::mkl-static intel::mkl-include
|
||||||
|
conda install -y -c pytorch magma-cuda${CUDA_NUM}
|
||||||
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
|
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
|
||||||
python setup.py develop
|
python setup.py install
|
||||||
# python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA}
|
# python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA}
|
||||||
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||||
python3 -m pip uninstall -y transformer-engine
|
python3 -m pip uninstall -y transformer-engine
|
||||||
|
|||||||
372
finetune/Dockerfile.conda1
Normal file
372
finetune/Dockerfile.conda1
Normal file
@@ -0,0 +1,372 @@
|
|||||||
|
# syntax=docker/dockerfile:1
|
||||||
|
|
||||||
|
# NOTE: Building this image requires Docker version >= 23.0.
|
||||||
|
#
|
||||||
|
# For reference:
|
||||||
|
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
|
||||||
|
|
||||||
|
ARG CUDA_VERSION=12.1.0
|
||||||
|
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04
|
||||||
|
ARG DEBIAN_FRONTEND="noninteractive"
|
||||||
|
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
|
||||||
|
# ENV does not expand "~"; the value would be the literal string "~/micromamba".
# Spell out the absolute path instead (assumes the build user's home is /root,
# which matches the WORKDIR set below — confirm if a different USER is introduced).
ENV MAMBA_ROOT_PREFIX=/root/micromamba
|
||||||
|
ARG ROOT_PASSWD="root"
|
||||||
|
ENV ROOT_PASSWD=${ROOT_PASSWD}
|
||||||
|
WORKDIR /root
|
||||||
|
SHELL ["/bin/bash", "-c"]
|
||||||
|
|
||||||
|
# Base tools
|
||||||
|
RUN <<EOT
|
||||||
|
apt-get update
|
||||||
|
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip
|
||||||
|
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
|
||||||
|
add-apt-repository ppa:git-core/ppa -y
|
||||||
|
apt-get install -y git libnuma-dev wget
|
||||||
|
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
|
||||||
|
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
|
||||||
|
sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
|
||||||
|
sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
|
||||||
|
sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
|
||||||
|
mkdir /var/run/sshd
|
||||||
|
# Double quotes so the shell expands ROOT_PASSWD; with single quotes the root
# password was set to the literal text ${ROOT_PASSWD}.
echo "root:${ROOT_PASSWD}" | chpasswd
|
||||||
|
mkdir -p ~/.pip
|
||||||
|
wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
|
||||||
|
bash /tmp/miniconda.sh -b -p /opt/conda
|
||||||
|
rm /tmp/miniconda.sh
|
||||||
|
# /opt/conda/bin is not on PATH at this point in the build, so invoke the
# freshly installed conda binary by its absolute path.
/opt/conda/bin/conda init bash
|
||||||
|
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
|
||||||
|
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
|
||||||
|
cat <<EOF > ~/.condarc
|
||||||
|
channels:
|
||||||
|
- conda-forge
|
||||||
|
- bioconda
|
||||||
|
- pytorch
|
||||||
|
- pytorch-nightly
|
||||||
|
- nvidia
|
||||||
|
- defaults
|
||||||
|
show_channel_urls: true
|
||||||
|
EOF
|
||||||
|
echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh)
|
||||||
|
micromamba shell init -s bash -p ~/micromamba
|
||||||
|
cat <<'EOF' >> ~/.bashrc
|
||||||
|
source ~/micromamba/etc/profile.d/micromamba.sh
|
||||||
|
alias mamba=micromamba
|
||||||
|
alias mba=mamba
|
||||||
|
EOF
|
||||||
|
cat <<EOF > ~/.mambarc
|
||||||
|
channels:
|
||||||
|
- conda-forge
|
||||||
|
- bioconda
|
||||||
|
- pytorch
|
||||||
|
- pytorch-nightly
|
||||||
|
- nvidia
|
||||||
|
- defaults
|
||||||
|
show_channel_urls: true
|
||||||
|
EOF
|
||||||
|
EOT
|
||||||
|
|
||||||
|
# PyTorch
|
||||||
|
ARG CONDA_ENV_NAME="deepspeed"
|
||||||
|
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
|
||||||
|
ARG PYTHON_VERSION=3.10
|
||||||
|
ENV PYTHON_VERSION=${PYTHON_VERSION}
|
||||||
|
# Put the base conda install and the project environment ahead of the system PATH.
# Uses key=value form; the space-separated ENV form is deprecated legacy syntax.
ENV PATH=/opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH
|
||||||
|
ARG PYTORCH_VERSION=2.3.0
|
||||||
|
ENV PYTORCH_VERSION=${PYTORCH_VERSION}
|
||||||
|
ENV TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a"
|
||||||
|
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
|
||||||
|
# Dockerfile ENV performs no shell command substitution, so the previous value
# "$(dirname $(which conda))/../" was stored as literal text and never evaluated.
# Miniconda is installed to /opt/conda earlier in this file, which is the prefix
# that expression was meant to resolve to.
ENV CMAKE_PREFIX_PATH="/opt/conda"
|
||||||
|
ARG CUDA_NUM='121'
|
||||||
|
ENV CUDA_NUM=${CUDA_NUM}
# The RUN steps below reference the following variables, but this file never
# declared them, so they expanded to empty strings (e.g. "torch==<ver>+" and
# "git clone ... /pytorch"). Defaults mirror the sibling Dockerfile where visible.
# Wheel tag used for the torch/torchvision version suffix and pip index URL.
ARG CUDA='cu121'
ENV CUDA=${CUDA}
ARG SETUPTOOLS_VERSION=69.5.1
ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
# Scratch directory for source checkouts. Later steps run `rm -rf ${STAGE_DIR}`,
# so it must be a dedicated directory rather than a shared one.
# NOTE(review): this path is a choice made here — confirm against the sibling Dockerfile.
ENV STAGE_DIR=/tmp/stage
ENV MLNX_OFED_VERSION=4.9-7.1.0.0
ENV NV_PEER_MEM_VERSION=1.2
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
|
||||||
|
|
||||||
|
RUN <<EOT
|
||||||
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
|
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} cmake ninja -c conda-forge -y
|
||||||
|
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
|
||||||
|
conda activate ${CONDA_ENV_NAME}
|
||||||
|
python3 -m pip install --no-cache-dir --upgrade pip
|
||||||
|
conda clean -afy
|
||||||
|
|
||||||
|
# 获取指定版本的 PyTorch 源代码
|
||||||
|
git clone --recursive https://github.com/pytorch/pytorch ${STAGE_DIR}/pytorch
|
||||||
|
cd ${STAGE_DIR}/pytorch
|
||||||
|
git checkout v${PYTORCH_VERSION}
|
||||||
|
git submodule sync
|
||||||
|
git submodule update --init --recursive
|
||||||
|
|
||||||
|
# 安装依赖项
|
||||||
|
conda install -y intel::mkl-static intel::mkl-include
|
||||||
|
conda install -y -c pytorch magma-cuda${CUDA_NUM}
|
||||||
|
|
||||||
|
# 构建和安装 PyTorch
|
||||||
|
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
|
||||||
|
python setup.py install
|
||||||
|
|
||||||
|
# 安装其他必要的依赖项
|
||||||
|
python -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python
|
||||||
|
python -m pip install setuptools==69.5.1
|
||||||
|
python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
|
||||||
|
python3 -m pip uninstall -y torch torchvision torchaudio
|
||||||
|
python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==0.18.0+${CUDA} torchaudio==2.3.0 --extra-index-url https://download.pytorch.org/whl/${CUDA}
|
||||||
|
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||||
|
python3 -m pip uninstall -y transformer-engine
|
||||||
|
python3 -m pip uninstall -y torch-tensorrt
|
||||||
|
python3 -m pip uninstall -y apex
|
||||||
|
EOT
|
||||||
|
|
||||||
|
# install apex
|
||||||
|
RUN <<EOT
|
||||||
|
#!/bin/bash
|
||||||
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
|
conda activate ${CONDA_ENV_NAME}
|
||||||
|
git clone https://github.com/NVIDIA/apex ${STAGE_DIR}/apex
|
||||||
|
# Enter the directory the clone above actually wrote to; a bare `cd apex` is
# resolved against the working directory, not against ${STAGE_DIR}.
cd ${STAGE_DIR}/apex
|
||||||
|
# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
|
||||||
|
MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
|
||||||
|
python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
|
||||||
|
cd ..
|
||||||
|
rm -rf ${STAGE_DIR}/apex
|
||||||
|
EOT
|
||||||
|
|
||||||
|
RUN <<EOT
|
||||||
|
#!/bin/bash
|
||||||
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
|
conda activate ${CONDA_ENV_NAME}
|
||||||
|
# Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
|
||||||
|
python3 -m pip uninstall -y deepspeed
|
||||||
|
# This has to be run (again) inside the GPU VMs running the tests.
|
||||||
|
# The installation works here, but some tests fail, if we do not pre-build deepspeed again in the VMs running the tests.
|
||||||
|
# TODO: Find out why test fail. install deepspeed
|
||||||
|
# DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM} python3 -m pip install "deepspeed<=0.14.0" --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
|
||||||
|
# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail
|
||||||
|
# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
|
||||||
|
# install deepspeed prepare
|
||||||
|
# install Mellanox OFED
|
||||||
|
mkdir -p ${STAGE_DIR}
|
||||||
|
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf -
|
||||||
|
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64
|
||||||
|
./mlnxofedinstall --user-space-only --without-fw-update --all -q
|
||||||
|
cd ${STAGE_DIR}
|
||||||
|
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
|
||||||
|
cd ..
|
||||||
|
# install nv_peer_mem
|
||||||
|
rm -rf ${STAGE_DIR}
|
||||||
|
mkdir -p ${STAGE_DIR}
|
||||||
|
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory
|
||||||
|
cd ${STAGE_DIR}/nv_peer_memory
|
||||||
|
./build_module.sh
|
||||||
|
cd ${STAGE_DIR}
|
||||||
|
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
|
||||||
|
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
|
||||||
|
apt-get update
|
||||||
|
apt-get install -y dkms
|
||||||
|
dpkg-buildpackage -us -uc
|
||||||
|
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
|
||||||
|
EOT
|
||||||
|
|
||||||
|
# install mpi
|
||||||
|
ENV PATH=/usr/local/mpi/bin:${PATH}
|
||||||
|
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
|
||||||
|
RUN <<EOT
|
||||||
|
#!/bin/bash
|
||||||
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
|
conda activate ${CONDA_ENV_NAME}
|
||||||
|
# OPENMPI
|
||||||
|
rm -rf ${STAGE_DIR}
|
||||||
|
mkdir -p ${STAGE_DIR}
|
||||||
|
cd ${STAGE_DIR}
|
||||||
|
wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf -
|
||||||
|
cd openmpi-${OPENMPI_VERSION}
|
||||||
|
./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION}
|
||||||
|
make -j"$(nproc)" install
|
||||||
|
ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi
|
||||||
|
# Sanity check:
|
||||||
|
test -f /usr/local/mpi/bin/mpic++
|
||||||
|
cd ${STAGE_DIR}
|
||||||
|
rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
|
||||||
|
# Create a wrapper for OpenMPI to allow running as root by default
|
||||||
|
mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real
|
||||||
|
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun
|
||||||
|
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun
|
||||||
|
chmod a+x /usr/local/mpi/bin/mpirun
|
||||||
|
EOT
|
||||||
|
|
||||||
|
# Some Packages
|
||||||
|
RUN <<EOT
|
||||||
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
|
conda activate ${CONDA_ENV_NAME}
|
||||||
|
apt-get update
|
||||||
|
apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev
|
||||||
|
python -m pip install https://github.com/mpi4py/mpi4py/tarball/master
|
||||||
|
python -m pip install psutil \
|
||||||
|
yappi \
|
||||||
|
cffi \
|
||||||
|
ipdb \
|
||||||
|
pandas \
|
||||||
|
matplotlib \
|
||||||
|
py3nvml \
|
||||||
|
pyarrow \
|
||||||
|
graphviz \
|
||||||
|
astor \
|
||||||
|
boto3 \
|
||||||
|
tqdm \
|
||||||
|
sentencepiece \
|
||||||
|
msgpack \
|
||||||
|
requests \
|
||||||
|
pandas \
|
||||||
|
sphinx \
|
||||||
|
sphinx_rtd_theme \
|
||||||
|
scipy \
|
||||||
|
numpy \
|
||||||
|
scikit-learn \
|
||||||
|
nvidia-ml-py3
|
||||||
|
EOT
|
||||||
|
|
||||||
|
# SSH daemon port inside container cannot conflict with host OS port
|
||||||
|
ENV SSH_PORT=2222
|
||||||
|
RUN <<EOT
|
||||||
|
#!/bin/bash
|
||||||
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
|
conda activate ${CONDA_ENV_NAME}
|
||||||
|
cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
|
||||||
|
sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
|
||||||
|
EOT
|
||||||
|
|
||||||
|
# 29.78 Usage: install.sh [options...]
|
||||||
|
# 29.78
|
||||||
|
# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in
|
||||||
|
# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally
|
||||||
|
# 29.78
|
||||||
|
# 29.78 [optional]
|
||||||
|
# 29.78 -l, --local_only Install only on local machine
|
||||||
|
# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo)
|
||||||
|
# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo)
|
||||||
|
# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels
|
||||||
|
# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror)
|
||||||
|
# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile)
|
||||||
|
# 29.78 -e, --examples Checkout deepspeed example submodule (no install)
|
||||||
|
# 29.78 -v, --verbose Verbose logging
|
||||||
|
# 29.78 -h, --help This help text
|
||||||
|
|
||||||
|
RUN <<EOT
|
||||||
|
#!/bin/bash
|
||||||
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
|
conda activate ${CONDA_ENV_NAME}
|
||||||
|
useradd --create-home --uid 1000 --shell /bin/bash deepspeed
|
||||||
|
usermod -aG sudo deepspeed
|
||||||
|
echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
|
||||||
|
EOT
|
||||||
|
|
||||||
|
# install cutlass https://github.com/NVIDIA/cutlass
|
||||||
|
# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" )
|
||||||
|
# A100: architecture is Ampere
|
||||||
|
# V100: architecture is Volta
|
||||||
|
# T4: architecture is Turing
|
||||||
|
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
|
||||||
|
# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。
|
||||||
|
# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。
|
||||||
|
# 80:适用于 NVIDIA Ampere 架构(如 A100)。
|
||||||
|
# 90a:适用于 NVIDIA Hopper 架构(如 H100)。
|
||||||
|
# 89:GeForce RTX 4090
|
||||||
|
ARG DCUTLASS_NVCC_ARCHS="89"
|
||||||
|
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
|
||||||
|
RUN <<EOT
|
||||||
|
#!/bin/bash
|
||||||
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
|
conda activate ${CONDA_ENV_NAME}
|
||||||
|
git clone https://github.com/NVIDIA/cutlass /opt/cutlass
|
||||||
|
cd /opt/cutlass
|
||||||
|
git checkout .
|
||||||
|
git checkout master
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON # target architectures come from DCUTLASS_NVCC_ARCHS (default "89"); tests are disabled and unity build is enabled
|
||||||
|
make -j"$(nproc)" install
|
||||||
|
cd ..
|
||||||
|
# make test_unit -j"$(nproc)"
|
||||||
|
# make test_unit_gemm_warp -j"$(nproc)"
|
||||||
|
EOT
|
||||||
|
|
||||||
|
# CUDA_ARCH_LIST="80;86;89;90"
|
||||||
|
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --no_clean"
|
||||||
|
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
|
||||||
|
ARG CUDA_ARCH_LIST="80;86;89;90"
|
||||||
|
ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
|
||||||
|
ARG DS_BUILD_SPARSE_ATTN=0
|
||||||
|
ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
|
||||||
|
ARG DS_BUILD_FUSED_ADAM=1
|
||||||
|
ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
|
||||||
|
ARG DS_BUILD_CPU_ADAM=0
|
||||||
|
ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
|
||||||
|
ARG DS_BUILD_OPS=1
|
||||||
|
ENV DS_BUILD_OPS=${DS_BUILD_OPS}
|
||||||
|
ENV CUTLASS_PATH=/opt/cutlass
|
||||||
|
# install deepspeed
|
||||||
|
RUN <<EOT
|
||||||
|
#!/bin/bash
|
||||||
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
|
conda activate ${CONDA_ENV_NAME}
|
||||||
|
python -m pip install setuptools==${SETUPTOOLS_VERSION}
|
||||||
|
# install oneapi for deepspeed
|
||||||
|
git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
|
||||||
|
cd ${STAGE_DIR}/oneCCL
|
||||||
|
git checkout .
|
||||||
|
git checkout master
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
|
||||||
|
# Bound parallelism to the CPU count, as the other make invocations in this
# file do; a bare `-j` places no limit on the number of simultaneous jobs.
make -j"$(nproc)" install
|
||||||
|
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
|
||||||
|
cd ${STAGE_DIR}/DeepSpeed-Kernels
|
||||||
|
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python setup.py bdist_wheel
|
||||||
|
# pip install dist/deepspeed_kernels-*.whl
|
||||||
|
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} pip install -v .
|
||||||
|
git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
|
||||||
|
cd ${STAGE_DIR}/DeepSpeed
|
||||||
|
git checkout .
|
||||||
|
git checkout master
|
||||||
|
python setup.py bdist_wheel
|
||||||
|
DS_BUILD_OPS=${DS_BUILD_OPS} pip install dist/deepspeed*.whl --force-reinstall
|
||||||
|
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -r requirements/requirements.txt
|
||||||
|
# DS_BUILD_OPS=0 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_CPU_ADAM=0 DS_BUILD_FUSED_ADAM=1 pip install -U --no-cache-dir .
|
||||||
|
# ./install.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /job/hostfile # ./install.sh --allow_sudo --pip_sudo --no_clean --hostfile /path/to/your/hostfile
|
||||||
|
cd ..
|
||||||
|
# rm -rf ${STAGE_DIR}/DeepSpeed
|
||||||
|
EOT
|
||||||
|
|
||||||
|
RUN <<EOT
|
||||||
|
#!/bin/bash
|
||||||
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
|
conda activate ${CONDA_ENV_NAME}
|
||||||
|
# install transformers
|
||||||
|
git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers
|
||||||
|
cd ${STAGE_DIR}/transformers
|
||||||
|
python3 ./setup.py develop
|
||||||
|
python3 -m pip install -U --no-cache-dir "pydantic<2"
|
||||||
|
# install flash-attn
|
||||||
|
# pip install packaging -i https://pypi.org/simple/ --trusted-host pypi.org
|
||||||
|
pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
|
||||||
|
EOT
|
||||||
|
|
||||||
|
RUN <<EOT
|
||||||
|
#!/bin/bash
|
||||||
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
|
conda activate ${CONDA_ENV_NAME}
|
||||||
|
pip install optimum
|
||||||
|
pip install peft tiktoken \
|
||||||
|
tqdm matplotlib seaborn numpy pandas scikit-learn diffusers \
|
||||||
|
huggingface_hub spacy blobfile pycocotools \
|
||||||
|
xformers open_clip_torch \
|
||||||
|
zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
|
||||||
|
EOT
|
||||||
|
|
||||||
|
# add vscode server
|
||||||
|
# RUN <<EOT
|
||||||
|
# #!/bin/bash
|
||||||
|
# wget -qO- https://update.code.visualstudio.com/commit:${commit_id}/server-linux-x64/stable
|
||||||
|
# code-server --install-extension ms-python.vscode-pylance
|
||||||
|
# EOT
|
||||||
|
|
||||||
|
# 启动 ssh 服务
|
||||||
|
# CMD ["/bin/bash", "-c", "service ssh start; tail -f /dev/null"]
|
||||||
|
CMD ["/usr/sbin/sshd", "-D"]
|
||||||
@@ -12,7 +12,7 @@ services:
|
|||||||
TORCHVISION_VERSION: 0.14.1
|
TORCHVISION_VERSION: 0.14.1
|
||||||
TORCHAUDIO_VERSION: 0.13.1
|
TORCHAUDIO_VERSION: 0.13.1
|
||||||
DS_BUILD_OPS: 1
|
DS_BUILD_OPS: 1
|
||||||
DS_BUILD_SPARSE_ATTN: 0
|
DS_BUILD_SPARSE_ATTN: 1
|
||||||
DS_BUILD_FUSED_ADAM: 1
|
DS_BUILD_FUSED_ADAM: 1
|
||||||
DS_BUILD_CPU_ADAM: 1
|
DS_BUILD_CPU_ADAM: 1
|
||||||
USE_CUDA: 1
|
USE_CUDA: 1
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ services:
|
|||||||
ubuntu-finetune:
|
ubuntu-finetune:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: .
|
||||||
dockerfile: Dockerfile.conda
|
dockerfile: Dockerfile.conda1
|
||||||
args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822
|
args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822
|
||||||
PYTHON_VERSION: 3.9 # sparse attion 最新支持到3.9
|
PYTHON_VERSION: 3.9 # sparse attion 最新支持到3.9
|
||||||
CUDA_VERSION: 11.7.1 # pytorch 1.13.1 对应cuda 11.7.1
|
CUDA_VERSION: 11.7.1 # pytorch 1.13.1 对应cuda 11.7.1
|
||||||
@@ -18,6 +18,7 @@ services:
|
|||||||
USE_CUDA: 1
|
USE_CUDA: 1
|
||||||
USE_ROCM: 0
|
USE_ROCM: 0
|
||||||
USE_XPU: 0
|
USE_XPU: 0
|
||||||
|
CUDA_NUM: 117
|
||||||
CUDA: cu117
|
CUDA: cu117
|
||||||
CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
|
CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
|
||||||
SETUPTOOLS_VERSION: "69.5.1"
|
SETUPTOOLS_VERSION: "69.5.1"
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ services:
|
|||||||
args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822
|
args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822
|
||||||
PYTHON_VERSION: 3.9
|
PYTHON_VERSION: 3.9
|
||||||
CUDA_VERSION: 12.1.0
|
CUDA_VERSION: 12.1.0
|
||||||
PYTORCH_VERSION: 2.3.0
|
PYTORCH_VERSION: 2.3.1
|
||||||
TORCHVISION_VERSION: 0.18.0
|
TORCHVISION_VERSION: 0.18.0
|
||||||
TORCHAUDIO_VERSION: 2.3.0
|
TORCHAUDIO_VERSION: 2.3.0
|
||||||
DS_BUILD_OPS: 1
|
DS_BUILD_OPS: 1
|
||||||
|
|||||||
Reference in New Issue
Block a user