add
This commit is contained in:
@@ -1,330 +0,0 @@
|
|||||||
# syntax=docker/dockerfile:1
|
|
||||||
|
|
||||||
# NOTE: Building this image requires Docker version >= 23.0.
|
|
||||||
#
|
|
||||||
# For reference:
|
|
||||||
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
|
|
||||||
ARG CUDA_VERSION=12.1.0
|
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04
|
|
||||||
ARG HTTP_PROXY
|
|
||||||
ARG HTTPS_PROXY
|
|
||||||
ENV http_proxy=${HTTP_PROXY}
|
|
||||||
ENV https_proxy=${HTTPS_PROXY}
|
|
||||||
ARG DEBIAN_FRONTEND="noninteractive"
|
|
||||||
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
|
|
||||||
# ENV performs no tilde expansion, so spell out root's home directory explicitly.
ENV MAMBA_ROOT_PREFIX=/root/micromamba
|
|
||||||
ARG ROOT_PASSWD="root"
|
|
||||||
ENV ROOT_PASSWD=${ROOT_PASSWD}
|
|
||||||
WORKDIR /root
|
|
||||||
SHELL ["/bin/bash", "-c"]
|
|
||||||
COPY id_rsa.pub /root/.ssh/id_rsa.pub
|
|
||||||
|
|
||||||
# base tools
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags && \
|
|
||||||
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2 && \
|
|
||||||
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c && \
|
|
||||||
add-apt-repository ppa:git-core/ppa -y && \
|
|
||||||
apt-get install -y git libnuma-dev wget && \
|
|
||||||
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc | sudo apt-key add - && \
|
|
||||||
sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
|
|
||||||
sudo apt-get update && \
|
|
||||||
sudo apt-get install -y cmake && \
|
|
||||||
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
|
|
||||||
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config && \
|
|
||||||
sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config && \
|
|
||||||
sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config && \
|
|
||||||
mkdir /var/run/sshd && \
|
|
||||||
echo "root:${ROOT_PASSWD}" | chpasswd && \
|
|
||||||
mkdir -p ~/.pip && \
|
|
||||||
# Download the Miniconda installer to a file (single, unambiguous -O target).
wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && \
|
|
||||||
bash /tmp/miniconda.sh -b -p /opt/conda && \
|
|
||||||
rm /tmp/miniconda.sh && \
|
|
||||||
# /opt/conda/bin is not on PATH yet in this RUN, so call conda by absolute path.
/opt/conda/bin/conda init bash && \
|
|
||||||
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
|
|
||||||
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
|
|
||||||
cat <<EOF > ~/.condarc
|
|
||||||
channels:
|
|
||||||
- conda-forge
|
|
||||||
- bioconda
|
|
||||||
- pytorch
|
|
||||||
- pytorch-nightly
|
|
||||||
- nvidia
|
|
||||||
- defaults
|
|
||||||
show_channel_urls: true
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
|
|
||||||
# PyTorch
|
|
||||||
ARG CONDA_ENV_NAME="deepspeed"
|
|
||||||
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
|
|
||||||
ARG PYTHON_VERSION=3.10
|
|
||||||
ENV PYTHON_VERSION=${PYTHON_VERSION}
|
|
||||||
ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH
|
|
||||||
ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3"
|
|
||||||
ENV REF='main'
|
|
||||||
ENV STAGE_DIR=/tmp
|
|
||||||
ENV OPENMPI_BASEVERSION=4.1
|
|
||||||
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
|
|
||||||
ARG CUDA='cu121'
|
|
||||||
ENV CUDA=${CUDA}
|
|
||||||
ARG PYTORCH_VERSION=2.3.1
|
|
||||||
ENV PYTORCH_VERSION=${PYTORCH_VERSION}
|
|
||||||
ARG TORCHVISION_VERSION=0.18.1
|
|
||||||
ENV TORCHVISION_VERSION=${TORCHVISION_VERSION}
|
|
||||||
ARG TORCHAUDIO_VERSION=2.3.1
|
|
||||||
ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION}
|
|
||||||
ARG PYTORCH_CUDA_VERSION=12.1
|
|
||||||
ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION}
|
|
||||||
ARG SETUPTOOLS_VERSION=69.5.1
|
|
||||||
ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
|
|
||||||
ARG USE_CUDA=1
|
|
||||||
ENV USE_CUDA=${USE_CUDA}
|
|
||||||
ARG USE_ROCM=0
|
|
||||||
ENV USE_ROCM=${USE_ROCM}
|
|
||||||
ARG USE_XPU=0
|
|
||||||
ENV USE_XPU=${USE_XPU}
|
|
||||||
ARG _GLIBCXX_USE_CXX11_ABI=1
|
|
||||||
ENV _GLIBCXX_USE_CXX11_ABI=${_GLIBCXX_USE_CXX11_ABI}
|
|
||||||
RUN source /opt/conda/etc/profile.d/conda.sh && \
|
|
||||||
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} -c conda-forge -y && \
|
|
||||||
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc && \
|
|
||||||
which python > ~/python_path.txt && \
|
|
||||||
conda activate ${CONDA_ENV_NAME} && \
|
|
||||||
git clone https://github.com/ninja-build/ninja.git ${STAGE_DIR}/ninja && \
|
|
||||||
cd ${STAGE_DIR}/ninja && \
|
|
||||||
git clone https://github.com/google/googletest.git && \
|
|
||||||
python ./configure.py --bootstrap && \
|
|
||||||
conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest" && \
|
|
||||||
./ninja all && \
|
|
||||||
./ninja_test && \
|
|
||||||
python3 -m pip install --no-cache-dir --upgrade pip && \
|
|
||||||
python3 -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python && \
|
|
||||||
conda clean -afy && \
|
|
||||||
git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. && \
|
|
||||||
python -m pip install setuptools==${SETUPTOOLS_VERSION} && \
|
|
||||||
python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] && \
|
|
||||||
python3 -m pip uninstall -y torch torchvision torchaudio && \
|
|
||||||
python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} xformers --extra-index-url https://download.pytorch.org/whl/${CUDA} && \
|
|
||||||
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate && \
|
|
||||||
python3 -m pip uninstall -y transformer-engine && \
|
|
||||||
python3 -m pip uninstall -y torch-tensorrt && \
|
|
||||||
python3 -m pip uninstall -y apex
|
|
||||||
|
|
||||||
# install apex
|
|
||||||
RUN source /opt/conda/etc/profile.d/conda.sh && \
|
|
||||||
conda activate ${CONDA_ENV_NAME} && \
|
|
||||||
git clone https://github.com/NVIDIA/apex ${STAGE_DIR}/apex && \
|
|
||||||
# The repository was cloned into STAGE_DIR, not into the current working directory.
cd ${STAGE_DIR}/apex && \
|
|
||||||
MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ && \
|
|
||||||
python -c "import apex.amp; print('Apex is installed and the amp module is available.')" && \
|
|
||||||
cd .. && \
|
|
||||||
rm -rf ${STAGE_DIR}/apex
|
|
||||||
|
|
||||||
ENV MLNX_OFED_VERSION=5.8-4.1.5.0
|
|
||||||
RUN source /opt/conda/etc/profile.d/conda.sh && \
|
|
||||||
conda activate ${CONDA_ENV_NAME} && \
|
|
||||||
# Work inside STAGE_DIR so the archive is extracted where the final rm -rf looks for it.
mkdir -p ${STAGE_DIR} && cd ${STAGE_DIR} && \
|
|
||||||
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - && \
|
|
||||||
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 && \
|
|
||||||
./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
|
|
||||||
cd ${STAGE_DIR} && \
|
|
||||||
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
|
|
||||||
|
|
||||||
ENV NV_PEER_MEM_VERSION=1.2
|
|
||||||
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
|
|
||||||
RUN source /opt/conda/etc/profile.d/conda.sh && \
|
|
||||||
conda activate ${CONDA_ENV_NAME} && \
|
|
||||||
# Empty STAGE_DIR in place rather than deleting it: STAGE_DIR is /tmp, and
# recreating it with mkdir would lose its sticky, world-writable mode.
find ${STAGE_DIR} -mindepth 1 -delete && \
|
|
||||||
mkdir -p ${STAGE_DIR} && \
|
|
||||||
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
|
|
||||||
cd ${STAGE_DIR}/nv_peer_memory && \
|
|
||||||
./build_module.sh && \
|
|
||||||
cd ${STAGE_DIR} && \
|
|
||||||
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
|
|
||||||
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
|
|
||||||
apt-get update && \
|
|
||||||
apt --fix-broken install -y && \
|
|
||||||
apt-get install -y dkms && \
|
|
||||||
dpkg-buildpackage -us -uc && \
|
|
||||||
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
|
|
||||||
|
|
||||||
# install mpi
|
|
||||||
ENV PATH=/usr/local/mpi/bin:${PATH}
|
|
||||||
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
|
|
||||||
RUN source /opt/conda/etc/profile.d/conda.sh && \
|
|
||||||
conda activate ${CONDA_ENV_NAME} && \
|
|
||||||
# Empty STAGE_DIR in place rather than deleting it: STAGE_DIR is /tmp, and
# recreating it with mkdir would lose its sticky, world-writable mode.
find ${STAGE_DIR} -mindepth 1 -delete && \
|
|
||||||
mkdir -p ${STAGE_DIR} && \
|
|
||||||
cd ${STAGE_DIR} && \
|
|
||||||
wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
|
|
||||||
cd openmpi-${OPENMPI_VERSION} && \
|
|
||||||
./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
|
|
||||||
make -j"$(nproc)" install && \
|
|
||||||
ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
|
|
||||||
test -f /usr/local/mpi/bin/mpic++ && \
|
|
||||||
cd ${STAGE_DIR} && \
|
|
||||||
rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} && \
|
|
||||||
mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
|
|
||||||
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
|
|
||||||
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
|
|
||||||
chmod a+x /usr/local/mpi/bin/mpirun
|
|
||||||
|
|
||||||
# SSH daemon port inside container cannot conflict with host OS port
|
|
||||||
# ENV SSH_PORT=2222
|
|
||||||
# RUN source /opt/conda/etc/profile.d/conda.sh && \
|
|
||||||
# conda activate ${CONDA_ENV_NAME} && \
|
|
||||||
# cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
|
|
||||||
# sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
|
|
||||||
|
|
||||||
RUN source /opt/conda/etc/profile.d/conda.sh && \
|
|
||||||
conda activate ${CONDA_ENV_NAME} && \
|
|
||||||
useradd --create-home --uid 1000 --shell /bin/bash deepspeed && \
|
|
||||||
usermod -aG sudo deepspeed && \
|
|
||||||
echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
|
|
||||||
|
|
||||||
# install cutlass https://github.com/NVIDIA/cutlass
|
|
||||||
# H100: architecture is Hopper (CUTLASS needs: cmake .. -DCUTLASS_NVCC_ARCHS="90a")
|
|
||||||
# A100: architecture is Ampere
|
|
||||||
# V100: architecture is Volta
|
|
||||||
# T4: architecture is Turing
|
|
||||||
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
|
|
||||||
# 70: for the NVIDIA Volta architecture (e.g. Tesla V100).
|
|
||||||
# 75: for the NVIDIA Turing architecture (e.g. Tesla T4).
|
|
||||||
# 80: for the NVIDIA Ampere architecture (e.g. A100).
|
|
||||||
# 90a: for the NVIDIA Hopper architecture (e.g. H100).
|
|
||||||
# 89: GeForce RTX 4090
|
|
||||||
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
|
|
||||||
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
|
|
||||||
RUN source /opt/conda/etc/profile.d/conda.sh && \
|
|
||||||
conda activate ${CONDA_ENV_NAME} && \
|
|
||||||
git clone https://github.com/NVIDIA/cutlass /opt/cutlass && \
|
|
||||||
cd /opt/cutlass && \
|
|
||||||
git checkout . && \
|
|
||||||
git checkout master && \
|
|
||||||
mkdir build && \
|
|
||||||
cd build && \
|
|
||||||
cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON && \
|
|
||||||
make -j"$(nproc)" install && \
|
|
||||||
cd ..
|
|
||||||
|
|
||||||
# Some Packages from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
|
|
||||||
# RUN source /opt/conda/etc/profile.d/conda.sh && \
|
|
||||||
# conda activate ${CONDA_ENV_NAME} && \
|
|
||||||
# apt-get update && \
|
|
||||||
# apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev && \
|
|
||||||
# python -m pip install pipdeptree psutil yappi cffi ipdb pandas matplotlib py3nvml pyarrow graphviz astor boto3 tqdm sentencepiece msgpack requests pandas sphinx sphinx_rtd_theme scipy numpy scikit-learn nvidia-ml-py3 mpi4py
|
|
||||||
|
|
||||||
# install deepspeed step 1
|
|
||||||
RUN source /opt/conda/etc/profile.d/conda.sh && \
|
|
||||||
conda activate ${CONDA_ENV_NAME} && \
|
|
||||||
/opt/conda/envs/${CONDA_ENV_NAME}/bin/python -m pip install setuptools==${SETUPTOOLS_VERSION} && \
|
|
||||||
git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL && \
|
|
||||||
cd ${STAGE_DIR}/oneCCL && \
|
|
||||||
git checkout . && \
|
|
||||||
git checkout master && \
|
|
||||||
mkdir build && \
|
|
||||||
cd build && \
|
|
||||||
cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local && \
|
|
||||||
make -j"$(nproc)" install
|
|
||||||
|
|
||||||
# install deepspeed step 2
|
|
||||||
ARG CUDA_ARCH_LIST="80;86;89;90"
|
|
||||||
ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
|
|
||||||
RUN source /opt/conda/etc/profile.d/conda.sh && \
|
|
||||||
conda activate ${CONDA_ENV_NAME} && \
|
|
||||||
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels && \
|
|
||||||
cd ${STAGE_DIR}/DeepSpeed-Kernels && \
|
|
||||||
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python -m pip install -v .
|
|
||||||
|
|
||||||
ARG DEEPSPEED_VERSION="v0.14.3"
|
|
||||||
ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
|
|
||||||
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
|
|
||||||
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
|
|
||||||
ARG DS_BUILD_SPARSE_ATTN=0
|
|
||||||
ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
|
|
||||||
ARG DS_BUILD_FUSED_ADAM=1
|
|
||||||
ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
|
|
||||||
ARG DS_BUILD_CPU_ADAM=1
|
|
||||||
ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
|
|
||||||
ARG DS_BUILD_OPS=1
|
|
||||||
ENV DS_BUILD_OPS=${DS_BUILD_OPS}
|
|
||||||
ARG HOSTFILE_CONTENT=""
|
|
||||||
ENV HOSTFILE_CONTENT=${HOSTFILE_CONTENT}
|
|
||||||
ENV CUTLASS_PATH='/opt/cutlass'
|
|
||||||
ENV CUDA_HOME='/usr/local/cuda'
|
|
||||||
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
|
|
||||||
ENV PATH=${CUDA_HOME}/bin:${PATH}
|
|
||||||
|
|
||||||
# install deepspeed step 3
|
|
||||||
RUN source /opt/conda/etc/profile.d/conda.sh && \
|
|
||||||
conda activate ${CONDA_ENV_NAME} && \
|
|
||||||
git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed && \
|
|
||||||
cd ${STAGE_DIR}/DeepSpeed && \
|
|
||||||
git checkout ${DEEPSPEED_VERSION} && \
|
|
||||||
sed 's/pip install/python -m pip install/' install.sh > install_modified.sh && \
|
|
||||||
chmod +x ./install_modified.sh && \
|
|
||||||
if [ -n "${HOSTFILE_CONTENT}" ]; then \
|
|
||||||
echo "${HOSTFILE_CONTENT}" > /tmp/hostfile && \
|
|
||||||
INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"; \
|
|
||||||
else \
|
|
||||||
INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"; \
|
|
||||||
fi && \
|
|
||||||
eval $INSTALL_CMD && \
|
|
||||||
cat <<EOF > ~/compile_deepspeed_ops.py
# Helper script: instantiates each DeepSpeed op builder listed below and calls
# load() on it, printing the builder name first.
import deepspeed


def compile_ops():
    # Builders are processed in list order; an exception from any load() stops the run.
    builders = [
        deepspeed.ops.op_builder.AsyncIOBuilder,
        deepspeed.ops.op_builder.FusedAdamBuilder,
        deepspeed.ops.op_builder.CPUAdamBuilder,
        deepspeed.ops.op_builder.CPUAdagradBuilder,
        deepspeed.ops.op_builder.CPULionBuilder,
        deepspeed.ops.op_builder.EvoformerAttnBuilder,
        deepspeed.ops.op_builder.FPQuantizerBuilder,
        deepspeed.ops.op_builder.FusedLambBuilder,
        deepspeed.ops.op_builder.FusedLionBuilder,
        deepspeed.ops.op_builder.QuantizerBuilder,
        deepspeed.ops.op_builder.RaggedOpsBuilder,
        deepspeed.ops.op_builder.RandomLTDBuilder,
        deepspeed.ops.op_builder.SparseAttnBuilder,
        deepspeed.ops.op_builder.SpatialInferenceBuilder,
        deepspeed.ops.op_builder.TransformerBuilder,
        deepspeed.ops.op_builder.StochasticTransformerBuilder,
    ]

    for builder in builders:
        print(f"Compiling {builder.__name__}")
        builder().load()


if __name__ == "__main__":
    compile_ops()
EOF

# A heredoc terminator must stand alone on its line, so the steps that follow the
# script are issued as their own RUN. The script lives in root's home directory,
# hence the explicit ~/ path.
RUN source /opt/conda/etc/profile.d/conda.sh && \
    conda activate ${CONDA_ENV_NAME} && \
    python ~/compile_deepspeed_ops.py && \
    ds_report
|
|
||||||
|
|
||||||
# install transformers and flash-attn
|
|
||||||
RUN source /opt/conda/etc/profile.d/conda.sh && \
|
|
||||||
conda activate ${CONDA_ENV_NAME} && \
|
|
||||||
git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers && \
|
|
||||||
cd ${STAGE_DIR}/transformers && \
|
|
||||||
python3 ./setup.py develop && \
|
|
||||||
python3 -m pip install -U --no-cache-dir "pydantic<2" && \
|
|
||||||
pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
|
|
||||||
|
|
||||||
# other packages
|
|
||||||
ENV TORCH_CUDA_ARCH_LIST="80;86;89;90"
|
|
||||||
RUN source /opt/conda/etc/profile.d/conda.sh && \
|
|
||||||
conda activate ${CONDA_ENV_NAME} && \
|
|
||||||
pip3 install optimum && \
|
|
||||||
pip3 install peft tiktoken tqdm matplotlib seaborn numpy pandas scikit-learn diffusers huggingface_hub spacy blobfile pycocotools open_clip_torch zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
|
|
||||||
|
|
||||||
ARG DEEPSPEED_TRAIN='/data/train_data'
|
|
||||||
# Propagate the build argument's value (not the literal variable name) into the runtime environment.
ENV DEEPSPEED_TRAIN=${DEEPSPEED_TRAIN}
|
|
||||||
ARG DEEPSPEED_VALIDATION='/data/validation_data'
|
|
||||||
# Propagate the build argument's value (not the literal variable name) into the runtime environment.
ENV DEEPSPEED_VALIDATION=${DEEPSPEED_VALIDATION}
|
|
||||||
ARG NCCL_SOCKET_IFNAME='eth0'
|
|
||||||
|
|
||||||
CMD ["/usr/sbin/sshd", "-D"]
|
|
||||||
# CMD ["/bin/bash", "-c", "/usr/sbin/sshd -D & while true; do sleep 1000; done"]
|
|
||||||
@@ -5,10 +5,26 @@ services:
|
|||||||
context: .
|
context: .
|
||||||
dockerfile: Dockerfile.ldh
|
dockerfile: Dockerfile.ldh
|
||||||
args:
|
args:
|
||||||
|
# PYTHON_VERSION: "3.10"
|
||||||
|
# CUDA_VERSION: "12.1.0"
|
||||||
|
# PYTORCH_VERSION: "2.3.0"
|
||||||
|
# TORCHVISION_VERSION: "0.18.0"
|
||||||
|
# TORCHAUDIO_VERSION: "2.3.0"
|
||||||
|
# DS_BUILD_OPS: 1
|
||||||
|
# USE_CUDA: 1
|
||||||
|
# USE_ROCM: 0
|
||||||
|
# USE_XPU: 0
|
||||||
|
# CUDA: cu121
|
||||||
|
# CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
|
||||||
|
# SETUPTOOLS_VERSION: "69.5.1"
|
||||||
|
# DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090
|
||||||
|
# DEEPSPEED_VERSION: "master"
|
||||||
|
# DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
|
||||||
HTTP_PROXY: "http://127.0.0.1:15777"
|
HTTP_PROXY: "http://127.0.0.1:15777"
|
||||||
HTTPS_PROXY: "http://127.0.0.1:15777"
|
HTTPS_PROXY: "http://127.0.0.1:15777"
|
||||||
|
# cache-from: "type=local"
|
||||||
image: ldh/deepspeed:test
|
image: ldh/deepspeed:test
|
||||||
shm_size: '32gb'
|
shm_size: '128gb'
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
reservations:
|
reservations:
|
||||||
@@ -20,22 +36,17 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
- NVIDIA_VISIBLE_DEVICES=all
|
- NVIDIA_VISIBLE_DEVICES=all
|
||||||
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
||||||
#- CUTLASS_PATH="/opt/cutlass"
|
# stdin_open: true
|
||||||
#- CUDA_HOME="/usr/local/cuda"
|
# tty: true
|
||||||
#- PATH="${CUDA_HOME}/bin:${PATH}"
|
|
||||||
#- LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"
|
|
||||||
stdin_open: true
|
|
||||||
tty: true
|
|
||||||
privileged: true
|
privileged: true
|
||||||
cap_add:
|
cap_add:
|
||||||
- IPC_LOCK
|
- IPC_LOCK
|
||||||
volumes:
|
volumes:
|
||||||
# - /mnt/local-nvme2:/root/workspace
|
- /root/workspace:/root/data
|
||||||
# - /dev/infiniband:/dev/infiniband
|
- /dev/infiniband:/dev/infiniband
|
||||||
# - /mnt/local-nvme:/root/
|
# ports:
|
||||||
ports:
|
# - "22242:22242"
|
||||||
- "22242:22242"
|
# - "5000:5000"
|
||||||
- "5000:5000"
|
|
||||||
# networks:
|
# networks:
|
||||||
# - ldh_overlay_network
|
# - ldh_overlay_network
|
||||||
network_mode: host
|
network_mode: host
|
||||||
|
|||||||
Reference in New Issue
Block a user