remove # syntax=docker/dockerfile:1

This commit is contained in:
2024-07-13 16:08:30 +08:00
parent ddd89a900b
commit 65c0338c74

View File

@@ -1,4 +1,3 @@
# syntax=docker/dockerfile:1
# NOTE: Building this image requires Docker version >= 23.0.
#
@@ -6,10 +5,6 @@
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
# CUDA version used to pick the base image tag; declared before FROM so the
# FROM line can reference it.
ARG CUDA_VERSION=12.1.0
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04
# Optional proxy settings passed with --build-arg (no default value is set).
ARG HTTP_PROXY
ARG HTTPS_PROXY
# Mirror the proxy build arguments into lowercase environment variables.
# ENV values persist into containers started from this image.
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}
# Default apt/debconf frontend to non-interactive; the ENV line below also
# keeps this setting in the runtime environment.
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
# NOTE(review): ENV does not perform shell tilde expansion, so the stored value
# is the literal string "~/micromamba" - confirm that consumers expand it.
ENV MAMBA_ROOT_PREFIX=~/micromamba
@@ -17,16 +12,10 @@ ARG ROOT_PASSWD="root"
# Expose the ROOT_PASSWD build argument as an environment variable; the
# base-tools script feeds it to chpasswd for the root account.
# NOTE(review): ENV values stay readable in the image configuration - confirm
# this is acceptable for a password.
ENV ROOT_PASSWD=${ROOT_PASSWD}
WORKDIR /root
# Use bash for shell-form instructions that follow.
SHELL ["/bin/bash", "-c"]
# Copy the public key from the build context into root's .ssh directory.
# NOTE(review): the key is stored as id_rsa.pub, not authorized_keys - confirm
# a later step installs it for sshd logins.
COPY id_rsa.pub /root/.ssh/id_rsa.pub
# base tools
RUN <<EOT
#!/bin/bash
apt-get update
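# Update the package index, then install the RDMA and InfiniBand libraries and tools
# rdma-core: RDMA drivers and libraries
# ibverbs-utils: tools for inspecting and configuring InfiniBand devices
# perftest: tools for benchmarking InfiniBand performance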
apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
add-apt-repository ppa:git-core/ppa -y
@@ -41,7 +30,7 @@ sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
# sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
mkdir /var/run/sshd
echo "root:${ROOT_PASSWD}" | chpasswd
mkdir -p ~/.pip
@@ -63,6 +52,23 @@ channels:
- defaults
show_channel_urls: true
EOF
# Install micromamba
# echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh)
# micromamba shell init -s bash -p ~/micromamba
# cat <<'EOF' >> ~/.bashrc
# source ~/micromamba/etc/profile.d/micromamba.sh
# EOF
# # Configure the .mambarc file
# cat <<EOF > ~/.mambarc
# channels:
# - conda-forge
# - bioconda
# - pytorch
# - pytorch-nightly
# - nvidia
# - defaults
# show_channel_urls: true
# EOF
EOT
# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@@ -75,6 +81,8 @@ ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH
# Python interpreter inside the conda environment named by CONDA_ENV_NAME.
ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3"
# NOTE(review): presumably the git ref checked out by a later build step - confirm.
ENV REF='main'
# Scratch directory that the build scripts use for downloads and source builds.
ENV STAGE_DIR=/tmp
# nvidia-peer-memory source version; the tag appends the "-0" package revision
# and appears in the .deb file name installed with dpkg.
ENV NV_PEER_MEM_VERSION=1.2
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
# OpenMPI release: base series 4.1, full version 4.1.6.
ENV OPENMPI_BASEVERSION=4.1
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
# NOTE(review): presumably the CUDA tag used to select PyTorch wheels - confirm.
ARG CUDA='cu121'
@@ -87,6 +95,7 @@ ARG TORCHAUDIO_VERSION=2.3.1
# Each ARG/ENV pair below makes a build argument available both to later build
# steps and to containers started from the image.
ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION}
ARG PYTORCH_CUDA_VERSION=12.1
ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION}
# Mellanox OFED release; the installer script uses it to build the download URL.
# NOTE(review): a second "ENV MLNX_OFED_VERSION=5.8-4.1.5.0" appears further
# down; the later assignment wins for instructions after it - confirm which
# version is intended.
ENV MLNX_OFED_VERSION=4.9-7.1.0.0
ARG SETUPTOOLS_VERSION=69.5.1
ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
ARG USE_CUDA=1
@@ -147,7 +156,6 @@ cd ..
rm -rf ${STAGE_DIR}/apex
EOT
ENV MLNX_OFED_VERSION=5.8-4.1.5.0
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
@@ -163,19 +171,12 @@ python3 -m pip uninstall -y deepspeed
# install deepspeed prepare
# install Mellanox OFED
mkdir -p ${STAGE_DIR}
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf -
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64
./mlnxofedinstall --user-space-only --without-fw-update --all -q
cd ${STAGE_DIR}
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
EOT
ENV NV_PEER_MEM_VERSION=1.2
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
cd ..
# install nv_peer_mem
rm -rf ${STAGE_DIR}
mkdir -p ${STAGE_DIR}
@@ -186,7 +187,6 @@ cd ${STAGE_DIR}
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
apt-get update
apt --fix-broken install -y
apt-get install -y dkms
dpkg-buildpackage -us -uc
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
@@ -220,14 +220,14 @@ chmod a+x /usr/local/mpi/bin/mpirun
EOT
# SSH daemon port inside container cannot conflict with host OS port
# ENV SSH_PORT=2222
# RUN <<EOT
# #!/bin/bash
# source /opt/conda/etc/profile.d/conda.sh
# conda activate ${CONDA_ENV_NAME}
# cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
# sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
# EOT
# Port the SSH daemon listens on inside the container; chosen so it does not
# conflict with the host OS SSH port.
ENV SSH_PORT=2222
RUN <<EOT
#!/bin/bash
# Fail the build step if the edit cannot be applied.
set -e
# Rewrite only the first "Port 22" line, in place, so no temporary copy of
# sshd_config is left behind in the image layer. "0,/re/" is a GNU sed address
# range; the empty pattern in "s//.../" reuses the regex that was just matched.
sed -i "0,/^Port 22/s//Port ${SSH_PORT}/" /etc/ssh/sshd_config
EOT
# 29.78 Usage: install.sh [options...]
# 29.78
@@ -384,6 +384,12 @@ else
fi
eval $INSTALL_CMD
# compile deepspeed ops
# Append micromamba shell integration and convenience aliases to root's
# .bashrc. The heredoc delimiter is quoted, so the lines are written verbatim.
# The aliases are defined directly: writing `echo "alias ..." >> ~/.bashrc`
# into .bashrc would append new lines to the file on every shell start and
# never define the aliases in the shell that is starting.
cat <<'EOF' >> ~/.bashrc
source ~/micromamba/etc/profile.d/micromamba.sh
alias mamba=micromamba
alias mba=mamba
EOF
# Write the compile_deepspeed_ops.py helper script
cat <<EOF > ~/compile_deepspeed_ops.py
import deepspeed
@@ -454,17 +460,4 @@ pip3 install peft tiktoken \
zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
EOT
# Default dataset locations; override at build time with --build-arg.
ARG DEEPSPEED_TRAIN='/data/train_data'
# Expand the build argument: a bare name on the right-hand side would store the
# literal text "DEEPSPEED_TRAIN" instead of the path.
ENV DEEPSPEED_TRAIN=${DEEPSPEED_TRAIN}
ARG DEEPSPEED_VALIDATION='/data/validation_data'
ENV DEEPSPEED_VALIDATION=${DEEPSPEED_VALIDATION}
# Build-time only: no ENV instruction mirrors this value in the visible lines.
ARG NCCL_SOCKET_IFNAME='eth0'
# RUN echo 'export CUDA_HOME=/usr/local/cuda' >> ~/.bashrc && \
# echo 'export PATH=${CUDA_HOME}/bin:${PATH}' >> ~/.bashrc && \
# echo 'export CUTLASS_PATH=/opt/cutlass' >> ~/.bashrc && \
# echo 'export PATH=/opt/conda/bin:$PATH' >> ~/.bashrc && \
# echo "source activate ${CONDA_ENV_NAME}" > ~/.bashrc
# Default command: run sshd with -D so it stays in the foreground as the
# container's main process.
CMD ["/usr/sbin/sshd", "-D"]
# CMD ["/bin/bash", "-c", "/usr/sbin/sshd -D & while true; do sleep 1000; done"]