remove # syntax=docker/dockerfile:1
This commit is contained in:
@@ -1,4 +1,3 @@
|
|||||||
# syntax=docker/dockerfile:1
|
|
||||||
|
|
||||||
# NOTE: Building this image requires docker version >= 23.0.
|
# NOTE: Building this image requires docker version >= 23.0.
|
||||||
#
|
#
|
||||||
@@ -6,10 +5,6 @@
|
|||||||
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
|
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
|
||||||
ARG CUDA_VERSION=12.1.0
|
ARG CUDA_VERSION=12.1.0
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04
|
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04
|
||||||
ARG HTTP_PROXY
|
|
||||||
ARG HTTPS_PROXY
|
|
||||||
ENV http_proxy=${HTTP_PROXY}
|
|
||||||
ENV https_proxy=${HTTPS_PROXY}
|
|
||||||
ARG DEBIAN_FRONTEND="noninteractive"
|
ARG DEBIAN_FRONTEND="noninteractive"
|
||||||
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
|
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
|
||||||
ENV MAMBA_ROOT_PREFIX=~/micromamba
|
ENV MAMBA_ROOT_PREFIX=~/micromamba
|
||||||
@@ -17,16 +12,10 @@ ARG ROOT_PASSWD="root"
|
|||||||
ENV ROOT_PASSWD=${ROOT_PASSWD}
|
ENV ROOT_PASSWD=${ROOT_PASSWD}
|
||||||
WORKDIR /root
|
WORKDIR /root
|
||||||
SHELL ["/bin/bash", "-c"]
|
SHELL ["/bin/bash", "-c"]
|
||||||
COPY id_rsa.pub /root/.ssh/id_rsa.pub
|
|
||||||
# base tools
|
# base tools
|
||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
apt-get update
|
apt-get update
|
||||||
# 更新包管理器,并安装 RDMA 和 InfiniBand 相关的库和工具
|
|
||||||
# rdma-core: 包含 RDMA 驱动和库
|
|
||||||
# ibverbs-utils: 提供用于检查和配置 InfiniBand 设备的工具
|
|
||||||
# perftest: 提供用于测试 InfiniBand 性能的工具
|
|
||||||
apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags
|
|
||||||
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2
|
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2
|
||||||
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
|
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
|
||||||
add-apt-repository ppa:git-core/ppa -y
|
add-apt-repository ppa:git-core/ppa -y
|
||||||
@@ -41,7 +30,7 @@ sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd
|
|||||||
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
|
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
|
||||||
sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
|
sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
|
||||||
sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
|
sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
|
||||||
# sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
|
sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
|
||||||
mkdir /var/run/sshd
|
mkdir /var/run/sshd
|
||||||
echo "root:${ROOT_PASSWD}" | chpasswd
|
echo "root:${ROOT_PASSWD}" | chpasswd
|
||||||
mkdir -p ~/.pip
|
mkdir -p ~/.pip
|
||||||
@@ -63,6 +52,23 @@ channels:
|
|||||||
- defaults
|
- defaults
|
||||||
show_channel_urls: true
|
show_channel_urls: true
|
||||||
EOF
|
EOF
|
||||||
|
# 安装 micromamba
|
||||||
|
# echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh)
|
||||||
|
# micromamba shell init -s bash -p ~/micromamba
|
||||||
|
# cat <<'EOF' >> ~/.bashrc
|
||||||
|
# source ~/micromamba/etc/profile.d/micromamba.sh
|
||||||
|
# EOF
|
||||||
|
# # 配置 .mambarc 文件
|
||||||
|
# cat <<EOF > ~/.mambarc
|
||||||
|
# channels:
|
||||||
|
# - conda-forge
|
||||||
|
# - bioconda
|
||||||
|
# - pytorch
|
||||||
|
# - pytorch-nightly
|
||||||
|
# - nvidia
|
||||||
|
# - defaults
|
||||||
|
# show_channel_urls: true
|
||||||
|
# EOF
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
|
# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
|
||||||
@@ -75,6 +81,8 @@ ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH
|
|||||||
ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3"
|
ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3"
|
||||||
ENV REF='main'
|
ENV REF='main'
|
||||||
ENV STAGE_DIR=/tmp
|
ENV STAGE_DIR=/tmp
|
||||||
|
ENV NV_PEER_MEM_VERSION=1.2
|
||||||
|
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
|
||||||
ENV OPENMPI_BASEVERSION=4.1
|
ENV OPENMPI_BASEVERSION=4.1
|
||||||
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
|
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
|
||||||
ARG CUDA='cu121'
|
ARG CUDA='cu121'
|
||||||
@@ -87,6 +95,7 @@ ARG TORCHAUDIO_VERSION=2.3.1
|
|||||||
ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION}
|
ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION}
|
||||||
ARG PYTORCH_CUDA_VERSION=12.1
|
ARG PYTORCH_CUDA_VERSION=12.1
|
||||||
ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION}
|
ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION}
|
||||||
|
ENV MLNX_OFED_VERSION=4.9-7.1.0.0
|
||||||
ARG SETUPTOOLS_VERSION=69.5.1
|
ARG SETUPTOOLS_VERSION=69.5.1
|
||||||
ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
|
ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
|
||||||
ARG USE_CUDA=1
|
ARG USE_CUDA=1
|
||||||
@@ -147,7 +156,6 @@ cd ..
|
|||||||
rm -rf ${STAGE_DIR}/apex
|
rm -rf ${STAGE_DIR}/apex
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
ENV MLNX_OFED_VERSION=5.8-4.1.5.0
|
|
||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
source /opt/conda/etc/profile.d/conda.sh
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
@@ -163,19 +171,12 @@ python3 -m pip uninstall -y deepspeed
|
|||||||
# install deepspeed prepare
|
# install deepspeed prepare
|
||||||
# install Mellanox OFED
|
# install Mellanox OFED
|
||||||
mkdir -p ${STAGE_DIR}
|
mkdir -p ${STAGE_DIR}
|
||||||
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
|
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf -
|
||||||
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
|
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64
|
||||||
./mlnxofedinstall --user-space-only --without-fw-update --all -q
|
./mlnxofedinstall --user-space-only --without-fw-update --all -q
|
||||||
cd ${STAGE_DIR}
|
cd ${STAGE_DIR}
|
||||||
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
|
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
|
||||||
EOT
|
cd ..
|
||||||
|
|
||||||
ENV NV_PEER_MEM_VERSION=1.2
|
|
||||||
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
|
|
||||||
RUN <<EOT
|
|
||||||
#!/bin/bash
|
|
||||||
source /opt/conda/etc/profile.d/conda.sh
|
|
||||||
conda activate ${CONDA_ENV_NAME}
|
|
||||||
# install nv_peer_mem
|
# install nv_peer_mem
|
||||||
rm -rf ${STAGE_DIR}
|
rm -rf ${STAGE_DIR}
|
||||||
mkdir -p ${STAGE_DIR}
|
mkdir -p ${STAGE_DIR}
|
||||||
@@ -186,7 +187,6 @@ cd ${STAGE_DIR}
|
|||||||
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
|
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
|
||||||
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
|
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
|
||||||
apt-get update
|
apt-get update
|
||||||
apt --fix-broken install -y
|
|
||||||
apt-get install -y dkms
|
apt-get install -y dkms
|
||||||
dpkg-buildpackage -us -uc
|
dpkg-buildpackage -us -uc
|
||||||
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
|
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
|
||||||
@@ -220,14 +220,14 @@ chmod a+x /usr/local/mpi/bin/mpirun
|
|||||||
EOT
|
EOT
|
||||||
|
|
||||||
# SSH daemon port inside container cannot conflict with host OS port
|
# SSH daemon port inside container cannot conflict with host OS port
|
||||||
# ENV SSH_PORT=2222
|
ENV SSH_PORT=2222
|
||||||
# RUN <<EOT
|
RUN <<EOT
|
||||||
# #!/bin/bash
|
#!/bin/bash
|
||||||
# source /opt/conda/etc/profile.d/conda.sh
|
source /opt/conda/etc/profile.d/conda.sh
|
||||||
# conda activate ${CONDA_ENV_NAME}
|
conda activate ${CONDA_ENV_NAME}
|
||||||
# cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
|
cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
|
||||||
# sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
|
sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
|
||||||
# EOT
|
EOT
|
||||||
|
|
||||||
# 29.78 Usage: install.sh [options...]
|
# 29.78 Usage: install.sh [options...]
|
||||||
# 29.78
|
# 29.78
|
||||||
@@ -384,6 +384,12 @@ else
|
|||||||
fi
|
fi
|
||||||
eval $INSTALL_CMD
|
eval $INSTALL_CMD
|
||||||
# compile deepspeed ops
|
# compile deepspeed ops
|
||||||
|
cat <<'EOF' >> ~/.bashrc
|
||||||
|
source ~/micromamba/etc/profile.d/micromamba.sh
|
||||||
|
echo "alias mamba=micromamba" >> ~/.bashrc
|
||||||
|
echo "alias mba=mamba" >> ~/.bashrc
|
||||||
|
EOF
|
||||||
|
# 配置 .mambarc 文件
|
||||||
cat <<EOF > ~/compile_deepspeed_ops.py
|
cat <<EOF > ~/compile_deepspeed_ops.py
|
||||||
import deepspeed
|
import deepspeed
|
||||||
|
|
||||||
@@ -454,17 +460,4 @@ pip3 install peft tiktoken \
|
|||||||
zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
|
zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
ARG DEEPSPEED_TRAIN='/data/train_data'
|
|
||||||
ENV DEEPSPEED_TRAIN=DEEPSPEED_TRAIN
|
|
||||||
ARG DEEPSPEED_VALIDATION='/data/validation_data'
|
|
||||||
ENV DEEPSPEED_VALIDATION=DEEPSPEED_VALIDATION
|
|
||||||
ARG NCCL_SOCKET_IFNAME='eth0'
|
|
||||||
|
|
||||||
# RUN echo 'export CUDA_HOME=/usr/local/cuda' >> ~/.bashrc && \
|
|
||||||
# echo 'export PATH=${CUDA_HOME}/bin:${PATH}' >> ~/.bashrc && \
|
|
||||||
# echo 'export CUTLASS_PATH=/opt/cutlass' >> ~/.bashrc && \
|
|
||||||
# echo 'export PATH=/opt/conda/bin:$PATH' >> ~/.bashrc && \
|
|
||||||
# echo "source activate ${CONDA_ENV_NAME}" > ~/.bashrc
|
|
||||||
|
|
||||||
CMD ["/usr/sbin/sshd", "-D"]
|
CMD ["/usr/sbin/sshd", "-D"]
|
||||||
# CMD ["/bin/bash", "-c", "/usr/sbin/sshd -D & while true; do sleep 1000; done"]
|
|
||||||
|
|||||||
Reference in New Issue
Block a user