remove # syntax=docker/dockerfile:1
This commit is contained in:
@@ -1,4 +1,3 @@
|
||||
# syntax=docker/dockerfile:1
|
||||
|
||||
# NOTE: Building this image requires Docker version >= 23.0.
|
||||
#
|
||||
@@ -6,10 +5,6 @@
|
||||
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
|
||||
ARG CUDA_VERSION=12.1.0
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu22.04
|
||||
ARG HTTP_PROXY
|
||||
ARG HTTPS_PROXY
|
||||
ENV http_proxy=${HTTP_PROXY}
|
||||
ENV https_proxy=${HTTPS_PROXY}
|
||||
ARG DEBIAN_FRONTEND="noninteractive"
|
||||
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
|
||||
ENV MAMBA_ROOT_PREFIX=~/micromamba
|
||||
@@ -17,16 +12,10 @@ ARG ROOT_PASSWD="root"
|
||||
ENV ROOT_PASSWD=${ROOT_PASSWD}
|
||||
WORKDIR /root
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
COPY id_rsa.pub /root/.ssh/id_rsa.pub
|
||||
# base tools
|
||||
RUN <<EOT
|
||||
#!/bin/bash
|
||||
apt-get update
|
||||
# 更新包管理器,并安装 RDMA 和 InfiniBand 相关的库和工具
|
||||
# rdma-core: 包含 RDMA 驱动和库
|
||||
# ibverbs-utils: 提供用于检查和配置 InfiniBand 设备的工具
|
||||
# perftest: 提供用于测试 InfiniBand 性能的工具
|
||||
apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags
|
||||
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2
|
||||
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
|
||||
add-apt-repository ppa:git-core/ppa -y
|
||||
@@ -41,7 +30,7 @@ sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd
|
||||
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
|
||||
sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
|
||||
sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
|
||||
# sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
|
||||
sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
|
||||
mkdir /var/run/sshd
|
||||
echo "root:${ROOT_PASSWD}" | chpasswd
|
||||
mkdir -p ~/.pip
|
||||
@@ -63,6 +52,23 @@ channels:
|
||||
- defaults
|
||||
show_channel_urls: true
|
||||
EOF
|
||||
# 安装 micromamba
|
||||
# echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh)
|
||||
# micromamba shell init -s bash -p ~/micromamba
|
||||
# cat <<'EOF' >> ~/.bashrc
|
||||
# source ~/micromamba/etc/profile.d/micromamba.sh
|
||||
# EOF
|
||||
# # 配置 .mambarc 文件
|
||||
# cat <<EOF > ~/.mambarc
|
||||
# channels:
|
||||
# - conda-forge
|
||||
# - bioconda
|
||||
# - pytorch
|
||||
# - pytorch-nightly
|
||||
# - nvidia
|
||||
# - defaults
|
||||
# show_channel_urls: true
|
||||
# EOF
|
||||
EOT
|
||||
|
||||
# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
|
||||
@@ -75,6 +81,8 @@ ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH
|
||||
ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3"
|
||||
ENV REF='main'
|
||||
ENV STAGE_DIR=/tmp
|
||||
ENV NV_PEER_MEM_VERSION=1.2
|
||||
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
|
||||
ENV OPENMPI_BASEVERSION=4.1
|
||||
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
|
||||
ARG CUDA='cu121'
|
||||
@@ -87,6 +95,7 @@ ARG TORCHAUDIO_VERSION=2.3.1
|
||||
ENV TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION}
|
||||
ARG PYTORCH_CUDA_VERSION=12.1
|
||||
ENV PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION}
|
||||
ENV MLNX_OFED_VERSION=4.9-7.1.0.0
|
||||
ARG SETUPTOOLS_VERSION=69.5.1
|
||||
ENV SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}
|
||||
ARG USE_CUDA=1
|
||||
@@ -147,7 +156,6 @@ cd ..
|
||||
rm -rf ${STAGE_DIR}/apex
|
||||
EOT
|
||||
|
||||
ENV MLNX_OFED_VERSION=5.8-4.1.5.0
|
||||
RUN <<EOT
|
||||
#!/bin/bash
|
||||
source /opt/conda/etc/profile.d/conda.sh
|
||||
@@ -163,19 +171,12 @@ python3 -m pip uninstall -y deepspeed
|
||||
# install deepspeed prepare
|
||||
# install Mellanox OFED
|
||||
mkdir -p ${STAGE_DIR}
|
||||
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
|
||||
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
|
||||
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf -
|
||||
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64
|
||||
./mlnxofedinstall --user-space-only --without-fw-update --all -q
|
||||
cd ${STAGE_DIR}
|
||||
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
|
||||
EOT
|
||||
|
||||
ENV NV_PEER_MEM_VERSION=1.2
|
||||
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
|
||||
RUN <<EOT
|
||||
#!/bin/bash
|
||||
source /opt/conda/etc/profile.d/conda.sh
|
||||
conda activate ${CONDA_ENV_NAME}
|
||||
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
|
||||
cd ..
|
||||
# install nv_peer_mem
|
||||
rm -rf ${STAGE_DIR}
|
||||
mkdir -p ${STAGE_DIR}
|
||||
@@ -186,7 +187,6 @@ cd ${STAGE_DIR}
|
||||
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
|
||||
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
|
||||
apt-get update
|
||||
apt --fix-broken install -y
|
||||
apt-get install -y dkms
|
||||
dpkg-buildpackage -us -uc
|
||||
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
|
||||
@@ -220,14 +220,14 @@ chmod a+x /usr/local/mpi/bin/mpirun
|
||||
EOT
|
||||
|
||||
# SSH daemon port inside container cannot conflict with host OS port
|
||||
# ENV SSH_PORT=2222
|
||||
# RUN <<EOT
|
||||
# #!/bin/bash
|
||||
# source /opt/conda/etc/profile.d/conda.sh
|
||||
# conda activate ${CONDA_ENV_NAME}
|
||||
# cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
|
||||
# sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
|
||||
# EOT
|
||||
ENV SSH_PORT=2222
|
||||
RUN <<EOT
|
||||
#!/bin/bash
|
||||
source /opt/conda/etc/profile.d/conda.sh
|
||||
conda activate ${CONDA_ENV_NAME}
|
||||
cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
|
||||
sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
|
||||
EOT
|
||||
|
||||
# 29.78 Usage: install.sh [options...]
|
||||
# 29.78
|
||||
@@ -384,6 +384,12 @@ else
|
||||
fi
|
||||
eval $INSTALL_CMD
|
||||
# compile deepspeed ops
|
||||
cat <<'EOF' >> ~/.bashrc
|
||||
source ~/micromamba/etc/profile.d/micromamba.sh
|
||||
echo "alias mamba=micromamba" >> ~/.bashrc
|
||||
echo "alias mba=mamba" >> ~/.bashrc
|
||||
EOF
|
||||
# 配置 .mambarc 文件
|
||||
cat <<EOF > ~/compile_deepspeed_ops.py
|
||||
import deepspeed
|
||||
|
||||
@@ -454,17 +460,4 @@ pip3 install peft tiktoken \
|
||||
zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
|
||||
EOT
|
||||
|
||||
ARG DEEPSPEED_TRAIN='/data/train_data'
|
||||
ENV DEEPSPEED_TRAIN=DEEPSPEED_TRAIN
|
||||
ARG DEEPSPEED_VALIDATION='/data/validation_data'
|
||||
ENV DEEPSPEED_VALIDATION=DEEPSPEED_VALIDATION
|
||||
ARG NCCL_SOCKET_IFNAME='eth0'
|
||||
|
||||
# RUN echo 'export CUDA_HOME=/usr/local/cuda' >> ~/.bashrc && \
|
||||
# echo 'export PATH=${CUDA_HOME}/bin:${PATH}' >> ~/.bashrc && \
|
||||
# echo 'export CUTLASS_PATH=/opt/cutlass' >> ~/.bashrc && \
|
||||
# echo 'export PATH=/opt/conda/bin:$PATH' >> ~/.bashrc && \
|
||||
# echo "source activate ${CONDA_ENV_NAME}" > ~/.bashrc
|
||||
|
||||
CMD ["/usr/sbin/sshd", "-D"]
|
||||
# CMD ["/bin/bash", "-c", "/usr/sbin/sshd -D & while true; do sleep 1000; done"]
|
||||
|
||||
Reference in New Issue
Block a user