This commit is contained in:
2024-06-13 15:37:35 +08:00
parent 254b21052d
commit ee51a8e37f
4 changed files with 278 additions and 143 deletions

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
*.tar
build_d/

78
Dockerfile.base Normal file
View File

@@ -0,0 +1,78 @@
# syntax=docker/dockerfile:1
# Base image: CUDA 12.1 + cuDNN 8 development toolchain on Ubuntu 20.04.
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
# Keep apt/dpkg from prompting during the build. The value is also exported
# as ENV, so it stays set in containers and in images built on top of this one.
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
# Docker performs no tilde expansion in ENV values, so "~/micromamba" would be
# stored literally. Use the absolute path of root's home (see WORKDIR below),
# which is where `micromamba shell init -p ~/micromamba` resolves at build time.
ENV MAMBA_ROOT_PREFIX=/root/micromamba
# Name of the conda environment created in the base-tools layer.
ARG CONDA_ENV_NAME="ldh"
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
# Python version installed into that environment.
ARG PYTHON_VERSION=3.10
ENV PYTHON_VERSION=${PYTHON_VERSION}
# Password assigned to the root account for SSH logins.
# NOTE(review): passing a password through ARG/ENV stores it in the image
# metadata and build history; prefer a BuildKit secret mount if the image is
# ever published. Kept as-is to preserve the existing build-arg interface.
ARG ROOT_PASSWD="root"
ENV ROOT_PASSWD=${ROOT_PASSWD}
# Put the Miniconda base and the default environment ahead of the system PATH.
# Written in key=value form; the space-separated ENV syntax is deprecated.
ENV PATH=/opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH
WORKDIR /root
# Run subsequent shell-form RUN instructions with bash instead of /bin/sh.
SHELL ["/bin/bash", "-c"]
# base tools
# Single layer that installs OS packages, a newer git and CMake, configures
# sshd, installs Miniconda with the default environment, and installs
# micromamba together with its channel configuration.
RUN <<EOT
#!/bin/bash
# NOTE(review): no `set -e` is in effect, so a failing command does not abort
# the build. Consider enabling it once every step (in particular the
# third-party micromamba installer) is confirmed to exit with status 0.
apt-get update
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
# Newer git from the git-core PPA.
add-apt-repository ppa:git-core/ppa -y
apt-get install -y git libnuma-dev wget
# install latest cmake from the Kitware apt repository
# (the build already runs as root, so no sudo is needed)
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc | apt-key add -
apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
apt-get update
apt-get install -y cmake
# Configure SSH for password and public key authentication
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
# Force port 22 whether the Port line is commented out or set to another value.
sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
# -p: do not fail if the directory already exists.
mkdir -p /var/run/sshd
# Double quotes are required here: inside single quotes the shell does not
# expand ROOT_PASSWD and the password would be the literal placeholder text.
echo "root:${ROOT_PASSWD}" | chpasswd
mkdir -p ~/.pip
# install miniconda
wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
bash /tmp/miniconda.sh -b -p /opt/conda
rm /tmp/miniconda.sh
conda init bash
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} pyyaml ipython -y
# The package is named nvidia-ml-py3; a misspelled name makes the whole
# pip invocation fail, so none of the listed packages would be installed.
conda run -n ${CONDA_ENV_NAME} python -m pip install --no-cache-dir open_clip_torch nvidia-ml-py3 opencv-contrib-python
conda clean -afy
# Make conda available in login shells and activate the environment by default.
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
# Configure the .condarc file
cat <<EOF > ~/.condarc
channels:
- conda-forge
- bioconda
- pytorch
- pytorch-nightly
- nvidia
- defaults
show_channel_urls: true
EOF
# Install micromamba
# NOTE(review): this runs an unpinned script from a CDN (@main) without a
# checksum. The "1" piped to stdin presumably answers an installer prompt;
# verify against the installer script.
echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh)
micromamba shell init -s bash -p ~/micromamba
# Quoted delimiter: the lines are appended to .bashrc verbatim.
cat <<'EOF' >> ~/.bashrc
source ~/micromamba/etc/profile.d/micromamba.sh
alias mamba=micromamba
alias mba=mamba
EOF
# Configure the .mambarc file
cat <<EOF > ~/.mambarc
channels:
- conda-forge
- bioconda
- pytorch
- pytorch-nightly
- nvidia
- defaults
show_channel_urls: true
EOF
EOT

View File

@@ -21,9 +21,23 @@ echo "PubkeyAuthentication yes" >> /etc/ssh/sshd_config
echo "Port 22" >> /etc/ssh/sshd_config echo "Port 22" >> /etc/ssh/sshd_config
mkdir /var/run/sshd mkdir /var/run/sshd
echo 'root:cdcdocker' | chpasswd echo 'root:cdcdocker' | chpasswd
# Install Micromamba # 安装 micromamba 并配置 mambarc
echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh) echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh)
micromamba shell init -s bash -p ~/micromamba micromamba shell init -s bash -p ~/micromamba
cat <<'EOF' >> ~/.bashrc
source ~/micromamba/etc/profile.d/micromamba.sh
alias mamba=micromamba
alias mba=mamba
EOF
# 配置 .mambarc 文件
cat <<EOF > ~/.mambarc
channels:
- conda-forge
- bioconda
- pytorch
- pytorch-nightly
- nvidia
EOF
mkdir -p ~/.pip mkdir -p ~/.pip
echo " echo "
[global] [global]

View File

@@ -1,163 +1,204 @@
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04 FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
ARG DEBIAN_FRONTEND="noninteractive" ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
ENV MAMBA_ROOT_PREFIX=~/micromamba ENV MAMBA_ROOT_PREFIX=~/micromamba
ARG CONDA_ENV_NAME="ldh"
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
ARG PYTHON_VERSION=3.10
ENV PYTHON_VERSION=${PYTHON_VERSION}
ARG ROOT_PASSWD="root"
ENV ROOT_PASSWD=${ROOT_PASSWD}
ENV PATH /opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH
WORKDIR /root WORKDIR /root
SHELL ["/bin/bash", "-c"] SHELL ["/bin/bash", "-c"]
# base tools # base tools
RUN <<EOT RUN <<EOT
#!/bin/bash #!/bin/bash
apt-get update apt-get update
apt-get install -y wget curl git jq vim bash libaio-dev build-essential openssh-server apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
add-apt-repository ppa:git-core/ppa -y
apt-get install -y git libnuma-dev wget
# install latest cmake
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc | sudo apt-key add -
sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
sudo apt-get update
sudo apt-get install -y cmake
# Configure SSH for password and public key authentication # Configure SSH for password and public key authentication
echo "PermitRootLogin yes" >> /etc/ssh/sshd_config sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
echo "PubkeyAuthentication yes" >> /etc/ssh/sshd_config sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
echo "Port 22" >> /etc/ssh/sshd_config sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
mkdir /var/run/sshd mkdir /var/run/sshd
echo 'root:root' | chpasswd echo 'root:${ROOT_PASSWD}' | chpasswd
mkdir -p ~/.pip mkdir -p ~/.pip
echo " # install miniconda
[global] wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
index-url = https://mirrors.aliyun.com/pypi/simple/ bash /tmp/miniconda.sh -b -p /opt/conda
rm /tmp/miniconda.sh
[install] conda init bash
trusted-host=mirrors.aliyun.com conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} pyyaml ipython -y
" >> ~/.pip/pip.conf conda run -n ${CONDA_ENV_NAME} python -m pip install open_clip_torch vidia-ml-py3 opencv-contrib-python
conda clean -afy
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc
# 配置 .condarc 文件
cat <<EOF > ~/.condarc
channels:
- conda-forge
- bioconda
- pytorch
- pytorch-nightly
- nvidia
- defaults
show_channel_urls: true
EOF
# 安装 micromamba
echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh)
micromamba shell init -s bash -p ~/micromamba
cat <<'EOF' >> ~/.bashrc
source ~/micromamba/etc/profile.d/micromamba.sh
alias mamba=micromamba
alias mba=mamba
EOF
# 配置 .mambarc 文件
cat <<EOF > ~/.mambarc
channels:
- conda-forge
- bioconda
- pytorch
- pytorch-nightly
- nvidia
- defaults
show_channel_urls: true
EOF
EOT EOT
# deepspeed # 安装 ninja 并测试
ENV STAGE_DIR=/tmp
RUN <<EOT RUN <<EOT
#!/bin/bash #!/bin/bash
mkdir -p ${STAGE_DIR} # 安装 ninja
apt-get update source /opt/conda/etc/profile.d/conda.sh
apt-get install -y --no-install-recommends conda activate ${CONDA_ENV_NAME}
software-properties-common build-essential autotools-dev \ # 克隆 ninja 源码并编译
nfs-common pdsh \ git clone https://github.com/ninja-build/ninja.git
cmake g++ gcc \ cd ninja
curl wget vim tmux emacs less unzip \ # 克隆 GoogleTest 源码
htop iftop iotop ca-certificates openssh-client openssh-server \ git clone https://github.com/google/googletest.git
rsync iputils-ping net-tools sudo \ conda run -n ${CONDA_ENV_NAME} python ./configure.py --bootstrap
llvm-dev # 配置并构建 Ninja 测试,添加 pthread 链接选项
add-apt-repository ppa:git-core/ppa -y CXXFLAGS="-pthread" LDFLAGS="-pthread" ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest
apt-get install -y git ./ninja all
echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config # 运行 Ninja 单元测试
cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config ./ninja_test
sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
EOT EOT
# Mellanox OFED # # deepspeed
ENV MLNX_OFED_VERSION=4.9-7.1.0.0 # ENV STAGE_DIR=/tmp
RUN apt-get install -y libnuma-dev # RUN <<EOT
RUN cd ${STAGE_DIR} && \ # #!/bin/bash
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - && \ # mkdir -p ${STAGE_DIR}
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \ # echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config
./mlnxofedinstall --user-space-only --without-fw-update --all -q && \ # cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config
cd ${STAGE_DIR} && \ # sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* # EOT
# nv_peer_mem
ENV NV_PEER_MEM_VERSION=1.2 # # Mellanox OFED
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 # WORKDIR ${STAGE_DIR}
RUN mkdir -p ${STAGE_DIR} && \ # ENV MLNX_OFED_VERSION=4.9-7.1.0.0
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \ # RUN wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - && \
cd ${STAGE_DIR}/nv_peer_memory && \ # cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \
./build_module.sh && \ # ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
cd ${STAGE_DIR} && \ # cd ${STAGE_DIR} && \
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \ # rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \ # # nv_peer_mem
apt-get update && \ # ENV NV_PEER_MEM_VERSION=1.2
apt-get install -y dkms && \ # ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
dpkg-buildpackage -us -uc && \ # RUN mkdir -p ${STAGE_DIR} && \
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb # git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
# OPENMPI # cd ${STAGE_DIR}/nv_peer_memory && \
ENV OPENMPI_BASEVERSION=4.1 # ./build_module.sh && \
ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 # cd ${STAGE_DIR} && \
RUN cd ${STAGE_DIR} && \ # tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \ # cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
cd openmpi-${OPENMPI_VERSION} && \ # apt-get update && \
./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \ # apt-get install -y dkms && \
make -j"$(nproc)" install && \ # dpkg-buildpackage -us -uc && \
ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \ # dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
# Sanity check: # # OPENMPI
test -f /usr/local/mpi/bin/mpic++ && \ # ENV OPENMPI_BASEVERSION=4.1
cd ${STAGE_DIR} && \ # ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION} # RUN <<EOT
ENV PATH=/usr/local/mpi/bin:${PATH} \ # #!/bin/bash
LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} # cd ${STAGE_DIR}
# Create a wrapper for OpenMPI to allow running as root by default # wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf -
RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \ # cd openmpi-${OPENMPI_VERSION}
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \ # ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION}
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \ # make -j"$(nproc)" install
chmod a+x /usr/local/mpi/bin/mpirun # ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi
# Python # # Sanity check:
ENV DEBIAN_FRONTEND=noninteractive # test -f /usr/local/mpi/bin/mpic++
ENV PYTHON_VERSION=3 # cd ${STAGE_DIR}
RUN apt-get install -y python3 python3-dev && \ # rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
rm -f /usr/bin/python && \ # EOT
ln -s /usr/bin/python3 /usr/bin/python && \
curl -O https://bootstrap.pypa.io/pip/3.6/get-pip.py && \ # ENV PATH=/usr/local/mpi/bin:${PATH}
python get-pip.py && \ # ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
rm get-pip.py && \ # # Create a wrapper for OpenMPI to allow running as root by default
pip install --upgrade pip && \ # RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
# Print python an pip version # echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
python -V && pip -V # echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
RUN pip install pyyaml # chmod a+x /usr/local/mpi/bin/mpirun
RUN pip install ipython # # Some Packages
# Some Packages # RUN <<EOT
RUN apt-get update && \ # apt-get update
apt-get install -y --no-install-recommends \ # apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev
libsndfile-dev \ # source /opt/conda/etc/profile.d/conda.sh
libcupti-dev \ # conda activate ${CONDA_ENV_NAME}
libjpeg-dev \ # conda install -y mpi4py
libpng-dev \ # python -m pip install psutil \
screen \ # yappi \
libaio-dev # cffi \
RUN pip install psutil \ # ipdb \
yappi \ # pandas \
cffi \ # matplotlib \
ipdb \ # py3nvml \
pandas \ # pyarrow \
matplotlib \ # graphviz \
py3nvml \ # astor \
pyarrow \ # boto3 \
graphviz \ # tqdm \
astor \ # sentencepiece \
boto3 \ # msgpack \
tqdm \ # requests \
sentencepiece \ # pandas \
msgpack \ # sphinx \
requests \ # sphinx_rtd_theme \
pandas \ # scipy \
sphinx \ # numpy \
sphinx_rtd_theme \ # scikit-learn \
scipy \ # nvidia-ml-py3
numpy \ # EOT
scikit-learn \
nvidia-ml-py3 \
mpi4py
# PyTorch # PyTorch
ARG PYTORCH_VERSION=1.13.0 ARG PYTORCH_VERSION=1.13.0
ENV PYTORCH_VERSION=${PYTORCH_VERSION} ENV PYTORCH_VERSION=${PYTORCH_VERSION}
RUN pip install torch==${PYTORCH_VERSION} RUN <<EOT
RUN rm -rf /usr/lib/python3/dist-packages/yaml && \ #!/bin/bash
rm -rf /usr/lib/python3/dist-packages/PyYAML-* source /opt/conda/etc/profile.d/conda.sh
## Add deepspeed user conda activate ${CONDA_ENV_NAME}
# Add a deepspeed user with user id 8877 pip install deepspeed torch==${PYTORCH_VERSION} torchvision torchaudio bitsandbytes accelerate transformers optimum
#RUN useradd --create-home --uid 8877 deepspeed pip install \
RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed torch torchvision torchaudio \
RUN usermod -aG sudo deepspeed pydantic transformers datasets accelerate evaluate peft deepspeed tiktoken \
RUN echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers sentencepiece tqdm nltk matplotlib seaborn numpy pandas scikit-learn diffusers \
# # Change to non-root privilege huggingface_hub spacy Pillow blobfile requests scipy pycocotools protobuf timm \
USER deepspeed pyyaml ipython xformers opencv-contrib-python open_clip_torch flash-attn \
# DeepSpeed packaging psutil zstandard
RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed python -c "import deepspeed; print(deepspeed.__version__)"
RUN cd ${STAGE_DIR}/DeepSpeed && \ EOT
git checkout . && \
git checkout master && \ # 启动 ssh 服务
./install.sh --pip_sudo CMD ["/bin/bash", "-c", "service ssh start; tail -f /dev/null"]
RUN rm -rf ${STAGE_DIR}/DeepSpeed
RUN python -c "import deepspeed; print(deepspeed.__version__)"