Files
dockerfile_dp/pdf_clean/Dockerfile.colossalai
2024-08-28 17:18:03 +08:00

461 lines
18 KiB
Docker
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# syntax=docker/dockerfile:1
# NOTE: Building this image requires Docker version >= 23.0.
#
# For reference:
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
# CUDA version that selects the nvidia/cuda base image tag used by the FROM lines.
ARG TAG_VERSION="12.1.1"

# Stage 1: builds Apptainer (and downloads Go) so the results can be copied into the final image.
FROM nvidia/cuda:${TAG_VERSION}-cudnn8-devel-ubuntu22.04 AS apptainerbuilder

# Optional proxy build arguments, exported in lowercase for the tools run in this stage.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY} \
    https_proxy=${HTTPS_PROXY}

# Keep apt/dpkg from prompting during the build.
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
# Install the packages required by the build steps of this stage
# (list kept in alphabetical order; the apt lists are removed in the same layer).
RUN apt-get update \
    && apt-get install -y \
        bash \
        gcc \
        git \
        libc-dev \
        libseccomp-dev \
        libssl-dev \
        libuuid1 \
        linux-headers-generic \
        make \
        pkg-config \
        uuid-dev \
        wget \
    && rm -rf /var/lib/apt/lists/*
# Install Go: download the release tarball, unpack it under /usr/local, delete the tarball.
ARG GO_VERSION="1.21.13"
RUN GO_TARBALL="go${GO_VERSION}.linux-amd64.tar.gz" \
    && wget "https://golang.org/dl/${GO_TARBALL}" \
    && tar -C /usr/local -xzf "${GO_TARBALL}" \
    && rm "${GO_TARBALL}"
# Make the Go binaries available on PATH.
ENV PATH="/usr/local/go/bin:${PATH}"
# Build Apptainer from source and install it under /usr/local/apptainer.
#   APPTAINER_COMMITISH: branch, tag or commit to check out after cloning.
#   MCONFIG_OPTIONS:     extra flags for ./mconfig; expanded unquoted on purpose so that
#                        several space-separated options become separate arguments.
ARG APPTAINER_COMMITISH="main"
ARG MCONFIG_OPTIONS="--with-suid"
WORKDIR /go/src/github.com/apptainer
RUN git clone https://github.com/apptainer/apptainer.git \
    && cd apptainer \
    && git checkout "$APPTAINER_COMMITISH" \
    && ./mconfig $MCONFIG_OPTIONS -p /usr/local/apptainer \
    && cd builddir \
    && make \
    && make install
# No package cleanup layer here: this stage is discarded, and the final stage only copies
# /usr/local/apptainer and /usr/local/go out of it, so removing wget/gcc/git afterwards
# would add a layer and build time without making the final image any smaller.
# Stage 2: the image that is actually shipped.
FROM nvidia/cuda:${TAG_VERSION}-cudnn8-devel-ubuntu22.04

# Bring in Apptainer and the Go toolchain from the builder stage.
COPY --from=apptainerbuilder /usr/local/apptainer /usr/local/apptainer
COPY --from=apptainerbuilder /usr/local/go /usr/local/go

ENV CUDA_HOME=/usr/local/cuda \
    GO_PATH="/usr/local/go"
# PATH gets its own instruction because it reads GO_PATH, which must already be defined.
ENV PATH="/usr/local/apptainer/bin:${GO_PATH}/bin:$PATH"
ENV APPTAINER_TMPDIR="/tmp/tmp-apptainer"

# Optional proxy build arguments, exported in lowercase for the tools run in this stage.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY} \
    https_proxy=${HTTPS_PROXY}

# Keep apt/dpkg from prompting during the build.
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}

# Root password applied with chpasswd by the base-tools step.
# NOTE(review): the value is also kept in the image environment, so anyone who can inspect
# the image can read it; override the default with --build-arg ROOT_PASSWD=...
ARG ROOT_PASSWD="root"
ENV ROOT_PASSWD=${ROOT_PASSWD}

# Port that sshd (and the SSH client config edit) is switched to.
ENV SSH_PORT=2222

WORKDIR /root
# Run the following RUN instructions with bash instead of the default /bin/sh.
SHELL ["/bin/bash", "-c"]
# Base tools, SSH server/client configuration, pipx (nvitop) and pixi.
RUN <<EOT
#!/bin/bash
# Abort on the first failing command (including failures inside pipelines). Without this
# the script keeps going after an error and only the exit status of the very last command
# decides whether the layer is considered successful.
set -eo pipefail
apt-get update
apt-get install -y libgl1-mesa-glx bash-completion wget curl htop jq vim bash libaio-dev build-essential openssh-server openssh-client python3 python3-pip python3-venv bzip2
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
# Add the git-core PPA before installing git.
add-apt-repository ppa:git-core/ppa -y
apt-get install -y git libnuma-dev wget
pip install pipx
pipx install nvitop
pipx ensurepath
# Best effort: load the shell profile, but never fail the build because of it.
. ~/.bashrc || true
# Configure SSH for password and public key authentication.
# -p: do not fail when the directory already exists.
mkdir -p ~/.ssh
# Create or overwrite the SSH client config ~/.ssh/config:
# - Host *: settings that apply to every host
# - ForwardAgent yes: enable SSH agent forwarding so the local agent can be used for authentication
# - StrictHostKeyChecking no: skip host key checking and accept new host keys automatically
#   (meant for automated environments)
printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config
cp /etc/ssh/sshd_config /etc/ssh/sshd_config.bak
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/^\(\s*\)GSSAPIAuthentication yes/\1GSSAPIAuthentication no/' /etc/ssh/ssh_config
sed -i "s/^#Port 22/Port ${SSH_PORT}/" /etc/ssh/sshd_config
# The build already runs as root, so sudo is not needed here.
# NOTE(review): this only rewrites a line containing exactly "# Port 22" (one space);
# confirm that it matches the stock /etc/ssh/ssh_config of the base image.
sed -i "s/# Port 22/Port ${SSH_PORT}/" /etc/ssh/ssh_config
# Generate root's key pair and authorize it for logging in to this same image.
# NOTE(review): the private key and the root password end up in the image layers.
ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa -N "" <<< y
# NOTE(review): ~/.ssh/auth looks like a leftover; authorized_keys is written just below.
cat ~/.ssh/id_rsa.pub >> ~/.ssh/auth
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys2
chmod 600 /root/.ssh/authorized_keys
chmod 600 /root/.ssh/authorized_keys2
# -p: do not fail when the directory already exists.
mkdir -p /var/run/sshd
echo "root:${ROOT_PASSWD}" | chpasswd
mkdir -p ~/.pip
# install pixi (pipefail above makes a failed download fail the build)
curl -fsSL https://pixi.sh/install.sh | bash
EOT
# install NVIDIA DOCA 2.7
# RUN <<EOT
# #!/bin/bash
# wget https://www.mellanox.com/downloads/DOCA/DOCA_v2.7.0/host/doca-host_2.7.0-209000-24.04-ubuntu2204_amd64.deb
# sudo dpkg -i doca-host_2.7.0-209000-24.04-ubuntu2204_amd64.deb
# sudo apt-get update
# sudo apt-get -y install doca-all
# EOT
# Build tooling, JDKs and the NVIDIA user-space compute libraries.
#   NV_DRIVER_VERSION: driver branch used to pick the libnvidia-compute-* package, which
#                      provides the CUDA dependencies.
# cuda-compat* is removed from the nvidia/cuda x86_64 image afterwards, because
# libnvidia-compute-* provides the CUDA dependencies instead.
#
# The remove/autoremove pair is grouped in braces: "&&" and "||" have equal precedence and
# associate left to right, so an ungrouped trailing "|| apt-get autoremove -y" would also run
# (and turn the layer green) when "apt-get update" or "apt-get install" failed.
# The package globs are quoted so that the shell passes them to apt-get unchanged.
ARG NV_DRIVER_VERSION="535"
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata && \
    apt-get install -y \
        apt-file \
        automake \
        default-jdk \
        dh-make \
        g++ \
        git \
        openjdk-8-jdk \
        libcap2 \
        libnuma-dev \
        libtool \
        libnvidia-compute-${NV_DRIVER_VERSION} \
        make \
        maven \
        pkg-config \
        udev \
        wget \
        environment-modules \
    && { apt-get remove -y 'openjdk-11-*' 'cuda-compat*' || apt-get autoremove -y; }
# Mellanox OFED user-space stack.
# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
ARG MLNX_OFED_VERSION="23.10-3.2.2.0"
ENV MLNX_OFED_VERSION=${MLNX_OFED_VERSION}
RUN <<EOT
#!/bin/bash
# Abort on the first failing command (including a failed download feeding tar); otherwise
# only the exit status of the last command reaches the builder.
set -eo pipefail
apt-get update
apt-get install -y libnvidia-compute-${NV_DRIVER_VERSION}
apt-get install -y automake swig pciutils libltdl-dev libnl-3-dev libfuse2 chrpath graphviz libgfortran5 libusb-1.0-0 tcl debhelper libpci3 pkg-config udev autoconf lsof libmnl0 gfortran libnl-route-3-200 tk kmod ethtool bison flex m4 libnl-route-3-dev
# install Mellanox OFED: stream the archive straight into tar, then run the installer
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
./mlnxofedinstall --user-space-only --without-fw-update --skip-distro-check --without-ucx --without-hcoll --without-openmpi --without-mpich --without-sharp --all --force -q
# Step back out of the extracted tree first: the relative path below is resolved against
# the current directory, so running rm from inside the tree would delete nothing.
cd ..
rm -rf MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
EOT
# Prepend the CUDA headers and libraries to the compiler / linker / loader search paths.
# Each variable only refers to its own previous value, so one instruction is sufficient.
ENV CPATH=/usr/local/cuda/include:${CPATH} \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} \
    LIBRARY_PATH=/usr/local/cuda/lib64:${LIBRARY_PATH}
# install nv_peer_mem
# Clones Mellanox/nv_peer_memory at the tag ${NV_PEER_MEM_TAG}, builds a Debian package from
# it and installs that package with dpkg.
ENV NV_PEER_MEM_VERSION=1.2
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
RUN <<EOT
#!/bin/bash
# NOTE(review): there is no `set -e`, so a failing step does not stop the script; the
# status of the final `dpkg -i` is what the builder sees.
cd /root
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG}
cd nv_peer_memory
# Presumably writes the source tarball that is unpacked from /tmp below - TODO confirm.
./build_module.sh
cd /tmp
tar xzf nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
cd nvidia-peer-memory-${NV_PEER_MEM_VERSION}
# Packaging tools needed by dpkg-buildpackage.
apt-get update
apt-get install -y dkms debhelper autotools-dev
# -us -uc: do not sign the source package or the .changes file.
dpkg-buildpackage -us -uc
# The resulting .deb is expected one directory up (here: /tmp).
cd ..
dpkg -i nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
EOT
# install cutlass https://github.com/NVIDIA/cutlass
# The value is passed to CMake as CUTLASS_NVCC_ARCHS. Architecture codes:
#   70  - NVIDIA Volta  (e.g. Tesla V100)
#   75  - NVIDIA Turing (e.g. Tesla T4)
#   80  - NVIDIA Ampere (e.g. A100)
#   89  - GeForce RTX 4090
#   90a - NVIDIA Hopper (e.g. H100)
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
RUN <<EOT
#!/bin/bash
# Abort on the first failing command. The script ends with `cd ..`, which always succeeds,
# so without this a failed clone, configure or build would still produce a "successful" layer.
set -e
git clone https://github.com/NVIDIA/cutlass
cd cutlass
git checkout .
# Use the master branch when the repository has one; otherwise stay on the branch that was
# cloned (this keeps the previous outcome when the checkout cannot be done).
git checkout master || echo "cutlass: 'master' not available, keeping the cloned branch" >&2
mkdir build
cd build
# The architecture list contains ';', so it is quoted to be passed as a single argument.
cmake .. -DCUTLASS_NVCC_ARCHS="${DCUTLASS_NVCC_ARCHS}" -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON
make -j"$(nproc)" install
cd ..
# make test_unit -j"$(nproc)"
# make test_unit_gemm_warp -j"$(nproc)"
EOT
# install ucx
# https://github.com/openucx/ucx
# OpenMPI and OpenSHMEM installation with UCX
# https://github.com/openucx/ucx/wiki/OpenMPI-and-OpenSHMEM-installation-with-UCX
# https://openucx.readthedocs.io/en/master
# Running in Docker containers
# https://openucx.readthedocs.io/en/master/running.html#running-in-docker-containers
ENV UCX_HOME=/usr/local/ucx
ENV PATH=${CUDA_HOME}/bin:${UCX_HOME}/bin:$PATH
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${UCX_HOME}/lib:$LD_LIBRARY_PATH
RUN <<EOT
#!/bin/bash
# Abort on the first failing command instead of continuing with a half-built tree.
set -e
# Debugging tools and build dependencies.
# sudo is not used: the build already runs as root, and sudo's default environment reset
# can drop the http_proxy/https_proxy variables this image is configured with.
apt-get update
apt-get -y install gdb valgrind
apt-get install -y build-essential libnuma-dev pkg-config libfuse3-dev
# apt install -y openmpi-bin openmpi-common openmpi-doc openmpi-debug libopenmpi-dev
# apt install -y libucx0-dbg libucs0-dbg libucm0-dbg libuct0-dbg libibverbs1-dbg librdmacm1-dbg libmlx5-1-dbg
git clone https://github.com/openucx/ucx.git
cd ucx
# git checkout v1.15.0
git checkout master
./autogen.sh
mkdir build
cd build
# make clean
# make distclean
# Alternative configurations:
#   performance-optimized: ../contrib/configure-release --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
#   debug / development:   ../contrib/configure-devel --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
#   default:               ../configure --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
# ../contrib/configure-release --prefix=${UCX_HOME} --with-cuda=${CUDA_HOME} --with-gdrcopy=/usr/local/gdrcopy
# ../contrib/configure-release --prefix=/usr/local/ucx \
# --with-cuda= /usr/local/cuda-12.5 \
# --with-mlx5 \
# --with-rc \
# --with-ud \
# --with-dc \
# --with-dm \
# --with-verbs
../contrib/configure-release --prefix=${UCX_HOME} \
    --with-cuda=/usr/local/cuda \
    --with-mlx5 \
    --with-go=/usr/local/go \
    --with-rc \
    --with-ud \
    --with-dc \
    --with-dm \
    --with-verbs
make -j$(nproc)
make install
# ucx_info -a
# Performance test:
# ucx_perftest -d <device> -t bw -p <protocol> -n <num_iterations>
# Test reading the UCX configuration:
# ucx_read_profile
# Check a UCX process:
# mpirun -np 2 -mca pml ucx -x UCX_NET_DEVICES=mlx5_0:1 ./your_mpi_program
# CUDA support check
ucx_info -c
ucx_info -d
# ompi_info | grep ucx
EOT
# # mpich install with ucx
# # UCX is already embedded in the MPICH tarball, so you do not need to separately download UCX.
# ENV MPICH_HOME=/opt/mpich
# RUN <<EOT
# #!/bin/bash
# wget -c https://www.mpich.org/static/downloads/4.2.2/mpich-4.2.2.tar.gz
# cd mpich-4.2.2
# mkdir build
# cd build
# ../configure --prefix=${MPICH_HOME} --with-device=ch4:ucx --with-cuda=/usr/local/cuda
# make -j$(nproc)
# make install
# pip install mpi4py
# EOT
# Build and install OpenMPI and OpenSHMEM.
#
# Open MPI runtime tuning:
#   By default OpenMPI enables its built-in transport layers (BTLs), which can add software
#   overhead. Some BTLs can be disabled:
#   $ mpirun -np 2 -mca pml ucx --mca btl ^vader,tcp,openib,uct -x UCX_NET_DEVICES=mlx5_0:1 ./app
# Running UCX on Cray systems:
#   On a Cray system UCX unified mode has to be enabled explicitly:
#   $ mpirun -np 2 -mca pml ucx --mca btl ^vader,tcp,openib,uct -x UCX_UNIFIED_MODE=1 ./app
# UCX is a communication library for high-performance computing; it optimizes performance by
# supporting several transports (such as IB, RoCE and TCP).
# References:
#   OpenMPI CUDA build: https://docs.open-mpi.org/en/v5.0.x/tuning-apps/networking/cuda.html
#   https://cuterwrite.top/p/openmpi-with-ucx/
#   http://github.com/openucx/ucx/wiki/OpenMPI-and-OpenSHMEM-installation-with-UCX
ENV MPI_HOME=/usr/local/openmpi
# One instruction for the search paths: each entry reads MPI_HOME (set above) and its own
# previous value only.
ENV PATH=${MPI_HOME}/bin:/usr/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:${MPI_HOME}/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH \
    LIBRARY_PATH=/usr/local/cuda/lib64:${LIBRARY_PATH} \
    CPATH=/usr/local/cuda/include:${MPI_HOME}/include:${CUDA_HOME}/include:$CPATH
# export C_INCLUDE_PATH=/usr/local/cuda/include:$C_INCLUDE_PATH
# export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
# export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
RUN <<EOT
#!/bin/bash
# Autotools prerequisites and Cython.
apt update && apt install -y autoconf automake libtool flex
/usr/bin/python3 -m pip install cython
# Alternative to the recursive clone:
# git clone https://github.com/open-mpi/ompi.git
# git submodule update --init --recursive
git clone --recursive https://github.com/open-mpi/ompi.git
cd ompi
git checkout main
# make clean
# make distclean
./autogen.pl
mkdir build
cd build
# NOTE With OpenMPI 4.0 and above, there could be compilation errors from "btl_uct" component.
# This component is not critical for using UCX; so it could be disabled this way:
# ./configure ... --enable-mca-no-build=btl-uct ...
# Disabling btl/uct at run time:
# mpirun -np 2 -mca pml ucx -mca btl ^uct -x UCX_NET_DEVICES=mlx5_0:1 ./app
# When CUDA is supported through UCX (the preferred mechanism), --with-cuda is not required
# when building OpenMPI.
# Check whether UCX supports CUDA:
# ucx_info -v
# ../configure --prefix=${MPI_HOME} --with-ucx=${UCX_HOME} --with-cuda=${CUDA_HOME} --enable-mca-no-build=btl-uct
# Option notes:
#   --enable-python-bindings: enable the Python bindings, usually for integration with
#       libraries such as mpi4py
#   --without-hcoll: disable the HCOLL collective communication library
#   --enable-mpirun-prefix-by-default: enable mpirun's prefix option by default
#   --prefix=${MPI_HOME}: installation path
#   --with-ucx=${UCX_HOME}: UCX library path
#   --with-cuda=${CUDA_HOME}: CUDA library path
#   --enable-mca-dso=btl-smcuda,rcache-rgpusm,rcache-gpusm,accelerator-cuda: build these MCA
#       (Modular Component Architecture) components as dynamic shared objects
#   --enable-mca-no-build=btl-uct: do not build the UCT (UCX transport layer) BTL module
#   -x UCX_LOG_LEVEL=debug: debug UCX at run time
# Note: if UCX was built with CUDA support (--with-cuda), it already includes CUDA RDMA
# support; --with-cuda may then be left out when building OpenMPI, because OpenMPI obtains
# CUDA support through UCX.
../configure \
    --with-cuda=/usr/local/cuda \
    --without-hcoll \
    --enable-python-bindings \
    --enable-mpirun-prefix-by-default \
    --prefix=${MPI_HOME} \
    --with-ucx=${UCX_HOME} \
    --enable-mca-dso=btl-smcuda,rcache-rgpusm,rcache-gpusm,accelerator-cuda \
    --enable-mca-no-build=btl-uct \
    --with-python=/usr/bin/python3
make -j$(nproc)
make install
# Verify CUDA support:
# ompi_info | grep "MPI extensions"
# ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
# Enable CUDA debug output when running MPI programs:
# mpirun --mca opal_cuda_verbose 10 ...
# mpirun --mca mpi_common_cuda_verbose 10 ...
# ompi_info | grep cuda
# ucx_info -c
# Check Open MPI's UCX support with the command below.
# Seeing "btl: smcuda" and other CUDA-related extensions in `ompi_info | grep cuda` does not
# mean that every CUDA-related feature works.
# ompi_info | grep ucx
# unit test in mpi cuda
EOT
# install ninja: build it from source and run its unit tests
RUN <<EOT
#!/bin/bash
# Abort on the first failing command instead of continuing after an error.
set -e
# Clone the ninja sources.
git clone https://github.com/ninja-build/ninja.git ninja
cd ninja
# Clone the GoogleTest sources used to build ninja_test.
git clone https://github.com/google/googletest.git
# Bootstrap ninja with GoogleTest configured and pthread added to the compile/link flags.
# The former plain `python ./configure.py --bootstrap` call was dropped: no step in this
# file installs a `python` command (only python3), and this bootstrap redoes that work.
CXXFLAGS="-pthread" LDFLAGS="-pthread" /usr/bin/python3 ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest
# conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest"
./ninja all
# Run the ninja unit tests.
# NOTE(review): the ninja binary stays in this source directory and is not copied to a
# directory on PATH - confirm whether that is intended.
./ninja_test
EOT
# install apex (C++ and CUDA extensions) for the system interpreter /usr/bin/python3
RUN <<EOT
#!/bin/bash
# Abort on the first failing command; the script ends with `rm -rf apex`, which always
# succeeds and would otherwise hide a failed build.
set -e
# The conda activation was removed: the Miniconda setup in this file is commented out, so
# /opt/conda does not exist and CONDA_ENV_NAME is not defined; every command below already
# uses /usr/bin/python3 explicitly.
#
# The extensions are compiled against the PyTorch installed for this interpreter (the
# install uses --no-build-isolation), so check for it first and skip with a clear message
# instead of failing deep inside pip.
# NOTE(review): no earlier step in this file visibly installs torch, so with the current
# ordering this step is skipped; consider moving it after the step that installs PyTorch.
if ! /usr/bin/python3 -c "import torch" >/dev/null 2>&1; then
  echo "WARNING: torch is not importable by /usr/bin/python3 - skipping the Apex build" >&2
  exit 0
fi
git clone https://github.com/NVIDIA/apex
cd apex
# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
MAX_JOBS=1 /usr/bin/python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# Smoke test: the amp module must be importable after the install.
/usr/bin/python3 -c "import apex.amp; print('Apex is installed and the amp module is available.')"
cd ..
rm -rf apex
EOT
# install colossalai (built from the v${COLOSSALAI_VERSION} tag) and TensorNVMe
ARG COLOSSALAI_VERSION="0.4.2"
ENV COLOSSALAI_VERSION=${COLOSSALAI_VERSION}
RUN <<EOT
#!/bin/bash
# Abort on the first failing command. The last command installs TensorNVMe, so without this
# a failed ColossalAI install would be hidden whenever that final install succeeds.
set -e
git clone https://github.com/hpcaitech/ColossalAI.git
cd ColossalAI
git checkout v${COLOSSALAI_VERSION}
# install dependency (--no-cache-dir keeps pip's download cache out of the image layer)
/usr/bin/python3 -m pip install --no-cache-dir -r requirements/requirements.txt
# install colossalai
BUILD_EXT=1 /usr/bin/python3 -m pip install --no-cache-dir .
colossalai check -i
# install tensornvme
/usr/bin/python3 -m pip install --no-cache-dir -v git+https://github.com/hpcaitech/TensorNVMe.git
EOT
# ARG CONDA_ENV_NAME="mineru"
# ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
# ARG PYTHON_VERSION="3.10"
# ENV PYTHON_VERSION=${PYTHON_VERSION}
# # https://github.com/opendatalab/PDF-Extract-Kit
# RUN <<EOT
# #!/bin/bash
# # install miniconda
# wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
# bash /tmp/miniconda.sh -b -p /opt/conda
# rm /tmp/miniconda.sh
# ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
# echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
# . /opt/conda/etc/profile.d/conda.sh
# conda init bash
# conda config --set show_channel_urls true
# # 配置 .condarc 文件
# cat <<EOF > ~/.condarc
# channels:
# - conda-forge
# - bioconda
# - pytorch
# - pytorch-nightly
# - nvidia
# - defaults
# show_channel_urls: true
# EOF
# source /opt/conda/etc/profile.d/conda.sh
# conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} -y
# conda activate ${CONDA_ENV_NAME}
# EOT
# Make sure the CUDA binaries come first on PATH.
ENV PATH="/usr/local/cuda/bin:${PATH}"
# Add a deepspeed user (disabled)
# RUN <<EOT
# #!/bin/bash
# useradd --create-home --uid 1000 --shell /bin/bash deepspeed
# usermod -aG sudo deepspeed
# echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
# EOT
# # # Change to non-root privilege
# USER deepspeed

# Drop the apt caches and package lists from the final filesystem.
RUN apt-get clean && rm -rf /var/lib/apt/lists/*

# sshd is configured to listen on SSH_PORT (2222).
EXPOSE ${SSH_PORT}
# Run the SSH daemon in the foreground as the container's main process.
CMD ["/usr/sbin/sshd", "-D"]