# syntax=docker/dockerfile:1

# Source file: dockerfile_dp/pdf_clean/Dockerfile.mineru310_mpich_complie (2024-08-28 17:18:03 +08:00)
# NOTE: Building this image requires Docker version >= 23.0.
#
# For reference:
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
# CUDA base-image tag. Declared before the first FROM so that FROM lines can use it.
ARG TAG_VERSION="12.4.1"

# ---- Stage 1: throw-away builder that compiles Apptainer from source ----
FROM nvidia/cuda:${TAG_VERSION}-cudnn-devel-ubuntu22.04 AS apptainerbuilder

# Build arguments used by this stage: optional proxy settings and the apt frontend mode.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ARG DEBIAN_FRONTEND="noninteractive"

# Export the lowercase proxy variables and keep apt/dpkg non-interactive.
ENV http_proxy=${HTTP_PROXY} \
    https_proxy=${HTTPS_PROXY} \
    DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
# Install the packages required to download and compile Go and Apptainer.
# The package list is kept in alphabetical order so diffs stay small.
RUN apt-get update && apt-get install -y \
        bash \
        gcc \
        git \
        libc-dev \
        libseccomp-dev \
        libssl-dev \
        libuuid1 \
        linux-headers-generic \
        make \
        pkg-config \
        uuid-dev \
        wget \
    && rm -rf /var/lib/apt/lists/*
# Install the Go toolchain into /usr/local/go.
ARG GO_VERSION="1.21.13"
# Download the release tarball into the current directory, unpack it, then delete it
# in the same layer so the archive does not stay in the image.
RUN go_tarball="go${GO_VERSION}.linux-amd64.tar.gz" \
    && wget "https://golang.org/dl/${go_tarball}" \
    && tar -C /usr/local -xzf "${go_tarball}" \
    && rm "${go_tarball}"
# Make the Go binaries available on PATH.
ENV PATH="/usr/local/go/bin:${PATH}"
# Build Apptainer from source and install it under /usr/local/apptainer.
#   APPTAINER_COMMITISH - git ref (branch, tag or commit) to build.
#   MCONFIG_OPTIONS     - extra flags for ./mconfig; intentionally left unquoted below
#                         so that several space-separated flags are split into words.
ARG APPTAINER_COMMITISH="main"
ARG MCONFIG_OPTIONS="--with-suid"
WORKDIR /go/src/github.com/apptainer
RUN git clone https://github.com/apptainer/apptainer.git \
    && cd apptainer \
    && git checkout "$APPTAINER_COMMITISH" \
    && ./mconfig $MCONFIG_OPTIONS -p /usr/local/apptainer \
    && make -C builddir \
    && make -C builddir install
# No package clean-up is done here: this stage is discarded, and the final stage only
# copies /usr/local/apptainer and /usr/local/go out of it, so removing build tools in
# an extra layer would only add build time.
# ---- Stage 2: final image ----
FROM nvidia/cuda:${TAG_VERSION}-cudnn-devel-ubuntu22.04

# Bring in the Apptainer installation and the Go toolchain built in the first stage.
COPY --from=apptainerbuilder /usr/local/apptainer /usr/local/apptainer
COPY --from=apptainerbuilder /usr/local/go /usr/local/go
ENV GO_PATH="/usr/local/go"
ENV PATH="/usr/local/apptainer/bin:${GO_PATH}/bin:$PATH"
ENV APPTAINER_TMPDIR="/tmp/tmp-apptainer"

# Optional proxy settings, exported under the lowercase names.
# NOTE(review): these ENV values also persist in the running container — confirm that is wanted.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}

# Build-time only: ARG values are visible to RUN steps of this stage but are not
# written into the runtime environment of the image.
ARG DEBIAN_FRONTEND="noninteractive"
# Root password applied with chpasswd in a later RUN step. Kept as ARG only so it is
# no longer exported to every process in the container.
# NOTE(review): build args still appear in `docker history`; a BuildKit secret mount
# would avoid that.
ARG ROOT_PASSWD="root"

# Port the SSH daemon is configured to listen on.
ENV SSH_PORT=2222
WORKDIR /root
SHELL ["/bin/bash", "-c"]
# Base tools: system packages, pipx/nvitop, SSH client/server configuration, pixi.
RUN <<EOT
#!/bin/bash
# Lines of a heredoc are not chained with &&: without this only the exit status of the
# last command would decide whether the step succeeds. pipefail covers `curl | bash`.
set -eo pipefail

apt-get update
apt-get install -y libgl1-mesa-glx bash-completion wget curl htop jq vim bash libaio-dev build-essential openssh-server openssh-client python3 python3-pip python3-venv bzip2
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
# Newer git from the git-core PPA
add-apt-repository ppa:git-core/ppa -y
apt-get install -y git libnuma-dev wget

pip install pipx
pipx install nvitop
pipx ensurepath
# Best effort: re-read the shell profile after `pipx ensurepath`.
. ~/.bashrc || true

# Configure SSH for password and public key authentication
mkdir -p ~/.ssh
# Create (or overwrite) the per-user client config ~/.ssh/config:
# - Host *: applies to every host
# - ForwardAgent yes: enable SSH agent forwarding
# - StrictHostKeyChecking no: accept new host keys automatically (for automated environments)
printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config

# Server side: keep a backup, then allow root login with password and public key.
cp /etc/ssh/sshd_config /etc/ssh/sshd_config.bak
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/^\(\s*\)GSSAPIAuthentication yes/\1GSSAPIAuthentication no/' /etc/ssh/ssh_config
sed -i "s/^#Port 22/Port ${SSH_PORT}/" /etc/ssh/sshd_config
# Client side: the stock ssh_config line is "#   Port 22" (several spaces after '#'),
# so match any amount of whitespace; a literal "# Port 22" pattern never matches it.
# This step already runs as root, so sudo is not needed.
sed -i "s/^#[[:space:]]*Port 22/    Port ${SSH_PORT}/" /etc/ssh/ssh_config

# Generate a key pair for root and authorise it for logins to this same image.
ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa -N "" <<< y
# NOTE(review): "~/.ssh/auth" looks like a truncated "authorized_keys"; kept as-is — confirm it is needed.
cat ~/.ssh/id_rsa.pub >> ~/.ssh/auth
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys2
chmod 600 /root/.ssh/authorized_keys
chmod 600 /root/.ssh/authorized_keys2
mkdir -p /var/run/sshd
echo "root:${ROOT_PASSWD}" | chpasswd

mkdir -p ~/.pip
# install pixi
curl -fsSL https://pixi.sh/install.sh | bash

# Drop the apt package lists in the same layer that downloaded them.
rm -rf /var/lib/apt/lists/*
EOT
# install NVIDIA DOCA 2.7
# RUN <<EOT
# #!/bin/bash
# wget https://www.mellanox.com/downloads/DOCA/DOCA_v2.7.0/host/doca-host_2.7.0-209000-24.04-ubuntu2204_amd64.deb
# sudo dpkg -i doca-host_2.7.0-209000-24.04-ubuntu2204_amd64.deb
# sudo apt-get update
# sudo apt-get -y install doca-all
# EOT
# Driver branch used to select the libnvidia-compute-* package.
ARG NV_DRIVER_VERSION="535"
# Install build tooling, Java and the NVIDIA compute libraries.
# - libnvidia-compute-* provides the CUDA driver-side dependencies.
# - cuda-compat* (shipped in the nvidia/cuda x86_64 images) and openjdk-11-* are removed
#   afterwards; CUDA dependencies come from libnvidia-compute-* instead.
# The `|| apt-get autoremove -y` fallback is grouped with the remove step only. In an
# ungrouped `a && b && c || d` chain the fallback would also run (and return success)
# when `apt-get update` or `apt-get install` failed, hiding a broken install.
# The package globs are quoted so they reach apt-get untouched by shell filename expansion.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata \
    && apt-get install -y \
        apt-file \
        automake \
        default-jdk \
        dh-make \
        environment-modules \
        g++ \
        git \
        libcap2 \
        libnuma-dev \
        libnvidia-compute-${NV_DRIVER_VERSION} \
        libtool \
        make \
        maven \
        openjdk-8-jdk \
        pkg-config \
        udev \
        wget \
    && { apt-get remove -y 'openjdk-11-*' 'cuda-compat*' || apt-get autoremove -y; }
# Mellanox OFED user-space libraries.
# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
ARG MLNX_OFED_VERSION="23.10-3.2.2.0"
ENV MLNX_OFED_VERSION=${MLNX_OFED_VERSION}
# Scratch directory for the installer. The script uses ${STAGE_DIR}, which was not
# defined anywhere in this file: with it empty, `mkdir -p` fails, `cd` goes to $HOME and
# the final `rm -rf` targets "/MLNX_OFED_LINUX-*", leaving the extracted tree in the image.
ARG STAGE_DIR="/tmp"
RUN <<EOT
#!/bin/bash
# Stop at the first failure; pipefail makes a failed download fail the `wget | tar` pipe.
set -eo pipefail

apt-get update
apt-get install -y libnvidia-compute-${NV_DRIVER_VERSION}
apt-get install -y automake swig pciutils libltdl-dev libnl-3-dev libfuse2 chrpath graphviz libgfortran5 libusb-1.0-0 tcl debhelper libpci3 pkg-config udev autoconf lsof libmnl0 gfortran libnl-route-3-200 tk kmod ethtool bison flex m4 libnl-route-3-dev

# install Mellanox OFED
ofed_dir="MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64"
mkdir -p "${STAGE_DIR}"
# Extract inside the scratch directory so the clean-up below removes what was unpacked.
cd "${STAGE_DIR}"
wget -q -O - "http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/${ofed_dir}.tgz" | tar xzf -
cd "${ofed_dir}"
./mlnxofedinstall --user-space-only --without-fw-update --skip-distro-check --without-ucx --without-hcoll --without-openmpi --without-mpich --without-sharp --all --force -q
cd "${STAGE_DIR}"
rm -rf "${STAGE_DIR}/${ofed_dir}"*
EOT
# Put the CUDA headers and libraries on the compiler / linker / loader search paths.
# Uses the `key=value` form (the space-separated form is legacy). The previous value is
# appended only when it is non-empty: a bare trailing ':' would add an empty path
# element, which these variables treat as the current directory.
ENV CPATH=/usr/local/cuda/include${CPATH:+:${CPATH}}
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
ENV LIBRARY_PATH=/usr/local/cuda/lib64${LIBRARY_PATH:+:${LIBRARY_PATH}}
# UCX installation
#   Project:                      https://github.com/openucx/ucx
#   OpenMPI/OpenSHMEM with UCX:   https://github.com/openucx/ucx/wiki/OpenMPI-and-OpenSHMEM-installation-with-UCX
#   Documentation:                https://openucx.readthedocs.io/en/master
#   Running in Docker containers: https://openucx.readthedocs.io/en/master/running.html#running-in-docker-containers
#
# Install prefixes first; they are set in their own instruction because the next ENV
# expands them, and an ENV only sees values defined by earlier instructions.
ENV UCX_HOME=/usr/local/ucx \
    CUDA_HOME=/usr/local/cuda
# Prepend the CUDA and UCX binaries and libraries to the search paths.
ENV PATH=${CUDA_HOME}/bin:${UCX_HOME}/bin:$PATH \
    LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${UCX_HOME}/lib:$LD_LIBRARY_PATH
# Build UCX from source with CUDA, verbs/mlx5 and Go support and install it to ${UCX_HOME}.
#   UCX_COMMITISH - git ref to build (for example a release tag such as v1.15.0).
ARG UCX_COMMITISH="master"
RUN <<EOT
#!/bin/bash
# Stop at the first failing command; heredoc lines are not implicitly chained with &&.
set -e

# Debugging tools and build prerequisites. The step runs as root, so no sudo is used.
apt-get update
apt-get install -y gdb valgrind
apt-get install -y build-essential libnuma-dev pkg-config libfuse3-dev
# Optional distro packages that were considered but are not installed:
# apt-get install -y openmpi-bin openmpi-common openmpi-doc openmpi-debug libopenmpi-dev
# apt-get install -y libucx0-dbg libucs0-dbg libucm0-dbg libuct0-dbg libibverbs1-dbg librdmacm1-dbg libmlx5-1-dbg

git clone https://github.com/openucx/ucx.git
cd ucx
git checkout "${UCX_COMMITISH}"
./autogen.sh
mkdir -p build
cd build

# Alternative configurations:
#   performance build:  ../contrib/configure-release --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
#   debug/devel build:  ../contrib/configure-devel --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
#   default:            ../configure --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
#   with gdrcopy:       ../contrib/configure-release --prefix=${UCX_HOME} --with-cuda=${CUDA_HOME} --with-gdrcopy=/usr/local/gdrcopy
../contrib/configure-release --prefix="${UCX_HOME}" \
    --with-cuda=/usr/local/cuda \
    --with-mlx5 \
    --with-go=/usr/local/go \
    --with-rc \
    --with-ud \
    --with-dc \
    --with-dm \
    --with-verbs
make -j"$(nproc)"
make install

# Useful commands at run time:
#   ucx_info -a
#   performance test:    ucx_perftest -d <device> -t bw -p <protocol> -n <num_iterations>
#   read a UCX profile:  ucx_read_profile
#   run an MPI program over UCX:
#     mpirun -np 2 -mca pml ucx -x UCX_NET_DEVICES=mlx5_0:1 ./your_mpi_program
#   ompi_info | grep ucx

# Smoke test of the installed binaries: print the configuration and the detected devices.
ucx_info -c
ucx_info -d
EOT
# # mpich install with ucx
# # UCX is already embedded in the MPICH tarball, so you do not need to separately download UCX.
# ENV MPICH_HOME=/opt/mpich
# RUN <<EOT
# #!/bin/bash
# wget -c https://www.mpich.org/static/downloads/4.2.2/mpich-4.2.2.tar.gz
# cd mpich-4.2.2
# mkdir build
# cd build
# ../configure --prefix=${MPICH_HOME} --with-device=ch4:ucx --with-cuda=/usr/local/cuda
# make -j$(nproc)
# make install
# pip install mpi4py
# EOT
# Build and install OpenMPI and OpenSHMEM
#
# Open MPI run-time tuning
#   By default OpenMPI enables its built-in transport layers (BTLs), which may add extra
#   software overhead. Some BTLs can be disabled:
#   $ mpirun -np 2 -mca pml ucx --mca btl ^vader,tcp,openib,uct -x UCX_NET_DEVICES=mlx5_0:1 ./app
# Running UCX on Cray systems
#   On Cray systems UCX unified mode has to be enabled explicitly:
#   $ mpirun -np 2 -mca pml ucx --mca btl ^vader,tcp,openib,uct -x UCX_UNIFIED_MODE=1 ./app
# UCX is a communication library for high-performance computing; it optimises performance
# by supporting several transports (such as IB, RoCE and TCP).
# References:
#   OpenMPI CUDA build: https://docs.open-mpi.org/en/v5.0.x/tuning-apps/networking/cuda.html
#   https://cuterwrite.top/p/openmpi-with-ucx/
#   http://github.com/openucx/ucx/wiki/OpenMPI-and-OpenSHMEM-installation-with-UCX
#
# Install prefix of OpenMPI; set in its own instruction because the ENV below expands it.
ENV MPI_HOME=/usr/local/openmpi
# Search paths for binaries, shared libraries, link-time libraries and headers.
ENV PATH=${MPI_HOME}/bin:/usr/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:${MPI_HOME}/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH \
    LIBRARY_PATH=/usr/local/cuda/lib64:${LIBRARY_PATH} \
    CPATH=/usr/local/cuda/include:${MPI_HOME}/include:${CUDA_HOME}/include:$CPATH
# Shell equivalents for interactive use:
# export C_INCLUDE_PATH=/usr/local/cuda/include:$C_INCLUDE_PATH
# export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
# export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Build OpenMPI from source against UCX and CUDA, install it to ${MPI_HOME}, and compile
# a small MPI+CUDA test program.
#   OMPI_COMMITISH - git ref of open-mpi/ompi to build.
ARG OMPI_COMMITISH="main"
RUN <<EOT
#!/bin/bash
# Stop at the first failing command; heredoc lines are not implicitly chained with &&.
set -e

apt-get update
apt-get install -y autoconf automake libtool flex
/usr/bin/python3 -m pip install cython

git clone --recursive https://github.com/open-mpi/ompi.git
cd ompi
git checkout "${OMPI_COMMITISH}"
# Bring the submodules in line with the checked-out ref (a no-op for the default branch).
git submodule update --init --recursive
./autogen.pl
mkdir -p build
cd build

# NOTE: With OpenMPI 4.0 and above there can be compilation errors from the "btl_uct"
# component. It is not critical for using UCX, so it is disabled at build time with
#   --enable-mca-no-build=btl-uct
# It can also be disabled at run time:
#   mpirun -np 2 -mca pml ucx -mca btl ^uct -x UCX_NET_DEVICES=mlx5_0:1 ./app
#
# When UCX itself was built with --with-cuda, OpenMPI obtains CUDA support through UCX
# and --with-cuda is optional for OpenMPI. Check UCX with: ucx_info -v
#
# Configure options used below:
#   --prefix=${MPI_HOME}                install location
#   --with-ucx=${UCX_HOME}              UCX installation to build against
#   --with-cuda=/usr/local/cuda         CUDA installation to build against
#   --without-hcoll                     disable the HCOLL collective library
#   --enable-python-bindings            enable Python bindings (for integration with e.g. mpi4py)
#   --enable-mpirun-prefix-by-default   make mpirun behave as if --prefix were given
#   --enable-mca-dso=...                build the listed MCA components as dynamic shared objects
#   --enable-mca-no-build=btl-uct       do not build the UCT BTL component
# Debug UCX at run time with: -x UCX_LOG_LEVEL=debug
../configure --with-cuda=/usr/local/cuda --without-hcoll --enable-python-bindings --enable-mpirun-prefix-by-default --prefix=${MPI_HOME} --with-ucx=${UCX_HOME} --enable-mca-dso=btl-smcuda,rcache-rgpusm,rcache-gpusm,accelerator-cuda --enable-mca-no-build=btl-uct --with-python=/usr/bin/python3
make -j"$(nproc)"
make install

# Verifying CUDA support:
#   ompi_info | grep "MPI extensions"
#   ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
# Enabling CUDA debug output when running an MPI program:
#   mpirun --mca opal_cuda_verbose 10 ...
#   mpirun --mca mpi_common_cuda_verbose 10 ...
# Checking UCX support in Open MPI:
#   ompi_info | grep ucx
#   ucx_info -c
# Seeing "btl: smcuda" and other CUDA entries in `ompi_info | grep cuda` does not by
# itself prove that every CUDA-related feature works.

# Unit test for MPI + CUDA. The heredoc delimiter is quoted so the shell copies the
# source text verbatim instead of performing expansions inside it.
cat <<'EOF' > ./test_mpi_cuda.cu
#include <mpi.h>
#include <cuda_runtime.h>
#include <stdio.h>
__global__ void hello_cuda() {
    printf("Hello from CUDA kernel! Thread id: %d\n", threadIdx.x);
}
int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    printf("Hello from MPI process %d!\n", rank);
    // Launch CUDA kernel
    hello_cuda<<<1, 10>>>();
    cudaDeviceSynchronize(); // Wait for the CUDA kernel to finish
    MPI_Finalize();
    return 0;
}
EOF
# Compile only; running it needs a GPU, e.g.:
#   mpirun --allow-run-as-root -np 2 ./test_mpi_cuda
nvcc -o test_mpi_cuda test_mpi_cuda.cu -I${CUDA_HOME}/include -I${MPI_HOME}/include -L${MPI_HOME}/lib -lcudart -lmpi
EOT
# Conda environment that hosts MinerU (magic-pdf).
#   CONDA_ENV_NAME - name of the environment to create
#   PYTHON_VERSION - Python version installed into it
ARG CONDA_ENV_NAME="mineru"
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
ARG PYTHON_VERSION="3.10"
ENV PYTHON_VERSION=${PYTHON_VERSION}
# https://github.com/opendatalab/PDF-Extract-Kit
RUN <<EOT
#!/bin/bash
# Stop at the first failing command; heredoc lines are not implicitly chained with &&.
set -e

# Install Miniconda to /opt/conda. A single -O names the output file; combining
# "-O-" with a second "-O <file>" only works because the last option wins.
wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
bash /tmp/miniconda.sh -b -p /opt/conda
rm /tmp/miniconda.sh

# Make conda available to login shells, to root's interactive shells and to this script.
ln -sf /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
. /opt/conda/etc/profile.d/conda.sh
conda init bash

# Write ~/.condarc: channel priority list, and show channel URLs in conda output.
# The file is written in one piece, so no separate `conda config --set` call is needed.
cat <<'EOF' > ~/.condarc
channels:
  - conda-forge
  - bioconda
  - pytorch
  - pytorch-nightly
  - nvidia
  - defaults
show_channel_urls: true
EOF

conda create -n "${CONDA_ENV_NAME}" python="${PYTHON_VERSION}" -y
conda activate "${CONDA_ENV_NAME}"

# Alternatives that were tried:
# python -m pip install magic-pdf[full-cpu] --index-url=http://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
# python -m pip install magic-pdf[full]==0.6.2b1 detectron2 --extra-index-url https://myhloli.github.io/wheels/ -i https://pypi.tuna.tsinghua.edu.cn/simple
# The requirement is quoted so the shell cannot treat "[full]" as a glob pattern.
python -m pip install 'magic-pdf[full]==0.7.0b1' detectron2 --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple
# NOTE(review): this index serves cu118 wheels while the base image tag defaults to CUDA 12.4.1 — confirm this is intended.
python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
# pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
# python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
# python -m pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
# python -m pip install -U python-alist

# magic-pdf configuration in root's home: output directory, model directory, device mode.
cat <<'EOF' > ~/magic-pdf.json
{
  "temp-output-dir":"/results",
  "models-dir":"/models",
  "device-mode":"cuda"
}
EOF
mkdir -p /app
EOT
# Prepend the CUDA bin directory to PATH once more so it is the first entry searched
# (an earlier ENV PATH instruction placed ${MPI_HOME}/bin and /usr/bin ahead of it).
ENV PATH=/usr/local/cuda/bin:$PATH
# Add the "deepspeed" user (uid 1000, bash shell, home directory) with passwordless sudo.
RUN <<EOT
#!/bin/bash
# Stop at the first failing command; heredoc lines are not implicitly chained with &&.
set -e
useradd --create-home --uid 1000 --shell /bin/bash deepspeed
usermod -aG sudo deepspeed
# Use a drop-in under /etc/sudoers.d instead of appending to /etc/sudoers, and have
# visudo validate it so a syntax error fails the build rather than breaking sudo.
echo "deepspeed ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/deepspeed
chmod 0440 /etc/sudoers.d/deepspeed
visudo -cf /etc/sudoers.d/deepspeed
EOT
# # # Change to non-root privilege
# USER deepspeed
# Clear the apt package cache and the downloaded package lists.
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
# Document the SSH port; SSH_PORT is set to 2222 earlier in this stage.
EXPOSE ${SSH_PORT}
# Run the SSH daemon in the foreground as the container's main process.
CMD ["/usr/sbin/sshd", "-D"]