461 lines
18 KiB
Docker
461 lines
18 KiB
Docker
# syntax=docker/dockerfile:1
|
||
# NOTE: Building this image requires Docker version >= 23.0.
|
||
#
|
||
# For reference:
|
||
# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel
|
||
# CUDA base-image version. Declared before the first FROM so that every FROM
# line in this file can reference it.
ARG TAG_VERSION="12.1.1"
# Builder stage: Apptainer is compiled and Go is unpacked here; the final stage
# copies both out with COPY --from=apptainerbuilder.
FROM nvidia/cuda:${TAG_VERSION}-cudnn8-devel-ubuntu22.04 AS apptainerbuilder
|
||
# Optional proxy settings, forwarded from build args into the environment.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY} \
    https_proxy=${HTTPS_PROXY}
# Frontend used by apt/dpkg during the build (defaults to noninteractive).
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
|
||
# Install the required packages (listed alphabetically), then drop the apt
# package lists in the same layer.
RUN apt-get update \
    && apt-get install -y \
        bash \
        gcc \
        git \
        libc-dev \
        libseccomp-dev \
        libssl-dev \
        libuuid1 \
        linux-headers-generic \
        make \
        pkg-config \
        uuid-dev \
        wget \
    && rm -rf /var/lib/apt/lists/*
|
||
|
||
# Install Go: download the release tarball, unpack it under /usr/local, and
# delete the tarball in the same layer.
ARG GO_VERSION="1.21.13"
RUN GO_TARBALL="go${GO_VERSION}.linux-amd64.tar.gz" \
    && wget "https://golang.org/dl/${GO_TARBALL}" \
    && tar -C /usr/local -xzf "${GO_TARBALL}" \
    && rm "${GO_TARBALL}"

# Put the Go toolchain on PATH
ENV PATH="/usr/local/go/bin:${PATH}"
|
||
|
||
# Build Apptainer from source and install it under /usr/local/apptainer.
# APPTAINER_COMMITISH selects the git ref to build; MCONFIG_OPTIONS is passed
# unquoted on purpose so that several options can be supplied in one value.
ARG APPTAINER_COMMITISH="main"
ARG MCONFIG_OPTIONS="--with-suid"
WORKDIR /go/src/github.com/apptainer
RUN git clone https://github.com/apptainer/apptainer.git \
    && git -C apptainer checkout "$APPTAINER_COMMITISH" \
    && (cd apptainer && ./mconfig $MCONFIG_OPTIONS -p /usr/local/apptainer) \
    && cd apptainer/builddir \
    && make \
    && make install
|
||
|
||
# Clean up: remove the download/build tools, then anything that was only
# installed as a dependency, then the apt cache.
RUN apt-get remove -y wget gcc git \
    && apt-get autoremove -y \
    && apt-get clean
|
||
|
||
# Final image: starts again from the same CUDA base image as the builder stage.
FROM nvidia/cuda:${TAG_VERSION}-cudnn8-devel-ubuntu22.04

# Copy Apptainer and Go out of the builder stage
COPY --from=apptainerbuilder /usr/local/apptainer /usr/local/apptainer
COPY --from=apptainerbuilder /usr/local/go /usr/local/go

ENV CUDA_HOME=/usr/local/cuda \
    GO_PATH="/usr/local/go" \
    APPTAINER_TMPDIR="/tmp/tmp-apptainer"
# PATH gets its own ENV instruction: ${GO_PATH} must already be defined by a
# previous instruction for the substitution below to see it.
ENV PATH="/usr/local/apptainer/bin:${GO_PATH}/bin:$PATH"
|
||
# Optional proxy settings (build args are per-stage, so they are redeclared here).
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY} \
    https_proxy=${HTTPS_PROXY}
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
# NOTE(review): the root password arrives as a build arg and is kept in ENV, so
# it is presumably readable from the image metadata - consider a BuildKit
# secret mount instead.
ARG ROOT_PASSWD="root"
ENV ROOT_PASSWD=${ROOT_PASSWD} \
    SSH_PORT=2222
WORKDIR /root
# Shell-form RUN instructions from here on are executed by bash.
SHELL ["/bin/bash", "-c"]
|
||
|
||
# base tools: OS packages, pipx/nvitop, SSH server + client configuration, pixi
RUN <<EOT
#!/bin/bash
# Heredoc lines are not implicitly chained with &&: stop at the first failing
# command (and at a failing stage of a pipeline) instead of silently continuing.
set -eo pipefail

apt-get update
apt-get install -y libgl1-mesa-glx bash-completion wget curl htop jq vim bash libaio-dev build-essential openssh-server openssh-client python3 python3-pip python3-venv bzip2
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
add-apt-repository ppa:git-core/ppa -y
apt-get install -y git libnuma-dev wget

pip install pipx
pipx install nvitop
pipx ensurepath
. ~/.bashrc

# Configure SSH for password and public key authentication
mkdir -p ~/.ssh
# Create (or overwrite) the per-user client config ~/.ssh/config:
# - Host *: applies to every host
# - ForwardAgent yes: enable SSH agent forwarding
# - StrictHostKeyChecking no: accept new host keys without prompting
#   (meant for automated environments)
printf "Host * \n ForwardAgent yes\nHost *\n StrictHostKeyChecking no" > ~/.ssh/config

# Server side: keep a backup, then allow root login with password or public key
# and listen on ${SSH_PORT}.
cp /etc/ssh/sshd_config /etc/ssh/sshd_config.bak
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config
sed -i "s/^#Port 22/Port ${SSH_PORT}/" /etc/ssh/sshd_config

# Client side: disable GSSAPI and make ${SSH_PORT} the default port.
sed -i 's/^\(\s*\)GSSAPIAuthentication yes/\1GSSAPIAuthentication no/' /etc/ssh/ssh_config
# The commented default in the stock ssh_config is indented after the '#'
# ("#   Port 22"), so a single-space pattern never matches; accept any number
# of spaces. The build already runs as root, so no sudo is needed.
sed -i "s/^# *Port 22/    Port ${SSH_PORT}/" /etc/ssh/ssh_config

# Generate root's key pair and authorize it for logins to this same image.
ssh-keygen -t rsa -b 4096 -f /root/.ssh/id_rsa -N "" <<< y
# NOTE(review): ~/.ssh/auth does not look like a file OpenSSH reads; kept as in
# the original - confirm whether it is still needed.
cat ~/.ssh/id_rsa.pub >> ~/.ssh/auth
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys2
chmod 600 /root/.ssh/authorized_keys
chmod 600 /root/.ssh/authorized_keys2

# -p: do not fail if the directory already exists.
mkdir -p /var/run/sshd
echo "root:${ROOT_PASSWD}" | chpasswd
mkdir -p ~/.pip

# install pixi
curl -fsSL https://pixi.sh/install.sh | bash
EOT
|
||
|
||
# install NVIDIA DOCA 2.7
|
||
# RUN <<EOT
|
||
# #!/bin/bash
|
||
# wget https://www.mellanox.com/downloads/DOCA/DOCA_v2.7.0/host/doca-host_2.7.0-209000-24.04-ubuntu2204_amd64.deb
|
||
# sudo dpkg -i doca-host_2.7.0-209000-24.04-ubuntu2204_amd64.deb
|
||
# sudo apt-get update
|
||
# sudo apt-get -y install doca-all
|
||
# EOT
|
||
# NVIDIA driver branch; selects which libnvidia-compute-* package is installed.
ARG NV_DRIVER_VERSION="535"
# Install tzdata non-interactively, then the build toolchain, JDKs and the
# user-space NVIDIA compute libraries.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata && \
    apt-get install -y \
        apt-file \
        automake \
        default-jdk \
        dh-make \
        g++ \
        git \
        openjdk-8-jdk \
        libcap2 \
        libnuma-dev \
        libtool \
        # Provide CUDA dependencies by libnvidia-compute*
        libnvidia-compute-${NV_DRIVER_VERSION} \
        make \
        maven \
        pkg-config \
        udev \
        wget \
        environment-modules && \
    # Remove cuda-compat* from nvidia/cuda:x86_64 images, provide CUDA dependencies by libnvidia-compute* instead.
    # The remove/autoremove fallback is grouped so that "||" only covers the
    # removal step; ungrouped, it would also swallow a failure of the
    # apt-get update/install commands above. The globs are quoted so apt-get,
    # not the shell, expands them.
    { apt-get remove -y 'openjdk-11-*' 'cuda-compat*' || apt-get autoremove -y; }
|
||
|
||
# Mellanox OFED user-space stack
# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
ARG MLNX_OFED_VERSION="23.10-3.2.2.0"
ENV MLNX_OFED_VERSION=${MLNX_OFED_VERSION}
RUN <<EOT
#!/bin/bash
# Stop on the first failure, including a failed download feeding the tar pipe.
set -eo pipefail

apt-get update
apt-get install -y libnvidia-compute-${NV_DRIVER_VERSION}
apt-get install -y automake swig pciutils libltdl-dev libnl-3-dev libfuse2 chrpath graphviz libgfortran5 libusb-1.0-0 tcl debhelper libpci3 pkg-config udev autoconf lsof libmnl0 gfortran libnl-route-3-200 tk kmod ethtool bison flex m4 libnl-route-3-dev

# install Mellanox OFED: stream the tarball straight into tar, then run the
# bundled installer from inside the extracted directory.
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
./mlnxofedinstall --user-space-only --without-fw-update --skip-distro-check --without-ucx --without-hcoll --without-openmpi --without-mpich --without-sharp --all --force -q

# Step back out first: run from inside the extracted directory, the glob below
# matches nothing and the installer tree would stay in the image.
cd ..
rm -rf MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
EOT
|
||
# Prepend the CUDA include and library directories to the header, runtime
# library and link-time library search paths.
# Written in key=value form; the space-separated "ENV key value" form is legacy.
ENV CPATH=/usr/local/cuda/include:${CPATH}
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH}
ENV LIBRARY_PATH=/usr/local/cuda/lib64:${LIBRARY_PATH}
|
||
|
||
# install nv_peer_mem
# NV_PEER_MEM_TAG is the git tag and Debian package version ("<version>-0").
ENV NV_PEER_MEM_VERSION=1.2
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
RUN <<EOT
#!/bin/bash
# Fetch the tagged sources and run the upstream packaging script.
git clone --branch ${NV_PEER_MEM_TAG} https://github.com/Mellanox/nv_peer_memory.git /root/nv_peer_memory
cd /root/nv_peer_memory
./build_module.sh

# The following steps expect the .orig tarball under /tmp; unpack it there.
tar xzf /tmp/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz -C /tmp
cd /tmp/nvidia-peer-memory-${NV_PEER_MEM_VERSION}

# Build the Debian package and install the resulting .deb from /tmp.
apt-get update
apt-get install -y dkms debhelper autotools-dev
dpkg-buildpackage -us -uc
dpkg -i /tmp/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
EOT
|
||
|
||
# install cutlass https://github.com/NVIDIA/cutlass
# GPU model -> architecture:
#   H100: Hopper (cutlass then needs: cmake .. -DCUTLASS_NVCC_ARCHS="90a")
#   A100: Ampere
#   V100: Volta
#   T4:   Turing
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
# Values accepted in DCUTLASS_NVCC_ARCHS:
#   70:  NVIDIA Volta (e.g. Tesla V100)
#   75:  NVIDIA Turing (e.g. Tesla T4)
#   80:  NVIDIA Ampere (e.g. A100)
#   90a: NVIDIA Hopper (e.g. H100)
#   89:  GeForce RTX 4090
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
RUN <<EOT
#!/bin/bash
git clone https://github.com/NVIDIA/cutlass
cd cutlass
git checkout .
git checkout master

# Configure into ./build for the architectures in DCUTLASS_NVCC_ARCHS, with the
# tests disabled and unity builds enabled.
cmake -S . -B build "-DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}" -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON

# Build and install with one job per CPU.
cd build
make -j"$(nproc)" install
cd ..
# make test_unit -j"$(nproc)"
# make test_unit_gemm_warp -j"$(nproc)"
EOT
|
||
|
||
# install ucx
# https://github.com/openucx/ucx
# OpenMPI and OpenSHMEM installation with UCX
# https://github.com/openucx/ucx/wiki/OpenMPI-and-OpenSHMEM-installation-with-UCX
# https://openucx.readthedocs.io/en/master
# Running in Docker containers
# https://openucx.readthedocs.io/en/master/running.html#running-in-docker-containers
ENV UCX_HOME=/usr/local/ucx
ENV PATH=${CUDA_HOME}/bin:${UCX_HOME}/bin:$PATH
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${UCX_HOME}/lib:$LD_LIBRARY_PATH
RUN <<EOT
#!/bin/bash
# Stop at the first failing step instead of carrying on with a broken tree.
set -e

# Debugging tools and build prerequisites. The build runs as root, so sudo is
# not used: it adds nothing and, with sudo's usual environment reset, would
# presumably drop the http_proxy/https_proxy variables set for the build.
apt-get update
apt-get install -y gdb valgrind
apt-get install -y build-essential libnuma-dev pkg-config libfuse3-dev
# sudo apt install -y openmpi-bin openmpi-common openmpi-doc openmpi-debug libopenmpi-dev
# sudo apt install -y libucx0-dbg libucs0-dbg libucm0-dbg libuct0-dbg libibverbs1-dbg librdmacm1-dbg libmlx5-1-dbg

git clone https://github.com/openucx/ucx.git
cd ucx
# git checkout v1.15.0
git checkout master
./autogen.sh
mkdir build
cd build
# make clean
# make distclean
# Release (performance) configuration: ../contrib/configure-release --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
# Debug/development configuration:     ../contrib/configure-devel --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
# default ../configure --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
# ../contrib/configure-release --prefix=${UCX_HOME} --with-cuda=${CUDA_HOME} --with-gdrcopy=/usr/local/gdrcopy
# ../contrib/configure-release --prefix=/usr/local/ucx \
# --with-cuda= /usr/local/cuda-12.5 \
# --with-mlx5 \
# --with-rc \
# --with-ud \
# --with-dc \
# --with-dm \
# --with-verbs
../contrib/configure-release --prefix=${UCX_HOME} \
    --with-cuda=/usr/local/cuda \
    --with-mlx5 \
    --with-go=/usr/local/go \
    --with-rc \
    --with-ud \
    --with-dc \
    --with-dm \
    --with-verbs
make -j$(nproc)
make install

# ucx_info -a
# Performance test:
# ucx_perftest -d <device> -t bw -p <protocol> -n <num_iterations>
# Test that UCX reads its configuration:
# ucx_read_profile
# Check UCX with an MPI job:
# mpirun -np 2 -mca pml ucx -x UCX_NET_DEVICES=mlx5_0:1 ./your_mpi_program

# CUDA support check: print the configuration and the detected devices.
ucx_info -c
ucx_info -d
# ompi_info | grep ucx
EOT
|
||
|
||
# # mpich install with ucx
|
||
# # UCX is already embedded in the MPICH tarball, so you do not need to separately download UCX.
|
||
# ENV MPICH_HOME=/opt/mpich
|
||
# RUN <<EOT
|
||
# #!/bin/bash
|
||
# wget -c https://www.mpich.org/static/downloads/4.2.2/mpich-4.2.2.tar.gz
|
||
# cd mpich-4.2.2
|
||
# mkdir build
|
||
# cd build
|
||
# ../configure --prefix=${MPICH_HOME} --with-device=ch4:ucx --with-cuda=/usr/local/cuda
|
||
# make -j$(nproc)
|
||
# make install
|
||
# pip install mpi4py
|
||
# EOT
|
||
|
||
# Build and install Open MPI and OpenSHMEM
# Open MPI runtime tuning:
# by default Open MPI enables its built-in transports (BTLs), which can add
# software overhead. Some BTLs can be disabled:
# $ mpirun -np 2 -mca pml ucx --mca btl ^vader,tcp,openib,uct -x UCX_NET_DEVICES=mlx5_0:1 ./app
# Running UCX on Cray systems:
# on Cray systems UCX unified mode has to be enabled explicitly:
# $ mpirun -np 2 -mca pml ucx --mca btl ^vader,tcp,openib,uct -x UCX_UNIFIED_MODE=1 ./app
# UCX is a communication library for high-performance computing that optimizes
# performance by supporting several transports (such as IB, RoCE and TCP).
# Building Open MPI: https://docs.open-mpi.org/en/v5.0.x/tuning-apps/networking/cuda.html
# https://cuterwrite.top/p/openmpi-with-ucx/
# http://github.com/openucx/ucx/wiki/OpenMPI-and-OpenSHMEM-installation-with-UCX
ENV MPI_HOME=/usr/local/openmpi
ENV PATH=${MPI_HOME}/bin:/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:${MPI_HOME}/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=/usr/local/cuda/lib64:${LIBRARY_PATH}
ENV CPATH=/usr/local/cuda/include:${MPI_HOME}/include:${CUDA_HOME}/include:$CPATH
# export C_INCLUDE_PATH=/usr/local/cuda/include:$C_INCLUDE_PATH
# export LIBRARY_PATH=/usr/local/cuda/lib64:$LIBRARY_PATH
# export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
RUN <<EOT
#!/bin/bash
# apt-get rather than apt: apt's command-line interface is meant for
# interactive use and is not guaranteed stable for scripts.
apt-get update && apt-get install -y autoconf automake libtool flex
/usr/bin/python3 -m pip install cython
# git clone https://github.com/open-mpi/ompi.git
# git submodule update --init --recursive
git clone --recursive https://github.com/open-mpi/ompi.git
cd ompi
git checkout main
# make clean
# make distclean
./autogen.pl
mkdir build
cd build
# NOTE With OpenMPI 4.0 and above, there could be compilation errors from the
# "btl_uct" component. This component is not critical for using UCX, so it can
# be disabled at configure time:
# ./configure ... --enable-mca-no-build=btl-uct ...
# or at run time:
# mpirun -np 2 -mca pml ucx -mca btl ^uct -x UCX_NET_DEVICES=mlx5_0:1 ./app
# When CUDA is supported through UCX (the preferred mechanism), --with-cuda is
# not required when building Open MPI.
# Check whether UCX supports CUDA:
# ucx_info -v
# ../configure --prefix=${MPI_HOME} --with-ucx=${UCX_HOME} --with-cuda=${CUDA_HOME} --enable-mca-no-build=btl-uct
# Options used below:
# --without-hcoll: disable HCOLL (collective communication library).
# --enable-python-bindings: enable the Python bindings, typically for
#   integration with libraries such as mpi4py.
# --enable-mpirun-prefix-by-default: make mpirun use its prefix option by default.
# --prefix=${MPI_HOME}: installation path.
# --with-ucx=${UCX_HOME}: UCX library path.
# --with-cuda=${CUDA_HOME}: CUDA library path.
# --enable-mca-dso=btl-smcuda,rcache-rgpusm,rcache-gpusm,accelerator-cuda:
#   build these MCA (Modular Component Architecture) components as dynamic
#   shared objects.
# --enable-mca-no-build=btl-uct: do not build the UCT (UCX transport) BTL.
# -x UCX_LOG_LEVEL=debug (mpirun): debug UCX.
# Note: if UCX was built with CUDA support (--with-cuda), it already includes
# CUDA RDMA support; --with-cuda may then be left out when building Open MPI,
# which picks up CUDA support through UCX.
../configure \
    --with-cuda=/usr/local/cuda \
    --without-hcoll \
    --enable-python-bindings \
    --enable-mpirun-prefix-by-default \
    --prefix=${MPI_HOME} \
    --with-ucx=${UCX_HOME} \
    --enable-mca-dso=btl-smcuda,rcache-rgpusm,rcache-gpusm,accelerator-cuda \
    --enable-mca-no-build=btl-uct \
    --with-python=/usr/bin/python3
make -j$(nproc)
make install
# Verify CUDA support:
# ompi_info | grep "MPI extensions"
# ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
# Enable CUDA debug output when running MPI programs:
# mpirun --mca opal_cuda_verbose 10 ...
# mpirun --mca mpi_common_cuda_verbose 10 ...
# ompi_info | grep cuda
# ucx_info -c
# Check Open MPI's UCX support with the command below.
# Seeing "btl: smcuda" and other CUDA-related extensions in `ompi_info | grep cuda`
# does not by itself mean that every CUDA-related feature works.
# ompi_info | grep ucx
# unit test in mpi cuda
EOT
|
||
|
||
# install ninja (built from source, including its unit tests)
RUN <<EOT
#!/bin/bash
# Clone the ninja sources
git clone https://github.com/ninja-build/ninja.git ninja
cd ninja
# Clone the GoogleTest sources used by the ninja tests
git clone https://github.com/google/googletest.git

# Plain bootstrap build. Uses /usr/bin/python3 like every other step in this
# file; the package lists in this file install python3 only, so a bare
# `python` command is presumably not available.
/usr/bin/python3 ./configure.py --bootstrap
# Configure and build again with the ninja tests enabled, adding the pthread
# compile/link flags.
CXXFLAGS="-pthread" LDFLAGS="-pthread" /usr/bin/python3 ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest
# conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest"
./ninja all

# Put the freshly built binary on PATH; without this step it only exists
# inside the source checkout.
install -m 0755 ninja /usr/local/bin/ninja

# Run the ninja unit tests (last command, so its status decides this step)
./ninja_test
EOT
|
||
|
||
# install apex
RUN <<EOT
#!/bin/bash
# No conda environment is sourced or activated here: nothing in this file
# installs conda (the Miniconda step is commented out) or defines
# CONDA_ENV_NAME, and every command below calls /usr/bin/python3 explicitly.
# NOTE(review): with --no-build-isolation the apex build presumably needs
# PyTorch already importable by /usr/bin/python3; no torch install is visible
# before this step - confirm.
git clone https://github.com/NVIDIA/apex
cd apex
# if pip >= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key...
MAX_JOBS=1 /usr/bin/python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# Smoke test: the amp module must be importable.
/usr/bin/python3 -c "import apex.amp; print('Apex is installed and the amp module is available.')"
# Remove the source checkout again.
cd ..
rm -rf apex
EOT
|
||
|
||
# install colossalai
# COLOSSALAI_VERSION selects the git tag "v<version>" to build.
ARG COLOSSALAI_VERSION="0.4.2"
ENV COLOSSALAI_VERSION=${COLOSSALAI_VERSION}
RUN <<EOT
#!/bin/bash
# Fetch the sources and switch to the requested release tag.
git clone https://github.com/hpcaitech/ColossalAI.git
git -C ColossalAI checkout v${COLOSSALAI_VERSION}
cd ColossalAI

# Python dependencies first, then ColossalAI itself with BUILD_EXT=1 set.
/usr/bin/python3 -m pip install -r requirements/requirements.txt
BUILD_EXT=1 /usr/bin/python3 -m pip install .
colossalai check -i

# TensorNVMe, installed straight from its git repository.
/usr/bin/python3 -m pip install -v git+https://github.com/hpcaitech/TensorNVMe.git
EOT
|
||
|
||
# ARG CONDA_ENV_NAME="mineru"
|
||
# ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
|
||
# ARG PYTHON_VERSION="3.10"
|
||
# ENV PYTHON_VERSION=${PYTHON_VERSION}
|
||
# # https://github.com/opendatalab/PDF-Extract-Kit
|
||
# RUN <<EOT
|
||
# #!/bin/bash
|
||
# # install miniconda
|
||
# wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
|
||
# bash /tmp/miniconda.sh -b -p /opt/conda
|
||
# rm /tmp/miniconda.sh
|
||
# ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
|
||
# echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
|
||
# . /opt/conda/etc/profile.d/conda.sh
|
||
# conda init bash
|
||
# conda config --set show_channel_urls true
|
||
# # 配置 .condarc 文件
|
||
# cat <<EOF > ~/.condarc
|
||
# channels:
|
||
# - conda-forge
|
||
# - bioconda
|
||
# - pytorch
|
||
# - pytorch-nightly
|
||
# - nvidia
|
||
# - defaults
|
||
# show_channel_urls: true
|
||
# EOF
|
||
# source /opt/conda/etc/profile.d/conda.sh
|
||
# conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} -y
|
||
# conda activate ${CONDA_ENV_NAME}
|
||
# EOT
|
||
|
||
# Put the CUDA binaries first on PATH.
ENV PATH="/usr/local/cuda/bin:${PATH}"
|
||
# Add the deepspeed user
|
||
# RUN <<EOT
|
||
# #!/bin/bash
|
||
# useradd --create-home --uid 1000 --shell /bin/bash deepspeed
|
||
# usermod -aG sudo deepspeed
|
||
# echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
|
||
# EOT
|
||
|
||
# # # Change to non-root privilege
|
||
# USER deepspeed
|
||
|
||
# Clear the apt cache and delete the downloaded package lists.
RUN apt-get clean && rm -rf /var/lib/apt/lists/*
|
||
|
||
# Document the SSH port; taken from SSH_PORT so it follows the sshd configuration.
EXPOSE ${SSH_PORT}

# Run sshd in the foreground as the container's main process (exec form).
CMD ["/usr/sbin/sshd", "-D"]