add openucx and openmpi

This commit is contained in:
root
2024-10-17 13:27:37 +08:00
parent 68f7ecbf2d
commit 9e0469c9d3

View File

@@ -60,6 +60,7 @@ echo "Asia/Shanghai" > /etc/timezone
# curl -fsSL https://pixi.sh/install.sh | bash
EOT
# Root of the CUDA toolkit; later ENV lines derive their CUDA paths from it.
ENV CUDA_HOME=/usr/local/cuda
# Put the binaries and libraries installed under /opt/modules ahead of the
# existing search paths.
ENV PATH=/opt/modules/bin:$PATH
ENV LIBRARY_PATH=/opt/modules/lib:$LIBRARY_PATH
# Stage the modules 5.4.0 source tarball in /root.
COPY ./file/modules-5.4.0.tar.gz /root
@@ -74,6 +75,8 @@ cd modules-5.4.0
./configure --prefix=/opt/modules --bindir=/opt/modules/bin --libdir=/opt/modules/lib --disable-libtclenvmodules
make -j$(nproc)
make install
echo "source /opt/modules/init/profile.sh" >> /etc/profile
echo "source /opt/modules/init/profile.sh" >> ~/.bashrc
# /opt/modules/bin/modulecmd
EOT
@@ -91,10 +94,74 @@ make -j$(nproc)
make install
EOT
# install ucx
# https://github.com/openucx/ucx
# OpenMPI and OpenSHMEM installation with UCX
# https://github.com/openucx/ucx/wiki/OpenMPI-and-OpenSHMEM-installation-with-UCX
# https://openucx.readthedocs.io/en/master
# Running in Docker containers
# https://openucx.readthedocs.io/en/master/running.html#running-in-docker-containers
# UCX install prefix; also used as the configure --prefix below.
ENV UCX_HOME=/usr/local/ucx
ENV PATH=${CUDA_HOME}/bin:${UCX_HOME}/bin:$PATH
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${UCX_HOME}/lib:$LD_LIBRARY_PATH
# Build UCX from source (release configuration) with CUDA and the
# verbs/RC/UD/DC/DM transports enabled, then install it into ${UCX_HOME}.
RUN <<EOT
#!/bin/bash
# Stop at the first failing command: heredoc lines are not implicitly chained,
# so without this a failed clone/configure/make would go unnoticed.
set -e
# Debugging tools plus UCX build prerequisites.
# The build already runs as root (other steps call apt directly), so no sudo.
apt-get update
apt-get install -y \
    build-essential \
    gdb \
    libfuse3-dev \
    libnuma-dev \
    pkg-config \
    valgrind
# Distro alternatives, kept for reference:
# apt-get install -y openmpi-bin openmpi-common openmpi-doc openmpi-debug libopenmpi-dev
# apt-get install -y libucx0-dbg libucs0-dbg libucm0-dbg libuct0-dbg libibverbs1-dbg librdmacm1-dbg libmlx5-1-dbg
# NOTE(review): autogen.sh needs autoconf/automake/libtool; this block does not
# install them, so they are presumably provided by an earlier step - confirm.
git clone https://github.com/openucx/ucx.git
cd ucx
# Tracks the tip of master, so rebuilds are not reproducible; pin a release
# tag (e.g. the commented one below) for a stable image.
# git checkout v1.15.0
git checkout master
./autogen.sh
mkdir build
cd build
# make clean
# make distclean
# Available configurations:
#   performance-optimized: ../contrib/configure-release --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
#   debug/development:     ../contrib/configure-devel --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
#   default:               ../configure --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
# Optional extras tried previously:
#   --with-gdrcopy=/usr/local/gdrcopy
#   --with-mlx5
#   --with-go=/usr/local/go
../contrib/configure-release --prefix="${UCX_HOME}" \
    --with-cuda="${CUDA_HOME}" \
    --with-rc \
    --with-ud \
    --with-dc \
    --with-dm \
    --with-verbs
make -j"$(nproc)"
make install
# Useful checks once the image is running:
#   ucx_info -a
#   performance test:
#     ucx_perftest -d <device> -t bw -p <protocol> -n <num_iterations>
#   read the UCX profile:
#     ucx_read_profile
#   run an MPI program over UCX:
#     mpirun -np 2 -mca pml ucx -x UCX_NET_DEVICES=mlx5_0:1 ./your_mpi_program
# Print the effective configuration and the detected devices/transports
# (CUDA support check).
ucx_info -c
ucx_info -d
# ompi_info | grep ucx
EOT
# Install OpenMPI: install prefix and the search paths that depend on it.
ENV MPI_HOME=/usr/local/openmpi
ENV PATH=${MPI_HOME}/bin:/usr/bin:$PATH
# Set exactly once; a second identical assignment only prepended the same
# three directories again.
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${MPI_HOME}/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=${CUDA_HOME}/lib64:${LIBRARY_PATH}
# CUDA headers first, then the OpenMPI headers; the CUDA include directory is
# listed once instead of under two spellings of the same path.
ENV CPATH=${CUDA_HOME}/include:${MPI_HOME}/include:$CPATH
# export C_INCLUDE_PATH=/usr/local/cuda/include:$C_INCLUDE_PATH
@@ -102,7 +169,7 @@ ENV CPATH=/usr/local/cuda/include:${MPI_HOME}/include:${CUDA_HOME}/include:$CPAT
# export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
RUN <<EOT
#!/bin/bash
apt update && apt install -y autoconf automake libtool flex
apt update && apt install -y autoconf automake libtool flex gfortran
/usr/bin/python3 -m pip install cython
git clone --recursive https://github.com/open-mpi/ompi.git
cd ompi
@@ -112,7 +179,10 @@ git checkout main
./autogen.pl
mkdir build
cd build
../configure --with-cuda=/usr/local/cuda --enable-python-bindings --enable-mpirun-prefix-by-default --prefix=${MPI_HOME} --with-python=/usr/bin/python3
# ../configure --with-cuda=/usr/local/cuda --enable-python-bindings --enable-mpirun-prefix-by-default --prefix=${MPI_HOME} --with-python=/usr/bin/python3 FC=gfortran
# ../configure FC=gfortran PYTHON=/usr/bin/python3 --with-cuda=/usr/local/cuda --with-cuda-libdir=/usr/local/cuda/lib64 --enable-python-bindings --enable-mpirun-prefix-by-default --prefix=${MPI_HOME} --with-ucx=${UCX_HOME}
../configure FC=gfortran PYTHON=/usr/bin/python3 --with-cuda=/usr/local/cuda --with-cuda-libdir=/usr/local/cuda/lib64 --enable-python-bindings \
--enable-mpirun-prefix-by-default --prefix=${MPI_HOME} --with-ucx=${UCX_HOME} --enable-mca-dso=btl-smcuda,rcache-rgpusm,rcache-gpusm,accelerator-cuda --enable-mca-no-build=btl-uct
make -j$(nproc)
make install
# 验证CUDA支持
@@ -143,6 +213,7 @@ int main(int argc, char **argv) {
EOF
nvcc -o test_mpi_cuda test_mpi_cuda.cu -I${CUDA_HOME}/include -I${MPI_HOME}/include -L${MPI_HOME}/lib -lcudart -lmpi
# mpirun --allow-run-as-root -np 2 ./test_mpi_cuda
ompi_info | grep "MPI extensions"
EOT
# Install PLUMED
@@ -213,25 +284,53 @@ COPY file/Amber24.tar.bz2 file/AmberTools24.tar.bz2 /root
# Stage the HPCKit and oneMKL offline installers and the Boost 1.86.0 source
# archive in /root.
COPY file/l_HPCKit_p_2024.2.1.79_offline.sh file/l_onemkl_p_2024.2.2.17_offline.sh /root
COPY file/boost_1_86_0.tar.gz /root
# NOTE(review): presumably read by the Amber/AmberTools build to skip its own
# Miniconda download - nothing in view consumes it, confirm.
ENV DOWNLOAD_MINICONDA="False"
# install HPCKit and oneMKL
# Runs both offline installers unattended and registers their setvars.sh
# scripts in root's ~/.bashrc.
# NOTE(review): the relative installer paths assume the working directory is
# /root, where the installers were copied - confirm WORKDIR.
RUN <<EOT
#!/bin/bash
python3 -m pip install numpy scipy matplotlib
# HPCKit: silent install with the EULA pre-accepted, into /opt/intel.
chmod +x l_HPCKit_p_2024.2.1.79_offline.sh
./l_HPCKit_p_2024.2.1.79_offline.sh -a --silent --eula accept --install-dir /opt/intel
# oneMKL: same unattended flags, installed into its own prefix.
chmod +x l_onemkl_p_2024.2.2.17_offline.sh
./l_onemkl_p_2024.2.2.17_offline.sh -a --silent --eula accept --install-dir /opt/intel/onemkl
# System-wide activation via /etc/profile is left disabled; only ~/.bashrc
# sources the environment scripts.
# echo "source /opt/intel/setvars.sh" >> /etc/profile
# echo "source /opt/intel/onemkl/setvars.sh" >> /etc/profile
echo "source /opt/intel/setvars.sh" >> ~/.bashrc
echo "source /opt/intel/onemkl/setvars.sh" >> ~/.bashrc
# Directory for Intel modulefiles; this block creates it but writes nothing
# into it.
mkdir -p /opt/modulefiles/intel
chmod +x /opt/intel/setvars.sh
chmod +x /opt/intel/onemkl/setvars.sh
# Disabled: install Rust via rustup and the modenv tool.
# curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
# echo "source $HOME/.cargo/env" >> ~/.bashrc
# cargo install modenv
EOT
# install ambertools
# /opt/modulefiles is the modulefile root, so the module resolves under its
# documented name boost/1.86.0-openmpi-5.1.0a1. The former
# /opt/modulefiles/boost entry is kept so the short name
# 1.86.0-openmpi-5.1.0a1 keeps working.
ENV MODULEPATH=/opt/modulefiles:/opt/modulefiles/boost:$MODULEPATH
# Builds Boost 1.86.0 against OpenMPI into /opt/boost, writes its modulefile,
# then unpacks the Amber24 and AmberTools24 sources.
RUN <<EOT
#!/bin/bash
python3 -m pip install --no-cache-dir numpy scipy matplotlib cython setuptools
# install Boost from https://www.boost.org/users/download/
tar zxvf boost_1_86_0.tar.gz
# Build in a subshell so the working directory is restored afterwards: the
# Amber archives live next to the Boost tarball, not inside boost_1_86_0, and
# a plain "cd" here made the later tar commands look in the wrong directory.
(
    cd boost_1_86_0
    ./bootstrap.sh --prefix=/opt/boost --with-libraries=all --with-toolset=gcc
    # Point Boost.MPI at the OpenMPI compiler wrapper.
    echo "using mpi : /usr/local/openmpi/bin/mpicxx ;" >> project-config.jam
    # Intel MPI alternative:
    # echo "using mpi : /opt/intel/mpi/2021.13/bin/mpicxx ;" >> project-config.jam
    ./b2 -j$(nproc) --layout=tagged link=static,shared threading=multi install
)
mkdir -p /opt/modulefiles/boost
# use modulefile to load boost command is:
# module load boost/1.86.0-openmpi-5.1.0a1 | module list | module avail
cat << EOF > /opt/modulefiles/boost/1.86.0-openmpi-5.1.0a1
#%Module1.0
set prefix /opt/boost
# 设置库路径和头文件路径,方便编译器找到 Boost
prepend-path LD_LIBRARY_PATH \$prefix/lib
prepend-path CPATH \$prefix/include
prepend-path LIBRARY_PATH \$prefix/lib
prepend-path PATH \$prefix/bin
EOF
# Extract Amber24 (once; each archive was previously unpacked twice).
tar -xjvf Amber24.tar.bz2
# Extract AmberTools24
tar -xjvf AmberTools24.tar.bz2
# Optional: remove the .tar.bz2 archives after extraction
# rm Amber24.tar.bz2 AmberTools24.tar.bz2
EOT