add openucx and openmpi

2024-10-17 13:27:37 +08:00
parent 68f7ecbf2d
commit 9e0469c9d3
1 changed file with 107 additions and 8 deletions
--- a/Dockerfile.gromacs_amber
+++ b/Dockerfile.gromacs_amber
@@ -60,6 +60,7 @@ echo "Asia/Shanghai" > /etc/timezone
 # curl -fsSL https://pixi.sh/install.sh | bash
 EOT

+ENV CUDA_HOME=/usr/local/cuda
 ENV PATH=/opt/modules/bin:$PATH
 ENV LIBRARY_PATH=/opt/modules/lib:$LIBRARY_PATH
 COPY ./file/modules-5.4.0.tar.gz /root
@@ -74,6 +75,8 @@ cd modules-5.4.0
 ./configure --prefix=/opt/modules --bindir=/opt/modules/bin --libdir=/opt/modules/lib --disable-libtclenvmodules
 make -j$(nproc)
 make install
+echo "source /opt/modules/init/profile.sh" >> /etc/profile
+echo "source /opt/modules/init/profile.sh" >> ~/.bashrc
 # /opt/modules/bin/modulecmd
 EOT

@@ -91,10 +94,74 @@ make -j$(nproc)
 make install
 EOT

+# Build and install UCX (OpenUCX) from source with CUDA support
+# https://github.com/openucx/ucx
+# OpenMPI and OpenSHMEM installation with UCX
+# https://github.com/openucx/ucx/wiki/OpenMPI-and-OpenSHMEM-installation-with-UCX
+# https://openucx.readthedocs.io/en/master
+# Running in Docker containers
+# https://openucx.readthedocs.io/en/master/running.html#running-in-docker-containers
+ENV UCX_HOME=/usr/local/ucx
+ENV PATH=${CUDA_HOME}/bin:${UCX_HOME}/bin:$PATH
+ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${UCX_HOME}/lib:$LD_LIBRARY_PATH
+RUN <<EOT
+#!/bin/bash
+set -e  # stop at the first failing step instead of continuing on a broken tree
+# Debugging tools (gdb, valgrind) plus the UCX build dependencies.
+# No sudo: the later apt step in this Dockerfile also calls apt directly.
+apt-get update
+apt-get install -y gdb valgrind build-essential libnuma-dev pkg-config libfuse3-dev
+# apt install -y openmpi-bin openmpi-common openmpi-doc openmpi-debug libopenmpi-dev
+# apt install -y libucx0-dbg libucs0-dbg libucm0-dbg libuct0-dbg libibverbs1-dbg librdmacm1-dbg libmlx5-1-dbg
+git clone https://github.com/openucx/ucx.git
+cd ucx
+# git checkout v1.15.0
+git checkout master  # unpinned branch; pin a release tag for reproducible builds
+./autogen.sh
+mkdir build
+cd build
+# make clean
+# make distclean
+# Release (optimized) configuration: ../contrib/configure-release --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
+# Debug/development configuration:   ../contrib/configure-devel --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
+# Default configuration:             ../configure --prefix=/usr/local/ucx --with-cuda=${CUDA_HOME}
+# ../contrib/configure-release --prefix=${UCX_HOME} --with-cuda=${CUDA_HOME} --with-gdrcopy=/usr/local/gdrcopy
+# ../contrib/configure-release --prefix=/usr/local/ucx \
+# --with-cuda=/usr/local/cuda-12.5 \
+# --with-mlx5 \
+# --with-rc \
+# --with-ud \
+# --with-dc \
+# --with-dm \
+# --with-verbs \
+# --with-go=/usr/local/go \
+# --with-mlx5
+../contrib/configure-release --prefix="${UCX_HOME}" \
+--with-cuda="${CUDA_HOME}" \
+--with-rc \
+--with-ud \
+--with-dc \
+--with-dm \
+--with-verbs
+make -j"$(nproc)"
+make install
+# ucx_info -a
+# Performance test:
+# ucx_perftest -d <device> -t bw -p <protocol> -n <num_iterations>
+# Test UCX configuration reading:
+# ucx_read_profile
+# Check UCX processes:
+# mpirun -np 2 -mca pml ucx -x UCX_NET_DEVICES=mlx5_0:1 ./your_mpi_program
+# CUDA support check
+ucx_info -c
+ucx_info -d
+# ompi_info | grep ucx
+EOT
+
 # 安装openmpi
 ENV MPI_HOME=/usr/local/openmpi
 ENV PATH=${MPI_HOME}/bin:/usr/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:${MPI_HOME}/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${MPI_HOME}/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
 ENV LIBRARY_PATH=/usr/local/cuda/lib64:${LIBRARY_PATH}
 ENV CPATH=/usr/local/cuda/include:${MPI_HOME}/include:${CUDA_HOME}/include:$CPATH
 # export C_INCLUDE_PATH=/usr/local/cuda/include:$C_INCLUDE_PATH
@@ -102,7 +169,7 @@ ENV CPATH=/usr/local/cuda/include:${MPI_HOME}/include:${CUDA_HOME}/include:$CPAT
 # export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 RUN <<EOT
 #!/bin/bash
-apt update &&  apt install -y autoconf automake libtool flex
+apt update &&  apt install -y autoconf automake libtool flex gfortran
 /usr/bin/python3 -m pip install cython
 git clone --recursive https://github.com/open-mpi/ompi.git
 cd ompi
@@ -112,7 +179,10 @@ git checkout main
 ./autogen.pl
 mkdir build
 cd build
-../configure --with-cuda=/usr/local/cuda --enable-python-bindings --enable-mpirun-prefix-by-default --prefix=${MPI_HOME} --with-python=/usr/bin/python3
+# ../configure --with-cuda=/usr/local/cuda --enable-python-bindings --enable-mpirun-prefix-by-default --prefix=${MPI_HOME} --with-python=/usr/bin/python3 FC=gfortran
+# ../configure FC=gfortran PYTHON=/usr/bin/python3 --with-cuda=/usr/local/cuda --with-cuda-libdir=/usr/local/cuda/lib64 --enable-python-bindings --enable-mpirun-prefix-by-default --prefix=${MPI_HOME} --with-ucx=${UCX_HOME}
+../configure FC=gfortran PYTHON=/usr/bin/python3 --with-cuda=/usr/local/cuda --with-cuda-libdir=/usr/local/cuda/lib64 --enable-python-bindings \
+ --enable-mpirun-prefix-by-default --prefix=${MPI_HOME} --with-ucx=${UCX_HOME} --enable-mca-dso=btl-smcuda,rcache-rgpusm,rcache-gpusm,accelerator-cuda --enable-mca-no-build=btl-uct
 make -j$(nproc)
 make install
 # 验证CUDA支持
@@ -143,6 +213,7 @@ int main(int argc, char **argv) {
 EOF
 nvcc -o test_mpi_cuda test_mpi_cuda.cu -I${CUDA_HOME}/include -I${MPI_HOME}/include -L${MPI_HOME}/lib -lcudart -lmpi
 # mpirun --allow-run-as-root -np 2 ./test_mpi_cuda
+ompi_info | grep "MPI extensions"
 EOT

 # 安装plumed
@@ -213,25 +284,53 @@ COPY file/Amber24.tar.bz2 file/AmberTools24.tar.bz2 /root
 COPY file/l_HPCKit_p_2024.2.1.79_offline.sh file/l_onemkl_p_2024.2.2.17_offline.sh /root
 COPY file/boost_1_86_0.tar.gz /root
 ENV DOWNLOAD_MINICONDA="False"
-# install ambertools
+# install Intel HPCKit and oneMKL from the offline installers copied to /root above
 RUN <<EOT
 #!/bin/bash
-python3 -m pip install numpy scipy matplotlib
 chmod +x l_HPCKit_p_2024.2.1.79_offline.sh
 ./l_HPCKit_p_2024.2.1.79_offline.sh -a --silent --eula accept --install-dir /opt/intel
 chmod +x l_onemkl_p_2024.2.2.17_offline.sh
 ./l_onemkl_p_2024.2.2.17_offline.sh -a --silent --eula accept --install-dir /opt/intel/onemkl
+# echo "source /opt/intel/setvars.sh" >> /etc/profile
+# echo "source /opt/intel/onemkl/setvars.sh" >> /etc/profile
+echo "source /opt/intel/setvars.sh" >> ~/.bashrc
+echo "source /opt/intel/onemkl/setvars.sh --force" >> ~/.bashrc  # --force: a second setvars.sh is skipped after the first has run
+mkdir -p /opt/modulefiles/intel  # directory for Intel modulefiles (none are written in this step)
+chmod +x /opt/intel/setvars.sh
+chmod +x /opt/intel/onemkl/setvars.sh
+# curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+# echo  "source $HOME/.cargo/env" >> ~/.bashrc
+# cargo install modenv
+EOT
+# install ambertools prerequisites: build Boost (with MPI), publish its modulefile, unpack Amber24/AmberTools24
+ENV MODULEPATH=/opt/modulefiles:$MODULEPATH
+RUN <<EOT
+#!/bin/bash
+python3 -m pip install numpy scipy matplotlib cython setuptools
 # install Boost from https://www.boost.org/users/download/
 tar zxvf boost_1_86_0.tar.gz
 cd boost_1_86_0
 ./bootstrap.sh --prefix=/opt/boost --with-libraries=all --with-toolset=gcc
 echo "using mpi : /usr/local/openmpi/bin/mpicxx ;" >> project-config.jam
-# echo "using mpi : /opt/intel/mpi/2021.13/bin/mpicxx ;" >> project-config.jam
+# # echo "using mpi : /opt/intel/mpi/2021.13/bin/mpicxx ;" >> project-config.jam
 ./b2 -j$(nproc) --layout=tagged link=static,shared threading=multi install
+mkdir -p /opt/modulefiles/boost
+# Load Boost through its modulefile (resolved relative to /opt/modulefiles in MODULEPATH):
+#   module load boost/1.86.0-openmpi-5.1.0a1 ; module list ; module avail
+cat << EOF > /opt/modulefiles/boost/1.86.0-openmpi-5.1.0a1
+#%Module1.0
+set prefix /opt/boost
+
+# 设置库路径和头文件路径，方便编译器找到 Boost
+prepend-path LD_LIBRARY_PATH \$prefix/lib
+prepend-path CPATH \$prefix/include
+prepend-path LIBRARY_PATH \$prefix/lib
+prepend-path PATH \$prefix/bin
+EOF
 # 解压 Amber24
-tar -xjf Amber24.tar.bz2
+tar -xjvf ../Amber24.tar.bz2 -C ..  # cwd is boost_1_86_0; the tarball sits one level up
 # 解压 AmberTools24
-tar -xjf AmberTools24.tar.bz2
+tar -xjvf ../AmberTools24.tar.bz2 -C ..  # extract next to the tarball, not inside the Boost tree
 # 清理解压后的 .tar.bz2 文件（可选）
 # rm Amber24.tar.bz2 AmberTools24.tar.bz2
 EOT