From 329036f25b10f380c3ac63117103677049d66747 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 4 Jul 2024 00:26:10 +0000 Subject: [PATCH] split nv_peer_mem and MLNX --- finetune/Dockerfile | 543 ++++++++++++++++++++++---------------------- 1 file changed, 274 insertions(+), 269 deletions(-) diff --git a/finetune/Dockerfile b/finetune/Dockerfile index a4abad2..f115e76 100644 --- a/finetune/Dockerfile +++ b/finetune/Dockerfile @@ -71,8 +71,6 @@ ENV PATH=/opt/conda/envs/${CONDA_ENV_NAME}/bin:/usr/bin:/opt/conda/bin:$PATH ENV DEEPSPEED_PYTHON="/opt/conda/envs/${CONDA_ENV_NAME}/bin/python3" ENV REF='main' ENV STAGE_DIR=/tmp -ENV NV_PEER_MEM_VERSION=1.2 -ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 ENV OPENMPI_BASEVERSION=4.1 ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 ARG CUDA='cu121' @@ -166,295 +164,302 @@ cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 ./mlnxofedinstall --user-space-only --without-fw-update --all -q cd ${STAGE_DIR} rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64* -cd .. -# install nv_peer_mem -rm -rf ${STAGE_DIR} -mkdir -p ${STAGE_DIR} -git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory -cd ${STAGE_DIR}/nv_peer_memory -./build_module.sh -cd ${STAGE_DIR} -tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz -cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} -apt-get update -apt-get install -y dkms -dpkg-buildpackage -us -uc -dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb EOT -# install mpi -ENV PATH=/usr/local/mpi/bin:${PATH} -ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} -RUN < /usr/local/mpi/bin/mpirun -echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun -chmod a+x /usr/local/mpi/bin/mpirun -EOT - -# SSH daemon port inside container cannot conflict with host OS port -# ENV SSH_PORT=2222 +# ENV NV_PEER_MEM_VERSION=1.2 +# ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 # RUN < ${STAGE_DIR}/sshd_config && \ -# sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config +# # install nv_peer_mem +# rm -rf ${STAGE_DIR} +# mkdir -p ${STAGE_DIR} +# git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory +# cd ${STAGE_DIR}/nv_peer_memory +# ./build_module.sh +# cd ${STAGE_DIR} +# tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz +# cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} +# apt-get update +# apt-get install -y dkms +# dpkg-buildpackage -us -uc +# dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb # EOT -# 29.78 Usage: install.sh [options...] -# 29.78 -# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in -# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally -# 29.78 -# 29.78 [optional] -# 29.78 -l, --local_only Install only on local machine -# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) -# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) -# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels -# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) -# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) -# 29.78 -e, --examples Checkout deepspeed example submodule (no install) -# 29.78 -v, --verbose Verbose logging -# 29.78 -h, --help This help text - -RUN <> /etc/sudoers -EOT - -# install cutlass https://github.com/NVIDIA/cutlass -# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) -# A100: architecture is Ampere -# V100: architecture is Volta -# T4: architecture is Turing -# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc -# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 -# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 -# 80:适用于 NVIDIA Ampere 架构(如 A100)。 -# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 -# 89:GeForce RTX 4090 -ARG DCUTLASS_NVCC_ARCHS="80;89;90a" -ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -RUN < /usr/local/mpi/bin/mpirun +# echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun +# chmod a+x /usr/local/mpi/bin/mpirun # EOT -# install deepspeed step 1 -RUN < ${STAGE_DIR}/sshd_config && \ +# # sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config +# # EOT -# install deepspeed step 2 -ARG CUDA_ARCH_LIST="80;86;89;90" -ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST} -RUN < install_modified.sh -chmod +x ./install_modified.sh -# 检查 HOSTFILE_CONTENT 并写入文件 -if [ -n "${HOSTFILE_CONTENT}" ]; then - echo "${HOSTFILE_CONTENT}" > /tmp/hostfile - INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile" -else - INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}" -fi -eval $INSTALL_CMD -# compile deepspeed ops -cat < ~/compile_deepspeed_ops.py -import deepspeed +# RUN <> /etc/sudoers +# EOT -def compile_ops(): - builders = [ - deepspeed.ops.op_builder.AsyncIOBuilder, - deepspeed.ops.op_builder.FusedAdamBuilder, - deepspeed.ops.op_builder.CPUAdamBuilder, - deepspeed.ops.op_builder.CPUAdagradBuilder, - deepspeed.ops.op_builder.CPULionBuilder, - deepspeed.ops.op_builder.EvoformerAttnBuilder, - deepspeed.ops.op_builder.FPQuantizerBuilder, - deepspeed.ops.op_builder.FusedLambBuilder, - deepspeed.ops.op_builder.FusedLionBuilder, - deepspeed.ops.op_builder.QuantizerBuilder, - deepspeed.ops.op_builder.RaggedOpsBuilder, - deepspeed.ops.op_builder.RandomLTDBuilder, - deepspeed.ops.op_builder.SparseAttnBuilder, - deepspeed.ops.op_builder.SpatialInferenceBuilder, - deepspeed.ops.op_builder.TransformerBuilder, - deepspeed.ops.op_builder.StochasticTransformerBuilder, - ] +# # install cutlass https://github.com/NVIDIA/cutlass +# # H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) +# # A100: architecture is Ampere +# # V100: architecture is Volta +# # T4: architecture is Turing +# # ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc +# # 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 +# # 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 +# # 80:适用于 NVIDIA Ampere 架构(如 A100)。 +# # 90a:适用于 NVIDIA Hopper 架构(如 H100)。 +# # 89:GeForce RTX 4090 +# ARG DCUTLASS_NVCC_ARCHS="80;89;90a" +# ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} +# RUN < install_modified.sh +# chmod +x ./install_modified.sh +# # 检查 HOSTFILE_CONTENT 并写入文件 +# if [ -n "${HOSTFILE_CONTENT}" ]; then +# echo "${HOSTFILE_CONTENT}" > /tmp/hostfile +# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile" +# else +# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}" +# fi +# eval $INSTALL_CMD +# # compile deepspeed ops +# cat < ~/compile_deepspeed_ops.py +# import deepspeed + +# def compile_ops(): +# builders = [ +# deepspeed.ops.op_builder.AsyncIOBuilder, +# deepspeed.ops.op_builder.FusedAdamBuilder, +# deepspeed.ops.op_builder.CPUAdamBuilder, +# deepspeed.ops.op_builder.CPUAdagradBuilder, +# deepspeed.ops.op_builder.CPULionBuilder, +# deepspeed.ops.op_builder.EvoformerAttnBuilder, +# deepspeed.ops.op_builder.FPQuantizerBuilder, +# deepspeed.ops.op_builder.FusedLambBuilder, +# deepspeed.ops.op_builder.FusedLionBuilder, +# deepspeed.ops.op_builder.QuantizerBuilder, +# deepspeed.ops.op_builder.RaggedOpsBuilder, +# deepspeed.ops.op_builder.RandomLTDBuilder, +# deepspeed.ops.op_builder.SparseAttnBuilder, +# deepspeed.ops.op_builder.SpatialInferenceBuilder, +# deepspeed.ops.op_builder.TransformerBuilder, +# deepspeed.ops.op_builder.StochasticTransformerBuilder, +# ] - for builder in builders: - print(f"Compiling {builder.__name__}") - builder().load() +# for builder in builders: +# print(f"Compiling {builder.__name__}") +# builder().load() -if __name__ == "__main__": - compile_ops() -EOF -python compile_deepspeed_ops.py -ds_report -# clean up -# rm -f deepspeed/git_version_info_installed.py -# rm -rf dist build deepspeed.egg-info -# python setup.py bdist_wheel -# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl -# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt -# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0 -EOT +# if __name__ == "__main__": +# compile_ops() +# EOF +# python compile_deepspeed_ops.py +# ds_report +# # clean up +# # rm -f deepspeed/git_version_info_installed.py +# # rm -rf dist build deepspeed.egg-info +# # python setup.py bdist_wheel +# # DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl +# # DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt +# # pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0 +# EOT -# install transformers and flash-attn -RUN <> ~/.bashrc && \ -# echo 'export PATH=${CUDA_HOME}/bin:${PATH}' >> ~/.bashrc && \ -# echo 'export CUTLASS_PATH=/opt/cutlass' >> ~/.bashrc && \ -# echo 'export PATH=/opt/conda/bin:$PATH' >> ~/.bashrc && \ -# echo "source activate ${CONDA_ENV_NAME}" > ~/.bashrc +# # RUN echo 'export CUDA_HOME=/usr/local/cuda' >> ~/.bashrc && \ +# # echo 'export PATH=${CUDA_HOME}/bin:${PATH}' >> ~/.bashrc && \ +# # echo 'export CUTLASS_PATH=/opt/cutlass' >> ~/.bashrc && \ +# # echo 'export PATH=/opt/conda/bin:$PATH' >> ~/.bashrc && \ +# # echo "source activate ${CONDA_ENV_NAME}" > ~/.bashrc CMD ["/usr/sbin/sshd", "-D"] # CMD ["/bin/bash", "-c", "/usr/sbin/sshd -D & while true; do sleep 1000; done"]