This commit is contained in:
Your Name
2024-07-17 06:14:32 +00:00
parent e7e28fd76d
commit 7f7d490de6
2 changed files with 4 additions and 3 deletions

View File

@@ -15,6 +15,7 @@ ENV ROOT_PASSWD=${ROOT_PASSWD}
WORKDIR /root WORKDIR /root
SHELL ["/bin/bash", "-c"] SHELL ["/bin/bash", "-c"]
ENV STAGE_DIR=/tmp
# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/ # https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
ENV MLNX_OFED_VERSION=23.10-3.2.2.0 ENV MLNX_OFED_VERSION=23.10-3.2.2.0
RUN <<EOT RUN <<EOT
@@ -35,7 +36,7 @@ apt-get install -y flex tk ethtool libpci3 libltdl-dev bison lsof tcl libelf1 pc
# install Mellanox OFED # install Mellanox OFED
mkdir -p ${STAGE_DIR} mkdir -p ${STAGE_DIR}
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 cd ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
./mlnxofedinstall --user-space-only --without-fw-update --all -q ./mlnxofedinstall --user-space-only --without-fw-update --all -q
cd ${STAGE_DIR} cd ${STAGE_DIR}
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64* rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
@@ -55,7 +56,7 @@ cd ${STAGE_DIR}
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
apt-get update apt-get update
apt --fix-broken install -y apt-get --fix-broken install -y
apt-get install -y dkms apt-get install -y dkms
dpkg-buildpackage -us -uc dpkg-buildpackage -us -uc
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
@@ -89,7 +90,6 @@ pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=
pip install git+https://github.com/huggingface/transformers pip install git+https://github.com/huggingface/transformers
EOT EOT
ENV STAGE_DIR=/tmp
RUN <<EOT RUN <<EOT
#!/bin/bash #!/bin/bash
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels

View File

@@ -27,6 +27,7 @@ services:
OWNER: "nvidia" # nvcr.io/nvidia/pytorch:24.06-py3 OWNER: "nvidia" # nvcr.io/nvidia/pytorch:24.06-py3
LABEL: "pytorch" LABEL: "pytorch"
VERSION: "24.06-py3" VERSION: "24.06-py3"
NV_PEER_MEM_VERSION: "1.2"
DS_BUILD_OPS: 1 DS_BUILD_OPS: 1
DEEPSPEED_VERSION: "master" DEEPSPEED_VERSION: "master"
DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"