This commit is contained in:
Your Name
2024-07-17 06:14:32 +00:00
parent e7e28fd76d
commit 7f7d490de6
2 changed files with 4 additions and 3 deletions

View File

@@ -15,6 +15,7 @@ ENV ROOT_PASSWD=${ROOT_PASSWD}
WORKDIR /root
SHELL ["/bin/bash", "-c"]
ENV STAGE_DIR=/tmp
# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
ENV MLNX_OFED_VERSION=23.10-3.2.2.0
RUN <<EOT
@@ -35,7 +36,7 @@ apt-get install -y flex tk ethtool libpci3 libltdl-dev bison lsof tcl libelf1 pc
# install Mellanox OFED
mkdir -p ${STAGE_DIR}
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
cd ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
./mlnxofedinstall --user-space-only --without-fw-update --all -q
cd ${STAGE_DIR}
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
@@ -55,7 +56,7 @@ cd ${STAGE_DIR}
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
apt-get update
apt --fix-broken install -y
apt-get --fix-broken install -y
apt-get install -y dkms
dpkg-buildpackage -us -uc
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
@@ -89,7 +90,6 @@ pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=
pip install git+https://github.com/huggingface/transformers
EOT
ENV STAGE_DIR=/tmp
RUN <<EOT
#!/bin/bash
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels

View File

@@ -27,6 +27,7 @@ services:
OWNER: "nvidia" # nvcr.io/nvidia/pytorch:24.06-py3
LABEL: "pytorch"
VERSION: "24.06-py3"
NV_PEER_MEM_VERSION: "1.2"
DS_BUILD_OPS: 1
DEEPSPEED_VERSION: "master"
DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"