update
This commit is contained in:
@@ -15,6 +15,7 @@ ENV ROOT_PASSWD=${ROOT_PASSWD}
|
|||||||
WORKDIR /root
|
WORKDIR /root
|
||||||
SHELL ["/bin/bash", "-c"]
|
SHELL ["/bin/bash", "-c"]
|
||||||
|
|
||||||
|
ENV STAGE_DIR=/tmp
|
||||||
# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
|
# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
|
||||||
ENV MLNX_OFED_VERSION=23.10-3.2.2.0
|
ENV MLNX_OFED_VERSION=23.10-3.2.2.0
|
||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
@@ -35,7 +36,7 @@ apt-get install -y flex tk ethtool libpci3 libltdl-dev bison lsof tcl libelf1 pc
|
|||||||
# install Mellanox OFED
|
# install Mellanox OFED
|
||||||
mkdir -p ${STAGE_DIR}
|
mkdir -p ${STAGE_DIR}
|
||||||
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
|
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
|
||||||
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
|
cd ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
|
||||||
./mlnxofedinstall --user-space-only --without-fw-update --all -q
|
./mlnxofedinstall --user-space-only --without-fw-update --all -q
|
||||||
cd ${STAGE_DIR}
|
cd ${STAGE_DIR}
|
||||||
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
|
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
|
||||||
@@ -55,7 +56,7 @@ cd ${STAGE_DIR}
|
|||||||
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
|
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
|
||||||
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
|
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
|
||||||
apt-get update
|
apt-get update
|
||||||
apt --fix-broken install -y
|
apt-get --fix-broken install -y
|
||||||
apt-get install -y dkms
|
apt-get install -y dkms
|
||||||
dpkg-buildpackage -us -uc
|
dpkg-buildpackage -us -uc
|
||||||
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
|
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
|
||||||
@@ -89,7 +90,6 @@ pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=
|
|||||||
pip install git+https://github.com/huggingface/transformers
|
pip install git+https://github.com/huggingface/transformers
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
ENV STAGE_DIR=/tmp
|
|
||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
|
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ services:
|
|||||||
OWNER: "nvidia" # nvcr.io/nvidia/pytorch:24.06-py3
|
OWNER: "nvidia" # nvcr.io/nvidia/pytorch:24.06-py3
|
||||||
LABEL: "pytorch"
|
LABEL: "pytorch"
|
||||||
VERSION: "24.06-py3"
|
VERSION: "24.06-py3"
|
||||||
|
NV_PEER_MEM_VERSION: "1.2"
|
||||||
DS_BUILD_OPS: 1
|
DS_BUILD_OPS: 1
|
||||||
DEEPSPEED_VERSION: "master"
|
DEEPSPEED_VERSION: "master"
|
||||||
DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
|
DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
|
||||||
|
|||||||
Reference in New Issue
Block a user