update to drive
This commit is contained in:
@@ -15,6 +15,49 @@ ENV ROOT_PASSWD=${ROOT_PASSWD}
|
|||||||
WORKDIR /root
|
WORKDIR /root
|
||||||
SHELL ["/bin/bash", "-c"]
|
SHELL ["/bin/bash", "-c"]
|
||||||
|
|
||||||
|
# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
|
||||||
|
ENV MLNX_OFED_VERSION=23.10-3.2.2.0
|
||||||
|
RUN <<EOT
|
||||||
|
#!/bin/bash
|
||||||
|
# SYSTEM_NAME=$(lsb_release -cs) # 查看发行版本
|
||||||
|
# Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
|
||||||
|
python3 -m pip uninstall -y deepspeed
|
||||||
|
# This has to be run (again) inside the GPU VMs running the tests.
|
||||||
|
# The installation works here, but some tests fail, if we do not pre-build deepspeed again in the VMs running the tests.
|
||||||
|
# TODO: Find out why the tests fail. Install deepspeed
|
||||||
|
# DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM} python3 -m pip install "deepspeed<=0.14.0" --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
|
||||||
|
# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile -- installing deepspeed this way failed
|
||||||
|
# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
|
||||||
|
# prepare for the deepspeed install
|
||||||
|
# install Mellanox OFED
|
||||||
|
mkdir -p ${STAGE_DIR}
|
||||||
|
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
|
||||||
|
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
|
||||||
|
./mlnxofedinstall --user-space-only --without-fw-update --all -q
|
||||||
|
cd ${STAGE_DIR}
|
||||||
|
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
|
||||||
|
EOT
|
||||||
|
|
||||||
|
ARG NV_PEER_MEM_VERSION="1.2"
|
||||||
|
ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION}
|
||||||
|
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
|
||||||
|
RUN <<EOT
|
||||||
|
#!/bin/bash
|
||||||
|
# install nv_peer_mem
|
||||||
|
mkdir -p ${STAGE_DIR}
|
||||||
|
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory
|
||||||
|
cd ${STAGE_DIR}/nv_peer_memory
|
||||||
|
./build_module.sh
|
||||||
|
cd ${STAGE_DIR}
|
||||||
|
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
|
||||||
|
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
|
||||||
|
apt-get update
|
||||||
|
apt --fix-broken install -y
|
||||||
|
apt-get install -y dkms
|
||||||
|
dpkg-buildpackage -us -uc
|
||||||
|
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
|
||||||
|
EOT
|
||||||
|
|
||||||
# base tools
|
# base tools
|
||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
@@ -32,37 +75,18 @@ sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
|
|||||||
mkdir /var/run/sshd
|
mkdir /var/run/sshd
|
||||||
echo "root:${ROOT_PASSWD}" | chpasswd
|
echo "root:${ROOT_PASSWD}" | chpasswd
|
||||||
mkdir -p ~/.pip
|
mkdir -p ~/.pip
|
||||||
# install miniconda
|
eval "$(curl https://get.x-cmd.com)"
|
||||||
wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
|
|
||||||
bash /tmp/miniconda.sh -b -p /opt/conda
|
|
||||||
rm /tmp/miniconda.sh
|
|
||||||
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
|
|
||||||
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
|
|
||||||
. /opt/conda/etc/profile.d/conda.sh
|
|
||||||
conda init bash
|
|
||||||
conda config --set show_channel_urls true
|
|
||||||
# 配置 .condarc 文件
|
|
||||||
cat <<EOF > ~/.condarc
|
|
||||||
channels:
|
|
||||||
- conda-forge
|
|
||||||
- bioconda
|
|
||||||
- pytorch
|
|
||||||
- pytorch-nightly
|
|
||||||
- nvidia
|
|
||||||
- defaults
|
|
||||||
show_channel_urls: true
|
|
||||||
EOF
|
|
||||||
# install pixi
|
# install pixi
|
||||||
curl -fsSL https://pixi.sh/install.sh | bash
|
curl -fsSL https://pixi.sh/install.sh | bash
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
ENV STAGE_DIR=/tmp
|
|
||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
|
pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
|
||||||
pip install git+https://github.com/huggingface/transformers
|
pip install git+https://github.com/huggingface/transformers
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
|
ENV STAGE_DIR=/tmp
|
||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
|
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
|
||||||
@@ -112,8 +136,10 @@ EOT
|
|||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
python -m pip install peft tiktoken seaborn diffusers blobfile open_clip_torch zstandard mpi4py
|
python -m pip install peft tiktoken seaborn blobfile open_clip_torch zstandard mpi4py
|
||||||
# python -m pip install --no-deps git+https://github.com/huggingface/optimum.git
|
# optimum 手动解决依赖
|
||||||
|
# Install optimum's dependencies by hand (optimum itself is installed with --no-deps).
# Each requirement specifier is quoted: unquoted, the shell parses ">=0.17.0" as a
# redirection of stdout to a file named "=0.17.0" and drops the version constraint.
python -m pip install "black~=23.1" "ruff==0.1.5" "diffusers>=0.17.0"
|
||||||
|
python -m pip install --no-deps git+https://github.com/huggingface/optimum.git#egg=optimum[diffusers,quality]
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
RUN <<EOT
|
RUN <<EOT
|
||||||
@@ -129,6 +155,7 @@ TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
|
|||||||
CUDA_HOME=${CUDA_HOME}
|
CUDA_HOME=${CUDA_HOME}
|
||||||
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
|
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
|
||||||
EOF
|
EOF
|
||||||
|
unset https_proxy http_proxy
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
CMD ["/usr/sbin/sshd", "-D"]
|
CMD ["/usr/sbin/sshd", "-D"]
|
||||||
@@ -28,6 +28,9 @@ NCCL_IB_DISABLE=0
|
|||||||
|
|
||||||
## test command
|
## test command
|
||||||
|
|
||||||
|
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/notebook:ngc
|
||||||
|
docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 hotwa/notebook:ngc /bin/bash
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
nvidia-smi
|
nvidia-smi
|
||||||
nvcc -V
|
nvcc -V
|
||||||
@@ -40,7 +43,7 @@ python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func"
|
|||||||
python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
|
python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
|
||||||
python -c "from xformers import ops as xops"
|
python -c "from xformers import ops as xops"
|
||||||
ibstat
|
ibstat
|
||||||
ofed_info -s
|
ofed_info -s # 如果输出显示了 OFED 版本号,则说明 OFED 驱动已安装。
|
||||||
mst version
|
mst version
|
||||||
mpirun --version
|
mpirun --version
|
||||||
```
|
```
|
||||||
|
|||||||
Reference in New Issue
Block a user