update to drive

This commit is contained in:
Your Name
2024-07-17 04:49:01 +00:00
parent f685bf6d80
commit adc45bc432
2 changed files with 54 additions and 24 deletions

View File

@@ -15,6 +15,49 @@ ENV ROOT_PASSWD=${ROOT_PASSWD}
WORKDIR /root WORKDIR /root
SHELL ["/bin/bash", "-c"] SHELL ["/bin/bash", "-c"]
# Mellanox OFED (user-space InfiniBand stack).
# Release list: https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
ENV MLNX_OFED_VERSION=23.10-3.2.2.0
RUN <<EOT
#!/bin/bash
# Heredoc lines are not implicitly &&-chained: stop on the first failure,
# including a failed download on the left-hand side of the wget | tar pipe.
set -eo pipefail
# SYSTEM_NAME=$(lsb_release -cs) # check the distribution release
# Remove any DeepSpeed that is already installed before the build prerequisites go in.
python3 -m pip uninstall -y deepspeed
# Notes carried over from the Dockerfiles this step was adapted from:
#   - This has to be run (again) inside the GPU VMs running the tests.
#   - The installation works here, but some tests fail if DeepSpeed is not
#     pre-built again in the VMs running the tests. TODO: find out why.
# Previous install attempt, kept for reference (it failed):
# DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM} python3 -m pip install "deepspeed<=0.14.0" --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
#   from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
# Reference for the DeepSpeed prerequisites installed below:
#   https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile

# install Mellanox OFED
# Fall back to /tmp when STAGE_DIR has not been exported by an earlier ENV.
stage_dir="${STAGE_DIR:-/tmp}"
ofed_dir="MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64"
mkdir -p "${stage_dir}"
# Unpack inside the staging directory so the cleanup below removes what was
# actually extracted (previously the archive was unpacked into the WORKDIR).
cd "${stage_dir}"
wget -q -O - "https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/${ofed_dir}.tgz" | tar xzf -
cd "${ofed_dir}"
./mlnxofedinstall --user-space-only --without-fw-update --all -q
cd "${stage_dir}"
# Drop the unpacked installer in the same layer that created it.
rm -rf "${stage_dir}/${ofed_dir}"*
EOT
# nv_peer_mem: NVIDIA peer-memory module, built from source as a Debian package.
# NV_PEER_MEM_VERSION can be overridden with --build-arg; the git tag that is
# checked out is "<version>-0".
ARG NV_PEER_MEM_VERSION="1.2"
ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION}
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
RUN <<EOT
#!/bin/bash
# Heredoc lines are not implicitly &&-chained: stop on the first failure so a
# broken clone or package build cannot produce a "successful" layer.
set -e
# install nv_peer_mem
# Fall back to /tmp when STAGE_DIR has not been exported by an earlier ENV.
stage_dir="${STAGE_DIR:-/tmp}"
mkdir -p "${stage_dir}"
git clone https://github.com/Mellanox/nv_peer_memory.git --branch "${NV_PEER_MEM_TAG}" "${stage_dir}/nv_peer_memory"
cd "${stage_dir}/nv_peer_memory"
./build_module.sh
# NOTE(review): the next step expects build_module.sh to have left
# nvidia-peer-memory_<version>.orig.tar.gz in the staging directory - confirm
# this still holds if STAGE_DIR is ever set to something other than /tmp.
cd "${stage_dir}"
tar xzf "${stage_dir}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz"
cd "${stage_dir}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}"
apt-get update
apt-get --fix-broken install -y
apt-get install -y dkms
# Build unsigned source and binary packages; the .deb is written to the
# parent of the current directory, i.e. the staging directory.
dpkg-buildpackage -us -uc
dpkg -i "${stage_dir}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb"
# Keep the apt package index out of the image layer.
rm -rf /var/lib/apt/lists/*
EOT
# base tools # base tools
RUN <<EOT RUN <<EOT
#!/bin/bash #!/bin/bash
@@ -32,37 +75,18 @@ sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
mkdir /var/run/sshd mkdir /var/run/sshd
echo "root:${ROOT_PASSWD}" | chpasswd echo "root:${ROOT_PASSWD}" | chpasswd
mkdir -p ~/.pip mkdir -p ~/.pip
# install miniconda eval "$(curl https://get.x-cmd.com)"
wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
bash /tmp/miniconda.sh -b -p /opt/conda
rm /tmp/miniconda.sh
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
. /opt/conda/etc/profile.d/conda.sh
conda init bash
conda config --set show_channel_urls true
# 配置 .condarc 文件
cat <<EOF > ~/.condarc
channels:
- conda-forge
- bioconda
- pytorch
- pytorch-nightly
- nvidia
- defaults
show_channel_urls: true
EOF
# install pixi # install pixi
curl -fsSL https://pixi.sh/install.sh | bash curl -fsSL https://pixi.sh/install.sh | bash
EOT EOT
ENV STAGE_DIR=/tmp
RUN <<EOT RUN <<EOT
#!/bin/bash #!/bin/bash
pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
pip install git+https://github.com/huggingface/transformers pip install git+https://github.com/huggingface/transformers
EOT EOT
ENV STAGE_DIR=/tmp
RUN <<EOT RUN <<EOT
#!/bin/bash #!/bin/bash
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
@@ -112,8 +136,10 @@ EOT
RUN <<EOT RUN <<EOT
#!/bin/bash #!/bin/bash
python -m pip install --upgrade pip python -m pip install --upgrade pip
python -m pip install peft tiktoken seaborn diffusers blobfile open_clip_torch zstandard mpi4py python -m pip install peft tiktoken seaborn blobfile open_clip_torch zstandard mpi4py
# python -m pip install --no-deps git+https://github.com/huggingface/optimum.git # resolve optimum's dependencies manually
# Requirement specifiers are quoted on purpose: unquoted, bash treats ">" in
# "diffusers>=0.17.0" as an output redirection (creating a file named
# "=0.17.0" and installing an unconstrained diffusers), and "[...]" in the
# extras list is subject to glob expansion.
python -m pip install "black~=23.1" "ruff==0.1.5" "diffusers>=0.17.0"
python -m pip install --no-deps "git+https://github.com/huggingface/optimum.git#egg=optimum[diffusers,quality]"
EOT EOT
RUN <<EOT RUN <<EOT
@@ -129,6 +155,7 @@ TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
CUDA_HOME=${CUDA_HOME} CUDA_HOME=${CUDA_HOME}
LD_LIBRARY_PATH=${LD_LIBRARY_PATH} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
EOF EOF
unset https_proxy http_proxy
EOT EOT
CMD ["/usr/sbin/sshd", "-D"] CMD ["/usr/sbin/sshd", "-D"]

View File

@@ -28,6 +28,9 @@ NCCL_IB_DISABLE=0
## test command ## test command
```shell
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/notebook:ngc
docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 hotwa/notebook:ngc /bin/bash
```
```shell ```shell
nvidia-smi nvidia-smi
nvcc -V nvcc -V
@@ -40,7 +43,7 @@ python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func"
python -c "import apex.amp; print('Apex is installed and the amp module is available.')" python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
python -c "from xformers import ops as xops" python -c "from xformers import ops as xops"
ibstat ibstat
ofed_info -s ofed_info -s # 如果输出显示了 OFED 版本号,则说明 OFED 驱动已安装。
mst version mst version
mpirun --version mpirun --version
``` ```