update to drive

2024-07-17 04:49:01 +00:00
parent f685bf6d80
commit adc45bc432
2 changed files with 54 additions and 24 deletions
--- a/spawnerdockerfile/Dockerfile.ngc
+++ b/spawnerdockerfile/Dockerfile.ngc
@@ -15,6 +15,49 @@ ENV ROOT_PASSWD=${ROOT_PASSWD}
 WORKDIR /root
 SHELL ["/bin/bash", "-c"]
 # https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
 ENV MLNX_OFED_VERSION=23.10-3.2.2.0
 RUN <<EOT
 #!/bin/bash
 # Removes any preinstalled DeepSpeed, then installs the user-space Mellanox OFED
 # stack selected by MLNX_OFED_VERSION.
 # The steps follow https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
 # (the DeepSpeed install taken from
 # https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
 # failed in this image).
 #
 # Stop at the first failing command (including a failed download inside the
 # wget | tar pipeline) so the build fails instead of shipping a broken image.
 set -euo pipefail
 # SYSTEM_NAME=$(lsb_release -cs) # show the distribution codename
 # Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
 python3 -m pip uninstall -y deepspeed
 # This has to be run (again) inside the GPU VMs running the tests.
 # The installation works here, but some tests fail, if we do not pre-build deepspeed again in the VMs running the tests.
 # TODO: Find out why the tests fail.
 # Kept for reference (disabled):
 # DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM} python3 -m pip install "deepspeed<=0.14.0" --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

 # install Mellanox OFED
 # STAGE_DIR is only visibly declared further down in this Dockerfile, so fall
 # back to /tmp (the value it is given there) when it is not set yet.
 stage_dir="${STAGE_DIR:-/tmp}"
 ofed_dir="MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64"
 mkdir -p "${stage_dir}"
 # Unpack inside the staging directory; previously the archive was extracted
 # into the current directory, so the cleanup below never removed it.
 cd "${stage_dir}"
 wget -q -O - "http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/${ofed_dir}.tgz" | tar xzf -
 cd "${ofed_dir}"
 ./mlnxofedinstall --user-space-only --without-fw-update --all -q
 cd "${stage_dir}"
 # ":?" aborts rather than expanding to "/MLNX_OFED..." if stage_dir were empty.
 rm -rf "${stage_dir:?}/${ofed_dir}"*
 EOT
 ARG NV_PEER_MEM_VERSION="1.2"
 ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION}
 ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
 RUN <<EOT
 #!/bin/bash
 # install nv_peer_mem
 # Clones Mellanox/nv_peer_memory at tag NV_PEER_MEM_TAG, builds the
 # nvidia-peer-memory Debian package from it and installs that package.
 #
 # Every step depends on the previous one, so stop at the first failure
 # instead of carrying on with a missing checkout or package.
 set -euo pipefail
 # STAGE_DIR is only visibly declared further down in this Dockerfile, so fall
 # back to /tmp (the value it is given there) when it is not set yet.
 # NOTE(review): the tarball is expected in the staging directory after
 # build_module.sh runs; confirm where build_module.sh writes it if STAGE_DIR
 # is ever changed from /tmp.
 stage_dir="${STAGE_DIR:-/tmp}"
 mkdir -p "${stage_dir}"
 git clone https://github.com/Mellanox/nv_peer_memory.git --branch "${NV_PEER_MEM_TAG}" "${stage_dir}/nv_peer_memory"
 cd "${stage_dir}/nv_peer_memory"
 ./build_module.sh
 cd "${stage_dir}"
 tar xzf "${stage_dir}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz"
 cd "${stage_dir}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}"
 apt-get update
 # Repair half-configured packages before installing dkms; apt-get is used
 # because the apt front end does not promise a stable scripting interface.
 apt-get --fix-broken install -y
 apt-get install -y dkms
 # -us -uc: build the package without signing the source or the .changes file.
 dpkg-buildpackage -us -uc
 dpkg -i "${stage_dir}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb"
 EOT
 # base tools
 RUN <<EOT
 #!/bin/bash
@@ -32,37 +75,18 @@ sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
 # Create /var/run/sshd (the image CMD starts /usr/sbin/sshd).
 mkdir /var/run/sshd
 # Set the root password to the value of ROOT_PASSWD.
 echo "root:${ROOT_PASSWD}" | chpasswd
 mkdir -p ~/.pip
 # NOTE(review): the eval below runs whatever get.x-cmd.com serves at build
 # time inside this shell; consider pinning or verifying the script.
-# install miniconda
+eval "$(curl https://get.x-cmd.com)"
 # Download the Miniconda installer to /tmp/miniconda.sh.
 # NOTE(review): -O is passed twice (-O- and -O /tmp/miniconda.sh); presumably
 # the later one takes effect - confirm and drop the redundant -O-.
 wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
 # Run the installer with /opt/conda as the target prefix, then delete it.
 bash /tmp/miniconda.sh -b -p /opt/conda 
 rm /tmp/miniconda.sh 
 # Expose conda.sh through /etc/profile.d and root's ~/.bashrc, and source it
 # now so the conda commands below are available in this shell.
 ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
 echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc 
 . /opt/conda/etc/profile.d/conda.sh 
 conda init bash
 conda config --set show_channel_urls true
 # Write the .condarc file (this overwrites ~/.condarc with the channel list below)
 cat <<EOF > ~/.condarc
 channels:
  - conda-forge
  - bioconda
  - pytorch
  - pytorch-nightly
  - nvidia
  - defaults
 show_channel_urls: true
 EOF
 # install pixi
 curl -fsSL https://pixi.sh/install.sh | bash
 EOT
 ENV STAGE_DIR=/tmp
 RUN <<EOT
 #!/bin/bash
 # Install xformers from the main branch of its Git repository, with verbose
 # output and upgrading any copy that is already installed.
 pip install \
   --verbose \
   --upgrade \
   "git+https://github.com/facebookresearch/xformers.git@main#egg=xformers"
 # Install transformers straight from its Git repository.
 pip install "git+https://github.com/huggingface/transformers"
 EOT
 ENV STAGE_DIR=/tmp
 RUN <<EOT
 #!/bin/bash
 git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
@@ -112,8 +136,10 @@ EOT
 RUN <<EOT
 #!/bin/bash
 # Upgrades pip, installs the extra Python libraries, then installs optimum
 # from Git with --no-deps after installing its dependencies by hand.
 python -m pip install --upgrade pip
-python -m pip install peft tiktoken seaborn diffusers blobfile open_clip_torch zstandard mpi4py
+python -m pip install peft tiktoken seaborn blobfile open_clip_torch zstandard mpi4py
-# python -m pip install --no-deps git+https://github.com/huggingface/optimum.git
+# optimum: resolve its dependencies manually
 # Requirement specifiers are quoted: unquoted, bash parses "diffusers>=0.17.0"
 # as "install diffusers" with stdout redirected to a file named "=0.17.0",
 # which silently drops the version bound.
 python -m pip install "black~=23.1" "ruff==0.1.5" "diffusers>=0.17.0"
 # Quoted so the [diffusers,quality] extras are never treated as a glob pattern.
 python -m pip install --no-deps "git+https://github.com/huggingface/optimum.git#egg=optimum[diffusers,quality]"
 EOT
 RUN <<EOT
@@ -129,6 +155,7 @@ TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 CUDA_HOME=${CUDA_HOME}
 LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
 EOF
 unset https_proxy http_proxy
 EOT
 CMD ["/usr/sbin/sshd", "-D"]
--- a/spawnerdockerfile/README.md
+++ b/spawnerdockerfile/README.md
@@ -28,6 +28,9 @@ NCCL_IB_DISABLE=0
 ## test command
 docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/notebook:ngc
 docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 hotwa/notebook:ngc /bin/bash
 ```shell
 nvidia-smi
 nvcc -V
@@ -40,7 +43,7 @@ python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func"
 python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
 python -c "from xformers import ops as xops"
 ibstat
-ofed_info  -s
+ofed_info  -s # 如果输出显示了 OFED 版本号，则说明 OFED 驱动已安装。
 mst version
 mpirun --version
 ```