From 779ca9a2b2ad428dcc5d49b404aadb321aebdae1 Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Wed, 17 Jul 2024 05:01:55 +0000
Subject: [PATCH] Merged specific files from main branch into devgpu

---
 spawnerdockerfile/Dockerfile.ngc         | 161 +++++++++++++++++++++++
 spawnerdockerfile/docker-compose_ngc.yml |  72 ++++++++++
 spawnerdockerfile/install_conda.sh       |  20 +++
 3 files changed, 253 insertions(+)
 create mode 100644 spawnerdockerfile/Dockerfile.ngc
 create mode 100644 spawnerdockerfile/docker-compose_ngc.yml
 create mode 100644 spawnerdockerfile/install_conda.sh

diff --git a/spawnerdockerfile/Dockerfile.ngc b/spawnerdockerfile/Dockerfile.ngc
new file mode 100644
index 0000000..7a4e3c5
--- /dev/null
+++ b/spawnerdockerfile/Dockerfile.ngc
@@ -0,0 +1,161 @@
+ARG REGISTRY=quay.io
+ARG OWNER=jupyter
+ARG LABEL=notebook
+ARG VERSION
+ARG BASE_CONTAINER=$REGISTRY/$OWNER/$LABEL:$VERSION
+FROM $BASE_CONTAINER
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ENV http_proxy=${HTTP_PROXY}
+ENV https_proxy=${HTTPS_PROXY}
+ARG DEBIAN_FRONTEND="noninteractive"
+ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
+ARG ROOT_PASSWD="root"
+ENV ROOT_PASSWD=${ROOT_PASSWD}
+WORKDIR /root
+SHELL ["/bin/bash", "-c"]
+
+# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
+ENV MLNX_OFED_VERSION=23.10-3.2.2.0
+# NOTE(review): the heredoc opener below was destroyed in transit (text between
+# '<' and '>' was stripped, leaving "RUN <&1"); "<<EOT bash -ex 2>&1" is the
+# minimal reconstruction consistent with the surviving "<", "&1" and the
+# closing EOT -- confirm against the original branch.
+RUN <<EOT bash -ex 2>&1
+# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail
+# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
+# install deepspeed prepare
+# install Mellanox OFED
+mkdir -p ${STAGE_DIR}
+wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
+cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
+./mlnxofedinstall --user-space-only --without-fw-update --all -q
+cd ${STAGE_DIR}
+rm -rf \
+${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
+EOT
+
+ARG NV_PEER_MEM_VERSION="1.2"
+ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION}
+ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
+# NOTE(review): a span of this patch was lost in transit (everything between a
+# '<' and the next '>' was stripped, leaving "RUN <=0.17.0").  The lost span
+# most likely contained the nv_peer_mem install steps and the opener of this
+# pip RUN; the package name carrying the ">=0.17.0" pin is unrecoverable --
+# restore both from the source branch before using this patch.
+RUN <<EOT
+python -m pip install "diffusers>=0.17.0"
+python -m pip install --no-deps git+https://github.com/huggingface/optimum.git#egg=optimum[diffusers,quality]
+EOT
+
+# NOTE(review): this RUN's heredoc opener and the "cat <<EOF >" opener were
+# also stripped in transit; reconstructed from the surviving EOF/EOT
+# delimiters and the "~/.deepspeed_env" redirect target -- confirm.
+RUN <<EOT
+cat <<EOF > ~/.deepspeed_env
+TORCH_USE_CUDA_DSA=1
+DEEPSPEED_VERBOSE=1
+DEEPSPEED_LOG_LEVEL=DEBUG
+CUTLASS_PATH=${CUTLASS_PATH}
+TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
+CUDA_HOME=${CUDA_HOME}
+LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
+EOF
+unset https_proxy http_proxy
+EOT
+
+CMD ["/usr/sbin/sshd", "-D"]
\ No newline at end of file
diff --git a/spawnerdockerfile/docker-compose_ngc.yml b/spawnerdockerfile/docker-compose_ngc.yml
new file mode 100644
index 0000000..b925144
--- /dev/null
+++ b/spawnerdockerfile/docker-compose_ngc.yml
@@ -0,0 +1,72 @@
+version: '3.9'
+
+# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能:
+
+# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。
+# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。
+# SparseAttention - 用于高效计算稀疏注意力机制。
+# Transformer - 提供Transformer模型的高效实现。
+# TransformerInference - 专门用于Transformer模型的推理优化。
+# CPUAdam - 针对CPU优化的Adam优化器。
+# CPULion - 针对CPU的Lion优化器。
+# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。
+# RandomLTD - 用于随机层裁剪的优化器。
+# StochasticTransformer - 支持随机Transformer模型的训练和推理。
+
+# 检测系统总内存(以GB为单位)
+# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo)
+# echo "Docker Compose 文件已生成,shm_size 设置为 ${TOTAL_MEM}GB。"
+
+services:
+  ubuntu-finetune:
+    build:
+      context: .
+      dockerfile: Dockerfile.ngc
+      args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822
+        REGISTRY: "nvcr.io"
+        OWNER: "nvidia" # nvcr.io/nvidia/pytorch:24.06-py3
+        LABEL: "pytorch"
+        VERSION: "24.06-py3"
+        DS_BUILD_OPS: 1
+        DEEPSPEED_VERSION: "master"
+        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
+        HTTP_PROXY: "http://127.0.0.1:15777"
+        HTTPS_PROXY: "http://127.0.0.1:15777"
+        CACHEBUST: 1
+    # volumes:
+    #   - ./workspace:/workspace
+    #   - /tmp:/tmp
+    container_name: ubuntu-ngc
+    pull_policy: if_not_present
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    # tty: true
+    # stdin_open: true
+    restart: unless-stopped
+    image: hotwa/notebook:ngc
+    privileged: true
+    ipc: host
+    network_mode: host
+    shm_size: '128gb'
+    # ports:
+    #   - 3228:2222
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - TMPDIR=/var/tmp
+    # networks:
+    #   - network_finetune
+    # command: ["/usr/sbin/sshd", "-D"]
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+
+# networks:
+#   network_finetune:
+#     name: network_finetune
diff --git a/spawnerdockerfile/install_conda.sh b/spawnerdockerfile/install_conda.sh
new file mode 100644
index 0000000..420690e
--- /dev/null
+++ b/spawnerdockerfile/install_conda.sh
@@ -0,0 +1,20 @@
+# install miniconda
+wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
+bash /tmp/miniconda.sh -b -p /opt/conda
+rm /tmp/miniconda.sh
+ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
+echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
+. /opt/conda/etc/profile.d/conda.sh
+conda init bash
+conda config --set show_channel_urls true
+# 配置 .condarc 文件
+cat <<EOF > ~/.condarc
+channels:
+  - conda-forge
+  - bioconda
+  - pytorch
+  - pytorch-nightly
+  - nvidia
+  - defaults
+show_channel_urls: true
+EOF
\ No newline at end of file