diff --git a/spawnerdockerfile/Dockerfile.ngc b/spawnerdockerfile/Dockerfile.ngc new file mode 100644 index 0000000..0818e32 --- /dev/null +++ b/spawnerdockerfile/Dockerfile.ngc @@ -0,0 +1,134 @@ +ARG REGISTRY=quay.io +ARG OWNER=jupyter +ARG LABEL=notebook +ARG VERSION +ARG BASE_CONTAINER=$REGISTRY/$OWNER/$LABEL:$VERSION +FROM $BASE_CONTAINER +ARG HTTP_PROXY +ARG HTTPS_PROXY +ENV http_proxy=${HTTP_PROXY} +ENV https_proxy=${HTTPS_PROXY} +ARG DEBIAN_FRONTEND="noninteractive" +ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} +ARG ROOT_PASSWD="root" +ENV ROOT_PASSWD=${ROOT_PASSWD} +WORKDIR /root +SHELL ["/bin/bash", "-c"] + +# base tools +RUN <> ~/.bashrc +. /opt/conda/etc/profile.d/conda.sh +conda init bash +conda config --set show_channel_urls true +# 配置 .condarc 文件 +cat < ~/.condarc +channels: + - conda-forge + - bioconda + - pytorch + - pytorch-nightly + - nvidia + - defaults +show_channel_urls: true +EOF +# install pixi +curl -fsSL https://pixi.sh/install.sh | bash +EOT + +ENV STAGE_DIR=/tmp +RUN < ~/.deepspeed_env +TORCH_USE_CUDA_DSA=1 +DEEPSPEED_VERBOSE=1 +DEEPSPEED_LOG_LEVEL=DEBUG +CUTLASS_PATH=${CUTLASS_PATH} +TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} +CUDA_HOME=${CUDA_HOME} +LD_LIBRARY_PATH=${LD_LIBRARY_PATH} +EOF +EOT + +CMD ["/usr/sbin/sshd", "-D"] \ No newline at end of file diff --git a/spawnerdockerfile/README.md b/spawnerdockerfile/README.md index d8576ff..829de11 100755 --- a/spawnerdockerfile/README.md +++ b/spawnerdockerfile/README.md @@ -1,5 +1,50 @@ # Base Jupyter Notebook Stack +## ds_report + +```shell +[2024-07-17 02:25:56,956] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) + [WARNING] async_io requires the dev libaio .so object and headers but these were not found. + [WARNING] async_io: please install the libaio-dev package with apt + [WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. 
+ [WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+ [WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+ [WARNING] using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+
+(deepspeed) root@ubuntu-finetune:~/binbbt/train/pretrain# cat .deepspeed_env
+CUDA_HOME=/usr/local/cuda/
+TORCH_USE_CUDA_DSA=1
+CUTLASS_PATH=/opt/cutlass
+TORCH_CUDA_ARCH_LIST="80;89;90;90a"
+LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+NCCL_DEBUG=WARN
+NCCL_SOCKET_IFNAME=bond0
+NCCL_IB_HCA=mlx5_0:1,mlx5_2:1,mlx5_4:1,mlx5_6:1
+NCCL_IB_GID_INDEX=3
+NCCL_NET_GDR_LEVEL=2
+NCCL_P2P_DISABLE=0
+NCCL_IB_DISABLE=0
+```
+
+## test command
+
+```shell
+nvidia-smi
+nvcc -V
+ninja --version
+ds_report
+python -c "import torch; print('torch:', torch.__version__, torch)"
+python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+python -c "import deepspeed; deepspeed.ops.op_builder.CPUAdamBuilder().load()"
+python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func"
+python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
+python -c "from xformers import ops as xops"
+ibstat
+ofed_info -s
+mst version
+mpirun --version
+```
+
 > **Images hosted on Docker Hub are no longer updated.
 Please, use [quay.io image](https://quay.io/repository/jupyter/base-notebook)**
 
 [![docker pulls](https://img.shields.io/docker/pulls/jupyter/base-notebook.svg)](https://hub.docker.com/r/jupyter/base-notebook/)
diff --git a/spawnerdockerfile/docker-compose_ngc.yml b/spawnerdockerfile/docker-compose_ngc.yml
new file mode 100644
index 0000000..b925144
--- /dev/null
+++ b/spawnerdockerfile/docker-compose_ngc.yml
@@ -0,0 +1,72 @@
+version: '3.9'
+
+# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能:
+
+# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。
+# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。
+# SparseAttention - 用于高效计算稀疏注意力机制。
+# Transformer - 提供Transformer模型的高效实现。
+# TransformerInference - 专门用于Transformer模型的推理优化。
+# CPUAdam - 针对CPU优化的Adam优化器。
+# CPULion - 针对CPU的Lion优化器。
+# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。
+# RandomLTD - 用于随机层裁剪的优化器。
+# StochasticTransformer - 支持随机Transformer模型的训练和推理。
+
+# 检测系统总内存(以GB为单位)
+# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo)
+# echo "Docker Compose 文件已生成,shm_size 设置为 ${TOTAL_MEM}GB。"
+
+services:
+  ubuntu-finetune:
+    build:
+      context: .
+      dockerfile: Dockerfile.ngc
+      args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822
+        REGISTRY: "nvcr.io"
+        OWNER: "nvidia" # nvcr.io/nvidia/pytorch:24.06-py3
+        LABEL: "pytorch"
+        VERSION: "24.06-py3"
+        DS_BUILD_OPS: 1
+        DEEPSPEED_VERSION: "master"
+        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
+        HTTP_PROXY: "http://127.0.0.1:15777"
+        HTTPS_PROXY: "http://127.0.0.1:15777"
+        CACHEBUST: 1
+    # volumes:
+    #   - ./workspace:/workspace
+    #   - /tmp:/tmp
+    container_name: ubuntu-ngc
+    pull_policy: if_not_present
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    # tty: true
+    # stdin_open: true
+    restart: unless-stopped
+    image: hotwa/notebook:ngc
+    privileged: true
+    ipc: host
+    network_mode: host
+    shm_size: '128gb'
+    # ports:
+    #   - 3228:2222
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - TMPDIR=/var/tmp
+    # networks:
+    #   - network_finetune
+    # command: ["/usr/sbin/sshd", "-D"]
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+
+# networks:
+#   network_finetune:
+#     name: network_finetune