version: '3.8'

# DeepSpeed ships a set of C++/CUDA extensions ("ops") that accelerate training
# and inference. The main ops are:
#   FusedAdam             - fused Adam optimizer for GPU
#   FusedLamb             - fused LAMB optimizer, suited to large-scale distributed training
#   SparseAttention       - efficient sparse-attention kernels
#   Transformer           - high-performance Transformer kernels
#   TransformerInference  - Transformer kernels optimized for inference
#   CPUAdam               - Adam optimizer optimized for CPU
#   CPULion               - Lion optimizer for CPU
#   Quantizer             - quantization support to shrink models and speed up inference
#   RandomLTD             - random and layerwise token dropping for efficient training
#   StochasticTransformer - stochastic Transformer kernels for training and inference

# Detect total system memory (in GB):
# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo)
# echo "Docker Compose file generated; shm_size set to ${TOTAL_MEM}GB."

services:
  ubuntu-finetune:
    build:
      context: .
      dockerfile: Dockerfile.update
      args:
        # PyTorch / Python / pytorch_lightning version compatibility table:
        # https://blog.csdn.net/qq_41813454/article/details/137421822
        PYTHON_VERSION: "3.10"
        NV_PEER_MEM_VERSION: "1.2"
        # CUDA_VERSION: "12.1.0"
        TAG_VERSION: "12.4.1"
        PYTORCH_VERSION: "2.3.0"
        TORCHVISION_VERSION: "0.18.0"
        TORCHAUDIO_VERSION: "2.3.0"
        DS_BUILD_OPS: 1
        USE_CUDA: 1
        USE_ROCM: 0
        USE_XPU: 0
        CUDA: cu121
        CUDA_ARCH_LIST: "80;90" # A100=80, H100=90; an RTX 4090 needs 89 (full list: "80;86;89;90")
        TORCH_CUDA_ARCH_LIST: "8.0;9.0+PTX" # full list: "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0"
        SETUPTOOLS_VERSION: "69.5.1"
        # Compute capability by GPU generation:
        #   Hopper (H100): 90a
        #   Ampere (A100, RTX 30 series): 80
        #   Turing (RTX 20 series, Titan RTX): 75
        #   Volta (V100): 70
        #   Pascal (P100, GTX 10 series): 60;61
        #   Maxwell (GTX 900 series): 50;53
        DCUTLASS_NVCC_ARCHS: "80;90a" # 90a = H100 (Hopper); GeForce RTX 4090 = 89
        DEEPSPEED_VERSION: "master"
        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
        # HTTP_PROXY: "http://127.0.0.1:15777"
        # HTTPS_PROXY: "http://127.0.0.1:15777"
        CACHEBUST: 1
        NV_DRIVER_VERSION: "535"
        GO_VERSION: "1.21.13"
    env_file:
      - .env
    volumes:
      - ../src:/work
      - /root/PDF-Extract-Kit/models:/models
      - /mnt/beegfs/downloads:/mnt/beegfs/downloads
      - ./results:/results
      - /mnt/beegfs/pdf_clean/yanbaodata:/data
      - ./start_mineru_update_mpi.py:/start_mineru_update_mpi.py
      - ./magic-pdf.json:/root/magic-pdf.json
      - ./hostfile:/hostfile
      - ./hostfile_mpich:/hostfile_mpich
      - ./mpi_test.py:/mpi_test.py
    container_name: ubuntu-finetune
    pull_policy: if_not_present
    ulimits:
      memlock:
        soft: -1
        hard: -1
    # tty: true
    # stdin_open: true
    restart: unless-stopped
    image: hotwa/deepspeed:pt23_update
    privileged: true
    cap_add:
      - ALL # ALL already includes SYS_PTRACE; listed explicitly below for emphasis
      - CAP_SYS_PTRACE
    # Note: with ipc: host below, the container shares the host's /dev/shm, so
    # shm_size has no effect; it only matters in the default (private) IPC mode.
    shm_size: '2000gb'
    devices:
      - /dev/infiniband/rdma_cm
      - /dev/infiniband/uverbs0
      - /dev/infiniband/uverbs1
      - /dev/infiniband/uverbs2
      - /dev/infiniband/uverbs3
      - /dev/infiniband/uverbs4
      - /dev/infiniband/uverbs5
      - /dev/infiniband/uverbs6
      - /dev/infiniband/uverbs7
      - /dev/infiniband/uverbs8
    ipc: host
    # ports:
    #   - 3228:2222
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - TMPDIR=/var/tmp
      - MAGIC_PDF_METHOD=auto
      - MAGIC_PDF_MODEL_MODE=full
      - MAGIC_PDF_INSIDE_MODEL=true
      - MAX_PROCESSES_PER_GPU=10
      - PDF_DIR=/data
      - UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1
    network_mode: host
    # networks:
    #   - network_finetune
    command: ["/usr/sbin/sshd", "-D"]
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

# networks:
#   network_finetune:
#     name: network_finetune
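
# The TOTAL_MEM snippet near the top of this file hints that shm_size was meant
# to be derived from the host's physical memory rather than hard-coded. A minimal
# sketch of that generation step (shell; assumes a Linux host with /proc/meminfo
# and the standard docker-compose.override.yml merge convention -- the script
# itself is illustrative, not part of this repo):
#
#   #!/usr/bin/env bash
#   set -euo pipefail
#   # MemTotal is reported in kB; convert to whole GB
#   TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo)
#   cat > docker-compose.override.yml <<EOF
#   services:
#     ubuntu-finetune:
#       shm_size: '${TOTAL_MEM}gb'
#   EOF
#   echo "Docker Compose file generated; shm_size set to ${TOTAL_MEM}GB."
#
# docker compose merges docker-compose.override.yml automatically, so the
# generated value would take precedence over the hard-coded '2000gb' above.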
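
# Because the image builds with DS_BUILD_OPS=1 (pre-compiling every compatible op
# from the list at the top of this file), it is worth checking which ops actually
# compiled. ds_report is DeepSpeed's bundled diagnostics command; running it
# through this service overrides the default sshd command for a one-off check:
#
#   docker compose build ubuntu-finetune
#   docker compose run --rm ubuntu-finetune ds_report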
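
# The /hostfile and /hostfile_mpich mounts together with the sshd entrypoint
# suggest multi-node runs. A hedged sketch of launching across the nodes listed
# in /hostfile (train.py, ds_config.json, and the rank count are placeholders,
# not files in this repo; DeepSpeed's launcher and its --hostfile flag are real):
#
#   # DeepSpeed's ssh-based launcher, run from inside the container:
#   deepspeed --hostfile=/hostfile train.py --deepspeed --deepspeed_config ds_config.json
#
#   # MPICH-style smoke test of the fabric using the mounted script:
#   mpiexec -f /hostfile_mpich -n 2 python /mpi_test.py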