This commit is contained in:
2024-08-28 17:18:03 +08:00
parent 77140c1407
commit f735605e9f
15 changed files with 2635 additions and 1 deletion

View File

@@ -0,0 +1,120 @@
version: '3.8'
# NOTE(review): the top-level `version` key is obsolete in the Compose v2
# specification (Compose v2 ignores it with a warning); kept here only for
# compatibility with legacy docker-compose v1 tooling.
#
# DeepSpeed ships several C++/CUDA extension ops that optimize deep-learning
# training and inference. The main ops and their roles:
# FusedAdam - fused Adam optimizer for GPUs.
# FusedLamb - like FusedAdam, but for the LAMB optimizer; suited to large-scale distributed training.
# SparseAttention - efficient computation of sparse attention mechanisms.
# Transformer - efficient implementation of Transformer models.
# TransformerInference - inference-specific optimizations for Transformer models.
# CPUAdam - Adam optimizer optimized for CPU.
# CPULion - Lion optimizer for CPU.
# Quantizer - quantization support to shrink model size and speed up inference.
# RandomLTD - random and layerwise token dropping (random-LTD) support.
# StochasticTransformer - training/inference support for stochastic Transformer models.
#
# Detect total system memory (in GB):
# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo)
# echo "Docker Compose file generated; shm_size set to ${TOTAL_MEM}GB."
services:
  ubuntu-finetune:
    build:
      context: .
      dockerfile: Dockerfile.update
      # PyTorch / Python / pytorch_lightning version compatibility table:
      # https://blog.csdn.net/qq_41813454/article/details/137421822
      args:
        PYTHON_VERSION: "3.10"
        NV_PEER_MEM_VERSION: "1.2"
        # CUDA_VERSION: "12.1.0"
        TAG_VERSION: "12.4.1"
        PYTORCH_VERSION: "2.3.0"
        TORCHVISION_VERSION: "0.18.0"
        TORCHAUDIO_VERSION: "2.3.0"
        # Build args are always delivered to the builder as strings; quote the
        # numeric flags so the YAML parser does not retype them as integers.
        DS_BUILD_OPS: "1"
        USE_CUDA: "1"
        USE_ROCM: "0"
        USE_XPU: "0"
        CUDA: "cu121"
        CUDA_ARCH_LIST: "80;90"  # for RTX 4090, all: "80;86;89;90"
        TORCH_CUDA_ARCH_LIST: "8.0;9.0+PTX"  # all: "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0"
        SETUPTOOLS_VERSION: "69.5.1"
        # Compute-capability reference:
        #   Hopper (H100): 90a
        #   Ampere (A100, RTX 30 series): 80
        #   Turing (RTX 20 series, Titan RTX): 75
        #   Volta (V100): 70
        #   Pascal (P100, GTX 10 series): 60;61
        #   Maxwell (GTX 900 series): 50;53
        # NOTE(review): this key looks like the CMake flag -DCUTLASS_NVCC_ARCHS
        # with the leading "D" fused into the name — confirm the Dockerfile
        # really declares "ARG DCUTLASS_NVCC_ARCHS" before renaming it.
        DCUTLASS_NVCC_ARCHS: "80;90a"  # 90a for H100 GPU; 89 for GeForce RTX 4090
        DEEPSPEED_VERSION: "master"
        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
        # HTTP_PROXY: "http://127.0.0.1:15777"
        # HTTPS_PROXY: "http://127.0.0.1:15777"
        CACHEBUST: "1"  # bump this value to invalidate the Docker build cache
        NV_DRIVER_VERSION: "535"
        GO_VERSION: "1.21.13"
    env_file:
      - .env
    volumes:
      - ../src:/work
      - /root/PDF-Extract-Kit/models:/models
      - /mnt/beegfs/downloads:/mnt/beegfs/downloads
      - ./results:/results
      - /mnt/beegfs/pdf_clean/yanbaodata:/data
      - ./start_mineru_update_mpi.py:/start_mineru_update_mpi.py
      - ./magic-pdf.json:/root/magic-pdf.json
      - ./hostfile:/hostfile
      - ./hostfile_mpich:/hostfile_mpich
      - ./mpi_test.py:/mpi_test.py
    container_name: ubuntu-finetune
    pull_policy: if_not_present  # alias of "missing" in the Compose spec
    ulimits:
      # Unlimited locked memory — required for RDMA/InfiniBand pinned buffers.
      memlock:
        soft: -1
        hard: -1
    # tty: true
    # stdin_open: true
    restart: unless-stopped
    image: hotwa/deepspeed:pt23_update
    # NOTE(review): privileged mode already grants every capability, so the
    # cap_add entries below are redundant; kept for explicitness.
    privileged: true
    cap_add:
      - ALL
      - CAP_SYS_PTRACE
    shm_size: '2000gb'
    devices:
      - /dev/infiniband/rdma_cm
      - /dev/infiniband/uverbs0
      - /dev/infiniband/uverbs1
      - /dev/infiniband/uverbs2
      - /dev/infiniband/uverbs3
      - /dev/infiniband/uverbs4
      - /dev/infiniband/uverbs5
      - /dev/infiniband/uverbs6
      - /dev/infiniband/uverbs7
      - /dev/infiniband/uverbs8
    ipc: host
    # ports:
    #   - "3228:2222"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - TMPDIR=/var/tmp
      - MAGIC_PDF_METHOD=auto
      - MAGIC_PDF_MODEL_MODE=full
      - MAGIC_PDF_INSIDE_MODEL=true
      - MAX_PROCESSES_PER_GPU=10
      - PDF_DIR=/data
      # NOTE(review): mlx5_3 is omitted here while nine uverbs devices
      # (uverbs0-8) are exposed above — confirm the omission is intentional.
      - UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1
    network_mode: host
    # networks:
    #   - network_finetune
    # Run an SSH daemon in the foreground so MPI launchers can reach the container.
    command: ["/usr/sbin/sshd", "-D"]
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
# networks:
# network_finetune:
# name: network_finetune