first add
This commit is contained in:
81
docker-compose_update.yml
Normal file
81
docker-compose_update.yml
Normal file
@@ -0,0 +1,81 @@
|
||||
# Compose file format version, quoted so it stays a string rather than a float.
# NOTE(review): recent Docker Compose releases treat the top-level `version`
# key as obsolete/informational only — confirm before relying on it.
version: '3.8'
|
||||
|
||||
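# DeepSpeed ships a number of C++/CUDA extensions ("ops") that optimize deep-learning
# training and inference. The main DeepSpeed ops and what they provide:

# FusedAdam - fused Adam optimizer for GPUs.
# FusedLamb - like FusedAdam, but for the LAMB optimizer; suited to large-scale distributed training.
# SparseAttention - efficient computation of sparse attention.
# Transformer - optimized implementation of Transformer models.
# TransformerInference - optimizations specific to Transformer model inference.
# CPUAdam - Adam optimizer optimized for CPU.
# CPULion - Lion optimizer for CPU.
# Quantizer - quantization support to reduce model size and speed up inference.
# RandomLTD - kernels for Random Layerwise Token Dropping (random-LTD), a training-efficiency technique (not an optimizer).
# StochasticTransformer - the Transformer kernel built in stochastic mode (slightly faster training, non-deterministic results).
# Detect total system memory (in GB):
# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo)
# echo "Docker Compose 文件已生成,shm_size 设置为 ${TOTAL_MEM}GB。"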
|
||||
|
||||
# Services defined by this Compose file.
#
# ubuntu-finetune: builds the image from ./Dockerfile.update (tagged
# hotwa/deepspeed:pt23_update) and runs it privileged, with host IPC and host
# networking, ./src mounted at /bbtft, and all NVIDIA GPUs reserved.
services:
  ubuntu-finetune:
    build:
      context: .
      dockerfile: Dockerfile.update
      # Build arguments passed to Dockerfile.update.
      # PyTorch / Python / pytorch_lightning version compatibility table:
      # https://blog.csdn.net/qq_41813454/article/details/137421822
      # All values are quoted so version-like numbers (e.g. "3.10") and flags
      # reach the Dockerfile as the exact strings written here.
      args:
        PYTHON_VERSION: "3.10"
        NV_PEER_MEM_VERSION: "1.2"
        CUDA_VERSION: "12.1.0"
        PYTORCH_VERSION: "2.3.0"
        TORCHVISION_VERSION: "0.18.0"
        TORCHAUDIO_VERSION: "2.3.0"
        DS_BUILD_OPS: "1"
        USE_CUDA: "1"
        USE_ROCM: "0"
        USE_XPU: "0"
        CUDA: cu121
        CUDA_ARCH_LIST: "80;86;89;90"  # for RTX 4090, all : "80;86;89;90"
        TORCH_CUDA_ARCH_LIST: "8.0;8.6;8.9;9.0+PTX"  # all "6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0"
        SETUPTOOLS_VERSION: "69.5.1"
        DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a"  # 90a for H100 GPU 89:GeForce RTX 4090
        DEEPSPEED_VERSION: "master"
        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
        # HTTP_PROXY: "http://127.0.0.1:15777"
        # HTTPS_PROXY: "http://127.0.0.1:15777"
        # NOTE(review): presumably bumped to invalidate the Docker build cache —
        # confirm against Dockerfile.update.
        CACHEBUST: "1"
    volumes:
      - ./src:/bbtft
      # - /tmp:/tmp
    container_name: ubuntu-finetune
    pull_policy: if_not_present
    # Remove the locked-memory limit inside the container (-1 = unlimited).
    ulimits:
      memlock:
        soft: -1
        hard: -1
    # tty: true
    # stdin_open: true
    restart: unless-stopped
    image: hotwa/deepspeed:pt23_update
    privileged: true
    ipc: host
    network_mode: host
    # NOTE(review): with `ipc: host` the container shares the host's /dev/shm,
    # so this shm_size likely has no effect — confirm.
    shm_size: '128gb'
    # ports:
    #   - "3228:2222"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - TMPDIR=/var/tmp
    # networks:
    #   - network_finetune
    # command: ["/usr/sbin/sshd", "-D"]
    # Reserve every NVIDIA GPU on the host for this service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
|
||||
|
||||
# networks:
|
||||
# network_finetune:
|
||||
# name: network_finetune
|
||||
Reference in New Issue
Block a user