Merged specific files from main branch into devgpu

This commit is contained in:
Your Name
2024-07-17 05:00:46 +00:00
parent 86efe1122c
commit 2d056396a8
3 changed files with 253 additions and 0 deletions

View File

@@ -0,0 +1,72 @@
# Compose file format version (newer Docker Compose releases treat this key
# as informational only).
version: '3.9'
# DeepSpeed ships a number of C++/CUDA extension ops that are meant to speed
# up deep-learning training and inference. The main ops and what they do:
#   FusedAdam             - fused Adam optimizer for GPUs.
#   FusedLamb             - like FusedAdam, but for the LAMB optimizer; suited
#                           to large-scale distributed training.
#   SparseAttention       - efficient computation of sparse attention.
#   Transformer           - efficient implementation of Transformer models.
#   TransformerInference  - inference-specific Transformer optimizations.
#   CPUAdam               - Adam optimizer optimized for CPUs.
#   CPULion               - Lion optimizer for CPUs.
#   Quantizer             - quantization support, to shrink models and speed
#                           up inference.
#   RandomLTD             - random layerwise token dropping (random-LTD).
#   StochasticTransformer - training and inference of stochastic Transformer
#                           models.
#
# Shell snippet for detecting total system memory (in GB), e.g. to choose a
# value for shm_size when generating this file:
# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo)
# echo "Docker Compose file generated; shm_size set to ${TOTAL_MEM}GB."
services:
  # GPU fine-tuning container built from Dockerfile.ngc and tagged as
  # hotwa/notebook:ngc.
  ubuntu-finetune:
    build:
      context: .
      dockerfile: Dockerfile.ngc
      # Compatibility table for PyTorch / Python / pytorch_lightning versions:
      # https://blog.csdn.net/qq_41813454/article/details/137421822
      args:
        # Base image coordinates; presumably assembled by Dockerfile.ngc into
        # nvcr.io/nvidia/pytorch:24.06-py3 -- confirm against the Dockerfile.
        REGISTRY: "nvcr.io"
        OWNER: "nvidia"
        LABEL: "pytorch"
        VERSION: "24.06-py3"
        # Build args are passed to the Dockerfile as strings, so numeric-looking
        # values are quoted to avoid implicit YAML typing.
        DS_BUILD_OPS: "1"
        DEEPSPEED_VERSION: "master"
        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
        # NOTE(review): these proxies point at loopback. Unless the build runs
        # on the host network, 127.0.0.1 inside a build step is the build
        # container itself, not the host -- confirm the proxy is reachable.
        HTTP_PROXY: "http://127.0.0.1:15777"
        HTTPS_PROXY: "http://127.0.0.1:15777"
        # Presumably changed to invalidate the Docker layer cache -- confirm
        # against Dockerfile.ngc.
        CACHEBUST: "1"
    # volumes:
    #   - ./workspace:/workspace
    #   - /tmp:/tmp
    container_name: ubuntu-ngc
    pull_policy: if_not_present
    ulimits:
      # -1 removes the locked-memory limit (soft and hard).
      memlock:
        soft: -1
        hard: -1
    # tty: true
    # stdin_open: true
    restart: unless-stopped
    image: hotwa/notebook:ngc
    privileged: true
    ipc: host
    network_mode: host
    # NOTE(review): with `ipc: host` the container shares the host's /dev/shm,
    # so this size likely has no effect -- confirm before relying on it.
    shm_size: '128gb'
    # ports:
    #   - "3228:2222"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - TMPDIR=/var/tmp
    # networks:
    #   - network_finetune
    # command: ["/usr/sbin/sshd", "-D"]
    deploy:
      resources:
        reservations:
          # Reserve every NVIDIA GPU on the host for this service.
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
# networks:
# network_finetune:
# name: network_finetune