From 455b634bfeee0221845ab95d029f0a4475d35dbc Mon Sep 17 00:00:00 2001
From: Your Name
Date: Tue, 2 Jul 2024 02:45:55 +0000
Subject: [PATCH] update add device and id_rsa

---
 finetune/docker-compose_pytorch2.3_device.yml | 69 +++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 finetune/docker-compose_pytorch2.3_device.yml

diff --git a/finetune/docker-compose_pytorch2.3_device.yml b/finetune/docker-compose_pytorch2.3_device.yml
new file mode 100644
index 0000000..5211bca
--- /dev/null
+++ b/finetune/docker-compose_pytorch2.3_device.yml
@@ -0,0 +1,69 @@
+version: '3.8'  # NOTE(review): top-level `version` is obsolete in the Compose spec; kept for older docker-compose
+
+# DeepSpeed supports a number of C++/CUDA extensions (ops) intended to speed up deep-learning training and inference. The main DeepSpeed ops and what they do:
+
+# FusedAdam - fused Adam optimizer for GPUs.
+# FusedLamb - like FusedAdam, but for the LAMB optimizer; suited to large-scale distributed training.
+# SparseAttention - efficient computation of sparse attention.
+# Transformer - efficient implementation of Transformer models.
+# TransformerInference - inference optimizations specifically for Transformer models.
+# CPUAdam - Adam optimizer optimized for CPU.
+# CPULion - Lion optimizer for CPU.
+# Quantizer - quantization support to reduce model size and speed up inference.
+# RandomLTD - random layer-wise token dropping (random-LTD).
+# StochasticTransformer - training and inference support for stochastic Transformer models.
+
+services:
+  ubuntu-finetune:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      args:  # PyTorch / Python / pytorch_lightning version compatibility table: https://blog.csdn.net/qq_41813454/article/details/137421822
+        PYTHON_VERSION: "3.10"
+        CUDA_VERSION: "12.1.0"
+        PYTORCH_VERSION: "2.3.0"
+        TORCHVISION_VERSION: "0.18.0"
+        TORCHAUDIO_VERSION: "2.3.0"
+        DS_BUILD_OPS: "1"  # build args are passed to the Dockerfile as strings; quoted for consistency
+        USE_CUDA: "1"
+        USE_ROCM: "0"
+        USE_XPU: "0"
+        CUDA: "cu121"
+        CUDA_ARCH_LIST: "80;86;89;90"  # for RTX 4090, all : "80;86;89;90"
+        SETUPTOOLS_VERSION: "69.5.1"
+        DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a"  # 90a for H100 GPU 89:GeForce RTX 4090
+        DEEPSPEED_VERSION: "master"
+        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
+    volumes:
+      - ./src:/bbtft
+      - ./id_rsa_finetune:/root/.ssh/id_rsa  # NOTE(review): host private key is bind-mounted; keep it out of VCS, consider `:ro` if nothing chmods it
+      - ./id_rsa.pub:/root/.ssh/id_rsa.pub
+    container_name: ubuntu-finetune
+    pull_policy: if_not_present
+    # tty: true
+    restart: unless-stopped
+    image: hotwa/deepspeed:pt23
+    shm_size: '32gb'
+    ports:
+      - "3228:22"  # host 3228 -> container 22 (presumably sshd); quoted so YAML 1.1 parsers do not read it as a base-60 number
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - TMPDIR=/var/tmp
+    networks:
+      - network_finetune
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    cap_add:
+      - IPC_LOCK  # presumably needed to pin memory for RDMA over InfiniBand - confirm
+    devices:
+      - /dev/infiniband:/dev/infiniband  # expose the host InfiniBand device nodes inside the container
+
+networks:
+  network_finetune:
+    name: network_finetune