This commit is contained in:
Your Name
2024-07-02 07:43:07 +00:00
parent 6df4ef5f17
commit 4bf09ad538
2 changed files with 69 additions and 2 deletions

View File

@@ -18,10 +18,20 @@ services:
kind: gpus kind: gpus
value: 1 value: 1
placement: placement:
constraints: [node.platform.os == linux] constraints:
- node.labels.gpu == true
cap_add: cap_add:
- IPC_LOCK - IPC_LOCK
networks: networks:
default: default:
driver: overlay driver: overlay
# Add the gpu label to the node(s), then deploy the stack:
# docker node ls
# docker node update --label-add gpu=true node1
# docker stack deploy -c docker-compose.yml rdma_stack

View File

@@ -0,0 +1,57 @@
version: '3.8'
services:
  # Fine-tuning container: built from the local Dockerfile and tagged as
  # hotwa/deepspeed:pt23.
  ubuntu-finetune:
    build:
      context: .
      dockerfile: Dockerfile
      # Build arguments passed to the Dockerfile. All values are quoted so
      # they reach the build as the exact strings written here.
      args:
        PYTHON_VERSION: "3.10"
        CUDA_VERSION: "12.1.0"
        PYTORCH_VERSION: "2.3.0"
        TORCHVISION_VERSION: "0.18.0"
        TORCHAUDIO_VERSION: "2.3.0"
        DS_BUILD_OPS: "1"
        USE_CUDA: "1"
        USE_ROCM: "0"
        USE_XPU: "0"
        CUDA: cu121
        CUDA_ARCH_LIST: "80;86;89;90"
        SETUPTOOLS_VERSION: "69.5.1"
        DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a"
        DEEPSPEED_VERSION: "master"
        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
    volumes:
      # Project sources, mounted at /bbtft inside the container.
      - ./src:/bbtft
      # SSH key pair mounted into root's ~/.ssh.
      - ./id_rsa_finetune:/root/.ssh/id_rsa
      - ./id_rsa.pub:/root/.ssh/id_rsa.pub
    container_name: ubuntu-finetune
    image: hotwa/deepspeed:pt23
    shm_size: '32gb'
    ports:
      # Host port 3228 -> container port 22 (presumably sshd; confirm).
      # Quoted on purpose: an unquoted HOST:CONTAINER pair whose container
      # port is below 60 can be read as a base-60 integer by YAML 1.1 loaders.
      - "3228:22"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - TMPDIR=/var/tmp
    networks:
      - my-custom-bridge
    deploy:
      replicas: 1
      resources:
        reservations:
          # Reserve one unit of the generic resource kind "nvidia".
          generic_resources:
            - discrete_resource_spec:
                kind: nvidia
                value: 1
      placement:
        # Only schedule on nodes carrying the label gpu=true.
        constraints:
          - node.labels.gpu == true
    runtime: nvidia
    cap_add:
      # NOTE(review): presumably needed to pin memory for RDMA, given the
      # InfiniBand device mapping below; confirm.
      - IPC_LOCK
    devices:
      - /dev/infiniband:/dev/infiniband
# Networks referenced by the services in this file.
networks:
  # Declared external: Compose does not create this network and expects it
  # to exist already under this name.
  my-custom-bridge:
    external: true