From 4bf09ad53880b2d26f1a9395404f41b824644d36 Mon Sep 17 00:00:00 2001
From: Your Name
Date: Tue, 2 Jul 2024 07:43:07 +0000
Subject: [PATCH] update

---
 finetune/docker-compose_stack1.yml | 14 ++++++--
 finetune/docker-compose_stack2.yml | 57 ++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 2 deletions(-)
 create mode 100644 finetune/docker-compose_stack2.yml

diff --git a/finetune/docker-compose_stack1.yml b/finetune/docker-compose_stack1.yml
index b8e4497..cfd3b0c 100644
--- a/finetune/docker-compose_stack1.yml
+++ b/finetune/docker-compose_stack1.yml
@@ -18,10 +18,20 @@ services:
                 kind: gpus
                 value: 1
       placement:
-        constraints: [node.platform.os == linux]
+        constraints:
+          - node.labels.gpu == true
     cap_add:
       - IPC_LOCK
 
 networks:
   default:
-    driver: overlay
\ No newline at end of file
+    driver: overlay
+
+# Label a node so the `node.labels.gpu == true` placement constraint matches:
+# docker node ls
+
+
+# docker node update --label-add gpu=true node1
+
+# docker stack deploy -c docker-compose_stack1.yml rdma_stack
+
diff --git a/finetune/docker-compose_stack2.yml b/finetune/docker-compose_stack2.yml
new file mode 100644
index 0000000..0ba7bb3
--- /dev/null
+++ b/finetune/docker-compose_stack2.yml
@@ -0,0 +1,57 @@
+version: '3.8'
+
+services:
+  ubuntu-finetune:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      args:
+        PYTHON_VERSION: "3.10"
+        CUDA_VERSION: "12.1.0"
+        PYTORCH_VERSION: "2.3.0"
+        TORCHVISION_VERSION: "0.18.0"
+        TORCHAUDIO_VERSION: "2.3.0"
+        DS_BUILD_OPS: 1
+        USE_CUDA: 1
+        USE_ROCM: 0
+        USE_XPU: 0
+        CUDA: cu121
+        CUDA_ARCH_LIST: "80;86;89;90"
+        SETUPTOOLS_VERSION: "69.5.1"
+        DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a"
+        DEEPSPEED_VERSION: "master"
+        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
+    volumes:
+      - ./src:/bbtft
+      - ./id_rsa_finetune:/root/.ssh/id_rsa
+      - ./id_rsa.pub:/root/.ssh/id_rsa.pub
+    container_name: ubuntu-finetune
+    image: hotwa/deepspeed:pt23
+    shm_size: '32gb'
+    ports:
+      - "3228:22"  # quoted: an unquoted xx:yy value can be parsed as a base-60 integer
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - TMPDIR=/var/tmp
+    networks:
+      - my-custom-bridge
+    deploy:
+      replicas: 1
+      resources:
+        reservations:
+          generic_resources:
+            - discrete_resource_spec:
+                kind: nvidia
+                value: 1
+      placement:
+        constraints: [node.labels.gpu == true]
+    runtime: nvidia  # NOTE(review): confirm the consuming tool accepts `runtime` together with `deploy`
+    cap_add:
+      - IPC_LOCK
+    devices:
+      - /dev/infiniband:/dev/infiniband
+
+networks:
+  my-custom-bridge:
+    external: true