# Stack definition for the `ubuntu-finetune` DeepSpeed fine-tuning service.
#
# Usage:
#   docker stack deploy -c docker-compose_stack.yml rdma_stack
#
# NOTE(review): per the Compose file v3 reference, `build`, `container_name`
# and `devices` are not honored by `docker stack deploy`; they only take
# effect with `docker compose`. Confirm which deployment path is intended.
version: '3.8'

services:
  ubuntu-finetune:
    build:
      context: .
      dockerfile: Dockerfile
      # Build arguments forwarded to the Dockerfile. All values are quoted so
      # they stay strings (e.g. "3.10" must not be read as the float 3.1).
      args:
        PYTHON_VERSION: "3.10"
        CUDA_VERSION: "12.1.0"
        PYTORCH_VERSION: "2.3.0"
        TORCHVISION_VERSION: "0.18.0"
        TORCHAUDIO_VERSION: "2.3.0"
        DS_BUILD_OPS: "1"
        USE_CUDA: "1"
        USE_ROCM: "0"
        USE_XPU: "0"
        CUDA: "cu121"
        CUDA_ARCH_LIST: "80;86;89;90"
        SETUPTOOLS_VERSION: "69.5.1"
        DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a"
        DEEPSPEED_VERSION: "master"
        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
    volumes:
      # Source tree, bind-mounted at /bbtft inside the container.
      - ./src:/bbtft
      # SSH key pair bind-mounted into root's ~/.ssh.
      - ./id_rsa_finetune:/root/.ssh/id_rsa
      - ./id_rsa.pub:/root/.ssh/id_rsa.pub
    # NOTE(review): a fixed container name cannot be shared by more than one
    # container, which conflicts with `deploy.replicas: 2` below when run
    # through `docker compose` — confirm.
    container_name: ubuntu-finetune
    image: hotwa/deepspeed:pt23
    shm_size: '32gb'
    ports:
      # Host port 3228 -> container port 22 (presumably sshd — confirm).
      # Quoted: an unquoted digits-and-colon scalar can be parsed as a
      # base-60 integer by YAML 1.1 loaders.
      - "3228:22"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - TMPDIR=/var/tmp
    networks:
      - my-custom-bridge
    deploy:
      replicas: 2
      resources:
        reservations:
          # Reserve one "NVIDIA-GPU" generic resource per task.
          generic_resources:
            - discrete_resource_spec:
                kind: "NVIDIA-GPU"
                value: 1
      placement:
        constraints:
          - node.platform.os == linux
    cap_add:
      - IPC_LOCK
    devices:
      - /dev/infiniband:/dev/infiniband

networks:
  # Pre-existing network; it is not created by this file.
  my-custom-bridge:
    external: true