# Compose definition for the `ubuntu-finetune` service: builds (or pulls) the
# hotwa/deepspeed:pt23 image and runs it with NVIDIA GPU access, a large
# /dev/shm, and container port 22 published on host port 3228.
version: '3.8'

services:
  ubuntu-finetune:
    build:
      context: .
      dockerfile: Dockerfile
      # Build args are passed to the Dockerfile as strings; every value is
      # quoted so YAML never re-types it (e.g. 3.10 -> 3.1, or 1 -> int).
      args:
        PYTHON_VERSION: "3.10"
        CUDA_VERSION: "12.1.0"
        PYTORCH_VERSION: "2.3.0"
        TORCHVISION_VERSION: "0.18.0"
        TORCHAUDIO_VERSION: "2.3.0"
        DS_BUILD_OPS: "1"
        USE_CUDA: "1"
        USE_ROCM: "0"
        USE_XPU: "0"
        CUDA: "cu121"
        CUDA_ARCH_LIST: "80;86;89;90"
        SETUPTOOLS_VERSION: "69.5.1"
        DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a"
        DEEPSPEED_VERSION: "master"
        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
    volumes:
      # tmpfs mounted at /dev/shm; size is given in bytes.
      - type: tmpfs
        target: /dev/shm
        tmpfs:
          size: 32000000000  # 32 GB (decimal)
      # - ./src:/bbtft
      # - ./id_rsa_finetune:/root/.ssh/id_rsa
      # - ./id_rsa.pub:/root/.ssh/id_rsa.pub
    # container_name: ubuntu-finetune
    image: hotwa/deepspeed:pt23
    # NOTE(review): /dev/shm is also sized by the tmpfs mount under `volumes`;
    # confirm which of the two settings is meant to take effect.
    shm_size: '32gb'
    ports:
      # Quoted: an unquoted digits:digits scalar can be parsed as a base-60
      # integer by YAML 1.1 parsers instead of a HOST:CONTAINER mapping.
      - "3228:22"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - TMPDIR=/var/tmp
    # networks:
    #   - my-custom-bridge
    # NOTE(review): replicas, generic_resources and node-label placement look
    # aimed at a Swarm deployment (docker stack deploy) -- confirm how this
    # file is launched, since a single host port is published for 4 replicas.
    deploy:
      replicas: 4
      resources:
        reservations:
          generic_resources:
            - discrete_resource_spec:
                kind: "NVIDIA-GPU"
                value: 8
            - discrete_resource_spec:
                kind: "SRIOV-VF"
                value: 1
      placement:
        constraints:
          # Kept as a single string expression; quoting stops `true` from
          # being read as a YAML boolean.
          - "node.labels.gpu == true"
    cap_add:
      - IPC_LOCK
    privileged: true

# networks:
#   my-custom-bridge:
#     external: true