version: '3.8'

services:
  # Builds the image tagged hotwa/deepspeed:pt23 from the local Dockerfile and
  # runs it as a replicated service with GPU resource reservations.
  ubuntu-finetune:
    build:
      context: .
      dockerfile: Dockerfile
      # Build arguments passed to the Dockerfile. Every value is quoted so it
      # is handed over as the exact string written here (e.g. "3.10" must not
      # be read as the float 3.1, and flags stay "1"/"0" rather than ints).
      args:
        PYTHON_VERSION: "3.10"
        CUDA_VERSION: "12.1.0"
        PYTORCH_VERSION: "2.3.0"
        TORCHVISION_VERSION: "0.18.0"
        TORCHAUDIO_VERSION: "2.3.0"
        DS_BUILD_OPS: "1"
        USE_CUDA: "1"
        USE_ROCM: "0"
        USE_XPU: "0"
        CUDA: "cu121"
        CUDA_ARCH_LIST: "80;86;89;90"
        SETUPTOOLS_VERSION: "69.5.1"
        DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a"
        DEEPSPEED_VERSION: "master"
        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
    volumes:
      # tmpfs mount over /dev/shm; size is given in bytes.
      # NOTE(review): `shm_size` below also sizes /dev/shm, and the two values
      # are not the same number of bytes if '32gb' is read as GiB - confirm
      # which setting is meant to win.
      - type: tmpfs
        target: /dev/shm
        tmpfs:
          size: 32000000000  # 32GB
      # - ./src:/bbtft
      # - ./id_rsa_finetune:/root/.ssh/id_rsa
      # - ./id_rsa.pub:/root/.ssh/id_rsa.pub
    # container_name: ubuntu-finetune
    image: hotwa/deepspeed:pt23
    shm_size: '32gb'
    ports:
      # Host port 3228 -> container port 22 (presumably SSH - confirm).
      # Quoted on purpose: an unquoted digits-and-colon value such as 3228:22
      # is parsed as a base-60 integer by YAML 1.1 loaders.
      - "3228:22"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - TMPDIR=/var/tmp
    # networks:
    #   - my-custom-bridge
    deploy:
      replicas: 4
      resources:
        reservations:
          # Per-replica reservations of node-advertised generic resources.
          generic_resources:
            - discrete_resource_spec:
                kind: "NVIDIA-GPU"
                value: 8
            - discrete_resource_spec:
                kind: "SRIOV-VF"
                value: 1
      placement:
        # Only schedule onto nodes whose `gpu` label equals "true".
        constraints:
          - "node.labels.gpu == true"
    # NOTE(review): `privileged: true` is set as well; IPC_LOCK is presumably
    # already covered by it - confirm whether both are needed.
    cap_add:
      - IPC_LOCK
    privileged: true

# networks:
#   my-custom-bridge:
#     external: true