# Compose definition for a single GPU container that builds the DeepSpeed
# test image from Dockerfile.ldh and runs an SSH daemon in the foreground.
services:
  ldh-deepspeed-test:
    build:
      context: .
      dockerfile: Dockerfile.ldh
      args:
        # Optional build arguments, currently disabled. Uncomment to pass
        # them to the Dockerfile (presumably overriding defaults declared
        # there -- verify against Dockerfile.ldh).
        # PYTHON_VERSION: "3.10"
        # CUDA_VERSION: "12.1.0"
        # PYTORCH_VERSION: "2.3.0"
        # TORCHVISION_VERSION: "0.18.0"
        # TORCHAUDIO_VERSION: "2.3.0"
        # DS_BUILD_OPS: 1
        # USE_CUDA: 1
        # USE_ROCM: 0
        # USE_XPU: 0
        # CUDA: cu121
        # CUDA_ARCH_LIST: "80;86;89;90"  # for RTX 4090, all: "80;86;89;90"
        # SETUPTOOLS_VERSION: "69.5.1"
        # 90a is for H100 GPUs; 89 is GeForce RTX 4090.
        # DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a"
        # DEEPSPEED_VERSION: "master"
        # DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"

        # Proxy used while building the image.
        # NOTE(review): 127.0.0.1 resolves to the build container's own
        # loopback unless the build runs with host networking -- confirm the
        # proxy is actually reachable from build steps.
        HTTP_PROXY: "http://127.0.0.1:15777"
        HTTPS_PROXY: "http://127.0.0.1:15777"
      # cache-from: "type=local"
    image: ldh/deepspeed:test
    # Size of /dev/shm inside the container.
    shm_size: '128gb'
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this service.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # runtime: nvidia
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    # stdin_open: true
    # tty: true
    # NOTE(review): a privileged container already holds every capability,
    # so the explicit IPC_LOCK below is redundant while `privileged: true`
    # is set; it is kept so the requirement stays visible if privileged mode
    # is ever dropped.
    privileged: true
    cap_add:
      - IPC_LOCK
    volumes:
      - /root/workspace:/root/data
      # Expose the host's InfiniBand device nodes to the container.
      - /dev/infiniband:/dev/infiniband
    # Port publishing and the overlay network are disabled; the service
    # shares the host network stack instead (see network_mode below).
    # ports:
    #   - "22242:22242"
    #   - "5000:5000"
    # networks:
    #   - ldh_overlay_network
    network_mode: host
    # Run sshd in the foreground (-D) as the container's main process.
    command: ["/usr/sbin/sshd", "-D"]

# networks:
#   ldh_overlay_network:
#     external: true