---
# Docker Compose definition for a single GPU + InfiniBand container named
# "colossalai". The container is built from Dockerfile.colossalai, runs on the
# host network, and starts sshd in the foreground as its main process.
#
# NOTE(review): the top-level `version` key is obsolete in the current Compose
# Specification; it is kept here for compatibility with older tooling.
version: '3.8'

services:
  colossalai:
    build:
      context: .
      dockerfile: Dockerfile.colossalai
      args:
        # Quoted so the build arg is passed as a string, not a YAML integer.
        CACHEBUST: "1"
        # CONDA_ENV_NAME: "mineru"
        # PYTHON_VERSION: "3.10"
        TAG_VERSION: "12.1.1"
        NV_DRIVER_VERSION: "535"
        GO_VERSION: "1.21.13"
        DCUTLASS_NVCC_ARCHS: "89,90a"
    # env_file:
    #   - .env_mpich
    volumes:
      # Host ../src (relative to this compose file) is mounted at /work.
      - ../src:/work
    container_name: colossalai
    pull_policy: if_not_present
    ulimits:
      # -1 means "unlimited" for both the soft and hard locked-memory limits.
      memlock:
        soft: -1
        hard: -1
    restart: unless-stopped
    image: hotwa/colossalai:latest
    privileged: true
    # NOTE(review): a privileged container normally already holds every
    # capability, so this cap_add list is likely redundant -- confirm before
    # removing.
    cap_add:
      - ALL
      - CAP_SYS_PTRACE
    # NOTE(review): 2000gb is roughly 2 TB of /dev/shm -- confirm this size is
    # intentional for the target hosts.
    shm_size: '2000gb'
    devices:
      # InfiniBand character devices exposed to the container.
      - /dev/infiniband/rdma_cm
      - /dev/infiniband/uverbs0
      - /dev/infiniband/uverbs1
      - /dev/infiniband/uverbs2
      - /dev/infiniband/uverbs3
      - /dev/infiniband/uverbs4
      - /dev/infiniband/uverbs5
      - /dev/infiniband/uverbs6
      - /dev/infiniband/uverbs7
      - /dev/infiniband/uverbs8
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - TMPDIR=/var/tmp
      - MAGIC_PDF_METHOD=auto
      - MAGIC_PDF_MODEL_MODE=full
      - MAGIC_PDF_INSIDE_MODEL=true
      - MAX_PROCESSES_PER_GPU=10
      - PDF_DIR=/data
      # - UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1
      - UCX_NET_DEVICES=mlx5_3:1
    network_mode: host
    # sshd -D keeps the daemon in the foreground so it is the container's
    # main process.
    command: ["/usr/sbin/sshd", "-D"]
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this service.
            - driver: nvidia
              count: all
              capabilities: [gpu]