This commit is contained in:
Your Name
2024-07-02 06:08:08 +00:00
parent 4081db7ce0
commit 54a2227837
4 changed files with 67 additions and 5 deletions

View File

@@ -22,7 +22,7 @@ apt-get update
# rdma-core: 包含 RDMA 驱动和库 # rdma-core: 包含 RDMA 驱动和库
# ibverbs-utils: 提供用于检查和配置 InfiniBand 设备的工具 # ibverbs-utils: 提供用于检查和配置 InfiniBand 设备的工具
# perftest: 提供用于测试 InfiniBand 性能的工具 # perftest: 提供用于测试 InfiniBand 性能的工具
apt-get install -y rdma-core ibverbs-utils perftest apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2 apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
add-apt-repository ppa:git-core/ppa -y add-apt-repository ppa:git-core/ppa -y

View File

@@ -8,6 +8,8 @@ services:
environment: environment:
- PORT=1998 - PORT=1998
- PASS=P@88w0rd - PASS=P@88w0rd
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
volumes: volumes:
- ./id_rsa_finetune:/root/.ssh/id_rsa - ./id_rsa_finetune:/root/.ssh/id_rsa
- ./id_rsa.pub:/root/.ssh/id_rsa.pub - ./id_rsa.pub:/root/.ssh/id_rsa.pub

View File

@@ -51,7 +51,7 @@ services:
- NVIDIA_DRIVER_CAPABILITIES=compute,utility - NVIDIA_DRIVER_CAPABILITIES=compute,utility
- TMPDIR=/var/tmp - TMPDIR=/var/tmp
networks: networks:
- network_finetune - my-custom-bridge
deploy: deploy:
resources: resources:
reservations: reservations:
@@ -63,7 +63,9 @@ services:
- IPC_LOCK - IPC_LOCK
devices: devices:
- /dev/infiniband:/dev/infiniband - /dev/infiniband:/dev/infiniband
# docker swarm init
# docker swarm join-token manager
# docker network create -d overlay --subnet=192.168.200.0/24 my-overlay-network
networks: networks:
network_finetune: my-custom-bridge:
name: network_finetune external: true

View File

@@ -0,0 +1,58 @@
---
# Compose stack definition for the `ubuntu-finetune` service.
# Deploy with:
#   docker stack deploy -c docker-compose_stack.yml rdma_stack
#
# NOTE(review): according to the Compose v3 reference, `docker stack deploy`
# ignores `build`, `container_name` and `devices` (and, on older engines,
# `cap_add`). Those keys are kept so the same file still works with
# `docker compose build` / `up` -- confirm which workflow this file targets.
version: '3.8'

services:
  ubuntu-finetune:
    build:
      context: .
      dockerfile: Dockerfile
      # Build args are handed to the Dockerfile as strings, so every value is
      # quoted to keep YAML from re-typing it (e.g. 3.10 -> 3.1).
      args:
        PYTHON_VERSION: "3.10"
        CUDA_VERSION: "12.1.0"
        PYTORCH_VERSION: "2.3.0"
        TORCHVISION_VERSION: "0.18.0"
        TORCHAUDIO_VERSION: "2.3.0"
        DS_BUILD_OPS: "1"
        USE_CUDA: "1"
        USE_ROCM: "0"
        USE_XPU: "0"
        CUDA: cu121
        CUDA_ARCH_LIST: "80;86;89;90"
        SETUPTOOLS_VERSION: "69.5.1"
        DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a"
        DEEPSPEED_VERSION: "master"
        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
    volumes:
      - ./src:/bbtft
      # SSH key pair mounted into root's ~/.ssh inside the container.
      - ./id_rsa_finetune:/root/.ssh/id_rsa
      - ./id_rsa.pub:/root/.ssh/id_rsa.pub
    # NOTE(review): a fixed container name cannot be shared by more than one
    # container, which conflicts with `replicas: 2` below when run with
    # plain Compose -- confirm.
    container_name: ubuntu-finetune
    image: hotwa/deepspeed:pt23
    shm_size: '32gb'
    ports:
      # Quoted: an unquoted HOST:CONTAINER pair whose last field is below 60
      # is read as a base-60 integer by YAML 1.1 parsers.
      - "3228:22"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - TMPDIR=/var/tmp
    networks:
      - my-custom-bridge
    deploy:
      replicas: 2
      resources:
        reservations:
          # Reserves 8 units of the generic resource kind `nvidia` per task.
          generic_resources:
            - discrete_resource_spec:
                kind: nvidia
                value: 8
      placement:
        constraints:
          - node.platform.os == linux
    cap_add:
      - IPC_LOCK
    devices:
      - /dev/infiniband:/dev/infiniband

networks:
  # `external: true` means this file does not create the network; it must
  # already exist before the stack is deployed.
  # NOTE(review): the name suggests a bridge network, while swarm services
  # normally attach to a swarm-scoped (overlay) network -- confirm.
  my-custom-bridge:
    external: true