update
This commit is contained in:
@@ -22,7 +22,7 @@ apt-get update
|
|||||||
# rdma-core: 包含 RDMA 驱动和库
|
# rdma-core: 包含 RDMA 驱动和库
|
||||||
# ibverbs-utils: 提供用于检查和配置 InfiniBand 设备的工具
|
# ibverbs-utils: 提供用于检查和配置 InfiniBand 设备的工具
|
||||||
# perftest: 提供用于测试 InfiniBand 性能的工具
|
# perftest: 提供用于测试 InfiniBand 性能的工具
|
||||||
apt-get install -y rdma-core ibverbs-utils perftest
|
apt-get install -y rdma-core ibverbs-utils perftest libibverbs-dev infiniband-diags
|
||||||
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2
|
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2
|
||||||
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
|
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
|
||||||
add-apt-repository ppa:git-core/ppa -y
|
add-apt-repository ppa:git-core/ppa -y
|
||||||
|
|||||||
@@ -8,6 +8,8 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
- PORT=1998
|
- PORT=1998
|
||||||
- PASS=P@88w0rd
|
- PASS=P@88w0rd
|
||||||
|
- NVIDIA_VISIBLE_DEVICES=all
|
||||||
|
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
||||||
volumes:
|
volumes:
|
||||||
- ./id_rsa_finetune:/root/.ssh/id_rsa
|
- ./id_rsa_finetune:/root/.ssh/id_rsa
|
||||||
- ./id_rsa.pub:/root/.ssh/id_rsa.pub
|
- ./id_rsa.pub:/root/.ssh/id_rsa.pub
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ services:
|
|||||||
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
||||||
- TMPDIR=/var/tmp
|
- TMPDIR=/var/tmp
|
||||||
networks:
|
networks:
|
||||||
- network_finetune
|
- my-custom-bridge
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
reservations:
|
reservations:
|
||||||
@@ -63,7 +63,9 @@ services:
|
|||||||
- IPC_LOCK
|
- IPC_LOCK
|
||||||
devices:
|
devices:
|
||||||
- /dev/infiniband:/dev/infiniband
|
- /dev/infiniband:/dev/infiniband
|
||||||
|
# docker swarm init
|
||||||
|
# docker swarm join-token manager
|
||||||
|
# docker network create -d overlay --subnet=192.168.200.0/24 my-overlay-network
|
||||||
networks:
|
networks:
|
||||||
network_finetune:
|
my-custom-bridge:
|
||||||
name: network_finetune
|
external: true
|
||||||
|
|||||||
58
finetune/docker-compose_stack.yml
Normal file
58
finetune/docker-compose_stack.yml
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
# Compose/stack definition for the `ubuntu-finetune` service.
# Intended to be deployed with the `docker stack deploy` command shown at the
# bottom of this file.
# NOTE(review): nesting was reconstructed following the Compose v3 schema;
# confirm against the committed file.
version: '3.8'

services:
  ubuntu-finetune:
    # NOTE(review): `docker stack deploy` documents `build` as ignored (it
    # needs a pre-built image) — presumably this section is only used by a
    # separate `docker compose build`; confirm.
    build:
      context: .
      dockerfile: Dockerfile
      # Build arguments passed to the Dockerfile. Version-like values are
      # quoted so they stay strings (e.g. "3.10" must not become 3.1).
      args:
        PYTHON_VERSION: "3.10"
        CUDA_VERSION: "12.1.0"
        PYTORCH_VERSION: "2.3.0"
        TORCHVISION_VERSION: "0.18.0"
        TORCHAUDIO_VERSION: "2.3.0"
        DS_BUILD_OPS: 1
        USE_CUDA: 1
        USE_ROCM: 0
        USE_XPU: 0
        CUDA: cu121
        CUDA_ARCH_LIST: "80;86;89;90"
        SETUPTOOLS_VERSION: "69.5.1"
        DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a"
        DEEPSPEED_VERSION: "master"
        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
    # Bind mounts: the source tree plus an SSH key pair placed in /root/.ssh.
    volumes:
      - ./src:/bbtft
      - ./id_rsa_finetune:/root/.ssh/id_rsa
      - ./id_rsa.pub:/root/.ssh/id_rsa.pub
    # NOTE(review): a fixed container name cannot be shared by several
    # replicas (see `replicas: 2` below); presumably ignored in swarm mode —
    # confirm.
    container_name: ubuntu-finetune
    image: hotwa/deepspeed:pt23
    shm_size: '32gb'
    ports:
      # Host port 3228 -> container port 22. Quoted because YAML 1.1 loaders
      # can read an unquoted digits:digits scalar as a base-60 integer.
      - "3228:22"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - TMPDIR=/var/tmp
    networks:
      - my-custom-bridge
    deploy:
      replicas: 2
      resources:
        reservations:
          # Reserve 8 units of the generic resource kind `nvidia` per task.
          generic_resources:
            - discrete_resource_spec:
                kind: nvidia
                value: 8
      placement:
        constraints: [node.platform.os == linux]
    cap_add:
      - IPC_LOCK
    # NOTE(review): `devices` is documented as not supported by
    # `docker stack deploy`; verify that /dev/infiniband is actually visible
    # inside the swarm tasks.
    devices:
      - /dev/infiniband:/dev/infiniband

# The network is not created by this file; it must already exist.
# NOTE(review): stack services need a swarm-scoped network — confirm that
# `my-custom-bridge` was created with a suitable driver.
networks:
  my-custom-bridge:
    external: true

# docker stack deploy -c docker-compose_stack.yml rdma_stack
|
||||||
Reference in New Issue
Block a user