Files
cdc_dockerfile/finetune
..
2024-06-12 16:53:36 +08:00
2024-06-12 16:53:36 +08:00
2024-06-21 14:31:53 +08:00
2024-06-21 14:31:53 +08:00
2024-06-21 13:46:40 +08:00
2024-06-20 16:17:56 +08:00
2024-06-21 14:31:53 +08:00
2024-06-21 14:31:53 +08:00
2024-06-21 14:31:53 +08:00
2024-06-12 16:53:36 +08:00
2024-06-21 11:51:54 +08:00
2024-06-20 16:18:14 +08:00
2024-06-20 16:18:14 +08:00
2024-06-12 16:53:36 +08:00

Build the DeepSpeed Docker images

docker-compose -f docker-compose_pytorch1.13.yml build
docker-compose -f docker-compose_pytorch2.3.yml build

Test command (start an interactive container from the built image)

docker run -it --gpus all --name deepspeed_test --shm-size=1gb --rm hotwa/deepspeed:latest /bin/bash

查询 GPU 架构（CUDA 计算能力），并据此给变量赋值

git clone https://github.com/NVIDIA-AI-IOT/deepstream_tlt_apps.git
cd deepstream_tlt_apps/TRT-OSS/x86
nvcc deviceQuery.cpp -o deviceQuery
./deviceQuery

在 H100 上的输出

(base) root@node19:~/bgpt/deepstream_tlt_apps/TRT-OSS/x86# ./deviceQuery
Detected 8 CUDA Capable device(s)

Device 0: "NVIDIA H100 80GB HBM3"
  CUDA Driver Version / Runtime Version          12.4 / 10.1
  CUDA Capability Major/Minor version number:    9.0

Device 1: "NVIDIA H100 80GB HBM3"
  CUDA Driver Version / Runtime Version          12.4 / 10.1
  CUDA Capability Major/Minor version number:    9.0

Device 2: "NVIDIA H100 80GB HBM3"
  CUDA Driver Version / Runtime Version          12.4 / 10.1
  CUDA Capability Major/Minor version number:    9.0

Device 3: "NVIDIA H100 80GB HBM3"
  CUDA Driver Version / Runtime Version          12.4 / 10.1
  CUDA Capability Major/Minor version number:    9.0

Device 4: "NVIDIA H100 80GB HBM3"
  CUDA Driver Version / Runtime Version          12.4 / 10.1
  CUDA Capability Major/Minor version number:    9.0

Device 5: "NVIDIA H100 80GB HBM3"
  CUDA Driver Version / Runtime Version          12.4 / 10.1
  CUDA Capability Major/Minor version number:    9.0

Device 6: "NVIDIA H100 80GB HBM3"
  CUDA Driver Version / Runtime Version          12.4 / 10.1
  CUDA Capability Major/Minor version number:    9.0

Device 7: "NVIDIA H100 80GB HBM3"
  CUDA Driver Version / Runtime Version          12.4 / 10.1
  CUDA Capability Major/Minor version number:    9.0