first add
This commit is contained in:
350
finetune/README.md
Normal file
350
finetune/README.md
Normal file
@@ -0,0 +1,350 @@
|
||||
## deepspeed docker image build
|
||||
|
||||
```shell
|
||||
docker-compose -f docker-compose_pytorch1.13.yml build
|
||||
docker-compose -f docker-compose_pytorch2.3.yml build
|
||||
```
|
||||
|
||||
## 英伟达显卡驱动的安装与卸载
|
||||
|
||||
卸载
|
||||
|
||||
```shell
|
||||
cd /usr/local/cuda
|
||||
ll
|
||||
cd ..
|
||||
cd cuda-12.3/
|
||||
ll
|
||||
cd bin/
|
||||
ll
|
||||
./cuda-uninstaller
|
||||
cd ~
|
||||
nvidia-uninstall
|
||||
sudo modprobe -r nvidia-drm nvidia-modeset nvidia-uvm nvidia
|
||||
sudo rm -rf /usr/lib64/nvidia /usr/lib/nvidia
|
||||
sudo apt autoremove 'nvidia*'
|
||||
sudo apt clean all
|
||||
sudo dracut --force
|
||||
sudo reboot
|
||||
```
|
||||
|
||||
安装
|
||||
|
||||
```shell
|
||||
wget https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64/nvidia-fabricmanager-555_555.42.06-1_amd64.deb
|
||||
dpkg -i nvidia-fabricmanager-555_555.42.06-1_amd64.deb
|
||||
wget https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/cuda_12.5.1_555.42.06_linux.run
|
||||
ll
|
||||
sudo sh cuda_12.5.1_555.42.06_linux.run
|
||||
echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc && echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc && source ~/.bashrc
|
||||
nvcc -V
|
||||
nvidia-smi
|
||||
nvidia-smi -pm 1
|
||||
modprobe nvidia_peermem
|
||||
nvidia-smi
|
||||
modinfo nvidia_peermem
|
||||
lsmod | grep nvidia_peermem
|
||||
systemctl mask apt-daily-upgrade.service
|
||||
systemctl mask apt-daily-upgrade.timer
|
||||
systemctl disable apt-daily-upgrade.timer
|
||||
systemctl disable apt-daily-upgrade.service
|
||||
ll
|
||||
wget https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64/nvidia-fabricmanager-555_555.42.06-1_amd64.deb
|
||||
dpkg -i nvidia-fabricmanager-555_555.42.06-1_amd64.deb
|
||||
sudo systemctl start nvidia-fabricmanager
|
||||
sudo systemctl status nvidia-fabricmanager
|
||||
```
|
||||
|
||||
## 镜像测试命令
|
||||
|
||||
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all ldh/deepspeed:test
|
||||
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/deepspeed:pt23_update
|
||||
docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/pytorch:24.06-py3 /bin/bash
|
||||
|
||||
pip3 install -U xformers --index-url https://mirror.sjtu.edu.cn/pytorch-wheels
|
||||
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
pip3 install -U xformers --index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
|
||||
```shell
|
||||
1 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
|
||||
2 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers
|
||||
3 curl -ksSL http://120.232.240.71:8887/linux/install.sh | bash
|
||||
4 pigchacli
|
||||
5 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777
|
||||
6 export https_proxy=http://127.0.0.1:15777 http_proxy=http://127.0.0.1:15777
|
||||
7 pip install -v -U git+https://ghproxy.dockless.eu.org/https://github.com/facebookresearch/xformers.git@main#egg=xformers
|
||||
8 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
|
||||
9 python -c "from xformers import ops as xops"
|
||||
10 python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
|
||||
11 env
|
||||
12 pip install git+https://github.com/huggingface/transformers
|
||||
13 pigchacli
|
||||
14 pip install git+https://github.com/huggingface/transformers
|
||||
15 pip list
|
||||
16 export STAGE_DIR=/tmp
|
||||
17 git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
|
||||
18 cd ${STAGE_DIR}/oneCCL
|
||||
19 git checkout .
|
||||
20 git checkout master
|
||||
21 mkdir build
|
||||
22 cd build
|
||||
23 cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
|
||||
24 make -j"$(nproc)" install
|
||||
25 ls
|
||||
26 echo ${CUDA_ARCH_LIST}
|
||||
27 git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
|
||||
28 cd ${STAGE_DIR}/DeepSpeed-Kernels
|
||||
29 python -m pip install -v .
|
||||
30 env
|
||||
31 python -m pip install -v .
|
||||
32 git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
|
||||
33 cd ${STAGE_DIR}/DeepSpeed
|
||||
34 export DEEPSPEED_VERSION="v0.14.3"
|
||||
35 git checkout ${DEEPSPEED_VERSION}
|
||||
36 ls
|
||||
37 ./install.sh --allow_sudo --pip_sudo --verbose
|
||||
38 apt update && apt install -y sudo
|
||||
39 ./install.sh --allow_sudo --pip_sudo --verbose
|
||||
```
|
||||
|
||||
```shell
|
||||
nvidia-smi
|
||||
nvcc -V
|
||||
ninja --version
|
||||
ds_report
|
||||
python -c "import torch; print('torch:', torch.__version__, torch)"
|
||||
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
|
||||
python -c "import deepspeed; deepspeed.ops.op_builder.CPUAdamBuilder().load()"
|
||||
python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func"
|
||||
python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
|
||||
python -c "from xformers import ops as xops"
|
||||
ibstat
|
||||
ofed_info -s
|
||||
mst version
|
||||
mpirun --version
|
||||
```
|
||||
|
||||
```shell
|
||||
cat <<'EOF' > ~/compile_deepspeed_ops.py
import deepspeed

def compile_ops():
    builders = [
        deepspeed.ops.op_builder.AsyncIOBuilder,
        deepspeed.ops.op_builder.FusedAdamBuilder,
        deepspeed.ops.op_builder.CPUAdamBuilder,
        deepspeed.ops.op_builder.CPUAdagradBuilder,
        deepspeed.ops.op_builder.CPULionBuilder,
        deepspeed.ops.op_builder.EvoformerAttnBuilder,
        deepspeed.ops.op_builder.FPQuantizerBuilder,
        deepspeed.ops.op_builder.FusedLambBuilder,
        deepspeed.ops.op_builder.FusedLionBuilder,
        deepspeed.ops.op_builder.QuantizerBuilder,
        deepspeed.ops.op_builder.RaggedOpsBuilder,
        deepspeed.ops.op_builder.RandomLTDBuilder,
        deepspeed.ops.op_builder.SparseAttnBuilder,
        deepspeed.ops.op_builder.SpatialInferenceBuilder,
        deepspeed.ops.op_builder.TransformerBuilder,
        deepspeed.ops.op_builder.StochasticTransformerBuilder,
    ]

    for builder in builders:
        print(f"Compiling {builder.__name__}")
        builder().load()

if __name__ == "__main__":
    compile_ops()
EOF
python ~/compile_deepspeed_ops.py
|
||||
```
|
||||
|
||||
## 配置 VS Code 的 Docker 插件
|
||||
|
||||
[nerdctl配置](https://blog.csdn.net/margu_168/article/details/139822555)
|
||||
|
||||
|
||||
|
||||
```shell
|
||||
cat << 'EOF' > /usr/local/bin/docker
|
||||
#!/bin/bash
|
||||
exec nerdctl "$@"
|
||||
EOF
|
||||
chmod +x /usr/local/bin/docker
|
||||
```
|
||||
|
||||
nerdctl bash自动补全
|
||||
|
||||
```shell
|
||||
apt update
|
||||
apt install bash-completion -y
|
||||
nerdctl completion bash > /etc/bash_completion.d/nerdctl
|
||||
nerdctl completion bash > /etc/bash_completion.d/docker
|
||||
source /etc/bash_completion.d/nerdctl
|
||||
source /etc/bash_completion.d/docker
|
||||
```
|
||||
|
||||
## 物理机更新内核
|
||||
|
||||
```shell
|
||||
uname -r # 5.4.0-144-generic
|
||||
lsb_release -a
|
||||
sudo apt-get update # This will update the repositories list
|
||||
sudo apt-get upgrade # This will update all the necessary packages on your system
|
||||
sudo apt-get dist-upgrade # This will add/remove any needed packages
|
||||
reboot # You may need this since sometimes after an upgrade/dist-upgrade, there are some leftover entries that get fixed after a reboot
|
||||
sudo apt-get install linux-headers-$(uname -r) # This should work now
|
||||
```
|
||||
|
||||
## test command
|
||||
|
||||
```shell
|
||||
docker run -it --gpus all --name deepspeed_test --shm-size=1gb --rm hotwa/deepspeed:latest /bin/bash
|
||||
```
|
||||
|
||||
## [查询GPU 架构 给变量赋值](https://blog.csdn.net/zong596568821xp/article/details/106411024)
|
||||
|
||||
```shell
|
||||
git clone https://github.com/NVIDIA-AI-IOT/deepstream_tlt_apps.git
|
||||
cd deepstream_tlt_apps/TRT-OSS/x86
|
||||
nvcc deviceQuery.cpp -o deviceQuery
|
||||
./deviceQuery
|
||||
```
|
||||
|
||||
H100 输出
|
||||
|
||||
```text
|
||||
(base) root@node19:~/bgpt/deepstream_tlt_apps/TRT-OSS/x86# ./deviceQuery
|
||||
Detected 8 CUDA Capable device(s)
|
||||
|
||||
Device 0: "NVIDIA H100 80GB HBM3"
|
||||
CUDA Driver Version / Runtime Version 12.4 / 10.1
|
||||
CUDA Capability Major/Minor version number: 9.0
|
||||
|
||||
Device 1: "NVIDIA H100 80GB HBM3"
|
||||
CUDA Driver Version / Runtime Version 12.4 / 10.1
|
||||
CUDA Capability Major/Minor version number: 9.0
|
||||
|
||||
Device 2: "NVIDIA H100 80GB HBM3"
|
||||
CUDA Driver Version / Runtime Version 12.4 / 10.1
|
||||
CUDA Capability Major/Minor version number: 9.0
|
||||
|
||||
Device 3: "NVIDIA H100 80GB HBM3"
|
||||
CUDA Driver Version / Runtime Version 12.4 / 10.1
|
||||
CUDA Capability Major/Minor version number: 9.0
|
||||
|
||||
Device 4: "NVIDIA H100 80GB HBM3"
|
||||
CUDA Driver Version / Runtime Version 12.4 / 10.1
|
||||
CUDA Capability Major/Minor version number: 9.0
|
||||
|
||||
Device 5: "NVIDIA H100 80GB HBM3"
|
||||
CUDA Driver Version / Runtime Version 12.4 / 10.1
|
||||
CUDA Capability Major/Minor version number: 9.0
|
||||
|
||||
Device 6: "NVIDIA H100 80GB HBM3"
|
||||
CUDA Driver Version / Runtime Version 12.4 / 10.1
|
||||
CUDA Capability Major/Minor version number: 9.0
|
||||
|
||||
Device 7: "NVIDIA H100 80GB HBM3"
|
||||
CUDA Driver Version / Runtime Version 12.4 / 10.1
|
||||
CUDA Capability Major/Minor version number: 9.0
|
||||
|
||||
```
|
||||
|
||||
|
||||
## DeepSpeed hostfile 分发
|
||||
|
||||
要根据 hostfile 手动分发 wheel 文件并进行分布式安装,需要以下几个步骤:
|
||||
|
||||
1. 准备 hostfile
|
||||
确保 hostfile 文件包含所有参与的主机及其配置。
|
||||
|
||||
示例 hostfile 内容:
|
||||
|
||||
```plaintext
|
||||
host1 slots=4
|
||||
host2 slots=4
|
||||
host3 slots=8
|
||||
```
|
||||
|
||||
2. 确保 SSH 配置正确
|
||||
确保你能够通过 SSH 无密码登录到所有主机。可以使用 ssh-keygen 和 ssh-copy-id 配置 SSH 密钥。
|
||||
|
||||
生成 SSH 密钥(如果尚未生成):
|
||||
|
||||
```shell
|
||||
ssh-keygen -t rsa
|
||||
```
|
||||
|
||||
将 SSH 公钥复制到每个主机:
|
||||
|
||||
```shell
|
||||
ssh-copy-id user@host1
|
||||
ssh-copy-id user@host2
|
||||
ssh-copy-id user@host3
|
||||
```
|
||||
|
||||
3. 创建临时目录并复制 wheel 文件
|
||||
在所有主机上创建一个临时目录,用于存放分发的 wheel 文件。
|
||||
|
||||
```shell
|
||||
export PDSH_RCMD_TYPE=ssh
|
||||
hosts=$(awk '{print $1}' /path/to/your/hostfile | paste -sd "," -)
|
||||
tmp_wheel_path="/tmp/deepspeed_wheels"
|
||||
|
||||
pdsh -w $hosts "mkdir -pv ${tmp_wheel_path}"
|
||||
pdcp -w $hosts dist/deepspeed*.whl ${tmp_wheel_path}/
|
||||
pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/
|
||||
```
|
||||
|
||||
4. 在每个主机上安装 DeepSpeed 和依赖项
|
||||
在所有主机上安装 DeepSpeed 和所需的依赖项。
|
||||
|
||||
```shell
|
||||
pdsh -w $hosts "pip install ${tmp_wheel_path}/deepspeed*.whl"
|
||||
pdsh -w $hosts "pip install -r ${tmp_wheel_path}/requirements.txt"
|
||||
```
|
||||
|
||||
5. 清理临时文件
|
||||
安装完成后,删除所有主机上的临时文件。
|
||||
|
||||
```shell
|
||||
pdsh -w $hosts "rm -rf ${tmp_wheel_path}"
|
||||
```
|
||||
|
||||
详细步骤
|
||||
确保 SSH 配置正确:
|
||||
|
||||
```shell
|
||||
ssh-keygen -t rsa
|
||||
ssh-copy-id user@host1
|
||||
ssh-copy-id user@host2
|
||||
ssh-copy-id user@host3
|
||||
```
|
||||
|
||||
创建临时目录并复制文件:
|
||||
|
||||
```shell
|
||||
export PDSH_RCMD_TYPE=ssh
|
||||
hosts=$(awk '{print $1}' /path/to/your/hostfile | paste -sd "," -)
|
||||
tmp_wheel_path="/tmp/deepspeed_wheels"
|
||||
|
||||
pdsh -w $hosts "mkdir -pv ${tmp_wheel_path}"
|
||||
pdcp -w $hosts dist/deepspeed*.whl ${tmp_wheel_path}/
|
||||
pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/
|
||||
```
|
||||
|
||||
在所有主机上安装 DeepSpeed 和依赖项:
|
||||
|
||||
```shell
|
||||
pdsh -w $hosts "pip install ${tmp_wheel_path}/deepspeed*.whl"
|
||||
pdsh -w $hosts "pip install -r ${tmp_wheel_path}/requirements.txt"
|
||||
```
|
||||
|
||||
清理临时文件:
|
||||
|
||||
```shell
|
||||
pdsh -w $hosts "rm -rf ${tmp_wheel_path}"
|
||||
```
|
||||
|
||||
通过这些步骤,你可以根据 hostfile 手动将 wheel 文件分发到多个主机,并在这些主机上安装 DeepSpeed 及其依赖项。这种方法确保了每个主机的环境配置一致,从而支持分布式训练或部署。
|
||||
Reference in New Issue
Block a user