This commit is contained in:
2024-07-16 13:20:45 +08:00
parent 7d32986bdf
commit 383a016d4e

View File

@@ -5,8 +5,62 @@ docker-compose -f docker-compose_pytorch1.13.yml build
docker-compose -f docker-compose_pytorch2.3.yml build
```
## 英伟达显卡安装卸载驱动
卸载
```shell
cd /usr/local/cuda
ll
cd ..
cd cuda-12.3/
ll
cd bin/
ll
./cuda-uninstaller
cd ~
nvidia-uninstall
sudo modprobe -r nvidia-drm nvidia-modeset nvidia-uvm nvidia
sudo rm -rf /usr/lib64/nvidia /usr/lib/nvidia
sudo apt autoremove 'nvidia*'
sudo apt clean
sudo update-initramfs -u  # Ubuntu/Debian 重建 initramfs；若系统使用 dracut（如 RHEL 系），改用: sudo dracut --force
sudo reboot
```
安装
```shell
wget https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64/nvidia-fabricmanager-555_555.42.06-1_amd64.deb
dpkg -i nvidia-fabricmanager-555_555.42.06-1_amd64.deb
wget https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/cuda_12.5.1_555.42.06_linux.run
ll
sudo sh cuda_12.5.1_555.42.06_linux.run
echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc && echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc && source ~/.bashrc
nvcc -V
nvidia-smi
nvidia-smi -pm 1
modprobe nvidia_peermem
nvidia-smi
modinfo nvidia_peermem
lsmod | grep nvidia_peermem
systemctl mask apt-daily-upgrade.service
systemctl mask apt-daily-upgrade.timer
systemctl disable apt-daily-upgrade.timer
systemctl disable apt-daily-upgrade.service
ll
wget https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64/nvidia-fabricmanager-555_555.42.06-1_amd64.deb
dpkg -i nvidia-fabricmanager-555_555.42.06-1_amd64.deb
sudo systemctl start nvidia-fabricmanager
sudo systemctl status nvidia-fabricmanager
```
## 镜像测试命令
```shell
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all ldh/deepspeed:test
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/deepspeed:pt23_update
```
```shell
nvidia-smi
nvcc -V
@@ -58,6 +112,31 @@ EOF
python compile_deepspeed_ops.py
```
## 配置 VS Code 的 Docker 插件
[nerdctl配置](https://blog.csdn.net/margu_168/article/details/139822555)
```shell
cat << 'EOF' > /usr/local/bin/docker
#!/bin/bash
exec nerdctl "$@"
EOF
chmod +x /usr/local/bin/docker
```
nerdctl bash自动补全
```shell
apt update
apt install bash-completion -y
nerdctl completion bash > /etc/bash_completion.d/nerdctl
nerdctl completion bash > /etc/bash_completion.d/docker
source /etc/bash_completion.d/nerdctl
source /etc/bash_completion.d/docker
```
## 物理机更新内核
```shell