diff --git a/finetune/README.md b/finetune/README.md index d5dfb35..c83161f 100644 --- a/finetune/README.md +++ b/finetune/README.md @@ -5,8 +5,62 @@ docker-compose -f docker-compose_pytorch1.13.yml build docker-compose -f docker-compose_pytorch2.3.yml build ``` +## 英伟达显卡驱动的卸载与安装 + +卸载 + +```shell +cd /usr/local/cuda +ll +cd .. +cd cuda-12.3/ +ll +cd bin/ +ll +./cuda-uninstaller +cd ~ +nvidia-uninstall +sudo modprobe -r nvidia-drm nvidia-modeset nvidia-uvm nvidia +sudo rm -rf /usr/lib64/nvidia /usr/lib/nvidia +sudo apt autoremove nvidia* +sudo apt clean all +sudo dracut --force +sudo reboot +``` + +安装 + +```shell +wget https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64/nvidia-fabricmanager-555_555.42.06-1_amd64.deb +dpkg -i nvidia-fabricmanager-555_555.42.06-1_amd64.deb +wget https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/cuda_12.5.1_555.42.06_linux.run +ll +sudo sh cuda_12.5.1_555.42.06_linux.run +echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc && echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc && source /root/.bashrc +nvcc -V +nvidia-smi +nvidia-smi -pm 1 +modprobe nvidia_peermem +nvidia-smi +modinfo nvidia_peermem +lsmod | grep nvidia_peermem +systemctl mask apt-daily-upgrade.service +systemctl mask apt-daily-upgrade.timer +systemctl disable apt-daily-upgrade.timer +systemctl disable apt-daily-upgrade.service +ll +wget https://developer.download.nvidia.cn/compute/cuda/repos/ubuntu2204/x86_64/nvidia-fabricmanager-555_555.42.06-1_amd64.deb +dpkg -i nvidia-fabricmanager-555_555.42.06-1_amd64.deb +sudo systemctl start nvidia-fabricmanager +sudo systemctl status nvidia-fabricmanager +``` + ## 镜像测试命令 +docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all ldh/deepspeed:test +docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/deepspeed:pt23_update + + ```shell nvidia-smi nvcc -V @@ -58,6 +112,31 @@ 
EOF python compile_deepspeed_ops.py ``` +## 配置 VS Code 的 Docker 插件 + +[nerdctl配置](https://blog.csdn.net/margu_168/article/details/139822555) + + + +```shell +cat << 'EOF' > /usr/local/bin/docker +#!/bin/bash +exec nerdctl "$@" +EOF +chmod +x /usr/local/bin/docker +``` + +nerdctl 的 bash 自动补全 + +```shell +apt update +apt install bash-completion -y +nerdctl completion bash > /etc/bash_completion.d/nerdctl +nerdctl completion bash > /etc/bash_completion.d/docker +source /etc/bash_completion.d/nerdctl +source /etc/bash_completion.d/docker +``` + ## 物理机更新内核 ```shell