#!/bin/bash
#
# Installs the NVIDIA Container Toolkit from NVIDIA's apt repository and
# configures it for a rootless containerd owned by the invoking user.
#
# Usage: run as the unprivileged user that owns the rootless containerd.
# Steps that need root are executed through sudo.

# Make a pipeline fail when any stage fails, so a broken download is
# noticed instead of being masked by the exit status of the last stage.
set -o pipefail

# User running this script; used later when handing over ownership.
CURRENT_USER=$(whoami)
echo "当前用户:$CURRENT_USER"

#######################################
# Install the NVIDIA Container Toolkit, adding NVIDIA's apt source first
# when no configured source mentions it yet.
# Globals:   none
# Arguments: none
# Outputs:   progress messages on stdout, errors on stderr
# Returns:   0 on success, non-zero if a prerequisite, the repository
#            setup or the toolkit package could not be installed
#######################################
install_nvidia_docker() {
  local keyring=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
  local repo_list=/etc/apt/sources.list.d/nvidia-container-toolkit.list
  local repo_base=https://nvidia.github.io/libnvidia-container

  echo "正在安装 NVIDIA Docker..."
  sudo apt-get update
  sudo apt-get install -y curl gnupg lsb-release || return

  # Configure the NVIDIA apt source. -s keeps grep quiet when
  # /etc/apt/sources.list or the sources.list.d glob does not exist.
  if ! grep -qs "^deb .\+nvidia-container-toolkit" \
    /etc/apt/sources.list /etc/apt/sources.list.d/*; then
    # --yes lets gpg overwrite a keyring left behind by an earlier, partial
    # run. -f on both downloads stops an HTTP error page from being written
    # into the keyring or the apt source list.
    curl -fsSL "${repo_base}/gpgkey" \
      | sudo gpg --yes --dearmor -o "$keyring" \
      && curl -fsSL "${repo_base}/stable/deb/nvidia-container-toolkit.list" \
      | sed "s#deb https://#deb [signed-by=${keyring}] https://#g" \
      | sudo tee "$repo_list" \
      || {
        echo "错误:配置 NVIDIA Docker 源失败。" >&2
        return 1
      }
  else
    echo "NVIDIA Docker 源已经配置,跳过此步骤。"
  fi

  sudo apt-get update
  sudo apt-get install -y nvidia-container-toolkit
}

#######################################
# Point the rootless containerd configuration at the NVIDIA runtime and
# make the driver libraries and CLI reachable.
# Globals:   HOME (read), PATH (read, possibly extended)
# Arguments: none
# Outputs:   progress messages on stdout
#######################################
configure_nvidia_ctk() {
  local containerd_dir="$HOME/.config/containerd"
  local runtime_config=/etc/nvidia-container-runtime/config.toml

  echo "正在配置 NVIDIA Container Toolkit..."

  # Rootless containerd reads its configuration from the user's home.
  mkdir -p "$containerd_dir"
  nvidia-ctk runtime configure --runtime=containerd \
    --config="${containerd_dir}/config.toml"

  # Only touch the system-wide runtime configuration when it is missing.
  # NOTE(review): "default-runtime" does not look like a documented
  # nvidia-container-runtime key, and without --in-place this command may
  # only print the result instead of creating the file. NVIDIA's rootless
  # instructions use "--set nvidia-container-cli.no-cgroups --in-place".
  # Confirm the intent before changing it.
  if [[ ! -f "$runtime_config" ]]; then
    sudo nvidia-ctk config --set default-runtime --config="$runtime_config"
  fi

  # Register the library directory with the dynamic linker so the GPU
  # driver libraries are found (path is specific to x86_64 Debian/Ubuntu).
  echo "/usr/lib/x86_64-linux-gnu" | sudo tee /etc/ld.so.conf.d/nvidia.conf
  sudo ldconfig

  # Make sure /usr/bin (where nvidia-container-cli lives) is on PATH.
  # Match whole PATH entries: a substring test would also accept entries
  # such as /opt/usr/bin.
  case ":${PATH}:" in
    *:/usr/bin:* | *:/usr/bin/:*) ;;
    *)
      # Single quotes are deliberate: $PATH must be expanded at login time.
      # shellcheck disable=SC2016
      echo 'export PATH=$PATH:/usr/bin' >> ~/.profile
      # Update this process directly instead of sourcing the whole profile,
      # which could run unrelated login-time commands inside the script.
      export PATH="${PATH}:/usr/bin"
      ;;
  esac
}

#######################################
# Open up the cgroup v2 hierarchy and the CNI configuration directory to
# the invoking user.
# Globals:   CURRENT_USER (read)
# Arguments: none
# Outputs:   progress message on stdout
#######################################
configure_cgroup_v2() {
  echo "配置 cgroup v2 支持..."

  # NOTE(review): recursively changing mode and owner of /sys/fs/cgroup is
  # heavy-handed and does not survive a reboot. Rootless setups normally
  # rely on systemd controller delegation (Delegate= on user@.service).
  # Kept as-is to preserve behavior; confirm whether it is still needed.
  sudo chmod -R 755 /sys/fs/cgroup
  sudo chown -R "$CURRENT_USER" /sys/fs/cgroup

  # Create /etc/cni/tuning/allowlist.conf and hand /etc/cni to the user.
  sudo mkdir -p /etc/cni/tuning
  sudo touch /etc/cni/tuning/allowlist.conf
  sudo chmod 644 /etc/cni/tuning/allowlist.conf
  sudo chown -R "$CURRENT_USER" /etc/cni
}

#######################################
# Restart containerd, both the per-user (rootless) unit and the system
# unit. Either one may be absent on a given host, so failures are not
# treated as fatal here.
# Arguments: none
# Outputs:   progress message on stdout
#######################################
restart_containerd() {
  echo "重启 containerd 服务..."
  systemctl --user daemon-reload
  systemctl --user restart containerd
  sudo systemctl daemon-reload
  sudo systemctl restart containerd
}

#######################################
# Run every step in order. Aborts when the toolkit cannot be installed,
# because the remaining steps depend on the nvidia-ctk binary.
#######################################
main() {
  if ! install_nvidia_docker; then
    echo "错误:NVIDIA Container Toolkit 安装失败,已中止。" >&2
    exit 1
  fi
  configure_nvidia_ctk
  configure_cgroup_v2
  restart_containerd

  echo "所有步骤已完成,NVIDIA Docker 和 containerd 配置已更新。"
}

main "$@"