diff --git a/finetune/Dockerfile b/finetune/Dockerfile index 567be7c..ce24d4b 100644 --- a/finetune/Dockerfile +++ b/finetune/Dockerfile @@ -376,8 +376,6 @@ cd ${STAGE_DIR}/DeepSpeed git checkout . git checkout v${DEEPSPEED_VERSION} # python setup.py bdist_wheel -# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl --force-reinstall -# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt # 修改 install.sh 脚本中的 python 解释器路径 # sed "s|\bpython\b|/opt/conda/envs/${CONDA_ENV_NAME}/bin/python|g" install.sh > install_modified.sh # chmod +x ./install_modified.sh @@ -389,6 +387,9 @@ else INSTALL_CMD="./install.sh ${DEEPSPEED_INSTALL_FLAGS}" fi eval $INSTALL_CMD +DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl --force-reinstall +DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt +pip install numpy==1.22.4 # pinned to work around: ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py); revisit this pin once the issue is fixed for numpy=2.0.0 EOT # install transformers and flash-attn diff --git a/finetune/README.md b/finetune/README.md index 50902b8..ab77345 100644 --- a/finetune/README.md +++ b/finetune/README.md @@ -58,4 +58,102 @@ Device 7: "NVIDIA H100 80GB HBM3" CUDA Driver Version / Runtime Version 12.4 / 10.1 CUDA Capability Major/Minor version number: 9.0 -``` \ No newline at end of file +``` + + +## DeepSpeed hostfile 分发 + +要按照 hostfile 中列出的主机手动分发 DeepSpeed wheel 文件并进行分布式安装,你需要以下几个步骤: + +1. 准备 hostfile +确保 hostfile 文件包含所有参与的主机及其配置。 + +示例 hostfile 内容: + +```plaintext +host1 slots=4 +host2 slots=4 +host3 slots=8 +``` + +2. 确保 SSH 配置正确 +确保你能够通过 SSH 无密码登录到所有主机。可以使用 ssh-keygen 和 ssh-copy-id 配置 SSH 密钥。 + +生成 SSH 密钥(如果尚未生成): + +```shell +ssh-keygen -t rsa +``` + +将 SSH 公钥复制到每个主机: + +```shell +ssh-copy-id user@host1 +ssh-copy-id user@host2 +ssh-copy-id user@host3 +``` + +3. 
创建临时目录并复制 wheel 文件 +在所有主机上创建一个临时目录,用于存放分发的 wheel 文件。 + +```shell +export PDSH_RCMD_TYPE=ssh +hosts=$(cat /path/to/your/hostfile | awk '{print $1}' | paste -sd ",") +tmp_wheel_path="/tmp/deepspeed_wheels" + +pdsh -w $hosts "mkdir -pv ${tmp_wheel_path}" +pdcp -w $hosts dist/deepspeed*.whl ${tmp_wheel_path}/ +pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/ +``` + +4. 在每个主机上安装 DeepSpeed 和依赖项 +在所有主机上安装 DeepSpeed 和所需的依赖项。 + +```shell +pdsh -w $hosts "pip install ${tmp_wheel_path}/deepspeed*.whl" +pdsh -w $hosts "pip install -r ${tmp_wheel_path}/requirements.txt" +``` + +5. 清理临时文件 +安装完成后,删除所有主机上的临时文件。 + +```shell +pdsh -w $hosts "rm -rf ${tmp_wheel_path}" +``` + +详细步骤(以下为上述步骤 2–5 的完整命令汇总) +确保 SSH 配置正确: + +```shell +ssh-keygen -t rsa +ssh-copy-id user@host1 +ssh-copy-id user@host2 +ssh-copy-id user@host3 +``` + +创建临时目录并复制文件: + +```shell +export PDSH_RCMD_TYPE=ssh +hosts=$(cat /path/to/your/hostfile | awk '{print $1}' | paste -sd ",") +tmp_wheel_path="/tmp/deepspeed_wheels" + +pdsh -w $hosts "mkdir -pv ${tmp_wheel_path}" +pdcp -w $hosts dist/deepspeed*.whl ${tmp_wheel_path}/ +pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/ +``` + +在所有主机上安装 DeepSpeed 和依赖项: + +```shell +pdsh -w $hosts "pip install ${tmp_wheel_path}/deepspeed*.whl" +pdsh -w $hosts "pip install -r ${tmp_wheel_path}/requirements.txt" +``` + +清理临时文件: + +```shell +pdsh -w $hosts "rm -rf ${tmp_wheel_path}" +``` + +通过这些步骤,你可以按照 hostfile 中列出的主机手动分发 wheel 文件和 requirements.txt,并在多个主机上安装 DeepSpeed 及其依赖项。这种方法有助于保持每个主机的环境配置一致,从而支持分布式训练或部署。 \ No newline at end of file diff --git a/finetune/docker-compose_pytorch2.3.yml b/finetune/docker-compose_pytorch2.3.yml index 754bf15..cdcd486 100644 --- a/finetune/docker-compose_pytorch2.3.yml +++ b/finetune/docker-compose_pytorch2.3.yml @@ -35,6 +35,7 @@ services: CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90" SETUPTOOLS_VERSION: "69.5.1" DCUTLASS_NVCC_ARCHS: "80;86;89;90;90a" # 90a for H100 GPU 89:GeForce RTX 4090 + DEEPSPEED_VERSION: "0.14.3" DEEPSPEED_INSTALL_FLAGS: "--allow_sudo 
--pip_sudo --verbose" volumes: - ./src:/bbtft