From 067e1bdafed80715e019820ae3ae1f34e842dd0a Mon Sep 17 00:00:00 2001
From: hotwa
Date: Wed, 12 Jun 2024 16:53:36 +0800
Subject: [PATCH] update

---
 README.md                                     | 101 ++--------
 finetune/Dockerfile                           | 163 ++++++++++++++++
 finetune/accelerate-gpu-deepspeed.Dockerfile  |  46 +++++
 finetune/deepspeed.Dockerfile                 | 184 ++++++++++++++++++
 finetune/peft-gpu-bnb-multi-source.Dockerfile |  68 +++++++
 finetune/transformer.Dockerfile               |  70 +++++++
 ldb_docker_extend.Dockerfile                  |   9 +
 ldh_docker                                    | 108 ++++++++++
 8 files changed, 660 insertions(+), 89 deletions(-)
 create mode 100644 finetune/Dockerfile
 create mode 100644 finetune/accelerate-gpu-deepspeed.Dockerfile
 create mode 100644 finetune/deepspeed.Dockerfile
 create mode 100644 finetune/peft-gpu-bnb-multi-source.Dockerfile
 create mode 100644 finetune/transformer.Dockerfile
 create mode 100644 ldb_docker_extend.Dockerfile
 create mode 100644 ldh_docker

diff --git a/README.md b/README.md
index b990584..eab1248 100644
--- a/README.md
+++ b/README.md
@@ -1,93 +1,16 @@
-# cdc_dockerfile
+## Pretraining
+
-## Getting started
+GitHub - huggingface/transformers: 🤗 Transformers: State-of-the-art Machine Learning for Pytorch, TensorFlow, and JAX.
+GitHub - microsoft/DeepSpeed: DeepSpeed is a deep learning optimization library that makes distributed training and inference easy, efficient, and effective.
+GitHub - huggingface/peft: 🤗 PEFT: State-of-the-art Parameter-Efficient Fine-Tuning.
+GitHub - huggingface/accelerate: 🚀 A simple way to launch, train, and use PyTorch models on almost any device and distributed configuration, automatic mixed precision (including fp8), and easy-to-configure FSDP and DeepSpeed support
-
-To make it easy for you to get started with GitLab, here's a list of recommended next steps.
-
-Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)!
-
-## Add your files
-
-- [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files
-- [ ] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command:
-
-```
-cd existing_repo
-git remote add origin http://gitlab.dockless.eu.org/lingyuzeng/cdc_dockerfile.git
-git branch -M main
-git push -uf origin main
-```
-
-## Integrate with your tools
-
-- [ ] [Set up project integrations](http://gitlab.dockless.eu.org/lingyuzeng/cdc_dockerfile/-/settings/integrations)
-
-## Collaborate with your team
-
-- [ ] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/)
-- [ ] [Create a new merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html)
-- [ ] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically)
-- [ ] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/)
-- [ ] [Set auto-merge](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html)
-
-## Test and Deploy
-
-Use the built-in continuous integration in GitLab.
-
-- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/index.html)
-- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing (SAST)](https://docs.gitlab.com/ee/user/application_security/sast/)
-- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html)
-- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/)
-- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html)
-
-***
-
-# Editing this README
-
-When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thanks to [makeareadme.com](https://www.makeareadme.com/) for this template.
-
-## Suggestions for a good README
-
-Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information.
-
-## Name
-Choose a self-explaining name for your project.
-
-## Description
-Let people know what your project can do specifically. Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. If there are alternatives to your project, this is a good place to list differentiating factors.
-
-## Badges
-On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge.
-
-## Visuals
-Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method.
-
-## Installation
-Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection.
-
-## Usage
-Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README.
-
-## Support
-Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc.
-
-## Roadmap
-If you have ideas for releases in the future, it is a good idea to list them in the README.
-
-## Contributing
-State if you are open to contributions and what your requirements are for accepting them.
-
-For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self.
-
-You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser.
-
-## Authors and acknowledgment
-Show your appreciation to those who have contributed to the project.
-
-## License
-For open source projects, say how it is licensed.
-
-## Project status
-If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers.
+
+```shell
+# torch
+https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
+https://github.com/huggingface/transformers/blob/main/docker/transformers-all-latest-gpu/Dockerfile
+https://github.com/huggingface/peft/tree/main/docker/peft-gpu-bnb-source
+https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
+```
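+
+A minimal, untested build-and-run sketch (assumes Docker with the NVIDIA container toolkit installed; the image tag is arbitrary):
+
+```shell
+docker build -f finetune/deepspeed.Dockerfile -t ds-finetune .
+docker run --gpus all -it --rm ds-finetune bash
+```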
diff --git a/finetune/Dockerfile b/finetune/Dockerfile
new file mode 100644
index 0000000..532bdc1
--- /dev/null
+++ b/finetune/Dockerfile
@@ -0,0 +1,163 @@
+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
+
+ARG DEBIAN_FRONTEND="noninteractive"
+ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
+# ENV does not expand `~`, so use an absolute path (home is /root)
+ENV MAMBA_ROOT_PREFIX=/root/micromamba
+WORKDIR /root
+SHELL ["/bin/bash", "-c"]
+
+# base tools: sshd plus login config, root password, and a pip mirror
+RUN <<EOT
+apt-get update
+apt-get install -y --no-install-recommends openssh-server sudo curl wget git
+echo "PermitRootLogin yes" >> /etc/ssh/sshd_config
+echo "PasswordAuthentication yes" >> /etc/ssh/sshd_config
+echo "PubkeyAuthentication yes" >> /etc/ssh/sshd_config
+echo "Port 22" >> /etc/ssh/sshd_config
+mkdir /var/run/sshd
+echo 'root:root' | chpasswd
+mkdir -p ~/.pip
+echo "
+[global]
+index-url = https://mirrors.aliyun.com/pypi/simple/
+
+[install]
+trusted-host=mirrors.aliyun.com
+" >> ~/.pip/pip.conf
+EOT
+
+# deepspeed
+ENV STAGE_DIR=/tmp
+
+# Keep SSH client alive from server side and uncomment Port 22 for sshd
+RUN <<EOT
+echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config
+cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config
+sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
+EOT
+
+# Mellanox OFED
+ENV MLNX_OFED_VERSION=4.9-7.1.0.0
+RUN apt-get install -y libnuma-dev
+RUN cd ${STAGE_DIR} && \
+    wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - && \
+    cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \
+    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
+    cd ${STAGE_DIR} && \
+    rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
+# nv_peer_mem
+ENV NV_PEER_MEM_VERSION=1.2
+ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
+RUN mkdir -p ${STAGE_DIR} && \
+    git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
+    cd ${STAGE_DIR}/nv_peer_memory && \
+    ./build_module.sh && \
+    cd ${STAGE_DIR} && \
+    tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
+    cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
+    apt-get update && \
+    apt-get install -y dkms && \
+    dpkg-buildpackage -us -uc && \
+    dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
+# OPENMPI
+ENV OPENMPI_BASEVERSION=4.1
+ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
+RUN cd ${STAGE_DIR} && \
+    wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
+    cd openmpi-${OPENMPI_VERSION} && \
+    ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
+    make -j"$(nproc)" install && \
+    ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
+    # Sanity check:
+    test -f /usr/local/mpi/bin/mpic++ && \
+    cd ${STAGE_DIR} && \
+    rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
+ENV PATH=/usr/local/mpi/bin:${PATH} \
+    LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
+    echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
+    echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
+    chmod a+x /usr/local/mpi/bin/mpirun
+# Python
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHON_VERSION=3
+RUN apt-get install -y python3 python3-dev && \
+    rm -f /usr/bin/python && \
+    ln -s /usr/bin/python3 /usr/bin/python && \
+    curl -O https://bootstrap.pypa.io/pip/3.6/get-pip.py && \
+    python get-pip.py && \
+    rm get-pip.py && \
+    pip install --upgrade pip && \
+    # Print python and pip version
+    python -V && pip -V
+RUN pip install pyyaml
+RUN pip install ipython
+# Some Packages
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        libsndfile-dev \
+        libcupti-dev \
+        libjpeg-dev \
+        libpng-dev \
+        screen \
+        libaio-dev
+RUN pip install psutil \
+    yappi \
+    cffi \
+    ipdb \
+    pandas \
+    matplotlib \
+    py3nvml \
+    pyarrow \
+    graphviz \
+    astor \
+    boto3 \
+    tqdm \
+    sentencepiece \
+    msgpack \
+    requests \
+    sphinx \
+    sphinx_rtd_theme \
+    scipy \
+    numpy \
+    scikit-learn \
+    nvidia-ml-py3 \
+    mpi4py
+# PyTorch
+ARG PYTORCH_VERSION=1.13.0
+ENV PYTORCH_VERSION=${PYTORCH_VERSION}
+RUN pip install torch==${PYTORCH_VERSION}
+RUN rm -rf /usr/lib/python3/dist-packages/yaml && \
+    rm -rf /usr/lib/python3/dist-packages/PyYAML-*
+## Add deepspeed user
+# Add a deepspeed user with user id 8877
+#RUN useradd --create-home --uid 8877 deepspeed
+RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed
+RUN usermod -aG sudo deepspeed
+RUN echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+# Change to non-root privilege
+USER deepspeed
+# DeepSpeed
+RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
+RUN cd ${STAGE_DIR}/DeepSpeed && \
+    git checkout . && \
+    git checkout master && \
+    ./install.sh --pip_sudo
+RUN rm -rf ${STAGE_DIR}/DeepSpeed
+RUN python -c "import deepspeed; print(deepspeed.__version__)"
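+
+# Usage sketch (comments only, nothing below runs at build time). Assumes the
+# image was tagged `finetune-base`. sshd was configured above (root:root,
+# port 22), and the image ends as USER deepspeed, so start sshd as root:
+#   docker build -f finetune/Dockerfile -t finetune-base .
+#   docker run --gpus all -d -p 2222:22 --user root finetune-base /usr/sbin/sshd -D
+#   ssh -p 2222 root@localhost   # password "root", set above
+# For multi-node DeepSpeed, a hostfile lists each reachable node, e.g.:
+#   worker-1 slots=8
+#   worker-2 slots=8
+# then: deepspeed --hostfile=hostfile train.py   # train.py is hypothetical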
diff --git a/finetune/accelerate-gpu-deepspeed.Dockerfile b/finetune/accelerate-gpu-deepspeed.Dockerfile
new file mode 100644
index 0000000..d35fc1b
--- /dev/null
+++ b/finetune/accelerate-gpu-deepspeed.Dockerfile
@@ -0,0 +1,46 @@
+# Builds GPU docker image of PyTorch specifically
+# Uses multi-staged approach to reduce size
+# Stage 1
+# Use base conda image to reduce time
+FROM continuumio/miniconda3:latest AS compile-image
+# Specify py version
+# Note: DeepSpeed beyond v0.12.6 requires py 3.10
+ENV PYTHON_VERSION=3.10
+# Install apt libs
+RUN apt-get update && \
+    apt-get install -y curl git wget && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists*
+
+# Create our conda env
+RUN conda create --name accelerate python=${PYTHON_VERSION} ipython jupyter pip
+# We don't install pytorch here yet since CUDA isn't available
+# instead we use the direct torch wheel
+ENV PATH /opt/conda/envs/accelerate/bin:$PATH
+# Activate our bash shell
+RUN chsh -s /bin/bash
+SHELL ["/bin/bash", "-c"]
+# Activate the conda env, install mpi4py, and install torch + accelerate
+RUN source activate accelerate && conda install -c conda-forge mpi4py
+RUN source activate accelerate && \
+    python3 -m pip install --no-cache-dir \
+    git+https://github.com/huggingface/accelerate#egg=accelerate[testing,test_trackers,deepspeed] \
+    --extra-index-url https://download.pytorch.org/whl/cu117
+
+RUN python3 -m pip install --no-cache-dir bitsandbytes
+
+# Stage 2
+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 AS build-image
+COPY --from=compile-image /opt/conda /opt/conda
+ENV PATH /opt/conda/bin:$PATH
+
+# Install apt libs
+RUN apt-get update && \
+    apt-get install -y curl git wget && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists*
+
+RUN echo "source activate accelerate" >> ~/.profile
+
+# Activate the virtualenv
+CMD ["/bin/bash"]
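+
+# Usage sketch (comments only). Assumes the image was tagged `accelerate-ds`
+# and a training script train.py (hypothetical) is mounted at /work; use a
+# login shell so the `accelerate` env activates via ~/.profile:
+#   docker run --gpus all -it --rm -v "$PWD":/work -w /work accelerate-ds bash -l
+#   accelerate launch --use_deepspeed --num_processes 2 train.py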
diff --git a/finetune/deepspeed.Dockerfile b/finetune/deepspeed.Dockerfile
new file mode 100644
index 0000000..fecb0c7
--- /dev/null
+++ b/finetune/deepspeed.Dockerfile
@@ -0,0 +1,184 @@
+FROM nvidia/cuda:12.2.2-devel-ubuntu20.04
+
+ENV DEBIAN_FRONTEND noninteractive
+
+##############################################################################
+# Temporary Installation Directory
+##############################################################################
+ENV STAGE_DIR=/tmp
+RUN mkdir -p ${STAGE_DIR}
+
+##############################################################################
+# Installation/Basic Utilities
+##############################################################################
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        software-properties-common build-essential autotools-dev \
+        nfs-common pdsh \
+        cmake g++ gcc \
+        curl wget vim tmux emacs less unzip \
+        htop iftop iotop ca-certificates openssh-client openssh-server \
+        rsync iputils-ping net-tools sudo \
+        llvm-dev
+
+##############################################################################
+# Installation Latest Git
+##############################################################################
+RUN add-apt-repository ppa:git-core/ppa -y && \
+    apt-get update && \
+    apt-get install -y git && \
+    git --version
+
+##############################################################################
+# Client Liveness & Uncomment Port 22 for SSH Daemon
+##############################################################################
+# Keep SSH client alive from server side
+RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config
+RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \
+    sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
+
+##############################################################################
+# Mellanox OFED
+##############################################################################
+ENV MLNX_OFED_VERSION=4.9-7.1.0.0
+RUN apt-get install -y libnuma-dev
+RUN cd ${STAGE_DIR} && \
+    wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - && \
+    cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \
+    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
+    cd ${STAGE_DIR} && \
+    rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
+
+##############################################################################
+# nv_peer_mem
+##############################################################################
+ENV NV_PEER_MEM_VERSION=1.2
+ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
+RUN mkdir -p ${STAGE_DIR} && \
+    git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
+    cd ${STAGE_DIR}/nv_peer_memory && \
+    ./build_module.sh && \
+    cd ${STAGE_DIR} && \
+    tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
+    cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
+    apt-get update && \
+    apt-get install -y dkms && \
+    dpkg-buildpackage -us -uc && \
+    dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
+
+##############################################################################
+# OPENMPI
+##############################################################################
+ENV OPENMPI_BASEVERSION=4.1
+ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
+RUN cd ${STAGE_DIR} && \
+    wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
+    cd openmpi-${OPENMPI_VERSION} && \
+    ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
+    make -j"$(nproc)" install && \
+    ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
+    # Sanity check:
+    test -f /usr/local/mpi/bin/mpic++ && \
+    cd ${STAGE_DIR} && \
+    rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
+ENV PATH=/usr/local/mpi/bin:${PATH} \
+    LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
+    echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
+    echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
+    chmod a+x /usr/local/mpi/bin/mpirun
+
+##############################################################################
+# Python
+##############################################################################
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHON_VERSION=3
+RUN apt-get install -y python3 python3-dev && \
+    rm -f /usr/bin/python && \
+    ln -s /usr/bin/python3 /usr/bin/python && \
+    curl -O https://bootstrap.pypa.io/pip/3.6/get-pip.py && \
+    python get-pip.py && \
+    rm get-pip.py && \
+    pip install --upgrade pip && \
+    # Print python and pip version
+    python -V && pip -V
+RUN pip install pyyaml
+RUN pip install ipython
+
+##############################################################################
+# Some Packages
+##############################################################################
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        libsndfile-dev \
+        libcupti-dev \
+        libjpeg-dev \
+        libpng-dev \
+        screen \
+        libaio-dev
+RUN pip install psutil \
+    yappi \
+    cffi \
+    ipdb \
+    pandas \
+    matplotlib \
+    py3nvml \
+    pyarrow \
+    graphviz \
+    astor \
+    boto3 \
+    tqdm \
+    sentencepiece \
+    msgpack \
+    requests \
+    sphinx \
+    sphinx_rtd_theme \
+    scipy \
+    numpy \
+    scikit-learn \
+    nvidia-ml-py3 \
+    mpi4py
+
+##############################################################################
+# SSH daemon port inside container cannot conflict with host OS port
+##############################################################################
+ENV SSH_PORT=2222
+RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
+    sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
+
+##############################################################################
+# PyTorch
+##############################################################################
+ENV PYTORCH_VERSION=1.13.0
+RUN pip install torch==${PYTORCH_VERSION}
+
+##############################################################################
+# PyYAML build issue
+# https://stackoverflow.com/a/53926898
+##############################################################################
+RUN rm -rf /usr/lib/python3/dist-packages/yaml && \
+    rm -rf /usr/lib/python3/dist-packages/PyYAML-*
+
+##############################################################################
+# Add deepspeed user
+##############################################################################
+# Add a deepspeed user with user id 8877
+#RUN useradd --create-home --uid 8877 deepspeed
+RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed
+RUN usermod -aG sudo deepspeed
+RUN echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+# Change to non-root privilege
+USER deepspeed
+
+##############################################################################
+# DeepSpeed
+##############################################################################
+RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
+RUN cd ${STAGE_DIR}/DeepSpeed && \
+    git checkout . && \
+    git checkout master && \
+    ./install.sh --pip_sudo
+RUN rm -rf ${STAGE_DIR}/DeepSpeed
+RUN python -c "import deepspeed; print(deepspeed.__version__)"
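+
+# Usage sketch (comments only, nothing here runs at build time). sshd in this
+# image listens on SSH_PORT=2222 (set above) so it will not collide with the
+# host sshd under host networking; no passwords are set in this image, so
+# key-based SSH between nodes is assumed:
+#   docker build -f finetune/deepspeed.Dockerfile -t deepspeed-dev .
+#   docker run --gpus all -d --network host --user root deepspeed-dev /usr/sbin/sshd -D
+#   ssh -p 2222 deepspeed@<node-ip>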
diff --git a/finetune/peft-gpu-bnb-multi-source.Dockerfile b/finetune/peft-gpu-bnb-multi-source.Dockerfile
new file mode 100644
index 0000000..2c839c4
--- /dev/null
+++ b/finetune/peft-gpu-bnb-multi-source.Dockerfile
@@ -0,0 +1,68 @@
+# Builds GPU docker image of PyTorch
+# Uses multi-staged approach to reduce size
+# Stage 1
+# Use base conda image to reduce time
+FROM continuumio/miniconda3:latest AS compile-image
+# Specify py version
+ENV PYTHON_VERSION=3.8
+# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
+RUN apt-get update && \
+    apt-get install -y curl git wget software-properties-common git-lfs && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists*
+
+# Install audio-related libraries
+RUN apt-get update && \
+    apt install -y ffmpeg
+
+RUN apt install -y libsndfile1-dev
+RUN git lfs install
+
+# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
+RUN conda create --name peft python=${PYTHON_VERSION} ipython jupyter pip
+RUN python3 -m pip install --no-cache-dir --upgrade pip
+
+# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
+# We don't install pytorch here yet since CUDA isn't available
+# instead we use the direct torch wheel
+ENV PATH /opt/conda/envs/peft/bin:$PATH
+# Activate our bash shell
+RUN chsh -s /bin/bash
+SHELL ["/bin/bash", "-c"]
+
+# Stage 2
+FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS build-image
+COPY --from=compile-image /opt/conda /opt/conda
+ENV PATH /opt/conda/bin:$PATH
+
+RUN chsh -s /bin/bash
+SHELL ["/bin/bash", "-c"]
+
+# Install apt libs
+RUN apt-get update && \
+    apt-get install -y curl git wget cmake && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists*
+
+# Activate the conda env and install transformers + accelerate from source
+# Also clone BNB and build it from source.
+RUN source activate peft && \
+    python3 -m pip install -U --no-cache-dir \
+    librosa \
+    "soundfile>=0.12.1" \
+    scipy \
+    git+https://github.com/huggingface/transformers \
+    git+https://github.com/huggingface/accelerate \
+    peft[test]@git+https://github.com/huggingface/peft \
+    optimum \
+    auto-gptq && \
+    git clone https://github.com/TimDettmers/bitsandbytes && cd bitsandbytes && git checkout multi-backend-refactor && \
+    cmake -B . -DCOMPUTE_BACKEND=cuda -S . && \
+    cmake --build . && \
+    pip install -e . && \
+    pip freeze | grep bitsandbytes
+
+RUN echo "source activate peft" >> ~/.profile
+
+# Activate the virtualenv
+CMD ["/bin/bash"]
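+
+# Usage sketch (comments only). A quick way to check the source-built
+# bitsandbytes backend inside the image — recent bitsandbytes versions ship a
+# diagnostic entrypoint; the tag `peft-bnb` is arbitrary:
+#   docker build -f finetune/peft-gpu-bnb-multi-source.Dockerfile -t peft-bnb .
+#   docker run --gpus all --rm peft-bnb python -m bitsandbytes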
diff --git a/finetune/transformer.Dockerfile b/finetune/transformer.Dockerfile
new file mode 100644
index 0000000..e38170e
--- /dev/null
+++ b/finetune/transformer.Dockerfile
@@ -0,0 +1,70 @@
+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
+LABEL maintainer="Hugging Face"
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Use login shell to read variables from `~/.profile` (to pass dynamically created variables between RUN commands)
+SHELL ["sh", "-lc"]
+
+# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
+# to be used as arguments for docker build (so far).
+
+ARG PYTORCH='2.3.0'
+# (not always a valid torch version)
+ARG INTEL_TORCH_EXT='2.3.0'
+# Example: `cu102`, `cu113`, etc.
+ARG CUDA='cu121'
+
+RUN apt update
+RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
+RUN git lfs install
+RUN python3 -m pip install --no-cache-dir --upgrade pip
+
+ARG REF=main
+RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
+
+# 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
+# 2. Regarding `torch`, we might need to specify proper versions for `torchvision` and `torchaudio`.
+#    Currently, let's not bother to specify their versions explicitly (so they are installed with their latest release versions).
+RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 tensorflow_text tensorflow_probability && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
+
+RUN python3 -m pip uninstall -y flax jax
+
+RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT -f https://developer.intel.com/ipex-whl-stable-cpu
+
+RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
+RUN python3 -m pip install -U "itsdangerous<2.1.0"
+
+RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
+
+RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft
+
+# For bettertransformer
+RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum
+
+# For video model testing
+RUN python3 -m pip install --no-cache-dir decord av==9.2.0
+
+# Some slow tests require bnb
+RUN python3 -m pip install --no-cache-dir bitsandbytes
+
+# Some tests require quanto
+RUN python3 -m pip install --no-cache-dir quanto
+
+# `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests
+# (`deformable_detr`, `rwkv`, `mra`)
+RUN python3 -m pip uninstall -y ninja
+
+# For `dinat` model
+# The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent)
+RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f https://shi-labs.com/natten/wheels
+
+# For `nougat` tokenizer
+RUN python3 -m pip install --no-cache-dir python-Levenshtein
+
+# For `FastSpeech2ConformerTokenizer` tokenizer
+RUN python3 -m pip install --no-cache-dir g2p-en
+
+# When installing in editable mode, `transformers` is not recognized as a package.
+# This line must be added in order for python to be aware of transformers.
+RUN cd transformers && python3 setup.py develop
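+
+# Usage sketch (comments only). REF (declared above) selects the transformers
+# commit/tag to build against; the tag below is just an example:
+#   docker build --build-arg REF=v4.41.2 -f finetune/transformer.Dockerfile -t transformers-gpu .
+#   docker run --gpus all -it --rm transformers-gpu python3 -c "import transformers; print(transformers.__version__)"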
diff --git a/ldb_docker_extend.Dockerfile b/ldb_docker_extend.Dockerfile
new file mode 100644
index 0000000..b7a42ce
--- /dev/null
+++ b/ldb_docker_extend.Dockerfile
@@ -0,0 +1,9 @@
+FROM hotwa/bbt:latest
+
+# Make sure the SSH service can start
+RUN echo 'root:root' | chpasswd && \
+    sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
+    sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
+
+# Start the SSH service
+CMD ["/bin/bash", "-c", "service ssh start; tail -f /dev/null"]

diff --git a/ldh_docker b/ldh_docker
new file mode 100644
index 0000000..009e8f8
--- /dev/null
+++ b/ldh_docker
@@ -0,0 +1,108 @@
+FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
+ENV DEBIAN_FRONTEND=noninteractive
+
+WORKDIR /workspace
+
+# Temporary Installation Directory
+ENV STAGE_DIR=/tmp
+RUN mkdir -p ${STAGE_DIR}
+
+# Installation/Basic Utilities
+RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
+        software-properties-common build-essential autotools-dev \
+        nfs-common pdsh \
+        cmake g++ gcc \
+        curl wget vim tmux emacs less unzip \
+        htop iftop iotop ca-certificates openssh-client openssh-server \
+        rsync iputils-ping net-tools sudo \
+        llvm-dev
+
+# Installation Latest Git
+# RUN add-apt-repository ppa:git-core/ppa -y && \
+RUN \
+    apt-get update && \
+    apt-get install -y git && \
+    git --version
+
+# Client Liveness & Uncomment Port 22 for SSH Daemon
+# Keep SSH client alive from server side
+RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config
+RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \
+    sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
+
+# Mellanox OFED
+ENV MLNX_OFED_VERSION=4.9-7.1.0.0
+RUN apt-get install -y libnuma-dev
+RUN cd ${STAGE_DIR} && \
+    wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - && \
+    cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \
+    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
+    cd ${STAGE_DIR} && \
+    rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64*
+
+# nv_peer_mem
+ENV NV_PEER_MEM_VERSION=1.2
+ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
+RUN mkdir -p ${STAGE_DIR} && \
+    git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
+    cd ${STAGE_DIR}/nv_peer_memory && \
+    ./build_module.sh && \
+    cd ${STAGE_DIR} && \
+    tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
+    cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
+    apt-get update && \
+    apt-get install -y dkms && \
+    dpkg-buildpackage -us -uc && \
+    dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
+
+# OPENMPI
+ENV OPENMPI_BASEVERSION=4.1
+ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
+RUN cd ${STAGE_DIR} && \
+    wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
+    cd openmpi-${OPENMPI_VERSION} && \
+    ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
+    make -j"$(nproc)" install && \
+    ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
+    # Sanity check:
+    test -f /usr/local/mpi/bin/mpic++ && \
+    cd ${STAGE_DIR} && \
+    rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
+ENV PATH=/usr/local/mpi/bin:${PATH} \
+    LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
+    echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
+    echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
+    chmod a+x /usr/local/mpi/bin/mpirun
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        libsndfile-dev \
+        libcupti-dev \
+        libjpeg-dev \
+        libpng-dev \
+        screen \
+        libaio-dev
+
+# Miniconda
+RUN wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ${STAGE_DIR}/miniconda.sh && \
+    bash ${STAGE_DIR}/miniconda.sh -b -p /opt/conda && \
+    rm ${STAGE_DIR}/miniconda.sh && \
+    /opt/conda/bin/conda init bash
+
+ENV PATH /opt/conda/bin:$PATH
+COPY environment.yaml /workspace/environment.yaml
+RUN conda env create -n ldh -f /workspace/environment.yaml
+
+RUN conda run -n ldh pip config set global.index-url http://mirrors.aliyun.com/pypi/simple && \
+    conda run -n ldh pip config set install.trusted-host mirrors.aliyun.com && \
+    conda run -n ldh pip install -U pip && \
+    echo 'root:root' | chpasswd && \
+    sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
+    sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config && \
+    rm -rf /usr/lib/python3/dist-packages/yaml && \
+    rm -rf /usr/lib/python3/dist-packages/PyYAML-*
+
+CMD ["/bin/bash", "-c", "/usr/sbin/sshd -D"]
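+
+# Usage sketch (comments only). The build context must contain the
+# environment.yaml COPYed above; sshd runs on port 22 (uncommented above):
+#   docker build -f ldh_docker -t ldh .
+#   docker run --gpus all -d -p 2222:22 ldh
+#   ssh -p 2222 root@localhost   # password "root", set above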