From 8017a3e1049f433f37c78bb8fd60bad01fd45ccb Mon Sep 17 00:00:00 2001 From: hotwa Date: Fri, 21 Jun 2024 18:17:28 +0800 Subject: [PATCH] update --- .gitignore | 3 +- evo/Dockerfile | 259 +------------------------ evo/docker-compose_pytorch1.13.yml | 11 +- evo/docker-compose_pytorch2.3.yml | 17 +- finetune/Dockerfile | 6 +- finetune/docker-compose_pytorch2.3.yml | 14 +- megadna/Dockerfile | 249 +----------------------- megadna/docker-compose_pytorch1.13.yml | 11 +- megadna/docker-compose_pytorch2.3.yml | 13 +- 9 files changed, 53 insertions(+), 530 deletions(-) diff --git a/.gitignore b/.gitignore index 18628d3..722d7bf 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ build_d/ *_src evo_src/ -megaDNA_src/ \ No newline at end of file +megaDNA_src/ +evo/huggingface/ \ No newline at end of file diff --git a/evo/Dockerfile b/evo/Dockerfile index a1ab221..4591208 100644 --- a/evo/Dockerfile +++ b/evo/Dockerfile @@ -113,270 +113,27 @@ conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' p ./ninja_test conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} pyyaml ipython -c conda-forge -y python3 -m pip install --no-cache-dir --upgrade pip -conda run -n ${CONDA_ENV_NAME} python -m pip install open_clip_torch nvidia-ml-py3 opencv-contrib-python conda clean -afy -git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF && cd .. -conda run -n ${CONDA_ENV_NAME} python -m pip install setuptools==${SETUPTOOLS_VERSION} -conda run -n ${CONDA_ENV_NAME} python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] -# # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) -# # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) -conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y torch torchvision torchaudio -# # install pytorch create conda env aleay exists -conda run -n ${CONDA_ENV_NAME} python3 -m pip install torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA} -conda run -n ${CONDA_ENV_NAME} python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate -conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y transformer-engine -conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y torch-tensorrt -conda run -n ${CONDA_ENV_NAME} python3 -m pip uninstall -y apex -EOT - -# install apex -RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... -MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ -python -c "import apex.amp; print('Apex is installed and the amp module is available.')" -cd .. -rm -rf ${STAGE_DIR}/apex EOT RUN <&1 -# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail -# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile -# install deepspeed prepare -# install Mellanox OFED -mkdir -p ${STAGE_DIR} -wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - -cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 -./mlnxofedinstall --user-space-only --without-fw-update --all -q -cd ${STAGE_DIR} -rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* -cd .. -# install nv_peer_mem -rm -rf ${STAGE_DIR} -mkdir -p ${STAGE_DIR} -git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory -cd ${STAGE_DIR}/nv_peer_memory -./build_module.sh -cd ${STAGE_DIR} -tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz -cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} -apt-get update -apt-get install -y dkms -dpkg-buildpackage -us -uc -dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb -EOT - -# install mpi -ENV PATH=/usr/local/mpi/bin:${PATH} -ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} -RUN < /usr/local/mpi/bin/mpirun -echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun -chmod a+x /usr/local/mpi/bin/mpirun -EOT - -# Some Packages -RUN < ${STAGE_DIR}/sshd_config && \ -sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config -EOT - -# 29.78 Usage: install.sh [options...] -# 29.78 -# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in -# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally -# 29.78 -# 29.78 [optional] -# 29.78 -l, --local_only Install only on local machine -# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) -# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) -# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels -# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) -# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) -# 29.78 -e, --examples Checkout deepspeed example submodule (no install) -# 29.78 -v, --verbose Verbose logging -# 29.78 -h, --help This help text - -RUN <> /etc/sudoers -EOT - -# install cutlass https://github.com/NVIDIA/cutlass -# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) -# A100: architecture is Ampere -# V100: architecture is Volta -# T4: architecture is Turing -# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc -# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 -# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 -# 80:适用于 NVIDIA Ampere 架构(如 A100)。 -# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 -# 89:GeForce RTX 4090 -ARG DCUTLASS_NVCC_ARCHS="89" -ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -RUN <= 23.1 (ref: https://pip.pypa.io/en/stable/news/#v23-1) which supports multiple `--config-settings` with the same key... -MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ -python -c "import apex.amp; print('Apex is installed and the amp module is available.')" -cd .. -rm -rf ${STAGE_DIR}/apex EOT RUN <&1 -# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail -# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile -# install deepspeed prepare -# install Mellanox OFED -mkdir -p ${STAGE_DIR} -wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - -cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 -./mlnxofedinstall --user-space-only --without-fw-update --all -q -cd ${STAGE_DIR} -rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* -cd .. -# install nv_peer_mem -rm -rf ${STAGE_DIR} -mkdir -p ${STAGE_DIR} -git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory -cd ${STAGE_DIR}/nv_peer_memory -./build_module.sh -cd ${STAGE_DIR} -tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz -cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} -apt-get update -apt-get install -y dkms -dpkg-buildpackage -us -uc -dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb -EOT +git clone https://github.com/lingxusb/megaDNA.git ${STAGE_DIR}/megaDNA +cd ${STAGE_DIR}/megaDNA +pip install . -# install mpi -ENV PATH=/usr/local/mpi/bin:${PATH} -ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH} -RUN < /usr/local/mpi/bin/mpirun -echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun -chmod a+x /usr/local/mpi/bin/mpirun -EOT - -# Some Packages -RUN < ${STAGE_DIR}/sshd_config && \ -sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config -EOT - -# 29.78 Usage: install.sh [options...] -# 29.78 -# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in -# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally -# 29.78 -# 29.78 [optional] -# 29.78 -l, --local_only Install only on local machine -# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo) -# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo) -# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels -# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror) -# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile) -# 29.78 -e, --examples Checkout deepspeed example submodule (no install) -# 29.78 -v, --verbose Verbose logging -# 29.78 -h, --help This help text - -RUN <> /etc/sudoers -EOT - -# install cutlass https://github.com/NVIDIA/cutlass -# H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" ) -# A100: architecture is Ampere -# V100: architecture is Volta -# T4: architecture is Turing -# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc -# 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。 -# 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。 -# 80:适用于 NVIDIA Ampere 架构(如 A100)。 -# 90a:适用于 NVIDIA Hopper 架构(如 H100)。 -# 89:GeForce RTX 4090 -ARG DCUTLASS_NVCC_ARCHS="89" -ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -RUN <