From 86efe1122c8c64ecd13ef49fb11e86bfdba430e5 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 17 Jul 2024 04:57:56 +0000 Subject: [PATCH] Revert "Merge branch 'main' into devgpu" This reverts commit c7bff5448a4d1840dc254f514bc53fa2d528e3eb, reversing changes made to cea7bc59f3e72f0495a3159c14445c405c8a237b. --- .gitignore | 1 - Dockerfile | 1 - Dockerfile.jupyterhub | 38 ++--- docker-compose.yml | 7 +- id_rsa | 49 +++++++ jupyterhub_config.py | 14 -- nginx.conf | 3 +- spawnerdockerfile/Dockerfile.base-notebook | 58 ++------ spawnerdockerfile/Dockerfile.ngc | 161 --------------------- spawnerdockerfile/README.md | 48 ------ spawnerdockerfile/docker-compose_ngc.yml | 72 --------- spawnerdockerfile/install_conda.sh | 20 --- 12 files changed, 79 insertions(+), 393 deletions(-) create mode 100755 id_rsa delete mode 100644 spawnerdockerfile/Dockerfile.ngc delete mode 100644 spawnerdockerfile/docker-compose_ngc.yml delete mode 100644 spawnerdockerfile/install_conda.sh diff --git a/.gitignore b/.gitignore index c105ea7..8e2defa 100755 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,2 @@ /docker-stacks -jupyterhub-data *.tar \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index c8e02e2..15984d8 100755 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,3 @@ -# syntax=docker/dockerfile:1 FROM debian:bullseye ARG CREATE_USER="jovyan" ARG CREATE_USER_PASSWD="password" diff --git a/Dockerfile.jupyterhub b/Dockerfile.jupyterhub index 9ec6ea4..5e48cae 100755 --- a/Dockerfile.jupyterhub +++ b/Dockerfile.jupyterhub @@ -8,8 +8,6 @@ ARG ROOT_PASSWD="password" ARG HOME="/home/${CREATE_USER}" ARG DEBIAN_FRONTEND="noninteractive" ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND} -ARG NODEJS_VERSION='18' -ENV NODEJS_VERSION=${NODEJS_VERSION} USER root RUN < /etc/timezone dpkg-reconfigure -f noninteractive tzdata -sudo apt-get remove --purge libnode72:amd64 -y -curl -fsSL https://deb.nodesource.com/setup_${NODEJS_VERSION}.x | sudo -E bash - # 安装所需的软件包 apt-get install -y python3 python3-pip gcc g++ build-essential nodejs npm gdebi-core curl wget openssh-server vim lrzsz net-tools sudo git # 创建新用户 @@ -42,25 +38,8 @@ index-url = https://mirrors.aliyun.com/pypi/simple/ trusted-host=mirrors.aliyun.com " >> ~/.pip/pip.conf # 安装 Jupyter 和相关软件 -npm install -g configurable-http-proxy yarn --registry=https://registry.npmmirror.com -python3 -m pip install ipython jupyter_packaging jupyterhub jupyterlab notebook radian pycurl jupyter-rsession-proxy \ -ipykernel jupyterlab-language-pack-zh-CN jupyterlab-git jupyterlab-system-monitor jupyter_nbextensions_configurator \ -jupyter_contrib_nbextensions jupyterlab-unfold jupyterlab_widgets jupyterlab-drawio jupyterlab-spreadsheet-editor \ -jupyterlab-cell-flash jedi-language-server jupyterlab_code_formatter jupyterlab-spellchecker jupyterlab_vim nbresuse \ -ipydrawio jedi ipympl black isort theme-darcula ipywidgets tensorboard jupyterlab_latex jupyter_bokeh autopep8 \ -xeus-python jupyterlab-lsp python-lsp-server dockerspawner jupyterhub-nativeauthenticator lckr_jupyterlab_variableinspector -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com -git clone https://ghproxy.dockless.eu.org/https://github.com/arose/nglview -cd nglview -python3 setup.py install -cd js -rm -rf node_modules package-lock.json -npm install typescript@latest --registry=https://registry.npmmirror.com -npm install --registry=https://registry.npmmirror.com -cd ../.. -python -m ipykernel install --sys-prefix -jupyter nbextension enable --py --sys-prefix widgetsnbextension -jupyter nbextension enable --py --sys-prefix nglview -jupyter-nbextension enable nglview --py --sys-prefix +npm install -g configurable-http-proxy +python3 -m pip install jupyterhub jupyterlab notebook radian pycurl jupyter-rsession-proxy ipykernel jupyterlab-language-pack-zh-CN jupyterlab-git jupyterlab-system-monitor jupyter_nbextensions_configurator jupyter_contrib_nbextensions jupyterlab-unfold jupyterlab_widgets jupyterlab-drawio jupyterlab-spreadsheet-editor jupyterlab-cell-flash jedi-language-server jupyterlab_code_formatter jupyterlab-spellchecker jupyterlab_vim nbresuse ipydrawio jedi ipympl black isort theme-darcula ipywidgets tensorboard jupyterlab_latex jupyter_bokeh autopep8 xeus-python jupyterlab-lsp python-lsp-server nglview dockerspawner jupyterhub-nativeauthenticator lckr_jupyterlab_variableinspector # 创建 JupyterHub 配置目录 mkdir -p /root/.jupyterhub EOT @@ -81,9 +60,22 @@ sudo wget "https://download2.rstudio.org/server/$(lsb_release -cs)/amd64/rstudio sudo chmod +x /tmp/rstudio-server.deb sudo gdebi -n /tmp/rstudio-server.deb sudo rm -rf /tmp/rstudio-server.deb +EOT + +RUN < /etc/timezone dpkg-reconfigure -f noninteractive tzdata # 安装所需的软件包 -sudo apt-get remove --purge libnode72:amd64 -y -curl -fsSL https://deb.nodesource.com/setup_${NODEJS_VERSION}.x | sudo -E bash - apt-get install -y python3 python3-pip gcc g++ build-essential nodejs npm gdebi-core curl wget openssh-server vim lrzsz net-tools sudo git nodejs -npm install -g configurable-http-proxy yarn --registry=https://registry.npmmirror.com # 创建新用户 useradd -m -s /bin/bash ${CREATE_USER} echo "${CREATE_USER}:${CREATE_USER_PASSWD}" | chpasswd @@ -105,7 +100,11 @@ EOT # 安装 Jupyter 和相关软件 RUN <=6.25.0 jupyterlab-language-pack-zh-CN jupyterlab-git jupyterlab-system-monitor jupyter_nbextensions_configurator jupyter_contrib_nbextensions jupyterlab_widgets jupyterlab-drawio jupyterlab-spreadsheet-editor jupyterlab-cell-flash jedi-language-server jupyterlab_code_formatter jupyterlab-spellchecker jupyterlab_vim nbresuse ipydrawio jedi ipympl black isort theme-darcula ipywidgets tensorboard jupyterlab_latex jupyter_bokeh autopep8 xeus-python jupyterlab-lsp python-lsp-server dockerspawner jupyterhub-nativeauthenticator lckr_jupyterlab_variableinspector -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com EOT @@ -139,58 +138,23 @@ USER ${NB_UID} ARG MODULAR_HOME="/home/${CREATE_USER}/.modular" ENV MODULAR_HOME=$MODULAR_HOME ENV PATH="$MODULAR_HOME/pkg/packages.modular.com_mojo/bin:/home/${CREATE_USER}/.local/bin:$PATH" -ARG HTTP_PROXY="" -ARG HTTPS_PROXY="" -ARG NO_PROXY="localhost,127.0.0.1" -ENV HTTP_PROXY=$HTTP_PROXY -ENV HTTPS_PROXY=$HTTPS_PROXY RUN <> ~/.cargo/config.toml +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y source $HOME/.cargo/env cargo install evcxr_jupyter evcxr_jupyter --install EOT -RUN <&1 -# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail -# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile -# install deepspeed prepare -# install Mellanox OFED -mkdir -p ${STAGE_DIR} -wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf - -cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64 -./mlnxofedinstall --user-space-only --without-fw-update --all -q -cd ${STAGE_DIR} -rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64* -EOT - -ARG NV_PEER_MEM_VERSION="1.2" -ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION} -ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 -RUN <=0.17.0 -python -m pip install --no-deps git+https://github.com/huggingface/optimum.git#egg=optimum[diffusers,quality] -EOT - -RUN < ~/.deepspeed_env -TORCH_USE_CUDA_DSA=1 -DEEPSPEED_VERBOSE=1 -DEEPSPEED_LOG_LEVEL=DEBUG -CUTLASS_PATH=${CUTLASS_PATH} -TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} -CUDA_HOME=${CUDA_HOME} -LD_LIBRARY_PATH=${LD_LIBRARY_PATH} -EOF -unset https_proxy http_proxy -EOT - -CMD ["/usr/sbin/sshd", "-D"] \ No newline at end of file diff --git a/spawnerdockerfile/README.md b/spawnerdockerfile/README.md index 635a383..d8576ff 100755 --- a/spawnerdockerfile/README.md +++ b/spawnerdockerfile/README.md @@ -1,53 +1,5 @@ # Base Jupyter Notebook Stack -## ds_report - -```shell -[2024-07-17 02:25:56,956] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) - [WARNING] async_io requires the dev libaio .so object and headers but these were not found. - [WARNING] async_io: please install the libaio-dev package with apt - [WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. - [WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH - [WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4 - [WARNING] using untested triton version (3.0.0), only 1.0.0 is known to be compatible - -(deepspeed) root@ubuntu-finetune:~/binbbt/train/pretrain# cat .deepspeed_env -CUDA_HOME=/usr/local/cuda/ -TORCH_USE_CUDA_DSA=1 -CUTLASS_PATH=/opt/cutlass -TORCH_CUDA_ARCH_LIST="80;89;90;90a" -LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 -NCCL_DEBUG=WARN -NCCL_SOCKET_IFNAME=bond0 -NCCL_IB_HCA=mlx5_0:1,mlx5_2:1,mlx5_4:1,mlx5_6:1 -NCCL_IB_GID_INDEX=3 -NCCL_NET_GDR_LEVEL=2 -NCCL_P2P_DISABLE=0 -NCCL_IB_DISABLE=0 -``` - -## test command - -docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/notebook:ngc -docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 hotwa/notebook:ngc /bin/bash - -```shell -nvidia-smi -nvcc -V -ninja --version -ds_report -python -c "import torch; print('torch:', torch.__version__, torch)" -python -c "import torch; print('CUDA available:', torch.cuda.is_available())" -python -c "import deepspeed; deepspeed.ops.op_builder.CPUAdamBuilder().load()" -python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func" -python -c "import apex.amp; print('Apex is installed and the amp module is available.')" -python -c "from xformers import ops as xops" -ibstat -ofed_info -s # 如果输出显示了 OFED 版本号,则说明 OFED 驱动已安装。 -mst version -mpirun --version -``` - > **Images hosted on Docker Hub are no longer updated. Please, use [quay.io image](https://quay.io/repository/jupyter/base-notebook)** [![docker pulls](https://img.shields.io/docker/pulls/jupyter/base-notebook.svg)](https://hub.docker.com/r/jupyter/base-notebook/) diff --git a/spawnerdockerfile/docker-compose_ngc.yml b/spawnerdockerfile/docker-compose_ngc.yml deleted file mode 100644 index b925144..0000000 --- a/spawnerdockerfile/docker-compose_ngc.yml +++ /dev/null @@ -1,72 +0,0 @@ -version: '3.9' - -# DeepSpeed支持多种C++/CUDA扩展(ops),这些ops旨在优化深度学习的训练和推理过程。以下是一些主要的DeepSpeed ops及其功能: - -# FusedAdam - 提供融合优化的Adam优化器,适用于GPU。 -# FusedLamb - 类似FusedAdam,针对LAMB优化器,适用于大规模分布式训练。 -# SparseAttention - 用于高效计算稀疏注意力机制。 -# Transformer - 提供Transformer模型的高效实现。 -# TransformerInference - 专门用于Transformer模型的推理优化。 -# CPUAdam - 针对CPU优化的Adam优化器。 -# CPULion - 针对CPU的Lion优化器。 -# Quantizer - 提供量化支持,以减少模型大小和提高推理速度。 -# RandomLTD - 用于随机层裁剪的优化器。 -# StochasticTransformer - 支持随机Transformer模型的训练和推理。 - -# 检测系统总内存(以GB为单位) -# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo) -# echo "Docker Compose 文件已生成,shm_size 设置为 ${TOTAL_MEM}GB。" - -services: - ubuntu-finetune: - build: - context: . - dockerfile: Dockerfile.ngc - args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822 - REGISTRY: "nvcr.io" - OWNER: "nvidia" # nvcr.io/nvidia/pytorch:24.06-py3 - LABEL: "pytorch" - VERSION: "24.06-py3" - DS_BUILD_OPS: 1 - DEEPSPEED_VERSION: "master" - DEEPSPEED_INSTALL_FLAGS: "--allow_sudo" - HTTP_PROXY: "http://127.0.0.1:15777" - HTTPS_PROXY: "http://127.0.0.1:15777" - CACHEBUST: 1 - # volumes: - # - ./workspace:/workspace - # - /tmp:/tmp - container_name: ubuntu-ngc - pull_policy: if_not_present - ulimits: - memlock: - soft: -1 - hard: -1 - # tty: true - # stdin_open: true - restart: unless-stopped - image: hotwa/notebook:ngc - privileged: true - ipc: host - network_mode: host - shm_size: '128gb' - # ports: - # - 3228:2222 - environment: - - NVIDIA_VISIBLE_DEVICES=all - - NVIDIA_DRIVER_CAPABILITIES=compute,utility - - TMPDIR=/var/tmp - # networks: - # - network_finetune - # command: ["/usr/sbin/sshd", "-D"] - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] - -# networks: -# network_finetune: -# name: network_finetune diff --git a/spawnerdockerfile/install_conda.sh b/spawnerdockerfile/install_conda.sh deleted file mode 100644 index 420690e..0000000 --- a/spawnerdockerfile/install_conda.sh +++ /dev/null @@ -1,20 +0,0 @@ -# install miniconda -wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh -bash /tmp/miniconda.sh -b -p /opt/conda -rm /tmp/miniconda.sh -ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh -echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc -. /opt/conda/etc/profile.d/conda.sh -conda init bash -conda config --set show_channel_urls true -# 配置 .condarc 文件 -cat < ~/.condarc -channels: - - conda-forge - - bioconda - - pytorch - - pytorch-nightly - - nvidia - - defaults -show_channel_urls: true -EOF \ No newline at end of file