add command

Merged specific files from main branch into devgpu
add self container start notebook
2024-07-17 05:10:41 +00:00 · 2024-07-17 05:01:55 +00:00 · 2024-05-27 22:22:22 +08:00 · 2024-05-27 14:34:59 +08:00 · 2024-05-27 14:03:16 +08:00 · 2024-05-27 13:01:20 +08:00
9 changed files with 513 additions and 58 deletions
--- a/Dockerfile.jupyterhub
+++ b/Dockerfile.jupyterhub
@@ -21,10 +21,11 @@ apt-get install -y tzdata
 ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
 echo 'Asia/Shanghai' > /etc/timezone
 dpkg-reconfigure -f noninteractive tzdata
-sudo apt-get remove --purge libnode72:amd64 -y
-curl -fsSL https://deb.nodesource.com/setup_${NODEJS_VERSION}.x | sudo -E bash - 
 # 安装所需的软件包
-apt-get install -y python3 python3-pip gcc g++ build-essential nodejs npm gdebi-core curl wget openssh-server vim lrzsz net-tools sudo git
+apt-get install -y python3 python3-pip gcc g++ build-essential gdebi-core curl wget openssh-server vim lrzsz net-tools sudo git
+curl -fsSL https://deb.nodesource.com/setup_${NODEJS_VERSION}.x | sudo -E bash - 
+apt-get update
+apt-get install -y nodejs npm
 # 创建新用户
 useradd -m -s /bin/bash ${CREATE_USER}
 echo "${CREATE_USER}:${CREATE_USER_PASSWD}" | chpasswd
@@ -34,7 +35,8 @@ EOT
 RUN <<EOT
 #!/bin/bash
 # 安装 Jupyter 和相关软件
-npm install -g configurable-http-proxy yarn --registry=https://registry.npmmirror.com
+npm install -g configurable-http-proxy yarn typescript-language-server vscode-css-languageserver-bin yaml-language-server \
+vscode-html-languageserver-bin vscode-json-languageserver-bin --registry=https://registry.npmmirror.com
 python3 -m pip install ipython jupyter_packaging jupyterhub jupyterlab notebook radian pycurl jupyter-rsession-proxy \
 ipykernel jupyterlab-language-pack-zh-CN jupyterlab-git jupyterlab-system-monitor jupyter_nbextensions_configurator \
 jupyter_contrib_nbextensions jupyterlab-unfold jupyterlab_widgets jupyterlab-drawio jupyterlab-spreadsheet-editor \
@@ -61,6 +63,8 @@ EOT
 # jupyter nbextension enable --py --sys-prefix widgetsnbextension

 # install Rstudio
+ARG RSERVER_VERSION="rstudio-server-2024.04.1-748-amd64.deb"
+ENV RSERVER_VERSION=${RSERVER_VERSION}
 RUN <<EOT
 #!/bin/bash
 apt update -qq
@@ -69,7 +73,7 @@ wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | sud
 sudo add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/"
 apt-get update
 apt install --no-install-recommends r-base
-sudo wget "https://download2.rstudio.org/server/$(lsb_release -cs)/amd64/rstudio-server-2023.06.1-524-amd64.deb" -O /tmp/rstudio-server.deb
+sudo wget "https://download2.rstudio.org/server/$(lsb_release -cs)/amd64/${RSERVER_VERSION}" -O /tmp/rstudio-server.deb
 sudo chmod +x /tmp/rstudio-server.deb 
 sudo gdebi -n /tmp/rstudio-server.deb
 sudo rm -rf /tmp/rstudio-server.deb
--- a/docker-compose-self.yml
+++ b/docker-compose-self.yml
@@ -0,0 +1,62 @@
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+
+# JupyterHub docker compose configuration file
+version: "3"
+
+services:
+  hub:
+    build:
+      context: .
+      dockerfile: Dockerfile.jupyterhub
+      args:
+        JUPYTERHUB_VERSION: latest
+    restart: always
+    image: quay.io/hotwa/jupyterhub:latest
+    container_name: jupyterhub
+    networks:
+      - jupyterhub-network
+    volumes:
+      # The JupyterHub configuration file
+      - "./jupyterhub_config_self.py:/srv/jupyterhub/jupyterhub_config.py:ro"
+      # Bind Docker socket on the host so we can connect to the daemon from
+      # within the container
+      - "/var/run/docker.sock:/var/run/docker.sock:rw"
+      # Bind-mount a host directory for the JupyterHub database and cookie secrets
+      - "./jupyterhub-data:/data"
+    ports:
+      - "8000:8000"
+      - "8080:8080"
+    environment:
+      # This username will be a JupyterHub admin
+      JUPYTERHUB_ADMIN: admin
+      # All containers will join this network
+      DOCKER_NETWORK_NAME: jupyterhub-network
+      # JupyterHub will spawn this Notebook image for users
+      DOCKER_NOTEBOOK_IMAGE: quay.io/hotwa/notebook:latest
+      # Notebook directory inside user image
+      DOCKER_NOTEBOOK_DIR: /home/jovyan/work
+
+  nginx:
+    image: nginx:latest
+    container_name: nginx-proxy
+    depends_on:
+      - hub
+    volumes:
+      - "./nginx.conf:/etc/nginx/nginx.conf:ro"
+      - "./nginx-selfsigned.crt:/etc/ssl/certs/nginx-selfsigned.crt:ro"
+      - "./nginx-selfsigned.key:/etc/ssl/private/nginx-selfsigned.key:ro"
+      - "./dhparam.pem:/etc/ssl/certs/dhparam.pem:ro"
+    ports:
+      - "50000:443"
+    networks:
+      - jupyterhub-network
+
+volumes:
+  jupyterhub-data:
+
+networks:
+  jupyterhub-network:
+    name: jupyterhub-network
+
+# Access JupyterHub at http://127.0.0.1:8000
--- a/jupyterhub_config.py.bak
+++ b/jupyterhub_config.py.bak
@@ -1,68 +1,108 @@
+# Copyright (c) Jupyter Development Team.
+# Distributed under the terms of the Modified BSD License.
+
+# Configuration file for JupyterHub
 import os
-from pathlib import Path
-from dockerspawner import DockerSpawner

-c = get_config()
-c.Application.log_level = 'DEBUG'
+c = get_config()  # noqa: F821

-# 基本的JupyterHub配置
-c.JupyterHub.cookie_secret_file = os.path.expanduser('~/.jupyterhub/jupyterhub_cookie_secret')
-db_file = os.path.expanduser('~/.jupyterhub/jupyterhub.sqlite')
-c.JupyterHub.db_url = f'sqlite:///{db_file}'
-c.ConfigurableHTTPProxy.pid_file = os.path.expanduser('~/.jupyterhub/jupyterhub-proxy.pid')
+# We rely on environment variables to configure JupyterHub so that we
+# avoid having to rebuild the JupyterHub container every time we change a
+# configuration parameter.

-# Authenticator 设置
-c.JupyterHub.authenticator_class = 'jupyterhub.auth.PAMAuthenticator'
-c.PAMAuthenticator.encoding = 'utf8'
-c.Authenticator.admin_users = set()
-c.Authenticator.allowed_users = set()
-c.LocalAuthenticator.create_system_users = True
+# from dockerspawner import DockerSpawner

-# Spawner 设置
-c.Spawner.ip = '127.0.0.1'
-c.Spawner.cmd = ['jupyter-labhub']
-c.Spawner.default_url = '/lab'
-c.LocalProcessSpawner.shell_cmd = ["bash", "-l", "-c"]
-c.Spawner.notebook_dir = '~'
-c.Spawner.args = ['--allow-root', "--KernelSpecManager.ensure_native_kernel=False", '--NotebookApp.allow_origin_pat=https://.*vscode-cdn\\.net', '--NotebookApp.iopub_data_rate_limit=10000000']
+# class MyDockerSpawner(DockerSpawner):
+#     def start(self):
+#         # 启动父类的start方法
+#         self.user_options['environment']['JUPYTER_ENABLE_NBEXTENSIONS'] = 'true'
+#         self.user_options['cmd'] = [
+#             'bash',
+#             '-c',
+#             'pip install nglview jupyter_packaging -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com && jupyter nbextension enable nglview --py --sys-prefix && jupyter labextension install nglview-js-widgets && jupyter labextension install @jupyter-widgets/jupyterlab-manager && start-singleuser.sh'
+#         ]
+#         return super().start()
+        
+# Spawn single-user servers as Docker containers
+c.Authenticator.allow_all = True
+c.JupyterHub.spawner_class = "dockerspawner.DockerSpawner"

-# 环境变量保持
-c.Spawner.env_keep = ['PATH', 'PYTHONPATH', 'LD_LIBRARY_PATH', 'ENV1', 'ENV2']
+# Spawn containers from this image
+c.DockerSpawner.image = os.environ["DOCKER_NOTEBOOK_IMAGE"]

-# JupyterHub 服务配置
-c.JupyterHub.ip = '0.0.0.0'
-c.JupyterHub.port = 9000
-c.JupyterHub.shutdown_on_logout = True
-c.JupyterHub.statsd_prefix = 'jupyterhub'
-c.JupyterHub.page_title = 'JupyterHub Service'
+# Connect containers to this Docker network
+network_name = os.environ["DOCKER_NETWORK_NAME"]
+c.DockerSpawner.use_internal_ip = True
+c.DockerSpawner.network_name = network_name

-# Dockerspawner 配置（如果需要启用）
-c.JupyterHub.spawner_class = DockerSpawner
-c.DockerSpawner.allowed_images='*'
+# Explicitly set notebook directory because we'll be mounting a volume to it.
+# Most `jupyter/docker-stacks` *-notebook images run the Notebook server as
+# user `jovyan`, and set the notebook directory to `/home/jovyan/work`.
+# We follow the same convention.
+notebook_dir = os.environ.get("DOCKER_NOTEBOOK_DIR", "/home/jovyan/work")
+c.DockerSpawner.notebook_dir = notebook_dir

-# Docker 守护进程的地址
-c.DockerSpawner.docker_host = 'unix:///var/run/docker.sock'
+# Mount the real user's Docker volume on the host to the notebook user's
+# notebook directory in the container
+# c.DockerSpawner.volumes = {"jupyterhub-user-{username}": notebook_dir}
+# Mount the real user's Docker volume on the host to the notebook user's
+# notebook directory in the container
+c.DockerSpawner.volumes = {
+    "jupyterhub-user-{username}": notebook_dir,
+    "/mnt/mydrive": "/home/jovyan/work/mydrive",
+    "/mnt/mydrive/project/docker-jupyterhub/id_rsa": "/home/jovyan/.ssh/id_rsa",
+}

-# 使用的 Docker 镜像
-c.DockerSpawner.image = 'quay.io/jupyter/scipy-notebook'

-# 删除容器当它停止时
+# Remove containers once they are stopped
 c.DockerSpawner.remove = True

-# 设置网络（如果您有特定的 Docker 网络配置）
-# c.DockerSpawner.network_name = 'jupyterhub'
+# For debugging arguments passed to spawned containers
+c.DockerSpawner.debug = True
+# c.Application.log_level = 'DEBUG'

-# JupyterHub 的连接地址，用于 DockerSpawner 内部通信
-# 如果 JupyterHub 运行在同一 Docker 网络中，可以使用 Docker 容器名称
-# c.JupyterHub.hub_connect_ip = 'jupyterhub'
+# User containers will access hub by container name on the Docker network
+c.JupyterHub.hub_ip = 'jupyterhub'
+c.JupyterHub.hub_port = 8080

-# 其他配置...
+# Persist hub data on volume mounted inside container
+c.JupyterHub.cookie_secret_file = "/data/jupyterhub_cookie_secret"
+c.JupyterHub.db_url = "sqlite:////data/jupyterhub.sqlite"

-# 注意：下面这行配置是不必要的，因为您已经使用 Unix 套接字
-# c.DockerSpawner.docker_host = 'tcp://docker-daemon-host:2375'
-# 如果使用TLS（根据需要取消注释）
-# os.environ['DOCKER_TLS_VERIFY'] = '1'
-# os.environ['DOCKER_CERT_PATH'] = '/path/to/certificates'
+# Authenticate users with Native Authenticator
+c.JupyterHub.authenticator_class = "nativeauthenticator.NativeAuthenticator"
+
+# Allow anyone to sign-up without approval
+c.NativeAuthenticator.open_signup = True
+
+# Allowed admins
+admin = os.environ.get("JUPYTERHUB_ADMIN")
+if admin:
+    c.Authenticator.admin_users = [admin]
+
+# c.DockerSpawner.extra_create_kwargs.update({
+#    "environment": {"JUPYTER_ENABLE_LAB": "yes"}
+# })
+
+# Enable cross-origin requests when starting Jupyter; otherwise access through a reverse proxy fails
+# --NotebookApp.iopub_data_rate_limit=10000000 is needed by nglview
+c.DockerSpawner.extra_create_kwargs.update({
+    "environment": {"NOTEBOOK_ARGS": "--NotebookApp.allow_origin='*' --NotebookApp.iopub_data_rate_limit=10000000"}
+})
+
+
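+# To allow origins matched by a regular expression, set the allow_origin_pat parameter, which takes a regex describing the domains allowed to make cross-origin requests. For example, to allow every domain ending in .example.com, add the following to jupyterhub_config.py: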
+
+# c.Spawner.environment = {
+#     'JUPYTERHUB_CORS': '{"allow_origin_pat": "https?://.*\\.example\\.com"}'
+# }
+
+# GPU runtime and device-visibility configuration
+c.DockerSpawner.extra_host_config = {
+    'runtime': 'nvidia'
+}
+c.DockerSpawner.environment = {
+    'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility',
+    'NVIDIA_VISIBLE_DEVICES': 'all'
+}

-# 其他配置（根据需要添加）
-# ...
--- a/jupyterhub_config_self.py
+++ b/jupyterhub_config_self.py
@@ -0,0 +1,48 @@
+import os
+
+c = get_config()  # noqa: F821
+
+# Basic configuration
+c.Authenticator.allow_all = True
+c.JupyterHub.spawner_class = "jupyterhub.spawner.LocalProcessSpawner"
+
+# Single-user server configuration
+c.Spawner.cmd = ['jupyter-labhub']
+c.Spawner.default_url = '/lab'
+c.Spawner.notebook_dir = '/home/jovyan'
+c.Spawner.environment = {
+    'JUPYTER_ENABLE_LAB': 'yes',
+    'NOTEBOOK_ARGS': '--NotebookApp.allow_origin="*" --NotebookApp.iopub_data_rate_limit=10000000',
+}
+
+# Hub IP and port configuration
+c.JupyterHub.hub_ip = '0.0.0.0'
+c.JupyterHub.hub_port = 8080
+
+# Cookie secret and database URL
+c.JupyterHub.cookie_secret_file = '/srv/jupyterhub/jupyterhub_cookie_secret'
+c.JupyterHub.db_url = 'sqlite:////srv/jupyterhub/jupyterhub.sqlite'
+
+# Authenticator configuration
+c.JupyterHub.authenticator_class = 'nativeauthenticator.NativeAuthenticator'
+c.NativeAuthenticator.open_signup = True
+
+# Admin user configuration
+admin = os.environ.get('JUPYTERHUB_ADMIN')
+if admin:
+    c.Authenticator.admin_users = {admin}
+
+# Debug mode
+c.JupyterHub.log_level = 'DEBUG'
+c.Spawner.debug = True
+
+# GPU environment variables (enable only when GPUs are needed)
+c.Spawner.environment.update({
+    'NVIDIA_DRIVER_CAPABILITIES': 'compute,utility',
+    'NVIDIA_VISIBLE_DEVICES': 'all'
+})
+
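+# Optional: GPU runtime configuration. NOTE(review): extra_host_config looks like a DockerSpawner option; confirm it has any effect with the LocalProcessSpawner selected in this file.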
+c.Spawner.extra_host_config = {
+    'runtime': 'nvidia'
+}
--- a/spawnerdockerfile/Dockerfile.base-notebook
+++ b/spawnerdockerfile/Dockerfile.base-notebook
@@ -92,9 +92,9 @@ ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
 echo 'Asia/Shanghai' > /etc/timezone
 dpkg-reconfigure -f noninteractive tzdata
 # 安装所需的软件包
-sudo apt-get remove --purge libnode72:amd64 -y
+apt-get install -y python3 python3-pip gcc g++ build-essential gdebi-core curl wget openssh-server vim lrzsz net-tools sudo git nodejs
 curl -fsSL https://deb.nodesource.com/setup_${NODEJS_VERSION}.x | sudo -E bash - 
-apt-get install -y python3 python3-pip gcc g++ build-essential nodejs npm gdebi-core curl wget openssh-server vim lrzsz net-tools sudo git nodejs
+apt-get install -y nodejs npm
 npm install -g configurable-http-proxy yarn --registry=https://registry.npmmirror.com
 # 创建新用户
 useradd -m -s /bin/bash ${CREATE_USER}
--- a/spawnerdockerfile/Dockerfile.ngc
+++ b/spawnerdockerfile/Dockerfile.ngc
@@ -0,0 +1,161 @@
+ARG REGISTRY=quay.io
+ARG OWNER=jupyter
+ARG LABEL=notebook
+ARG VERSION
+ARG BASE_CONTAINER=$REGISTRY/$OWNER/$LABEL:$VERSION
+FROM $BASE_CONTAINER
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ENV http_proxy=${HTTP_PROXY}
+ENV https_proxy=${HTTPS_PROXY}
+ARG DEBIAN_FRONTEND="noninteractive"
+ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
+ARG ROOT_PASSWD="root"
+ENV ROOT_PASSWD=${ROOT_PASSWD}
+WORKDIR /root
+SHELL ["/bin/bash", "-c"]
+
+# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/ (STAGE_DIR is set here because the RUN steps below use it)
+ENV MLNX_OFED_VERSION=23.10-3.2.2.0 STAGE_DIR=/tmp
+RUN <<EOT
+#!/bin/bash
+# SYSTEM_NAME=$(lsb_release -cs) # show the distribution codename
+# Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
+python3 -m pip uninstall -y deepspeed
+# This has to be run (again) inside the GPU VMs running the tests.
+# The installation works here, but some tests fail, if we do not pre-build deepspeed again in the VMs running the tests.
+# TODO: Find out why the tests fail. install deepspeed
+# DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM} python3 -m pip install "deepspeed<=0.14.0" --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
+# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail 
+# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
+# install deepspeed prepare
+# install Mellanox OFED (download and unpack inside STAGE_DIR so the cleanup at the end removes the extracted tree)
+mkdir -p ${STAGE_DIR} && cd ${STAGE_DIR}
+wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
+cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
+./mlnxofedinstall --user-space-only --without-fw-update --all -q 
+cd ${STAGE_DIR} 
+rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
+EOT
+
+ARG NV_PEER_MEM_VERSION="1.2"
+ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION}
+ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
+RUN <<EOT
+#!/bin/bash
+# install nv_peer_mem
+mkdir -p ${STAGE_DIR}
+git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory
+cd ${STAGE_DIR}/nv_peer_memory
+./build_module.sh
+cd ${STAGE_DIR}
+tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
+cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
+apt-get update
+apt --fix-broken install -y
+apt-get install -y dkms
+dpkg-buildpackage -us -uc
+dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
+EOT
+
+# base tools
+RUN <<EOT
+#!/bin/bash
+apt-get update
+apt-get install -y bash-completion wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2 sudo 
+apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
+add-apt-repository ppa:git-core/ppa -y
+apt-get install -y git libnuma-dev wget
+# Configure SSH for password and public key authentication
+sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
+sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
+sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
+sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
+sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
+mkdir /var/run/sshd
+echo "root:${ROOT_PASSWD}" | chpasswd
+mkdir -p ~/.pip
+eval "$(curl https://get.x-cmd.com)"
+# install pixi
+curl -fsSL https://pixi.sh/install.sh | bash
+EOT
+
+RUN <<EOT
+#!/bin/bash
+pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
+pip install git+https://github.com/huggingface/transformers
+EOT
+
+ENV STAGE_DIR=/tmp
+RUN <<EOT
+#!/bin/bash
+git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
+cd ${STAGE_DIR}/DeepSpeed-Kernels
+python -m pip install -v .
+EOT
+
+RUN <<EOT
+#!/bin/bash
+git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
+cd ${STAGE_DIR}/oneCCL
+git checkout . 
+git checkout master
+mkdir build
+cd build 
+cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
+make -j"$(nproc)" install
+EOT
+
+ARG DEEPSPEED_VERSION="v0.14.3"
+ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
+ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
+ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
+ARG DS_BUILD_SPARSE_ATTN=0
+ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
+ARG DS_BUILD_FUSED_ADAM=1
+ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
+ARG DS_BUILD_CPU_ADAM=1
+ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
+ARG DS_BUILD_OPS=1
+ENV DS_BUILD_OPS=${DS_BUILD_OPS}
+ARG HOSTFILE_CONTENT=""
+ENV HOSTFILE_CONTENT=${HOSTFILE_CONTENT}
+ENV CUTLASS_PATH="/opt/pytorch/pytorch/third_party/cutlass"
+ENV CUDA_HOME="/usr/local/cuda"
+ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+ENV PATH=${CUDA_HOME}/bin:${PATH}
+RUN <<EOT
+#!/bin/bash
+git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
+cd ${STAGE_DIR}/DeepSpeed
+git checkout ${DEEPSPEED_VERSION}
+./install.sh ${DEEPSPEED_INSTALL_FLAGS}
+ds_report
+EOT
+
+RUN <<EOT
+#!/bin/bash
+python -m pip install --upgrade pip
+python -m pip install peft tiktoken seaborn blobfile open_clip_torch zstandard mpi4py
+# Resolve optimum's dependencies manually; specifiers are quoted so the shell does not treat '>' as a redirection or '[...]' as a glob
+python -m pip install "black~=23.1" "ruff==0.1.5" "diffusers>=0.17.0"
+python -m pip install --no-deps "git+https://github.com/huggingface/optimum.git#egg=optimum[diffusers,quality]"
+EOT
+
+RUN <<EOT
+#!/bin/bash
+# Definitions in the project directory usually override those in the user's home directory
+# Write the .deepspeed_env file
+cat <<EOF > ~/.deepspeed_env
+TORCH_USE_CUDA_DSA=1
+DEEPSPEED_VERBOSE=1
+DEEPSPEED_LOG_LEVEL=DEBUG
+CUTLASS_PATH=${CUTLASS_PATH}
+TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
+CUDA_HOME=${CUDA_HOME}
+LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
+EOF
+unset https_proxy http_proxy
+EOT
+
+CMD ["/usr/sbin/sshd", "-D"]
--- a/spawnerdockerfile/README.md
+++ b/spawnerdockerfile/README.md
@@ -1,5 +1,53 @@
 # Base Jupyter Notebook Stack

+## ds_report
+
+```shell
+[2024-07-17 02:25:56,956] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+ [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
+ [WARNING]  async_io: please install the libaio-dev package with apt
+ [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+ [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+ [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
+ [WARNING]  using untested triton version (3.0.0), only 1.0.0 is known to be compatible
+
+(deepspeed) root@ubuntu-finetune:~/binbbt/train/pretrain# cat .deepspeed_env 
+CUDA_HOME=/usr/local/cuda/
+TORCH_USE_CUDA_DSA=1
+CUTLASS_PATH=/opt/cutlass
+TORCH_CUDA_ARCH_LIST="80;89;90;90a"
+LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+NCCL_DEBUG=WARN
+NCCL_SOCKET_IFNAME=bond0
+NCCL_IB_HCA=mlx5_0:1,mlx5_2:1,mlx5_4:1,mlx5_6:1
+NCCL_IB_GID_INDEX=3
+NCCL_NET_GDR_LEVEL=2
+NCCL_P2P_DISABLE=0
+NCCL_IB_DISABLE=0
+```
+
+## test command
+
+docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/notebook:ngc
+docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 hotwa/notebook:ngc /bin/bash
+
+```shell
+nvidia-smi
+nvcc -V
+ninja --version
+ds_report
+python -c "import torch; print('torch:', torch.__version__, torch)"
+python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+python -c "import deepspeed; deepspeed.ops.op_builder.CPUAdamBuilder().load()"
+python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func"
+python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
+python -c "from xformers import ops as xops"
+ibstat
+ofed_info  -s # If the output shows an OFED version number, the OFED driver is installed.
+mst version
+mpirun --version
+```
+
 > **Images hosted on Docker Hub are no longer updated. Please, use [quay.io image](https://quay.io/repository/jupyter/base-notebook)**

 [![docker pulls](https://img.shields.io/docker/pulls/jupyter/base-notebook.svg)](https://hub.docker.com/r/jupyter/base-notebook/)
--- a/spawnerdockerfile/docker-compose_ngc.yml
+++ b/spawnerdockerfile/docker-compose_ngc.yml
@@ -0,0 +1,72 @@
+version: '3.9'
+
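+# DeepSpeed supports several C++/CUDA extensions (ops) designed to optimize deep-learning training and inference. The main DeepSpeed ops and what they do:
+
+# FusedAdam - fused Adam optimizer for GPUs.
+# FusedLamb - like FusedAdam but for the LAMB optimizer; suited to large-scale distributed training.
+# SparseAttention - efficient computation of sparse attention.
+# Transformer - efficient implementation of Transformer models.
+# TransformerInference - inference optimizations specifically for Transformer models.
+# CPUAdam - Adam optimizer optimized for CPUs.
+# CPULion - Lion optimizer for CPUs.
+# Quantizer - quantization support to reduce model size and speed up inference.
+# RandomLTD - op for random layer-wise token dropping (random-LTD).
+# StochasticTransformer - supports training and inference of stochastic Transformer models.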
+
+# Detect total system memory (in GB)
+# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo)
+# echo "Docker Compose 文件已生成，shm_size 设置为 ${TOTAL_MEM}GB。"
+
+services:
+  ubuntu-finetune:
+    build: 
+      context: .
+      dockerfile: Dockerfile.ngc
+      args: # PyTorch版本、Python版本与pytorch_lightning版本的对应关系表 https://blog.csdn.net/qq_41813454/article/details/137421822
+        REGISTRY: "nvcr.io"
+        OWNER: "nvidia" # nvcr.io/nvidia/pytorch:24.06-py3
+        LABEL: "pytorch"
+        VERSION: "24.06-py3"
+        DS_BUILD_OPS: 1
+        DEEPSPEED_VERSION: "master"
+        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
+        HTTP_PROXY: "http://127.0.0.1:15777"
+        HTTPS_PROXY: "http://127.0.0.1:15777"
+        CACHEBUST: 1
+    # volumes:
+    #   - ./workspace:/workspace
+      # - /tmp:/tmp
+    container_name: ubuntu-ngc
+    pull_policy: if_not_present
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    # tty: true
+    # stdin_open: true
+    restart: unless-stopped
+    image: hotwa/notebook:ngc
+    privileged: true
+    ipc: host
+    network_mode: host
+    shm_size: '128gb'
+    # ports:
+    #   - 3228:2222
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - TMPDIR=/var/tmp
+    # networks:
+    #   - network_finetune
+    # command: ["/usr/sbin/sshd", "-D"]
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+
+# networks:
+#   network_finetune:
+#     name: network_finetune
--- a/spawnerdockerfile/install_conda.sh
+++ b/spawnerdockerfile/install_conda.sh
@@ -0,0 +1,20 @@
+# install miniconda
+wget -qO- https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
+bash /tmp/miniconda.sh -b -p /opt/conda 
+rm /tmp/miniconda.sh 
+ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
+echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc 
+. /opt/conda/etc/profile.d/conda.sh 
+conda init bash
+conda config --set show_channel_urls true
+# Write the .condarc file
+cat <<EOF > ~/.condarc
+channels:
+  - conda-forge
+  - bioconda
+  - pytorch
+  - pytorch-nightly
+  - nvidia
+  - defaults
+show_channel_urls: true
+EOF
Author	SHA1	Message	Date
Your Name	538f73b294	add command	2024-07-17 05:10:41 +00:00
Your Name	779ca9a2b2	Merged specific files from main branch into devgpu	2024-07-17 05:01:55 +00:00
hotwa	3690813ae9	add self container start notebook	2024-05-27 22:22:22 +08:00
hotwa	c9f79c2af4	update file	2024-05-27 14:34:59 +08:00
hotwa	7ecc5f2671	change install nodejs sequence	2024-05-27 14:03:16 +08:00
hotwa	c8389c4855	update bak	2024-05-27 13:01:20 +08:00
hotwa	d31595f238	change latest rserver	2024-05-27 12:36:28 +08:00