Revert "Merge branch 'main' into devgpu"

This reverts commit c7bff5448a, reversing
changes made to cea7bc59f3.
This commit is contained in:
Your Name
2024-07-17 04:57:56 +00:00
parent c7bff5448a
commit 86efe1122c
12 changed files with 79 additions and 393 deletions

1
.gitignore vendored
View File

@@ -1,3 +1,2 @@
/docker-stacks
jupyterhub-data
*.tar

View File

@@ -1,4 +1,3 @@
# syntax=docker/dockerfile:1
FROM debian:bullseye
ARG CREATE_USER="jovyan"
ARG CREATE_USER_PASSWD="password"

View File

@@ -8,8 +8,6 @@ ARG ROOT_PASSWD="password"
ARG HOME="/home/${CREATE_USER}"
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
ARG NODEJS_VERSION='18'
ENV NODEJS_VERSION=${NODEJS_VERSION}
USER root
RUN <<EOT
@@ -21,8 +19,6 @@ apt-get install -y tzdata
ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
echo 'Asia/Shanghai' > /etc/timezone
dpkg-reconfigure -f noninteractive tzdata
sudo apt-get remove --purge libnode72:amd64 -y
curl -fsSL https://deb.nodesource.com/setup_${NODEJS_VERSION}.x | sudo -E bash -
# 安装所需的软件包
apt-get install -y python3 python3-pip gcc g++ build-essential nodejs npm gdebi-core curl wget openssh-server vim lrzsz net-tools sudo git
# 创建新用户
@@ -42,25 +38,8 @@ index-url = https://mirrors.aliyun.com/pypi/simple/
trusted-host=mirrors.aliyun.com
" >> ~/.pip/pip.conf
# 安装 Jupyter 和相关软件
npm install -g configurable-http-proxy yarn --registry=https://registry.npmmirror.com
python3 -m pip install ipython jupyter_packaging jupyterhub jupyterlab notebook radian pycurl jupyter-rsession-proxy \
ipykernel jupyterlab-language-pack-zh-CN jupyterlab-git jupyterlab-system-monitor jupyter_nbextensions_configurator \
jupyter_contrib_nbextensions jupyterlab-unfold jupyterlab_widgets jupyterlab-drawio jupyterlab-spreadsheet-editor \
jupyterlab-cell-flash jedi-language-server jupyterlab_code_formatter jupyterlab-spellchecker jupyterlab_vim nbresuse \
ipydrawio jedi ipympl black isort theme-darcula ipywidgets tensorboard jupyterlab_latex jupyter_bokeh autopep8 \
xeus-python jupyterlab-lsp python-lsp-server dockerspawner jupyterhub-nativeauthenticator lckr_jupyterlab_variableinspector -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
git clone https://ghproxy.dockless.eu.org/https://github.com/arose/nglview
cd nglview
python3 setup.py install
cd js
rm -rf node_modules package-lock.json
npm install typescript@latest --registry=https://registry.npmmirror.com
npm install --registry=https://registry.npmmirror.com
cd ../..
python -m ipykernel install --sys-prefix
jupyter nbextension enable --py --sys-prefix widgetsnbextension
jupyter nbextension enable --py --sys-prefix nglview
jupyter-nbextension enable nglview --py --sys-prefix
npm install -g configurable-http-proxy
python3 -m pip install jupyterhub jupyterlab notebook radian pycurl jupyter-rsession-proxy ipykernel jupyterlab-language-pack-zh-CN jupyterlab-git jupyterlab-system-monitor jupyter_nbextensions_configurator jupyter_contrib_nbextensions jupyterlab-unfold jupyterlab_widgets jupyterlab-drawio jupyterlab-spreadsheet-editor jupyterlab-cell-flash jedi-language-server jupyterlab_code_formatter jupyterlab-spellchecker jupyterlab_vim nbresuse ipydrawio jedi ipympl black isort theme-darcula ipywidgets tensorboard jupyterlab_latex jupyter_bokeh autopep8 xeus-python jupyterlab-lsp python-lsp-server nglview dockerspawner jupyterhub-nativeauthenticator lckr_jupyterlab_variableinspector
# 创建 JupyterHub 配置目录
mkdir -p /root/.jupyterhub
EOT
@@ -81,9 +60,22 @@ sudo wget "https://download2.rstudio.org/server/$(lsb_release -cs)/amd64/rstudio
sudo chmod +x /tmp/rstudio-server.deb
sudo gdebi -n /tmp/rstudio-server.deb
sudo rm -rf /tmp/rstudio-server.deb
EOT
# Install micromamba with the hotwa/MicroMamba_Installer script and create the
# workspace directories under /root.
RUN <<EOT
#!/bin/bash
# Install micromamba: the installer is downloaded with curl and executed through
# bash process substitution, with "1" piped to its standard input.
# NOTE(review): "1" presumably answers an interactive prompt in the installer - confirm.
echo "1" | bash <(curl -s https://raw.githubusercontent.com/hotwa/MicroMamba_Installer/main/install.sh)
mkdir -p /root/workspace/personal /root/workspace/project
EOT
# Cleanup step intended to reduce the image size.
# NOTE(review): this runs in its own layer; files written by earlier RUN layers
# stay in those layers, so this may not shrink the final image - confirm.
RUN <<EOT
#!/bin/bash
# Clean up and reduce the image size: clear the apt cache and package lists,
# empty the temp directories, and remove the two-character man directories
# (presumably locale-specific pages - confirm).
apt-get clean
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /usr/share/man/?? /usr/share/man/??_*
EOT
CMD ["jupyterhub", "-f", "/srv/jupyterhub/jupyterhub_config.py"]

View File

@@ -12,7 +12,7 @@ services:
args:
JUPYTERHUB_VERSION: latest
restart: always
image: quay.io/hotwa/jupyterhub:latest
image: hotwa/jupyterhub # registry.cn-hangzhou.aliyuncs.com/hotwa/jupyterhub
container_name: jupyterhub
networks:
- jupyterhub-network
@@ -23,17 +23,16 @@ services:
# within the container
- "/var/run/docker.sock:/var/run/docker.sock:rw"
# Bind Docker volume on host for JupyterHub database and cookie secrets
- "./jupyterhub-data:/data"
- "jupyterhub-data:/data"
ports:
- "8000:8000"
- "8080:8080"
environment:
# This username will be a JupyterHub admin
JUPYTERHUB_ADMIN: admin
# All containers will join this network
DOCKER_NETWORK_NAME: jupyterhub-network
# JupyterHub will spawn this Notebook image for users
DOCKER_NOTEBOOK_IMAGE: quay.io/hotwa/notebook:latest
DOCKER_NOTEBOOK_IMAGE: hotwa/notebook:latest
# Notebook directory inside user image
DOCKER_NOTEBOOK_DIR: /home/jovyan/work

49
id_rsa Executable file
View File

@@ -0,0 +1,49 @@
-----BEGIN OPENSSH PRIVATE KEY-----
b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAACFwAAAAdzc2gtcn
NhAAAAAwEAAQAAAgEArjozHdq/VMgEmQZn5i/3UiRxmU2EZ0J7h0bMV/dIl7dALHDQeGuh
Y8rwcCXsCwNBYGvRhBYkaMIgM+Gi/BTDufgHyJ7UKsYWACdxrj8Ycy1nS/qB2ppuLcRrzi
us/XDuU8eBdqrvjL7KXn6FcBXE3wPSas3rPnBp1o0Fc4ipu91U/LDpZ8RvAesvMFfZc4cm
QO/6zNm7zQ7vEemhXmFKnafe+EGJl9YIJuHud8EkRKwb7YMCjSdUxk/LRN4hlFel5+Hgf5
tMmJpXdIyusCtGVG1IhvQv/R6ojLX+5qZE+3P7FFJWY/KeLqymiC9VCZ7nOmYzhhU7ROIv
I4Eszdpp+vVn93lpfa1jjYIl1WbU9rhCbmjdbthDrK/8R/9NRIgLJbCIdXUOKv+JS2aX+a
0dmPKDTuUAZwU7K5c4ftlfi7eb5D7gHKR4XE9NuOZncG8Idb0OQg79txOhYn/nheV3yv34
VcSQytR/9EoEgjEfdTUhaqovSGEkMWPC7JU3wZnvM0q46xdC/QCjqAciDVtkRJH9GnKpJ1
pnIvHSQMfy65xKcWA22SzlfL6hlEAVAsIj7/g85JkOkOxy/uSxPR0l/lKfPvu/JpaIudTG
R12K3aPoiNsJPR4UGIm1IBjv4f6QiKYjKImAY28MBtwAYKyZrbQBwMcMcsLyMowcU7HHtJ
0AAAdQYDap4GA2qeAAAAAHc3NoLXJzYQAAAgEArjozHdq/VMgEmQZn5i/3UiRxmU2EZ0J7
h0bMV/dIl7dALHDQeGuhY8rwcCXsCwNBYGvRhBYkaMIgM+Gi/BTDufgHyJ7UKsYWACdxrj
8Ycy1nS/qB2ppuLcRrzius/XDuU8eBdqrvjL7KXn6FcBXE3wPSas3rPnBp1o0Fc4ipu91U
/LDpZ8RvAesvMFfZc4cmQO/6zNm7zQ7vEemhXmFKnafe+EGJl9YIJuHud8EkRKwb7YMCjS
dUxk/LRN4hlFel5+Hgf5tMmJpXdIyusCtGVG1IhvQv/R6ojLX+5qZE+3P7FFJWY/KeLqym
iC9VCZ7nOmYzhhU7ROIvI4Eszdpp+vVn93lpfa1jjYIl1WbU9rhCbmjdbthDrK/8R/9NRI
gLJbCIdXUOKv+JS2aX+a0dmPKDTuUAZwU7K5c4ftlfi7eb5D7gHKR4XE9NuOZncG8Idb0O
Qg79txOhYn/nheV3yv34VcSQytR/9EoEgjEfdTUhaqovSGEkMWPC7JU3wZnvM0q46xdC/Q
CjqAciDVtkRJH9GnKpJ1pnIvHSQMfy65xKcWA22SzlfL6hlEAVAsIj7/g85JkOkOxy/uSx
PR0l/lKfPvu/JpaIudTGR12K3aPoiNsJPR4UGIm1IBjv4f6QiKYjKImAY28MBtwAYKyZrb
QBwMcMcsLyMowcU7HHtJ0AAAADAQABAAACAQCh2Xz84eXVy43B3hqM0shNLX++Ky2xgygf
7dwMZ7KvBdP+tLpk849UlgSwGfph7J9CBEb5Dly6WIPzzktLI7sDOm94pltwdN2hPQM1HH
YJIQeRpGAXBFkP+SUwvJASTGOJvo78/yRTujVifORW0FI33mQNRcegFGutnQmQX0dWNvEp
YOsis29v99u5bBKcaNNXhdeVjzsP7iyWllOKypS3SmgudSttuq2N+Vo/M1QO21eCV676E6
yEIUIycqv9E0MS21ljEZChPte5dKsfCsRlz7ONcrBV0rI/LR5jg9gXQ5HeOfL5HDxoHYkC
f/3eVxh0vtZkW7rYbcnekLRfcuP5brfyvdtzXGHGy1upVhmsYIJJ6s2OCEqi5NmQ36SgGO
rVwJMaX9WZbxXFP2Z8Kn/8Acx/pbxaco4X66vRtw6d8GE6WldlzTJF5okq0KSbCnTfyHvm
1j6TaHnFG0GqzX4D3THW7ZOGeud5unvI1MnP64P6q2hGax/KWFuQEl/WTxw3NDAC82kj+4
LqOpPt0OF8m9j+agVxb+dcdczw6MoB8tCWWQwZxYzUNqWAio6V82mTbCu/KFtuTIrd5orq
vLt2dIXeEPkGy2Q4f8RvlqigDQbwQW1lleWdCbX56SFPlASWDGTXF8EsP9m+LDsz9OY5Lu
2aue4YDaFAtOnOm332wQAAAQEAun/OvgihnXYyFdWNr6zlufEqDbHaR7hT7iefqpSM8NsO
rOS9WsSIyVYpIg6Od+/HWTrdpCDDUdRgX0znIhMZF+YjPV8+J8rVbwTCsmpxjBBtVPG3DX
ICb3Y+WHtMznyrk+3vKG+FhWFEip6BXiQQW/9r8by9/Ho19eK8IzmcYDWx81a6CaxgxGM3
OhdcVVxrMHrlX7Lj6guM81pc5L3HM4lYYihEuIDGMC6GqaKqPMISzroCJ1gIvlE+mdyK/u
lXV5tT41OIsnNOdNzTDPc6Iff62Of7KceuQpRAgUXKIReei6JCrGl66Q1EDY8f4RiwGU1o
tSUPJ3oc9KYTsNw6pgAAAQEA06Jk8wSN9siVTazsE0aNcE+cQUAdqjCcfgNt/PR8kfqUGs
9FA33icsulTiOXyzPr5Ua96v6mAtscbkWLPPXVYm9DifarC787EfaeX2SZrmNpELV4I4oj
U/mbKKDoMwtZTuzkClswBPa5o5yoo8TzFISTAdbK26xquS4THaOUsXv0zI26HUMoCf4Idr
06xtUwljLdVdEjBkpXTRMLUbSfMoOOYTPmsMN2yCyz2AI5XCs5ChXF0q7rFdY68VG6n7Go
6Un4HKfpUxdouHexTX4PJsU225oeLtCSMfsil1jf25Z7Kj3/VuEEQ4h/DW+iE7ZBz3Czgd
iiUTZHgTrWm18wkQAAAQEA0sBQdC2Af3psNbk0vQ7MVwMQED7q0OHSpDfKxQdLgo0xASJ0
nTX5HamlA57Owrln0HgW332Xp3vHwL9170pk260xx0R2gndX6JxZTbI6RBhggEfSDj7YOt
aAWEk1zzcqi0IBv51x8gd7xqwOezkxpM2k3Ej93/+/qzEJDwcn/QxCjqwRRQgY3KGr+H+q
R/cFkD8b+lFaLumQu1v7dMJltOD5Ls+rPglIUqWBeMILuyPbmBXxFF00yOuhkcpIwPN266
nTtSishRCLkDL83lWVGw3PmOtifEmlvU8cb+t7d+T/kvwgTUSd6m8oVHphxvx3NBUmpzhO
H0lsz1qPDuEJTQAAABJweWx5emVuZ0BnbWFpbC5jb20BAgMEBQYH
-----END OPENSSH PRIVATE KEY-----

View File

@@ -10,21 +10,7 @@ c = get_config() # noqa: F821
# avoid having to rebuild the JupyterHub container every time we change a
# configuration parameter.
# from dockerspawner import DockerSpawner
# class MyDockerSpawner(DockerSpawner):
# def start(self):
# # 启动父类的start方法
# self.user_options['environment']['JUPYTER_ENABLE_NBEXTENSIONS'] = 'true'
# self.user_options['cmd'] = [
# 'bash',
# '-c',
# 'pip install nglview jupyter_packaging -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com && jupyter nbextension enable nglview --py --sys-prefix && jupyter labextension install nglview-js-widgets && jupyter labextension install @jupyter-widgets/jupyterlab-manager && start-singleuser.sh'
# ]
# return super().start()
# Spawn single-user servers as Docker containers
c.Authenticator.allow_all = True
c.JupyterHub.spawner_class = "dockerspawner.DockerSpawner"
# Spawn containers from this image

View File

@@ -33,7 +33,6 @@ http {
proxy_set_header Host $host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_set_header X-Forwarded-Port $server_port; # 添加这一行,保持原有的端口号
# WebSocket support
proxy_http_version 1.1;

View File

@@ -78,8 +78,6 @@ ARG ROOT_PASSWD="password"
ARG HOME="/home/${CREATE_USER}"
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
ARG NODEJS_VERSION='18'
ENV NODEJS_VERSION=${NODEJS_VERSION}
COPY install.sh /tmp
RUN <<EOT
@@ -92,10 +90,7 @@ ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
echo 'Asia/Shanghai' > /etc/timezone
dpkg-reconfigure -f noninteractive tzdata
# 安装所需的软件包
sudo apt-get remove --purge libnode72:amd64 -y
curl -fsSL https://deb.nodesource.com/setup_${NODEJS_VERSION}.x | sudo -E bash -
apt-get install -y python3 python3-pip gcc g++ build-essential nodejs npm gdebi-core curl wget openssh-server vim lrzsz net-tools sudo git nodejs
npm install -g configurable-http-proxy yarn --registry=https://registry.npmmirror.com
# 创建新用户
useradd -m -s /bin/bash ${CREATE_USER}
echo "${CREATE_USER}:${CREATE_USER_PASSWD}" | chpasswd
@@ -105,7 +100,11 @@ EOT
# Install Jupyter and related software
RUN <<EOT
#!/bin/bash
# Core Jupyter packages, pulled from the Aliyun PyPI mirror.
python3 -m pip install ipython jupyterhub jupyterlab notebook jupyter_packaging -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
npm install -g configurable-http-proxy
# Build and install nglview from source.
git clone https://github.com/arose/nglview
cd nglview
python setup.py install
cd ..
python3 -m pip install aiohttp -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
# The version specifier is quoted: an unquoted ">=6.25.0" is parsed by the shell
# as an output redirection to a file named "=6.25.0", which silently drops the
# version constraint and swallows pip's output.
python3 -m pip install jupyterhub jupyterlab notebook radian pycurl aiohttp jupyter-rsession-proxy 'ipykernel>=6.25.0' jupyterlab-language-pack-zh-CN jupyterlab-git jupyterlab-system-monitor jupyter_nbextensions_configurator jupyter_contrib_nbextensions jupyterlab_widgets jupyterlab-drawio jupyterlab-spreadsheet-editor jupyterlab-cell-flash jedi-language-server jupyterlab_code_formatter jupyterlab-spellchecker jupyterlab_vim nbresuse ipydrawio jedi ipympl black isort theme-darcula ipywidgets tensorboard jupyterlab_latex jupyter_bokeh autopep8 xeus-python jupyterlab-lsp python-lsp-server dockerspawner jupyterhub-nativeauthenticator lckr_jupyterlab_variableinspector -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
EOT
@@ -139,58 +138,23 @@ USER ${NB_UID}
ARG MODULAR_HOME="/home/${CREATE_USER}/.modular"
ENV MODULAR_HOME=$MODULAR_HOME
ENV PATH="$MODULAR_HOME/pkg/packages.modular.com_mojo/bin:/home/${CREATE_USER}/.local/bin:$PATH"
ARG HTTP_PROXY=""
ARG HTTPS_PROXY=""
ARG NO_PROXY="localhost,127.0.0.1"
ENV HTTP_PROXY=$HTTP_PROXY
ENV HTTPS_PROXY=$HTTPS_PROXY
RUN <<EOT
#!/bin/bash
curl --retry 5 https://get.modular.com | sh -
mamba create -n mojo python=3.10 ipykernel ipython nglview -c conda-forge -y
mamba activate mojo && modular install mojo
mamba run -n mojo python -m ipykernel install --user --name="mojo" --display-name="mojo_env"
curl https://get.modular.com | sh -
mamba create -n mojo python=3.10 -c conda-forge -y
mamba activate mojo
modular install mojo
EOT
# Install Rust https://rsproxy.cn/#getStarted
ENV RUSTUP_DIST_SERVER="https://rsproxy.cn"
ENV RUSTUP_UPDATE_ROOT="https://rsproxy.cn/rustup"
# Install Rust
ENV PATH="/home/${CREATE_USER}/.cargo/bin:$PATH"
ENV CARGO_UNSTABLE_SPARSE_REGISTRY="true"
RUN <<EOT
#!/bin/bash
curl --proto '=https' --tlsv1.2 -sSf https://rsproxy.cn/rustup-init.sh | sh -s -- -y
echo '[source.crates-io]
replace-with = "rsproxy-sparse"
[source.rsproxy]
registry = "https://rsproxy.cn/crates.io-index"
[source.rsproxy-sparse]
registry = "sparse+https://rsproxy.cn/index/"
[registries.rsproxy]
index = "https://rsproxy.cn/crates.io-index"
[net]
git-fetch-with-cli = true' >> ~/.cargo/config.toml
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source $HOME/.cargo/env
cargo install evcxr_jupyter
evcxr_jupyter --install
EOT
RUN <<EOT
#!/bin/bash
sudo apt update
sudo apt install -y texlive-full
mamba create -n torch tensorboard jupyter_packaging pytorch biopython biopandas pymol-open-source mdtraj torchvision torchaudio python=3 ipython requests scienceplots autopep8 ipykernel\
pandas numpy matplotlib scipy seaborn orange3 -c pytorch -c nvidia -c conda-forge -y
git clone https://ghproxy.dockless.eu.org/https://github.com/arose/nglview
cd nglview
mamba run -n torch python3 setup.py install
cd js
rm -rf node_modules package-lock.json
npm install typescript@latest --registry=https://registry.npmmirror.com
npm install --registry=https://registry.npmmirror.com
cd ../..
mamba run -n torch python -m ipykernel install --user --name="torch" --display-name="torch_env"
EOT
WORKDIR "${HOME}"

View File

@@ -1,161 +0,0 @@
# The base image is parameterised through build args; the final reference is
# $REGISTRY/$OWNER/$LABEL:$VERSION.
ARG REGISTRY=quay.io
ARG OWNER=jupyter
ARG LABEL=notebook
# VERSION has no default in this file and is expected to be passed as a build arg.
ARG VERSION
ARG BASE_CONTAINER=$REGISTRY/$OWNER/$LABEL:$VERSION
FROM $BASE_CONTAINER
# Optional proxy settings, exported as the lowercase http_proxy/https_proxy
# environment variables.
# NOTE(review): ENV values persist into the built image, so the proxy will also
# be set in running containers - confirm this is intended.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}
# Keep apt/dpkg from prompting during the build.
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}
# Root password (default "root"); applied later in this file with chpasswd.
# NOTE(review): the password is stored in the image environment via ENV.
ARG ROOT_PASSWD="root"
ENV ROOT_PASSWD=${ROOT_PASSWD}
WORKDIR /root
# Run shell-form instructions with bash instead of the default shell.
SHELL ["/bin/bash", "-c"]
# Scratch directory for downloads and source builds. It has to be defined before
# the first RUN that references it; when it is unset, ${STAGE_DIR} expands to an
# empty string and the paths below resolve against / or the current directory.
ENV STAGE_DIR=/tmp
# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
ENV MLNX_OFED_VERSION=23.10-3.2.2.0
# Remove any preinstalled DeepSpeed and install the Mellanox OFED user-space stack.
# NOTE(review): the script does not use `set -e`, so a failing command does not
# stop the build - confirm whether that is intended.
RUN <<EOT
#!/bin/bash
# SYSTEM_NAME=$(lsb_release -cs) # show the distribution codename
# Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
python3 -m pip uninstall -y deepspeed
# This has to be run (again) inside the GPU VMs running the tests.
# The installation works here, but some tests fail, if we do not pre-build deepspeed again in the VMs running the tests.
# TODO: Find out why test fail. install deepspeed
# DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM} DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM} python3 -m pip install "deepspeed<=0.14.0" --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail
# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
# install deepspeed prepare
# install Mellanox OFED
mkdir -p "${STAGE_DIR}"
# Unpack inside STAGE_DIR so the cleanup at the end actually removes the
# extracted tree (the archive is extracted into the current directory).
cd "${STAGE_DIR}"
wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
./mlnxofedinstall --user-space-only --without-fw-update --all -q
cd "${STAGE_DIR}"
rm -rf "${STAGE_DIR}"/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
EOT
# nv_peer_memory version; the git tag to check out is "<version>-0".
ARG NV_PEER_MEM_VERSION="1.2"
ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION}
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
# Build the nv_peer_memory Debian package from source and install it.
RUN <<EOT
#!/bin/bash
# install nv_peer_mem
mkdir -p "${STAGE_DIR}"
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} "${STAGE_DIR}/nv_peer_memory"
cd "${STAGE_DIR}/nv_peer_memory"
./build_module.sh
cd "${STAGE_DIR}"
# Unpack the source tarball, which is expected in STAGE_DIR after build_module.sh.
tar xzf "${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz"
cd "${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}"
apt-get update
apt --fix-broken install -y
apt-get install -y dkms
dpkg-buildpackage -us -uc
dpkg -i "${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb"
EOT
# base tools
# Installs common CLI/build packages and OpenSSH, configures sshd, sets the root
# password, and installs the x-cmd and pixi tools.
RUN <<EOT
#!/bin/bash
apt-get update
apt-get install -y bash-completion wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2 sudo
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
# Register the git-core PPA before installing git.
add-apt-repository ppa:git-core/ppa -y
apt-get install -y git libnuma-dev wget
# Configure SSH for password and public key authentication
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
# Force the listening port to 22, whether the Port line was commented out or set
# to another value.
sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
mkdir /var/run/sshd
# Set the root password from the ROOT_PASSWD environment variable.
echo "root:${ROOT_PASSWD}" | chpasswd
mkdir -p ~/.pip
# Download the x-cmd bootstrap script and evaluate it in this shell.
eval "$(curl https://get.x-cmd.com)"
# install pixi
curl -fsSL https://pixi.sh/install.sh | bash
EOT
# Install xformers (main branch, upgraded with verbose output) and transformers
# directly from their GitHub repositories.
RUN <<EOT
#!/bin/bash
pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
pip install git+https://github.com/huggingface/transformers
EOT
# Scratch directory used for the source checkouts and builds in this file.
ENV STAGE_DIR=/tmp
# Clone DeepSpeed-Kernels into STAGE_DIR and install it with pip from the checkout.
RUN <<EOT
#!/bin/bash
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
cd ${STAGE_DIR}/DeepSpeed-Kernels
python -m pip install -v .
EOT
# Build oneCCL from the master branch and install it under /usr/local.
RUN <<EOT
#!/bin/bash
git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
cd ${STAGE_DIR}/oneCCL
# Discard any local modifications, then switch to master.
git checkout .
git checkout master
# Out-of-tree CMake build; compile with one job per available CPU and install.
mkdir build
cd build
cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
make -j"$(nproc)" install
EOT
# DeepSpeed git ref to check out, and the flags passed to its install.sh.
ARG DEEPSPEED_VERSION="v0.14.3"
ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
# DS_BUILD_* switches, exported into the environment.
# NOTE(review): presumably read by the DeepSpeed build to select which ops are
# pre-compiled - confirm against the DeepSpeed installation docs.
ARG DS_BUILD_SPARSE_ATTN=0
ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
ARG DS_BUILD_FUSED_ADAM=1
ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
ARG DS_BUILD_CPU_ADAM=1
ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
ARG DS_BUILD_OPS=1
ENV DS_BUILD_OPS=${DS_BUILD_OPS}
# NOTE(review): HOSTFILE_CONTENT is exported but not used in the visible part of this file.
ARG HOSTFILE_CONTENT=""
ENV HOSTFILE_CONTENT=${HOSTFILE_CONTENT}
# CUDA / CUTLASS locations; the CUDA bin and lib64 directories are prepended to
# PATH and LD_LIBRARY_PATH.
ENV CUTLASS_PATH="/opt/pytorch/pytorch/third_party/cutlass"
ENV CUDA_HOME="/usr/local/cuda"
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
ENV PATH=${CUDA_HOME}/bin:${PATH}
# Clone DeepSpeed, check out the requested ref, run its installer, and print the
# environment report with ds_report.
RUN <<EOT
#!/bin/bash
git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
cd ${STAGE_DIR}/DeepSpeed
git checkout ${DEEPSPEED_VERSION}
./install.sh ${DEEPSPEED_INSTALL_FLAGS}
ds_report
EOT
# Upgrade pip and install additional Python packages, then install optimum from
# GitHub without letting pip resolve its dependencies.
RUN <<EOT
#!/bin/bash
python -m pip install --upgrade pip
python -m pip install peft tiktoken seaborn blobfile open_clip_torch zstandard mpi4py
# Resolve optimum's dependencies manually.
# Requirement specifiers are quoted: an unquoted ">=0.17.0" is parsed by the
# shell as an output redirection to a file named "=0.17.0", which drops the
# version constraint and hides pip's output.
python -m pip install 'black~=23.1' 'ruff==0.1.5' 'diffusers>=0.17.0'
# Quoted so the "[...]" extras are not treated as a shell glob pattern.
python -m pip install --no-deps 'git+https://github.com/huggingface/optimum.git#egg=optimum[diffusers,quality]'
EOT
# Write ~/.deepspeed_env with the CUDA/CUTLASS settings taken from the build environment.
# NOTE(review): TORCH_CUDA_ARCH_LIST is not set anywhere in this file; presumably
# it is inherited from the base image - confirm.
RUN <<EOT
#!/bin/bash
# Definitions in the project directory usually override those in the user's home directory
# Write the .deepspeed_env file
cat <<EOF > ~/.deepspeed_env
TORCH_USE_CUDA_DSA=1
DEEPSPEED_VERBOSE=1
DEEPSPEED_LOG_LEVEL=DEBUG
CUTLASS_PATH=${CUTLASS_PATH}
TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
CUDA_HOME=${CUDA_HOME}
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
EOF
# NOTE(review): unset only affects this RUN's shell; the http_proxy/https_proxy
# values set with ENV earlier in the file still apply to the image - confirm intent.
unset https_proxy http_proxy
EOT
# Default command: run the OpenSSH daemon in the foreground.
CMD ["/usr/sbin/sshd", "-D"]

View File

@@ -1,53 +1,5 @@
# Base Jupyter Notebook Stack
## ds_report
```shell
[2024-07-17 02:25:56,956] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] async_io: please install the libaio-dev package with apt
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
[WARNING] using untested triton version (3.0.0), only 1.0.0 is known to be compatible
(deepspeed) root@ubuntu-finetune:~/binbbt/train/pretrain# cat .deepspeed_env
CUDA_HOME=/usr/local/cuda/
TORCH_USE_CUDA_DSA=1
CUTLASS_PATH=/opt/cutlass
TORCH_CUDA_ARCH_LIST="80;89;90;90a"
LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
NCCL_DEBUG=WARN
NCCL_SOCKET_IFNAME=bond0
NCCL_IB_HCA=mlx5_0:1,mlx5_2:1,mlx5_4:1,mlx5_6:1
NCCL_IB_GID_INDEX=3
NCCL_NET_GDR_LEVEL=2
NCCL_P2P_DISABLE=0
NCCL_IB_DISABLE=0
```
## test command
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/notebook:ngc
docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 hotwa/notebook:ngc /bin/bash
```shell
nvidia-smi
nvcc -V
ninja --version
ds_report
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
python -c "import deepspeed; deepspeed.ops.op_builder.CPUAdamBuilder().load()"
python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func"
python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
python -c "from xformers import ops as xops"
ibstat
ofed_info -s # If the output shows an OFED version number, the OFED driver is installed.
mst version
mpirun --version
```
> **Images hosted on Docker Hub are no longer updated. Please use the [quay.io image](https://quay.io/repository/jupyter/base-notebook) instead.**
[![docker pulls](https://img.shields.io/docker/pulls/jupyter/base-notebook.svg)](https://hub.docker.com/r/jupyter/base-notebook/)

View File

@@ -1,72 +0,0 @@
version: '3.9'
# DeepSpeed supports a number of C++/CUDA extension ops designed to optimize
# deep-learning training and inference. The main DeepSpeed ops and what they do:
# FusedAdam - fused, optimized Adam optimizer for GPUs.
# FusedLamb - similar to FusedAdam, for the LAMB optimizer; suited to large-scale distributed training.
# SparseAttention - efficient computation of sparse attention.
# Transformer - efficient implementation of the Transformer model.
# TransformerInference - inference optimizations specifically for Transformer models.
# CPUAdam - Adam optimizer optimized for CPU.
# CPULion - Lion optimizer for CPU.
# Quantizer - quantization support to reduce model size and speed up inference.
# RandomLTD - optimizer for random layer pruning.
# StochasticTransformer - supports training and inference of stochastic Transformer models.
# Detect the total system memory, in GB
# TOTAL_MEM=$(awk '/MemTotal/ {printf "%.0f\n", $2/1024/1024}' /proc/meminfo)
# echo "Docker Compose file generated; shm_size set to ${TOTAL_MEM}GB."
services:
  ubuntu-finetune:
    build:
      context: .
      dockerfile: Dockerfile.ngc
      args: # Compatibility table of PyTorch, Python and pytorch_lightning versions: https://blog.csdn.net/qq_41813454/article/details/137421822
        REGISTRY: "nvcr.io"
        OWNER: "nvidia" # nvcr.io/nvidia/pytorch:24.06-py3
        LABEL: "pytorch"
        VERSION: "24.06-py3"
        DS_BUILD_OPS: 1
        DEEPSPEED_VERSION: "master"
        DEEPSPEED_INSTALL_FLAGS: "--allow_sudo"
        HTTP_PROXY: "http://127.0.0.1:15777"
        HTTPS_PROXY: "http://127.0.0.1:15777"
        CACHEBUST: 1
    # volumes:
    #   - ./workspace:/workspace
    #   - /tmp:/tmp
    container_name: ubuntu-ngc
    pull_policy: if_not_present
    # Remove the locked-memory limit for the container.
    ulimits:
      memlock:
        soft: -1
        hard: -1
    # tty: true
    # stdin_open: true
    restart: unless-stopped
    image: hotwa/notebook:ngc
    privileged: true
    ipc: host
    network_mode: host
    shm_size: '128gb'
    # ports:
    #   - 3228:2222
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - TMPDIR=/var/tmp
    # networks:
    #   - network_finetune
    # command: ["/usr/sbin/sshd", "-D"]
    # Reserve all NVIDIA GPUs for this service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
# networks:
#   network_finetune:
#     name: network_finetune

View File

@@ -1,20 +0,0 @@
# install miniconda
# Downloads the latest Miniconda installer, installs it into /opt/conda in batch
# mode, hooks conda into bash, and writes the user's ~/.condarc.
# -q keeps wget quiet; a single -O names the output file.
wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
bash /tmp/miniconda.sh -b -p /opt/conda
rm /tmp/miniconda.sh
# -f replaces an existing link so the script can be re-run without failing here.
ln -sf /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
# Load conda into the current shell so the conda commands below are available.
. /opt/conda/etc/profile.d/conda.sh
conda init bash
conda config --set show_channel_urls true
# Write the .condarc file (this replaces any existing ~/.condarc).
cat <<EOF > ~/.condarc
channels:
- conda-forge
- bioconda
- pytorch
- pytorch-nightly
- nvidia
- defaults
show_channel_urls: true
EOF