162 lines
5.6 KiB
Docker
162 lines
5.6 KiB
Docker
# Coordinates of the base image. Each part can be overridden with --build-arg.
# VERSION has no default, so the tag is empty unless VERSION (or the whole
# BASE_CONTAINER reference) is passed in at build time.
ARG REGISTRY="quay.io"
ARG OWNER="jupyter"
ARG LABEL="notebook"
ARG VERSION
ARG BASE_CONTAINER="${REGISTRY}/${OWNER}/${LABEL}:${VERSION}"

FROM ${BASE_CONTAINER}
|
|
# Optional proxy settings. The values are exported as lowercase variables with
# ENV, so they are visible to every RUN step and also stay set in the image.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}

# Build-time only: an ARG is exposed to the RUN steps of this stage, so apt
# stays non-interactive during the build without baking the setting into the
# runtime environment of the final image.
ARG DEBIAN_FRONTEND="noninteractive"

# Root password applied with chpasswd in a later RUN step. Kept as an ARG only
# so the value is not exported into the container's runtime environment.
# NOTE(review): build args still show up in `docker history`; prefer a BuildKit
# secret mount and a non-default password for anything that is not a throwaway image.
ARG ROOT_PASSWD="root"

WORKDIR /root

# Shell-form instructions run under bash and fail when any pipeline stage fails.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
|
|
|
|
# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
ENV MLNX_OFED_VERSION=23.10-3.2.2.0
ENV STAGE_DIR=/tmp
RUN <<EOT
#!/bin/bash
# Heredoc lines are not implicitly chained with &&: stop at the first failure
# instead of letting the final cleanup command hide a broken install.
set -e

# Packages installed ahead of the Mellanox OFED installer.
apt-get update
apt-get install -y libnvidia-compute-535
apt-get install -y pciutils tk kmod libusb-1.0-0 tcl chrpath libpci3 bison lsof graphviz ethtool swig udev libltdl-dev libelf1 libmnl0 debhelper flex libfuse2
# Drop the package index in the same layer that created it.
rm -rf /var/lib/apt/lists/*

# Download, unpack and run the Mellanox OFED installer (user-space only).
ofed_name="MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64"
mkdir -p "${STAGE_DIR}"
wget "http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/${ofed_name}.tgz" -O "${STAGE_DIR}/${ofed_name}.tgz"
tar xzf "${STAGE_DIR}/${ofed_name}.tgz" -C "${STAGE_DIR}"
cd "${STAGE_DIR}/${ofed_name}"
# Installer output is redirected to a log file; print it when the installer
# fails so the reason is visible in the build output.
if ! ./mlnxofedinstall --user-space-only --without-fw-update --all -q > "${STAGE_DIR}/mlnxofedinstall.log" 2>&1; then
    cat "${STAGE_DIR}/mlnxofedinstall.log"
    exit 1
fi
cd "${STAGE_DIR}"
# Remove both the tarball and the unpacked tree in this layer.
rm -rf "${STAGE_DIR}/${ofed_name}"*
EOT
|
|
|
|
# nv_peer_mem version; the git tag and the .deb file name use "<version>-0".
ARG NV_PEER_MEM_VERSION="1.2"
ENV NV_PEER_MEM_VERSION="${NV_PEER_MEM_VERSION}"
ENV NV_PEER_MEM_TAG="${NV_PEER_MEM_VERSION}-0"
RUN <<EOT
#!/bin/bash
# Build nv_peer_mem from source as a Debian package and install it.
# NOTE: this script does not use `set -e`; the exit status of the step is the
# exit status of the final `dpkg -i`.
peer_mem_src=${STAGE_DIR}/nv_peer_memory
mkdir -p ${STAGE_DIR}

# Fetch the tagged sources and run the upstream build script.
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${peer_mem_src}
cd ${peer_mem_src}
./build_module.sh

# Unpack the source tarball into the stage directory and enter the unpacked tree.
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz -C ${STAGE_DIR}
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}

# Repair any broken package state, then install dkms before packaging.
apt-get update
apt --fix-broken install -y
apt-get install -y dkms

# Build the unsigned package and install the resulting .deb.
dpkg-buildpackage -us -uc
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
EOT
|
|
|
|
# base tools
RUN <<EOT
#!/bin/bash
# Heredoc lines are not implicitly chained: fail on the first error, and on a
# failure in any stage of a pipeline (chpasswd, curl | bash).
set -eo pipefail

apt-get update
# First batch is installed with recommended packages.
apt-get install -y bash-completion wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip bzip2 sudo
# Second batch is installed without recommends. Packages already installed by
# the first batch (build-essential, curl, wget, vim, htop, openssh-server,
# sudo) are not repeated.
apt-get install -y --no-install-recommends software-properties-common autotools-dev nfs-common pdsh cmake g++ gcc tmux emacs less unzip iftop iotop ca-certificates openssh-client rsync iputils-ping net-tools llvm-dev re2c
# Install git from the git-core PPA.
add-apt-repository ppa:git-core/ppa -y
apt-get install -y git libnuma-dev
# Drop the package index in the same layer that created it.
rm -rf /var/lib/apt/lists/*

# Configure SSH for password and public key authentication
# NOTE(review): this enables root login with a password; make sure the
# ROOT_PASSWD build argument is overridden for anything reachable over a network.
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
# -p: do not fail when the directory already exists.
mkdir -p /var/run/sshd
echo "root:${ROOT_PASSWD}" | chpasswd
mkdir -p ~/.pip

# SECURITY NOTE(review): the two steps below execute unpinned scripts that are
# downloaded at build time without any checksum verification.
# The x-cmd bootstrap is evaluated inside this shell, so errexit is switched
# off around it to keep it best-effort, as it was before `set -e` was added.
set +e
eval "$(curl https://get.x-cmd.com)"
set -e

# install pixi
curl -fsSL https://pixi.sh/install.sh | bash
EOT
|
|
|
|
RUN <<EOT
#!/bin/bash
# Install xformers from the main branch of its git repository, with verbose
# output, upgrading any copy that is already installed.
pip install --verbose --upgrade "git+https://github.com/facebookresearch/xformers.git@main#egg=xformers"
EOT
|
|
|
|
RUN <<EOT
#!/bin/bash
# Stop if the install fails; otherwise the smoke test below could pass against
# a previously installed transformers and hide the failure.
set -e

# Install transformers from the default branch of its git repository.
pip install git+https://github.com/huggingface/transformers

# Smoke test: build a sentiment-analysis pipeline and run it on one sentence.
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))"
EOT
|
|
|
|
RUN <<EOT
#!/bin/bash
# Stop on the first failure: without this, a failed clone or cd would leave
# `pip install .` running in whatever directory the shell happens to be in.
set -e

# Clone DeepSpeed-Kernels into the stage directory and install it from source.
git clone https://github.com/microsoft/DeepSpeed-Kernels.git "${STAGE_DIR}/DeepSpeed-Kernels"
cd "${STAGE_DIR}/DeepSpeed-Kernels"
python -m pip install -v .
EOT
|
|
|
|
RUN <<EOT
#!/bin/bash
# Build oneCCL from its master branch and install it under /usr/local.
oneccl_src=${STAGE_DIR}/oneCCL
git clone https://github.com/oneapi-src/oneCCL.git ${oneccl_src}
cd ${oneccl_src}

# Discard local modifications, then switch to master.
git checkout .
git checkout master

# Out-of-tree CMake build using one make job per available processor.
mkdir build
cd build
cmake -DCMAKE_INSTALL_PREFIX=/usr/local ..
make --jobs="$(nproc)" install
EOT
|
|
|
|
# DeepSpeed build configuration. Every value can be overridden with
# --build-arg and is re-exported with ENV so that it is visible to the RUN
# steps below and inside the final image.
ARG DEEPSPEED_VERSION="v0.14.3"
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
ARG DS_BUILD_SPARSE_ATTN=0
ARG DS_BUILD_FUSED_ADAM=1
ARG DS_BUILD_CPU_ADAM=1
ARG DS_BUILD_OPS=1
ARG HOSTFILE_CONTENT=""

ENV DEEPSPEED_VERSION="${DEEPSPEED_VERSION}" \
    DEEPSPEED_INSTALL_FLAGS="${DEEPSPEED_INSTALL_FLAGS}" \
    DS_BUILD_SPARSE_ATTN="${DS_BUILD_SPARSE_ATTN}" \
    DS_BUILD_FUSED_ADAM="${DS_BUILD_FUSED_ADAM}" \
    DS_BUILD_CPU_ADAM="${DS_BUILD_CPU_ADAM}" \
    DS_BUILD_OPS="${DS_BUILD_OPS}" \
    HOSTFILE_CONTENT="${HOSTFILE_CONTENT}"

# Toolchain locations. CUDA_HOME is set in its own instruction because values
# assigned inside one ENV instruction are not visible to other assignments in
# that same instruction.
ENV CUTLASS_PATH="/opt/pytorch/pytorch/third_party/cutlass" \
    CUDA_HOME="/usr/local/cuda"

# Put the CUDA libraries and binaries in front of the existing search paths.
ENV LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}" \
    PATH="${CUDA_HOME}/bin:${PATH}"
|
|
RUN <<EOT
#!/bin/bash
# Stop on the first failure so that a failed clone, checkout or install cannot
# be masked by a later command succeeding.
set -e

# Fetch DeepSpeed and pin the working tree to the requested version.
git clone https://github.com/microsoft/DeepSpeed.git "${STAGE_DIR}/DeepSpeed"
cd "${STAGE_DIR}/DeepSpeed"
git checkout "${DEEPSPEED_VERSION}"

# DEEPSPEED_INSTALL_FLAGS is intentionally left unquoted: it holds several
# space-separated flags that must be passed as separate arguments.
./install.sh ${DEEPSPEED_INSTALL_FLAGS}

# Print the DeepSpeed environment report as a post-install check.
ds_report
EOT
|
|
|
|
RUN <<EOT
#!/bin/bash
# Stop on the first failure instead of reporting only the last command's status.
set -e

python -m pip install --upgrade pip
python -m pip install peft tiktoken seaborn blobfile open_clip_torch zstandard mpi4py

# optimum: resolve its dependencies manually.
# Requirement specifiers are quoted: an unquoted `diffusers>=0.17.0` is parsed
# by the shell as `diffusers` plus a redirection of stdout into a file named
# `=0.17.0`, which silently drops the version constraint.
python -m pip install "black~=23.1" "ruff==0.1.5" "diffusers>=0.17.0"
# Quoted so that the [extras] brackets are never treated as a glob pattern.
python -m pip install --no-deps "git+https://github.com/huggingface/optimum.git#egg=optimum[diffusers,quality]"

python -m pip install evaluate datasets
EOT
|
|
|
|
RUN <<EOT
#!/bin/bash
# Definitions in the project directory usually override those in the user's
# home directory.
# Configure the .deepspeed_env file: one KEY=VALUE entry per line, with the
# variable references expanded by the shell at build time.
printf '%s\n' \
    "TORCH_USE_CUDA_DSA=1" \
    "DEEPSPEED_VERBOSE=1" \
    "DEEPSPEED_LOG_LEVEL=DEBUG" \
    "CUTLASS_PATH=${CUTLASS_PATH}" \
    "TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}" \
    "CUDA_HOME=${CUDA_HOME}" \
    "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" \
    > ~/.deepspeed_env

# NOTE(review): this only clears the proxy variables inside this RUN step's
# shell; variables defined with ENV elsewhere in the Dockerfile are unaffected.
unset https_proxy http_proxy
EOT
|
|
|
|
# Run the OpenSSH daemon in the foreground (-D) as the container's main
# process. Exec (JSON-array) form: the stray trailing `|` was removed because
# it made the array invalid JSON, turning the line into a shell-form command
# that ends in a dangling pipe.
CMD ["/usr/sbin/sshd", "-D"]