# Layers Mellanox OFED (user-space), the nv_peer_memory GPUDirect-RDMA package
# and an SSH daemon on top of a Jupyter notebook base image, for multi-node
# DeepSpeed-style training over InfiniBand.
#
# NOTE(review): the checked-in copy of this file had every heredoc introducer
# mangled by an HTML-tag-stripping extraction ("RUN <&1", "# RUN <=0.17.0",
# "RUN < ~/.deepspeed_env"). The heredocs below are reconstructed; spans whose
# interior was irrecoverably lost are flagged inline — confirm against VCS.
ARG REGISTRY=quay.io
ARG OWNER=jupyter
ARG LABEL=notebook
ARG VERSION
ARG BASE_CONTAINER=$REGISTRY/$OWNER/$LABEL:$VERSION
FROM $BASE_CONTAINER

# Build-time proxy settings so RUN steps can reach the network.
# NOTE(review): ENV persists into the final image (the later shell-level
# `unset` does not remove it) — consider ARG-only proxies if that matters.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ENV http_proxy=${HTTP_PROXY}
ENV https_proxy=${HTTPS_PROXY}

ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}

ARG ROOT_PASSWD="root"
ENV ROOT_PASSWD=${ROOT_PASSWD}

WORKDIR /root
# All RUN instructions (including heredocs) execute under bash.
SHELL ["/bin/bash", "-c"]

# Scratch directory for downloads and out-of-tree builds.
ENV STAGE_DIR=/tmp

# https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
ENV MLNX_OFED_VERSION=23.10-3.2.2.0
RUN <<EOT
set -ex
# from https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile install deepspeed fail
# reference deepspeed install from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
# install deepspeed prepare
apt-get update
apt-get install -y libnvidia-compute-535
apt-get install -y flex tk ethtool libpci3 libltdl-dev bison lsof tcl libelf1 \
    pciutils kmod libmnl0 debhelper libusb-1.0-0 graphviz chrpath swig libfuse2 udev
rm -rf /var/lib/apt/lists/*
# install Mellanox OFED (user-space only; kernel modules come from the host).
# Fix: extract inside ${STAGE_DIR} — the original untarred into WORKDIR (/root)
# but then cd'ed into ${STAGE_DIR}/MLNX_OFED_LINUX-*, which would not exist.
mkdir -p ${STAGE_DIR}
cd ${STAGE_DIR}
wget -q -O - https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64.tgz | tar xzf -
cd ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64
./mlnxofedinstall --user-space-only --without-fw-update --all -q > /tmp/mlnxofedinstall.log 2>&1
cd ${STAGE_DIR}
# rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
EOT

# nv_peer_memory: builds and installs the GPUDirect RDMA peer-memory .deb.
ARG NV_PEER_MEM_VERSION="1.2"
ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION}
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
RUN mkdir -p ${STAGE_DIR} && \
    git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \
    cd ${STAGE_DIR}/nv_peer_memory && \
    ./build_module.sh && \
    cd ${STAGE_DIR} && \
    tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz && \
    cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION} && \
    apt-get update && \
    apt-get --fix-broken install -y && \
    apt-get install -y dkms && \
    dpkg-buildpackage -us -uc && \
    dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb && \
    rm -rf /var/lib/apt/lists/*

# ENV NV_PEER_MEM_VERSION=${NV_PEER_MEM_VERSION}
# ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0

# NOTE(review): a whole pip-install heredoc was lost to garbling here
# ("# RUN <=0.17.0 ..."); only the trailing optimum install survived. The lost
# section appears to have pinned a package ">=0.17.0" (presumably accelerate)
# — recover the missing lines from version control before relying on this image.
RUN <<EOT
set -ex
python -m pip install --no-deps git+https://github.com/huggingface/optimum.git#egg=optimum[diffusers,quality]
EOT

# SSH daemon port inside container cannot conflict with host OS port
ENV SSH_PORT=2222
# NOTE(review): the head of this heredoc (likely openssh-server install, root
# password setup via ${ROOT_PASSWD}, and host/authorized key generation, judging
# by the surrounding ENVs and the sshd_config edit below) was lost to garbling
# — recover from version control.
RUN <<EOT
set -ex
# DeepSpeed propagates these variables to remote ranks via ~/.deepspeed_env.
cat <<EOF > ~/.deepspeed_env
TORCH_USE_CUDA_DSA=1
DEEPSPEED_VERBOSE=1
DEEPSPEED_LOG_LEVEL=DEBUG
CUTLASS_PATH=${CUTLASS_PATH}
TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
CUDA_HOME=${CUDA_HOME}
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}
EOF
# Proxies are build-time only; keep them out of this shell's remaining steps.
unset https_proxy http_proxy
# Rewrite only the FIRST "Port 22" line to the container-local SSH port.
cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config
sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
EOT

# Run sshd in the foreground as PID 1 so the container stays alive.
CMD ["/usr/sbin/sshd", "-D"]