first add
This commit is contained in:
139
evo/Dockerfile
Normal file
139
evo/Dockerfile
Normal file
@@ -0,0 +1,139 @@
|
||||
# syntax=docker/dockerfile:1
# The RUN steps in this file use heredocs, which need the Dockerfile 1.4+
# frontend; the directive above must stay the first line of the file.

# CUDA toolkit version used to pick the nvidia/cuda base image tag. Declared
# before FROM, so it is only visible to the FROM instruction.
ARG CUDA_VERSION=12.1.0
FROM nvidia/cuda:${CUDA_VERSION}-cudnn8-devel-ubuntu20.04

# Keep apt non-interactive during the build (and, via ENV, in the container).
ARG DEBIAN_FRONTEND="noninteractive"
ENV DEBIAN_FRONTEND=${DEBIAN_FRONTEND}

# ENV values are not processed by a shell, so "~" would be stored literally.
# Spell out the home directory instead: WORKDIR is /root and the later steps
# write to ~/ while running without a USER instruction.
ENV MAMBA_ROOT_PREFIX=/root/micromamba

# Name and Python version of the conda environment created later in the build.
ARG CONDA_ENV_NAME="deepspeed"
ENV CONDA_ENV_NAME=${CONDA_ENV_NAME}
ARG PYTHON_VERSION=3.10
ENV PYTHON_VERSION=${PYTHON_VERSION}

# NOTE(review): values passed through ARG/ENV are recorded in the image
# metadata and history, so this password is readable by anyone with the image.
# Consider a BuildKit secret mount or key-only SSH login instead.
ARG ROOT_PASSWD="root"
ENV ROOT_PASSWD=${ROOT_PASSWD}

# Put the conda base and the project environment ahead of the system PATH.
ENV PATH=/opt/conda/bin:/opt/conda/envs/${CONDA_ENV_NAME}/bin:$PATH

WORKDIR /root

SHELL ["/bin/bash", "-c"]
|
||||
# Base tools: OS packages, newer git and cmake, sshd configuration,
# Miniconda under /opt/conda, and micromamba with its shell configuration.
RUN <<EOT
#!/bin/bash
# Heredoc lines are not implicitly chained with &&, so stop at the first
# failing command (or failing pipeline stage) instead of continuing silently.
set -eo pipefail

apt-get update
apt-get install -y wget curl htop jq vim bash libaio-dev build-essential openssh-server python3 python3-pip
apt-get install -y --no-install-recommends software-properties-common build-essential autotools-dev nfs-common pdsh cmake g++ gcc curl wget vim tmux emacs less unzip htop iftop iotop ca-certificates openssh-client openssh-server rsync iputils-ping net-tools sudo llvm-dev re2c
add-apt-repository ppa:git-core/ppa -y
apt-get install -y git libnuma-dev wget

# Install the latest cmake from the Kitware apt repository. The apt-get calls
# above already run without sudo, so it is not needed here either.
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc | apt-key add -
apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
apt-get update
apt-get install -y cmake
# Drop the package index in the same layer that created it.
rm -rf /var/lib/apt/lists/*

# Configure SSH for password and public key authentication on port 22.
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/PubkeyAuthentication no/PubkeyAuthentication yes/' /etc/ssh/sshd_config
sed -i 's/^#Port 22/Port 22/' /etc/ssh/sshd_config
sed -i 's/^Port [0-9]*/Port 22/' /etc/ssh/sshd_config
mkdir -p /var/run/sshd
# Double quotes are required: inside single quotes the shell does not expand
# the variable and the password becomes the literal text ${ROOT_PASSWD}.
echo "root:${ROOT_PASSWD}" | chpasswd
mkdir -p ~/.pip

# Install Miniconda into /opt/conda.
wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
bash /tmp/miniconda.sh -b -p /opt/conda
rm /tmp/miniconda.sh
conda init bash
ln -sf /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc
echo "conda activate ${CONDA_ENV_NAME}" >> ~/.bashrc

# Write the .condarc file.
cat <<EOF > ~/.condarc
channels:
  - conda-forge
  - bioconda
  - pytorch
  - pytorch-nightly
  - nvidia
  - defaults
show_channel_urls: true
EOF

# Install micromamba. The installer is a third-party script whose exit status
# and install location are not visible from this file, so these two steps stay
# best-effort (as they were before), but now report a failure in the build log.
if ! echo 1 | bash <(curl -s https://cdn.jsdelivr.net/gh/hotwa/MicroMamba_Installer@main/install.sh); then
    echo "WARNING: micromamba installer reported an error" >&2
fi
if ! micromamba shell init -s bash -p ~/micromamba; then
    echo "WARNING: 'micromamba shell init' failed" >&2
fi
cat <<'EOF' >> ~/.bashrc
source ~/micromamba/etc/profile.d/micromamba.sh
alias mamba=micromamba
alias mba=mamba
EOF

# Write the .mambarc file.
cat <<EOF > ~/.mambarc
channels:
  - conda-forge
  - bioconda
  - pytorch
  - pytorch-nightly
  - nvidia
  - defaults
show_channel_urls: true
EOF
EOT
|
||||
|
||||
# reference: https://github.com/huggingface/transformers/blob/main/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
# PyTorch

# Build-time knobs; each can be overridden with --build-arg or compose build.args.
ARG CUDA='cu121'
ARG PYTORCH_VERSION=2.3.0
ARG TORCHVISION_VERSION=0.18.0
ARG TORCHAUDIO_VERSION=2.3.0
ARG PYTORCH_CUDA_VERSION=12.1
ARG SETUPTOOLS_VERSION=69.5.1

# Fixed values plus the build arguments above, exported to later RUN steps and
# to the running container.
ENV REF='main' \
    STAGE_DIR=/tmp \
    NV_PEER_MEM_VERSION=1.2 \
    OPENMPI_BASEVERSION=4.1 \
    MLNX_OFED_VERSION=4.9-7.1.0.0 \
    CUDA=${CUDA} \
    PYTORCH_VERSION=${PYTORCH_VERSION} \
    TORCHVISION_VERSION=${TORCHVISION_VERSION} \
    TORCHAUDIO_VERSION=${TORCHAUDIO_VERSION} \
    PYTORCH_CUDA_VERSION=${PYTORCH_CUDA_VERSION} \
    SETUPTOOLS_VERSION=${SETUPTOOLS_VERSION}

# Derived values live in their own instruction: substitution within a single
# ENV uses the values from before that instruction, so they cannot be merged
# into the ENV above.
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 \
    OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6
|
||||
# Create the project conda environment, then build and test ninja from source
# with that environment's Python.
RUN <<EOT
#!/bin/bash
source /opt/conda/etc/profile.d/conda.sh

# The environment must exist before it can be activated or used by
# `conda run -n`. Creating it last made the activation and both ninja
# bootstrap steps below fail. Abort the build if it cannot be created.
conda create -n ${CONDA_ENV_NAME} python=${PYTHON_VERSION} pyyaml ipython -c conda-forge -y || exit 1
conda activate ${CONDA_ENV_NAME} || exit 1
python3 -m pip install --no-cache-dir --upgrade pip

# Clone and build ninja from source. These steps pull unpinned upstream
# branches and are left unchecked, as before: the step's exit status is still
# that of the final `conda clean`.
git clone https://github.com/ninja-build/ninja.git
cd ninja
# Clone the GoogleTest sources needed by ninja's unit tests.
git clone https://github.com/google/googletest.git
conda run -n ${CONDA_ENV_NAME} python ./configure.py --bootstrap
# Configure and build the ninja tests, adding the pthread compile/link flags.
# CXXFLAGS="-pthread" LDFLAGS="-pthread" ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest
conda run -n ${CONDA_ENV_NAME} bash -c "CXXFLAGS='-pthread' LDFLAGS='-pthread' python ./configure.py --bootstrap --gtest-source-dir=$(pwd)/googletest"
./ninja all
# Run the ninja unit tests.
./ninja_test

conda clean -afy
EOT
|
||||
|
||||
# Install PyTorch, accelerate (from its main branch) and evo into the project
# conda environment.
RUN <<EOT
#!/bin/bash
# Every command here is required for a usable image, so fail the build on the
# first error instead of producing an image with missing packages.
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}

git clone https://github.com/evo-design/evo.git ${STAGE_DIR}/evo
cd ${STAGE_DIR}/evo

# --no-cache-dir keeps pip's download cache out of the image layer.
python3 -m pip install --no-cache-dir packaging
python3 -m pip install --no-cache-dir torch==${PYTORCH_VERSION}+${CUDA} torchvision==${TORCHVISION_VERSION}+${CUDA} torchaudio==${TORCHAUDIO_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA}
python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
python3 -m pip install --no-cache-dir .
EOT
|
||||
|
||||
# Register the project conda environment as a Jupyter kernel for the current
# user, named after the environment.
RUN <<EOT
#!/bin/bash
# Fail the build if activation or installation fails rather than continuing.
set -e
source /opt/conda/etc/profile.d/conda.sh
conda activate ${CONDA_ENV_NAME}
# --no-cache-dir keeps pip's download cache out of the image layer.
python3 -m pip install --no-cache-dir ipykernel
python3 -m ipykernel install --user --name=${CONDA_ENV_NAME} --display-name=${CONDA_ENV_NAME}
EOT
|
||||
|
||||
# Default command: run the OpenSSH server in the foreground (-D keeps sshd from
# detaching) so it stays the container's main process. The sshd_config edits
# earlier in this Dockerfile set it to listen on port 22.
CMD ["/usr/sbin/sshd", "-D"]
|
||||
48
evo/docker-compose_pytorch1.13.yml
Normal file
48
evo/docker-compose_pytorch1.13.yml
Normal file
@@ -0,0 +1,48 @@
|
||||
# Compose definition for the evo image built against CUDA 11.7 / PyTorch 1.13.
version: '3.8'

services:
  ubuntu-evo:
    build:
      context: .
      dockerfile: Dockerfile
      # Compatibility table for PyTorch, Python and pytorch_lightning versions:
      # https://blog.csdn.net/qq_41813454/article/details/137421822
      args:
        CONDA_ENV_NAME: evo
        CUDA_VERSION: 11.7.1
        PYTORCH_VERSION: 1.13.1
        TORCHVISION_VERSION: 0.14.1
        TORCHAUDIO_VERSION: 0.13.1
        # NOTE(review): the DS_BUILD_* and CUDA_ARCH_LIST args have no matching
        # ARG in the Dockerfile added by this commit, so they appear to be
        # unused — confirm before relying on them.
        DS_BUILD_OPS: 1
        DS_BUILD_SPARSE_ATTN: 0
        DS_BUILD_FUSED_ADAM: 0
        DS_BUILD_CPU_ADAM: 0
        CUDA: cu117
        CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
        SETUPTOOLS_VERSION: "69.5.1"
        ROOT_PASSWD: "root"
    volumes:
      - ./src:/bbtft
    container_name: ubuntu-evo
    pull_policy: if_not_present
    tty: true
    restart: unless-stopped
    image: hotwa/deepspeed:test
    shm_size: '32gb'
    ports:
      # The image configures sshd on port 22 and starts it without a port
      # override, so publish container port 22 (2222 has no listener).
      # Quoted because YAML 1.1 parsers can read an unquoted xx:yy as a
      # base-60 number.
      - "3227:22"
    command: ["/usr/sbin/sshd", "-D"]
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    networks:
      - network_evo
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

networks:
  network_evo:
    name: network_evo
|
||||
46
evo/docker-compose_pytorch2.3.yml
Normal file
46
evo/docker-compose_pytorch2.3.yml
Normal file
@@ -0,0 +1,46 @@
|
||||
# Compose definition for the evo image built against CUDA 12.1 / PyTorch 2.3.
version: '3.8'

services:
  ubuntu-evo:
    build:
      context: .
      dockerfile: Dockerfile
      # Compatibility table for PyTorch, Python and pytorch_lightning versions:
      # https://blog.csdn.net/qq_41813454/article/details/137421822
      args:
        CONDA_ENV_NAME: evo
        CUDA_VERSION: 12.1.0
        PYTORCH_VERSION: 2.3.0
        TORCHVISION_VERSION: 0.18.0
        TORCHAUDIO_VERSION: 2.3.0
        # NOTE(review): the DS_BUILD_* and CUDA_ARCH_LIST args have no matching
        # ARG in the Dockerfile added by this commit, so they appear to be
        # unused — confirm before relying on them.
        DS_BUILD_OPS: 1
        DS_BUILD_SPARSE_ATTN: 0
        DS_BUILD_FUSED_ADAM: 0
        DS_BUILD_CPU_ADAM: 0
        CUDA: cu121
        CUDA_ARCH_LIST: "80;86;89;90" # for RTX 4090, all : "80;86;89;90"
        SETUPTOOLS_VERSION: "69.5.1"
    volumes:
      # Keep the Hugging Face cache on the host so it survives container rebuilds.
      - ./huggingface:/root/.cache/huggingface
    container_name: ubuntu-evo
    pull_policy: if_not_present
    tty: true
    restart: unless-stopped
    image: hotwa/evo:latest
    shm_size: '32gb'
    ports:
      # Host 3227 -> container sshd on 22. Quoted because YAML 1.1 parsers can
      # read an unquoted xx:yy as a base-60 number when the container port is
      # below 60.
      - "3227:22"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    networks:
      - network_evo
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

networks:
  network_evo:
    name: network_evo
|
||||
Reference in New Issue
Block a user