update dockerfile
This commit is contained in:
@@ -166,300 +166,301 @@ cd ${STAGE_DIR}
|
||||
rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu22.04-x86_64*
|
||||
EOT
|
||||
|
||||
# ENV NV_PEER_MEM_VERSION=1.2
|
||||
# ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
|
||||
ENV NV_PEER_MEM_VERSION=1.2
|
||||
ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0
|
||||
RUN <<EOT
|
||||
#!/bin/bash
|
||||
source /opt/conda/etc/profile.d/conda.sh
|
||||
conda activate ${CONDA_ENV_NAME}
|
||||
# install nv_peer_mem
|
||||
rm -rf ${STAGE_DIR}
|
||||
mkdir -p ${STAGE_DIR}
|
||||
git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory
|
||||
cd ${STAGE_DIR}/nv_peer_memory
|
||||
./build_module.sh
|
||||
cd ${STAGE_DIR}
|
||||
tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
|
||||
cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
|
||||
apt-get update
|
||||
# Repair any half-installed packages before adding dkms. apt-get is used
# because the apt front end does not promise a stable CLI for scripts.
apt-get --fix-broken install -y
|
||||
apt-get install -y dkms
|
||||
dpkg-buildpackage -us -uc
|
||||
dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
|
||||
EOT
|
||||
|
||||
# install mpi
|
||||
ENV PATH=/usr/local/mpi/bin:${PATH}
|
||||
ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
|
||||
RUN <<EOT
|
||||
#!/bin/bash
|
||||
source /opt/conda/etc/profile.d/conda.sh
|
||||
conda activate ${CONDA_ENV_NAME}
|
||||
# OPENMPI
|
||||
rm -rf ${STAGE_DIR}
|
||||
mkdir -p ${STAGE_DIR}
|
||||
cd ${STAGE_DIR}
|
||||
wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf -
|
||||
cd openmpi-${OPENMPI_VERSION}
|
||||
./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION}
|
||||
make -j"$(nproc)" install
|
||||
ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi
|
||||
# Sanity check:
|
||||
# Abort the build when the MPI C++ wrapper is missing; this script does not
# enable errexit, so a bare test would be ignored.
test -f /usr/local/mpi/bin/mpic++ || exit 1
|
||||
cd ${STAGE_DIR}
|
||||
rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
|
||||
# Create a wrapper for OpenMPI to allow running as root by default
|
||||
mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real
|
||||
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun
|
||||
echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun
|
||||
chmod a+x /usr/local/mpi/bin/mpirun
|
||||
EOT
|
||||
|
||||
# SSH daemon port inside container cannot conflict with host OS port
|
||||
# ENV SSH_PORT=2222
|
||||
# RUN <<EOT
|
||||
# #!/bin/bash
|
||||
# source /opt/conda/etc/profile.d/conda.sh
|
||||
# conda activate ${CONDA_ENV_NAME}
|
||||
# # install nv_peer_mem
|
||||
# rm -rf ${STAGE_DIR}
|
||||
# mkdir -p ${STAGE_DIR}
|
||||
# git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory
|
||||
# cd ${STAGE_DIR}/nv_peer_memory
|
||||
# ./build_module.sh
|
||||
# cd ${STAGE_DIR}
|
||||
# tar xzf ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_VERSION}.orig.tar.gz
|
||||
# cd ${STAGE_DIR}/nvidia-peer-memory-${NV_PEER_MEM_VERSION}
|
||||
# apt-get update
|
||||
# apt-get install -y dkms
|
||||
# dpkg-buildpackage -us -uc
|
||||
# dpkg -i ${STAGE_DIR}/nvidia-peer-memory_${NV_PEER_MEM_TAG}_all.deb
|
||||
# cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
|
||||
# sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
|
||||
# EOT
|
||||
|
||||
# # install mpi
|
||||
# ENV PATH=/usr/local/mpi/bin:${PATH}
|
||||
# ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
|
||||
# 29.78 Usage: install.sh [options...]
|
||||
# 29.78
|
||||
# 29.78 By default will install deepspeed and all third party dependencies across all machines listed in
|
||||
# 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally
|
||||
# 29.78
|
||||
# 29.78 [optional]
|
||||
# 29.78 -l, --local_only Install only on local machine
|
||||
# 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo)
|
||||
# 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo)
|
||||
# 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels
|
||||
# 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror)
|
||||
# 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile)
|
||||
# 29.78 -e, --examples Checkout deepspeed example submodule (no install)
|
||||
# 29.78 -v, --verbose Verbose logging
|
||||
# 29.78 -h, --help This help text
|
||||
|
||||
RUN <<EOT
|
||||
#!/bin/bash
|
||||
source /opt/conda/etc/profile.d/conda.sh
|
||||
conda activate ${CONDA_ENV_NAME}
|
||||
useradd --create-home --uid 1000 --shell /bin/bash deepspeed
|
||||
usermod -aG sudo deepspeed
|
||||
echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
|
||||
EOT
|
||||
|
||||
# install cutlass https://github.com/NVIDIA/cutlass
|
||||
# H100: architecture is Hopper (cutlass needs the extra flag: cmake .. -DCUTLASS_NVCC_ARCHS="90a")
|
||||
# A100: architecture is Ampere
|
||||
# V100: architecture is Volta
|
||||
# T4: architecture is Turing
|
||||
# ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
|
||||
# 70: for the NVIDIA Volta architecture (e.g. Tesla V100).
|
||||
# 75: for the NVIDIA Turing architecture (e.g. Tesla T4).
|
||||
# 80: for the NVIDIA Ampere architecture (e.g. A100).
|
||||
# 90a: for the NVIDIA Hopper architecture (e.g. H100).
|
||||
# 89: GeForce RTX 4090
|
||||
ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
|
||||
ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
|
||||
RUN <<EOT
|
||||
#!/bin/bash
|
||||
source /opt/conda/etc/profile.d/conda.sh
|
||||
conda activate ${CONDA_ENV_NAME}
|
||||
git clone https://github.com/NVIDIA/cutlass /opt/cutlass
|
||||
cd /opt/cutlass
|
||||
git checkout .
|
||||
git checkout master
|
||||
mkdir build
|
||||
cd build
|
||||
# Configure CUTLASS for every architecture listed in DCUTLASS_NVCC_ARCHS
# (a semicolon-separated list, quoted so it reaches cmake as one argument),
# with the unit tests disabled and unity builds enabled.
cmake .. "-DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}" -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON
|
||||
make -j"$(nproc)" install
|
||||
cd ..
|
||||
# make test_unit -j"$(nproc)"
|
||||
# make test_unit_gemm_warp -j"$(nproc)"
|
||||
EOT
|
||||
|
||||
# Some Packages from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
|
||||
# RUN <<EOT
|
||||
# #!/bin/bash
|
||||
# source /opt/conda/etc/profile.d/conda.sh
|
||||
# conda activate ${CONDA_ENV_NAME}
|
||||
# # OPENMPI
|
||||
# rm -rf ${STAGE_DIR}
|
||||
# mkdir -p ${STAGE_DIR}
|
||||
# cd ${STAGE_DIR}
|
||||
# wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf -
|
||||
# cd openmpi-${OPENMPI_VERSION}
|
||||
# ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION}
|
||||
# make -j"$(nproc)" install
|
||||
# ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi
|
||||
# # Sanity check:
|
||||
# test -f /usr/local/mpi/bin/mpic++
|
||||
# cd ${STAGE_DIR}
|
||||
# rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
|
||||
# # Create a wrapper for OpenMPI to allow running as root by default
|
||||
# mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real
|
||||
# echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun
|
||||
# echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun
|
||||
# chmod a+x /usr/local/mpi/bin/mpirun
|
||||
# apt-get update
|
||||
# apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev
|
||||
# python -m pip install pipdeptree \
|
||||
# psutil \
|
||||
# yappi \
|
||||
# cffi \
|
||||
# ipdb \
|
||||
# pandas \
|
||||
# matplotlib \
|
||||
# py3nvml \
|
||||
# pyarrow \
|
||||
# graphviz \
|
||||
# astor \
|
||||
# boto3 \
|
||||
# tqdm \
|
||||
# sentencepiece \
|
||||
# msgpack \
|
||||
# requests \
|
||||
# pandas \
|
||||
# sphinx \
|
||||
# sphinx_rtd_theme \
|
||||
# scipy \
|
||||
# numpy \
|
||||
# scikit-learn \
|
||||
# nvidia-ml-py3 \
|
||||
# mpi4py
|
||||
# EOT
|
||||
|
||||
# # SSH daemon port inside container cannot conflict with host OS port
|
||||
# # ENV SSH_PORT=2222
|
||||
# # RUN <<EOT
|
||||
# # #!/bin/bash
|
||||
# # source /opt/conda/etc/profile.d/conda.sh
|
||||
# # conda activate ${CONDA_ENV_NAME}
|
||||
# # cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
|
||||
# # sed "0,/^Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
|
||||
# # EOT
|
||||
# install deepspeed step 1
|
||||
RUN <<EOT
|
||||
#!/bin/bash
|
||||
source /opt/conda/etc/profile.d/conda.sh
|
||||
conda activate ${CONDA_ENV_NAME}
|
||||
/opt/conda/envs/${CONDA_ENV_NAME}/bin/python -m pip install setuptools==${SETUPTOOLS_VERSION}
|
||||
# install oneCCL (oneAPI Collective Communications Library) for deepspeed
|
||||
git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
|
||||
cd ${STAGE_DIR}/oneCCL
|
||||
git checkout .
|
||||
git checkout master
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
|
||||
make -j"$(nproc)" install
|
||||
EOT
|
||||
|
||||
# # 29.78 Usage: install.sh [options...]
|
||||
# # 29.78
|
||||
# # 29.78 By default will install deepspeed and all third party dependencies across all machines listed in
|
||||
# # 29.78 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally
|
||||
# # 29.78
|
||||
# # 29.78 [optional]
|
||||
# # 29.78 -l, --local_only Install only on local machine
|
||||
# # 29.78 -s, --pip_sudo Run pip install with sudo (default: no sudo)
|
||||
# # 29.78 -r, --allow_sudo Allow script to be run by root (probably don't want this, instead use --pip_sudo)
|
||||
# # 29.78 -n, --no_clean Do not clean prior build state, by default prior build files are removed before building wheels
|
||||
# # 29.78 -m, --pip_mirror Use the specified pip mirror (default: the default pip mirror)
|
||||
# # 29.78 -H, --hostfile Path to MPI-style hostfile (default: /job/hostfile)
|
||||
# # 29.78 -e, --examples Checkout deepspeed example submodule (no install)
|
||||
# # 29.78 -v, --verbose Verbose logging
|
||||
# # 29.78 -h, --help This help text
|
||||
# install deepspeed step 2
|
||||
ARG CUDA_ARCH_LIST="80;86;89;90"
|
||||
ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
|
||||
RUN <<EOT
|
||||
#!/bin/bash
|
||||
source /opt/conda/etc/profile.d/conda.sh
|
||||
conda activate ${CONDA_ENV_NAME}
|
||||
git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
|
||||
cd ${STAGE_DIR}/DeepSpeed-Kernels
|
||||
# CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python setup.py bdist_wheel
|
||||
# pip install dist/deepspeed_kernels-*.whl
|
||||
CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python -m pip install -v .
|
||||
EOT
|
||||
|
||||
# RUN <<EOT
|
||||
# #!/bin/bash
|
||||
# source /opt/conda/etc/profile.d/conda.sh
|
||||
# conda activate ${CONDA_ENV_NAME}
|
||||
# useradd --create-home --uid 1000 --shell /bin/bash deepspeed
|
||||
# usermod -aG sudo deepspeed
|
||||
# echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
|
||||
# EOT
|
||||
ARG DEEPSPEED_VERSION="v0.14.3"
|
||||
ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
|
||||
ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
|
||||
ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
|
||||
ARG DS_BUILD_SPARSE_ATTN=0
|
||||
ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
|
||||
ARG DS_BUILD_FUSED_ADAM=1
|
||||
ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
|
||||
ARG DS_BUILD_CPU_ADAM=1
|
||||
ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
|
||||
ARG DS_BUILD_OPS=1
|
||||
ENV DS_BUILD_OPS=${DS_BUILD_OPS}
|
||||
ARG HOSTFILE_CONTENT=""
|
||||
ENV HOSTFILE_CONTENT=${HOSTFILE_CONTENT}
|
||||
ENV CUTLASS_PATH='/opt/cutlass'
|
||||
ENV CUDA_HOME='/usr/local/cuda'
|
||||
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
|
||||
ENV PATH=${CUDA_HOME}/bin:${PATH}
|
||||
# install deepspeed step 3
|
||||
RUN <<EOT
|
||||
#!/bin/bash
|
||||
source /opt/conda/etc/profile.d/conda.sh
|
||||
conda activate ${CONDA_ENV_NAME}
|
||||
git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
|
||||
cd ${STAGE_DIR}/DeepSpeed
|
||||
git checkout ${DEEPSPEED_VERSION}
|
||||
sed 's/pip install/python -m pip install/' install.sh > install_modified.sh
|
||||
chmod +x ./install_modified.sh
|
||||
# Check HOSTFILE_CONTENT and, if it is set, write it to a hostfile
|
||||
if [ -n "${HOSTFILE_CONTENT}" ]; then
|
||||
echo "${HOSTFILE_CONTENT}" > /tmp/hostfile
|
||||
INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"
|
||||
else
|
||||
INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"
|
||||
fi
|
||||
eval $INSTALL_CMD
|
||||
# compile deepspeed ops
|
||||
cat <<EOF > ~/compile_deepspeed_ops.py
|
||||
import deepspeed
|
||||
|
||||
# # install cutlass https://github.com/NVIDIA/cutlass
|
||||
# # H100: architecture is Hopper (cutlass need add : cmake .. -DCUTLASS_NVCC_ARCHS="90a" )
|
||||
# # A100: architecture is Ampere
|
||||
# # V100: architecture is Volta
|
||||
# # T4: architecture is Turing
|
||||
# # ENV CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
|
||||
# # 70:适用于 NVIDIA Volta 架构(如 Tesla V100)。
|
||||
# # 75:适用于 NVIDIA Turing 架构(如 Tesla T4)。
|
||||
# # 80:适用于 NVIDIA Ampere 架构(如 A100)。
|
||||
# # 90a:适用于 NVIDIA Hopper 架构(如 H100)。
|
||||
# # 89:GeForce RTX 4090
|
||||
# ARG DCUTLASS_NVCC_ARCHS="80;89;90a"
|
||||
# ENV DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS}
|
||||
# RUN <<EOT
|
||||
# #!/bin/bash
|
||||
# source /opt/conda/etc/profile.d/conda.sh
|
||||
# conda activate ${CONDA_ENV_NAME}
|
||||
# git clone https://github.com/NVIDIA/cutlass /opt/cutlass
|
||||
# cd /opt/cutlass
|
||||
# git checkout .
|
||||
# git checkout master
|
||||
# mkdir build
|
||||
# cd build
|
||||
# cmake .. -DCUTLASS_NVCC_ARCHS=${DCUTLASS_NVCC_ARCHS} -DCUTLASS_ENABLE_TESTS=OFF -DCUTLASS_UNITY_BUILD_ENABLED=ON # compiles for NVIDIA Hopper GPU architecture, like H100
|
||||
# make -j"$(nproc)" install
|
||||
# cd ..
|
||||
# # make test_unit -j"$(nproc)"
|
||||
# # make test_unit_gemm_warp -j"$(nproc)"
|
||||
# EOT
|
||||
|
||||
# # Some Packages from https://github.com/microsoft/DeepSpeed/blob/master/docker/Dockerfile
|
||||
# # RUN <<EOT
|
||||
# # source /opt/conda/etc/profile.d/conda.sh
|
||||
# # conda activate ${CONDA_ENV_NAME}
|
||||
# # apt-get update
|
||||
# # apt-get install -y --no-install-recommends libsndfile-dev libcupti-dev libjpeg-dev libpng-dev screen libaio-dev
|
||||
# # python -m pip install pipdeptree \
|
||||
# # psutil \
|
||||
# # yappi \
|
||||
# # cffi \
|
||||
# # ipdb \
|
||||
# # pandas \
|
||||
# # matplotlib \
|
||||
# # py3nvml \
|
||||
# # pyarrow \
|
||||
# # graphviz \
|
||||
# # astor \
|
||||
# # boto3 \
|
||||
# # tqdm \
|
||||
# # sentencepiece \
|
||||
# # msgpack \
|
||||
# # requests \
|
||||
# # pandas \
|
||||
# # sphinx \
|
||||
# # sphinx_rtd_theme \
|
||||
# # scipy \
|
||||
# # numpy \
|
||||
# # scikit-learn \
|
||||
# # nvidia-ml-py3 \
|
||||
# # mpi4py
|
||||
# # EOT
|
||||
|
||||
# # install deepspeed step 1
|
||||
# RUN <<EOT
|
||||
# #!/bin/bash
|
||||
# source /opt/conda/etc/profile.d/conda.sh
|
||||
# conda activate ${CONDA_ENV_NAME}
|
||||
# /opt/conda/envs/${CONDA_ENV_NAME}/bin/python -m pip install setuptools==${SETUPTOOLS_VERSION}
|
||||
# # install oneapi for deepspeed
|
||||
# git clone https://github.com/oneapi-src/oneCCL.git ${STAGE_DIR}/oneCCL
|
||||
# cd ${STAGE_DIR}/oneCCL
|
||||
# git checkout .
|
||||
# git checkout master
|
||||
# mkdir build
|
||||
# cd build
|
||||
# cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
|
||||
# make -j"$(nproc)" install
|
||||
# EOT
|
||||
|
||||
# # install deepspeed step 2
|
||||
# ARG CUDA_ARCH_LIST="80;86;89;90"
|
||||
# ENV CUDA_ARCH_LIST=${CUDA_ARCH_LIST}
|
||||
# RUN <<EOT
|
||||
# #!/bin/bash
|
||||
# source /opt/conda/etc/profile.d/conda.sh
|
||||
# conda activate ${CONDA_ENV_NAME}
|
||||
# git clone https://github.com/microsoft/DeepSpeed-Kernels.git ${STAGE_DIR}/DeepSpeed-Kernels
|
||||
# cd ${STAGE_DIR}/DeepSpeed-Kernels
|
||||
# # CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python setup.py bdist_wheel
|
||||
# # pip install dist/deepspeed_kernels-*.whl
|
||||
# CUDA_ARCH_LIST=${CUDA_ARCH_LIST} python -m pip install -v .
|
||||
# EOT
|
||||
|
||||
# ARG DEEPSPEED_VERSION="v0.14.3"
|
||||
# ENV DEEPSPEED_VERSION=${DEEPSPEED_VERSION}
|
||||
# ARG DEEPSPEED_INSTALL_FLAGS="--allow_sudo --pip_sudo --verbose"
|
||||
# ENV DEEPSPEED_INSTALL_FLAGS=${DEEPSPEED_INSTALL_FLAGS}
|
||||
# ARG DS_BUILD_SPARSE_ATTN=0
|
||||
# ENV DS_BUILD_SPARSE_ATTN=${DS_BUILD_SPARSE_ATTN}
|
||||
# ARG DS_BUILD_FUSED_ADAM=1
|
||||
# ENV DS_BUILD_FUSED_ADAM=${DS_BUILD_FUSED_ADAM}
|
||||
# ARG DS_BUILD_CPU_ADAM=1
|
||||
# ENV DS_BUILD_CPU_ADAM=${DS_BUILD_CPU_ADAM}
|
||||
# ARG DS_BUILD_OPS=1
|
||||
# ENV DS_BUILD_OPS=${DS_BUILD_OPS}
|
||||
# ARG HOSTFILE_CONTENT=""
|
||||
# ENV HOSTFILE_CONTENT=${HOSTFILE_CONTENT}
|
||||
# ENV CUTLASS_PATH='/opt/cutlass'
|
||||
# ENV CUDA_HOME='/usr/local/cuda'
|
||||
# ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
|
||||
# ENV PATH=${CUDA_HOME}/bin:${PATH}
|
||||
# # install deepspeed step 3
|
||||
# RUN <<EOT
|
||||
# #!/bin/bash
|
||||
# source /opt/conda/etc/profile.d/conda.sh
|
||||
# conda activate ${CONDA_ENV_NAME}
|
||||
# git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
|
||||
# cd ${STAGE_DIR}/DeepSpeed
|
||||
# git checkout ${DEEPSPEED_VERSION}
|
||||
# sed 's/pip install/python -m pip install/' install.sh > install_modified.sh
|
||||
# chmod +x ./install_modified.sh
|
||||
# # 检查 HOSTFILE_CONTENT 并写入文件
|
||||
# if [ -n "${HOSTFILE_CONTENT}" ]; then
|
||||
# echo "${HOSTFILE_CONTENT}" > /tmp/hostfile
|
||||
# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS} --hostfile /tmp/hostfile"
|
||||
# else
|
||||
# INSTALL_CMD="./install_modified.sh ${DEEPSPEED_INSTALL_FLAGS}"
|
||||
# fi
|
||||
# eval $INSTALL_CMD
|
||||
# # compile deepspeed ops
|
||||
# cat <<EOF > ~/compile_deepspeed_ops.py
|
||||
# import deepspeed
|
||||
|
||||
# def compile_ops():
|
||||
# builders = [
|
||||
# deepspeed.ops.op_builder.AsyncIOBuilder,
|
||||
# deepspeed.ops.op_builder.FusedAdamBuilder,
|
||||
# deepspeed.ops.op_builder.CPUAdamBuilder,
|
||||
# deepspeed.ops.op_builder.CPUAdagradBuilder,
|
||||
# deepspeed.ops.op_builder.CPULionBuilder,
|
||||
# deepspeed.ops.op_builder.EvoformerAttnBuilder,
|
||||
# deepspeed.ops.op_builder.FPQuantizerBuilder,
|
||||
# deepspeed.ops.op_builder.FusedLambBuilder,
|
||||
# deepspeed.ops.op_builder.FusedLionBuilder,
|
||||
# deepspeed.ops.op_builder.QuantizerBuilder,
|
||||
# deepspeed.ops.op_builder.RaggedOpsBuilder,
|
||||
# deepspeed.ops.op_builder.RandomLTDBuilder,
|
||||
# deepspeed.ops.op_builder.SparseAttnBuilder,
|
||||
# deepspeed.ops.op_builder.SpatialInferenceBuilder,
|
||||
# deepspeed.ops.op_builder.TransformerBuilder,
|
||||
# deepspeed.ops.op_builder.StochasticTransformerBuilder,
|
||||
# ]
|
||||
def compile_ops():
|
||||
builders = [
|
||||
deepspeed.ops.op_builder.AsyncIOBuilder,
|
||||
deepspeed.ops.op_builder.FusedAdamBuilder,
|
||||
deepspeed.ops.op_builder.CPUAdamBuilder,
|
||||
deepspeed.ops.op_builder.CPUAdagradBuilder,
|
||||
deepspeed.ops.op_builder.CPULionBuilder,
|
||||
deepspeed.ops.op_builder.EvoformerAttnBuilder,
|
||||
deepspeed.ops.op_builder.FPQuantizerBuilder,
|
||||
deepspeed.ops.op_builder.FusedLambBuilder,
|
||||
deepspeed.ops.op_builder.FusedLionBuilder,
|
||||
deepspeed.ops.op_builder.QuantizerBuilder,
|
||||
deepspeed.ops.op_builder.RaggedOpsBuilder,
|
||||
deepspeed.ops.op_builder.RandomLTDBuilder,
|
||||
deepspeed.ops.op_builder.SparseAttnBuilder,
|
||||
deepspeed.ops.op_builder.SpatialInferenceBuilder,
|
||||
deepspeed.ops.op_builder.TransformerBuilder,
|
||||
deepspeed.ops.op_builder.StochasticTransformerBuilder,
|
||||
]
|
||||
|
||||
# for builder in builders:
|
||||
# print(f"Compiling {builder.__name__}")
|
||||
# builder().load()
|
||||
for builder in builders:
|
||||
print(f"Compiling {builder.__name__}")
|
||||
builder().load()
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# compile_ops()
|
||||
# EOF
|
||||
# python compile_deepspeed_ops.py
|
||||
# ds_report
|
||||
# # clean up
|
||||
# # rm -f deepspeed/git_version_info_installed.py
|
||||
# # rm -rf dist build deepspeed.egg-info
|
||||
# # python setup.py bdist_wheel
|
||||
# # DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl
|
||||
# # DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt
|
||||
# # pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0
|
||||
# EOT
|
||||
if __name__ == "__main__":
|
||||
compile_ops()
|
||||
EOF
|
||||
# The helper script was written to the home directory, but the working
# directory here is the DeepSpeed checkout, so it is invoked by its full path.
python ~/compile_deepspeed_ops.py
|
||||
ds_report
|
||||
# clean up
|
||||
# rm -f deepspeed/git_version_info_installed.py
|
||||
# rm -rf dist build deepspeed.egg-info
|
||||
# python setup.py bdist_wheel
|
||||
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v dist/deepspeed*.whl
|
||||
# DS_BUILD_OPS=${DS_BUILD_OPS} pip install -v -r requirements/requirements.txt
|
||||
# pip install numpy==1.22.4 # ImportError: cannot import name 'BUFSIZE' from 'numpy' (/opt/conda/envs/deepspeed/lib/python3.10/site-packages/numpy/__init__.py) wait for fix in numpy=2.0.0
|
||||
EOT
|
||||
|
||||
# # install transformers and flash-attn
|
||||
# RUN <<EOT
|
||||
# #!/bin/bash
|
||||
# source /opt/conda/etc/profile.d/conda.sh
|
||||
# conda activate ${CONDA_ENV_NAME}
|
||||
# # install transformers
|
||||
# git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers
|
||||
# cd ${STAGE_DIR}/transformers
|
||||
# python3 ./setup.py develop
|
||||
# python3 -m pip install -U --no-cache-dir "pydantic<2"
|
||||
# # install flash-attn
|
||||
# # pip install packaging -i https://pypi.org/simple/ --trusted-host pypi.org
|
||||
# pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
|
||||
# EOT
|
||||
# install transformers and flash-attn
|
||||
RUN <<EOT
|
||||
#!/bin/bash
|
||||
source /opt/conda/etc/profile.d/conda.sh
|
||||
conda activate ${CONDA_ENV_NAME}
|
||||
# install transformers
|
||||
git clone https://github.com/huggingface/transformers ${STAGE_DIR}/transformers
|
||||
cd ${STAGE_DIR}/transformers
|
||||
python3 ./setup.py develop
|
||||
python3 -m pip install -U --no-cache-dir "pydantic<2"
|
||||
# install flash-attn
|
||||
# pip install packaging -i https://pypi.org/simple/ --trusted-host pypi.org
|
||||
pip install flash-attn --no-build-isolation -i https://pypi.org/simple/ --trusted-host pypi.org
|
||||
EOT
|
||||
|
||||
# # other packages
|
||||
# ENV TORCH_CUDA_ARCH_LIST="80;86;89;90"
|
||||
# RUN <<EOT
|
||||
# #!/bin/bash
|
||||
# source /opt/conda/etc/profile.d/conda.sh
|
||||
# conda activate ${CONDA_ENV_NAME}
|
||||
# pip3 install optimum
|
||||
# pip3 install peft tiktoken \
|
||||
# tqdm matplotlib seaborn numpy pandas scikit-learn diffusers \
|
||||
# huggingface_hub spacy blobfile pycocotools \
|
||||
# open_clip_torch \
|
||||
# zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
|
||||
# EOT
|
||||
# other packages
|
||||
ENV TORCH_CUDA_ARCH_LIST="80;86;89;90"
|
||||
RUN <<EOT
|
||||
#!/bin/bash
|
||||
source /opt/conda/etc/profile.d/conda.sh
|
||||
conda activate ${CONDA_ENV_NAME}
|
||||
pip3 install optimum
|
||||
pip3 install peft tiktoken \
|
||||
tqdm matplotlib seaborn numpy pandas scikit-learn diffusers \
|
||||
huggingface_hub spacy blobfile pycocotools \
|
||||
open_clip_torch \
|
||||
zstandard -i https://pypi.org/simple/ --trusted-host pypi.org
|
||||
EOT
|
||||
|
||||
# ARG DEEPSPEED_TRAIN='/data/train_data'
|
||||
# ENV DEEPSPEED_TRAIN=DEEPSPEED_TRAIN
|
||||
# ARG DEEPSPEED_VALIDATION='/data/validation_data'
|
||||
# ENV DEEPSPEED_VALIDATION=DEEPSPEED_VALIDATION
|
||||
# ARG NCCL_SOCKET_IFNAME='eth0'
|
||||
ARG DEEPSPEED_TRAIN='/data/train_data'
|
||||
# Persist the build argument into the image environment. The ${...} form is
# required; a bare name would be stored as a literal string.
ENV DEEPSPEED_TRAIN=${DEEPSPEED_TRAIN}
|
||||
ARG DEEPSPEED_VALIDATION='/data/validation_data'
|
||||
# Persist the build argument into the image environment. The ${...} form is
# required; a bare name would be stored as a literal string.
ENV DEEPSPEED_VALIDATION=${DEEPSPEED_VALIDATION}
|
||||
ARG NCCL_SOCKET_IFNAME='eth0'
|
||||
|
||||
# # RUN echo 'export CUDA_HOME=/usr/local/cuda' >> ~/.bashrc && \
|
||||
# # echo 'export PATH=${CUDA_HOME}/bin:${PATH}' >> ~/.bashrc && \
|
||||
# # echo 'export CUTLASS_PATH=/opt/cutlass' >> ~/.bashrc && \
|
||||
# # echo 'export PATH=/opt/conda/bin:$PATH' >> ~/.bashrc && \
|
||||
# # echo "source activate ${CONDA_ENV_NAME}" > ~/.bashrc
|
||||
# RUN echo 'export CUDA_HOME=/usr/local/cuda' >> ~/.bashrc && \
|
||||
# echo 'export PATH=${CUDA_HOME}/bin:${PATH}' >> ~/.bashrc && \
|
||||
# echo 'export CUTLASS_PATH=/opt/cutlass' >> ~/.bashrc && \
|
||||
# echo 'export PATH=/opt/conda/bin:$PATH' >> ~/.bashrc && \
|
||||
# echo "source activate ${CONDA_ENV_NAME}" > ~/.bashrc
|
||||
|
||||
CMD ["/usr/sbin/sshd", "-D"]
|
||||
# CMD ["/bin/bash", "-c", "/usr/sbin/sshd -D & while true; do sleep 1000; done"]
|
||||
|
||||
Reference in New Issue
Block a user