add command
This commit is contained in:
@@ -1,5 +1,53 @@
|
|||||||
# Base Jupyter Notebook Stack
|
# Base Jupyter Notebook Stack
|
||||||
|
|
||||||
|
## ds_report
|
||||||
|
|
||||||
|
```shell
|
||||||
|
[2024-07-17 02:25:56,956] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
|
||||||
|
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
|
||||||
|
[WARNING] async_io: please install the libaio-dev package with apt
|
||||||
|
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
|
||||||
|
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
|
||||||
|
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.4
|
||||||
|
[WARNING] using untested triton version (3.0.0), only 1.0.0 is known to be compatible
|
||||||
|
|
||||||
|
(deepspeed) root@ubuntu-finetune:~/binbbt/train/pretrain# cat .deepspeed_env
|
||||||
|
CUDA_HOME=/usr/local/cuda/
|
||||||
|
TORCH_USE_CUDA_DSA=1
|
||||||
|
CUTLASS_PATH=/opt/cutlass
|
||||||
|
TORCH_CUDA_ARCH_LIST="80;89;90;90a"
|
||||||
|
LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
|
||||||
|
NCCL_DEBUG=WARN
|
||||||
|
NCCL_SOCKET_IFNAME=bond0
|
||||||
|
NCCL_IB_HCA=mlx5_0:1,mlx5_2:1,mlx5_4:1,mlx5_6:1
|
||||||
|
NCCL_IB_GID_INDEX=3
|
||||||
|
NCCL_NET_GDR_LEVEL=2
|
||||||
|
NCCL_P2P_DISABLE=0
|
||||||
|
NCCL_IB_DISABLE=0
|
||||||
|
```
|
||||||
|
|
||||||
|
## test command
|
||||||
|
|
||||||
|
```shell
docker run -it --rm --network=host --privileged --ipc=host --ulimit memlock=-1 --gpus all hotwa/notebook:ngc
```
|
||||||
|
```shell
docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 hotwa/notebook:ngc /bin/bash
```
|
||||||
|
|
||||||
|
```shell
|
||||||
|
nvidia-smi
|
||||||
|
nvcc -V
|
||||||
|
ninja --version
|
||||||
|
ds_report
|
||||||
|
python -c "import torch; print('torch:', torch.__version__, torch)"
|
||||||
|
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
|
||||||
|
python -c "import deepspeed; deepspeed.ops.op_builder.CPUAdamBuilder().load()"
|
||||||
|
python -c "from flash_attn import flash_attn_func, flash_attn_varlen_func"
|
||||||
|
python -c "import apex.amp; print('Apex is installed and the amp module is available.')"
|
||||||
|
python -c "from xformers import ops as xops"
|
||||||
|
ibstat
|
||||||
|
ofed_info -s # If the output shows an OFED version number, the OFED driver is installed.
|
||||||
|
mst version
|
||||||
|
mpirun --version
|
||||||
|
```
|
||||||
|
|
||||||
> **Images hosted on Docker Hub are no longer updated. Please use the [quay.io image](https://quay.io/repository/jupyter/base-notebook) instead.**
|
> **Images hosted on Docker Hub are no longer updated. Please use the [quay.io image](https://quay.io/repository/jupyter/base-notebook) instead.**
|
||||||
|
|
||||||
[jupyter/base-notebook on Docker Hub](https://hub.docker.com/r/jupyter/base-notebook/)
|
[jupyter/base-notebook on Docker Hub](https://hub.docker.com/r/jupyter/base-notebook/)
|
||||||
|
|||||||
Reference in New Issue
Block a user