From 5768102ae6de19680f70d0e907ea0271589d8144 Mon Sep 17 00:00:00 2001 From: "weizhou.lan@daocloud.io" Date: Fri, 2 Aug 2024 00:34:34 -0600 Subject: [PATCH] d Signed-off-by: weizhou.lan@daocloud.io --- rdma-tools/Readme.md | 2 +- rdma-tools/image/install-tools.sh | 3 ++- rdma-tools/image/tools/testNcclTest | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/rdma-tools/Readme.md b/rdma-tools/Readme.md index 96e4777..546a5ec 100644 --- a/rdma-tools/Readme.md +++ b/rdma-tools/Readme.md @@ -23,7 +23,7 @@ git push --tags ```shell helm repo add spiderchart https://spidernet-io.github.io/charts -helm repo update +helm repo update spiderchart helm search repo rdma-tools # run daemonset on worker1 and worker2 diff --git a/rdma-tools/image/install-tools.sh b/rdma-tools/image/install-tools.sh index ac8028b..37e869b 100644 --- a/rdma-tools/image/install-tools.sh +++ b/rdma-tools/image/install-tools.sh @@ -18,7 +18,7 @@ InstallNccl(){ wget --no-check-certificate ${ENV_CUDA_DEB_SOURCE} dpkg -i *.deb apt-get update - apt install -y libnccl2 libnccl-dev + apt install --allow-change-held-packages -y libnccl2 libnccl-dev rm * -rf || true echo "ulimit -l 2000000" >> /etc/bash.bashrc @@ -110,6 +110,7 @@ packages=( jq inxi hwloc + libgomp1 ) export DEBIAN_FRONTEND=noninteractive diff --git a/rdma-tools/image/tools/testNcclTest b/rdma-tools/image/tools/testNcclTest index 2401a75..ea88016 100644 --- a/rdma-tools/image/tools/testNcclTest +++ b/rdma-tools/image/tools/testNcclTest @@ -23,6 +23,8 @@ HOST_LIST=$( echo -n "${POD_IP_LIST}" | tr '\n' ',' ) CMD_NAME=${CMD_NAME:-"all_reduce_perf"} CMD_OPTIONS=${CMD_OPTIONS:-"-b 512M -e 8G -f 2 -n 1 "} +# todo: check ethernet or infiniband. +# todo: for infiniband, test sharp: -x NCCL_COLLNET_ENABLE=1 -x NCCL_ALGO=CollNet echo "" echo "***************************************************************************************************************"