Skip to content

Commit

Permalink
a
Browse files Browse the repository at this point in the history
  • Loading branch information
weizhoublue committed Feb 28, 2025
1 parent 90188ea commit bd6afbb
Show file tree
Hide file tree
Showing 10 changed files with 50 additions and 29 deletions.
4 changes: 3 additions & 1 deletion mx-rdma-tools/image/tools/testRdmaLatency
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ LOCAL_RDMA_INFO=$(GetLocalRdmaDeviceIP)
echo "error, failed to get local rdma information"
exit 1
}
LOCAL_ENDPOINT_IP=$(ip -4 addr show eth0 | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}')
# Resolve the interface that carries the default route instead of assuming eth0.
LOCAL_INT=$( ip r | grep default | head -1 | grep -oE "dev [^[:space:]]+" | awk '{print $2}' )
# With no default route LOCAL_INT is empty and "ip addr show" would list every
# interface (loopback included); fall back to eth0 in that case.
LOCAL_INT=${LOCAL_INT:-eth0}
# IPv4 address(es) configured on that interface.
LOCAL_ENDPOINT_IP=$( ip -4 addr show "${LOCAL_INT}" | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}' )


echo "collecting remote information ..."
EXPECT_NUM=0
Expand Down
5 changes: 4 additions & 1 deletion rdma-tools/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,11 @@ resources:
spidernet.io/gpu2sriov: 1
# nvidia.com/gpu: 1
#hostnetwork: false
#ssh_port: 2022
#securityContext:
# # required by gdrcopy test
# # required by the gdrcopy test, or when hostnetwork is enabled
# privileged: true
# capabilities:
# add: [ "IPC_LOCK" ]
Expand Down
2 changes: 2 additions & 0 deletions rdma-tools/chart/templates/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ spec:
fieldPath: spec.nodeName
- name: ENV_SERVICE_NAME
value: {{ include "project.name" . | trunc 63 | trimSuffix "-" | quote }}
- name: ENV_SSH_PORT
value: {{ .Values.ssh_port | quote }}
- name: ENV_POD_NAMESPACE
valueFrom:
fieldRef:
Expand Down
2 changes: 2 additions & 0 deletions rdma-tools/chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,5 @@ extraVolumeMounts: []

service:
type: ClusterIP

ssh_port: 2022
3 changes: 3 additions & 0 deletions rdma-tools/image/entry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@

source /usr/sbin/rdmatools


# Port written into sshd_config; defaults to 2022 when ENV_SSH_PORT is unset or empty.
ENV_SSH_PORT=${ENV_SSH_PORT:-"2022"}
# Replace every line of /etc/ssh/sshd_config that contains "Port 22" with
# "Port <ENV_SSH_PORT>", then start the ssh service.
sed -i -E "s/.*Port 22.*/Port ${ENV_SSH_PORT}/" /etc/ssh/sshd_config
service ssh start
ulimit -l 2000000
ulimit -a
Expand Down
6 changes: 4 additions & 2 deletions rdma-tools/image/tools/testConnect
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ set -o errexit
set -o nounset

CURRENT_FILENAME=$( basename $0 )
ENV_SSH_PORT=${ENV_SSH_PORT:-"2022"}

source /usr/sbin/rdmatools

Expand All @@ -25,15 +26,16 @@ mkdir -p ${RESULT_DIR}

echo "collecting local information ..."
LOCAL_IP_INFO=$( GetUpAllIP )
LOCAL_ENDPOINT_IP=$( ip -4 addr show eth0 | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}' )
# Resolve the interface that carries the default route instead of assuming eth0.
LOCAL_INT=$( ip r | grep default | head -1 | grep -oE "dev [^[:space:]]+" | awk '{print $2}' )
# With no default route LOCAL_INT is empty and "ip addr show" would list every
# interface (loopback included); fall back to eth0 in that case.
LOCAL_INT=${LOCAL_INT:-eth0}
# IPv4 address(es) configured on that interface.
LOCAL_ENDPOINT_IP=$( ip -4 addr show "${LOCAL_INT}" | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}' )

echo "collecting remote information ..."
EXPECT_NUM=0
for ADDR in ${POD_IP_LIST} ; do
((EXPECT_NUM+=1))
(
echo "collecting information from ${ADDR}"
REMOTE_IP_INFO=$( ssh ${ADDR} " source /usr/sbin/rdmatools && GetUpAllIP " )
REMOTE_IP_INFO=$( ssh -p ${ENV_SSH_PORT} ${ADDR} " source /usr/sbin/rdmatools && GetUpAllIP " )
if [ -n "${REMOTE_IP_INFO}" ] ; then
echo "${REMOTE_IP_INFO}" > ${REMOTE_INFOR_DIR}/${ADDR}
else
Expand Down
12 changes: 7 additions & 5 deletions rdma-tools/image/tools/testIperf
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ if ! which iperf3 &>/dev/null; then
fi

CURRENT_FILENAME=$(basename $0)
ENV_SSH_PORT=${ENV_SSH_PORT:-"2022"}

source /usr/sbin/rdmatools

Expand All @@ -35,15 +36,16 @@ mkdir -p ${REMOTE_OUTPUT_DIR}

echo "collecting local information ..."
LOCAL_IP_INFO=$(GetUpAllIP)
LOCAL_ENDPOINT_IP=$(ip -4 addr show eth0 | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}')
# Resolve the interface that carries the default route instead of assuming eth0.
LOCAL_INT=$( ip r | grep default | head -1 | grep -oE "dev [^[:space:]]+" | awk '{print $2}' )
# With no default route LOCAL_INT is empty and "ip addr show" would list every
# interface (loopback included); fall back to eth0 in that case.
LOCAL_INT=${LOCAL_INT:-eth0}
# IPv4 address(es) configured on that interface.
LOCAL_ENDPOINT_IP=$( ip -4 addr show "${LOCAL_INT}" | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}' )

echo "collecting remote information ..."
EXPECT_NUM=0
for ADDR in ${POD_IP_LIST}; do
((EXPECT_NUM += 1))
(
echo "collecting information from ${ADDR}"
REMOTE_IP_INFO=$(ssh ${ADDR} " source /usr/sbin/rdmatools && GetUpAllIP ")
REMOTE_IP_INFO=$(ssh -p ${ENV_SSH_PORT} ${ADDR} " source /usr/sbin/rdmatools && GetUpAllIP ")
if [ -n "${REMOTE_IP_INFO}" ]; then
echo "${REMOTE_IP_INFO}" >${REMOTE_IP_INFOR_DIR}/${ADDR}
else
Expand Down Expand Up @@ -74,10 +76,10 @@ TestPerHost() {
printf "_____________________________________________________________________________________________________________________________________________________________________\n" >>${RESULT_FILE}

{
ssh ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
OPTIONS="${CMD_OPTIONS} -s "
echo "server on remote ${REMOTE_HOST}: ${CMD_CLI} ${OPTIONS}"
ssh ${REMOTE_HOST} " ${CMD_CLI} ${OPTIONS} " &>/dev/null
ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " ${CMD_CLI} ${OPTIONS} " &>/dev/null
} &
sleep 2

Expand Down Expand Up @@ -129,7 +131,7 @@ TestPerHost() {

printf "_____________________________________________________________________________________________________________________________________________________________________\n" >>${RESULT_FILE}

ssh ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true

echo "" >>${RESULT_FILE}
echo ""
Expand Down
12 changes: 7 additions & 5 deletions rdma-tools/image/tools/testRdmaBw
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ if ! which ib_write_bw &>/dev/null; then
fi

CURRENT_FILENAME=$(basename $0)
ENV_SSH_PORT=${ENV_SSH_PORT:-"2022"}

source /usr/sbin/rdmatools

Expand All @@ -39,15 +40,16 @@ LOCAL_RDMA_INFO=$(GetLocalRdmaDeviceIP)
echo "error, failed to get local rdma information"
exit 1
}
LOCAL_ENDPOINT_IP=$(ip -4 addr show eth0 | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}')
# Resolve the interface that carries the default route instead of assuming eth0.
LOCAL_INT=$( ip r | grep default | head -1 | grep -oE "dev [^[:space:]]+" | awk '{print $2}' )
# With no default route LOCAL_INT is empty and "ip addr show" would list every
# interface (loopback included); fall back to eth0 in that case.
LOCAL_INT=${LOCAL_INT:-eth0}
# IPv4 address(es) configured on that interface.
LOCAL_ENDPOINT_IP=$( ip -4 addr show "${LOCAL_INT}" | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}' )

echo "collecting remote information ..."
EXPECT_NUM=0
for ADDR in ${POD_IP_LIST}; do
((EXPECT_NUM += 1))
(
echo "collecting information from ${ADDR}"
REMOTE_RDMA_INFO=$(ssh ${ADDR} " source /usr/sbin/rdmatools && GetLocalRdmaDeviceIP ")
REMOTE_RDMA_INFO=$(ssh -p ${ENV_SSH_PORT} ${ADDR} " source /usr/sbin/rdmatools && GetLocalRdmaDeviceIP ")
if [ -n "${REMOTE_RDMA_INFO}" ]; then
echo "${REMOTE_RDMA_INFO}" >${REMOTE_INFOR_DIR}/${ADDR}
else
Expand Down Expand Up @@ -115,10 +117,10 @@ TestPerHost() {
else

{
ssh ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
OPTIONS="-d ${REMOTE_DEVICE} -x ${REMOTE_INDEX} ${CMD_OPTIONS} "
echo "server on remote ${REMOTE_HOST}: ${CMD_CLI} ${OPTIONS}"
ssh ${REMOTE_HOST} " ${CMD_CLI} ${OPTIONS} " &>/dev/null
ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " ${CMD_CLI} ${OPTIONS} " &>/dev/null
} &
sleep 2

Expand All @@ -141,7 +143,7 @@ TestPerHost() {

printf "_____________________________________________________________________________________________________________________________________________________\n" >>${RESULT_FILE}

ssh ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true

echo "" >>${RESULT_FILE}
echo ""
Expand Down
12 changes: 7 additions & 5 deletions rdma-tools/image/tools/testRdmaLatency
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ if ! which ib_write_lat &>/dev/null; then
fi

CURRENT_FILENAME=$(basename $0)
ENV_SSH_PORT=${ENV_SSH_PORT:-"2022"}

source /usr/sbin/rdmatools

Expand All @@ -39,15 +40,16 @@ LOCAL_RDMA_INFO=$(GetLocalRdmaDeviceIP)
echo "error, failed to get local rdma information"
exit 1
}
LOCAL_ENDPOINT_IP=$(ip -4 addr show eth0 | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}')
# Resolve the interface that carries the default route instead of assuming eth0.
LOCAL_INT=$( ip r | grep default | head -1 | grep -oE "dev [^[:space:]]+" | awk '{print $2}' )
# With no default route LOCAL_INT is empty and "ip addr show" would list every
# interface (loopback included); fall back to eth0 in that case.
LOCAL_INT=${LOCAL_INT:-eth0}
# IPv4 address(es) configured on that interface.
LOCAL_ENDPOINT_IP=$( ip -4 addr show "${LOCAL_INT}" | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}' )

echo "collecting remote information ..."
EXPECT_NUM=0
for ADDR in ${POD_IP_LIST}; do
((EXPECT_NUM += 1))
(
echo "collecting information from ${ADDR}"
REMOTE_RDMA_INFO=$(ssh ${ADDR} " source /usr/sbin/rdmatools && GetLocalRdmaDeviceIP ")
REMOTE_RDMA_INFO=$(ssh -p ${ENV_SSH_PORT} ${ADDR} " source /usr/sbin/rdmatools && GetLocalRdmaDeviceIP ")
if [ -n "${REMOTE_RDMA_INFO}" ]; then
echo "${REMOTE_RDMA_INFO}" >${REMOTE_INFOR_DIR}/${ADDR}
else
Expand Down Expand Up @@ -114,10 +116,10 @@ TestPerHost() {
else

{
ssh ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
OPTIONS="-d ${REMOTE_DEVICE} -x ${REMOTE_INDEX} ${CMD_OPTIONS} "
echo "server on remote ${REMOTE_HOST}: ${CMD_CLI} ${OPTIONS}"
ssh ${REMOTE_HOST} " ${CMD_CLI} ${OPTIONS} " &>/dev/null
ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " ${CMD_CLI} ${OPTIONS} " &>/dev/null
} &
sleep 2

Expand All @@ -140,7 +142,7 @@ TestPerHost() {

printf "___________________________________________________________________________________________________________________________________\n" >>${RESULT_FILE}

ssh ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true

echo "" >>${RESULT_FILE}
echo ""
Expand Down
21 changes: 11 additions & 10 deletions rdma-tools/image/tools/testRemoteGpu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ if ! which ib_write_bw &>/dev/null; then
fi

REMOTE_IP=${1:-""}
ENV_SSH_PORT=${ENV_SSH_PORT:-"2022"}

source /usr/sbin/rdmatools
[ -n "${REMOTE_IP}" ] || REMOTE_IP=$(getOneRemoteEndpoint)
Expand Down Expand Up @@ -75,43 +76,43 @@ TestPerftestGpu() {
}
LOCAL_INTERFACE=$(ibdev2netdev | grep -E "^${LOCAL_RDMA_DEV} " | awk '{print $5}')

GPU_TOTAL=$(ssh ${REMOTE_IP} " source /usr/sbin/rdmatools && GetGPUAmount ") || {
GPU_TOTAL=$(ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " source /usr/sbin/rdmatools && GetGPUAmount ") || {
echo "error, there is no GPU on remote ${REMOTE_IP}"
exit 1
}
RDMA_DEV_LIST=$(ssh ${REMOTE_IP} " source /usr/sbin/rdmatools && GetRdmaDevList ") || {
RDMA_DEV_LIST=$(ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " source /usr/sbin/rdmatools && GetRdmaDevList ") || {
# Reached when the remote GetRdmaDevList call fails: report the missing device.
echo "error, there is no RDMA device on remote ${REMOTE_IP}"
exit 1
}
echo ""
echo "---------remote ${REMOTE_IP} topo:"
ssh ${REMOTE_IP} " nvidia-smi topo -m || true "
ssh ${REMOTE_IP} " source /usr/sbin/rdmatools && GetLocalRdmaDeviceIP "
ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " nvidia-smi topo -m || true "
ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " source /usr/sbin/rdmatools && GetLocalRdmaDeviceIP "
echo ""
echo "GPU range: 0 ... $((GPU_TOTAL - 1))"
while :; do
echo ""
echo -n "Please input the expected GPU number of remote node ${REMOTE_IP}: "
read REMOTE_GPU_NUM
if [ -n "${REMOTE_GPU_NUM}" ]; then
ssh ${REMOTE_IP} " source /usr/sbin/rdmatools && CheckGpuIdValidity ${REMOTE_GPU_NUM} " || continue
ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " source /usr/sbin/rdmatools && CheckGpuIdValidity ${REMOTE_GPU_NUM} " || continue
break
fi
done
echo ""
ssh ${REMOTE_IP} " show_gids "
ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " show_gids "
echo ""
echo "Rdma device: ${RDMA_DEV_LIST}"
while :; do
echo ""
echo -n "Please input the expected RDMA device of remote node ${REMOTE_IP}: "
read REMOTE_RDMA_DEV
if [ -n "${REMOTE_RDMA_DEV}" ]; then
ssh ${REMOTE_IP} " source /usr/sbin/rdmatools && CheckRdmaDevValidity ${REMOTE_RDMA_DEV} " || continue
ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " source /usr/sbin/rdmatools && CheckRdmaDevValidity ${REMOTE_RDMA_DEV} " || continue
break
fi
done
REMOTE_RDMA_IP=$(ssh ${REMOTE_IP} " source /usr/sbin/rdmatools && GetBestRdmaDeviceIP ${REMOTE_RDMA_DEV} ")
REMOTE_RDMA_IP=$(ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " source /usr/sbin/rdmatools && GetBestRdmaDeviceIP ${REMOTE_RDMA_DEV} ")
[ -n "${REMOTE_RDMA_IP}" ] || {
# REMOTE_RDMA_IP is empty on this path, so name the device that was queried.
echo "error, failed to find any IP on device ${REMOTE_RDMA_DEV}"
exit 1
Expand All @@ -137,8 +138,8 @@ TestPerftestGpu() {
sleep 3

# Log the exact client command that is about to be run on the remote node.
echo "client on remote ${REMOTE_IP}: ib_write_bw -d ${REMOTE_RDMA_DEV} --use_cuda=${REMOTE_GPU_NUM} ${CMD_OPTIONS} ${LOCAL_RDMA_IP} "
#ssh ${REMOTE_IP} " ulimit -l 2000000 && ib_write_bw -d ${REMOTE_RDMA_DEV} --use_cuda=${REMOTE_GPU_NUM} ${CMD_OPTIONS} ${LOCAL_RDMA_IP} "
ssh ${REMOTE_IP} " ib_write_bw -d ${REMOTE_RDMA_DEV} --use_cuda=${REMOTE_GPU_NUM} ${CMD_OPTIONS} ${LOCAL_RDMA_IP} "
#ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " ulimit -l 2000000 && ib_write_bw -d ${REMOTE_RDMA_DEV} --use_cuda=${REMOTE_GPU_NUM} ${CMD_OPTIONS} ${LOCAL_RDMA_IP} "
ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " ib_write_bw -d ${REMOTE_RDMA_DEV} --use_cuda=${REMOTE_GPU_NUM} ${CMD_OPTIONS} ${LOCAL_RDMA_IP} "

}
TestPerftestGpu

0 comments on commit bd6afbb

Please sign in to comment.