Make the sshd port used by mx-rdma-tools configurable.

* chart: add a `ssh_port` value (2022) and pass it to the container as
  ENV_SSH_PORT. The value is piped through `quote` because the rendered
  env value would otherwise be a bare YAML integer, while the neighbouring
  ENV_SERVICE_NAME entry already renders a quoted string.
* entry.sh: rewrite the `Port 22` line of /etc/ssh/sshd_config to
  ENV_SSH_PORT (falling back to 2022) before starting ssh.
* tools: every `ssh` invocation passes `-p ${ENV_SSH_PORT}`.
* testConnect / testIperf / testRdmaBw: take the local endpoint IP from the
  device named in the first default route instead of the hard-coded eth0.

NOTE(review): leading whitespace and blank context lines below were
reconstructed to agree with the hunk line counts; confirm against the
target files before applying.

diff --git a/mx-rdma-tools/chart/templates/daemonset.yaml b/mx-rdma-tools/chart/templates/daemonset.yaml
index ad7605f..ca46d95 100644
--- a/mx-rdma-tools/chart/templates/daemonset.yaml
+++ b/mx-rdma-tools/chart/templates/daemonset.yaml
@@ -93,6 +93,8 @@ spec:
                 fieldPath: spec.nodeName
           - name: ENV_SERVICE_NAME
             value: {{ include "project.name" . | trunc 63 | trimSuffix "-" | quote }}
+          - name: ENV_SSH_PORT
+            value: {{ .Values.ssh_port | quote }}
           - name: ENV_POD_NAMESPACE
             valueFrom:
               fieldRef:
diff --git a/mx-rdma-tools/chart/values.yaml b/mx-rdma-tools/chart/values.yaml
index e3c348f..87bfee7 100644
--- a/mx-rdma-tools/chart/values.yaml
+++ b/mx-rdma-tools/chart/values.yaml
@@ -55,3 +55,6 @@ extraVolumeMounts: []
 
 service:
   type: ClusterIP
+
+# ssh port
+ssh_port: 2022
diff --git a/mx-rdma-tools/image/entry.sh b/mx-rdma-tools/image/entry.sh
index 1557683..c6b19f5 100644
--- a/mx-rdma-tools/image/entry.sh
+++ b/mx-rdma-tools/image/entry.sh
@@ -5,6 +5,9 @@
 
 source /usr/sbin/rdmatools
 
+ENV_SSH_PORT=${ENV_SSH_PORT:-"2022"}
+sed -i -E "s/.*Port 22.*/Port ${ENV_SSH_PORT}/" /etc/ssh/sshd_config
+
 service ssh start
 ulimit -l 2000000
 ulimit -a
diff --git a/mx-rdma-tools/image/tools/testConnect b/mx-rdma-tools/image/tools/testConnect
index 2c839fa..3eb9695 100755
--- a/mx-rdma-tools/image/tools/testConnect
+++ b/mx-rdma-tools/image/tools/testConnect
@@ -9,6 +9,7 @@ set -o errexit
 set -o nounset
 
 CURRENT_FILENAME=$( basename $0 )
+ENV_SSH_PORT=${ENV_SSH_PORT:-"2022"}
 
 source /usr/sbin/rdmatools
 
@@ -25,7 +26,8 @@ mkdir -p ${RESULT_DIR}
 
 echo "collecting local information ..."
 LOCAL_IP_INFO=$( GetUpAllIP )
-LOCAL_ENDPOINT_IP=$( ip -4 addr show eth0 | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}' )
+LOCAL_INT=$( ip r | grep default | head -1 | grep -oE "dev [^[:space:]]+" | awk '{print $2}' )
+LOCAL_ENDPOINT_IP=$( ip -4 addr show ${LOCAL_INT} | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}' )
 
 echo "collecting remote information ..."
 EXPECT_NUM=0
@@ -33,7 +35,7 @@ for ADDR in ${POD_IP_LIST} ; do
     ((EXPECT_NUM+=1))
     (
         echo "collecting information from ${ADDR}"
-        REMOTE_IP_INFO=$( ssh ${ADDR} " source /usr/sbin/rdmatools && GetUpAllIP " )
+        REMOTE_IP_INFO=$( ssh -p ${ENV_SSH_PORT} ${ADDR} " source /usr/sbin/rdmatools && GetUpAllIP " )
         if [ -n "${REMOTE_IP_INFO}" ] ; then
             echo "${REMOTE_IP_INFO}" > ${REMOTE_INFOR_DIR}/${ADDR}
         else
diff --git a/mx-rdma-tools/image/tools/testIperf b/mx-rdma-tools/image/tools/testIperf
index 13ab843..f3a5ce3 100755
--- a/mx-rdma-tools/image/tools/testIperf
+++ b/mx-rdma-tools/image/tools/testIperf
@@ -14,6 +14,7 @@ if ! which iperf3 &>/dev/null; then
 fi
 
 CURRENT_FILENAME=$(basename $0)
+ENV_SSH_PORT=${ENV_SSH_PORT:-"2022"}
 
 source /usr/sbin/rdmatools
 
@@ -35,7 +36,9 @@ mkdir -p ${REMOTE_OUTPUT_DIR}
 
 echo "collecting local information ..."
 LOCAL_IP_INFO=$(GetUpAllIP)
-LOCAL_ENDPOINT_IP=$(ip -4 addr show eth0 | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}')
+LOCAL_INT=$( ip r | grep default | head -1 | grep -oE "dev [^[:space:]]+" | awk '{print $2}' )
+LOCAL_ENDPOINT_IP=$( ip -4 addr show ${LOCAL_INT} | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}' )
+
 
 echo "collecting remote information ..."
 EXPECT_NUM=0
@@ -43,7 +46,7 @@ for ADDR in ${POD_IP_LIST}; do
     ((EXPECT_NUM += 1))
     (
         echo "collecting information from ${ADDR}"
-        REMOTE_IP_INFO=$(ssh ${ADDR} " source /usr/sbin/rdmatools && GetUpAllIP ")
+        REMOTE_IP_INFO=$(ssh -p ${ENV_SSH_PORT} ${ADDR} " source /usr/sbin/rdmatools && GetUpAllIP ")
         if [ -n "${REMOTE_IP_INFO}" ]; then
             echo "${REMOTE_IP_INFO}" >${REMOTE_IP_INFOR_DIR}/${ADDR}
         else
@@ -74,10 +77,10 @@ TestPerHost() {
     printf "_____________________________________________________________________________________________________________________________________________________________________\n" >>${RESULT_FILE}
 
     {
-        ssh ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
+        ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
         OPTIONS="${CMD_OPTIONS} -s "
         echo "server on remote ${REMOTE_HOST}: ${CMD_CLI} ${OPTIONS}"
-        ssh ${REMOTE_HOST} " ${CMD_CLI} ${OPTIONS} " &>/dev/null
+        ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " ${CMD_CLI} ${OPTIONS} " &>/dev/null
     } &
     sleep 2
 
@@ -129,7 +132,7 @@ TestPerHost() {
 
     printf "_____________________________________________________________________________________________________________________________________________________________________\n" >>${RESULT_FILE}
 
-    ssh ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
+    ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
     echo "" >>${RESULT_FILE}
     echo ""
 
diff --git a/mx-rdma-tools/image/tools/testRdmaBw b/mx-rdma-tools/image/tools/testRdmaBw
index 636b5f1..ace94a7 100644
--- a/mx-rdma-tools/image/tools/testRdmaBw
+++ b/mx-rdma-tools/image/tools/testRdmaBw
@@ -14,6 +14,7 @@ if ! which ib_write_bw &>/dev/null; then
 fi
 
 CURRENT_FILENAME=$(basename $0)
+ENV_SSH_PORT=${ENV_SSH_PORT:-"2022"}
 
 source /usr/sbin/rdmatools
 
@@ -39,7 +40,9 @@ LOCAL_RDMA_INFO=$(GetLocalRdmaDeviceIP)
     echo "error, failed to get local rdma information"
     exit 1
 }
-LOCAL_ENDPOINT_IP=$(ip -4 addr show eth0 | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}')
+LOCAL_INT=$( ip r | grep default | head -1 | grep -oE "dev [^[:space:]]+" | awk '{print $2}' )
+LOCAL_ENDPOINT_IP=$( ip -4 addr show ${LOCAL_INT} | grep -oP '(?<=inet\s)[0-9]+(\.[0-9]+){3}' )
+
 
 echo "collecting remote information ..."
 EXPECT_NUM=0
@@ -47,7 +50,7 @@ for ADDR in ${POD_IP_LIST}; do
     ((EXPECT_NUM += 1))
     (
         echo "collecting information from ${ADDR}"
-        REMOTE_RDMA_INFO=$(ssh ${ADDR} " source /usr/sbin/rdmatools && GetLocalRdmaDeviceIP ")
+        REMOTE_RDMA_INFO=$(ssh -p ${ENV_SSH_PORT} ${ADDR} " source /usr/sbin/rdmatools && GetLocalRdmaDeviceIP ")
         if [ -n "${REMOTE_RDMA_INFO}" ]; then
             echo "${REMOTE_RDMA_INFO}" >${REMOTE_INFOR_DIR}/${ADDR}
         else
@@ -115,10 +118,10 @@ TestPerHost() {
         else
 
             {
-                ssh ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
+                ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
                 OPTIONS="-d ${REMOTE_DEVICE} -x ${REMOTE_INDEX} ${CMD_OPTIONS} "
                 echo "server on remote ${REMOTE_HOST}: ${CMD_CLI} ${OPTIONS}"
-                ssh ${REMOTE_HOST} " ${CMD_CLI} ${OPTIONS} " &>/dev/null
+                ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " ${CMD_CLI} ${OPTIONS} " &>/dev/null
             } &
             sleep 2
 
@@ -141,7 +144,7 @@ TestPerHost() {
 
     printf "_____________________________________________________________________________________________________________________________________________________\n" >>${RESULT_FILE}
 
-    ssh ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
+    ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
     echo "" >>${RESULT_FILE}
     echo ""
 
diff --git a/mx-rdma-tools/image/tools/testRdmaLatency b/mx-rdma-tools/image/tools/testRdmaLatency
index d630ec0..c35baba 100755
--- a/mx-rdma-tools/image/tools/testRdmaLatency
+++ b/mx-rdma-tools/image/tools/testRdmaLatency
@@ -14,6 +14,7 @@ if ! which ib_write_lat &>/dev/null; then
 fi
 
 CURRENT_FILENAME=$(basename $0)
+ENV_SSH_PORT=${ENV_SSH_PORT:-"2022"}
 
 source /usr/sbin/rdmatools
 
@@ -47,7 +48,7 @@ for ADDR in ${POD_IP_LIST}; do
     ((EXPECT_NUM += 1))
     (
         echo "collecting information from ${ADDR}"
-        REMOTE_RDMA_INFO=$(ssh ${ADDR} " source /usr/sbin/rdmatools && GetLocalRdmaDeviceIP ")
+        REMOTE_RDMA_INFO=$(ssh -p ${ENV_SSH_PORT} ${ADDR} " source /usr/sbin/rdmatools && GetLocalRdmaDeviceIP ")
         if [ -n "${REMOTE_RDMA_INFO}" ]; then
             echo "${REMOTE_RDMA_INFO}" >${REMOTE_INFOR_DIR}/${ADDR}
         else
@@ -114,10 +115,10 @@ TestPerHost() {
         else
 
             {
-                ssh ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
+                ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
                 OPTIONS="-d ${REMOTE_DEVICE} -x ${REMOTE_INDEX} ${CMD_OPTIONS} "
                 echo "server on remote ${REMOTE_HOST}: ${CMD_CLI} ${OPTIONS}"
-                ssh ${REMOTE_HOST} " ${CMD_CLI} ${OPTIONS} " &>/dev/null
+                ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " ${CMD_CLI} ${OPTIONS} " &>/dev/null
             } &
             sleep 2
 
@@ -140,7 +141,7 @@ TestPerHost() {
 
     printf "___________________________________________________________________________________________________________________________________\n" >>${RESULT_FILE}
 
-    ssh ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
+    ssh -p ${ENV_SSH_PORT} ${REMOTE_HOST} " PID=\`ps aux | grep ${CMD_CLI} | grep -v grep | awk '{print \$2}' \` && [ -n \"\${PID}\" ] && kill -9 \${PID} " || true
     echo "" >>${RESULT_FILE}
     echo ""
 
diff --git a/mx-rdma-tools/image/tools/testRemoteGpu b/mx-rdma-tools/image/tools/testRemoteGpu
index f5c3d19..0eaa25c 100755
--- a/mx-rdma-tools/image/tools/testRemoteGpu
+++ b/mx-rdma-tools/image/tools/testRemoteGpu
@@ -17,6 +17,7 @@ if ! which ib_write_bw &>/dev/null; then
 fi
 
 REMOTE_IP=${1:-""}
+ENV_SSH_PORT=${ENV_SSH_PORT:-"2022"}
 
 source /usr/sbin/rdmatools
 [ -n "${REMOTE_IP}" ] || REMOTE_IP=$(getOneRemoteEndpoint)
@@ -75,18 +76,18 @@ TestPerftestGpu() {
     }
     LOCAL_INTERFACE=$(ibdev2netdev | grep -E "^${LOCAL_RDMA_DEV} " | awk '{print $5}')
 
-    GPU_TOTAL=$(ssh ${REMOTE_IP} " source /usr/sbin/rdmatools && GetGPUAmount ") || {
+    GPU_TOTAL=$(ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " source /usr/sbin/rdmatools && GetGPUAmount ") || {
         echo "error, there is no GPU on remote ${REMOTE_IP}"
         exit 1
     }
-    RDMA_DEV_LIST=$(ssh ${REMOTE_IP} " source /usr/sbin/rdmatools && GetRdmaDevList ") || {
+    RDMA_DEV_LIST=$(ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " source /usr/sbin/rdmatools && GetRdmaDevList ") || {
         echo "error, there is RDMA device on remote ${REMOTE_IP}"
         exit 1
     }
     echo ""
     echo "---------remote ${REMOTE_IP} topo:"
-    ssh ${REMOTE_IP} " nvidia-smi topo -m || true "
-    ssh ${REMOTE_IP} " source /usr/sbin/rdmatools && GetLocalRdmaDeviceIP "
+    ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " nvidia-smi topo -m || true "
+    ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " source /usr/sbin/rdmatools && GetLocalRdmaDeviceIP "
     echo ""
     echo "GPU range: 0 ... $((GPU_TOTAL - 1))"
     while :; do
@@ -94,12 +95,12 @@ TestPerftestGpu() {
         echo -n "Please input the expected GPU number of remote node ${REMOTE_IP}: "
         read REMOTE_GPU_NUM
         if [ -n "${REMOTE_GPU_NUM}" ]; then
-            ssh ${REMOTE_IP} " source /usr/sbin/rdmatools && CheckGpuIdValidity ${REMOTE_GPU_NUM} " || continue
+            ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " source /usr/sbin/rdmatools && CheckGpuIdValidity ${REMOTE_GPU_NUM} " || continue
             break
         fi
     done
     echo ""
-    ssh ${REMOTE_IP} " show_gids "
+    ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " show_gids "
     echo ""
     echo "Rdma device: ${RDMA_DEV_LIST}"
     while :; do
@@ -107,11 +108,11 @@ TestPerftestGpu() {
         echo -n "Please input the expected RDMA device of remote node ${REMOTE_IP}: "
         read REMOTE_RDMA_DEV
         if [ -n "${REMOTE_RDMA_DEV}" ]; then
-            ssh ${REMOTE_IP} " source /usr/sbin/rdmatools && CheckRdmaDevValidity ${REMOTE_RDMA_DEV} " || continue
+            ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " source /usr/sbin/rdmatools && CheckRdmaDevValidity ${REMOTE_RDMA_DEV} " || continue
             break
         fi
     done
-    REMOTE_RDMA_IP=$(ssh ${REMOTE_IP} " source /usr/sbin/rdmatools && GetBestRdmaDeviceIP ${REMOTE_RDMA_DEV} ")
+    REMOTE_RDMA_IP=$(ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " source /usr/sbin/rdmatools && GetBestRdmaDeviceIP ${REMOTE_RDMA_DEV} ")
     [ -n "${REMOTE_RDMA_IP}" ] || {
         echo "error, failed to find any IP on device ${REMOTE_RDMA_IP}"
         exit 1
@@ -137,8 +138,8 @@ TestPerftestGpu() {
     sleep 3
     echo "clinet on remote ${REMOTE_IP}: ib_write_bw -d ${REMOTE_RDMA_DEV} --use_cuda=${REMOTE_GPU_NUM} ${CMD_OPTIONS} ${LOCAL_RDMA_IP} "
-    #ssh ${REMOTE_IP} " ulimit -l 2000000 && ib_write_bw -d ${REMOTE_RDMA_DEV} --use_cuda=${REMOTE_GPU_NUM} ${CMD_OPTIONS} ${LOCAL_RDMA_IP} "
-    ssh ${REMOTE_IP} " ib_write_bw -d ${REMOTE_RDMA_DEV} --use_cuda=${REMOTE_GPU_NUM} ${CMD_OPTIONS} ${LOCAL_RDMA_IP} "
+    #ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " ulimit -l 2000000 && ib_write_bw -d ${REMOTE_RDMA_DEV} --use_cuda=${REMOTE_GPU_NUM} ${CMD_OPTIONS} ${LOCAL_RDMA_IP} "
+    ssh -p ${ENV_SSH_PORT} ${REMOTE_IP} " ib_write_bw -d ${REMOTE_RDMA_DEV} --use_cuda=${REMOTE_GPU_NUM} ${CMD_OPTIONS} ${LOCAL_RDMA_IP} "
 
 }
 
 TestPerftestGpu