Skip to content

Commit

Permalink
Docs: add mtu size configuration for SpiderMultusConfig
Browse files Browse the repository at this point in the history
Signed-off-by: Cyclinder Kuo <[email protected]>
  • Loading branch information
cyclinder committed Feb 20, 2025
1 parent 285e341 commit 6e7ec21
Show file tree
Hide file tree
Showing 7 changed files with 840 additions and 50 deletions.
330 changes: 330 additions & 0 deletions docs/example/qos/rdma-qos.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,330 @@
#!/bin/bash
# Copyright 2025 Authors of spidernet-io
# SPDX-License-Identifier: Apache-2.0
#
# Prepares RDMA QoS for the NICs named in GPU_NIC_LIST and/or STORAGE_NIC_LIST
# (comma-separated interface names; at least one of the two must be set).
# The script prechecks the NICs and the required tools, generates
# /usr/local/bin/rdma_qos.sh, runs it once, and then installs and (re)starts
# the rdma-qos systemd service that runs the generated script.
#
# All settings are read from environment variables; see the defaults section
# further down for the names and default values.

# Uncomment either line to trace every command while debugging.
# set -x
# set -o xtrace
# Abort as soon as a command fails outside of a condition or an && / || list.
set -o errexit

echo -e "\e[31m Prepare rdma qos script \e[0m"

#######################################
# Precheck every NIC of a comma-separated list before QoS is configured.
# For each NIC this verifies that the interface exists, that ibdev2netdev
# maps it to an RDMA device, that it is not an InfiniBand link, and that the
# sysfs files written later (ECN enable files for the given priority,
# cnp_dscp, and the port-1 traffic_class of the RDMA device) are present.
# Arguments:
#   $1 - comma-separated list of NIC names (must not be empty)
#   $2 - RDMA priority used to locate the per-priority ECN files
# Outputs:
#   Progress and error messages on stdout.
# Returns:
#   0 when every NIC passes; otherwise the shell exits with status 1.
#######################################
function validate_nic() {
    local nic_list=$1
    local rdma_priority=$2
    local nic_item rdma_dev required_path
    local -a nic_array

    if [ -z "$nic_list" ]; then
        echo "error, nic list is empty"
        exit 1
    fi

    if [ -z "$rdma_priority" ]; then
        echo "error, rdma_priority is empty"
        exit 1
    fi

    # nic_list is a comma-separated list of NICs
    IFS=',' read -r -a nic_array <<<"$nic_list"
    for nic_item in "${nic_array[@]}"; do
        echo "prechecking for device: $nic_item"
        if [ -z "$nic_item" ]; then
            echo "error, invalid nic name"
            exit 1
        fi

        if ! ip link show "$nic_item" &>/dev/null; then
            echo "error, device $nic_item does not exist"
            exit 1
        fi

        # Match the NIC name against whole fields of the ibdev2netdev output.
        # A plain grep is a substring match, so "eth1" would also select the
        # line for "eth10" and yield the wrong (or several) RDMA devices.
        rdma_dev=$(ibdev2netdev | awk -v nic="$nic_item" \
            '!found { for (i = 2; i <= NF; i++) if ($i == nic) { print $1; found = 1; break } }')
        if [[ -z "$rdma_dev" ]]; then
            echo "error, rdma device does not exist for $nic_item, is it an rdma nic?"
            exit 1
        fi

        if ip a show "$nic_item" | grep link/infiniband &>/dev/null; then
            echo "error, device $nic_item is an infiniband nic, it should be an rdma roce nic"
            exit 1
        fi

        # All of these files are written when the QoS settings are applied,
        # so they have to exist up front.
        for required_path in \
            "/sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority" \
            "/sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority" \
            "/sys/class/net/$nic_item/ecn/roce_np/cnp_dscp" \
            "/sys/class/infiniband/$rdma_dev/tc/1/traffic_class"; do
            if ! [ -f "$required_path" ]; then
                echo "error, $required_path is not found"
                # exit (not return) so a failed precheck aborts the script
                # regardless of how the caller invoked the function.
                exit 1
            fi
        done
        echo "device $nic_item is ready"
    done
}

# Settings, all overridable from the environment.
#   *_NIC_LIST       comma-separated NIC names of the group (empty = group unused)
#   *_RDMA_PRIORITY  priority (0-7) used for RDMA traffic, default 5
#   *_CNP_PRIORITY   priority (0-7) used for CNP, default 6
#   *_RDMA_QOS       optional QoS value for RDMA traffic; derived from the
#                    priority later on when left empty
#   *_CNP_QOS        optional QoS value for CNP; derived from the priority
#                    later on when left empty
GPU_NIC_LIST=${GPU_NIC_LIST:-""}
GPU_RDMA_PRIORITY=${GPU_RDMA_PRIORITY:-5}
GPU_RDMA_QOS=${GPU_RDMA_QOS:-""}
GPU_CNP_PRIORITY=${GPU_CNP_PRIORITY:-6}
# GPU_CNP_QOS used to be read from the misspelled GPU_NIC_QOS, so a value given
# under the real name was ignored. The real name now wins; the old name is
# still honoured as a fallback for existing setups.
GPU_CNP_QOS=${GPU_CNP_QOS:-${GPU_NIC_QOS:-""}}
STORAGE_NIC_LIST=${STORAGE_NIC_LIST:-""}
STORAGE_RDMA_PRIORITY=${STORAGE_RDMA_PRIORITY:-5}
STORAGE_RDMA_QOS=${STORAGE_RDMA_QOS:-""}
STORAGE_CNP_PRIORITY=${STORAGE_CNP_PRIORITY:-6}
STORAGE_CNP_QOS=${STORAGE_CNP_QOS:-""}

# At least one of the two NIC groups has to be configured.
if [[ -z "${GPU_NIC_LIST}${STORAGE_NIC_LIST}" ]]; then
    echo "error, GPU_NIC_LIST and STORAGE_NIC_LIST cannot be empty at the same time, at least one of them needs to be configured"
    exit 1
fi

# Every priority must be a single digit in the range 0-7.
for _prio_name in GPU_RDMA_PRIORITY GPU_CNP_PRIORITY STORAGE_RDMA_PRIORITY STORAGE_CNP_PRIORITY; do
    if ! [[ "${!_prio_name}" =~ ^[0-7]$ ]]; then
        echo "error, ${_prio_name} must be in the range of 0-7, but got ${!_prio_name}"
        exit 1
    fi
done

# Within one group the RDMA priority and the CNP priority must differ.
for _group in GPU STORAGE; do
    _rdma_name="${_group}_RDMA_PRIORITY"
    _cnp_name="${_group}_CNP_PRIORITY"
    if [[ "${!_rdma_name}" -eq "${!_cnp_name}" ]]; then
        echo "error, ${_rdma_name} and ${_cnp_name} cannot be the same"
        exit 1
    fi
done
unset _prio_name _group _rdma_name _cnp_name

# ##################### check that all required tools are ready ################################
# Presence is tested with the `command -v` builtin, so the check does not
# depend on the external `which` program being installed.
if ! command -v mlnx_qos >/dev/null; then
    echo "mlnx_qos is not ready..."
    exit 1
fi
echo "mlnx_qos is ready"

# ibdev2netdev and cma_roce_tos are actually executed: they have to run
# successfully, not merely exist.
if ! ibdev2netdev >/dev/null; then
    echo "ibdev2netdev is not ready..."
    exit 1
fi
echo "ibdev2netdev is ready"

if ! cma_roce_tos >/dev/null; then
    echo "cma_roce_tos is not ready..."
    exit 1
fi
echo "cma_roce_tos is ready"

# cma_roce_mode is called by the generated rdma_qos.sh as well; check it here
# so that a missing tool is reported by name instead of failing later.
if ! command -v cma_roce_mode >/dev/null; then
    echo "cma_roce_mode is not ready..."
    exit 1
fi
echo "cma_roce_mode is ready"

# Precheck the NICs of every configured group.
if [ -n "$GPU_NIC_LIST" ]; then
    validate_nic "$GPU_NIC_LIST" "$GPU_RDMA_PRIORITY"
fi
if [ -n "$STORAGE_NIC_LIST" ]; then
    validate_nic "$STORAGE_NIC_LIST" "$STORAGE_RDMA_PRIORITY"
fi

# QoS values that were not supplied default to priority * 8.
: "${GPU_RDMA_QOS:=$((GPU_RDMA_PRIORITY * 8))}"
: "${GPU_CNP_QOS:=$((GPU_CNP_PRIORITY * 8))}"
: "${STORAGE_RDMA_QOS:=$((STORAGE_RDMA_PRIORITY * 8))}"
: "${STORAGE_CNP_QOS:=$((STORAGE_CNP_PRIORITY * 8))}"

# The QoS values are later written to cnp_dscp and shifted left by two bits to
# form the traffic class, so they must be plain decimal numbers in the DSCP
# range 0-63. Rejecting anything else here avoids arithmetic errors (or
# silently wrong values) further down.
for _qos_name in GPU_RDMA_QOS GPU_CNP_QOS STORAGE_RDMA_QOS STORAGE_CNP_QOS; do
    if ! [[ "${!_qos_name}" =~ ^([0-9]|[1-5][0-9]|6[0-3])$ ]]; then
        echo "error, ${_qos_name} must be in the range of 0-63, but got ${!_qos_name}"
        exit 1
    fi
done
unset _qos_name

if [[ "$GPU_RDMA_QOS" -eq "$GPU_CNP_QOS" ]]; then
    echo "error, GPU_RDMA_QOS and GPU_CNP_QOS cannot be the same"
    exit 1
fi

if [[ "$STORAGE_RDMA_QOS" -eq "$STORAGE_CNP_QOS" ]]; then
    echo "error, STORAGE_RDMA_QOS and STORAGE_CNP_QOS cannot be the same"
    exit 1
fi

echo "debug, GPU_NIC_LIST=$GPU_NIC_LIST, GPU_RDMA_PRIORITY=$GPU_RDMA_PRIORITY, GPU_CNP_PRIORITY=$GPU_CNP_PRIORITY, GPU_RDMA_QOS=$GPU_RDMA_QOS, GPU_CNP_QOS=$GPU_CNP_QOS"
echo "debug, STORAGE_NIC_LIST=$STORAGE_NIC_LIST, STORAGE_RDMA_PRIORITY=$STORAGE_RDMA_PRIORITY, STORAGE_CNP_PRIORITY=$STORAGE_CNP_PRIORITY, STORAGE_RDMA_QOS=$STORAGE_RDMA_QOS, STORAGE_CNP_QOS=$STORAGE_CNP_QOS"

# Write the first part of /usr/local/bin/rdma_qos.sh, replacing any existing
# file. The heredoc delimiter is unquoted, so the $GPU_* and $STORAGE_* values
# computed above are expanded now and end up as literal assignments in the
# generated script. Everything between the delimiters is file content, not
# code of this script.
cat <<G_EOF >/usr/local/bin/rdma_qos.sh
#!/bin/bash
# set -x
# set -o xtrace
set -o errexit
GROUP_GPU="gpu"
GPU_NIC_LIST="$GPU_NIC_LIST"
GPU_RDMA_PRIORITY="$GPU_RDMA_PRIORITY"
GPU_CNP_PRIORITY="$GPU_CNP_PRIORITY"
GPU_RDMA_QOS="$GPU_RDMA_QOS"
GPU_CNP_QOS="$GPU_CNP_QOS"
GROUP_STORAGE="storage"
STORAGE_NIC_LIST="$STORAGE_NIC_LIST"
STORAGE_RDMA_PRIORITY="$STORAGE_RDMA_PRIORITY"
STORAGE_CNP_PRIORITY="$STORAGE_CNP_PRIORITY"
STORAGE_RDMA_QOS="$STORAGE_RDMA_QOS"
STORAGE_CNP_QOS="$STORAGE_CNP_QOS"
G_EOF

# Append the body of the generated script. The delimiter is quoted, so nothing
# below is expanded here: every $variable is evaluated when
# /usr/local/bin/rdma_qos.sh itself runs.
cat <<"S_EOF" >>/usr/local/bin/rdma_qos.sh
RUN_ONCE=${RUN_ONCE:-false}
DEBUG_LOG=${DEBUG_LOG:-false}

# Apply PFC, ECN and traffic-class settings to every NIC of one group.
# Arguments:
#   $1 - comma-separated list of NIC names
#   $2 - RDMA priority (index of the PFC queue and of the ECN enable files)
#   $3 - RDMA QoS value; shifted left by 2 to form the traffic class
#   $4 - CNP QoS value, written to roce_np/cnp_dscp
#   $5 - "true" to print debug output, anything else disables it
# Any failed check exits the script with status 1, including the ones that are
# reported with a "warn" message.
function set_rdma_qos() {
    local nic_list=$1
    local rdma_priority=$2
    local rdma_qos=$3
    local cnp_qos=$4
    local debug_log=$5
    local nic_item rdma_dev pfc_queue traffic_class
    local -a nic_array qos_queues

    if [ -z "$nic_list" ]; then
        echo "error, nic_list is empty"
        exit 1
    fi
    if [ -z "$rdma_priority" ]; then
        echo "error, rdma_priority is empty"
        exit 1
    fi
    if [ -z "$rdma_qos" ]; then
        echo "error, rdma_qos is empty"
        exit 1
    fi
    if [ -z "$cnp_qos" ]; then
        echo "error, cnp_qos is empty"
        exit 1
    fi
    if [ -z "$debug_log" ]; then
        echo "error, debug_log is empty"
        exit 1
    fi
    # $debug_log is used as a command below; pin it to true/false so that an
    # arbitrary DEBUG_LOG value is never executed.
    [ "$debug_log" = true ] || debug_log=false

    # Build the PFC mask: one flag per priority, only the RDMA priority set.
    qos_queues=(0 0 0 0 0 0 0 0)
    qos_queues[$rdma_priority]=1
    pfc_queue=$(IFS=,; echo "${qos_queues[*]}")
    $debug_log && echo "Qos Parameters: rdma_priority: $rdma_priority, rdma_qos: $rdma_qos, cnp_qos: $cnp_qos, pfc_queue: $pfc_queue"

    # nic_list is a comma-separated list of NICs
    IFS=',' read -r -a nic_array <<<"$nic_list"
    for nic_item in "${nic_array[@]}"; do
        if [ -z "$nic_item" ]; then
            echo "warn, nic_item is empty, skip ..."
            exit 1
        fi
        ip link show "$nic_item" &>/dev/null || {
            echo "warn, device $nic_item does not exist, ignore setting qos"
            exit 1
        }
        # Compare whole fields: a plain grep would let "eth1" match "eth10".
        rdma_dev=$(ibdev2netdev | awk -v nic="$nic_item" \
            '!found { for (i = 2; i <= NF; i++) if ($i == nic) { print $1; found = 1; break } }')
        [[ -n "$rdma_dev" ]] || {
            echo "warn, rdma device does not exist for $nic_item, is it an rdma nic?"
            exit 1
        }
        ip a show "$nic_item" | grep link/infiniband &>/dev/null && {
            echo "warn, device $nic_item is an infiniband nic, it should be an rdma roce nic"
            exit 1
        }

        $debug_log && echo -e "\e[31minfo, start to apply QoS and ecn for nic $nic_item, rdma device $rdma_dev ...\e[0m"
        mlnx_qos -i "$nic_item" --trust=dscp --pfc "${pfc_queue}" &> /dev/null
        $debug_log && mlnx_qos -i "$nic_item"

        $debug_log && echo "echo 1 >/sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority"
        echo 1 >"/sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority"
        $debug_log && echo "echo 1 >/sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority"
        echo 1 >"/sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority"
        $debug_log && echo "echo $cnp_qos >/sys/class/net/$nic_item/ecn/roce_np/cnp_dscp"
        echo "$cnp_qos" >"/sys/class/net/$nic_item/ecn/roce_np/cnp_dscp"

        $debug_log && echo -e "\e[31minfo, start to apply cma_roce_tos for port ${rdma_dev}\e[0m"
        # The QoS value occupies the upper six bits of the traffic class.
        traffic_class=$((rdma_qos << 2))
        $debug_log && echo "cma_roce_tos -d $rdma_dev -t $traffic_class"
        cma_roce_tos -d "$rdma_dev" -t "$traffic_class" &> /dev/null
        cma_roce_mode -d "$rdma_dev" -p 1 -m 2 &> /dev/null
        $debug_log && echo "echo $traffic_class >/sys/class/infiniband/$rdma_dev/tc/1/traffic_class"
        echo "$traffic_class" >"/sys/class/infiniband/$rdma_dev/tc/1/traffic_class"
    done
}

[ -z "$GPU_NIC_LIST" ] && [ -z "$STORAGE_NIC_LIST" ] && {
    echo "error, GPU_NIC_LIST and STORAGE_NIC_LIST cannot be empty at the same time, at least one of them needs to be configured"
    exit 1
}

# Re-apply the settings every 60 seconds; with RUN_ONCE=true stop after the
# first pass.
while true ; do
    if [ -n "$GPU_NIC_LIST" ] ; then
        echo "Config RDMA QoS for GPU group, GPU_NIC_LIST: $GPU_NIC_LIST, GPU_RDMA_PRIORITY: $GPU_RDMA_PRIORITY, GPU_RDMA_QOS: $GPU_RDMA_QOS, GPU_CNP_PRIORITY: $GPU_CNP_PRIORITY, GPU_CNP_QOS: $GPU_CNP_QOS"
        set_rdma_qos "$GPU_NIC_LIST" "$GPU_RDMA_PRIORITY" "$GPU_RDMA_QOS" "$GPU_CNP_QOS" "$DEBUG_LOG"
    else
        echo "No nics configured for Group GPU, no need to config RDMA QoS"
    fi
    if [ -n "$STORAGE_NIC_LIST" ] ; then
        echo "Config RDMA QoS for storage group, STORAGE_NIC_LIST: $STORAGE_NIC_LIST, STORAGE_RDMA_PRIORITY: $STORAGE_RDMA_PRIORITY, STORAGE_RDMA_QOS: $STORAGE_RDMA_QOS, STORAGE_CNP_PRIORITY: $STORAGE_CNP_PRIORITY, STORAGE_CNP_QOS: $STORAGE_CNP_QOS"
        set_rdma_qos "$STORAGE_NIC_LIST" "$STORAGE_RDMA_PRIORITY" "$STORAGE_RDMA_QOS" "$STORAGE_CNP_QOS" "$DEBUG_LOG"
    else
        echo "No nics configured for Group Storage, no need to config RDMA QoS"
    fi
    sysctl -w net.ipv4.tcp_ecn=1 &> /dev/null
    if [ "$RUN_ONCE" = true ] ; then
        exit 0
    fi
    echo "Done, sleep 60s"
    sleep 60
done
S_EOF

# Make the generated script executable and run it once in the foreground.
# RUN_ONCE=true makes it exit after a single pass instead of looping, and
# DEBUG_LOG=true turns on its debug output, so configuration problems show up
# before the service is installed.
chmod +x /usr/local/bin/rdma_qos.sh
echo -e "\e[31m Pre-run rdma_qos.sh once \e[0m"
RUN_ONCE=true DEBUG_LOG=true /usr/local/bin/rdma_qos.sh || {
echo "error, failed to pre-set qos"
exit 1
}

echo -e "\e[31m Prepare rdma qos systemd unit file \e[0m"

# Install the unit that runs the generated script. The delimiter is quoted, so
# the unit file is written exactly as it appears here. The service starts the
# script without RUN_ONCE, i.e. in its 60-second re-apply loop.
cat <<"SYS_EOF" >/etc/systemd/system/rdma-qos.service
[Unit]
Description=RDMA QoS Configuration Service
After=network.target
[Service]
Type=simple
ExecStart=/bin/bash /usr/local/bin/rdma_qos.sh
StandardOutput=journal
StandardError=journal
[Install]
WantedBy=multi-user.target
SYS_EOF

# Pick up the new unit, enable it at boot, and (re)start it now.
echo -e "\e[31m Start rdma-qos systemd service \e[0m"
systemctl daemon-reload
systemctl enable rdma-qos.service
systemctl restart rdma-qos.service
echo -e "\e[31m Done \e[0m"
Loading

0 comments on commit 6e7ec21

Please sign in to comment.