Skip to content

Commit

Permalink
Docs: add mtu size configuration for SpiderMultusConfig
Browse files Browse the repository at this point in the history
Signed-off-by: Cyclinder Kuo <[email protected]>
  • Loading branch information
cyclinder committed Feb 19, 2025
1 parent 285e341 commit d521ef5
Show file tree
Hide file tree
Showing 7 changed files with 828 additions and 50 deletions.
318 changes: 318 additions & 0 deletions docs/example/qos/rdma-qos.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
#!/bin/bash
# Copyright 2025 Authors of spidernet-io
# SPDX-License-Identifier: Apache-2.0

# set -x
# set -o xtrace
# Stop the installer as soon as any unguarded command fails.
set -e

# Red banner announcing the script-generation phase.
printf '%b\n' "\e[31m Prepare rdma qos script \e[0m"

#######################################
# Pre-flight check for a comma-separated list of RoCE NICs.
# For every NIC it verifies that the netdev exists, that ibdev2netdev maps it
# to an RDMA device, that it is not an InfiniBand link, and that the sysfs
# files for the requested priority are present.
# Arguments:
#   $1 - comma-separated NIC names
#   $2 - RDMA priority whose ECN enable files must exist
# Outputs:
#   progress and error messages on stdout
# Returns:
#   0 when every NIC passes; any failure terminates the script with exit 1.
#######################################
function validate_nic() {
  local nic_list=$1
  if [ -z "$nic_list" ]; then
    echo "error, nic list is empty"
    exit 1
  fi

  local rdma_priority=$2
  if [ -z "$rdma_priority" ]; then
    echo "error, rdma_priority is empty"
    exit 1
  fi

  local nic_item rdma_dev required_file
  local -a gpu_nic_array

  # nic_list is a comma-separated list of NICs
  IFS=',' read -r -a gpu_nic_array <<<"$nic_list"
  for nic_item in "${gpu_nic_array[@]}"; do
    echo "prechecking for device: $nic_item"
    if [ -z "$nic_item" ]; then
      echo "error, invalid nic name"
      exit 1
    fi

    if ! ip link show "$nic_item" &>/dev/null; then
      echo "error, device $nic_item does not exist"
      exit 1
    fi

    # Match the netdev name exactly (the field following "==>"), so that
    # e.g. "eth1" is not confused with "eth10".
    rdma_dev=$(ibdev2netdev | awk -v nic="$nic_item" \
      '{ for (i = 1; i < NF; i++) if ($i == "==>" && $(i + 1) == nic) { print $1; exit } }')
    if [[ -z "$rdma_dev" ]]; then
      echo "error, rdma device does not exist for $nic_item, is it an rdma nic?"
      exit 1
    fi

    if ip a show "$nic_item" | grep link/infiniband &>/dev/null; then
      echo "error, device $nic_item is an infiniband nic, it should be an rdma roce nic"
      exit 1
    fi

    # Every sysfs entry the generated runtime script writes to must exist.
    for required_file in \
      "/sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority" \
      "/sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority" \
      "/sys/class/net/$nic_item/ecn/roce_np/cnp_dscp" \
      "/sys/class/infiniband/$rdma_dev/tc/1/traffic_class"; do
      if ! [ -f "$required_file" ]; then
        echo "error, $required_file is not found"
        exit 1
      fi
    done
    echo "device $nic_item is ready"
  done
}

# User-tunable settings, taken from the environment with defaults.
# Each setting is read from the variable of the same name. The older
# GPU_NIC_QOS / STORAGE_NIC_PRIORITY / STORAGE_NIC_QOS names are still
# honoured as a fallback so existing invocations keep working.
GPU_NIC_LIST=${GPU_NIC_LIST:-""}
GPU_RDMA_PRIORITY=${GPU_RDMA_PRIORITY:-5}
GPU_RDMA_QOS=${GPU_RDMA_QOS:-""}
GPU_CNP_PRIORITY=${GPU_CNP_PRIORITY:-6}
GPU_CNP_QOS=${GPU_CNP_QOS:-${GPU_NIC_QOS:-""}}
STORAGE_NIC_LIST=${STORAGE_NIC_LIST:-""}
STORAGE_RDMA_PRIORITY=${STORAGE_RDMA_PRIORITY:-${STORAGE_NIC_PRIORITY:-5}}
STORAGE_RDMA_QOS=${STORAGE_RDMA_QOS:-${STORAGE_NIC_QOS:-""}}
STORAGE_CNP_PRIORITY=${STORAGE_CNP_PRIORITY:-${STORAGE_NIC_PRIORITY:-5}}
STORAGE_CNP_QOS=${STORAGE_CNP_QOS:-${STORAGE_NIC_QOS:-""}}

# At least one of the two NIC groups has to be requested; abort otherwise.
if [[ -z "$GPU_NIC_LIST" && -z "$STORAGE_NIC_LIST" ]]; then
  echo "error, GPU_NIC_LIST and STORAGE_NIC_LIST cannot be empty at the same time, at least one of them needs to be configured"
  exit 1
fi

# Every RDMA/CNP priority must be a single digit in the range 0-7.
# This runs at the top level of an executed script, so failures use `exit`
# (`return` is only valid inside a function or a sourced file).
for priority_var in GPU_RDMA_PRIORITY GPU_CNP_PRIORITY STORAGE_RDMA_PRIORITY STORAGE_CNP_PRIORITY; do
  # ${!priority_var} is the value of the variable whose name is in priority_var.
  if ! [[ "${!priority_var}" =~ ^[0-7]$ ]]; then
    echo "error, $priority_var must be in the range of 0-7, but got ${!priority_var}"
    exit 1
  fi
done

# ##################### verify that all required tools are ready ################################
# mlnx_qos is only looked up on PATH; `command -v` is a shell builtin, so the
# check does not depend on the external `which` program being installed.
if ! command -v mlnx_qos >/dev/null; then
  echo "mlnx_qos is not ready..."
  exit 1
fi
echo "mlnx_qos is ready"

# ibdev2netdev and cma_roce_tos are actually executed, so they must also run
# successfully, not merely exist.
if ! ibdev2netdev >/dev/null; then
  echo "ibdev2netdev is not ready..."
  exit 1
fi
echo "ibdev2netdev is ready"

if ! cma_roce_tos >/dev/null; then
  echo "cma_roce_tos is not ready..."
  exit 1
fi
echo "cma_roce_tos is ready"

# The generated /usr/local/bin/rdma_qos.sh also calls cma_roce_mode; fail here
# rather than on every later service run.
if ! command -v cma_roce_mode >/dev/null; then
  echo "cma_roce_mode is not ready..."
  exit 1
fi
echo "cma_roce_mode is ready"

# Pre-check each NIC group that was actually requested.
if [ -n "$GPU_NIC_LIST" ]; then
  validate_nic "$GPU_NIC_LIST" "$GPU_RDMA_PRIORITY"
fi
if [ -n "$STORAGE_NIC_LIST" ]; then
  validate_nic "$STORAGE_NIC_LIST" "$STORAGE_RDMA_PRIORITY"
fi

# Dump the effective settings for troubleshooting.
printf '%s\n' "debug, GPU_NIC_LIST=$GPU_NIC_LIST, GPU_RDMA_PRIORITY=$GPU_RDMA_PRIORITY, GPU_CNP_PRIORITY=$GPU_CNP_PRIORITY, GPU_RDMA_QOS=$GPU_RDMA_QOS, GPU_CNP_QOS=$GPU_CNP_QOS"
printf '%s\n' "debug, STORAGE_NIC_LIST=$STORAGE_NIC_LIST, STORAGE_RDMA_PRIORITY=$STORAGE_RDMA_PRIORITY, STORAGE_CNP_PRIORITY=$STORAGE_CNP_PRIORITY, STORAGE_RDMA_QOS=$STORAGE_RDMA_QOS, STORAGE_CNP_QOS=$STORAGE_CNP_QOS"

# Any QoS value left empty defaults to eight times its priority.
if [ -z "$GPU_RDMA_QOS" ]; then
  GPU_RDMA_QOS=$((GPU_RDMA_PRIORITY * 8))
fi
if [ -z "$GPU_CNP_QOS" ]; then
  GPU_CNP_QOS=$((GPU_CNP_PRIORITY * 8))
fi
if [ -z "$STORAGE_RDMA_QOS" ]; then
  STORAGE_RDMA_QOS=$((STORAGE_RDMA_PRIORITY * 8))
fi
if [ -z "$STORAGE_CNP_QOS" ]; then
  STORAGE_CNP_QOS=$((STORAGE_CNP_PRIORITY * 8))
fi

# Each group is serialised as a ';'-separated key=value string that the
# generated runtime script parses. A group stays empty when it has no NICs.
GROUP_GPU_CONFIG=""
if [ -n "$GPU_NIC_LIST" ]; then
  GROUP_GPU_CONFIG="group=gpu;nic=$GPU_NIC_LIST;rdma_priority=$GPU_RDMA_PRIORITY;rdma_qos=$GPU_RDMA_QOS;cnp_priority=$GPU_CNP_PRIORITY;cnp_qos=$GPU_CNP_QOS"
fi

# The storage group carries its own STORAGE_* settings, not the GPU ones.
GROUP_STORAGE_CONFIG=""
if [ -n "$STORAGE_NIC_LIST" ]; then
  GROUP_STORAGE_CONFIG="group=storage;nic=$STORAGE_NIC_LIST;rdma_priority=$STORAGE_RDMA_PRIORITY;rdma_qos=$STORAGE_RDMA_QOS;cnp_priority=$STORAGE_CNP_PRIORITY;cnp_qos=$STORAGE_CNP_QOS"
fi

# Persist both group strings in the environment file read by the service unit.
mkdir -p /etc/systemd/system/rdma-qos.d
printf '%s\n' \
  "GROUP_GPU_CONFIG=${GROUP_GPU_CONFIG}" \
  "GROUP_STORAGE_CONFIG=${GROUP_STORAGE_CONFIG}" \
  >/etc/systemd/system/rdma-qos.d/10-qos.conf

# Write the runtime script to /usr/local/bin/rdma_qos.sh.
# The heredoc delimiter is quoted, so nothing in the body is expanded here;
# every $variable below is evaluated only when the generated script runs.
cat <<"S_EOF" >/usr/local/bin/rdma_qos.sh
#!/bin/bash
# set -x
# set -o xtrace
set -o errexit
# Print the value stored under KEY in a ';'-separated "key=value" string.
# Spaces and double quotes are removed; prints nothing when KEY is absent.
#   $1 - config string, $2 - key name
function get_qos_field() {
  local config=$1 key=$2 item
  local -a items
  IFS=';' read -r -a items <<<"$config"
  for item in "${items[@]}"; do
    item=${item// /}
    item=${item//\"/}
    if [[ "$item" == "$key="* ]]; then
      echo "${item#*=}"
      return 0
    fi
  done
  return 0
}

# Apply PFC, ECN and RoCE traffic-class settings to every NIC of one group.
# Arguments:
#   $1 - group config string, e.g.
#        "group=gpu;nic=eth0,eth1;rdma_priority=5;rdma_qos=40;cnp_priority=6;cnp_qos=48"
# Returns:
#   1 when the config is empty, incomplete or malformed; 0 otherwise.
#   NICs that are missing, have no RDMA device, or are InfiniBand links are
#   skipped with a warning instead of failing the whole group.
function set_rdma_qos() {
  local config=$1
  local group nic_csv rdma_priority cnp_priority rdma_qos cnp_qos
  local pfc_queue nic_item rdma_dev traffic_class
  local -a nics qos_queues

  if [ -z "$config" ]; then
    echo "error, config is empty"
    return 1
  fi

  group=$(get_qos_field "$config" group)
  nic_csv=$(get_qos_field "$config" nic)
  if [ -z "$group" ] || [ -z "$nic_csv" ]; then
    echo "warn, group or nic is empty, skip ..."
    return 1
  fi

  rdma_priority=$(get_qos_field "$config" rdma_priority)
  if [ -z "$rdma_priority" ]; then
    echo "error, no rdma_priority found for group $group"
    return 1
  fi

  cnp_priority=$(get_qos_field "$config" cnp_priority)
  if [ -z "$cnp_priority" ]; then
    echo "error, no cnp_priority found for group $group"
    return 1
  fi

  rdma_qos=$(get_qos_field "$config" rdma_qos)
  if [ -z "$rdma_qos" ]; then
    echo "error, no rdma_qos found for group $group"
    return 1
  fi

  cnp_qos=$(get_qos_field "$config" cnp_qos)
  if [ -z "$cnp_qos" ]; then
    echo "error, no cnp_qos found for group $group"
    return 1
  fi

  # The priorities index the 8-entry PFC bitmap below, so they must be 0-7.
  if ! [[ "$rdma_priority" =~ ^[0-7]$ ]]; then
    echo "error, rdma_priority must be in the range of 0-7, but got $rdma_priority"
    return 1
  fi
  if ! [[ "$cnp_priority" =~ ^[0-7]$ ]]; then
    echo "error, cnp_priority must be in the range of 0-7, but got $cnp_priority"
    return 1
  fi
  # The QoS values are used in shell arithmetic and written to sysfs, so
  # only plain decimal numbers are accepted.
  if ! [[ "$rdma_qos" =~ ^[0-9]+$ ]]; then
    echo "error, rdma_qos must be a number, but got $rdma_qos"
    return 1
  fi
  if ! [[ "$cnp_qos" =~ ^[0-9]+$ ]]; then
    echo "error, cnp_qos must be a number, but got $cnp_qos"
    return 1
  fi

  # Build the per-priority PFC list, e.g. "0,0,0,0,0,1,1,0".
  qos_queues=(0 0 0 0 0 0 0 0)
  qos_queues[$rdma_priority]=1
  qos_queues[$cnp_priority]=1
  pfc_queue=$(IFS=,; echo "${qos_queues[*]}")
  echo "Qos Parameters: rdma_priority: $rdma_priority, rdma_qos: $rdma_qos, cnp_priority: $cnp_priority, cnp_qos: $cnp_qos, pfc_queue: $pfc_queue"

  IFS=',' read -r -a nics <<<"$nic_csv"
  for nic_item in "${nics[@]}"; do
    if [ -z "$nic_item" ]; then
      echo "warn, nic_item is empty, skip ..."
      continue
    fi

    if ! ip link show "$nic_item" &>/dev/null; then
      echo "warn, device $nic_item does not exist, ignore setting qos"
      continue
    fi

    # Match the netdev name exactly (the field following "==>"), so that
    # e.g. "eth1" does not also pick up the RDMA device of "eth10".
    rdma_dev=$(ibdev2netdev | awk -v nic="$nic_item" \
      '{ for (i = 1; i < NF; i++) if ($i == "==>" && $(i + 1) == nic) { print $1; exit } }')
    if [[ -z "$rdma_dev" ]]; then
      echo "warn, rdma device does not exist for $nic_item, is it an rdma nic?"
      continue
    fi

    if ip a show "$nic_item" | grep link/infiniband &>/dev/null; then
      echo "warn, device $nic_item is an infiniband nic, it should be an rdma roce nic"
      continue
    fi

    echo -e "\e[31minfo, start to apply QoS and ecn for nic $nic_item, rdma device $rdma_dev ...\e[0m"
    mlnx_qos -i "$nic_item" --trust=dscp --pfc "$pfc_queue"

    # Each sysfs write is echoed first so the journal shows what was applied.
    echo "echo 1 >/sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority"
    echo 1 >"/sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority"

    echo "echo 1 >/sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority"
    echo 1 >"/sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority"

    echo "echo $cnp_qos >/sys/class/net/$nic_item/ecn/roce_np/cnp_dscp"
    echo "$cnp_qos" >"/sys/class/net/$nic_item/ecn/roce_np/cnp_dscp"

    echo -e "\e[31minfo, start to apply cma_roce_tos for port ${rdma_dev}\e[0m"
    # The traffic class is the QoS value shifted left by two bits.
    traffic_class=$((rdma_qos << 2))

    echo "cma_roce_tos -d $rdma_dev -t $traffic_class"
    cma_roce_tos -d "$rdma_dev" -t "$traffic_class"
    cma_roce_mode -d "$rdma_dev" -p 1 -m 2

    echo "echo $traffic_class >/sys/class/infiniband/$rdma_dev/tc/1/traffic_class"
    echo "$traffic_class" >"/sys/class/infiniband/$rdma_dev/tc/1/traffic_class"
  done
}
# The environment file written by the installer must be present.
if [ ! -f /etc/systemd/system/rdma-qos.d/10-qos.conf ]; then
  echo "error, /etc/systemd/system/rdma-qos.d/10-qos.conf is not found"
  exit 1
fi

# At least one group has to be configured.
if [[ -z "$GROUP_GPU_CONFIG" && -z "$GROUP_STORAGE_CONFIG" ]]; then
  echo "error, GROUP_GPU_CONFIG and GROUP_STORAGE_CONFIG cannot be empty at the same time, at least one of them needs to be configured"
  exit 1
fi

# GPU group first, then the storage group; an empty group is only reported.
if [[ -z "$GROUP_GPU_CONFIG" ]]; then
  echo "GROUP_GPU_CONFIG is empty, this isn't GPU Server, no need to config RDMA QoS"
else
  echo "Config RDMA QoS for GPU group: $GROUP_GPU_CONFIG"
  set_rdma_qos "$GROUP_GPU_CONFIG"
fi

if [[ -z "$GROUP_STORAGE_CONFIG" ]]; then
  echo "GROUP_STORAGE_CONFIG is empty, this isn't Storage Server, no need to config RDMA QoS"
else
  echo "Config RDMA QoS for storage group: $GROUP_STORAGE_CONFIG"
  set_rdma_qos "$GROUP_STORAGE_CONFIG"
fi

# Finally turn on TCP ECN.
sysctl -w net.ipv4.tcp_ecn=1
S_EOF

# Make the generated runtime script executable.
chmod +x /usr/local/bin/rdma_qos.sh

printf '%b\n' "\e[31m Prepare rdma qos systemd unit file \e[0m"

# Service unit: runs the generated script with the group settings loaded
# from the environment file.
cat >/etc/systemd/system/rdma-qos.service <<'SYS_EOF'
[Unit]
Description=RDMA QoS Configuration Service
After=network.target
[Service]
Type=simple
ExecStart=/bin/bash /usr/local/bin/rdma_qos.sh
StandardOutput=journal
StandardError=journal
EnvironmentFile=/etc/systemd/system/rdma-qos.d/10-qos.conf
[Install]
WantedBy=multi-user.target
SYS_EOF

printf '%b\n' "\e[31m Prepare rdma qos timer unit file \e[0m"
# Timer unit: first run one minute after boot, then every five minutes after
# the service was last activated.
cat >/etc/systemd/system/rdma-qos.timer <<'T_EOF'
[Unit]
Description=Timer for RDMA QoS Configuration Service
[Timer]
OnBootSec=1min
OnUnitActiveSec=5min
[Install]
WantedBy=timers.target
T_EOF

echo -e "\e[31mStart rdma-qos systemd service \e[0m"
# Pick up the freshly written unit files, then enable and start both units.
systemctl daemon-reload
systemctl enable rdma-qos.service
systemctl enable rdma-qos.timer
systemctl start rdma-qos.service
systemctl start rdma-qos.timer

# Informational only: `systemctl status` returns non-zero for a unit that is
# no longer running (the service script exits once it has applied the
# settings), which must not make errexit abort the installer here.
# --no-pager keeps the call from blocking on an interactive pager.
systemctl status rdma-qos.service --no-pager || true

echo -e "\e[31mDone \e[0m"
Loading

0 comments on commit d521ef5

Please sign in to comment.