-
Notifications
You must be signed in to change notification settings - Fork 82
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Docs: add mtu size configuration for SpiderMultusConfig
Signed-off-by: Cyclinder Kuo <[email protected]>
- Loading branch information
Showing
7 changed files
with
828 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,318 @@ | ||
#!/bin/bash | ||
# Copyright 2025 Authors of spidernet-io | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
# set -x
# set -o xtrace
# Abort the installer as soon as a command fails outside a conditional context.
set -e

# Stage banner, wrapped in the ANSI red / reset escape sequences.
printf '\e[31m Prepare rdma qos script \e[0m\n'
|
||
#######################################
# Pre-flight check for a comma-separated list of NICs: every entry must be an
# existing, non-InfiniBand network device that ibdev2netdev maps to an RDMA
# device, and must expose the sysfs files the QoS service writes later.
# Arguments:
#   $1 - comma-separated NIC names, e.g. "eth0,eth1"
#   $2 - RDMA priority; selects the ecn/roce_{np,rp}/enable/<priority> files
# Outputs:
#   progress and error messages on stdout
# Returns:
#   0 when every NIC passes, 1 on the first failed check. The caller must
#   check the status or run under errexit.
#######################################
function validate_nic() {
  local nic_list=$1
  if [ -z "$nic_list" ]; then
    echo "error, nic list is empty"
    return 1
  fi

  local rdma_priority=$2
  if [ -z "$rdma_priority" ]; then
    echo "error, rdma_priority is empty"
    return 1
  fi

  local nic_item rdma_dev required_path
  local -a nic_array=()

  # nic_list is a comma-separated list of NICs
  IFS=',' read -r -a nic_array <<<"$nic_list"
  for nic_item in "${nic_array[@]}"; do
    echo "prechecking for device: $nic_item"
    # An empty element comes from a doubled comma such as "eth0,,eth1".
    if [ -z "$nic_item" ]; then
      echo "error, invalid nic name"
      return 1
    fi

    if ! ip link show "$nic_item" &>/dev/null; then
      echo "error, device $nic_item does not exist"
      return 1
    fi

    # ibdev2netdev prints "<rdma_dev> port <n> ==> <netdev> (<state>)".
    # Compare the netdev column exactly (forced string comparison): a substring
    # match would resolve "eth1" through the line that belongs to "eth10".
    rdma_dev=$(ibdev2netdev |
      awk -v nic="$nic_item" '($5 "") == (nic "") { print $1; exit }')
    if [ -z "$rdma_dev" ]; then
      echo "error, rdma device does not exist for $nic_item, is it an rdma nic?"
      return 1
    fi

    if ip a show "$nic_item" | grep -q link/infiniband; then
      echo "error, device $nic_item is an infiniband nic, it should be an rdma roce nic"
      return 1
    fi

    # sysfs entries that the generated rdma_qos.sh writes to for this NIC.
    for required_path in \
      "/sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority" \
      "/sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority" \
      "/sys/class/net/$nic_item/ecn/roce_np/cnp_dscp" \
      "/sys/class/infiniband/$rdma_dev/tc/1/traffic_class"; do
      if ! [ -f "$required_path" ]; then
        echo "error, $required_path is not found"
        return 1
      fi
    done
    echo "device $nic_item is ready"
  done
}
|
||
#######################################
# Resolve the installer settings from the environment, applying defaults.
# Each setting is read from the variable of the same name. The misnamed
# variables consulted previously (GPU_NIC_QOS, STORAGE_NIC_PRIORITY,
# STORAGE_NIC_QOS) are still honoured as a fallback so existing invocations
# keep working.
# Globals (written):
#   GPU_NIC_LIST GPU_RDMA_PRIORITY GPU_RDMA_QOS GPU_CNP_PRIORITY GPU_CNP_QOS
#   STORAGE_NIC_LIST STORAGE_RDMA_PRIORITY STORAGE_RDMA_QOS
#   STORAGE_CNP_PRIORITY STORAGE_CNP_QOS
# Defaults:
#   RDMA priority 5 for both groups, CNP priority 6 (GPU) / 5 (storage);
#   an empty *_QOS value is left empty here.
#######################################
function load_qos_settings() {
  GPU_NIC_LIST=${GPU_NIC_LIST:-""}
  GPU_RDMA_PRIORITY=${GPU_RDMA_PRIORITY:-5}
  GPU_RDMA_QOS=${GPU_RDMA_QOS:-""}
  GPU_CNP_PRIORITY=${GPU_CNP_PRIORITY:-6}
  GPU_CNP_QOS=${GPU_CNP_QOS:-${GPU_NIC_QOS:-""}}
  STORAGE_NIC_LIST=${STORAGE_NIC_LIST:-""}
  STORAGE_RDMA_PRIORITY=${STORAGE_RDMA_PRIORITY:-${STORAGE_NIC_PRIORITY:-5}}
  STORAGE_RDMA_QOS=${STORAGE_RDMA_QOS:-${STORAGE_NIC_QOS:-""}}
  STORAGE_CNP_PRIORITY=${STORAGE_CNP_PRIORITY:-${STORAGE_NIC_PRIORITY:-5}}
  STORAGE_CNP_QOS=${STORAGE_CNP_QOS:-${STORAGE_NIC_QOS:-""}}
}

load_qos_settings
|
||
# At least one NIC group must be configured, otherwise there is nothing to do.
if [ -z "$GPU_NIC_LIST" ] && [ -z "$STORAGE_NIC_LIST" ]; then
  echo "error, GPU_NIC_LIST and STORAGE_NIC_LIST cannot be empty at the same time, at least one of them needs to be configured"
  exit 1
fi

# Every priority must be a single digit in the range 0-7.
# This runs at script top level, so failures use `exit`: `return` is only
# valid inside a function or a sourced script.
for priority_var in GPU_RDMA_PRIORITY GPU_CNP_PRIORITY \
  STORAGE_RDMA_PRIORITY STORAGE_CNP_PRIORITY; do
  # ${!priority_var} expands to the value of the variable named by priority_var.
  if ! [[ "${!priority_var}" =~ ^[0-7]$ ]]; then
    echo "error, $priority_var must be in the range of 0-7, but got ${!priority_var}"
    exit 1
  fi
done
unset priority_var
|
||
# ##################### wait until all tools are ready ################################
# `command -v` is a shell builtin, so the lookup does not depend on the
# external `which` utility being installed.
if ! command -v mlnx_qos >/dev/null; then
  echo "mlnx_qos is not ready..."
  exit 1
fi
echo "mlnx_qos is ready"

# ibdev2netdev and cma_roce_tos are executed (not just looked up), so a tool
# that is installed but fails to run is caught here as well.
if ! ibdev2netdev >/dev/null; then
  echo "ibdev2netdev is not ready..."
  exit 1
fi
echo "ibdev2netdev is ready"

if ! cma_roce_tos >/dev/null; then
  echo "cma_roce_tos is not ready..."
  exit 1
fi
echo "cma_roce_tos is ready"

# cma_roce_mode is called by the generated /usr/local/bin/rdma_qos.sh; only its
# presence is checked so that no RoCE mode is queried or changed at this point.
if ! command -v cma_roce_mode >/dev/null; then
  echo "cma_roce_mode is not ready..."
  exit 1
fi
echo "cma_roce_mode is ready"

# Validate only the groups that are configured; stop on the first failure.
if [ -n "$GPU_NIC_LIST" ]; then
  validate_nic "$GPU_NIC_LIST" "$GPU_RDMA_PRIORITY" || exit 1
fi
if [ -n "$STORAGE_NIC_LIST" ]; then
  validate_nic "$STORAGE_NIC_LIST" "$STORAGE_RDMA_PRIORITY" || exit 1
fi
|
||
#######################################
# Render one group descriptor in the "key=value;..." form that the generated
# rdma_qos.sh parses.
# Arguments:
#   $1 group name, $2 comma-separated NIC list, $3 rdma priority,
#   $4 rdma qos, $5 cnp priority, $6 cnp qos
# Outputs:
#   the descriptor on stdout
#######################################
function format_group_config() {
  echo "group=$1;nic=$2;rdma_priority=$3;rdma_qos=$4;cnp_priority=$5;cnp_qos=$6"
}

#######################################
# Derive missing QoS values and assemble the per-group descriptors.
# Globals (read):
#   GPU_NIC_LIST GPU_RDMA_PRIORITY GPU_CNP_PRIORITY
#   STORAGE_NIC_LIST STORAGE_RDMA_PRIORITY STORAGE_CNP_PRIORITY
# Globals (read and possibly written):
#   GPU_RDMA_QOS GPU_CNP_QOS STORAGE_RDMA_QOS STORAGE_CNP_QOS
#   (an empty value is replaced by priority * 8)
# Globals (written):
#   GROUP_GPU_CONFIG GROUP_STORAGE_CONFIG (empty when the group has no NICs)
# Outputs:
#   two debug lines on stdout showing the settings as provided
# Returns:
#   0
#######################################
function build_group_configs() {
  echo "debug, GPU_NIC_LIST=$GPU_NIC_LIST, GPU_RDMA_PRIORITY=$GPU_RDMA_PRIORITY, GPU_CNP_PRIORITY=$GPU_CNP_PRIORITY, GPU_RDMA_QOS=$GPU_RDMA_QOS, GPU_CNP_QOS=$GPU_CNP_QOS"
  echo "debug, STORAGE_NIC_LIST=$STORAGE_NIC_LIST, STORAGE_RDMA_PRIORITY=$STORAGE_RDMA_PRIORITY, STORAGE_CNP_PRIORITY=$STORAGE_CNP_PRIORITY, STORAGE_RDMA_QOS=$STORAGE_RDMA_QOS, STORAGE_CNP_QOS=$STORAGE_CNP_QOS"

  # A QoS value that was not provided defaults to priority * 8.
  [ -n "$GPU_RDMA_QOS" ] || GPU_RDMA_QOS=$((GPU_RDMA_PRIORITY * 8))
  [ -n "$GPU_CNP_QOS" ] || GPU_CNP_QOS=$((GPU_CNP_PRIORITY * 8))
  [ -n "$STORAGE_RDMA_QOS" ] || STORAGE_RDMA_QOS=$((STORAGE_RDMA_PRIORITY * 8))
  [ -n "$STORAGE_CNP_QOS" ] || STORAGE_CNP_QOS=$((STORAGE_CNP_PRIORITY * 8))

  GROUP_GPU_CONFIG=""
  if [ -n "$GPU_NIC_LIST" ]; then
    GROUP_GPU_CONFIG=$(format_group_config gpu "$GPU_NIC_LIST" \
      "$GPU_RDMA_PRIORITY" "$GPU_RDMA_QOS" "$GPU_CNP_PRIORITY" "$GPU_CNP_QOS")
  fi

  # The storage group carries its own STORAGE_* values, not the GPU ones.
  GROUP_STORAGE_CONFIG=""
  if [ -n "$STORAGE_NIC_LIST" ]; then
    GROUP_STORAGE_CONFIG=$(format_group_config storage "$STORAGE_NIC_LIST" \
      "$STORAGE_RDMA_PRIORITY" "$STORAGE_RDMA_QOS" \
      "$STORAGE_CNP_PRIORITY" "$STORAGE_CNP_QOS")
  fi

  # Explicit status: an empty NIC list must not look like a failure under errexit.
  return 0
}

build_group_configs
|
||
# Persist both group descriptors, one KEY=VALUE assignment per line, in the
# file that rdma-qos.service loads through its EnvironmentFile= setting.
mkdir -p /etc/systemd/system/rdma-qos.d
printf 'GROUP_GPU_CONFIG=%s\nGROUP_STORAGE_CONFIG=%s\n' \
  "${GROUP_GPU_CONFIG}" "${GROUP_STORAGE_CONFIG}" \
  >/etc/systemd/system/rdma-qos.d/10-qos.conf
|
||
cat <<"S_EOF" >/usr/local/bin/rdma_qos.sh | ||
#!/bin/bash | ||
# set -x | ||
# set -o xtrace | ||
set -o errexit | ||
#######################################
# Print the value stored under a key of a "k1=v1;k2=v2" group descriptor,
# with double quotes and spaces removed. Prints nothing when the key is absent.
# Arguments:
#   $1 - descriptor, $2 - key name
#######################################
function qos_config_field() {
  grep -Eo "$2=[^;]*" <<<"$1" | awk -F'=' '{print $2}' | tr -d '" '
}

#######################################
# Log a sysfs write in the form "echo VALUE >PATH" and then perform it.
# Arguments:
#   $1 - value, $2 - sysfs path
# Returns:
#   status of the write
#######################################
function write_sysfs_value() {
  echo "echo $1 >$2"
  echo "$1" >"$2"
}

#######################################
# Apply PFC, ECN, CNP DSCP and RoCE traffic-class settings to every NIC named
# in one group descriptor.
# Arguments:
#   $1 - descriptor of the form
#        "group=G;nic=a,b;rdma_priority=P;rdma_qos=Q;cnp_priority=P2;cnp_qos=Q2"
# Outputs:
#   progress, warnings and errors on stdout
# Returns:
#   1 when the descriptor is empty, incomplete or holds invalid numbers.
#   NICs that do not exist, have no RDMA device or are InfiniBand are skipped
#   with a warning. A failing tool or sysfs write aborts the script through
#   errexit.
#######################################
function set_rdma_qos() {
  local config=$1
  local group nic_csv rdma_priority cnp_priority rdma_qos cnp_qos
  local field pfc_queue traffic_class nic_item rdma_dev
  local -a nic_array=() qos_queues=(0 0 0 0 0 0 0 0)

  if [ -z "$config" ]; then
    echo "error, config is empty"
    return 1
  fi

  group=$(qos_config_field "$config" group)
  nic_csv=$(qos_config_field "$config" nic)
  if [ -z "$group" ] || [ -z "$nic_csv" ]; then
    echo "warn, group or nic is empty, skip ..."
    return 1
  fi

  rdma_priority=$(qos_config_field "$config" rdma_priority)
  if [ -z "$rdma_priority" ]; then
    echo "error, no rdma_priority found for group $group"
    return 1
  fi

  cnp_priority=$(qos_config_field "$config" cnp_priority)
  if [ -z "$cnp_priority" ]; then
    echo "error, no cnp_priority found for group $group"
    return 1
  fi

  rdma_qos=$(qos_config_field "$config" rdma_qos)
  if [ -z "$rdma_qos" ]; then
    echo "error, no rdma_qos found for group $group"
    return 1
  fi

  cnp_qos=$(qos_config_field "$config" cnp_qos)
  if [ -z "$cnp_qos" ]; then
    echo "error, no cnp_qos found for group $group"
    return 1
  fi

  # The priorities index the 8-slot PFC array and the QoS values feed shell
  # arithmetic, so reject anything that is not a plain number up front.
  for field in rdma_priority cnp_priority; do
    if ! [[ "${!field}" =~ ^[0-7]$ ]]; then
      echo "error, $field must be in the range of 0-7, but got ${!field}"
      return 1
    fi
  done
  for field in rdma_qos cnp_qos; do
    if ! [[ "${!field}" =~ ^[0-9]+$ ]]; then
      echo "error, $field must be a non-negative integer, but got ${!field}"
      return 1
    fi
  done

  # PFC is enabled on the RDMA and CNP priorities; rendered as "0,0,0,0,0,1,1,0".
  qos_queues[rdma_priority]=1
  qos_queues[cnp_priority]=1
  pfc_queue=$(
    IFS=,
    echo "${qos_queues[*]}"
  )

  echo "Qos Parameters: rdma_priority: $rdma_priority, rdma_qos: $rdma_qos, cnp_priority: $cnp_priority, cnp_qos: $cnp_qos, pfc_queue: $pfc_queue"

  IFS=',' read -r -a nic_array <<<"$nic_csv"
  for nic_item in "${nic_array[@]}"; do
    if [ -z "$nic_item" ]; then
      echo "warn, nic_item is empty, skip ..."
      continue
    fi

    if ! ip link show "$nic_item" &>/dev/null; then
      echo "warn, device $nic_item does not exist, ignore setting qos"
      continue
    fi

    # ibdev2netdev prints "<rdma_dev> port <n> ==> <netdev> (<state>)".
    # Match the netdev column exactly (forced string comparison) so that
    # "eth1" is not resolved through the line that belongs to "eth10".
    rdma_dev=$(ibdev2netdev |
      awk -v nic="$nic_item" '($5 "") == (nic "") { print $1; exit }')
    if [ -z "$rdma_dev" ]; then
      echo "warn, rdma device does not exist for $nic_item, is it an rdma nic?"
      continue
    fi

    if ip a show "$nic_item" | grep -q link/infiniband; then
      echo "warn, device $nic_item is an infiniband nic, it should be an rdma roce nic"
      continue
    fi

    echo -e "\e[31minfo, start to apply QoS and ecn for nic $nic_item, rdma device $rdma_dev ...\e[0m"
    mlnx_qos -i "$nic_item" --trust=dscp --pfc "${pfc_queue}"
    write_sysfs_value 1 "/sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority"
    write_sysfs_value 1 "/sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority"
    write_sysfs_value "$cnp_qos" "/sys/class/net/$nic_item/ecn/roce_np/cnp_dscp"

    echo -e "\e[31minfo, start to apply cma_roce_tos for port ${rdma_dev}\e[0m"
    # The traffic class is the QoS value shifted left by two bits; 10# forces
    # base 10 so a value with a leading zero is not parsed as octal.
    traffic_class=$((10#$rdma_qos << 2))
    echo "cma_roce_tos -d $rdma_dev -t $traffic_class"
    cma_roce_tos -d "$rdma_dev" -t "$traffic_class"
    cma_roce_mode -d "$rdma_dev" -p 1 -m 2
    write_sysfs_value "$traffic_class" "/sys/class/infiniband/$rdma_dev/tc/1/traffic_class"
  done
}
# The group descriptors arrive as environment variables; the file checked
# here is the one the installer writes them to.
[ -f /etc/systemd/system/rdma-qos.d/10-qos.conf ] || {
  echo "error, /etc/systemd/system/rdma-qos.d/10-qos.conf is not found"
  exit 1
}

# Nothing to do unless at least one group descriptor is present.
if [ -z "$GROUP_GPU_CONFIG" ] && [ -z "$GROUP_STORAGE_CONFIG" ]; then
  echo "error, GROUP_GPU_CONFIG and GROUP_STORAGE_CONFIG cannot be empty at the same time, at least one of them needs to be configured"
  exit 1
fi

# GPU group.
if [ -z "$GROUP_GPU_CONFIG" ]; then
  echo "GROUP_GPU_CONFIG is empty, this isn't GPU Server, no need to config RDMA QoS"
else
  echo "Config RDMA QoS for GPU group: $GROUP_GPU_CONFIG"
  set_rdma_qos "$GROUP_GPU_CONFIG"
fi

# Storage group.
if [ -z "$GROUP_STORAGE_CONFIG" ]; then
  echo "GROUP_STORAGE_CONFIG is empty, this isn't Storage Server, no need to config RDMA QoS"
else
  echo "Config RDMA QoS for storage group: $GROUP_STORAGE_CONFIG"
  set_rdma_qos "$GROUP_STORAGE_CONFIG"
fi

# Finally enable ECN for TCP through sysctl.
sysctl -w net.ipv4.tcp_ecn=1
S_EOF | ||
|
||
# Make the generated QoS script executable.
chmod +x /usr/local/bin/rdma_qos.sh

printf '\e[31m Prepare rdma qos systemd unit file \e[0m\n'

# Service unit: runs the generated script with the group descriptors loaded
# from the EnvironmentFile. The quoted delimiter keeps the text literal.
cat >/etc/systemd/system/rdma-qos.service <<'SYS_EOF'
[Unit]
Description=RDMA QoS Configuration Service
After=network.target
[Service]
Type=simple
ExecStart=/bin/bash /usr/local/bin/rdma_qos.sh
StandardOutput=journal
StandardError=journal
EnvironmentFile=/etc/systemd/system/rdma-qos.d/10-qos.conf
[Install]
WantedBy=multi-user.target
SYS_EOF

printf '\e[31m Prepare rdma qos timer unit file \e[0m\n'

# Timer unit: fires one minute after boot and then five minutes after each
# activation of the unit it triggers.
cat >/etc/systemd/system/rdma-qos.timer <<'T_EOF'
[Unit]
Description=Timer for RDMA QoS Configuration Service
[Timer]
OnBootSec=1min
OnUnitActiveSec=5min
[Install]
WantedBy=timers.target
T_EOF
|
||
echo -e "\e[31mStart rdma-qos systemd service \e[0m"
# Pick up the unit files written above, then enable and start both units.
systemctl daemon-reload
systemctl enable rdma-qos.service
systemctl enable rdma-qos.timer
systemctl start rdma-qos.service
systemctl start rdma-qos.timer

# Informational only. The service is Type=simple and runs a script that exits
# when it is done, so by now the unit may already be inactive, in which case
# `systemctl status` returns a non-zero status. Without the guard, errexit
# would abort the installer here even though everything was set up.
# --no-pager keeps the call from blocking on a pager in interactive terminals.
systemctl status --no-pager rdma-qos.service || true

echo -e "\e[31mDone \e[0m"
Oops, something went wrong.