-
Notifications
You must be signed in to change notification settings - Fork 82
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Docs: add mtu size configuration for SpiderMultusConfig
Signed-off-by: Cyclinder Kuo <[email protected]>
- Loading branch information
Showing
7 changed files
with
838 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,328 @@ | ||
#!/bin/bash | ||
# Copyright 2025 Authors of spidernet-io | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
# Tracing is disabled by default; uncomment one of the next two lines to debug.
# set -x
# set -o xtrace
# Abort as soon as an unguarded command fails.
set -o errexit

echo -e "\e[31m Prepare rdma qos script \e[0m"
|
||
#######################################
# Pre-flight check that every NIC in a comma-separated list exists, is backed
# by an RDMA device, is not an InfiniBand link, and exposes the sysfs entries
# that the QoS/ECN configuration writes to.
# Arguments:
#   $1 - comma-separated list of network interface names
#   $2 - RDMA priority whose per-priority ECN enable files must exist
# Outputs:
#   Progress and error messages on stdout.
# Returns:
#   0 when every NIC passes. Exits the shell with status 1 on a missing
#   argument, unknown NIC, missing RDMA device or InfiniBand link; returns 1
#   when a required sysfs file is missing.
#######################################
function validate_nic() {
    local nic_list=$1
    if [ -z "$nic_list" ]; then
        echo "error, nic list is empty"
        exit 1
    fi

    local rdma_priority=$2
    if [ -z "$rdma_priority" ]; then
        echo "error, rdma_priority is empty"
        exit 1
    fi

    local -a gpu_nic_array
    local nic_item rdma_dev required_file

    # nic_list is a comma-separated list of NICs
    IFS=',' read -r -a gpu_nic_array <<<"$nic_list"
    for nic_item in "${gpu_nic_array[@]}"; do
        echo "prechecking for device: $nic_item"
        if [ -z "$nic_item" ]; then
            echo "error, invalid nic name"
            exit 1
        fi

        ip link show "$nic_item" &>/dev/null || {
            echo "error, device $nic_item does not exist"
            exit 1
        }

        # Compare the netdev column of "<rdma_dev> port <n> ==> <netdev> (<state>)"
        # exactly; a substring grep would let "eth1" match "eth10".
        rdma_dev=$(ibdev2netdev | awk -v nic="$nic_item" '$5 == nic {print $1}')
        [[ -n "$rdma_dev" ]] || {
            echo "error, rdma device does not exist for $nic_item, is it an rdma nic?"
            exit 1
        }

        if ip a show "$nic_item" | grep link/infiniband &>/dev/null; then
            echo "error, device $nic_item is an infiniband nic, it should be an rdma roce nic"
            exit 1
        fi

        # Every file written later by the QoS script must already be present.
        for required_file in \
            "/sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority" \
            "/sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority" \
            "/sys/class/net/$nic_item/ecn/roce_np/cnp_dscp" \
            "/sys/class/infiniband/$rdma_dev/tc/1/traffic_class"; do
            if ! [ -f "$required_file" ]; then
                echo "error, $required_file is not found"
                return 1
            fi
        done
        echo "device $nic_item is ready"
    done
}
|
||
# Tunables, all overridable from the environment.
# *_NIC_LIST      comma-separated NIC names of the group (empty = group unused)
# *_RDMA_PRIORITY priority used for RDMA traffic
# *_CNP_PRIORITY  priority used for CNP traffic
# *_RDMA_QOS / *_CNP_QOS  values derived later from the priority when left empty
GPU_NIC_LIST=${GPU_NIC_LIST:-""}
GPU_RDMA_PRIORITY=${GPU_RDMA_PRIORITY:-5}
GPU_RDMA_QOS=${GPU_RDMA_QOS:-""}
GPU_CNP_PRIORITY=${GPU_CNP_PRIORITY:-6}
# Honour GPU_CNP_QOS itself; GPU_NIC_QOS is the misspelled name that used to be
# read here and is kept only as a fallback for existing callers.
GPU_CNP_QOS=${GPU_CNP_QOS:-${GPU_NIC_QOS:-""}}
STORAGE_NIC_LIST=${STORAGE_NIC_LIST:-""}
STORAGE_RDMA_PRIORITY=${STORAGE_RDMA_PRIORITY:-5}
STORAGE_RDMA_QOS=${STORAGE_RDMA_QOS:-""}
STORAGE_CNP_PRIORITY=${STORAGE_CNP_PRIORITY:-6}
STORAGE_CNP_QOS=${STORAGE_CNP_QOS:-""}
|
||
# At least one NIC group has to be configured.
if [ -z "$GPU_NIC_LIST" ] && [ -z "$STORAGE_NIC_LIST" ]; then
    echo "error, GPU_NIC_LIST and STORAGE_NIC_LIST cannot be empty at the same time, at least one of them needs to be configured"
    exit 1
fi

# Every priority must be a single digit in the range 0-7.
for priority_var in GPU_RDMA_PRIORITY GPU_CNP_PRIORITY STORAGE_RDMA_PRIORITY STORAGE_CNP_PRIORITY; do
    # ${!priority_var} reads the variable whose name is stored in priority_var.
    if ! [[ "${!priority_var}" =~ ^[0-7]$ ]]; then
        echo "error, $priority_var must be in the range of 0-7, but got ${!priority_var}"
        exit 1
    fi
done
unset priority_var

# RDMA and CNP traffic of one group must not share a priority.
if [[ "$GPU_RDMA_PRIORITY" -eq "$GPU_CNP_PRIORITY" ]]; then
    echo "error, GPU_RDMA_PRIORITY and GPU_CNP_PRIORITY cannot be the same"
    exit 1
fi

if [[ "$STORAGE_RDMA_PRIORITY" -eq "$STORAGE_CNP_PRIORITY" ]]; then
    echo "error, STORAGE_RDMA_PRIORITY and STORAGE_CNP_PRIORITY cannot be the same"
    exit 1
fi
|
||
# ##################### wait until all tools are ready ################################
# command -v is the shell builtin for "is this on PATH"; it avoids depending on
# an external `which` binary.
if ! command -v mlnx_qos >/dev/null; then
    echo "mlnx_qos is not ready..."
    exit 1
fi
echo "mlnx_qos is ready"

# ibdev2netdev and cma_roce_tos are actually executed so that a broken
# installation is detected, not just a missing binary.
if ! ibdev2netdev >/dev/null; then
    echo "ibdev2netdev is not ready..."
    exit 1
fi
echo "ibdev2netdev is ready"

if ! cma_roce_tos >/dev/null; then
    echo "cma_roce_tos is not ready..."
    exit 1
fi
echo "cma_roce_tos is ready"

# The generated QoS script also calls cma_roce_mode; only its presence is
# probed here so that no mode is changed during the check.
if ! command -v cma_roce_mode >/dev/null; then
    echo "cma_roce_mode is not ready..."
    exit 1
fi
echo "cma_roce_mode is ready"
|
||
# Validate only the groups that are actually configured. A failing validation
# aborts the script because errexit is active.
if [ -n "$GPU_NIC_LIST" ]; then
    validate_nic "$GPU_NIC_LIST" "$GPU_RDMA_PRIORITY"
fi
if [ -n "$STORAGE_NIC_LIST" ]; then
    validate_nic "$STORAGE_NIC_LIST" "$STORAGE_RDMA_PRIORITY"
fi

# When a QoS value was not supplied, derive it as priority * 8.
if [ -z "$GPU_RDMA_QOS" ]; then
    GPU_RDMA_QOS=$((GPU_RDMA_PRIORITY * 8))
fi
if [ -z "$GPU_CNP_QOS" ]; then
    GPU_CNP_QOS=$((GPU_CNP_PRIORITY * 8))
fi
if [ -z "$STORAGE_RDMA_QOS" ]; then
    STORAGE_RDMA_QOS=$((STORAGE_RDMA_PRIORITY * 8))
fi
if [ -z "$STORAGE_CNP_QOS" ]; then
    STORAGE_CNP_QOS=$((STORAGE_CNP_PRIORITY * 8))
fi

# RDMA and CNP traffic of one group must not end up with the same QoS value.
if [[ "$GPU_RDMA_QOS" -eq "$GPU_CNP_QOS" ]]; then
    echo "error, GPU_RDMA_QOS and GPU_CNP_QOS cannot be the same"
    exit 1
fi

if [[ "$STORAGE_RDMA_QOS" -eq "$STORAGE_CNP_QOS" ]]; then
    echo "error, STORAGE_RDMA_QOS and STORAGE_CNP_QOS cannot be the same"
    exit 1
fi

echo "debug, GPU_NIC_LIST=$GPU_NIC_LIST, GPU_RDMA_PRIORITY=$GPU_RDMA_PRIORITY, GPU_CNP_PRIORITY=$GPU_CNP_PRIORITY, GPU_RDMA_QOS=$GPU_RDMA_QOS, GPU_CNP_QOS=$GPU_CNP_QOS"
echo "debug, STORAGE_NIC_LIST=$STORAGE_NIC_LIST, STORAGE_RDMA_PRIORITY=$STORAGE_RDMA_PRIORITY, STORAGE_CNP_PRIORITY=$STORAGE_CNP_PRIORITY, STORAGE_RDMA_QOS=$STORAGE_RDMA_QOS, STORAGE_CNP_QOS=$STORAGE_CNP_QOS"
|
||
# Create (truncate) /usr/local/bin/rdma_qos.sh and write its header. The
# heredoc delimiter is unquoted on purpose: the variables are expanded now, so
# the generated script carries the values resolved by this installer.
cat <<G_EOF >/usr/local/bin/rdma_qos.sh
#!/bin/bash
# set -x
# set -o xtrace
set -o errexit
RUN_ONCE=true
GROUP_GPU="gpu"
GPU_NIC_LIST="$GPU_NIC_LIST"
GPU_RDMA_PRIORITY="$GPU_RDMA_PRIORITY"
GPU_CNP_PRIORITY="$GPU_CNP_PRIORITY"
GPU_RDMA_QOS="$GPU_RDMA_QOS"
GPU_CNP_QOS="$GPU_CNP_QOS"
GROUP_STORAGE="storage"
STORAGE_NIC_LIST="$STORAGE_NIC_LIST"
STORAGE_RDMA_PRIORITY="$STORAGE_RDMA_PRIORITY"
STORAGE_CNP_PRIORITY="$STORAGE_CNP_PRIORITY"
STORAGE_RDMA_QOS="$STORAGE_RDMA_QOS"
STORAGE_CNP_QOS="$STORAGE_CNP_QOS"
G_EOF
|
||
cat <<"S_EOF" >>/usr/local/bin/rdma_qos.sh | ||
#######################################
# Apply PFC, ECN and traffic-class settings to every NIC of one group.
# Arguments:
#   $1 - comma-separated list of network interface names
#   $2 - RDMA priority (index of the PFC queue to enable, ECN enable file name)
#   $3 - RDMA QoS value; shifted left by 2 to form the traffic class
#   $4 - CNP priority (only validated and logged here)
#   $5 - CNP QoS value written to cnp_dscp
# Outputs:
#   Progress, the commands being run, and warnings on stdout.
# Returns:
#   1 when an argument is empty, 0 otherwise. A NIC that is empty, missing,
#   has no RDMA device or is an InfiniBand link is skipped with a warning so
#   that the periodic loop keeps running for the remaining NICs.
#######################################
function set_rdma_qos() {
    local nic_list=$1
    if [ -z "$nic_list" ]; then
        echo "error, nic_list is empty"
        return 1
    fi
    local rdma_priority=$2
    if [ -z "$rdma_priority" ]; then
        echo "error, rdma_priority is empty"
        return 1
    fi
    local rdma_qos=$3
    if [ -z "$rdma_qos" ]; then
        echo "error, rdma_qos is empty"
        return 1
    fi
    local cnp_priority=$4
    if [ -z "$cnp_priority" ]; then
        echo "error, cnp_priority is empty"
        return 1
    fi
    local cnp_qos=$5
    if [ -z "$cnp_qos" ]; then
        echo "error, cnp_qos is empty"
        return 1
    fi

    # Build the --pfc argument: eight comma-separated flags, only the RDMA
    # priority enabled.
    local -a qos_queues=(0 0 0 0 0 0 0 0)
    qos_queues[$rdma_priority]=1
    local pfc_queue
    pfc_queue=$(IFS=,; echo "${qos_queues[*]}")
    echo "Qos Parameters: rdma_priority: $rdma_priority, rdma_qos: $rdma_qos, cnp_priority: $cnp_priority, cnp_qos: $cnp_qos, pfc_queue: $pfc_queue"

    local -a gpu_nic_array
    local nic_item rdma_dev traffic_class

    # nic_list is a comma-separated list of NICs
    IFS=',' read -r -a gpu_nic_array <<<"$nic_list"
    for nic_item in "${gpu_nic_array[@]}"; do
        if [ -z "$nic_item" ]; then
            echo "warn, nic_item is empty, skip ..."
            continue
        fi

        if ! ip link show "$nic_item" &>/dev/null; then
            echo "warn, device $nic_item does not exist, ignore setting qos"
            continue
        fi

        # Compare the netdev column exactly; a substring grep would let "eth1"
        # match "eth10" and select the wrong (or several) RDMA devices.
        rdma_dev=$(ibdev2netdev | awk -v nic="$nic_item" '$5 == nic {print $1}')
        if [[ -z "$rdma_dev" ]]; then
            echo "warn, rdma device does not exist for $nic_item, is it an rdma nic?"
            continue
        fi

        if ip a show "$nic_item" | grep link/infiniband &>/dev/null; then
            echo "warn, device $nic_item is an infiniband nic, it should be an rdma roce nic"
            continue
        fi

        echo -e "\e[31minfo, start to apply QoS and ecn for nic $nic_item, rdma device $rdma_dev ...\e[0m"
        mlnx_qos -i "$nic_item" --trust=dscp --pfc "${pfc_queue}"

        # Each write is echoed first so the journal shows exactly what was applied.
        echo "echo 1 >/sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority"
        echo 1 >"/sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority"
        echo "echo 1 >/sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority"
        echo 1 >"/sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority"
        echo "echo $cnp_qos >/sys/class/net/$nic_item/ecn/roce_np/cnp_dscp"
        echo "$cnp_qos" >"/sys/class/net/$nic_item/ecn/roce_np/cnp_dscp"

        echo -e "\e[31minfo, start to apply cma_roce_tox for port ${rdma_dev}\e[0m"
        traffic_class=$((rdma_qos << 2))
        echo "cma_roce_tos -d $rdma_dev -t $traffic_class"
        cma_roce_tos -d "$rdma_dev" -t "$traffic_class"
        cma_roce_mode -d "$rdma_dev" -p 1 -m 2
        echo "echo $traffic_class >/sys/class/infiniband/$rdma_dev/tc/1/traffic_class"
        echo "$traffic_class" >"/sys/class/infiniband/$rdma_dev/tc/1/traffic_class"
    done
}
# Refuse to run when neither NIC group is configured.
if [ -z "$GPU_NIC_LIST" ] && [ -z "$STORAGE_NIC_LIST" ]; then
    echo "error, GPU_NIC_LIST and STORAGE_NIC_LIST cannot be empty at the same time, at least one of them needs to be configured"
    exit 1
fi

# Apply the settings for each configured group, then either stop
# (RUN_ONCE=true) or re-apply them every 60 seconds.
while true ; do
    if [ -n "$GPU_NIC_LIST" ] ; then
        echo "Config RDMA QoS for GPU group, GPU_NIC_LIST: $GPU_NIC_LIST, GPU_RDMA_PRIORITY: $GPU_RDMA_PRIORITY, GPU_RDMA_QOS: $GPU_RDMA_QOS, GPU_CNP_PRIORITY: $GPU_CNP_PRIORITY, GPU_CNP_QOS: $GPU_CNP_QOS"
        # Quoted so that an empty value stays in its own position instead of
        # shifting the following arguments.
        set_rdma_qos "$GPU_NIC_LIST" "$GPU_RDMA_PRIORITY" "$GPU_RDMA_QOS" "$GPU_CNP_PRIORITY" "$GPU_CNP_QOS"
    else
        echo "No nics configured for Group GPU, no need to config RDMA QoS"
    fi

    if [ -n "$STORAGE_NIC_LIST" ] ; then
        echo "Config RDMA QoS for storage group, STORAGE_NIC_LIST: $STORAGE_NIC_LIST, STORAGE_RDMA_PRIORITY: $STORAGE_RDMA_PRIORITY, STORAGE_RDMA_QOS: $STORAGE_RDMA_QOS, STORAGE_CNP_PRIORITY: $STORAGE_CNP_PRIORITY, STORAGE_CNP_QOS: $STORAGE_CNP_QOS"
        set_rdma_qos "$STORAGE_NIC_LIST" "$STORAGE_RDMA_PRIORITY" "$STORAGE_RDMA_QOS" "$STORAGE_CNP_PRIORITY" "$STORAGE_CNP_QOS"
    else
        echo "No nics configured for Group Storage, no need to config RDMA QoS"
    fi

    sysctl -w net.ipv4.tcp_ecn=1

    if [ "$RUN_ONCE" = true ] ; then
        exit 0
    fi
    sleep 60
done
S_EOF | ||
|
||
chmod +x /usr/local/bin/rdma_qos.sh
# Run the generated script once in the foreground; with errexit active a
# failure here aborts the installation before the service is created.
/usr/local/bin/rdma_qos.sh

# Flip the flag in the generated script so the copy started by systemd keeps
# looping instead of exiting after the first pass.
sed -i 's?RUN_ONCE=true?RUN_ONCE=false?' /usr/local/bin/rdma_qos.sh
echo -e "\e[31m Prepare rdma qos systemd unit file \e[0m"

# The delimiter is quoted: the unit file is written verbatim, without expansion.
cat <<"SYS_EOF" >/etc/systemd/system/rdma-qos.service
[Unit]
Description=RDMA QoS Configuration Service
After=network.target
[Service]
Type=simple
ExecStart=/bin/bash /usr/local/bin/rdma_qos.sh
StandardOutput=journal
StandardError=journal
[Install]
WantedBy=multi-user.target
SYS_EOF

echo -e "\e[31m Start rdma-qos systemd service \e[0m"
systemctl daemon-reload
systemctl enable rdma-qos.service
systemctl restart rdma-qos.service
echo -e "\e[31m Done \e[0m"

# Last command: its exit status becomes the exit status of the installer.
systemctl status rdma-qos.service
Oops, something went wrong.