diff --git a/docs/example/qos/rdma-qos.sh b/docs/example/qos/rdma-qos.sh
new file mode 100644
index 0000000000..a015af39b6
--- /dev/null
+++ b/docs/example/qos/rdma-qos.sh
@@ -0,0 +1,328 @@
+#!/bin/bash
+# Copyright 2025 Authors of spidernet-io
+# SPDX-License-Identifier: Apache-2.0
+
+# set -x
+# set -o xtrace
+set -o errexit
+
+echo -e "\e[31m Prepare rdma qos script \e[0m"
+
+function validate_nic() {
+    nic_list=$1
+    if [ -z "$nic_list" ]; then
+        echo "error, nic list is empty"
+        exit 1
+    fi
+
+    rdma_priority=$2
+    if [ -z "$rdma_priority" ]; then
+        echo "error, rdma_priority is empty"
+        exit 1
+    fi
+
+    # nic_list is a comma-separated list of NICs
+    IFS=',' read -r -a gpu_nic_array <<<"$nic_list"
+    for nic_item in "${gpu_nic_array[@]}"; do
+        # Perform operations on each NIC
+        echo "prechecking for device: $nic_item"
+        if [ -z "$nic_item" ]; then
+            echo "error, invalid nic name"
+            exit 1
+        fi
+
+        ip link show "$nic_item" &>/dev/null || {
+            echo "error, device $nic_item does not exist"
+            exit 1
+        }
+
+        rdma_dev=$(grep "$nic_item" <<<$(ibdev2netdev) | awk '{print $1}')
+        [[ -n "$rdma_dev" ]] || {
+            echo "error, rdma device does not exist for $nic_item, is it an rdma nic?"
+            exit 1
+        }
+
+        ip a show "$nic_item" | grep link/infiniband &>/dev/null && {
+            echo "error, device $nic_item is an infiniband nic, it should be an rdma roce nic"
+            exit 1
+        }
+
+        if ! [ -f /sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority ]; then
+            echo "error, /sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority is not found"
+            return 1
+        fi
+
+        if ! [ -f /sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority ]; then
+            echo "error, /sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority is not found"
+            return 1
+        fi
+
+        if ! [ -f /sys/class/net/$nic_item/ecn/roce_np/cnp_dscp ]; then
+            echo "error, /sys/class/net/$nic_item/ecn/roce_np/cnp_dscp is not found"
+            return 1
+        fi
+
+        if ! [ -f /sys/class/infiniband/$rdma_dev/tc/1/traffic_class ]; then
+            echo "error, /sys/class/infiniband/$rdma_dev/tc/1/traffic_class is not found"
+            return 1
+        fi
+        echo "device $nic_item is ready"
+    done
+}
+
+GPU_NIC_LIST=${GPU_NIC_LIST:-""}
+GPU_RDMA_PRIORITY=${GPU_RDMA_PRIORITY:-5}
+GPU_RDMA_QOS=${GPU_RDMA_QOS:-""}
+GPU_CNP_PRIORITY=${GPU_CNP_PRIORITY:-6}
+GPU_CNP_QOS=${GPU_CNP_QOS:-""}
+STORAGE_NIC_LIST=${STORAGE_NIC_LIST:-""}
+STORAGE_RDMA_PRIORITY=${STORAGE_RDMA_PRIORITY:-5}
+STORAGE_RDMA_QOS=${STORAGE_RDMA_QOS:-""}
+STORAGE_CNP_PRIORITY=${STORAGE_CNP_PRIORITY:-6}
+STORAGE_CNP_QOS=${STORAGE_CNP_QOS:-""}
+
+[ -z "$GPU_NIC_LIST" ] && [ -z "$STORAGE_NIC_LIST" ] && {
+    echo "error, GPU_NIC_LIST and STORAGE_NIC_LIST cannot be empty at the same time, at least one of them needs to be configured"
+    exit 1
+}
+
+# Check if ecn_priority is within the range 0-7
+if ! [[ "$GPU_RDMA_PRIORITY" =~ ^[0-7]$ ]]; then
+    echo "error, GPU_RDMA_PRIORITY must be in the range of 0-7, but got $GPU_RDMA_PRIORITY"
+    exit 1
+fi
+
+if ! [[ "$GPU_CNP_PRIORITY" =~ ^[0-7]$ ]]; then
+    echo "error, GPU_CNP_PRIORITY must be in the range of 0-7, but got $GPU_CNP_PRIORITY"
+    exit 1
+fi
+
+if ! [[ "$STORAGE_RDMA_PRIORITY" =~ ^[0-7]$ ]]; then
+    echo "error, STORAGE_RDMA_PRIORITY must be in the range of 0-7, but got $STORAGE_RDMA_PRIORITY"
+    exit 1
+fi
+
[[ "$STORAGE_CNP_PRIORITY" =~ ^[0-7]$ ]]; then + echo "error, STORAGE_CNP_PRIORITY must be in the range of 0-7, but got $STORAGE_CNP_PRIORITY" + exit 1 +fi + +if [[ "$GPU_RDMA_PRIORITY" -eq "$GPU_CNP_PRIORITY" ]]; then + echo "error, GPU_RDMA_PRIORITY and GPU_CNP_PRIORITY cannot be the same" + exit 1 +fi + +if [[ "$STORAGE_RDMA_PRIORITY" -eq "$STORAGE_CNP_PRIORITY" ]]; then + echo "error, STORAGE_RDMA_PRIORITY and STORAGE_CNP_PRIORITY cannot be the same" + exit 1 +fi + +# ##################### wait unit all tools are ready ################################ +if ! which mlnx_qos >/dev/null; then + echo "mlnx_qos is not ready..." + exit 1 +fi +echo "mlnx_qos is ready" + +if ! ibdev2netdev >/dev/null; then + echo "ibdev2netdev is not ready..." + exit 1 +fi +echo "ibdev2netdev is ready" + +if ! cma_roce_tos >/dev/null; then + echo "cma_roce_tos is not ready..." + exit 1 +fi +echo "cma_roce_tos is ready" + +[ -n "$GPU_NIC_LIST" ] && validate_nic "$GPU_NIC_LIST" "$GPU_RDMA_PRIORITY" +[ -n "$STORAGE_NIC_LIST" ] && validate_nic "$STORAGE_NIC_LIST" "$STORAGE_RDMA_PRIORITY" + +[ -z "$GPU_RDMA_QOS" ] && GPU_RDMA_QOS=$((GPU_RDMA_PRIORITY * 8)) +[ -z "$GPU_CNP_QOS" ] && GPU_CNP_QOS=$((GPU_CNP_PRIORITY * 8)) +[ -z "$STORAGE_RDMA_QOS" ] && STORAGE_RDMA_QOS=$((STORAGE_RDMA_PRIORITY * 8)) +[ -z "$STORAGE_CNP_QOS" ] && STORAGE_CNP_QOS=$((STORAGE_CNP_PRIORITY * 8)) + +if [[ "$GPU_RDMA_QOS" -eq "$GPU_CNP_QOS" ]]; then + echo "error, GPU_RDMA_QOS and GPU_CNP_QOS cannot be the same" + exit 1 +fi + +if [[ "$STORAGE_RDMA_QOS" -eq "$STORAGE_CNP_QOS" ]]; then + echo "error, STORAGE_RDMA_QOS and STORAGE_CNP_QOS cannot be the same" + exit 1 +fi + +echo "debug, GPU_NIC_LIST=$GPU_NIC_LIST, GPU_RDMA_PRIORITY=$GPU_RDMA_PRIORITY, GPU_CNP_PRIORITY=$GPU_CNP_PRIORITY, GPU_RDMA_QOS=$GPU_RDMA_QOS, GPU_CNP_QOS=$GPU_CNP_QOS" +echo "debug, STORAGE_NIC_LIST=$STORAGE_NIC_LIST, STORAGE_RDMA_PRIORITY=$STORAGE_RDMA_PRIORITY, STORAGE_CNP_PRIORITY=$STORAGE_CNP_PRIORITY, STORAGE_RDMA_QOS=$STORAGE_RDMA_QOS, STORAGE_CNP_QOS=$STORAGE_CNP_QOS" + +cat </usr/local/bin/rdma_qos.sh +#!/bin/bash + +# set -x +# set -o xtrace +set -o errexit + +RUN_ONCE=true +GROUP_GPU="gpu" +GPU_NIC_LIST="$GPU_NIC_LIST" +GPU_RDMA_PRIORITY="$GPU_RDMA_PRIORITY" +GPU_CNP_PRIORITY="$GPU_CNP_PRIORITY" +GPU_RDMA_QOS="$GPU_RDMA_QOS" +GPU_CNP_QOS="$GPU_CNP_QOS" + +GROUP_STORAGE="storage" +STORAGE_NIC_LIST="$STORAGE_NIC_LIST" +STORAGE_RDMA_PRIORITY="$STORAGE_RDMA_PRIORITY" +STORAGE_CNP_PRIORITY="$STORAGE_CNP_PRIORITY" +STORAGE_RDMA_QOS="$STORAGE_RDMA_QOS" +STORAGE_CNP_QOS="$STORAGE_CNP_QOS" +G_EOF + +cat <<"S_EOF" >>/usr/local/bin/rdma_qos.sh + +function set_rdma_qos() { + # $GPU_NIC_LIST $GPU_RDMA_PRIORITY $GPU_RDMA_QOS $GPU_CNP_PRIORITY $GPU_CNP_QOS + nic_list=$1 + if [ -z "$nic_list" ]; then + echo "error, nic_list is empty" + return 1 + fi + + rdma_priority=$2 + if [ -z "$rdma_priority" ]; then + echo "error, rdma_priority is empty" + return 1 + fi + + rdma_qos=$3 + if [ -z "$rdma_qos" ]; then + echo "error, rdma_qos is empty" + return 1 + fi + + cnp_priority=$4 + if [ -z "$cnp_priority" ]; then + echo "error, cnp_priority is empty" + return 1 + fi + + cnp_qos=$5 + if [ -z "$cnp_qos" ]; then + echo "error, cnp_qos is empty" + return 1 + fi + + qos_queues=(0 0 0 0 0 0 0 0) + qos_queues[$rdma_priority]=1 + pfc_queue=$(echo "${qos_queues[*]}" | sed 's? 
+    echo "Qos Parameters: rdma_priority: $rdma_priority, rdma_qos: $rdma_qos, cnp_priority: $cnp_priority, cnp_qos: $cnp_qos, pfc_queue: $pfc_queue"
+
+    # nic_list is a comma-separated list of NICs
+    IFS=',' read -r -a gpu_nic_array <<<"$nic_list"
+    for nic_item in "${gpu_nic_array[@]}"; do
+        if [ -z "$nic_item" ]; then
+            echo "warn, nic_item is empty, skip ..."
+            continue
+        fi
+
+        ip link show "$nic_item" &>/dev/null || {
+            echo "warn, device $nic_item does not exist, ignore setting qos"
+            continue
+        }
+
+        rdma_dev=$(grep "$nic_item" <<<$(ibdev2netdev) | awk '{print $1}')
+        [[ -n "$rdma_dev" ]] || {
+            echo "warn, rdma device does not exist for $nic_item, is it an rdma nic? ignore setting qos"
+            continue
+        }
+
+        ip a show "$nic_item" | grep link/infiniband &>/dev/null && {
+            echo "warn, device $nic_item is an infiniband nic, it should be an rdma roce nic, ignore setting qos"
+            continue
+        }
+
+        echo -e "\e[31minfo, start to apply QoS and ecn for nic $nic_item, rdma device $rdma_dev ...\e[0m"
+        # trust DSCP and enable PFC only on the rdma priority queue
+        mlnx_qos -i "$nic_item" --trust=dscp --pfc ${pfc_queue}
+
+        echo "echo 1 >/sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority"
+        echo 1 >/sys/class/net/$nic_item/ecn/roce_np/enable/$rdma_priority
+
+        echo "echo 1 >/sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority"
+        echo 1 >/sys/class/net/$nic_item/ecn/roce_rp/enable/$rdma_priority
+
+        echo "echo $cnp_qos >/sys/class/net/$nic_item/ecn/roce_np/cnp_dscp"
+        echo $cnp_qos >/sys/class/net/$nic_item/ecn/roce_np/cnp_dscp
+
+        echo -e "\e[31minfo, start to apply cma_roce_tos for port ${rdma_dev}\e[0m"
+        # tos = dscp << 2 (the low 2 bits of the tos byte are the ECN field)
+        traffic_class=$((rdma_qos << 2))
+
+        echo "cma_roce_tos -d $rdma_dev -t $traffic_class"
+        cma_roce_tos -d $rdma_dev -t $traffic_class
+        cma_roce_mode -d $rdma_dev -p 1 -m 2
+
+        echo "echo $traffic_class >/sys/class/infiniband/$rdma_dev/tc/1/traffic_class"
+        echo $traffic_class >/sys/class/infiniband/$rdma_dev/tc/1/traffic_class
+    done
+}
+
+[ -z "$GPU_NIC_LIST" ] && [ -z "$STORAGE_NIC_LIST" ] && {
+    echo "error, GPU_NIC_LIST and STORAGE_NIC_LIST cannot be empty at the same time, at least one of them needs to be configured"
+    exit 1
+}
+
+while true ; do
+
+    if [ -n "$GPU_NIC_LIST" ] ; then
+        echo "Config RDMA QoS for GPU group, GPU_NIC_LIST: $GPU_NIC_LIST, GPU_RDMA_PRIORITY: $GPU_RDMA_PRIORITY, GPU_RDMA_QOS: $GPU_RDMA_QOS, GPU_CNP_PRIORITY: $GPU_CNP_PRIORITY, GPU_CNP_QOS: $GPU_CNP_QOS"
+        set_rdma_qos $GPU_NIC_LIST $GPU_RDMA_PRIORITY $GPU_RDMA_QOS $GPU_CNP_PRIORITY $GPU_CNP_QOS
+    else
+        echo "No nics configured for Group GPU, no need to config RDMA QoS"
+    fi
+
+    if [ -n "$STORAGE_NIC_LIST" ] ; then
+        echo "Config RDMA QoS for storage group, STORAGE_NIC_LIST: $STORAGE_NIC_LIST, STORAGE_RDMA_PRIORITY: $STORAGE_RDMA_PRIORITY, STORAGE_RDMA_QOS: $STORAGE_RDMA_QOS, STORAGE_CNP_PRIORITY: $STORAGE_CNP_PRIORITY, STORAGE_CNP_QOS: $STORAGE_CNP_QOS"
+        set_rdma_qos $STORAGE_NIC_LIST $STORAGE_RDMA_PRIORITY $STORAGE_RDMA_QOS $STORAGE_CNP_PRIORITY $STORAGE_CNP_QOS
+    else
+        echo "No nics configured for Group Storage, no need to config RDMA QoS"
+    fi
+
+    sysctl -w net.ipv4.tcp_ecn=1
+
+    if [ "$RUN_ONCE" = true ] ; then
+        exit 0
+    fi
+
+    sleep 60
+done
+S_EOF
+
+chmod +x /usr/local/bin/rdma_qos.sh
+/usr/local/bin/rdma_qos.sh
+
+sed -i 's?RUN_ONCE=true?RUN_ONCE=false?' 
/usr/local/bin/rdma_qos.sh +echo -e "\e[31m Prepare rdma qos systemd unit file \e[0m" + +cat <<"SYS_EOF" >/etc/systemd/system/rdma-qos.service +[Unit] +Description=RDMA QoS Configuration Service +After=network.target + +[Service] +Type=simple +ExecStart=/bin/bash /usr/local/bin/rdma_qos.sh +StandardOutput=journal +StandardError=journal + +[Install] +WantedBy=multi-user.target +SYS_EOF + +echo -e "\e[31m Start rdma-qos systemd service \e[0m" +systemctl daemon-reload +systemctl enable rdma-qos.service +systemctl restart rdma-qos.service +echo -e "\e[31m Done \e[0m" + +systemctl status rdma-qos.service diff --git a/docs/usage/install/ai/get-started-macvlan-zh_CN.md b/docs/usage/install/ai/get-started-macvlan-zh_CN.md index dd9a062c40..18bbd23bba 100644 --- a/docs/usage/install/ai/get-started-macvlan-zh_CN.md +++ b/docs/usage/install/ai/get-started-macvlan-zh_CN.md @@ -12,6 +12,19 @@ - 在 Infiniband 的 IPOIB 网卡上不支持创建 Macvlan 接口,因此,本方案只能适用在 RoCE 网络场景下,不能使用在 infiniband 网络场景下。 +## 对比 SR-IOV CNI 的 RDMA 方案 + +| 比较维度 | Macvlan 共享 RDMA 方案 | SR-IOV CNI 隔离 RDMA 方案 | +| ----------- | ------------------------------------- | --------------------------------- | +| 网络隔离 | 所有容器共享 RDMA 设备,隔离性较差 | 容器独享 RDMA 设备,隔离性较好 | +| 性能 | 性能较高 | 硬件直通,性能最优 | +| 资源利用率 | 资源利用率较高 | 较低,受硬件支持的 VFs 数量限制 | +| 配置复杂度 | 配置相对简单 | 配置较为复杂,需要硬件支持和配置 | +| 兼容性 | 兼容性较好,适用于大多数环境 | 依赖硬件支持,兼容性较差 | +| 适用场景 | 适用于大多数场景,包括裸金属,虚拟机等 | 只适用于裸金属,不适用于虚拟机场景 | +| 成本 | 成本较低,因为不需要额外的硬件支持 | 成本较高,需要支持 SR-IOV 的硬件设备 | +| 支持 RDMA 协议 | 支持 Roce 协议,不支持 Infiniband 协议 | 支持 Roce 和 Infiniband 协议 | + ## 方案 本文将以如下典型的 AI 集群拓扑为例,介绍如何搭建 Spiderpool。 @@ -83,20 +96,13 @@ ....... ``` - 确认网卡的工作模式,如下输出表示网卡工作在 Ethernet 模式下,可实现 RoCE 通信 + 确认网卡的工作模式为 Ethernet: ```shell $ ibstat mlx5_0 | grep "Link layer" Link layer: Ethernet ``` - 如下输出表示网卡工作在 Infiniband 模式下,可实现 Infiniband 通信 - - ```shell - $ ibstat mlx5_0 | grep "Link layer" - Link layer: InfiniBand - ``` - 如果网卡没有工作在预期的模式下,请输入如下命令,确认网卡支持配置 LINK_TYPE 参数,如果没有该参数,请更换支持的网卡型号 ```shell @@ -113,7 +119,53 @@ LINK_TYPE_P1 IB(1) ``` -3. 开启 [GPUDirect RMDA](https://docs.nvidia.com/cuda/gpudirect-rdma/) 功能 +3. (可选)更改主机网卡的 MTU 大小 + + 在一些特殊的通信场景下,用户需要为主机网卡自定义 MTU 大小以满足不同数据报文通信需求。本文以 Ubuntu 系统为例,主机网卡的 MTU 默认值为 1500,您可以通过以下方式自定义配置主机网卡的 MTU 大小: + + 打开 `netplan` 配置文件,这些文件位于 /etc/netplan/ 目录下,文件名可能是 01-netcfg.yaml 或类似的名称。使用文本编辑器打开文件,例如: + + ```shell + vim /etc/netplan/01-netcfg.yaml + ``` + + 修改文件中的 `network:` 部分中关于 mtu 的配置,例如: + + ```yaml + network: + version: 2 + ethernets: + enp11s0f0np0: + mtu: 8000 + ... + ``` + + 在这个例子中,我们将 `enp11s0f0np0` 的 mtu 设置为 8000,以满足通信需求。保存文件并退出,使用 `netplan apply`应用更改。 + + ``` + $ sudo netplan apply + ``` + + 执行更新后,请检查主机上的 `enp11s0f0np0` 网卡的 mtu 是否已经更新为 8000。 + + ```shell + ~# ip l show enp11s0f0np0 + 6: enp11s0f0np0: mtu 8000 qdisc mq state UP mode DEFAULT group default qlen 1000 + link/ether b8:3f:d2:9f:09:42 brd ff:ff:ff:ff:ff:ff + ... + ``` + +4. 配置主机 RDMA 无损网络 + + 在高性能网络场景下,RDMA 网络对于丢包非常敏感,一旦发生丢包重传,性能会急剧下降。因此要使得 RDMA 网络性能不受影响,丢包率必须保证在 1e-05(十万分之一)以下,最好为零丢包。对于 Roce 网络,可通过 PFC + ECN 机制来保障网络传输过程不丢包。 + + 可参考 [配置 RDMA 无损网络](../../roce-qos-zh_CN.md) + + > 配置无损网络要求网卡必须工作在 RDMA Roce 网络环境下,不能是 Infiniband + > + > 配置无损网络必须要求交换机支持 PFC + ECN 机制,并且配置与主机侧对齐,否则不能工作 + +5. 开启 [GPUDirect RMDA](https://docs.nvidia.com/cuda/gpudirect-rdma/) 功能 在安装或使用 [gpu-operator](https://github.com/NVIDIA/gpu-operator) 过程中 @@ -131,7 +183,7 @@ gdrdrv 24576 0 ``` -4. 确认主机上的 RDMA 子系统为 shared 模式,这是 macvlan 场景下提供 RDMA 设备给容器的要求。 +6. 
确认主机上的 RDMA 子系统为 shared 模式,这是 macvlan 场景下提供 RDMA 设备给容器的要求。 ```shell # Check the current operating mode (the Linux RDMA subsystem operates in shared mode by default): @@ -246,6 +298,31 @@ EOF ``` + 默认情况下,Pod 网卡的 MTU 与其 macvlan 网卡的 MTU 相同。在一些特殊通信场景下,用户需要为 Pod 自定义 MTU 大小以满足不同数据报文通信需求。您可以通过以下方式自定义配置 Pod 的 MTU 大小: + + ```yaml + apiVersion: spiderpool.spidernet.io/v2beta1 + kind: SpiderMultusConfig + metadata: + name: gpu1-macvlan + namespace: spiderpool + spec: + cniType: macvlan + rdmaResourceName: spidernet.io/shared_cx5_gpu1 + macvlan: + master: ["enp11s0f0np0"] + ippools: + ipv4: ["gpu1-net11"] + chainCNIJsonData: + - | + { + "type": "tuning", + "mtu": 1480 + } + ``` + + 注意: MTU 的取值范围不应该大于 macvlan master 网卡的 MTU 值,否则无法创建 Pod。 + ## 创建测试应用 1. 在指定节点上创建一组 DaemonSet 应用 diff --git a/docs/usage/install/ai/get-started-macvlan.md b/docs/usage/install/ai/get-started-macvlan.md index a4c513aa9b..a66412d165 100644 --- a/docs/usage/install/ai/get-started-macvlan.md +++ b/docs/usage/install/ai/get-started-macvlan.md @@ -12,6 +12,19 @@ By using [RDMA shared device plugin](https://github.com/Mellanox/k8s-rdma-shared - Macvlan interfaces cannot be created on an Infiniband IPOIB network card, so this solution is only applicable in RoCE network scenarios and cannot be used in Infiniband network scenarios. +## Comparison of SR-IOV CNI RDMA Solution + +| Comparison Dimension | Macvlan Shared RDMA Solution | SR-IOV CNI Isolated RDMA Solution | +| -------------------- | ---------------------------------- | ---------------------------------- | +| Network Isolation | All containers share RDMA devices, poor isolation | Containers have dedicated RDMA devices, good isolation | +| Performance | High performance | Optimal performance with hardware passthrough | +| Resource Utilization | High resource utilization | Low, limited by the number of supported VFs | +| Configuration Complexity | Relatively simple configuration | More complex configuration, requires hardware support | +| Compatibility | Good compatibility, suitable for most environments | Depends on hardware support, less compatible | +| Applicable Scenarios | Suitable for most scenarios, including bare metal and VMs | Only suitable for bare metal, not for VM scenarios | +| Cost | Low cost, no additional hardware support needed | High cost, requires hardware supporting SR-IOV | +| Support RDMA Protocol | Support Roce protocol, not support Infiniband protocol | Support Roce and Infiniband protocol | + ## Solution This article will introduce how to set up Spiderpool using the following typical AI cluster topology as an example. @@ -113,7 +126,53 @@ The network planning for the cluster is as follows: LINK_TYPE_P1 IB(1) ``` -3. Enable [GPUDirect RDMA](https://docs.nvidia.com/cuda/gpudirect-rdma/) +3. (Optional) Change the MTU size of the host network card + + In some special communication scenarios, users may need to customize the MTU size of the host network card to meet the communication needs of different data packets. + + This document uses the Ubuntu system as an example, where the default MTU value of the host network card is 1500. You can customize the MTU size of the host network card as follows: + + Open the netplan configuration file, which is located in the /etc/netplan/ directory. The filename might be 01-netcfg.yaml or something similar. 
Use a text editor to open the file, for example: + + ```shell + vim /etc/netplan/01-netcfg.yaml + ``` + + Modify the network: section of the file to configure the MTU, for example: + + ```shell + network: + version: 2 + ethernets: + enp11s0f0np0: + mtu: 8000 + ... + ``` + + In this example, we set the MTU of `enp11s0f0np0` to 8000 to meet communication needs. Save the file and exit, then apply the changes using `netplan apply`. + + ```shell + $ sudo netplan apply + ``` + + After executing the update, check if the MTU of the `enp11s0f0np0` network card on the host has been updated to 8000. + + ``` + ~# ip l show enp11s0f0np0 + 6: enp11s0f0np0: mtu 8000 qdisc mq state UP mode DEFAULT group default qlen 1000 + link/ether b8:3f:d2:9f:09:42 brd ff:ff:ff:ff:ff:ff + ... + ``` + +4. Configure Host RDMA Lossless Network + + In high-performance network scenarios, RDMA networks are very sensitive to packet loss. Once packet retransmission occurs, performance will drop sharply. Therefore, to ensure that RDMA network performance is not affected, the packet loss rate must be kept below 1e-05 (one in 100,000), ideally zero packet loss. For RoCE networks, the PFC + ECN mechanism can be used to ensure no packet loss during network transmission. Refer to [RoCE Lossless Network Configuration](../../roce-qos.md) + + > Configuring a lossless network requires an RDMA RoCE network environment and cannot be Infiniband. + > + > Configuring a lossless network requires the switch to support the PFC + ECN mechanism and be aligned with the host side configuration; otherwise, it will not work. + +5. Enable [GPUDirect RDMA](https://docs.nvidia.com/cuda/gpudirect-rdma/) The installation of the [gpu-operator](https://github.com/NVIDIA/gpu-operator): @@ -132,7 +191,7 @@ The network planning for the cluster is as follows: gdrdrv 24576 0 ``` -4. Set the RDMA subsystem on the host to shared mode, allowing containers to independently use shared RDMA device. +6. Set the RDMA subsystem on the host to shared mode, allowing containers to independently use shared RDMA device. ```shell # Check the current operating mode (the Linux RDMA subsystem operates in shared mode by default): @@ -246,6 +305,31 @@ The network planning for the cluster is as follows: EOF ``` + In some special communication scenarios, users need to customize the MTU size for Pods to meet the communication needs of different data packets. You can customize the MTU size for Pods in the following way. + + ```yaml + apiVersion: spiderpool.spidernet.io/v2beta1 + kind: SpiderMultusConfig + metadata: + name: gpu1-macvlan + namespace: spiderpool + spec: + cniType: macvlan + rdmaResourceName: spidernet.io/shared_cx5_gpu1 + macvlan: + master: ["enp11s0f0np0"] + ippools: + ipv4: ["gpu1-net11"] + chainCNIJsonData: + - | + { + "type": "tuning", + "mtu": 1480 + } + ``` + + Note: The MTU value should not exceed the MTU value of the macvlan master network interface, otherwise the Pod cannot be created. + ## Create a Test Application 1. Create a DaemonSet application on specified nodes. diff --git a/docs/usage/install/ai/get-started-sriov-zh_CN.md b/docs/usage/install/ai/get-started-sriov-zh_CN.md index 8d4a6ae287..67deb1aa56 100644 --- a/docs/usage/install/ai/get-started-sriov-zh_CN.md +++ b/docs/usage/install/ai/get-started-sriov-zh_CN.md @@ -20,6 +20,23 @@ Spiderpool 使用了 [sriov-network-operator](https://github.com/k8snetworkplumb 2. 
RoCE 网络场景下, 使用了 [SR-IOV CNI](https://github.com/k8snetworkplumbingwg/sriov-cni) 来暴露宿主机上的 RDMA 网卡给 Pod 使用,暴露 RDMA 资源。可额外使用 [RDMA CNI](https://github.com/k8snetworkplumbingwg/rdma-cni) 来完成 RDMA 设备隔离。 +注意: + +- 基于 SR-IOV 技术给容器提供 RDMA 通信能力只适用于裸金属环境,不适用于虚拟机环境。 + +## 对比 Macvlan CNI 的 RDMA 方案 + +| 比较维度 | Macvlan 共享 RDMA 方案 | SR-IOV CNI 隔离 RDMA 方案 | +| ------------| ------------------------------------- | --------------------------------- | +| 网络隔离 | 所有容器共享 RDMA 设备,隔离性较差 | 容器独享 RDMA 设备,隔离性较好 | +| 性能 | 性能较高 | 硬件直通,性能最优 | +| 资源利用率 | 资源利用率较高 | 较低,受硬件支持的 VFs 数量限制 | +| 配置复杂度 | 配置相对简单 | 配置较为复杂,需要硬件支持和配置 | +| 兼容性 | 兼容性较好,适用于大多数环境 | 依赖硬件支持,兼容性较差 | +| 适用场景 | 适用于大多数场景,包括裸金属,虚拟机等 | 只适用于裸金属,不适用于虚拟机场景 | +| 成本 | 成本较低,因为不需要额外的硬件支持 | 成本较高,需要支持 SR-IOV 的硬件设备 | +| 支持 RDMA 协议 | 支持 Roce 协议,不支持 Infiniband 协议 | 支持 Roce 和 Infiniband 协议 | + ## 方案 本文将以如下典型的 AI 集群拓扑为例,介绍如何搭建 Spiderpool @@ -124,7 +141,52 @@ Spiderpool 使用了 [sriov-network-operator](https://github.com/k8snetworkplumb LINK_TYPE_P1 IB(1) ``` -3. 开启 [GPUDirect RMDA](https://docs.nvidia.com/cuda/gpudirect-rdma/) 功能 +3. (可选)更改主机网卡的 MTU 大小 + + 在一些特殊的通信场景下,用户需要为主机网卡自定义 MTU 大小以满足不同数据报文通信需求。本文以 Ubuntu 系统为例,主机网卡的 MTU 默认值为 1500,您可以通过以下方式自定义配置主机网卡的 MTU 大小: + + 打开 `netplan` 配置文件,这些文件位于 /etc/netplan/ 目录下,文件名可能是 01-netcfg.yaml 或类似的名称。使用文本编辑器打开文件,例如: + + ```shell + vim /etc/netplan/01-netcfg.yaml + ``` + + 修改文件中的 `network:` 部分中关于 mtu 的配置,例如: + + ```yaml + network: + version: 2 + ethernets: + enp11s0f0np0: + mtu: 8000 + ... + ``` + + 在这个例子中,我们将 `enp11s0f0np0` 的 mtu 设置为 8000,以满足通信需求。保存文件并退出,使用 `netplan apply`应用更改。 + + ``` + $ sudo netplan apply + ``` + + 执行更新后,请检查主机上的 `enp11s0f0np0` 网卡的 mtu 是否已经更新为 8000。 + + ```shell + ~# ip l show enp11s0f0np0 + 6: enp11s0f0np0: mtu 8000 qdisc mq state UP mode DEFAULT group default qlen 1000 + link/ether b8:3f:d2:9f:09:42 brd ff:ff:ff:ff:ff:ff + ... + ``` + +4. 配置主机 RDMA 无损网络 + + 在高性能网络场景下,RDMA 网络对于丢包非常敏感,一旦发生丢包重传,性能会急剧下降。因此要使得 RDMA 网络性能不受影响,丢包率必须保证在 1e-05(十万分之一)以下,最好为零丢包。对于 Roce 网络,可通过 PFC + ECN 机制来保障网络传输过程不丢包。 + + 可参考 [配置 RDMA 无损网络](../../roce-qos-zh_CN.md) + + > 配置无损网络要求必须在 RDMA Roce 网络环境下,不能是 Infiniband + > 配置无损网络必须要求交换机支持 PFC + ECN 机制,并且配置与主机侧对齐,否则不能工作 + +5. 开启 [GPUDirect RMDA](https://docs.nvidia.com/cuda/gpudirect-rdma/) 功能 在安装或使用 [gpu-operator](https://github.com/NVIDIA/gpu-operator) 过程中 @@ -142,7 +204,7 @@ Spiderpool 使用了 [sriov-network-operator](https://github.com/k8snetworkplumb gdrdrv 24576 0 ``` -4. 若希望 RDMA 系统工作在独占模式下,请设置主机上的 RDMA 子系统为 exclusive 模式,使得容器能够独立使用 RDMA 设备过程,避免与其他容器共享 +6. 若希望 RDMA 系统工作在独占模式下,请设置主机上的 RDMA 子系统为 exclusive 模式,使得容器能够独立使用 RDMA 设备过程,避免与其他容器共享 ```shell # Check the current operating mode (the Linux RDMA subsystem operates in shared mode by default): @@ -218,10 +280,10 @@ Spiderpool 使用了 [sriov-network-operator](https://github.com/k8snetworkplumb priority: 99 numVfs: 12 nicSelector: - deviceID: "1017" - vendor: "15b3" - rootDevices: - - 0000:86:00.0 + deviceID: "1017" + vendor: "15b3" + rootDevices: + - 0000:86:00.0 linkType: ${LINK_TYPE} deviceType: netdevice isRdma: true @@ -238,10 +300,10 @@ Spiderpool 使用了 [sriov-network-operator](https://github.com/k8snetworkplumb priority: 99 numVfs: 12 nicSelector: - deviceID: "1017" - vendor: "15b3" - rootDevices: - - 0000:86:00.0 + deviceID: "1017" + vendor: "15b3" + rootDevices: + - 0000:86:00.0 linkType: ${LINK_TYPE} deviceType: netdevice isRdma: true @@ -337,6 +399,8 @@ Spiderpool 使用了 [sriov-network-operator](https://github.com/k8snetworkplumb EOF ``` + 如果您需要自定义配置 VF 的 MTU,参考 [自定义配置 VF 的 MTU](#自定义-vf-的-mtu). 
+ (2) 对于 Ethernet 网络,请为所有的 GPU 亲和的 SR-IOV 网卡配置 [SR-IOV CNI](https://github.com/k8snetworkplumbingwg/sriov-cni) 配置,并创建对应的 IP 地址池 。 如下例子,配置了 GPU1 亲和的网卡和 IP 地址池 ```shell @@ -366,6 +430,8 @@ Spiderpool 使用了 [sriov-network-operator](https://github.com/k8snetworkplumb EOF ``` + 如果您需要自定义配置 VF 的 MTU,参考 [自定义配置 VF 的 MTU](#自定义-vf-的-mtu). + ## 创建测试应用 1. 在指定节点上创建一组 DaemonSet 应用,测试指定节点上的 SR-IOV 设备的可用性 @@ -698,3 +764,30 @@ Spiderpool 使用了 [sriov-network-operator](https://github.com/k8snetworkplumb spidernet.io/gpu7rdma: 1 spidernet.io/gpu8rdma: 1 ``` + +## 自定义 VF 的 MTU + + 默认情况下,SR-IOV VF 的 MTU 不会继承其 PF 的值影响,因此在一些特殊通信场景下,用户需要为 Pod 自定义 MTU 大小以满足不同数据报文通信需求。您可以参考以下方式自定义配置 Pod 的 MTU 大小(以 Ethernet 为例): + + ```yaml + apiVersion: spiderpool.spidernet.io/v2beta1 + kind: SpiderMultusConfig + metadata: + name: gpu1-sriov + namespace: spiderpool + spec: + cniType: sriov + sriov: + resourceName: spidernet.io/gpu1sriov + enableRdma: true + ippools: + ipv4: ["gpu1-net11"] + chainCNIJsonData: + - | + { + "type": "tuning", + "mtu": 8000 + } + ``` + + 注意:MTU 的取值范围不应该大于 sriov PF 的 MTU 值。 diff --git a/docs/usage/install/ai/get-started-sriov.md b/docs/usage/install/ai/get-started-sriov.md index 6fd7facff1..ce9256aa85 100644 --- a/docs/usage/install/ai/get-started-sriov.md +++ b/docs/usage/install/ai/get-started-sriov.md @@ -19,6 +19,23 @@ Different CNIs are used for different network scenarios: 2. In RoCE network scenarios, the [SR-IOV CNI](https://github.com/k8snetworkplumbingwg/sriov-cni) is used to expose the RDMA network interface on the host to the Pod, thereby exposing RDMA resources. Additionally, the [RDMA CNI](https://github.com/k8snetworkplumbingwg/rdma-cni) can be used to achieve RDMA device isolation. +Note: + +- Based on SR-IOV technology, the RDMA communication capability of containers is only applicable to bare metal environments, not to virtual machine environments. + +## Comparison of Macvlan CNI RDMA Solution + +| Comparison Dimension | Macvlan Shared RDMA Solution | SR-IOV CNI Isolated RDMA Solution | +| -------------------- | ---------------------------------- | ---------------------------------- | +| Network Isolation | All containers share RDMA devices, poor isolation | Containers have dedicated RDMA devices, good isolation | +| Performance | High performance | Optimal performance with hardware passthrough | +| Resource Utilization | High resource utilization | Low, limited by the number of supported VFs | +| Configuration Complexity | Relatively simple configuration | More complex configuration, requires hardware support | +| Compatibility | Good compatibility, suitable for most environments | Depends on hardware support, less compatible | +| Applicable Scenarios | Suitable for most scenarios, including bare metal and VMs | Only suitable for bare metal, not for VM scenarios | +| Cost | Low cost, no additional hardware support needed | High cost, requires hardware supporting SR-IOV | +| Support RDMA Protocol | Support Roce protocol, not support Infiniband protocol | Support Roce and Infiniband protocol | + ## Solution This article will introduce how to set up Spiderpool using the following typical AI cluster topology as an example. @@ -123,7 +140,53 @@ The network planning for the cluster is as follows: LINK_TYPE_P1 IB(1) ``` -3. Enable [GPUDirect RDMA](https://docs.nvidia.com/cuda/gpudirect-rdma/) +3. 
(Optional) Change the MTU size of the host network card + + In some special communication scenarios, users may need to customize the MTU size of the host network card to meet the communication needs of different data packets. + + This document uses the Ubuntu system as an example, where the default MTU value of the host network card is 1500. You can customize the MTU size of the host network card as follows: + + Open the netplan configuration file, which is located in the /etc/netplan/ directory. The filename might be 01-netcfg.yaml or something similar. Use a text editor to open the file, for example: + + ```shell + vim /etc/netplan/01-netcfg.yaml + ``` + + Modify the network: section of the file to configure the MTU, for example: + + ```shell + network: + version: 2 + ethernets: + enp11s0f0np0: + mtu: 8000 + ... + ``` + + In this example, we set the MTU of `enp11s0f0np0` to 8000 to meet communication needs. Save the file and exit, then apply the changes using `netplan apply`. + + ```shell + $ sudo netplan apply + ``` + + After executing the update, check if the MTU of the `enp11s0f0np0` network card on the host has been updated to 8000. + + ``` + ~# ip l show enp11s0f0np0 + 6: enp11s0f0np0: mtu 8000 qdisc mq state UP mode DEFAULT group default qlen 1000 + link/ether b8:3f:d2:9f:09:42 brd ff:ff:ff:ff:ff:ff + ... + ``` + +4. Configure Host RDMA Lossless Network + + In high-performance network scenarios, RDMA networks are very sensitive to packet loss. Once packet retransmission occurs, performance will drop sharply. Therefore, to ensure that RDMA network performance is not affected, the packet loss rate must be kept below 1e-05 (one in 100,000), ideally zero packet loss. For RoCE networks, the PFC + ECN mechanism can be used to ensure no packet loss during network transmission. Refer to [RoCE Lossless Network Configuration](../../roce-qos.md) + + > Configuring a lossless network requires an RDMA RoCE network environment and cannot be Infiniband. + > + > Configuring a lossless network requires the switch to support the PFC + ECN mechanism and be aligned with the host side configuration; otherwise, it will not work. + +5. Enable [GPUDirect RDMA](https://docs.nvidia.com/cuda/gpudirect-rdma/) The installation of the [gpu-operator](https://github.com/NVIDIA/gpu-operator): @@ -142,7 +205,7 @@ The network planning for the cluster is as follows: gdrdrv 24576 0 ``` -4. Set the RDMA subsystem on the host to exclusive mode under infiniband network, allowing containers to independently use RDMA devices and avoiding sharing with other containers. +6. Set the RDMA subsystem on the host to exclusive mode under infiniband network, allowing containers to independently use RDMA devices and avoiding sharing with other containers. 
```shell # Check the current operating mode (the Linux RDMA subsystem operates in shared mode by default): @@ -213,19 +276,19 @@ The network planning for the cluster is as follows: name: gpu1-nic-policy namespace: spiderpool spec: - nodeSelector: - kubernetes.io/os: "linux" - resourceName: gpu1sriov - priority: 99 - numVfs: 12 - nicSelector: - deviceID: "1017" - vendor: "15b3" - rootDevices: - - 0000:86:00.0 - linkType: ${LINK_TYPE} - deviceType: netdevice - isRdma: true + nodeSelector: + kubernetes.io/os: "linux" + resourceName: gpu1sriov + priority: 99 + numVfs: 12 + nicSelector: + deviceID: "1017" + vendor: "15b3" + rootDevices: + - 0000:86:00.0 + linkType: ${LINK_TYPE} + deviceType: netdevice + isRdma: true --- apiVersion: sriovnetwork.openshift.io/v1 kind: SriovNetworkNodePolicy @@ -233,19 +296,19 @@ The network planning for the cluster is as follows: name: gpu2-nic-policy namespace: spiderpool spec: - nodeSelector: - kubernetes.io/os: "linux" - resourceName: gpu2sriov - priority: 99 - numVfs: 12 - nicSelector: - deviceID: "1017" - vendor: "15b3" - rootDevices: - - 0000:86:00.0 - linkType: ${LINK_TYPE} - deviceType: netdevice - isRdma: true + nodeSelector: + kubernetes.io/os: "linux" + resourceName: gpu2sriov + priority: 99 + numVfs: 12 + nicSelector: + deviceID: "1017" + vendor: "15b3" + rootDevices: + - 0000:86:00.0 + linkType: ${LINK_TYPE} + deviceType: netdevice + isRdma: true EOF ``` @@ -338,6 +401,8 @@ The network planning for the cluster is as follows: EOF ``` + If you want to customize the MTU size, please refer to [Customize the MTU of SR-IOV VF](#customize-the-mtu-of-sr-iov-vf). + b. For Ethernet Networks, configure [the SR-IOV CNI](https://github.com/k8snetworkplumbingwg/sriov-cni) for all GPU-affinitized SR-IOV network cards and create the corresponding IP address pool. The following example configures the network card and IP address pool for GPU1 ```shell @@ -367,6 +432,8 @@ The network planning for the cluster is as follows: EOF ``` + If you want to customize the MTU size, please refer to [Customize the MTU of SR-IOV VF](#customize-the-mtu-of-sr-iov-vf). + ## Create a Test Application 1. Create a DaemonSet application on a specified node to test the availability of SR-IOV devices on that node. @@ -699,3 +766,30 @@ In the steps above, we demonstrated how to use SR-IOV technology to provide RDMA spidernet.io/gpu7rdma: 1 spidernet.io/gpu8rdma: 1 ``` + +## Customize the MTU of SR-IOV VF + + By default, the MTU of an SR-IOV VF does not inherit the value of its PF. Therefore, in some special communication scenarios, users need to customize the MTU size for Pods to meet the communication requirements of different data packets. You can refer to the following method to customize the Pod's MTU configuration (using Ethernet as an example): + + ```yaml + apiVersion: spiderpool.spidernet.io/v2beta1 + kind: SpiderMultusConfig + metadata: + name: gpu1-sriov + namespace: spiderpool + spec: + cniType: sriov + sriov: + resourceName: spidernet.io/gpu1sriov + enableRdma: true + ippools: + ipv4: ["gpu1-net11"] + chainCNIJsonData: + - | + { + "type": "tuning", + "mtu": 8000 + } + ``` + + Note: The MTU value should not exceed the MTU value of the sriov PF. 
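+
+  As a quick check (a minimal sketch; the Pod name below is a placeholder for any Pod that uses this SpiderMultusConfig), you can confirm inside the Pod that the tuning CNI applied the expected MTU to the VF interface:
+
+  ```shell
+  # the interface attached by the SR-IOV CNI should report the MTU set in chainCNIJsonData
+  kubectl exec -it <pod-name> -- ip link show
+  ```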
diff --git a/docs/usage/roce-qos-zh_CN.md b/docs/usage/roce-qos-zh_CN.md
new file mode 100644
index 0000000000..bffeee2c9a
--- /dev/null
+++ b/docs/usage/roce-qos-zh_CN.md
@@ -0,0 +1,55 @@
+# 配置无损网络
+
+## 介绍
+
+在各种 HPC 高性能计算场景中,对网络的诉求基本上是高吞吐和低时延这两个重要特性。为了实现高吞吐和低时延,业界一般采用 RDMA(Remote Direct Memory Access,远程直接内存访问)替代 TCP 协议。但是 RDMA 网络对于丢包非常敏感,一旦发生丢包重传,性能会急剧下降。因此要使得 RDMA 吞吐不受影响,丢包率必须保证在 1e-05(十万分之一)以下,最好为零丢包。
+
+RoCE (RDMA over Converged Ethernet)网络通过 PFC+ECN 特性来保障网络传输过程不丢包。
+
+- PFC: 优先级流量控制 (Priority Flow Control),IEEE 802.1Qbb,基于优先级的流量控制。
+- ECN: 显式拥塞通知(Explicit Congestion Notification),通过在 IP 头部的特定位上设置标志实现:在不丢弃数据包的情况下指示网络拥塞。
+
+本文将介绍如何在主机侧配置 RoCE 的无损网络。注: 不涉及交换机配置。
+
+## 如何配置
+
+本文档提供一个脚本,以 Systemd 方式帮助配置主机侧的 RoCE 无损网络。
+
+1. 下载脚本到本地文件路径,并为其添加可执行权限后运行:
+
+    ```shell
+    wget https://raw.githubusercontent.com/spidernet-io/spiderpool/refs/heads/main/docs/example/qos/rdma-qos.sh
+    chmod +x rdma-qos.sh
+    ```
+
+    如果是 GPU 服务器,需要配置网卡列表及 RDMA 流量和 CNP 报文的优先级队列。注意网卡名称与实际保持一致。
+
+    ```shell
+    GPU_NIC_LIST=eth0,eth1 GPU_RDMA_PRIORITY=5 GPU_CNP_PRIORITY=6 bash rdma-qos.sh
+    ```
+
+    - GPU_NIC_LIST: 指定要配置的网卡列表。
+    - GPU_RDMA_PRIORITY: 指定 RoCE 流量的优先级队列,配置范围为 0~7,默认为 5。
+    - GPU_CNP_PRIORITY: 指定 CNP 报文的优先级队列,配置范围为 0~7,默认为 6。
+    - GPU_RDMA_QOS: 指定 RoCE 流量的 DSCP。默认为空,此时取值为 GPU_RDMA_PRIORITY * 8(默认 40)。
+    - GPU_CNP_QOS: 指定 CNP 报文的 DSCP。默认为空,此时取值为 GPU_CNP_PRIORITY * 8(默认 48)。
+
+    如果是 Storage 服务器,需要配置网卡列表及 RDMA 流量和 CNP 报文的优先级队列。注意网卡名称与实际保持一致。
+
+    ```shell
+    STORAGE_NIC_LIST=eth0,eth1 STORAGE_RDMA_PRIORITY=5 STORAGE_CNP_PRIORITY=6 bash rdma-qos.sh
+    ```
+
+    - STORAGE_NIC_LIST: 指定要配置的网卡列表。
+    - STORAGE_RDMA_PRIORITY: 指定 RoCE 流量的优先级队列,配置范围为 0~7,默认为 5。
+    - STORAGE_CNP_PRIORITY: 指定 CNP 报文的优先级队列,配置范围为 0~7,默认为 6。
+    - STORAGE_RDMA_QOS: 指定 RoCE 流量的 DSCP。默认为空,此时取值为 STORAGE_RDMA_PRIORITY * 8(默认 40)。
+    - STORAGE_CNP_QOS: 指定 CNP 报文的 DSCP。默认为空,此时取值为 STORAGE_CNP_PRIORITY * 8(默认 48)。
+
+2. 查看 Systemd 服务运行状态
+
+    ```shell
+    systemctl status rdma-qos.service
+    journalctl -u rdma-qos.service
+    ```
diff --git a/docs/usage/roce-qos.md b/docs/usage/roce-qos.md
new file mode 100644
index 0000000000..02e913ffa1
--- /dev/null
+++ b/docs/usage/roce-qos.md
@@ -0,0 +1,57 @@
+# Configure Lossless Network For RoCE
+
+## Introduction
+
+In various HPC high-performance computing scenarios, the main requirements for networks are high throughput and low latency. To achieve high throughput and low latency, the industry generally uses RDMA (Remote Direct Memory Access) to replace the TCP protocol. However, RDMA networks are very sensitive to packet loss. Once packet retransmission occurs, performance will drop sharply. Therefore, to ensure that RDMA throughput is not affected, the packet loss rate must be kept below 1e-05 (one in 100,000), ideally zero.
+
+RoCE (RDMA over Converged Ethernet) networks use PFC+ECN features to ensure no packet loss during network transmission.
+
+- PFC: Priority Flow Control, IEEE 802.1Qbb, flow control based on priority.
+- ECN: Explicit Congestion Notification, implemented by setting flags in specific bits of the IP header to indicate network congestion without dropping packets.
+
+This document will introduce how to configure a lossless network on the host side for RoCE. Note: This does not involve switch configuration.
+
+## How to Configure
+
+This document provides a script to help configure a lossless network on the host side using Systemd.
+
+1. Download the script, make it executable, and run it:
+
+    ```shell
+    cd /usr/local/bin
+    curl -O https://raw.githubusercontent.com/spidernet-io/spiderpool/refs/heads/main/docs/example/qos/rdma-qos.sh
+    chmod +x rdma-qos.sh
+    ```
+
+    If the server is a GPU server, configure the NIC list and the priority queues for RDMA traffic and CNP packets. Make sure the NIC names match the actual names on the host.
+
+    ```shell
+    GPU_NIC_LIST=eth0,eth1 GPU_RDMA_PRIORITY=5 GPU_CNP_PRIORITY=6 bash rdma-qos.sh
+    ```
+
+    - GPU_NIC_LIST: Specifies the list of NICs to configure.
+    - GPU_RDMA_PRIORITY: Specifies the priority queue for RoCE traffic, in the range 0-7, default 5.
+    - GPU_CNP_PRIORITY: Specifies the priority queue for CNP packets, in the range 0-7, default 6.
+    - GPU_RDMA_QOS: Specifies the DSCP for RoCE traffic. Empty by default, in which case it is calculated as GPU_RDMA_PRIORITY * 8 (40 by default).
+    - GPU_CNP_QOS: Specifies the DSCP for CNP packets. Empty by default, in which case it is calculated as GPU_CNP_PRIORITY * 8 (48 by default).
+
+    If the server is a Storage server, configure the NIC list and the priority queues for RDMA traffic and CNP packets. Make sure the NIC names match the actual names on the host.
+
+    ```shell
+    STORAGE_NIC_LIST=eth0,eth1 STORAGE_RDMA_PRIORITY=5 STORAGE_CNP_PRIORITY=6 bash rdma-qos.sh
+    ```
+
+    - STORAGE_NIC_LIST: Specifies the list of NICs to configure.
+    - STORAGE_RDMA_PRIORITY: Specifies the priority queue for RoCE traffic, in the range 0-7, default 5.
+    - STORAGE_CNP_PRIORITY: Specifies the priority queue for CNP packets, in the range 0-7, default 6.
+    - STORAGE_RDMA_QOS: Specifies the DSCP for RoCE traffic. Empty by default, in which case it is calculated as STORAGE_RDMA_PRIORITY * 8 (40 by default).
+    - STORAGE_CNP_QOS: Specifies the DSCP for CNP packets. Empty by default, in which case it is calculated as STORAGE_CNP_PRIORITY * 8 (48 by default).
+
+2. Check the Systemd service status
+
+    ```shell
+    systemctl status rdma-qos.service
+    journalctl -u rdma-qos.service
+    ```
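+3. (Optional) Verify the applied settings
+
+    The commands below are a minimal verification sketch; `eth0` and `mlx5_0` are example names, so substitute one of the NICs from your `GPU_NIC_LIST`/`STORAGE_NIC_LIST` and its RDMA device as reported by `ibdev2netdev`, and `5` with the RDMA priority you configured.
+
+    ```shell
+    # show the trust mode and PFC configuration applied by the script
+    mlnx_qos -i eth0
+
+    # ECN should be enabled (1) for the RDMA priority, and cnp_dscp should match the CNP DSCP
+    cat /sys/class/net/eth0/ecn/roce_rp/enable/5
+    cat /sys/class/net/eth0/ecn/roce_np/enable/5
+    cat /sys/class/net/eth0/ecn/roce_np/cnp_dscp
+
+    # traffic class used for RoCE traffic (RDMA DSCP << 2)
+    cat /sys/class/infiniband/mlx5_0/tc/1/traffic_class
+    ```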