From 02c7962c4334e1c32f4350c452998bc79add9f1b Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 13 Nov 2024 21:41:03 +0000 Subject: [PATCH 1/2] Update mount parallelstore script to support multiple parallelstore mounts --- .../parallelstore/scripts/mount-daos.sh | 41 +++++++++++-------- .../scripts/mount-daos.sh | 41 +++++++++++-------- 2 files changed, 46 insertions(+), 36 deletions(-) diff --git a/modules/file-system/parallelstore/scripts/mount-daos.sh b/modules/file-system/parallelstore/scripts/mount-daos.sh index bb64c9a4d3..facac423b4 100644 --- a/modules/file-system/parallelstore/scripts/mount-daos.sh +++ b/modules/file-system/parallelstore/scripts/mount-daos.sh @@ -41,6 +41,17 @@ sed -i "s/#.*transport_config/transport_config/g" $daos_config sed -i "s/#.*allow_insecure:.*false/ allow_insecure: true/g" $daos_config sed -i "s/.*access_points.*/access_points: $access_points/g" $daos_config +# Get interface names with "s0f0" suffix +if ifconfig -a | grep 's0f0'; then + sof0_interfaces=$(ifconfig -a | grep 's0f0:' | awk '{print $1}' | tr ':' '\n' | grep -v '^$' | awk '!a[$0]++' | sed 's/^/"/g' | sed 's/$/"/g' | paste -sd, -) + + # Append the sof0_interfaces to the existing list + exclude_fabric_ifaces="lo,$sof0_interfaces" + + # Update the file with the new list + sed -i "s/#.*exclude_fabric_ifaces: \[.*/exclude_fabric_ifaces: [$exclude_fabric_ifaces]/" $daos_config +fi + # Start service if { [ "${OS_ID}" = "rocky" ] || [ "${OS_ID}" = "rhel" ]; } && { [ "${OS_VERSION_MAJOR}" = "8" ] || [ "${OS_VERSION_MAJOR}" = "9" ]; }; then # TODO: Update script to change default log destination folder, after daos_agent user is supported in debian and ubuntu. @@ -69,39 +80,33 @@ sed -i "s/#.*user_allow_other/user_allow_other/g" $fuse_config # make sure limit of open files is high enough for dfuse (1M of open files) ulimit -n 1048576 -for i in {1..10}; do - # To parse mount_options as --disable-wb-cache --eq-count=8. - # shellcheck disable=SC2086 - dfuse -m "$local_mount" --pool default-pool --container default-container --multi-user $mount_options && break - - echo "dfuse failed, retrying in 1 seconds (attempt $i/10)..." - sleep 1 -done - -if ! mountpoint -q "$local_mount"; then - exit 1 -fi - # Store the mounting logic in a variable -mount_command='for i in {1..10}; do /bin/dfuse -m '$local_mount' --pool default-pool --container default-container --multi-user '$mount_options' --foreground && break; echo \"dfuse, failed, retrying in 1 second (attempt '$i'/10)\"; sleep 1; done' +mount_command="if mountpoint -q '$local_mount'; then fusermount3 -u '$local_mount'; fi; for i in {1..10}; do /bin/dfuse -m '$local_mount' --pool default-pool --container default-container --multi-user $mount_options --foreground && break; sleep 1; done" + +# Construct the service name with the local_mount suffix +service_name="mount_parallelstore_${local_mount//\//_}.service" # --- Begin: Add systemd service creation --- -cat >/usr/lib/systemd/system/mount_parallelstore.service </usr/lib/systemd/system/"${service_name}" </usr/lib/systemd/system/mount_parallelstore.service </usr/lib/systemd/system/"${service_name}" < Date: Tue, 3 Dec 2024 21:47:55 +0000 Subject: [PATCH 2/2] Address feedback during review of #3256 TESTED: - simple Debian and Ubuntu VMs with one NIC - a3-megagpu-8g Ubuntu and HPC Rocky 8 --- .../parallelstore/scripts/mount-daos.sh | 37 +++++++++++-------- .../scripts/mount-daos.sh | 37 +++++++++++-------- 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/modules/file-system/parallelstore/scripts/mount-daos.sh b/modules/file-system/parallelstore/scripts/mount-daos.sh index facac423b4..cbe141e9a1 100644 --- a/modules/file-system/parallelstore/scripts/mount-daos.sh +++ b/modules/file-system/parallelstore/scripts/mount-daos.sh @@ -16,6 +16,7 @@ set -e -o pipefail OS_ID=$(awk -F '=' '/^ID=/ {print $2}' /etc/os-release | sed -e 's/"//g') +OS_ID_LIKE=$(awk -F '=' '/^ID_LIKE=/ {print $2}' /etc/os-release | sed -e 's/"//g') OS_VERSION=$(awk -F '=' '/VERSION_ID/ {print $2}' /etc/os-release | sed -e 's/"//g') OS_VERSION_MAJOR=$(awk -F '=' '/VERSION_ID/ {print $2}' /etc/os-release | sed -e 's/"//g' -e 's/\..*$//') @@ -41,14 +42,17 @@ sed -i "s/#.*transport_config/transport_config/g" $daos_config sed -i "s/#.*allow_insecure:.*false/ allow_insecure: true/g" $daos_config sed -i "s/.*access_points.*/access_points: $access_points/g" $daos_config -# Get interface names with "s0f0" suffix -if ifconfig -a | grep 's0f0'; then - sof0_interfaces=$(ifconfig -a | grep 's0f0:' | awk '{print $1}' | tr ':' '\n' | grep -v '^$' | awk '!a[$0]++' | sed 's/^/"/g' | sed 's/$/"/g' | paste -sd, -) - - # Append the sof0_interfaces to the existing list - exclude_fabric_ifaces="lo,$sof0_interfaces" +# Get names of network interfaces not in first PCI slot +# The first PCI slot is a standard network adapter while remaining interfaces +# are typically network cards dedicated to GPU or workload communication +if [[ "$OS_ID_LIKE" == "debian" ]]; then + extra_interfaces=$(find /sys/class/net/ -not -name 'enp0s*' -regextype posix-extended -regex '.*/enp[0-9]+s.*' -printf '"%f"\n' | paste -s -d ',') +elif [[ "$OS_ID_LIKE" =~ "rhel" ]]; then + extra_interfaces=$(find /sys/class/net/ -not -name eth0 -regextype posix-extended -regex '.*/eth[0-9]+' -printf '"%f"\n' | paste -s -d ',') +fi - # Update the file with the new list +if [[ -n "$extra_interfaces" ]]; then + exclude_fabric_ifaces="lo,$extra_interfaces" sed -i "s/#.*exclude_fabric_ifaces: \[.*/exclude_fabric_ifaces: [$exclude_fabric_ifaces]/" $daos_config fi @@ -80,31 +84,32 @@ sed -i "s/#.*user_allow_other/user_allow_other/g" $fuse_config # make sure limit of open files is high enough for dfuse (1M of open files) ulimit -n 1048576 -# Store the mounting logic in a variable -mount_command="if mountpoint -q '$local_mount'; then fusermount3 -u '$local_mount'; fi; for i in {1..10}; do /bin/dfuse -m '$local_mount' --pool default-pool --container default-container --multi-user $mount_options --foreground && break; sleep 1; done" - # Construct the service name with the local_mount suffix -service_name="mount_parallelstore_${local_mount//\//_}.service" +safe_mount_name=$(systemd-escape -p "${local_mount}") +service_name="mount_parallelstore_${safe_mount_name}.service" # --- Begin: Add systemd service creation --- -cat >/usr/lib/systemd/system/"${service_name}" </etc/systemd/system/"${service_name}" </usr/lib/systemd/system/"${service_name}" </etc/systemd/system/"${service_name}" <