Skip to content

Commit

Permalink
Merge pull request GoogleCloudPlatform#3256 from harshthakkar01/ps-fix-2
Browse files Browse the repository at this point in the history
Add multi-mount parallelstore support
  • Loading branch information
tpdownes authored Dec 5, 2024
2 parents 2cfd2f9 + 6888f3f commit c416381
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 38 deletions.
48 changes: 29 additions & 19 deletions modules/file-system/parallelstore/scripts/mount-daos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# Abort on the first failing command, including a failure inside a pipeline.
set -eo pipefail

# Distribution identity fields, read from /etc/os-release with the double
# quotes removed.
OS_ID=$(awk -F= '/^ID=/ {print $2}' /etc/os-release | tr -d '"')
OS_ID_LIKE=$(awk -F= '/^ID_LIKE=/ {print $2}' /etc/os-release | tr -d '"')
# Full version string, followed by the same value cut at its first dot.
OS_VERSION=$(awk -F= '/VERSION_ID/ {print $2}' /etc/os-release | tr -d '"')
OS_VERSION_MAJOR=$(awk -F= '/VERSION_ID/ {print $2}' /etc/os-release | tr -d '"' | cut -d. -f1)

Expand All @@ -41,6 +42,20 @@ sed -i "s/#.*transport_config/transport_config/g" $daos_config
# Turn a commented-out "allow_insecure: ... false" entry into an active
# " allow_insecure: true" entry.
sed -i "s/#.*allow_insecure:.*false/ allow_insecure: true/g" $daos_config
# Replace every line that mentions access_points with one entry built from
# $access_points (assumes $access_points and $daos_config are set earlier in
# the script; not visible here - TODO confirm).
sed -i "s/.*access_points.*/access_points: $access_points/g" $daos_config

# Get names of network interfaces not in first PCI slot.
# The first PCI slot is a standard network adapter while remaining interfaces
# are typically network cards dedicated to GPU or workload communication.
#
# Reads:  OS_ID_LIKE  - ID_LIKE value taken from /etc/os-release
#         daos_config - path of the config file edited below (expected to be
#                       set earlier in the script)
# Writes: extra_interfaces      - comma-separated, double-quoted interface names
#         exclude_fabric_ifaces - "lo" plus extra_interfaces (only when non-empty)

# Start from an empty list so a value inherited from the environment can never
# end up in the config when neither OS branch below matches.
extra_interfaces=""
if [[ "$OS_ID_LIKE" == "debian" ]]; then
  # Interfaces named enp<N>s...; anything matching enp0s* is left out.
  extra_interfaces=$(find /sys/class/net/ -not -name 'enp0s*' -regextype posix-extended -regex '.*/enp[0-9]+s.*' -printf '"%f"\n' | paste -s -d ',')
elif [[ "$OS_ID_LIKE" =~ "rhel" ]]; then
  # Interfaces named eth<N>; eth0 is left out.
  extra_interfaces=$(find /sys/class/net/ -not -name eth0 -regextype posix-extended -regex '.*/eth[0-9]+' -printf '"%f"\n' | paste -s -d ',')
fi

# Only touch the config when there is something to exclude; the commented-out
# exclude_fabric_ifaces line is replaced by an active one.
if [[ -n "$extra_interfaces" ]]; then
  exclude_fabric_ifaces="lo,$extra_interfaces"
  sed -i "s/#.*exclude_fabric_ifaces: \[.*/exclude_fabric_ifaces: [$exclude_fabric_ifaces]/" "$daos_config"
fi

# Start service
if { [ "${OS_ID}" = "rocky" ] || [ "${OS_ID}" = "rhel" ]; } && { [ "${OS_VERSION_MAJOR}" = "8" ] || [ "${OS_VERSION_MAJOR}" = "9" ]; }; then
# TODO: Update script to change the default log destination folder once the daos_agent user is supported on Debian and Ubuntu.
Expand Down Expand Up @@ -69,39 +84,34 @@ sed -i "s/#.*user_allow_other/user_allow_other/g" $fuse_config
# make sure limit of open files is high enough for dfuse (1M of open files)
ulimit -n 1048576

# Mount the default container at $local_mount, trying up to 10 times with a
# one-second pause between attempts; the loop stops at the first success.
for i in {1..10}; do
  # To parse mount_options as --disable-wb-cache --eq-count=8.
  # shellcheck disable=SC2086
  dfuse -m "$local_mount" --pool default-pool --container default-container --multi-user $mount_options && break

  echo "dfuse failed, retrying in 1 seconds (attempt $i/10)..."
  sleep 1
done

# A dfuse failure inside the `&& break` list does not trip `set -e`, so the
# mount is verified explicitly; report the reason before giving up instead of
# exiting silently.
if ! mountpoint -q "$local_mount"; then
  echo "ERROR: $local_mount is not a mount point after the dfuse attempts" >&2
  exit 1
fi

# Store the mounting logic in a variable
# NOTE(review): $local_mount, $mount_options and $i sit outside the
# single-quoted segments, so they are expanded when this assignment runs, not
# when the loop inside the string is executed - confirm the attempt counter is
# meant to be fixed at assignment time.
mount_command='for i in {1..10}; do /bin/dfuse -m '$local_mount' --pool default-pool --container default-container --multi-user '$mount_options' --foreground && break; echo \"dfuse, failed, retrying in 1 second (attempt '$i'/10)\"; sleep 1; done'
# Construct the service name with the local_mount suffix
# (systemd-escape -p applies systemd's path escaping, so the mount path can be
# embedded in a unit file name).
safe_mount_name=$(systemd-escape -p "${local_mount}")
service_name="mount_parallelstore_${safe_mount_name}.service"

# --- Begin: Add systemd service creation ---
# Writes a systemd unit for the dfuse mount, then enables and starts it.
# The here-document delimiter is unquoted, so ${service_name}, ${local_mount},
# $mount_command, $local_mount and $mount_options are expanded when the file
# is written.
# NOTE(review): two `cat ... <<EOF` lines follow back to back. As written only
# the first one is a command; the second line and everything up to the EOF
# line form the body of the first here-document. The body also carries two
# Type= and two ExecStart= entries. This looks like the old and new revisions
# of those lines side by side - confirm which ones are meant to remain.
cat >/usr/lib/systemd/system/mount_parallelstore.service <<EOF
cat >/etc/systemd/system/"${service_name}" <<EOF
[Unit]
Description=DAOS Mount Service
After=network-online.target daos_agent.service
Before=slurmd.service
ConditionPathIsMountPoint=!${local_mount}
[Service]
Type=oneshot
Type=simple
User=root
Group=root
ExecStart=/bin/bash -c '$mount_command'
Restart=on-failure
RestartSec=10
ExecStart=/bin/dfuse -m $local_mount --pool default-pool --container default-container --multi-user $mount_options --foreground
ExecStop=/usr/bin/fusermount3 -u $local_mount
[Install]
WantedBy=multi-user.target
EOF

# NOTE(review): both the fixed unit name and the per-mount "${service_name}"
# are enabled below - confirm whether the fixed-name line should remain.
systemctl enable mount_parallelstore.service
# Reload unit files so systemd picks up what was just written, then enable the
# per-mount unit for boot and start it immediately.
systemctl daemon-reload
systemctl enable "${service_name}"
systemctl start "${service_name}"
# --- End: Add systemd service creation ---

exit 0
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# Abort on the first failing command, including a failure inside a pipeline.
set -eo pipefail

# Distribution identity fields, read from /etc/os-release with the double
# quotes removed.
OS_ID=$(awk -F= '/^ID=/ {print $2}' /etc/os-release | tr -d '"')
OS_ID_LIKE=$(awk -F= '/^ID_LIKE=/ {print $2}' /etc/os-release | tr -d '"')
# Full version string, followed by the same value cut at its first dot.
OS_VERSION=$(awk -F= '/VERSION_ID/ {print $2}' /etc/os-release | tr -d '"')
OS_VERSION_MAJOR=$(awk -F= '/VERSION_ID/ {print $2}' /etc/os-release | tr -d '"' | cut -d. -f1)

Expand All @@ -41,6 +42,20 @@ sed -i "s/#.*transport_config/transport_config/g" $daos_config
# Turn a commented-out "allow_insecure: ... false" entry into an active
# " allow_insecure: true" entry.
sed -i "s/#.*allow_insecure:.*false/ allow_insecure: true/g" $daos_config
# Replace every line that mentions access_points with one entry built from
# $access_points (assumes $access_points and $daos_config are set earlier in
# the script; not visible here - TODO confirm).
sed -i "s/.*access_points.*/access_points: $access_points/g" $daos_config

# Get names of network interfaces not in first PCI slot.
# The first PCI slot is a standard network adapter while remaining interfaces
# are typically network cards dedicated to GPU or workload communication.
#
# Reads:  OS_ID_LIKE  - ID_LIKE value taken from /etc/os-release
#         daos_config - path of the config file edited below (expected to be
#                       set earlier in the script)
# Writes: extra_interfaces      - comma-separated, double-quoted interface names
#         exclude_fabric_ifaces - "lo" plus extra_interfaces (only when non-empty)

# Start from an empty list so a value inherited from the environment can never
# end up in the config when neither OS branch below matches.
extra_interfaces=""
if [[ "$OS_ID_LIKE" == "debian" ]]; then
  # Interfaces named enp<N>s...; anything matching enp0s* is left out.
  extra_interfaces=$(find /sys/class/net/ -not -name 'enp0s*' -regextype posix-extended -regex '.*/enp[0-9]+s.*' -printf '"%f"\n' | paste -s -d ',')
elif [[ "$OS_ID_LIKE" =~ "rhel" ]]; then
  # Interfaces named eth<N>; eth0 is left out.
  extra_interfaces=$(find /sys/class/net/ -not -name eth0 -regextype posix-extended -regex '.*/eth[0-9]+' -printf '"%f"\n' | paste -s -d ',')
fi

# Only touch the config when there is something to exclude; the commented-out
# exclude_fabric_ifaces line is replaced by an active one.
if [[ -n "$extra_interfaces" ]]; then
  exclude_fabric_ifaces="lo,$extra_interfaces"
  sed -i "s/#.*exclude_fabric_ifaces: \[.*/exclude_fabric_ifaces: [$exclude_fabric_ifaces]/" "$daos_config"
fi

# Start service
if { [ "${OS_ID}" = "rocky" ] || [ "${OS_ID}" = "rhel" ]; } && { [ "${OS_VERSION_MAJOR}" = "8" ] || [ "${OS_VERSION_MAJOR}" = "9" ]; }; then
# TODO: Update script to change the default log destination folder once the daos_agent user is supported on Debian and Ubuntu.
Expand Down Expand Up @@ -69,39 +84,34 @@ sed -i "s/#.*user_allow_other/user_allow_other/g" $fuse_config
# make sure limit of open files is high enough for dfuse (1M of open files)
ulimit -n 1048576

# Mount the default container at $local_mount, trying up to 10 times with a
# one-second pause between attempts; the loop stops at the first success.
for i in {1..10}; do
  # To parse mount_options as --disable-wb-cache --eq-count=8.
  # shellcheck disable=SC2086
  dfuse -m "$local_mount" --pool default-pool --container default-container --multi-user $mount_options && break

  echo "dfuse failed, retrying in 1 seconds (attempt $i/10)..."
  sleep 1
done

# A dfuse failure inside the `&& break` list does not trip `set -e`, so the
# mount is verified explicitly; report the reason before giving up instead of
# exiting silently.
if ! mountpoint -q "$local_mount"; then
  echo "ERROR: $local_mount is not a mount point after the dfuse attempts" >&2
  exit 1
fi

# Store the mounting logic in a variable
# NOTE(review): $local_mount, $mount_options and $i sit outside the
# single-quoted segments, so they are expanded when this assignment runs, not
# when the loop inside the string is executed - confirm the attempt counter is
# meant to be fixed at assignment time.
mount_command='for i in {1..10}; do /bin/dfuse -m '$local_mount' --pool default-pool --container default-container --multi-user '$mount_options' --foreground && break; echo \"dfuse, failed, retrying in 1 second (attempt '$i'/10)\"; sleep 1; done'
# Construct the service name with the local_mount suffix
# (systemd-escape -p applies systemd's path escaping, so the mount path can be
# embedded in a unit file name).
safe_mount_name=$(systemd-escape -p "${local_mount}")
service_name="mount_parallelstore_${safe_mount_name}.service"

# --- Begin: Add systemd service creation ---
# Writes a systemd unit for the dfuse mount, then enables and starts it.
# The here-document delimiter is unquoted, so ${service_name}, ${local_mount},
# $mount_command, $local_mount and $mount_options are expanded when the file
# is written.
# NOTE(review): two `cat ... <<EOF` lines follow back to back. As written only
# the first one is a command; the second line and everything up to the EOF
# line form the body of the first here-document. The body also carries two
# Type= and two ExecStart= entries. This looks like the old and new revisions
# of those lines side by side - confirm which ones are meant to remain.
cat >/usr/lib/systemd/system/mount_parallelstore.service <<EOF
cat >/etc/systemd/system/"${service_name}" <<EOF
[Unit]
Description=DAOS Mount Service
After=network-online.target daos_agent.service
Before=slurmd.service
ConditionPathIsMountPoint=!${local_mount}
[Service]
Type=oneshot
Type=simple
User=root
Group=root
ExecStart=/bin/bash -c '$mount_command'
Restart=on-failure
RestartSec=10
ExecStart=/bin/dfuse -m $local_mount --pool default-pool --container default-container --multi-user $mount_options --foreground
ExecStop=/usr/bin/fusermount3 -u $local_mount
[Install]
WantedBy=multi-user.target
EOF

# NOTE(review): both the fixed unit name and the per-mount "${service_name}"
# are enabled below - confirm whether the fixed-name line should remain.
systemctl enable mount_parallelstore.service
# Reload unit files so systemd picks up what was just written, then enable the
# per-mount unit for boot and start it immediately.
systemctl daemon-reload
systemctl enable "${service_name}"
systemctl start "${service_name}"
# --- End: Add systemd service creation ---

exit 0

0 comments on commit c416381

Please sign in to comment.