From 52706cd3e053172429b80aa410d15bb45cd06e07 Mon Sep 17 00:00:00 2001 From: Allison Ko Date: Mon, 23 Dec 2024 15:14:57 -0700 Subject: [PATCH 1/2] gcsfuse cache enabled a3mega blueprint --- .../machine-learning/a3-megagpu-8g/README.md | 4 + .../slurm-a3mega-gcsfuse-lssd-cluster.yaml | 418 ++++++++++++++++++ 2 files changed, 422 insertions(+) create mode 100644 examples/machine-learning/a3-megagpu-8g/slurm-a3mega-gcsfuse-lssd-cluster.yaml diff --git a/examples/machine-learning/a3-megagpu-8g/README.md b/examples/machine-learning/a3-megagpu-8g/README.md index 79202023c9..5bf1f36614 100644 --- a/examples/machine-learning/a3-megagpu-8g/README.md +++ b/examples/machine-learning/a3-megagpu-8g/README.md @@ -4,3 +4,7 @@ To deploy an a3-megagpu-8g cluster running Slurm on Google Cloud, please follow these [instructions]. [instructions]: https://cloud.google.com/cluster-toolkit/docs/deploy/deploy-a3-mega-cluster + +# GCSFuse with Local SSD cache + +`slurm-a3mega-gcsfuse-lssd-cluster.yaml` reflects best practices for using GCSFuse for ML workloads. It is configured to mount all available GCS buckets on two mountpoints on a3-mega nodes. The `/gcs-rwx` mountpoint enables parallel downloads, intended for reading/writing checkpoints, logs, application outputs, model serving, or loading large files (e.g. squashfs files). The read-only `/gcs-ro` mountpoint disables parallel downloads and enables the list cache, intended for reading training data. Parallel downloads are not recommended for training workloads; see [GCSFuse documentation](https://cloud.google.com/storage/docs/cloud-storage-fuse/file-caching#parallel-downloads) for details. diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-gcsfuse-lssd-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-gcsfuse-lssd-cluster.yaml new file mode 100644 index 0000000000..698524ee0d --- /dev/null +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-gcsfuse-lssd-cluster.yaml @@ -0,0 +1,418 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +blueprint_name: a3mega-cluster + +# this blueprint should be used with the extra variables defined in +# deployment-image-cluster.yaml +vars: + deployment_name: a3mega-cluster + a3mega_partition_name: a3mega + a3mega_maintenance_interval: "" + enable_placement: false + remote_mount_homefs: /nfsshare + local_mount_homefs: /home + instance_image_custom: true + instance_image: + family: $(vars.final_image_family) + project: $(vars.project_id) + enable_login_public_ips: true + enable_controller_public_ips: true + localssd_mountpoint: /mnt/localssd + + gcsfuse_runners: + - type: ansible-local + destination: gcsfuse.yml + content: | + --- + - name: Create gcsfuse configuration + ansible.builtin.copy: + dest: /etc/gcsfuse.yml + owner: root + group: root + mode: 0o644 + content: | + file-system: + dir-mode: "777" + file-mode: "777" + rename-dir-limit: 0 # Set to 20000 for hierarchical buckets + fuse-options: allow_other + foreground: true + + - name: Create gcsfuse systemd service + ansible.builtin.copy: + dest: /etc/systemd/system/gcsfuse.service + owner: root + group: root + mode: 0o644 + content: | + [Unit] + Description=gcsfuse mount of all buckets + After=local-fs.target + + [Service] + Type=simple + User=root + ExecStartPre=/bin/mkdir -p /gcs + ExecStart=gcsfuse --config-file /etc/gcsfuse.yml /gcs + ExecStop=fusermount3 -u /gcs + + [Install] + WantedBy=slurmd.service multi-user.target + + post_tasks: + - name: Enable and restart gcsfuse + ansible.builtin.service: + name: gcsfuse.service + state: restarted + enabled: true + + gcsfuse_lssd_runners: + - type: ansible-local + destination: gcsfuse.yml + content: | + --- + - name: Create LSSD optimized gcsfuse mount + hosts: localhost + become: true + tasks: + - name: Create gcsfuse rwx configuration + ansible.builtin.copy: + dest: /etc/gcsfuse-rwx.yml + owner: root + group: root + mode: 0o644 + content: | + file-cache: + max-size-mb: -1 + enable-parallel-downloads: true + download-chunk-size-mb: 50 + parallel-downloads-per-file: 16 + cache-dir: /mnt/localssd + file-system: + dir-mode: "777" + file-mode: "777" + rename-dir-limit: 0 # Set to 20000 for hierarchical buckets + temp-dir: /mnt/localssd + fuse-options: allow_other + foreground: true + + - name: Create gcsfuse ro configuration + ansible.builtin.copy: + dest: /etc/gcsfuse-ro.yml + owner: root + group: root + mode: 0o644 + content: | + file-cache: + max-size-mb: -1 + metadata-cache: + ttl-secs: 3600 + cache-dir: /mnt/localssd + file-system: + dir-mode: "555" # need 5 on dir to enable ls + file-mode: "444" + rename-dir-limit: 0 # Set to 20000 for hierarchical buckets + temp-dir: /mnt/localssd + fuse-options: allow_other + kernel-list-cache-ttl-secs: 60 + foreground: true + + - name: Create gcsfuse@ systemd service + ansible.builtin.copy: + dest: /etc/systemd/system/gcsfuse@.service + owner: root + group: root + mode: 0o644 + content: | + [Unit] + Description=gcsfuse %i mount of all buckets + After=local-fs.target + + [Service] + Type=simple + User=root + ExecStartPre=/bin/mkdir -p /gcs-%i + ExecStart=gcsfuse --config-file /etc/gcsfuse-%i.yml /gcs-%i + ExecStop=fusermount3 -u /gcs-%i + + [Install] + WantedBy=slurmd.service multi-user.target + + post_tasks: + - name: Enable and restart gcsfuse@rwx + ansible.builtin.service: + name: gcsfuse@rwx.service + state: restarted + enabled: true + + - name: Enable and restart gcsfuse@ro + ansible.builtin.service: + name: gcsfuse@ro.service + state: restarted + enabled: true + + a3m_runners: + - type: ansible-local + destination: 
slurm_aperture.yml + content: | + --- + - name: Configure Slurm to depend upon aperture devices + hosts: all + become: true + vars: {} + tasks: + - name: Ensure slurmd starts after aperture devices are ready + ansible.builtin.copy: + dest: /etc/systemd/system/slurmd.service.d/aperture.conf + owner: root + group: root + mode: 0o644 + content: | + [Service] + ExecCondition=/usr/bin/test -d /dev/aperture_devices/ + notify: Reload SystemD + handlers: + - name: Reload SystemD + ansible.builtin.systemd: + daemon_reload: true + - type: ansible-local + destination: enable_dcgm.yml + content: | + --- + - name: Enable NVIDIA DCGM on GPU nodes + hosts: all + become: true + vars: + enable_ops_agent: $(vars.enable_ops_agent) + enable_nvidia_dcgm: $(vars.enable_nvidia_dcgm) + tasks: + - name: Update Ops Agent configuration + ansible.builtin.blockinfile: + path: /etc/google-cloud-ops-agent/config.yaml + insertafter: EOF + block: | + metrics: + receivers: + dcgm: + type: dcgm + service: + pipelines: + dcgm: + receivers: + - dcgm + notify: + - Restart Google Cloud Ops Agent + handlers: + - name: Restart Google Cloud Ops Agent + ansible.builtin.service: + name: google-cloud-ops-agent.service + state: "{{ 'restarted' if enable_ops_agent else 'stopped' }}" + enabled: "{{ enable_ops_agent }}" + post_tasks: + - name: Enable Google Cloud Ops Agent + ansible.builtin.service: + name: google-cloud-ops-agent.service + state: "{{ 'started' if enable_ops_agent else 'stopped' }}" + enabled: "{{ enable_ops_agent }}" + - name: Enable NVIDIA DCGM + ansible.builtin.service: + name: nvidia-dcgm.service + state: "{{ 'started' if enable_nvidia_dcgm else 'stopped' }}" + enabled: "{{ enable_nvidia_dcgm }}" + + login_runners: + - type: shell + destination: reset_enroot.sh + content: | + #!/bin/bash + # reset enroot to defaults of files under /home and running under /run + # allows basic enroot testing with reduced I/O performance + rm -f /etc/enroot/enroot.conf + + controller_runners: + - type: shell + destination: stage_scripts.sh + content: | + #!/bin/bash + SLURM_ROOT=/opt/apps/adm/slurm + mkdir -m 0755 -p "${SLURM_ROOT}/scripts" + mkdir -p "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-prolog_slurmd.d" + mkdir -p "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d" + curl -s -o "${SLURM_ROOT}/scripts/sudo-oslogin" \ + https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/master/tools/prologs-epilogs/sudo-oslogin + chmod 0755 "${SLURM_ROOT}/scripts/sudo-oslogin" + ln -s "${SLURM_ROOT}/scripts/sudo-oslogin" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-prolog_slurmd.d/sudo-oslogin.prolog_slurmd" + ln -s "${SLURM_ROOT}/scripts/sudo-oslogin" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/sudo-oslogin.epilog_slurmd" + curl -s -o "${SLURM_ROOT}/scripts/rxdm" \ + https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/master/tools/prologs-epilogs/receive-data-path-manager-mega + chmod 0755 "${SLURM_ROOT}/scripts/rxdm" + ln -s "${SLURM_ROOT}/scripts/rxdm" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-prolog_slurmd.d/rxdm.prolog_slurmd" + ln -s "${SLURM_ROOT}/scripts/rxdm" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/rxdm.epilog_slurmd" + # Uncomment the line below to enable epilog that will check health of GPUs and drain node if problem is detected. 
+ # ln -s "/slurm/scripts/tools/gpu-test" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/gpu-test.epilog_slurmd" + - type: shell + destination: reset_enroot.sh + content: | + #!/bin/bash + # reset enroot to defaults of files under /home and running under /run + # allows basic enroot testing with reduced I/O performance + rm -f /etc/enroot/enroot.conf + +deployment_groups: +- group: cluster + modules: + - id: sysnet + source: modules/network/pre-existing-vpc + settings: + network_name: $(vars.network_name_system) + subnetwork_name: $(vars.subnetwork_name_system) + + # if an existing bucket is desired, follow modules/file-system/pre-existing-network-storage/README.md + - id: data-bucket + source: community/modules/file-system/cloud-storage-bucket + settings: + local_mount: /gcs + mount_options: defaults,rw,_netdev,implicit_dirs,allow_other,implicit_dirs,file_mode=777,dir_mode=777 + random_suffix: true + + - id: gpunets + source: modules/network/multivpc + settings: + network_name_prefix: $(vars.deployment_name)-gpunet + global_ip_address_range: 10.0.0.0/9 + network_count: 8 + subnetwork_cidr_suffix: 20 + + - id: homefs + source: modules/file-system/pre-existing-network-storage + settings: + server_ip: $(vars.server_ip_homefs) + remote_mount: $(vars.remote_mount_homefs) + local_mount: $(vars.local_mount_homefs) + mount_options: "defaults,hard" + + - id: debug_nodeset + use: [sysnet] + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + settings: + node_count_static: 0 + node_count_dynamic_max: 4 + machine_type: n2-standard-2 + + - id: debug_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: + - debug_nodeset + settings: + partition_name: debug + exclusive: false + + - id: a3mega_startup + source: modules/scripts/startup-script + settings: + # When shutting down a VM with local SSD disks, we strongly recommend the + # automatic migration of data following these instructions: + # https://cloud.google.com/compute/docs/disks/local-ssd#stop_instance + # Failure to do will result in VMs that lose data and do not automatically + # mount local SSD filesystems + local_ssd_filesystem: + mountpoint: $(vars.localssd_mountpoint) + permissions: "1777" # must quote numeric filesystem permissions! 
+ # Docker was successfully installed in the image, this configures it + # to use the A3-specific local SSD volumes to store container images + docker: + enabled: true + world_writable: true + daemon_config: | + { + "data-root": "$(vars.localssd_mountpoint)/docker" + } + runners: $(flatten([vars.a3m_runners, vars.gcsfuse_lssd_runners])) + + - id: a3mega_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: + - sysnet + - gpunets + settings: + node_count_static: $(vars.a3mega_cluster_size) + node_count_dynamic_max: 0 + disk_type: pd-ssd + machine_type: a3-megagpu-8g + enable_public_ips: false + enable_smt: true + node_conf: + CoresPerSocket: 52 + ThreadsPerCore: 2 + on_host_maintenance: TERMINATE + bandwidth_tier: gvnic_enabled + reservation_name: $(vars.a3mega_reservation_name) + maintenance_interval: $(vars.a3mega_maintenance_interval) + startup_script: $(a3mega_startup.startup_script) + + - id: a3mega_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: + - a3mega_nodeset + settings: + partition_name: $(vars.a3mega_partition_name) + exclusive: false + is_default: true + partition_conf: + OverSubscribe: EXCLUSIVE + ResumeTimeout: 900 + SuspendTimeout: 600 + + - id: controller_startup + source: modules/scripts/startup-script + settings: + runners: $(flatten([vars.gcsfuse_runners, vars.controller_runners])) + + - id: login_startup + source: modules/scripts/startup-script + settings: + runners: $(flatten([vars.gcsfuse_runners, vars.login_runners])) + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: + - sysnet + settings: + name_prefix: login + disk_type: pd-balanced + machine_type: c2-standard-4 + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + use: + - sysnet + - a3mega_partition + - debug_partition + - slurm_login + - homefs + - data-bucket + settings: + machine_type: c2-standard-8 + enable_cleanup_compute: true + enable_external_prolog_epilog: true + slurm_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/long-prolog-slurm.conf.tpl + controller_startup_script: $(controller_startup.startup_script) + login_startup_script: $(login_startup.startup_script) + prolog_scripts: + - filename: set_hostname_for_enroot.sh + content: | + #!/bin/bash + hostname | tee /etc/hostname From 0d394b5d140039aad7fb0ab58c2ff9e1dfea4a7a Mon Sep 17 00:00:00 2001 From: Allison Ko Date: Fri, 10 Jan 2025 17:50:02 -0800 Subject: [PATCH 2/2] rename /gcs-rwx to /gcs --- .../machine-learning/a3-megagpu-8g/README.md | 2 +- .../slurm-a3mega-gcsfuse-lssd-cluster.yaml | 43 ++++++++++++++----- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/examples/machine-learning/a3-megagpu-8g/README.md b/examples/machine-learning/a3-megagpu-8g/README.md index 5bf1f36614..8a8d33a93d 100644 --- a/examples/machine-learning/a3-megagpu-8g/README.md +++ b/examples/machine-learning/a3-megagpu-8g/README.md @@ -7,4 +7,4 @@ these [instructions]. # GCSFuse with Local SSD cache -`slurm-a3mega-gcsfuse-lssd-cluster.yaml` reflects best practices for using GCSFuse for ML workloads. It is configured to mount all available GCS buckets on two mountpoints on a3-mega nodes. The `/gcs-rwx` mountpoint enables parallel downloads, intended for reading/writing checkpoints, logs, application outputs, model serving, or loading large files (e.g. squashfs files). 
The read-only `/gcs-ro` mountpoint disables parallel downloads and enables the list cache, intended for reading training data. Parallel downloads are not recommended for training workloads; see [GCSFuse documentation](https://cloud.google.com/storage/docs/cloud-storage-fuse/file-caching#parallel-downloads) for details. +`slurm-a3mega-gcsfuse-lssd-cluster.yaml` reflects best practices for using GCSFuse for ML workloads. It is configured to mount all available GCS buckets on two mountpoints on a3-mega nodes. The `/gcs` mountpoint enables parallel downloads, intended for reading/writing checkpoints, logs, application outputs, model serving, or loading large files (e.g. squashfs files). The read-only `/gcs-ro` mountpoint disables parallel downloads and enables the list cache, intended for reading training data. Parallel downloads are not recommended for training workloads; see [GCSFuse documentation](https://cloud.google.com/storage/docs/cloud-storage-fuse/file-caching#parallel-downloads) for details. diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-gcsfuse-lssd-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-gcsfuse-lssd-cluster.yaml index 698524ee0d..f79fcae5c6 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-gcsfuse-lssd-cluster.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-gcsfuse-lssd-cluster.yaml @@ -90,7 +90,7 @@ vars: tasks: - name: Create gcsfuse rwx configuration ansible.builtin.copy: - dest: /etc/gcsfuse-rwx.yml + dest: /etc/gcsfuse.yml owner: root group: root mode: 0o644 @@ -130,37 +130,58 @@ vars: kernel-list-cache-ttl-secs: 60 foreground: true - - name: Create gcsfuse@ systemd service + - name: Create gcsfuse systemd service + ansible.builtin.copy: + dest: /etc/systemd/system/gcsfuse.service + owner: root + group: root + mode: 0o644 + content: | + [Unit] + Description=gcsfuse mount of all buckets + After=local-fs.target + + [Service] + Type=simple + User=root + ExecStartPre=/bin/mkdir -p /gcs + ExecStart=gcsfuse --config-file /etc/gcsfuse.yml /gcs + ExecStop=fusermount3 -u /gcs + + [Install] + WantedBy=slurmd.service multi-user.target + + - name: Create gcsfuse-ro systemd service ansible.builtin.copy: - dest: /etc/systemd/system/gcsfuse@.service + dest: /etc/systemd/system/gcsfuse-ro.service owner: root group: root mode: 0o644 content: | [Unit] - Description=gcsfuse %i mount of all buckets + Description=gcsfuse ro mount of all buckets After=local-fs.target [Service] Type=simple User=root - ExecStartPre=/bin/mkdir -p /gcs-%i - ExecStart=gcsfuse --config-file /etc/gcsfuse-%i.yml /gcs-%i - ExecStop=fusermount3 -u /gcs-%i + ExecStartPre=/bin/mkdir -p /gcs-ro + ExecStart=gcsfuse --config-file /etc/gcsfuse-ro.yml /gcs-ro + ExecStop=fusermount3 -u /gcs-ro [Install] WantedBy=slurmd.service multi-user.target post_tasks: - - name: Enable and restart gcsfuse@rwx + - name: Enable and restart gcsfuse ansible.builtin.service: - name: gcsfuse@rwx.service + name: gcsfuse.service state: restarted enabled: true - - name: Enable and restart gcsfuse@ro + - name: Enable and restart gcsfuse-ro ansible.builtin.service: - name: gcsfuse@ro.service + name: gcsfuse-ro.service state: restarted enabled: true
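
As a usage sketch, the batch script below shows how a job on this cluster might read training data through the read-only `/gcs-ro` mount while writing checkpoints through the parallel-download-enabled `/gcs` mount. The bucket name, dataset layout, and `train.py` entry point are hypothetical placeholders, and the partition name assumes the blueprint default `a3mega`.

#!/bin/bash
#SBATCH --partition=a3mega       # assumes the default vars.a3mega_partition_name
#SBATCH --nodes=2
#SBATCH --gpus-per-node=8

# gcsfuse mounts every accessible bucket as a subdirectory of the mountpoint,
# so paths take the form /gcs/<bucket>/... and /gcs-ro/<bucket>/...
BUCKET=my-training-bucket                 # hypothetical bucket name
DATA_DIR=/gcs-ro/${BUCKET}/dataset        # read-only mount: list cache enabled, no parallel downloads
CKPT_DIR=/gcs/${BUCKET}/checkpoints       # read-write mount: parallel downloads for large files
mkdir -p "${CKPT_DIR}"

# train.py is a stand-in for the actual workload
srun python3 train.py \
  --data-dir "${DATA_DIR}" \
  --checkpoint-dir "${CKPT_DIR}"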