diff --git a/examples/machine-learning/a3-megagpu-8g/README.md b/examples/machine-learning/a3-megagpu-8g/README.md index 79202023c9..8a8d33a93d 100644 --- a/examples/machine-learning/a3-megagpu-8g/README.md +++ b/examples/machine-learning/a3-megagpu-8g/README.md @@ -4,3 +4,7 @@ To deploy an a3-megagpu-8g cluster running Slurm on Google Cloud, please follow these [instructions]. [instructions]: https://cloud.google.com/cluster-toolkit/docs/deploy/deploy-a3-mega-cluster + +# GCSFuse with Local SSD cache + +`slurm-a3mega-gcsfuse-lssd-cluster.yaml` reflects best practices for using GCSFuse for ML workloads. It is configured to mount all available GCS buckets on two mountpoints on a3-mega nodes. The `/gcs` mountpoint enables parallel downloads, intended for reading/writing checkpoints, logs, application outputs, model serving, or loading large files (e.g. squashfs files). The read-only `/gcs-ro` mountpoint disables parallel downloads and enables the list cache, intended for reading training data. Parallel downloads are not recommended for training workloads; see [GCSFuse documentation](https://cloud.google.com/storage/docs/cloud-storage-fuse/file-caching#parallel-downloads) for details. diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-gcsfuse-lssd-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-gcsfuse-lssd-cluster.yaml new file mode 100644 index 0000000000..f79fcae5c6 --- /dev/null +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-gcsfuse-lssd-cluster.yaml @@ -0,0 +1,439 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +blueprint_name: a3mega-cluster + +# this blueprint should be used with the extra variables defined in +# deployment-image-cluster.yaml +vars: + deployment_name: a3mega-cluster + a3mega_partition_name: a3mega + a3mega_maintenance_interval: "" + enable_placement: false + remote_mount_homefs: /nfsshare + local_mount_homefs: /home + instance_image_custom: true + instance_image: + family: $(vars.final_image_family) + project: $(vars.project_id) + enable_login_public_ips: true + enable_controller_public_ips: true + localssd_mountpoint: /mnt/localssd + + gcsfuse_runners: + - type: ansible-local + destination: gcsfuse.yml + content: | + --- + - name: Create gcsfuse configuration + ansible.builtin.copy: + dest: /etc/gcsfuse.yml + owner: root + group: root + mode: 0o644 + content: | + file-system: + dir-mode: "777" + file-mode: "777" + rename-dir-limit: 0 # Set to 20000 for hierarchical buckets + fuse-options: allow_other + foreground: true + + - name: Create gcsfuse systemd service + ansible.builtin.copy: + dest: /etc/systemd/system/gcsfuse.service + owner: root + group: root + mode: 0o644 + content: | + [Unit] + Description=gcsfuse mount of all buckets + After=local-fs.target + + [Service] + Type=simple + User=root + ExecStartPre=/bin/mkdir -p /gcs + ExecStart=gcsfuse --config-file /etc/gcsfuse.yml /gcs + ExecStop=fusermount3 -u /gcs + + [Install] + WantedBy=slurmd.service multi-user.target + + post_tasks: + - name: Enable and restart gcsfuse + ansible.builtin.service: + name: gcsfuse.service + state: restarted + enabled: true + + gcsfuse_lssd_runners: + - type: ansible-local + destination: gcsfuse.yml + content: | + --- + - name: Create LSSD optimized gcsfuse mount + hosts: localhost + become: true + tasks: + - name: Create gcsfuse rwx configuration + ansible.builtin.copy: + dest: /etc/gcsfuse.yml + owner: root + group: root + mode: 0o644 + content: | + file-cache: + max-size-mb: -1 + enable-parallel-downloads: true + download-chunk-size-mb: 50 + parallel-downloads-per-file: 16 + cache-dir: /mnt/localssd + file-system: + dir-mode: "777" + file-mode: "777" + rename-dir-limit: 0 # Set to 20000 for hierarchical buckets + temp-dir: /mnt/localssd + fuse-options: allow_other + foreground: true + + - name: Create gcsfuse ro configuration + ansible.builtin.copy: + dest: /etc/gcsfuse-ro.yml + owner: root + group: root + mode: 0o644 + content: | + file-cache: + max-size-mb: -1 + metadata-cache: + ttl-secs: 3600 + cache-dir: /mnt/localssd + file-system: + dir-mode: "555" # need 5 on dir to enable ls + file-mode: "444" + rename-dir-limit: 0 # Set to 20000 for hierarchical buckets + temp-dir: /mnt/localssd + fuse-options: allow_other + kernel-list-cache-ttl-secs: 60 + foreground: true + + - name: Create gcsfuse systemd service + ansible.builtin.copy: + dest: /etc/systemd/system/gcsfuse.service + owner: root + group: root + mode: 0o644 + content: | + [Unit] + Description=gcsfuse mount of all buckets + After=local-fs.target + + [Service] + Type=simple + User=root + ExecStartPre=/bin/mkdir -p /gcs + ExecStart=gcsfuse --config-file /etc/gcsfuse.yml /gcs + ExecStop=fusermount3 -u /gcs + + [Install] + WantedBy=slurmd.service multi-user.target + + - name: Create gcsfuse-ro systemd service + ansible.builtin.copy: + dest: /etc/systemd/system/gcsfuse-ro.service + owner: root + group: root + mode: 0o644 + content: | + [Unit] + Description=gcsfuse ro mount of all buckets + After=local-fs.target + + [Service] + Type=simple + User=root + ExecStartPre=/bin/mkdir -p /gcs-ro + ExecStart=gcsfuse --config-file /etc/gcsfuse-ro.yml /gcs-ro + ExecStop=fusermount3 -u /gcs-ro + + [Install] + WantedBy=slurmd.service multi-user.target + + post_tasks: + - name: Enable and restart gcsfuse + ansible.builtin.service: + name: gcsfuse.service + state: restarted + enabled: true + + - name: Enable and restart gcsfuse-ro + ansible.builtin.service: + name: gcsfuse-ro.service + state: restarted + enabled: true + + a3m_runners: + - type: ansible-local + destination: slurm_aperture.yml + content: | + --- + - name: Configure Slurm to depend upon aperture devices + hosts: all + become: true + vars: {} + tasks: + - name: Ensure slurmd starts after aperture devices are ready + ansible.builtin.copy: + dest: /etc/systemd/system/slurmd.service.d/aperture.conf + owner: root + group: root + mode: 0o644 + content: | + [Service] + ExecCondition=/usr/bin/test -d /dev/aperture_devices/ + notify: Reload SystemD + handlers: + - name: Reload SystemD + ansible.builtin.systemd: + daemon_reload: true + - type: ansible-local + destination: enable_dcgm.yml + content: | + --- + - name: Enable NVIDIA DCGM on GPU nodes + hosts: all + become: true + vars: + enable_ops_agent: $(vars.enable_ops_agent) + enable_nvidia_dcgm: $(vars.enable_nvidia_dcgm) + tasks: + - name: Update Ops Agent configuration + ansible.builtin.blockinfile: + path: /etc/google-cloud-ops-agent/config.yaml + insertafter: EOF + block: | + metrics: + receivers: + dcgm: + type: dcgm + service: + pipelines: + dcgm: + receivers: + - dcgm + notify: + - Restart Google Cloud Ops Agent + handlers: + - name: Restart Google Cloud Ops Agent + ansible.builtin.service: + name: google-cloud-ops-agent.service + state: "{{ 'restarted' if enable_ops_agent else 'stopped' }}" + enabled: "{{ enable_ops_agent }}" + post_tasks: + - name: Enable Google Cloud Ops Agent + ansible.builtin.service: + name: google-cloud-ops-agent.service + state: "{{ 'started' if enable_ops_agent else 'stopped' }}" + enabled: "{{ enable_ops_agent }}" + - name: Enable NVIDIA DCGM + ansible.builtin.service: + name: nvidia-dcgm.service + state: "{{ 'started' if enable_nvidia_dcgm else 'stopped' }}" + enabled: "{{ enable_nvidia_dcgm }}" + + login_runners: + - type: shell + destination: reset_enroot.sh + content: | + #!/bin/bash + # reset enroot to defaults of files under /home and running under /run + # allows basic enroot testing with reduced I/O performance + rm -f /etc/enroot/enroot.conf + + controller_runners: + - type: shell + destination: stage_scripts.sh + content: | + #!/bin/bash + SLURM_ROOT=/opt/apps/adm/slurm + mkdir -m 0755 -p "${SLURM_ROOT}/scripts" + mkdir -p "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-prolog_slurmd.d" + mkdir -p "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d" + curl -s -o "${SLURM_ROOT}/scripts/sudo-oslogin" \ + https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/master/tools/prologs-epilogs/sudo-oslogin + chmod 0755 "${SLURM_ROOT}/scripts/sudo-oslogin" + ln -s "${SLURM_ROOT}/scripts/sudo-oslogin" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-prolog_slurmd.d/sudo-oslogin.prolog_slurmd" + ln -s "${SLURM_ROOT}/scripts/sudo-oslogin" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/sudo-oslogin.epilog_slurmd" + curl -s -o "${SLURM_ROOT}/scripts/rxdm" \ + https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/master/tools/prologs-epilogs/receive-data-path-manager-mega + chmod 0755 "${SLURM_ROOT}/scripts/rxdm" + ln -s "${SLURM_ROOT}/scripts/rxdm" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-prolog_slurmd.d/rxdm.prolog_slurmd" + ln -s "${SLURM_ROOT}/scripts/rxdm" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/rxdm.epilog_slurmd" + # Uncomment the line below to enable epilog that will check health of GPUs and drain node if problem is detected. + # ln -s "/slurm/scripts/tools/gpu-test" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/gpu-test.epilog_slurmd" + - type: shell + destination: reset_enroot.sh + content: | + #!/bin/bash + # reset enroot to defaults of files under /home and running under /run + # allows basic enroot testing with reduced I/O performance + rm -f /etc/enroot/enroot.conf + +deployment_groups: +- group: cluster + modules: + - id: sysnet + source: modules/network/pre-existing-vpc + settings: + network_name: $(vars.network_name_system) + subnetwork_name: $(vars.subnetwork_name_system) + + # if an existing bucket is desired, follow modules/file-system/pre-existing-network-storage/README.md + - id: data-bucket + source: community/modules/file-system/cloud-storage-bucket + settings: + local_mount: /gcs + mount_options: defaults,rw,_netdev,implicit_dirs,allow_other,implicit_dirs,file_mode=777,dir_mode=777 + random_suffix: true + + - id: gpunets + source: modules/network/multivpc + settings: + network_name_prefix: $(vars.deployment_name)-gpunet + global_ip_address_range: 10.0.0.0/9 + network_count: 8 + subnetwork_cidr_suffix: 20 + + - id: homefs + source: modules/file-system/pre-existing-network-storage + settings: + server_ip: $(vars.server_ip_homefs) + remote_mount: $(vars.remote_mount_homefs) + local_mount: $(vars.local_mount_homefs) + mount_options: "defaults,hard" + + - id: debug_nodeset + use: [sysnet] + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + settings: + node_count_static: 0 + node_count_dynamic_max: 4 + machine_type: n2-standard-2 + + - id: debug_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: + - debug_nodeset + settings: + partition_name: debug + exclusive: false + + - id: a3mega_startup + source: modules/scripts/startup-script + settings: + # When shutting down a VM with local SSD disks, we strongly recommend the + # automatic migration of data following these instructions: + # https://cloud.google.com/compute/docs/disks/local-ssd#stop_instance + # Failure to do will result in VMs that lose data and do not automatically + # mount local SSD filesystems + local_ssd_filesystem: + mountpoint: $(vars.localssd_mountpoint) + permissions: "1777" # must quote numeric filesystem permissions! + # Docker was successfully installed in the image, this configures it + # to use the A3-specific local SSD volumes to store container images + docker: + enabled: true + world_writable: true + daemon_config: | + { + "data-root": "$(vars.localssd_mountpoint)/docker" + } + runners: $(flatten([vars.a3m_runners, vars.gcsfuse_lssd_runners])) + + - id: a3mega_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: + - sysnet + - gpunets + settings: + node_count_static: $(vars.a3mega_cluster_size) + node_count_dynamic_max: 0 + disk_type: pd-ssd + machine_type: a3-megagpu-8g + enable_public_ips: false + enable_smt: true + node_conf: + CoresPerSocket: 52 + ThreadsPerCore: 2 + on_host_maintenance: TERMINATE + bandwidth_tier: gvnic_enabled + reservation_name: $(vars.a3mega_reservation_name) + maintenance_interval: $(vars.a3mega_maintenance_interval) + startup_script: $(a3mega_startup.startup_script) + + - id: a3mega_partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition + use: + - a3mega_nodeset + settings: + partition_name: $(vars.a3mega_partition_name) + exclusive: false + is_default: true + partition_conf: + OverSubscribe: EXCLUSIVE + ResumeTimeout: 900 + SuspendTimeout: 600 + + - id: controller_startup + source: modules/scripts/startup-script + settings: + runners: $(flatten([vars.gcsfuse_runners, vars.controller_runners])) + + - id: login_startup + source: modules/scripts/startup-script + settings: + runners: $(flatten([vars.gcsfuse_runners, vars.login_runners])) + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login + use: + - sysnet + settings: + name_prefix: login + disk_type: pd-balanced + machine_type: c2-standard-4 + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + use: + - sysnet + - a3mega_partition + - debug_partition + - slurm_login + - homefs + - data-bucket + settings: + machine_type: c2-standard-8 + enable_cleanup_compute: true + enable_external_prolog_epilog: true + slurm_conf_tpl: modules/embedded/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/long-prolog-slurm.conf.tpl + controller_startup_script: $(controller_startup.startup_script) + login_startup_script: $(login_startup.startup_script) + prolog_scripts: + - filename: set_hostname_for_enroot.sh + content: | + #!/bin/bash + hostname | tee /etc/hostname