From 38570da598f7f52cc6b64bfd36e2ad4c61c486c3 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Mon, 6 Jan 2025 20:46:54 +0000 Subject: [PATCH] Remove Slurm-gcp v5 modules and update documentation --- cmd/create.go | 17 - .../gcs_bucket/webserver/startup.sh | 1 - .../schedmd-slurm-gcp-v5-node-group/README.md | 174 ----- .../gpu_definition.tf | 58 -- .../schedmd-slurm-gcp-v5-node-group/main.tf | 96 --- .../metadata.yaml | 18 - .../outputs.tf | 29 - .../source_image_logic.tf | 78 -- .../variables.tf | 442 ------------ .../versions.tf | 28 - .../README.md | 104 --- .../main.tf | 46 -- .../metadata.yaml | 18 - .../outputs.tf | 23 - .../variables.tf | 94 --- .../versions.tf | 19 - .../schedmd-slurm-gcp-v5-partition/README.md | 192 ----- .../schedmd-slurm-gcp-v5-partition/main.tf | 88 --- .../metadata.yaml | 19 - .../schedmd-slurm-gcp-v5-partition/outputs.tf | 42 -- .../variables.tf | 270 ------- .../versions.tf | 28 - .../schedmd-slurm-gcp-v5-controller/README.md | 307 -------- .../etc/htc-slurm.conf.tpl | 67 -- .../etc/htc-slurmdbd.conf.tpl | 34 - .../etc/long-prolog-slurm.conf.tpl | 70 -- .../gpu_definition.tf | 58 -- .../schedmd-slurm-gcp-v5-controller/main.tf | 137 ---- .../metadata.yaml | 22 - .../outputs.tf | 30 - .../source_image_logic.tf | 78 -- .../variables.tf | 671 ------------------ .../versions.tf | 28 - .../schedmd-slurm-gcp-v5-hybrid/README.md | 230 ------ .../schedmd-slurm-gcp-v5-hybrid/main.tf | 59 -- .../schedmd-slurm-gcp-v5-hybrid/metadata.yaml | 20 - .../schedmd-slurm-gcp-v5-hybrid/variables.tf | 344 --------- .../schedmd-slurm-gcp-v5-hybrid/versions.tf | 19 - .../schedmd-slurm-gcp-v5-login/README.md | 153 ---- .../gpu_definition.tf | 58 -- .../schedmd-slurm-gcp-v5-login/main.tf | 116 --- .../schedmd-slurm-gcp-v5-login/metadata.yaml | 19 - .../source_image_logic.tf | 78 -- .../schedmd-slurm-gcp-v5-login/variables.tf | 429 ----------- .../schedmd-slurm-gcp-v5-login/versions.tf | 28 - .../schedmd-slurm-gcp-v6-controller/README.md | 9 +- .../schedmd-slurm-gcp-v6-login/README.md | 2 +- .../modules/scripts/spack-setup/README.md | 2 +- docs/gpu-support.md | 23 +- docs/slurm-troubleshooting.md | 63 +- docs/vm-images.md | 30 +- examples/hpc-enterprise-slurm.yaml | 2 +- modules/README.md | 22 +- pkg/modulereader/metadata_legacy.go | 16 - tools/cloud-build/project-cleanup-slurm.yaml | 5 +- tools/duplicate-diff.py | 13 +- .../configs/versioned_blueprint.yaml | 2 +- tools/validate_configs/validate_configs.sh | 8 + 58 files changed, 47 insertions(+), 5089 deletions(-) delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-node-group/metadata.yaml delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-node-group/outputs.tf delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/metadata.yaml delete mode 100644 
community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/outputs.tf delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/versions.tf delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-partition/metadata.yaml delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-partition/outputs.tf delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf delete mode 100644 community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurm.conf.tpl delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurmdbd.conf.tpl delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/long-prolog-slurm.conf.tpl delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-controller/metadata.yaml delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-controller/outputs.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/metadata.yaml delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/variables.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/versions.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-login/metadata.yaml delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf delete mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf diff --git a/cmd/create.go b/cmd/create.go index 3ea151cdcd..0b18714fe0 100644 --- a/cmd/create.go +++ b/cmd/create.go @@ -125,27 +125,10 @@ func expandOrDie(path string) (config.Blueprint, *config.YamlCtx) { // Expand the blueprint checkErr(bp.Expand(), ctx) validateMaybeDie(bp, *ctx) - v5DeprecationWarning(bp) return bp, ctx } -// TODO: Remove this warning when v5 deprecation is complete -func v5DeprecationWarning(bp config.Blueprint) { - alreadyContainsV5 := false - bp.WalkModulesSafe(func(mp config.ModulePath, m *config.Module) { - if strings.Contains(m.Source, "schedmd-slurm-gcp-v5-controller") && !alreadyContainsV5 { - logging.Info("%s", 
boldYellow( - "We have been supporting slurm-gcp v5 since July 2022 and are now deprecating it, as we've launched slurm-gcp v6 in June 2024. \n"+ - "Toolkit blueprints using Slurm-gcp v5 will be marked “deprecated” starting October 2024 and slurm-gcp v6 will be the default deployment. \n"+ - "However we won't begin removing slurm-gcp v5 blueprints until January 6, 2025. Beginning on January 6, 2025, the Cluster Toolkit team will cease their support for Slurm-gcp v5. \n"+ - "While this will not directly or immediately impact running clusters, we recommend replacing any v5 clusters with Slurm-gcp v6.", - )) - alreadyContainsV5 = true // This is to avoid the logging message showing repeatedly for multiple v5 controllers - } - }) -} - // TODO: move to expand.go func validateMaybeDie(bp config.Blueprint, ctx config.YamlCtx) { err := validators.Execute(bp) diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh index dd92f7641f..35e5077653 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh @@ -76,7 +76,6 @@ EOL dnf install -y grafana -# Packages for https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/community/modules/scheduler/schedmd-slurm-gcp-v5-controller#input_enable_cleanup_compute pip3.8 install google-api-python-client \ google-cloud-secret-manager \ google.cloud.pubsub \ diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md deleted file mode 100644 index bc54d36396..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md +++ /dev/null @@ -1,174 +0,0 @@ -## Description - -> [!NOTE] -> Slurm-gcp-v5-node-group module is deprecated. See -> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) -> for specific recommendations and timelines. - -This module creates a node group data structure intended to be input to the -[schedmd-slurm-gcp-v5-partition](../schedmd-slurm-gcp-v5-partition/) module. - -Node groups allow adding heterogeneous node types to a partition, and hence -running jobs that mix multiple node characteristics. See the [heterogeneous jobs -section][hetjobs] of the SchedMD documentation for more information. - -To specify nodes from a specific node group in a partition, the [`--nodelist`] -(or `-w`) flag can be used, for example: - -```bash -srun -N 3 -p compute --nodelist cluster-compute-group-[0-2] hostname -``` - -Where the 3 nodes will be selected from the nodes `cluster-compute-group-[0-2]` -in the compute partition. - -Additionally, depending on how the nodes differ, a constraint can be added via -the [`--constraint`] (or `-C`) flag or other flags such as `--mincpus` can be -used to specify nodes with the desired characteristics. 
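For instance, the following sketch selects nodes by feature and by CPU count (the `highmem` feature name is hypothetical and would need to be defined on the nodes; `--constraint` and `--mincpus` are standard Slurm flags):

```bash
# Request 2 nodes advertising the (hypothetical) "highmem" feature
srun -N 2 -p compute --constraint highmem hostname

# Request 2 nodes with at least 30 CPUs each, whichever node group provides them
srun -N 2 -p compute --mincpus 30 hostname
```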
- -[`--nodelist`]: https://slurm.schedmd.com/srun.html#OPT_nodelist -[`--constraint`]: https://slurm.schedmd.com/srun.html#OPT_constraint -[hetjobs]: https://slurm.schedmd.com/heterogeneous_jobs.html - -### Example - -The following code snippet creates a partition module using the `node-group` -module as input with: - -* a max node count of 200 -* VM machine type of `c2-standard-30` -* partition name of "compute" -* default group name of "ghpc" -* connected to the `network1` module via `use` -* nodes mounted to homefs via `use` - -```yaml -- id: node_group - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - node_count_dynamic_max: 200 - machine_type: c2-standard-30 - -- id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - node_group - settings: - partition_name: compute -``` - -## Custom Images - -For more information on creating valid custom images for the node group VM -instances or for custom instance templates, see our [vm-images.md] documentation -page. - -[vm-images.md]: ../../../../docs/vm-images.md#slurm-on-gcp-custom-images - -## GPU Support - -More information on GPU support in Slurm on GCP and other Cluster Toolkit modules -can be found at [docs/gpu-support.md](../../../../docs/gpu-support.md) - -## Support -The Cluster Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform -modules. For support with the underlying modules, see the instructions in the -[slurm-gcp README][slurm-gcp-readme]. - -[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform - -## License - -Copyright 2023 Google LLC - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 1.1 | -| [google](#requirement\_google) | >= 5.11 | - -## Providers - -| Name | Version | -|------|---------| -| [google](#provider\_google) | >= 5.11 | - -## Modules - -No modules. - -## Resources - -| Name | Type | -|------|------| -| [google_compute_default_service_account.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | -| [google_compute_image.slurm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the node group instances can be accessed via the internet. |
<pre>list(object({<br>  nat_ip = string<br>  network_tier = string<br>}))</pre> | `[]` | no |
-| [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. | <pre>list(object({<br>  disk_name = string<br>  device_name = string<br>  disk_size_gb = number<br>  disk_type = string<br>  disk_labels = map(string)<br>  auto_delete = bool<br>  boot = bool<br>}))</pre> | `[]` | no |
-| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. | <pre>list(object({<br>  network = string<br>  subnetwork = string<br>  subnetwork_project = string<br>  network_ip = string<br>  nic_type = string<br>  stack_type = string<br>  queue_count = number<br>  access_config = list(object({<br>    nat_ip = string<br>    network_tier = string<br>  }))<br>  ipv6_access_config = list(object({<br>    network_tier = string<br>  }))<br>  alias_ip_range = list(object({<br>    ip_cidr_range = string<br>    subnetwork_range_name = string<br>  }))<br>}))</pre> | `[]` | no |
-| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is<br>only available on supported images (or images derived from them). For more details, see<br>https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no |
-| [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.<br>- Setting `platform_default` respects the Google Cloud Platform API default values for networking.<br>- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.<br>- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).<br>- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.<br>- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.<br>- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no |
-| [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no |
-| [disable\_public\_ips](#input\_disable\_public\_ips) | If set to false. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no |
-| [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no |
-| [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no |
-| [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no |
-| [disk\_type](#input\_disk\_type) | Boot disk type. | `string` | `"pd-standard"` | no |
-| [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no |
-| [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.<br>See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no |
-| [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no |
-| [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no |
-| [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no |
-| [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator | <pre>object({<br>  type = string<br>  count = number<br>})</pre> | `null` | no |
-| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. | <pre>list(object({<br>  type = string,<br>  count = number<br>}))</pre> | `[]` | no |
-| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.<br><br>Expected Fields:<br>name: The name of the image. Mutually exclusive with family.<br>family: The image family to use. Mutually exclusive with name.<br>project: The project where the image is hosted.<br><br>For more information on creating custom images that comply with Slurm on GCP<br>see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` | <pre>{<br>  "family": "slurm-gcp-5-12-hpc-centos-7",<br>  "project": "schedmd-slurm-public"<br>}</pre> | no |
-| [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting<br>to use a custom and potentially incompatible image for this Slurm on<br>GCP module.<br><br>If the field is set to false, only the compatible families and project<br>names will be accepted. The deployment will fail with any other image<br>family or name. If set to true, no checks will be done.<br><br>See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no |
-| [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition<br>variables such as machine\_type and instance\_image will be ignored in favor<br>of the provided instance template.<br><br>For more information on creating custom images for the instance template<br>that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section<br>in docs/vm-images.md. | `string` | `null` | no |
-| [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no |
-| [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no |
-| [maintenance\_interval](#input\_maintenance\_interval) | Specifies the frequency of planned maintenance events. Must be "PERIODIC" or empty string to not use this feature. | `string` | `""` | no |
-| [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no |
-| [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no |
-| [name](#input\_name) | Name of the node group. | `string` | `"ghpc"` | no |
-| [node\_conf](#input\_node\_conf) | Map of Slurm node line configuration. | `map(any)` | `{}` | no |
-| [node\_count\_dynamic\_max](#input\_node\_count\_dynamic\_max) | Maximum number of auto-scaling nodes allowed in this partition. | `number` | `10` | no |
-| [node\_count\_static](#input\_node\_count\_static) | Number of nodes to be statically created. | `number` | `0` | no |
-| [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy.<br><br>Note: Placement groups are not supported when on\_host\_maintenance is set to<br>"MIGRATE" and will be deactivated regardless of the value of<br>enable\_placement. To support enable\_placement, ensure on\_host\_maintenance is<br>set to "TERMINATE". | `string` | `"TERMINATE"` | no |
-| [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no |
-| [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes |
-| [reservation\_name](#input\_reservation\_name) | Name of the reservation to use for VM resources<br>- Must be a "SPECIFIC" reservation<br>- Set to empty string if using no reservation or automatically-consumed reservations | `string` | `""` | no |
-| [service\_account](#input\_service\_account) | Service account to attach to the compute instances. If not set, the<br>default compute service account for the given project will be used with the<br>"https://www.googleapis.com/auth/cloud-platform" scope. | <pre>object({<br>  email = string<br>  scopes = set(string)<br>})</pre> | `null` | no |
-| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless<br>enable\_shielded\_vm is 'true'.<br>- enable\_integrity\_monitoring : Compare the most recent boot measurements to the<br>integrity policy baseline and return a pair of pass/fail results depending on<br>whether they match or not.<br>- enable\_secure\_boot : Verify the digital signature of all boot components, and<br>halt the boot process if signature verification fails.<br>- enable\_vtpm : Use a virtualized trusted platform module, which is a<br>specialized computer chip you can use to encrypt objects like keys and<br>certificates. | <pre>object({<br>  enable_integrity_monitoring = bool<br>  enable_secure_boot = bool<br>  enable_vtpm = bool<br>})</pre> | <pre>{<br>  "enable_integrity_monitoring": true,<br>  "enable_secure_boot": true,<br>  "enable_vtpm": true<br>}</pre> | no |
-| [source\_image](#input\_source\_image) | DEPRECATED: Use `instance_image` instead. | `string` | `null` | no |
-| [source\_image\_family](#input\_source\_image\_family) | DEPRECATED: Use `instance_image` instead. | `string` | `null` | no |
-| [source\_image\_project](#input\_source\_image\_project) | DEPRECATED: Use `instance_image` instead. | `string` | `null` | no |
-| [spot\_instance\_config](#input\_spot\_instance\_config) | Configuration for spot VMs. | <pre>object({<br>  termination_action = string<br>})</pre>
| `null` | no | -| [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [node\_groups](#output\_node\_groups) | Details of the node group. Typically used as input to `schedmd-slurm-gcp-v5-partition`. | - diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf deleted file mode 100644 index ae8b93e4d3..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf +++ /dev/null @@ -1,96 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, 
Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -locals { - # This label allows for billing report tracking based on module. - labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v5-node-group", ghpc_role = "compute" }) -} - -locals { - disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } - - metadata = merge( - local.disable_automatic_updates_metadata, - var.metadata - ) - - enable_public_ip_access_config = var.disable_public_ips ? [] : [{ nat_ip = null, network_tier = null }] - access_config = length(var.access_config) == 0 ? local.enable_public_ip_access_config : var.access_config - - additional_disks = [ - for ad in var.additional_disks : { - disk_name = ad.disk_name - device_name = ad.device_name - disk_type = ad.disk_type - disk_size_gb = ad.disk_size_gb - disk_labels = merge(ad.disk_labels, local.labels) - auto_delete = ad.auto_delete - boot = ad.boot - } - ] - - node_group = { - # Group Definition - group_name = var.name - node_count_dynamic_max = var.node_count_dynamic_max - node_count_static = var.node_count_static - node_conf = var.node_conf - - # Template By Definition - additional_disks = local.additional_disks - additional_networks = var.additional_networks - bandwidth_tier = var.bandwidth_tier - can_ip_forward = var.can_ip_forward - disable_smt = !var.enable_smt - disk_auto_delete = var.disk_auto_delete - disk_labels = merge(local.labels, var.disk_labels) - disk_size_gb = var.disk_size_gb - disk_type = var.disk_type - enable_confidential_vm = var.enable_confidential_vm - enable_oslogin = var.enable_oslogin - enable_shielded_vm = var.enable_shielded_vm - gpu = one(local.guest_accelerator) - labels = local.labels - machine_type = var.machine_type - maintenance_interval = var.maintenance_interval - metadata = local.metadata - min_cpu_platform = var.min_cpu_platform - on_host_maintenance = var.on_host_maintenance - preemptible = var.preemptible - reservation_name = var.reservation_name - shielded_instance_config = var.shielded_instance_config - source_image_family = local.source_image_family # requires source_image_logic.tf - source_image_project = local.source_image_project_normalized # requires source_image_logic.tf - source_image = local.source_image # requires source_image_logic.tf - tags = var.tags - access_config = local.access_config - service_account = var.service_account != null ? 
var.service_account : { - email = data.google_compute_default_service_account.default.email - scopes = ["https://www.googleapis.com/auth/cloud-platform"] - } - - # Spot VM settings - enable_spot_vm = var.enable_spot_vm - spot_instance_config = var.spot_instance_config - - # Template By Source - instance_template = var.instance_template - } -} - -data "google_compute_default_service_account" "default" { - project = var.project_id -} diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/metadata.yaml b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/metadata.yaml deleted file mode 100644 index 641832182d..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/metadata.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2023 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -spec: - requirements: - services: [] diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/outputs.tf deleted file mode 100644 index d289ee3554..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/outputs.tf +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "node_groups" { - description = "Details of the node group. Typically used as input to `schedmd-slurm-gcp-v5-partition`." - value = local.node_group - - precondition { - condition = !contains([ - "c3-:pd-standard", - "h3-:pd-standard", - "h3-:pd-ssd", - ], "${substr(var.machine_type, 0, 3)}:${var.disk_type}") - error_message = "A disk_type=${var.disk_type} cannot be used with machine_type=${var.machine_type}." - } -} diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf deleted file mode 100644 index 1df327a60b..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -locals { - # Currently supported images and projects - known_project_families = { - schedmd-slurm-public = [ - "slurm-gcp-5-12-debian-11", - "slurm-gcp-5-12-hpc-rocky-linux-8", - "slurm-gcp-5-12-ubuntu-2004-lts", - "slurm-gcp-5-12-ubuntu-2204-lts-arm64", - "slurm-gcp-5-12-hpc-centos-7" - ] - } - - # This approach to "hacking" the project name allows a chain of Terraform - # calls to set the instance source_image (boot disk) with a "relative - # resource name" that passes muster with VPC Service Control rules - # - # https://github.com/terraform-google-modules/terraform-google-vm/blob/735bd415fc5f034d46aa0de7922e8fada2327c0c/modules/instance_template/main.tf#L28 - # https://cloud.google.com/apis/design/resource_names#relative_resource_name - source_image_project_normalized = (can(var.instance_image.family) ? - "projects/${data.google_compute_image.slurm.project}/global/images/family" : - "projects/${data.google_compute_image.slurm.project}/global/images" - ) - source_image_family = can(var.instance_image.family) ? data.google_compute_image.slurm.family : "" - source_image = can(var.instance_image.name) ? data.google_compute_image.slurm.name : "" -} - -data "google_compute_image" "slurm" { - family = try(var.instance_image.family, null) - name = try(var.instance_image.name, null) - project = var.instance_image.project - - lifecycle { - precondition { - condition = length(regexall("^projects/.+?/global/images/family$", var.instance_image.project)) == 0 - error_message = "The \"project\" field in var.instance_image no longer supports a long-form ending in \"family\". Specify only the project ID." - } - - postcondition { - condition = var.instance_image_custom || contains(keys(local.known_project_families), self.project) - error_message = <<-EOD - Images in project ${self.project} are not published by SchedMD. Images must be created by compatible releases of the Terraform and Packer modules following the guidance at https://goo.gle/hpc-slurm-images. Set var.instance_image_custom to true to silence this error and acknowledge that you are using a compatible image. - EOD - } - postcondition { - condition = !contains(keys(local.known_project_families), self.project) || try(contains(local.known_project_families[self.project], self.family), false) - error_message = <<-EOD - Image family ${self.family} published by SchedMD in project ${self.project} is not compatible with this release of the Terraform Slurm modules. Select from known compatible releases: - ${join("\n", [for p in try(local.known_project_families[self.project], []) : "\t\"${p}\""])} - EOD - } - postcondition { - condition = var.disk_size_gb >= self.disk_size_gb - error_message = "'disk_size_gb: ${var.disk_size_gb}' is smaller than the image size (${self.disk_size_gb}GB), please increase the blueprint disk size" - } - postcondition { - # Condition needs to check the suffix of the license, as prefix contains an API version which can change. 
- # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "Disabling automatic updates is not supported with the selected VM image. More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" - } - } -} diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf deleted file mode 100644 index 86b9f8d021..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf +++ /dev/null @@ -1,442 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -# Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5 - -variable "project_id" { - description = "Project in which the HPC deployment will be created." - type = string -} - -## Node Group Definition - -variable "name" { - description = "Name of the node group." - type = string - default = "ghpc" - - validation { - condition = can(regex("^[a-z](?:[a-z0-9]{0,5})$", var.name)) - error_message = "Node group name (var.name) must begin with a letter, be fully alphanumeric and be 6 characters or less. Regexp: '^[a-z](?:[a-z0-9]{0,5})$'." - } -} - -variable "node_conf" { - description = "Map of Slurm node line configuration." - type = map(any) - default = {} -} - -variable "node_count_dynamic_max" { - description = "Maximum number of auto-scaling nodes allowed in this partition." - type = number - default = 10 -} - -variable "node_count_static" { - description = "Number of nodes to be statically created." - type = number - default = 0 -} - -## VM Definition - -variable "instance_template" { - description = <<-EOD - Self link to a custom instance template. If set, other VM definition - variables such as machine_type and instance_image will be ignored in favor - of the provided instance template. - - For more information on creating custom images for the instance template - that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section - in docs/vm-images.md. - EOD - type = string - default = null -} - -variable "machine_type" { - description = "Compute Platform machine type to use for this partition compute nodes." - type = string - default = "c2-standard-60" -} - -variable "metadata" { - type = map(string) - description = "Metadata, provided as a map." - default = {} -} - -variable "instance_image" { - description = <<-EOD - Defines the image that will be used in the Slurm node group VM instances. - - Expected Fields: - name: The name of the image. Mutually exclusive with family. - family: The image family to use. 
Mutually exclusive with name. - project: The project where the image is hosted. - - For more information on creating custom images that comply with Slurm on GCP - see the "Slurm on GCP Custom Images" section in docs/vm-images.md. - EOD - type = map(string) - default = { - project = "schedmd-slurm-public" - family = "slurm-gcp-5-12-hpc-centos-7" - } - - validation { - condition = can(coalesce(var.instance_image.project)) - error_message = "In var.instance_image, the \"project\" field must be a string set to the Cloud project ID." - } - - validation { - condition = can(coalesce(var.instance_image.name)) != can(coalesce(var.instance_image.family)) - error_message = "In var.instance_image, exactly one of \"family\" or \"name\" fields must be set to desired image family or name." - } -} - -variable "instance_image_custom" { - description = <<-EOD - A flag that designates that the user is aware that they are requesting - to use a custom and potentially incompatible image for this Slurm on - GCP module. - - If the field is set to false, only the compatible families and project - names will be accepted. The deployment will fail with any other image - family or name. If set to true, no checks will be done. - - See: https://goo.gle/hpc-slurm-images - EOD - type = bool - default = false -} - -# tflint-ignore: terraform_unused_declarations -variable "source_image_project" { - type = string - description = "DEPRECATED: Use `instance_image` instead." - default = null - validation { - condition = var.source_image_project == null - error_message = "Variable `source_image_project` is deprecated. Use `instance_image` instead." - } -} - -# tflint-ignore: terraform_unused_declarations -variable "source_image_family" { - type = string - description = "DEPRECATED: Use `instance_image` instead." - default = null - validation { - condition = var.source_image_family == null - error_message = "Variable `source_image_family` is deprecated. Use `instance_image` instead." - } -} - -# tflint-ignore: terraform_unused_declarations -variable "source_image" { - type = string - description = "DEPRECATED: Use `instance_image` instead." - default = null - validation { - condition = var.source_image == null - error_message = "Variable `source_image` is deprecated. Use `instance_image` instead." - } -} - -variable "tags" { - type = list(string) - description = "Network tag list." - default = [] -} - -variable "disk_type" { - description = "Boot disk type." - type = string - default = "pd-standard" -} - -variable "disk_size_gb" { - description = "Size of boot disk to create for the partition compute nodes." - type = number - default = 50 -} - -variable "disk_auto_delete" { - type = bool - description = "Whether or not the boot disk should be auto-deleted." - default = true -} - -variable "disk_labels" { - description = "Labels specific to the boot disk. These will be merged with var.labels." - type = map(string) - default = {} -} - -variable "additional_disks" { - description = "Configurations of additional disks to be included on the partition nodes." - type = list(object({ - disk_name = string - device_name = string - disk_size_gb = number - disk_type = string - disk_labels = map(string) - auto_delete = bool - boot = bool - })) - default = [] -} - -variable "enable_confidential_vm" { - type = bool - description = "Enable the Confidential VM configuration. Note: the instance image must support option." - default = false -} - -variable "enable_shielded_vm" { - type = bool - description = "Enable the Shielded VM configuration. 
Note: the instance image must support option." - default = false -} - -variable "enable_oslogin" { - type = bool - description = <<-EOD - Enables Google Cloud os-login for user login and authentication for VMs. - See https://cloud.google.com/compute/docs/oslogin - EOD - default = true -} - -variable "can_ip_forward" { - description = "Enable IP forwarding, for NAT instances for example." - type = bool - default = false -} - -variable "enable_smt" { - type = bool - description = "Enables Simultaneous Multi-Threading (SMT) on instance." - default = false -} - -variable "labels" { - description = "Labels to add to partition compute instances. Key-value pairs." - type = map(string) - default = {} -} - -variable "min_cpu_platform" { - description = "The name of the minimum CPU platform that you want the instance to use." - type = string - default = null -} - -variable "on_host_maintenance" { - type = string - description = <<-EOD - Instance availability Policy. - - Note: Placement groups are not supported when on_host_maintenance is set to - "MIGRATE" and will be deactivated regardless of the value of - enable_placement. To support enable_placement, ensure on_host_maintenance is - set to "TERMINATE". - EOD - default = "TERMINATE" -} - -# tflint-ignore: terraform_unused_declarations -variable "gpu" { - type = object({ - type = string - count = number - }) - description = "DEPRECATED: use var.guest_accelerator" - default = null - validation { - condition = var.gpu == null - error_message = "var.gpu is deprecated. Use var.guest_accelerator." - } -} - -variable "guest_accelerator" { - description = "List of the type and count of accelerator cards attached to the instance." - type = list(object({ - type = string, - count = number - })) - default = [] - nullable = false - - validation { - condition = length(var.guest_accelerator) <= 1 - error_message = "The Slurm modules supports 0 or 1 models of accelerator card on each node." - } -} - -variable "preemptible" { - description = "Should use preemptibles to burst." - type = bool - default = false -} - -variable "reservation_name" { - description = <<-EOD - Name of the reservation to use for VM resources - - Must be a "SPECIFIC" reservation - - Set to empty string if using no reservation or automatically-consumed reservations - EOD - type = string - default = "" - nullable = false -} - -variable "service_account" { - type = object({ - email = string - scopes = set(string) - }) - description = <<-EOD - Service account to attach to the compute instances. If not set, the - default compute service account for the given project will be used with the - "https://www.googleapis.com/auth/cloud-platform" scope. - EOD - default = null -} - -variable "shielded_instance_config" { - type = object({ - enable_integrity_monitoring = bool - enable_secure_boot = bool - enable_vtpm = bool - }) - description = <<-EOD - Shielded VM configuration for the instance. Note: not used unless - enable_shielded_vm is 'true'. - - enable_integrity_monitoring : Compare the most recent boot measurements to the - integrity policy baseline and return a pair of pass/fail results depending on - whether they match or not. - - enable_secure_boot : Verify the digital signature of all boot components, and - halt the boot process if signature verification fails. - - enable_vtpm : Use a virtualized trusted platform module, which is a - specialized computer chip you can use to encrypt objects like keys and - certificates. 
- EOD - default = { - enable_integrity_monitoring = true - enable_secure_boot = true - enable_vtpm = true - } -} - -variable "enable_spot_vm" { - description = "Enable the partition to use spot VMs (https://cloud.google.com/spot-vms)." - type = bool - default = false -} - -variable "spot_instance_config" { - description = "Configuration for spot VMs." - type = object({ - termination_action = string - }) - default = null -} - -variable "bandwidth_tier" { - description = < [!NOTE] -> Slurm-gcp-v5-partition-dynamic module is deprecated. See -> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) -> for specific recommendations and timelines. - -This module creates a dynamic compute partition that can be used as input to the -[schedmd-slurm-gcp-v5-controller](../../scheduler/schedmd-slurm-gcp-v5-controller/README.md). -This will configure the slurm partition to contain nodes with the corresponding feature. -This supports externally created nodes that register as a dynamic node to also be placed -into their corresponding partition based on node feature. - -> **Warning**: updating a partition and running `terraform apply` will not cause -> the slurm controller to update its own configurations (`slurm.conf`) unless -> `enable_reconfigure` is set to true in the partition and controller modules. - -## Example - -The following example creates a dynamic partition, which is then used by a slurm -controller. This partition will register nodes that have the partition feature -of "dyn". - -```yaml - - id: dynamic_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic - use: [network1] - settings: - partition_name: dynamic - partition_feature: dyn - - - id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: [network1, dynamic_partition] -``` - -## Support - -The Cluster Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform -modules. For support with the underlying modules, see the instructions in the -[slurm-gcp README][slurm-gcp-readme]. - -[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform - -## License - -Copyright 2022 Google LLC -Copyright (C) SchedMD LLC. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 0.13.0 | - -## Providers - -No providers. - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.2 | - -## Resources - -No resources. - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | -| [exclusive](#input\_exclusive) | Exclusive job access to nodes. 
| `bool` | `true` | no |
-| [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.<br>If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no |
-| [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.<br>See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no |
-| [partition\_feature](#input\_partition\_feature) | Any nodes with this feature will automatically be put into this partition.<br><br>
NOTE: meant to be used for external dynamic nodes that register. | `string` | n/a | yes | -| [partition\_name](#input\_partition\_name) | The name of the slurm partition. | `string` | n/a | yes | -| [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | -| [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting. If not provided it will default to the first 8 characters of the deployment name (removing any invalid characters). | `string` | `null` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project the subnetwork belongs to. | `string` | `null` | no | -| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | `null` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [partition](#output\_partition) | Details of a slurm partition | - diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf deleted file mode 100644 index 38fd95b761..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Copyright 2022 Google LLC - * Copyright (C) SchedMD LLC. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -locals { - # Default to value in partition_conf if both set the same key - partition_conf = merge({ - "Default" = var.is_default ? "YES" : null, - "SuspendTime" = "INFINITE" - }, var.partition_conf) - - # Since deployment name may be used to create a cluster name, we remove any invalid character from the beginning - # Also, slurm imposed a lot of restrictions to this name, so we format it to an acceptable string - tmp_cluster_name = substr(replace(lower(var.deployment_name), "/^[^a-z]*|[^a-z0-9]/", ""), 0, 10) - slurm_cluster_name = var.slurm_cluster_name != null ? var.slurm_cluster_name : local.tmp_cluster_name -} - -module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.2" - - slurm_cluster_name = local.slurm_cluster_name - enable_job_exclusive = var.exclusive - partition_conf = local.partition_conf - partition_feature = var.partition_feature - partition_name = var.partition_name - partition_nodes = [] - project_id = var.project_id - # region, subnetwork, and subnetwork_project do nothing in this configuration - # but are currently required by the module - region = var.region - subnetwork = var.subnetwork_self_link == null ? 
"" : var.subnetwork_self_link - subnetwork_project = var.subnetwork_project -} diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/metadata.yaml b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/metadata.yaml deleted file mode 100644 index 641832182d..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/metadata.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2023 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -spec: - requirements: - services: [] diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/outputs.tf deleted file mode 100644 index e000aa2a1a..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/outputs.tf +++ /dev/null @@ -1,23 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "partition" { - description = "Details of a slurm partition" - value = { - compute_list = module.slurm_partition.compute_list - partition = module.slurm_partition.partition - } -} diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf deleted file mode 100644 index 653862e030..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -# Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5 - -variable "deployment_name" { - description = "Name of the deployment." - type = string -} - -variable "slurm_cluster_name" { - type = string - description = "Cluster name, used for resource naming and slurm accounting. 
If not provided it will default to the first 8 characters of the deployment name (removing any invalid characters)." - default = null -} - -variable "project_id" { - description = "Project in which the HPC deployment will be created." - type = string -} - -variable "region" { - description = "The default region for Cloud resources." - type = string -} - -variable "partition_name" { - description = "The name of the slurm partition." - type = string - - validation { - condition = can(regex("^[a-z](?:[a-z0-9]{0,6})$", var.partition_name)) - error_message = "Variable 'partition_name' must be composed of only alphanumeric characters, start with a letter and be 7 characters or less. Regexp: '^[a-z](?:[a-z0-9]{0,6})$'." - } -} - -variable "partition_conf" { - description = <<-EOD - Slurm partition configuration as a map. - See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION - EOD - type = map(string) - default = {} -} - -variable "is_default" { - description = <<-EOD - Sets this partition as the default partition by updating the partition_conf. - If "Default" is already set in partition_conf, this variable will have no effect. - EOD - type = bool - default = false -} - -variable "subnetwork_self_link" { - type = string - description = "Subnet to deploy to." - default = null -} - -variable "subnetwork_project" { - description = "The project the subnetwork belongs to." - type = string - default = null -} - -variable "exclusive" { - description = "Exclusive job access to nodes." - type = bool - default = true -} - -variable "partition_feature" { - description = <<-EOD - Any nodes with this feature will automatically be put into this partition. - - NOTE: meant to be used for external dynamic nodes that register. - EOD - type = string -} diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/versions.tf deleted file mode 100644 index 1b471a522a..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/versions.tf +++ /dev/null @@ -1,19 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -terraform { - required_version = ">= 0.13.0" -} diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md deleted file mode 100644 index 552f50c50e..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ /dev/null @@ -1,192 +0,0 @@ -## Description - -> [!NOTE] -> Slurm-gcp-v5-partition module is deprecated. See -> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) -> for specific recommendations and timelines. - -This module creates a compute partition that can be used as input to the -[schedmd-slurm-gcp-v5-controller](../../scheduler/schedmd-slurm-gcp-v5-controller/README.md). 
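As a quick illustration of the relationship described above, the minimal sketch below wires a partition into the controller through `use`; the module IDs (`node_group_1`, `compute_partition`, `slurm_controller`) and the network module are illustrative placeholders, not required names. A fuller example with multiple node groups follows below.

```yaml
# Minimal sketch: the partition output becomes an input of the controller.
- id: compute_partition
  source: community/modules/compute/schedmd-slurm-gcp-v5-partition
  use:
  - network1       # provides the subnetwork
  - node_group_1   # a schedmd-slurm-gcp-v5-node-group defined elsewhere
  settings:
    partition_name: compute

- id: slurm_controller
  source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller
  use:
  - compute_partition
```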
- -The partition module is designed to work alongside the -[schedmd-slurm-gcp-v5-node-group](../schedmd-slurm-gcp-v5-node-group/README.md) -module. A partition can be made up of one or -more node groups, provided either through `use` (preferred) or defined manually -in the `node_groups` variable. - -> **Warning**: updating a partition and running `terraform apply` will not cause -> the slurm controller to update its own configurations (`slurm.conf`) unless -> `enable_reconfigure` is set to true in the partition and controller modules. - -### Example - -The following code snippet creates a partition module with: - -* 2 node groups added via `use`. - * The first node group is made up of machines of type `c2-standard-30`. - * The second node group is made up of machines of type `c2-standard-60`. - * Both node groups have a maximum count of 200 dynamically created nodes. -* partition name of "compute". -* connected to the `network1` module via `use`. -* nodes mounted to homefs via `use`. - -```yaml -- id: node_group_1 - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: c30 - node_count_dynamic_max: 200 - machine_type: c2-standard-30 - -- id: node_group_2 - source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - settings: - name: c60 - node_count_dynamic_max: 200 - machine_type: c2-standard-60 - -- id: compute_partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - use: - - network1 - - homefs - - node_group_1 - - node_group_2 - settings: - partition_name: compute -``` - -For a complete example using this module, see -[slurm-gcp-v5-cluster.yaml](../../../examples/slurm-gcp-v5-cluster.yaml). - -### Compute VM Zone Policies - -The Slurm on GCP partition module allows you to specify additional zones in -which to create VMs through [bulk creation][bulk]. This is valuable when -configuring partitions with popular VM families and you desire access to -more compute resources across zones. - -[bulk]: https://cloud.google.com/compute/docs/instances/multiple/about-bulk-creation -[networkpricing]: https://cloud.google.com/vpc/network-pricing - -> **_WARNING:_** Lenient zone policies can lead to additional egress costs when -> moving large amounts of data between zones in the same region. For example, -> traffic between VMs and traffic from VMs to shared filesystems such as -> Filestore. For more information on egress fees, see the -> [Network Pricing][networkpricing] Google Cloud documentation. -> -> To avoid egress charges, ensure your compute nodes are created in a single -> zone by setting var.zone and leaving var.zones to its default value of the -> empty list. -> -> **_NOTE:_** If a new zone is added to the region while the cluster is active, -> nodes in the partition may be created in that zone. In this case, the -> partition may need to be redeployed (possible via `enable_reconfigure` if set) -> to ensure the newly added zone is denied. 
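Because the note above hinges on `enable_reconfigure`, here is a minimal sketch of setting the flag on both the partition and the controller, as the warning earlier requires (module IDs are illustrative; the controller README documents the additional Python and Pub/Sub requirements this flag carries):

```yaml
- id: compute_partition
  source: community/modules/compute/schedmd-slurm-gcp-v5-partition
  use:
  - network1
  - node_group_1
  settings:
    partition_name: compute
    enable_reconfigure: true  # must also be true on the controller

- id: slurm_controller
  source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller
  use:
  - compute_partition
  settings:
    enable_reconfigure: true
```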
- -In the zonal example below, the partition's zone implicitly defaults to the -deployment variable `vars.zone`: - -```yaml -vars: - zone: us-central1-f - -- id: zonal-partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition -``` - -In the example below, we enable creation in additional zones: - -```yaml -vars: - zone: us-central1-f - -- id: multi-zonal-partition - source: community/modules/compute/schedmd-slurm-gcp-v5-partition - settings: - zones: - - us-central1-a - - us-central1-b -``` - -## Support - -The Cluster Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform -modules. For support with the underlying modules, see the instructions in the -[slurm-gcp README][slurm-gcp-readme]. - -[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform - -## License - -Copyright 2022 Google LLC - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 0.13.0 | -| [google](#requirement\_google) | >= 5.11 | - -## Providers - -| Name | Version | -|------|---------| -| [google](#provider\_google) | >= 5.11 | - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.2 | - -## Resources - -| Name | Type | -|------|------| -| [google_compute_reservation.reservation](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_reservation) | data source | -| [google_compute_zones.available](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_zones) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | -| [enable\_placement](#input\_enable\_placement) | Enable placement groups. | `bool` | `true` | no | -| [enable\_reconfigure](#input\_enable\_reconfigure) | Enables automatic Slurm reconfigure on when Slurm configuration changes (e.g.
slurm.conf.tpl, partition details). Compute instances and resource policies
(e.g. placement groups) will be destroyed to align with new configuration.

NOTE: Requires Python and the Google Pub/Sub API.

*WARNING*: Toggling this will impact the running workload. Deployed compute nodes
will be destroyed and their jobs will be requeued. | `bool` | `false` | no | -| [exclusive](#input\_exclusive) | Exclusive job access to nodes. | `bool` | `true` | no | -| [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.
If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on the partition compute nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [node\_groups](#input\_node\_groups) | A list of node groups associated with this partition. See
schedmd-slurm-gcp-v5-node-group for more information on defining a node
group in a blueprint. |
list(object({
node_count_static = number
node_count_dynamic_max = number
group_name = string
node_conf = map(string)
access_config = list(object({
nat_ip = string
network_tier = string
}))
additional_disks = list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
additional_networks = list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
bandwidth_tier = string
can_ip_forward = bool
disable_smt = bool
disk_auto_delete = bool
disk_labels = map(string)
disk_size_gb = number
disk_type = string
enable_confidential_vm = bool
enable_oslogin = bool
enable_shielded_vm = bool
enable_spot_vm = bool
gpu = object({
count = number
type = string
})
instance_template = string
labels = map(string)
machine_type = string
maintenance_interval = string
metadata = map(string)
min_cpu_platform = string
on_host_maintenance = string
preemptible = bool
reservation_name = string
service_account = object({
email = string
scopes = list(string)
})
shielded_instance_config = object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
spot_instance_config = object({
termination_action = string
})
source_image_family = string
source_image_project = string
source_image = string
tags = list(string)
}))
| `[]` | no | -| [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no | -| [partition\_name](#input\_partition\_name) | The name of the slurm partition. | `string` | n/a | yes | -| [partition\_startup\_scripts\_timeout](#input\_partition\_startup\_scripts\_timeout) | The timeout (seconds) applied to the partition startup script. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | -| [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | -| [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting. If not provided it will default to the first 8 characters of the deployment name (removing any invalid characters). | `string` | `null` | no | -| [startup\_script](#input\_startup\_script) | Startup script that will be used by the partition VMs. | `string` | `""` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project the subnetwork belongs to. | `string` | `null` | no | -| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | `null` | no | -| [zone](#input\_zone) | Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones. | `string` | n/a | yes | -| [zone\_target\_shape](#input\_zone\_target\_shape) | Strategy for distributing VMs across zones in a region.
ANY
GCE picks zones for creating VM instances to fulfill the requested number of VMs
within present resource constraints and to maximize utilization of unused zonal
reservations.
ANY\_SINGLE\_ZONE (default)
GCE always selects a single zone for all the VMs, optimizing for resource quotas,
available reservations and general capacity.
BALANCED
GCE prioritizes acquisition of resources, scheduling VMs in zones where resources
are available while distributing VMs as evenly as possible across allowed zones
to minimize the impact of zonal failure. | `string` | `"ANY_SINGLE_ZONE"` | no | -| [zones](#input\_zones) | Additional zones in which to allow creation of partition nodes. Google Cloud
will find zone based on availability, quota and reservations. | `set(string)` | `[]` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [partition](#output\_partition) | Details of a slurm partition | - diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf deleted file mode 100644 index 2bf5bb7b30..0000000000 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -locals { - ghpc_startup_script = [{ - filename = "ghpc_startup.sh" - content = var.startup_script - }] - - # Default to value in partition_conf if both set "Default" - partition_conf = merge(var.is_default == true ? { "Default" : "YES" } : {}, var.partition_conf) - - # Since deployment name may be used to create a cluster name, we remove any invalid character from the beginning - # Also, slurm imposed a lot of restrictions to this name, so we format it to an acceptable string - tmp_cluster_name = substr(replace(lower(var.deployment_name), "/^[^a-z]*|[^a-z0-9]/", ""), 0, 10) - slurm_cluster_name = var.slurm_cluster_name != null ? var.slurm_cluster_name : local.tmp_cluster_name - - all_zones = toset(concat([var.zone], tolist(var.zones))) - excluded_zones = [for z in data.google_compute_zones.available.names : z if !contains(local.all_zones, z)] - - reservation_map = { for x in var.node_groups : x.reservation_name => x if x.reservation_name != "" } -} - -data "google_compute_zones" "available" { - project = var.project_id - region = var.region -} - -module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.2" - - slurm_cluster_name = local.slurm_cluster_name - partition_nodes = var.node_groups - enable_job_exclusive = var.exclusive - enable_placement_groups = var.enable_placement - enable_reconfigure = var.enable_reconfigure - network_storage = var.network_storage - partition_name = var.partition_name - project_id = var.project_id - region = var.region - zone_policy_allow = [] # this setting is effectively useless because allow is implied default - zone_policy_deny = local.excluded_zones - zone_target_shape = var.zone_target_shape - subnetwork = var.subnetwork_self_link == null ? 
"" : var.subnetwork_self_link - subnetwork_project = var.subnetwork_project - partition_conf = local.partition_conf - partition_startup_scripts = local.ghpc_startup_script - partition_startup_scripts_timeout = var.partition_startup_scripts_timeout -} - -# tflint-ignore: terraform_unused_declarations -data "google_compute_reservation" "reservation" { - project = var.project_id - zone = var.zone - - for_each = local.reservation_map - name = each.value.reservation_name - - lifecycle { - postcondition { - condition = self.self_link != null - error_message = "couldn't find the reservation ${each.value.reservation_name}}" - } - - postcondition { - condition = coalesce(self.specific_reservation_required, true) - error_message = < 0 - ]) - error_message = "A value in var.zones is not a valid zone (example: us-central1-f)." - } -} - -variable "zone_target_shape" { - description = < [!NOTE] -> Slurm-gcp-v5-controller module is deprecated. See -> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) -> for specific recommendations and timelines. - -This module creates a slurm controller node via the [SchedMD/slurm-gcp] -[slurm\_controller\_instance] and [slurm\_instance\_template] modules. - -More information about Slurm On GCP can be found at the -[project's GitHub page][SchedMD/slurm-gcp] and in the -[Slurm on Google Cloud User Guide][slurm-ug]. - -The [user guide][slurm-ug] provides detailed instructions on customizing and -enhancing the Slurm on GCP cluster as well as recommendations on configuring the -controller for optimal performance at different scales. - -> **Warning**: The variables `enable_reconfigure`, -> `enable_cleanup_compute`, and `enable_cleanup_subscriptions`, if set to -> `true`, require additional dependencies **to be installed on the system deploying the infrastructure**. -> -> ```shell -> # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt -> ``` - -[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 -[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_instance_template -[slurm-ug]: https://goo.gle/slurm-gcp-user-guide. -[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/scripts/requirements.txt -[enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute -[enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions -[enable\_reconfigure]: #input\_enable\_reconfigure - -### Example - -```yaml -- id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - use: - - network1 - - homefs - - compute_partition - settings: - machine_type: c2-standard-8 -``` - -This creates a controller node with the following attributes: - -* connected to the primary subnetwork of `network1` -* the filesystem with the ID `homefs` (defined elsewhere in the blueprint) - mounted -* One partition with the ID `compute_partition` (defined elsewhere in the - blueprint) -* machine type upgraded from the default `c2-standard-4` to `c2-standard-8` - -For a complete example using this module, see -[slurm-gcp-v5-cluster.yaml](../../../examples/slurm-gcp-v5-cluster.yaml). 
- -### Live Cluster Reconfiguration (`enable_reconfigure`) - -The schedmd-slurm-gcp-v5-controller module supports the reconfiguration of -partitions and slurm configuration in a running, active cluster. This option is -activated through the `enable_reconfigure` setting: - -```yaml -- id: slurm_controller - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - settings: - enable_reconfigure: true -``` - -To reconfigure a running cluster: - -1. Edit the blueprint with the desired configuration changes -1. Call `gcluster create -w` to overwrite the deployment directory -1. Follow instructions in terminal to deploy - -The following are examples of updates that can be made to a running cluster: - -* Add or remove a partition to the cluster -* Resize an existing partition -* Attach new network storage to an existing partition - -> **NOTE**: Changing the VM `machine_type` of a partition may not work with -> `enable_reconfigure`. It is better to create a new partition and delete the -> old one. - -This option has some additional requirements: - -* The Pub/Sub API must be activated in the target project: - `gcloud services enable pubsub.googleapis.com --project "<>"` -* The authenticated user in the local development environment (or where - `terraform apply` is called) must have the Pub/Sub Admin (roles/pubsub.admin) - IAM role. -* Python and some python packages need to be installed with pip in the local - development environment deploying the cluster. One can use following commands: - - ```bash - pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt - ``` - - For more information, see the [description][optdeps] of this module. - -[optdeps]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster#optional - -## Custom Images - -For more information on creating valid custom images for the controller VM -instance or for custom instance templates, see our [vm-images.md] documentation -page. - -[vm-images.md]: ../../../../docs/vm-images.md#slurm-on-gcp-custom-images - -## GPU Support - -More information on GPU support in Slurm on GCP and other Cluster Toolkit modules -can be found at [docs/gpu-support.md](../../../../docs/gpu-support.md) - -## Placement Max Distance - -When using -[enable_placement](../../compute/schedmd-slurm-gcp-v5-partition/README.md#input_enable_placement) -with Slurm, Google Compute Engine will attempt to place VMs as physically close -together as possible. Capacity constraints at the time of VM creation may still -force VMs to be spread across multiple racks. Google provides the `max-distance` -flag which can used to control the maximum spreading allowed. Read more about -`max-distance` in the -[official docs](https://cloud.google.com/compute/docs/instances/use-compact-placement-policies -). - -After deploying a Slurm cluster, you can use the following steps to manually -configure the max-distance parameter. - -1. Make sure your blueprint has `enable_placement: true` setting for Slurm - partitions. -2. Deploy the Slurm cluster and wait for the deployment to complete. -3. SSH to the deployed Slurm controller -4. Apply the following edit to `/slurm/scripts/config.yaml`: - - ```yaml - # Replace - enable_slurm_gcp_plugins: false - - # With - enable_slurm_gcp_plugins: - max_hops: - max_hops: 1 - ``` - -The `max_hops` parameter will be used for the `max-distance` argument. In the -above case using a value of 1 will restrict VM to be placed on the same rack. 
- -You can confirm that the `max-distance`` was applied by calling the following -command while jobs are running: - -```shell -gcloud beta compute resource-policies list \ - --format='yaml(name,groupPlacementPolicy.maxDistance)' -``` - -> [!WARNING] -> If a zone lacks capacity, using a lower `max-distance` value (such as 1) is -> more likely to cause VMs creation to fail. - - - -> [!WARNING] -> `/slurm/scripts/config.yaml` will be overwritten if the blueprint is -> re-deployed using the `enable_reconfigure` flag. - -## Hybrid Slurm Clusters -For more information on how to configure an on premise slurm cluster with hybrid -cloud partitions, see the [schedmd-slurm-gcp-v5-hybrid] module and our -extended instructions in our [docs](../../../../docs/hybrid-slurm-cluster/). - -[schedmd-slurm-gcp-v5-hybrid]: ../schedmd-slurm-gcp-v5-hybrid/README.md - -## Support -The Cluster Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform -modules. For support with the underlying modules, see the instructions in the -[slurm-gcp README][slurm-gcp-readme]. - -[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp#slurm-on-google-cloud-platform - -## License - - -Copyright 2023 Google LLC - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 1.1 | -| [google](#requirement\_google) | >= 3.83 | - -## Providers - -| Name | Version | -|------|---------| -| [google](#provider\_google) | >= 3.83 | - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.12.2 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.2 | - -## Resources - -| Name | Type | -|------|------| -| [google_compute_default_service_account.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | -| [google_compute_image.slurm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | -| [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
list(object({
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | -| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | -| [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | -| [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. |
object({
no_comma_params = bool
resume_rate = number
resume_timeout = number
suspend_rate = number
suspend_timeout = number
})
|
{
"no_comma_params": false,
"resume_rate": 0,
"resume_timeout": 300,
"suspend_rate": 0,
"suspend_timeout": 300
}
| no | -| [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access. |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
})
| `null` | no | -| [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `""` | no | -| [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to the compute\_startup\_script. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | -| [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `""` | no | -| [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to the controller\_startup\_script. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | -| [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | -| [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | If set to false, the controller will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | -| [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
- /usr/local/etc/slurm
- /etc/munge
- /home
- /apps
Warning: If these are disabled, the slurm etc and munge dirs must be added
manually, or some other mechanism must be used to synchronize the slurm conf
files and the munge key across the cluster. | `bool` | `false` | no | -| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `true` | no | -| [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | -| [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | -| [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | -| [disk\_type](#input\_disk\_type) | Boot disk type. | `string` | `"pd-ssd"` | no | -| [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enable loading of cluster job usage into big query. | `bool` | `false` | no | -| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when the cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | -| [enable\_cleanup\_subscriptions](#input\_enable\_cleanup\_subscriptions) | Enables automatic cleanup of pub/sub subscriptions managed by this module, when
the cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may temporarily impact var.enable\_reconfigure behavior. | `bool` | `false` | no | -| [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support this option. | `bool` | `false` | no | -| [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no | -| [enable\_external\_prolog\_epilog](#input\_enable\_external\_prolog\_epilog) | Automatically enables a script that will execute prolog and epilog scripts
shared under /opt/apps from the controller to compute nodes. | `bool` | `false` | no | -| [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | -| [enable\_reconfigure](#input\_enable\_reconfigure) | Enables automatic Slurm reconfiguration when Slurm configuration changes (e.g.
slurm.conf.tpl, partition details). Compute instances and resource policies
(e.g. placement groups) will be destroyed to align with new configuration.
NOTE: Requires Python and the Google Pub/Sub API.
*WARNING*: Toggling this will impact the running workload. Deployed compute nodes
will be destroyed and their jobs will be requeued. | `bool` | `false` | no | -| [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support this option. | `bool` | `false` | no | -| [enable\_slurm\_gcp\_plugins](#input\_enable\_slurm\_gcp\_plugins) | Enables calling hooks in scripts/slurm\_gcp\_plugins during cluster resume and suspend. | `any` | `false` | no | -| [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = string
}))
| `[]` | no | -| [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-12-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | -| [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | -| [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | -| [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | -| [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to the login startup script. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | -| [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"c2-standard-4"` | no | -| [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | -| [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | -| [network\_ip](#input\_network\_ip) | DEPRECATED: Use `static_ips` variable to assign an internal static ip address. | `string` | `null` | no | -| [network\_self\_link](#input\_network\_self\_link) | Network to deploy to. Either network\_self\_link or subnetwork\_self\_link must be specified. | `string` | `null` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | -| [partition](#input\_partition) | Cluster partitions as a list. |
list(object({
compute_list = list(string)
partition = object({
enable_job_exclusive = bool
enable_placement_groups = bool
network_storage = list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
partition_conf = map(string)
partition_feature = string
partition_name = string
partition_nodes = map(object({
access_config = list(object({
network_tier = string
}))
bandwidth_tier = string
node_count_dynamic_max = number
node_count_static = number
enable_spot_vm = bool
group_name = string
instance_template = string
maintenance_interval = string
node_conf = map(string)
reservation_name = string
spot_instance_config = object({
termination_action = string
})
}))
partition_startup_scripts_timeout = number
subnetwork = string
zone_policy_allow = list(string)
zone_policy_deny = list(string)
zone_target_shape = string
})
}))
| `[]` | no | -| [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | -| [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | -| [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = string
}))
| `[]` | no | -| [region](#input\_region) | Region where the instances should be created. | `string` | `null` | no | -| [service\_account](#input\_service\_account) | Service account to attach to the controller instance. If not set, the
default compute service account for the given project will be used with the
"https://www.googleapis.com/auth/cloud-platform" scope. |
object({
email = string
scopes = set(string)
})
| `null` | no | -| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting. If not provided it will default to the first 8 characters of the deployment name (removing any invalid characters). | `string` | `null` | no | -| [slurm\_conf\_tpl](#input\_slurm\_conf\_tpl) | Slurm slurm.conf template file path. | `string` | `null` | no | -| [slurmdbd\_conf\_tpl](#input\_slurmdbd\_conf\_tpl) | Slurm slurmdbd.conf template file path. | `string` | `null` | no | -| [source\_image](#input\_source\_image) | DEPRECATED: Use `instance_image` instead. | `string` | `null` | no | -| [source\_image\_family](#input\_source\_image\_family) | DEPRECATED: Use `instance_image` instead. | `string` | `null` | no | -| [source\_image\_project](#input\_source\_image\_project) | DEPRECATED: Use `instance_image` instead. | `string` | `null` | no | -| [static\_ips](#input\_static\_ips) | List of static IPs for VM instances. | `list(string)` | `[]` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project that subnetwork belongs to. | `string` | `null` | no | -| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. Either network\_self\_link or subnetwork\_self\_link must be specified. | `string` | `null` | no | -| [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | -| [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [cloud\_logging\_filter](#output\_cloud\_logging\_filter) | Cloud Logging filter to cluster errors. | -| [controller\_instance\_id](#output\_controller\_instance\_id) | The server-assigned unique identifier of the controller compute instance. | -| [pubsub\_topic](#output\_pubsub\_topic) | Cluster Pub/Sub topic. | - diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurm.conf.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurm.conf.tpl deleted file mode 100644 index 8fb3f695e0..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurm.conf.tpl +++ /dev/null @@ -1,67 +0,0 @@ -# slurm.conf -# https://slurm.schedmd.com/high_throughput.html - -ProctrackType=proctrack/cgroup -SlurmctldPidFile=/var/run/slurm/slurmctld.pid -SlurmdPidFile=/var/run/slurm/slurmd.pid -TaskPlugin=task/affinity,task/cgroup -MaxArraySize=10001 -MaxJobCount=500000 -MaxNodeCount=100000 -MinJobAge=60 - -# -# -# SCHEDULING -SchedulerType=sched/backfill -SelectType=select/cons_tres -SelectTypeParameters=CR_Core_Memory - -# -# -# LOGGING AND ACCOUNTING -SlurmctldDebug=error -SlurmdDebug=error - -# -# -# TIMERS -MessageTimeout=60 - -################################################################################ -# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv # -################################################################################ - -SlurmctldHost={control_host}({control_addr}) - -AuthType=auth/munge -AuthInfo=cred_expire=120 -AuthAltTypes=auth/jwt -CredType=cred/munge -MpiDefault={mpi_default} -ReturnToService=2 -SlurmctldPort={control_host_port} -SlurmdPort=6818 -SlurmdSpoolDir=/var/spool/slurmd -SlurmUser=slurm -StateSaveLocation={state_save} - -# -# -# LOGGING AND ACCOUNTING -AccountingStorageType=accounting_storage/slurmdbd -AccountingStorageHost={control_host} -ClusterName={name} -SlurmctldLogFile={slurmlog}/slurmctld.log -SlurmdLogFile={slurmlog}/slurmd-%n.log - -# -# -# GENERATED CLOUD CONFIGURATIONS -include cloud.conf - -################################################################################ -# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ # -################################################################################ - -SchedulerParameters=defer,salloc_wait_nodes,batch_sched_delay=20,bf_continue,bf_interval=300,bf_min_age_reserve=10800,bf_resolution=600,bf_yield_interval=1000000,partition_job_depth=500,sched_max_job_start=200,sched_min_interval=2000000 diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurmdbd.conf.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurmdbd.conf.tpl deleted file mode 100644 index 9dc4ed9c70..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/htc-slurmdbd.conf.tpl +++ /dev/null @@ -1,34 +0,0 @@ -# slurmdbd.conf -# https://slurm.schedmd.com/slurmdbd.conf.html - -DebugLevel=info -PidFile=/var/run/slurm/slurmdbd.pid - -# https://slurm.schedmd.com/slurmdbd.conf.html#OPT_CommitDelay -CommitDelay=1 - -################################################################################ -# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv # -################################################################################ - -AuthType=auth/munge -AuthAltTypes=auth/jwt -AuthAltParameters=jwt_key={state_save}/jwt_hs256.key - -DbdHost={control_host} - 
-LogFile={slurmlog}/slurmdbd.log - -SlurmUser=slurm - -StorageLoc={db_name} - -StorageType=accounting_storage/mysql -StorageHost={db_host} -StoragePort={db_port} -StorageUser={db_user} -StoragePass={db_pass} - -################################################################################ -# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ # -################################################################################ diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/long-prolog-slurm.conf.tpl b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/long-prolog-slurm.conf.tpl deleted file mode 100644 index 22c7bf4ca7..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/long-prolog-slurm.conf.tpl +++ /dev/null @@ -1,70 +0,0 @@ -# slurm.conf -# https://slurm.schedmd.com/slurm.conf.html -# https://slurm.schedmd.com/configurator.html - -ProctrackType=proctrack/cgroup -SlurmctldPidFile=/var/run/slurm/slurmctld.pid -SlurmdPidFile=/var/run/slurm/slurmd.pid -TaskPlugin=task/affinity,task/cgroup -MaxNodeCount=64000 - -# -# -# SCHEDULING -SchedulerType=sched/backfill -SelectType=select/cons_tres -SelectTypeParameters=CR_Core_Memory - -# -# -# LOGGING AND ACCOUNTING -AccountingStoreFlags=job_comment -JobAcctGatherFrequency=30 -JobAcctGatherType=jobacct_gather/cgroup -SlurmctldDebug=info -SlurmdDebug=info -DebugFlags=Power - -# -# -# TIMERS -MessageTimeout=600 -BatchStartTimeout=600 -PrologEpilogTimeout=600 -PrologFlags=Contain - -################################################################################ -# vvvvv WARNING: DO NOT MODIFY SECTION BELOW vvvvv # -################################################################################ - -SlurmctldHost={control_host}({control_addr}) - -AuthType=auth/munge -AuthInfo=cred_expire=120 -AuthAltTypes=auth/jwt -CredType=cred/munge -MpiDefault={mpi_default} -ReturnToService=2 -SlurmctldPort={control_host_port} -SlurmdPort=6818 -SlurmdSpoolDir=/var/spool/slurmd -SlurmUser=slurm -StateSaveLocation={state_save} - -# -# -# LOGGING AND ACCOUNTING -AccountingStorageType=accounting_storage/slurmdbd -AccountingStorageHost={control_host} -ClusterName={name} -SlurmctldLogFile={slurmlog}/slurmctld.log -SlurmdLogFile={slurmlog}/slurmd-%n.log - -# -# -# GENERATED CLOUD CONFIGURATIONS -include cloud.conf - -################################################################################ -# ^^^^^ WARNING: DO NOT MODIFY SECTION ABOVE ^^^^^ # -################################################################################ diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
-*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf deleted file mode 100644 index fca4d3e203..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ /dev/null @@ -1,137 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -locals { - # This label allows for billing report tracking based on module. - labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v5-controller", ghpc_role = "scheduler" }) -} - -locals { - disable_automatic_updates_metadata = var.allow_automatic_updates ? 
{} : { google_disable_automatic_updates = "TRUE" } - - metadata = merge( - local.disable_automatic_updates_metadata, - var.metadata - ) - - ghpc_startup_script_controller = [{ - filename = "ghpc_startup.sh" - content = var.controller_startup_script - }] - ghpc_startup_script_compute = [{ - filename = "ghpc_startup.sh" - content = var.compute_startup_script - }] - # Since deployment name may be used to create a cluster name, we remove any invalid character from the beginning - # Also, slurm imposed a lot of restrictions to this name, so we format it to an acceptable string - tmp_cluster_name = substr(replace(lower(var.deployment_name), "/^[^a-z]*|[^a-z0-9]/", ""), 0, 10) - slurm_cluster_name = var.slurm_cluster_name != null ? var.slurm_cluster_name : local.tmp_cluster_name - - enable_public_ip_access_config = var.disable_controller_public_ips ? [] : [{ nat_ip = null, network_tier = null }] - access_config = length(var.access_config) == 0 ? local.enable_public_ip_access_config : var.access_config - - additional_disks = [ - for ad in var.additional_disks : { - disk_name = ad.disk_name - device_name = ad.device_name - disk_type = ad.disk_type - disk_size_gb = ad.disk_size_gb - disk_labels = merge(ad.disk_labels, local.labels) - auto_delete = ad.auto_delete - boot = ad.boot - } - ] -} - -data "google_compute_default_service_account" "default" { - project = var.project_id -} - -module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.12.2" - - access_config = local.access_config - slurm_cluster_name = local.slurm_cluster_name - instance_template = var.instance_template != null ? var.instance_template : module.slurm_controller_template.self_link - project_id = var.project_id - region = var.region - network = var.network_self_link == null ? "" : var.network_self_link - subnetwork = var.subnetwork_self_link == null ? 
"" : var.subnetwork_self_link - subnetwork_project = var.subnetwork_project - zone = var.zone - static_ips = var.static_ips - cgroup_conf_tpl = var.cgroup_conf_tpl - cloud_parameters = var.cloud_parameters - cloudsql = var.cloudsql - controller_startup_scripts = local.ghpc_startup_script_controller - compute_startup_scripts = local.ghpc_startup_script_compute - controller_startup_scripts_timeout = var.controller_startup_scripts_timeout - compute_startup_scripts_timeout = var.compute_startup_scripts_timeout - login_startup_scripts_timeout = var.login_startup_scripts_timeout - enable_devel = var.enable_devel - enable_cleanup_compute = var.enable_cleanup_compute - enable_cleanup_subscriptions = var.enable_cleanup_subscriptions - enable_external_prolog_epilog = var.enable_external_prolog_epilog - enable_reconfigure = var.enable_reconfigure - enable_bigquery_load = var.enable_bigquery_load - enable_slurm_gcp_plugins = var.enable_slurm_gcp_plugins - epilog_scripts = var.epilog_scripts - disable_default_mounts = var.disable_default_mounts - login_network_storage = var.network_storage - network_storage = var.network_storage - partitions = var.partition - prolog_scripts = var.prolog_scripts - slurmdbd_conf_tpl = var.slurmdbd_conf_tpl - slurm_conf_tpl = var.slurm_conf_tpl -} - -module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.2" - - additional_disks = local.additional_disks - can_ip_forward = var.can_ip_forward - slurm_cluster_name = local.slurm_cluster_name - disable_smt = var.disable_smt - disk_auto_delete = var.disk_auto_delete - disk_labels = merge(var.disk_labels, local.labels) - disk_size_gb = var.disk_size_gb - disk_type = var.disk_type - enable_confidential_vm = var.enable_confidential_vm - enable_oslogin = var.enable_oslogin - enable_shielded_vm = var.enable_shielded_vm - gpu = one(local.guest_accelerator) - labels = local.labels - machine_type = var.machine_type - metadata = local.metadata - min_cpu_platform = var.min_cpu_platform - on_host_maintenance = var.on_host_maintenance - preemptible = var.preemptible - project_id = var.project_id - region = var.region - shielded_instance_config = var.shielded_instance_config - slurm_instance_role = "controller" - source_image_family = local.source_image_family # requires source_image_logic.tf - source_image_project = local.source_image_project_normalized # requires source_image_logic.tf - source_image = local.source_image # requires source_image_logic.tf - network = var.network_self_link == null ? "" : var.network_self_link - subnetwork_project = var.subnetwork_project - subnetwork = var.subnetwork_self_link == null ? "" : var.subnetwork_self_link - tags = concat([local.slurm_cluster_name], var.tags) - service_account = var.service_account != null ? var.service_account : { - email = data.google_compute_default_service_account.default.email - scopes = ["https://www.googleapis.com/auth/cloud-platform"] - } -} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/metadata.yaml b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/metadata.yaml deleted file mode 100644 index d6b28f6239..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/metadata.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2023 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -spec: - requirements: - services: - - compute.googleapis.com - - iam.googleapis.com - - pubsub.googleapis.com - - secretmanager.googleapis.com diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/outputs.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/outputs.tf deleted file mode 100644 index 86cd242f09..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/outputs.tf +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "controller_instance_id" { - description = "The server-assigned unique identifier of the controller compute instance." - value = one(module.slurm_controller_instance.slurm_controller_instance.instances_details[*].id) -} - -output "cloud_logging_filter" { - description = "Cloud Logging filter to cluster errors." - value = module.slurm_controller_instance.cloud_logging_filter -} - -output "pubsub_topic" { - description = "Cluster Pub/Sub topic." - value = module.slurm_controller_instance.pubsub_topic -} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf deleted file mode 100644 index 1df327a60b..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -locals { - # Currently supported images and projects - known_project_families = { - schedmd-slurm-public = [ - "slurm-gcp-5-12-debian-11", - "slurm-gcp-5-12-hpc-rocky-linux-8", - "slurm-gcp-5-12-ubuntu-2004-lts", - "slurm-gcp-5-12-ubuntu-2204-lts-arm64", - "slurm-gcp-5-12-hpc-centos-7" - ] - } - - # This approach to "hacking" the project name allows a chain of Terraform - # calls to set the instance source_image (boot disk) with a "relative - # resource name" that passes muster with VPC Service Control rules - # - # https://github.com/terraform-google-modules/terraform-google-vm/blob/735bd415fc5f034d46aa0de7922e8fada2327c0c/modules/instance_template/main.tf#L28 - # https://cloud.google.com/apis/design/resource_names#relative_resource_name - source_image_project_normalized = (can(var.instance_image.family) ? - "projects/${data.google_compute_image.slurm.project}/global/images/family" : - "projects/${data.google_compute_image.slurm.project}/global/images" - ) - source_image_family = can(var.instance_image.family) ? data.google_compute_image.slurm.family : "" - source_image = can(var.instance_image.name) ? data.google_compute_image.slurm.name : "" -} - -data "google_compute_image" "slurm" { - family = try(var.instance_image.family, null) - name = try(var.instance_image.name, null) - project = var.instance_image.project - - lifecycle { - precondition { - condition = length(regexall("^projects/.+?/global/images/family$", var.instance_image.project)) == 0 - error_message = "The \"project\" field in var.instance_image no longer supports a long-form ending in \"family\". Specify only the project ID." - } - - postcondition { - condition = var.instance_image_custom || contains(keys(local.known_project_families), self.project) - error_message = <<-EOD - Images in project ${self.project} are not published by SchedMD. Images must be created by compatible releases of the Terraform and Packer modules following the guidance at https://goo.gle/hpc-slurm-images. Set var.instance_image_custom to true to silence this error and acknowledge that you are using a compatible image. - EOD - } - postcondition { - condition = !contains(keys(local.known_project_families), self.project) || try(contains(local.known_project_families[self.project], self.family), false) - error_message = <<-EOD - Image family ${self.family} published by SchedMD in project ${self.project} is not compatible with this release of the Terraform Slurm modules. Select from known compatible releases: - ${join("\n", [for p in try(local.known_project_families[self.project], []) : "\t\"${p}\""])} - EOD - } - postcondition { - condition = var.disk_size_gb >= self.disk_size_gb - error_message = "'disk_size_gb: ${var.disk_size_gb}' is smaller than the image size (${self.disk_size_gb}GB), please increase the blueprint disk size" - } - postcondition { - # Condition needs to check the suffix of the license, as prefix contains an API version which can change. - # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "Disabling automatic updates is not supported with the selected VM image. 
More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" - } - } -} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf deleted file mode 100644 index e921ba3dc6..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ /dev/null @@ -1,671 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -# Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5 - -variable "access_config" { - description = "Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet." - type = list(object({ - nat_ip = string - network_tier = string - })) - default = [] -} - -variable "additional_disks" { - type = list(object({ - disk_name = string - device_name = string - disk_type = string - disk_size_gb = number - disk_labels = map(string) - auto_delete = bool - boot = bool - })) - description = "List of maps of disks." - default = [] -} - -variable "can_ip_forward" { - type = bool - description = "Enable IP forwarding, for NAT instances for example." - default = false -} - -variable "cloud_parameters" { - description = "cloud.conf options." - type = object({ - no_comma_params = bool - resume_rate = number - resume_timeout = number - suspend_rate = number - suspend_timeout = number - }) - default = { - no_comma_params = false - resume_rate = 0 - resume_timeout = 300 - suspend_rate = 0 - suspend_timeout = 300 - } -} - -variable "cloudsql" { - description = < [!NOTE] -> Slurm-gcp-v5-hybrid module is deprecated. See -> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) -> for specific recommendations and timelines. - -This module is a wrapper around the [slurm-controller-hybrid] module by SchedMD -as part of the [slurm-gcp] github repository. The hybrid module serves to create -the configurations needed to extend an on-premise slurm cluster to one with one -or more Google Cloud bursting partitions. These partitions will create the -requested nodes in a GCP project on-demand and scale after a period of not being -used, in the same way as the [schedmd-slurm-gcp-v5-controller] module -auto-scales VMs. - -Further documentation on how to use this module when deploying a hybrid Slurm -cluster can be found in our [docs](../../../../docs/hybrid-slurm-cluster/). There, you can -find two tutorials. The [first] tutorial walks you through deploying a test -environment entirely in GCP that is designed to demonstrate the capabilities -without needing to make any changes to your local slurm cluster. The [second] -tutorial goes through the process of deploying the hybrid configuration onto a -on-premise slurm cluster. 
-
-> **_NOTE:_** This is an experimental module and the functionality and
-> documentation will likely be updated in the near future. This module has only
-> been tested in limited capacity with the Cluster Toolkit. On-premise
-> Slurm configurations can vary significantly, so this module should
-> be used as a starting point, not a complete solution.
-
-[schedmd-slurm-gcp-v5-controller]: ../schedmd-slurm-gcp-v5-controller/
-[first]: ../../../../docs/hybrid-slurm-cluster/README.md#demo-with-cloud-controller-instructionsmd
-[second]: ../../../../docs/hybrid-slurm-cluster/README.md#on-prem-instructionsmd
-
-### Usage
-The [slurm-controller-hybrid] is intended to be run on the controller of the on
-premise slurm cluster, meaning that `terraform init/apply` are executed against
-the deployment directory on that machine. This allows the module to infer
-settings such as the slurm user and user ID when setting permissions for the
-created configurations.
-
-If you are unable to install terraform and other dependencies on the controller
-directly, it is possible to deploy the hybrid module in a separate build
-environment and copy the created configurations to the on premise controller
-manually. This will require additional configuration and verification of
-permissions. For more information see the [hybrid.md] documentation on
-[slurm-gcp].
-
-[slurm-controller-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_controller_hybrid
-
-> **_NOTE:_** The hybrid module requires the following dependencies to be
-> installed on the system deploying the module:
->
-> * [terraform]
-> * [addict]
-> * [httplib2]
-> * [pyyaml]
-> * [google-api-python-client]
-> * [google-cloud-pubsub]
-> * A full list of recommended python packages is available in a
->   [requirements.txt] file in the [slurm-gcp] repo.
-
-[terraform]: https://learn.hashicorp.com/tutorials/terraform/install-cli
-[addict]: https://pypi.org/project/addict/
-[httplib2]: https://pypi.org/project/httplib2/
-[pyyaml]: https://pypi.org/project/PyYAML/
-[google-api-python-client]: https://pypi.org/project/google-api-python-client/
-[google-cloud-pubsub]: https://pypi.org/project/google-cloud-pubsub/
-[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/scripts/requirements.txt
-
-### Manual Configuration
-This module *does not* complete the installation of hybrid partitions on your
-slurm cluster. After deploying, you must follow the steps listed out in the
-[hybrid.md] documentation under [manual steps].
-
-[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/docs/hybrid.md
-[manual steps]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/docs/hybrid.md#manual-configurations
-
-### Example Usage
-The hybrid module can be added to a blueprint as follows:
-
-```yaml
-- id: slurm-controller
-  source: community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid
-  use:
-  - debug-partition
-  - compute-partition
-  - pre-existing-storage
-  settings:
-    output_dir: ./hybrid
-    slurm_bin_dir: /usr/local/bin
-    slurm_control_host: static-controller
-```
-
-This defines an HPC module that creates a hybrid configuration with the
-following attributes:
-
-* 2 partitions defined in previous modules with the IDs of `debug-partition` and
-  `compute-partition`. These are the same partition modules used by
-  [schedmd-slurm-gcp-v5-controller].
-* Network storage to be mounted on the compute nodes when created, defined in
-  `pre-existing-storage`.
-* `output_dir` set to `./hybrid`. This is where the hybrid
-  configurations will be created.
-* `slurm_bin_dir` located at `/usr/local/bin`. Set this to wherever the slurm
-  executables are installed on your system.
-* `slurm_control_host`: The name of the on premise host is provided to the
-  module for configuring NFS mounts and communicating with the controller after
-  VM creation.
-
-[schedmd-slurm-gcp-v5-controller]: ../schedmd-slurm-gcp-v5-controller/
-
-### Assumptions and Limitations
-**Shared directories from the controller:** By default, the following
-directories are NFS mounted from the on premise controller to the created cloud
-VMs:
-
-* /home
-* /opt/apps
-* /etc/munge
-* /usr/local/slurm/etc
-
-The expectation is that these directories exist on the controller and that they
-contain all files that slurmd requires to stay in sync with the controller.
-
-If this does not match your slurm cluster, these directories can be overridden
-with a custom NFS mount using [pre-existing-network-storage] or by setting the
-`network_storage` variable directly in the hybrid module; a brief blueprint
-sketch of this override follows the License section below. **Any value in
-`network_storage`, added directly or with `use`, will override the default
-directories above.**
-
-Setting the variable `disable_default_mounts` to true disables these defaults
-entirely. Note that at a minimum, the cloud VMs require `/etc/munge` and
-`/usr/local/slurm/etc` to be mounted from the controller. Those will need to be
-managed manually if the `disable_default_mounts` variable is set to true.
-
-**Power Saving Logic:** The cloud partitions make use of Slurm's power saving
-logic, so the suspend and resume programs will be set. If any local partitions
-also make use of these `slurm.conf` variables, a conflict will likely occur.
-There is currently no support for partition-level suspend and resume scripts,
-so either the local partition will need to turn this logic off or the hybrid
-module will not work.
-
-**Slurm versions:** The version of slurm on the on premise cluster must match
-the slurm version on the cloud VMs created by the hybrid partitions. The
-version on the cloud VMs is dictated by the version on the disk image, which
-can be set when defining the partitions using [schedmd-slurm-gcp-v5-partition].
-
-If the publicly available images do not suffice, [slurm-gcp] provides
-[packer templates] for creating custom disk images.
-
-SchedMD only supports the current and last major version of slurm; therefore,
-we strongly advise using only versions 21 or 22 with this module. Attempting
-to use this module with any version older than 21 may lead to unexpected
-results.
-
-[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2
-[pre-existing-network-storage]: ../../../../modules/file-system/pre-existing-network-storage/
-[schedmd-slurm-gcp-v5-partition]: ../../compute/schedmd-slurm-gcp-v5-partition/
-[packer templates]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/packer
-
-## License
-
-Copyright 2022 Google LLC
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
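As referenced in the Assumptions and Limitations section above, the following is a minimal, hypothetical blueprint sketch of overriding one of the default mounts with [pre-existing-network-storage]. The module IDs and the NFS server address `10.0.0.2` are placeholders for illustration, not values taken from this repository:

```yaml
- id: homefs
  source: modules/file-system/pre-existing-network-storage
  settings:
    server_ip: "10.0.0.2"        # hypothetical on-premise NFS server
    remote_mount: /export/home   # hypothetical exported path
    local_mount: /home           # replaces the default /home mount from the controller
    fs_type: nfs

- id: slurm-controller
  source: community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid
  use:
  - compute-partition            # hypothetical partition module defined earlier
  - homefs                       # wiring via `use` adds this mount to network_storage
  settings:
    output_dir: ./hybrid
    slurm_control_host: static-controller
```

Per the bolded warning above, any value supplied to `network_storage` overrides the default directories, so in a sketch like this the remaining required mounts (at minimum `/etc/munge` and `/usr/local/slurm/etc`) would also need to be provided explicitly.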
- -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 0.14.0 | - -## Providers - -No providers. - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.12.2 | - -## Resources - -No resources. - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. |
object({
no_comma_params = bool
resume_rate = number
resume_timeout = number
suspend_rate = number
suspend_timeout = number
})
|
{
"no_comma_params": false,
"resume_rate": 0,
"resume_timeout": 300,
"suspend_rate": 0,
"suspend_timeout": 300
}
| no | -| [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `""` | no | -| [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to the compute\_startup\_script. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | -| [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | -| [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller: /usr/local/etc/slurm,
/etc/munge, /home, /apps.
If these are disabled, the slurm etc and munge dirs must be added manually,
or some other mechanism must be used to synchronize the slurm conf files
and the munge key across the cluster. | `bool` | `false` | no | -| [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into BigQuery.
NOTE: Requires the Google BigQuery API. | `bool` | `false` | no | -| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module when the cluster is destroyed.
NOTE: Requires Python and script dependencies.
*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | -| [enable\_cleanup\_subscriptions](#input\_enable\_cleanup\_subscriptions) | Enables automatic cleanup of pub/sub subscriptions managed by this module, when
the cluster is destroyed.
NOTE: Requires Python and script dependencies.
*WARNING*: Toggling this may temporarily impact var.enable\_reconfigure behavior. | `bool` | `false` | no | -| [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no | -| [enable\_reconfigure](#input\_enable\_reconfigure) | Enables automatic Slurm reconfiguration when Slurm configuration changes (e.g.
slurm.conf.tpl, partition details). Compute instances and resource policies
(e.g. placement groups) will be destroyed to align with new configuration.
NOTE: Requires Python and Google Pub/Sub API.
*WARNING*: Toggling this will impact the running workload. Deployed compute nodes
will be destroyed and their jobs will be requeued. | `bool` | `false` | no | -| [enable\_slurm\_gcp\_plugins](#input\_enable\_slurm\_gcp\_plugins) | Enables calling hooks in scripts/slurm\_gcp\_plugins during cluster resume and suspend. | `bool` | `false` | no | -| [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = string
}))
| `[]` | no | -| [google\_app\_cred\_path](#input\_google\_app\_cred\_path) | Path to Google Application Credentials. | `string` | `null` | no | -| [install\_dir](#input\_install\_dir) | Directory where the hybrid configuration directory will be installed on the
on-premise controller. This updates the prefix path for the resume and
suspend scripts in the generated `cloud.conf` file. The value defaults to
output\_dir if not specified. | `string` | `null` | no | -| [munge\_mount](#input\_munge\_mount) | Remote munge mount for compute and login nodes to acquire the munge.key.

By default, the munge mount server will be assumed to be the
`var.slurm_control_host` (or `var.slurm_control_addr` if non-null) when
`server_ip=null`. |
object({
server_ip = string
remote_mount = string
fs_type = string
mount_options = string
})
|
{
"fs_type": "nfs",
"mount_options": "",
"remote_mount": "/etc/munge/",
"server_ip": null
}
| no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | -| [output\_dir](#input\_output\_dir) | Directory where this module will write its files to. These files include:
cloud.conf; cloud\_gres.conf; config.yaml; resume.py; suspend.py; and util.py.
If not specified explicitly, this will also be used as the default value
for the `install_dir` variable. | `string` | `null` | no | -| [partition](#input\_partition) | Cluster partitions as a list. |
list(object({
compute_list = list(string)
partition = object({
enable_job_exclusive = bool
enable_placement_groups = bool
network_storage = list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
partition_conf = map(string)
partition_feature = string
partition_name = string
partition_nodes = map(object({
bandwidth_tier = string
node_count_dynamic_max = number
node_count_static = number
enable_spot_vm = bool
group_name = string
instance_template = string
node_conf = map(string)
access_config = list(object({
nat_ip = string
network_tier = string
}))
spot_instance_config = object({
termination_action = string
})
}))
partition_startup_scripts_timeout = number
subnetwork = string
zone_policy_allow = list(string)
zone_policy_deny = list(string)
zone_target_shape = string
})
}))
| `[]` | no | -| [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | -| [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = string
}))
| `[]` | no | -| [slurm\_bin\_dir](#input\_slurm\_bin\_dir) | Path to directory of Slurm binary commands (e.g. scontrol, sinfo). If 'null',
then it will be assumed that binaries are in $PATH. | `string` | `null` | no | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting. If not provided
it will default to the first 8 characters of the deployment name (removing
any invalid characters). | `string` | `null` | no | -| [slurm\_control\_addr](#input\_slurm\_control\_addr) | The IP address or a name by which the address can be identified.
This value is passed to slurm.conf such that:
SlurmctldHost={var.slurm\_control\_host}\({var.slurm\_control\_addr}\)
See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldHost | `string` | `null` | no | -| [slurm\_control\_host](#input\_slurm\_control\_host) | The short, or long, hostname of the machine where the Slurm control daemon is
executed (i.e. the name returned by the command "hostname -s").
This value is passed to slurm.conf such that:
SlurmctldHost={var.slurm\_control\_host}\({var.slurm\_control\_addr}\)
See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldHost | `string` | n/a | yes | -| [slurm\_control\_host\_port](#input\_slurm\_control\_host\_port) | The port number that the Slurm controller, slurmctld, listens to for work.
See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldPort | `string` | `null` | no | -| [slurm\_log\_dir](#input\_slurm\_log\_dir) | Directory where Slurm logs to. | `string` | `"/var/log/slurm"` | no | - -## Outputs - -No outputs. - diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf deleted file mode 100644 index c721a13bb3..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf +++ /dev/null @@ -1,59 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -locals { - ghpc_startup_script_compute = [{ - filename = "ghpc_startup.sh" - content = var.compute_startup_script - }] - - # Since deployment name may be used to create a cluster name, we remove any invalid character from the beginning - # Also, slurm imposed a lot of restrictions to this name, so we format it to an acceptable string - tmp_cluster_name = substr(replace(lower(var.deployment_name), "/^[^a-z]*|[^a-z0-9]/", ""), 0, 10) - slurm_cluster_name = var.slurm_cluster_name != null ? var.slurm_cluster_name : local.tmp_cluster_name - -} - -module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.12.2" - - project_id = var.project_id - slurm_cluster_name = local.slurm_cluster_name - enable_devel = var.enable_devel - enable_cleanup_compute = var.enable_cleanup_compute - enable_cleanup_subscriptions = var.enable_cleanup_subscriptions - enable_reconfigure = var.enable_reconfigure - enable_bigquery_load = var.enable_bigquery_load - enable_slurm_gcp_plugins = var.enable_slurm_gcp_plugins - compute_startup_scripts = local.ghpc_startup_script_compute - compute_startup_scripts_timeout = var.compute_startup_scripts_timeout - prolog_scripts = var.prolog_scripts - epilog_scripts = var.epilog_scripts - network_storage = var.network_storage - disable_default_mounts = var.disable_default_mounts - login_network_storage = var.network_storage - partitions = var.partition - google_app_cred_path = var.google_app_cred_path - slurm_bin_dir = var.slurm_bin_dir - slurm_log_dir = var.slurm_log_dir - cloud_parameters = var.cloud_parameters - output_dir = var.output_dir - slurm_control_host = var.slurm_control_host - slurm_control_host_port = var.slurm_control_host_port - slurm_control_addr = var.slurm_control_addr - install_dir = var.install_dir - munge_mount = var.munge_mount -} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/metadata.yaml b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/metadata.yaml deleted file mode 100644 index af2107286d..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/metadata.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2023 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -spec: - requirements: - services: - - compute.googleapis.com - - pubsub.googleapis.com diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/variables.tf deleted file mode 100644 index 1630e92708..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/variables.tf +++ /dev/null @@ -1,344 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -variable "project_id" { - type = string - description = "Project ID to create resources in." -} - -variable "deployment_name" { - description = "Name of the deployment." - type = string -} - -variable "slurm_cluster_name" { - type = string - description = <<-EOD - Cluster name, used for resource naming and slurm accounting. If not provided - it will default to the first 8 characters of the deployment name (removing - any invalid characters). - EOD - default = null -} - -variable "enable_devel" { - type = bool - description = "Enables development mode. Not for production use." - default = false -} - -variable "enable_cleanup_compute" { - description = <<-EOD - Enables automatic cleanup of compute nodes and resource policies (e.g. - placement groups) managed by this module, when cluster is destroyed. - NOTE: Requires Python and script dependencies. - *WARNING*: Toggling this may impact the running workload. Deployed compute nodes - may be destroyed and their jobs will be requeued. - EOD - type = bool - default = false -} - -variable "enable_cleanup_subscriptions" { - description = <<-EOD - Enables automatic cleanup of pub/sub subscriptions managed by this module, when - cluster is destroyed. - NOTE: Requires Python and script dependencies. - *WARNING*: Toggling this may temporarily impact var.enable_reconfigure behavior. - EOD - type = bool - default = false -} - -variable "enable_reconfigure" { - description = <<-EOD - Enables automatic Slurm reconfigure on when Slurm configuration changes (e.g. - slurm.conf.tpl, partition details). Compute instances and resource policies - (e.g. placement groups) will be destroyed to align with new configuration. - NOTE: Requires Python and Google Pub/Sub API. - *WARNING*: Toggling this will impact the running workload. Deployed compute nodes - will be destroyed and their jobs will be requeued. - EOD - type = bool - default = false -} - -variable "enable_bigquery_load" { - description = <<-EOD - Enables loading of cluster job usage into big query. - NOTE: Requires Google Bigquery API. 
- EOD - type = bool - default = false -} - -variable "enable_slurm_gcp_plugins" { - description = < [!NOTE] -> Slurm-gcp-v5-login module is deprecated. See -> [this update](../../../../examples/README.md#completed-migration-to-slurm-gcp-v6) -> for specific recommendations and timelines. - -This module creates a login node for a Slurm cluster based on the -[SchedMD/slurm-gcp] [slurm\_instance\_template] and [slurm\_login\_instance] -terraform modules. The login node is used in conjunction with the -[Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). - -[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 -[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_instance_template - -### Example - -```yaml -- id: slurm_login - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - use: - - network1 - - slurm_controller - settings: - machine_type: n2-standard-4 -``` - -This creates a Slurm login node which is: - -* connected to the primary subnet of network1 via `use` -* associated with the `slurm_controller` module as the slurm controller via - `use` -* of VM machine type `n2-standard-4` - -## Custom Images - -For more information on creating valid custom images for the login node VM -instances or for custom instance templates, see our [vm-images.md] documentation -page. - -[vm-images.md]: ../../../../docs/vm-images.md#slurm-on-gcp-custom-images - -## GPU Support - -More information on GPU support in Slurm on GCP and other Cluster Toolkit modules -can be found at [docs/gpu-support.md](../../../../docs/gpu-support.md) - -## Support -The Cluster Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform -modules. For support with the underlying modules, see the instructions in the -[slurm-gcp README][slurm-gcp-readme]. - -[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2#slurm-on-google-cloud-platform - -## License - -Copyright 2023 Google LLC - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
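As a companion to the Custom Images section of this README, the following is a hedged sketch of pointing the login node at a user-built image. The image family `my-slurm-login-image` and project `my-image-project` are placeholders, not published images:

```yaml
- id: slurm_login
  source: community/modules/scheduler/schedmd-slurm-gcp-v5-login
  use:
  - network1
  - slurm_controller
  settings:
    instance_image:
      family: my-slurm-login-image   # placeholder custom image family
      project: my-image-project      # placeholder project hosting the image
    instance_image_custom: true      # acknowledge a non-SchedMD image; skips the family check
```

Setting `instance_image_custom: true` silences the compatibility postcondition in `source_image_logic.tf`, which otherwise accepts only the known SchedMD image families listed there.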
- -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 1.1 | -| [google](#requirement\_google) | >= 3.83 | - -## Providers - -| Name | Version | -|------|---------| -| [google](#provider\_google) | >= 3.83 | - -## Modules - -| Name | Source | Version | -|------|--------|---------| -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.12.2 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.2 | - -## Resources - -| Name | Type | -|------|------| -| [google_compute_default_service_account.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | -| [google_compute_image.slurm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | -| [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
list(object({
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | -| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | -| [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | -| [controller\_instance\_id](#input\_controller\_instance\_id) | The server-assigned unique identifier of the controller instance. This value
must be supplied as an output of the controller module, typically via `use`. | `string` | n/a | yes | -| [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | -| [disable\_login\_public\_ips](#input\_disable\_login\_public\_ips) | If set to false, the login node will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | -| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `true` | no | -| [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | -| [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | -| [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | -| [disk\_type](#input\_disk\_type) | Boot disk type. | `string` | `"pd-standard"` | no | -| [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support this option. | `bool` | `false` | no | -| [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | -| [enable\_reconfigure](#input\_enable\_reconfigure) | Enables automatic Slurm reconfiguration when Slurm configuration changes (e.g.
slurm.conf.tpl, partition details).

NOTE: Requires Google Pub/Sub API. | `bool` | `false` | no | -| [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support this option. | `bool` | `false` | no | -| [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm login node VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-12-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | -| [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | -| [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | -| [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | -| [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"n2-standard-2"` | no | -| [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | -| [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | -| [network\_ip](#input\_network\_ip) | DEPRECATED: Use `static_ips` variable to assign an internal static ip address. | `string` | `null` | no | -| [network\_self\_link](#input\_network\_self\_link) | Network to deploy to. Either network\_self\_link or subnetwork\_self\_link must be specified. | `string` | `null` | no | -| [num\_instances](#input\_num\_instances) | Number of instances to create. This value is ignored if static\_ips is provided. | `number` | `1` | no | -| [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | -| [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | -| [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | -| [pubsub\_topic](#input\_pubsub\_topic) | The cluster pubsub topic created by the controller when enable\_reconfigure=true. | `string` | `null` | no | -| [region](#input\_region) | Region where the instances should be created.
Note: region will be ignored if it can be extracted from subnetwork. | `string` | `null` | no | -| [service\_account](#input\_service\_account) | Service account to attach to the login instance. If not set, the
default compute service account for the given project will be used with the
"https://www.googleapis.com/auth/cloud-platform" scope. |
object({
email = string
scopes = set(string)
})
| `null` | no | -| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
- enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
- enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
- enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting. If not provided it will default to the first 8 characters of the deployment name (removing any invalid characters). | `string` | `null` | no | -| [source\_image](#input\_source\_image) | DEPRECATED: Use `instance_image` instead. | `string` | `null` | no | -| [source\_image\_family](#input\_source\_image\_family) | DEPRECATED: Use `instance_image` instead. | `string` | `null` | no | -| [source\_image\_project](#input\_source\_image\_project) | DEPRECATED: Use `instance_image` instead. | `string` | `null` | no | -| [startup\_script](#input\_startup\_script) | Startup script that will be used by the login node VM. | `string` | `""` | no | -| [static\_ips](#input\_static\_ips) | List of static IPs for VM instances. | `list(string)` | `[]` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project that subnetwork belongs to. | `string` | `null` | no | -| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. Either network\_self\_link or subnetwork\_self\_link must be specified. | `string` | `null` | no | -| [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | -| [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no | - -## Outputs - -No outputs. - diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf deleted file mode 100644 index 1c84a92721..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -## Required variables: -# guest_accelerator -# machine_type - -locals { - # example state; terraform will ignore diffs if last element of URL matches - # guest_accelerator = [ - # { - # count = 1 - # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" - # }, - # ] - accelerator_machines = { - "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, - "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, - "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, - "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, - "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, - "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, - "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, - "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, - "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, - "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, - "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, - "a3-ultragpu-8g" = { type = "nvidia-h200-141gb", count = 8 }, - "g2-standard-4" = { type = "nvidia-l4", count = 1 }, - "g2-standard-8" = { type = "nvidia-l4", count = 1 }, - "g2-standard-12" = { type = "nvidia-l4", count = 1 }, - "g2-standard-16" = { type = "nvidia-l4", count = 1 }, - "g2-standard-24" = { type = "nvidia-l4", count = 2 }, - "g2-standard-32" = { type = "nvidia-l4", count = 1 }, - "g2-standard-48" = { type = "nvidia-l4", count = 4 }, - "g2-standard-96" = { type = "nvidia-l4", count = 8 }, - } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - - # Select in priority order: - # (1) var.guest_accelerator if not empty - # (2) local.generated_guest_accelerator if not empty - # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) -} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf deleted file mode 100644 index 3046dbac9d..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -locals { - # This label allows for billing report tracking based on module. - labels = merge(var.labels, { ghpc_module = "schedmd-slurm-gcp-v5-login", ghpc_role = "scheduler" }) -} - -locals { - disable_automatic_updates_metadata = var.allow_automatic_updates ? {} : { google_disable_automatic_updates = "TRUE" } - - metadata = merge( - local.disable_automatic_updates_metadata, - var.metadata - ) - - ghpc_startup_script = [{ - filename = "ghpc_startup.sh" - content = var.startup_script - }] - # Since deployment name may be used to create a cluster name, we remove any invalid character from the beginning - # Also, slurm imposed a lot of restrictions to this name, so we format it to an acceptable string - tmp_cluster_name = substr(replace(lower(var.deployment_name), "/^[^a-z]*|[^a-z0-9]/", ""), 0, 10) - slurm_cluster_name = var.slurm_cluster_name != null ? var.slurm_cluster_name : local.tmp_cluster_name - - enable_public_ip_access_config = var.disable_login_public_ips ? [] : [{ nat_ip = null, network_tier = null }] - access_config = length(var.access_config) == 0 ? local.enable_public_ip_access_config : var.access_config - - additional_disks = [ - for ad in var.additional_disks : { - disk_name = ad.disk_name - device_name = ad.device_name - disk_type = ad.disk_type - disk_size_gb = ad.disk_size_gb - disk_labels = merge(ad.disk_labels, local.labels) - auto_delete = ad.auto_delete - boot = ad.boot - } - ] -} - -data "google_compute_default_service_account" "default" { - project = var.project_id -} - -module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.2" - - additional_disks = local.additional_disks - can_ip_forward = var.can_ip_forward - slurm_cluster_name = local.slurm_cluster_name - disable_smt = var.disable_smt - disk_auto_delete = var.disk_auto_delete - disk_labels = merge(var.disk_labels, local.labels) - disk_size_gb = var.disk_size_gb - disk_type = var.disk_type - enable_confidential_vm = var.enable_confidential_vm - enable_oslogin = var.enable_oslogin - enable_shielded_vm = var.enable_shielded_vm - gpu = one(local.guest_accelerator) - labels = local.labels - machine_type = var.machine_type - metadata = local.metadata - min_cpu_platform = var.min_cpu_platform - on_host_maintenance = var.on_host_maintenance - preemptible = var.preemptible - project_id = var.project_id - region = var.region - shielded_instance_config = var.shielded_instance_config - slurm_instance_role = "login" - source_image_family = local.source_image_family # requires source_image_logic.tf - source_image_project = local.source_image_project_normalized # requires source_image_logic.tf - source_image = local.source_image # requires source_image_logic.tf - network = var.network_self_link == null ? "" : var.network_self_link - subnetwork_project = var.subnetwork_project - subnetwork = var.subnetwork_self_link == null ? "" : var.subnetwork_self_link - tags = concat([local.slurm_cluster_name], var.tags) - service_account = var.service_account != null ? 
var.service_account : { - email = data.google_compute_default_service_account.default.email - scopes = ["https://www.googleapis.com/auth/cloud-platform"] - } -} - -module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.12.2" - - access_config = local.access_config - slurm_cluster_name = local.slurm_cluster_name - instance_template = var.instance_template != null ? var.instance_template : module.slurm_login_template.self_link - network = var.network_self_link - num_instances = var.num_instances - project_id = var.project_id - region = var.region - static_ips = var.static_ips - subnetwork_project = var.subnetwork_project - subnetwork = var.subnetwork_self_link - zone = var.zone - login_startup_scripts = local.ghpc_startup_script - metadata = local.metadata - slurm_depends_on = var.controller_instance_id == null ? [] : [var.controller_instance_id] - enable_reconfigure = var.enable_reconfigure - pubsub_topic = var.pubsub_topic -} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/metadata.yaml b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/metadata.yaml deleted file mode 100644 index 4c2f23a8d7..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/metadata.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright 2023 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -spec: - requirements: - services: - - compute.googleapis.com diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf deleted file mode 100644 index 1df327a60b..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -locals { - # Currently supported images and projects - known_project_families = { - schedmd-slurm-public = [ - "slurm-gcp-5-12-debian-11", - "slurm-gcp-5-12-hpc-rocky-linux-8", - "slurm-gcp-5-12-ubuntu-2004-lts", - "slurm-gcp-5-12-ubuntu-2204-lts-arm64", - "slurm-gcp-5-12-hpc-centos-7" - ] - } - - # This approach to "hacking" the project name allows a chain of Terraform - # calls to set the instance source_image (boot disk) with a "relative - # resource name" that passes muster with VPC Service Control rules - # - # https://github.com/terraform-google-modules/terraform-google-vm/blob/735bd415fc5f034d46aa0de7922e8fada2327c0c/modules/instance_template/main.tf#L28 - # https://cloud.google.com/apis/design/resource_names#relative_resource_name - source_image_project_normalized = (can(var.instance_image.family) ? - "projects/${data.google_compute_image.slurm.project}/global/images/family" : - "projects/${data.google_compute_image.slurm.project}/global/images" - ) - source_image_family = can(var.instance_image.family) ? data.google_compute_image.slurm.family : "" - source_image = can(var.instance_image.name) ? data.google_compute_image.slurm.name : "" -} - -data "google_compute_image" "slurm" { - family = try(var.instance_image.family, null) - name = try(var.instance_image.name, null) - project = var.instance_image.project - - lifecycle { - precondition { - condition = length(regexall("^projects/.+?/global/images/family$", var.instance_image.project)) == 0 - error_message = "The \"project\" field in var.instance_image no longer supports a long-form ending in \"family\". Specify only the project ID." - } - - postcondition { - condition = var.instance_image_custom || contains(keys(local.known_project_families), self.project) - error_message = <<-EOD - Images in project ${self.project} are not published by SchedMD. Images must be created by compatible releases of the Terraform and Packer modules following the guidance at https://goo.gle/hpc-slurm-images. Set var.instance_image_custom to true to silence this error and acknowledge that you are using a compatible image. - EOD - } - postcondition { - condition = !contains(keys(local.known_project_families), self.project) || try(contains(local.known_project_families[self.project], self.family), false) - error_message = <<-EOD - Image family ${self.family} published by SchedMD in project ${self.project} is not compatible with this release of the Terraform Slurm modules. Select from known compatible releases: - ${join("\n", [for p in try(local.known_project_families[self.project], []) : "\t\"${p}\""])} - EOD - } - postcondition { - condition = var.disk_size_gb >= self.disk_size_gb - error_message = "'disk_size_gb: ${var.disk_size_gb}' is smaller than the image size (${self.disk_size_gb}GB), please increase the blueprint disk size" - } - postcondition { - # Condition needs to check the suffix of the license, as prefix contains an API version which can change. - # Example license value: https://www.googleapis.com/compute/v1/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates - condition = var.allow_automatic_updates || anytrue([for license in self.licenses : endswith(license, "/projects/cloud-hpc-image-public/global/licenses/hpc-vm-image-feature-disable-auto-updates")]) - error_message = "Disabling automatic updates is not supported with the selected VM image. 
More information: https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates" - } - } -} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf deleted file mode 100644 index a86bab126f..0000000000 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf +++ /dev/null @@ -1,429 +0,0 @@ -/** - * Copyright 2023 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -# Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5 - -variable "project_id" { - type = string - description = "Project ID to create resources in." -} - -variable "labels" { - type = map(string) - description = "Labels, provided as a map." - default = {} -} - -variable "disable_smt" { - type = bool - description = "Disables Simultaneous Multi-Threading (SMT) on instance." - default = true -} - -variable "deployment_name" { - description = "Name of the deployment." - type = string -} - -variable "disable_login_public_ips" { - description = "If set to false. The login will have a random public IP assigned to it. Ignored if access_config is set." - type = bool - default = true -} - -variable "slurm_cluster_name" { - type = string - description = "Cluster name, used for resource naming and slurm accounting. If not provided it will default to the first 8 characters of the deployment name (removing any invalid characters)." - default = null -} - -variable "controller_instance_id" { - description = <<-EOD - The server-assigned unique identifier of the controller instance. This value - must be supplied as an output of the controller module, typically via `use`. - EOD - type = string -} - -variable "can_ip_forward" { - type = bool - description = "Enable IP forwarding, for NAT instances for example." - default = false -} - -variable "network_self_link" { - type = string - description = "Network to deploy to. Either network_self_link or subnetwork_self_link must be specified." - default = null -} - -variable "subnetwork_self_link" { - type = string - description = "Subnet to deploy to. Either network_self_link or subnetwork_self_link must be specified." - default = null -} - -variable "subnetwork_project" { - type = string - description = "The project that subnetwork belongs to." - default = null -} - -variable "region" { - type = string - description = <<-EOD - Region where the instances should be created. - Note: region will be ignored if it can be extracted from subnetwork. - EOD - default = null -} - -# tflint-ignore: terraform_unused_declarations -variable "network_ip" { - type = string - description = "DEPRECATED: Use `static_ips` variable to assign an internal static ip address." - default = null - validation { - condition = var.network_ip == null - error_message = "network_ip is deprecated. Use static_ips to assign an internal static ip address." 
- } -} - -variable "static_ips" { - type = list(string) - description = "List of static IPs for VM instances." - default = [] -} - -variable "access_config" { - description = "Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet." - type = list(object({ - nat_ip = string - network_tier = string - })) - default = [] -} - -variable "zone" { - type = string - description = <<-EOD - Zone where the instances should be created. If not specified, instances will be - spread across available zones in the region. - EOD - default = null -} - -variable "metadata" { - type = map(string) - description = "Metadata, provided as a map." - default = {} -} - -variable "tags" { - type = list(string) - description = "Network tag list." - default = [] -} - -variable "machine_type" { - type = string - description = "Machine type to create." - default = "n2-standard-2" -} - -variable "min_cpu_platform" { - type = string - description = <<-EOD - Specifies a minimum CPU platform. Applicable values are the friendly names of - CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list: - https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform - EOD - default = null -} - -# tflint-ignore: terraform_unused_declarations -variable "gpu" { - type = object({ - type = string - count = number - }) - description = "DEPRECATED: use var.guest_accelerator" - default = null - validation { - condition = var.gpu == null - error_message = "var.gpu is deprecated. Use var.guest_accelerator." - } -} - -variable "guest_accelerator" { - description = "List of the type and count of accelerator cards attached to the instance." - type = list(object({ - type = string, - count = number - })) - default = [] - nullable = false - - validation { - condition = length(var.guest_accelerator) <= 1 - error_message = "The Slurm modules supports 0 or 1 models of accelerator card on each node." - } -} - -variable "service_account" { - type = object({ - email = string - scopes = set(string) - }) - description = <<-EOD - Service account to attach to the login instance. If not set, the - default compute service account for the given project will be used with the - "https://www.googleapis.com/auth/cloud-platform" scope. - EOD - default = null -} - -variable "shielded_instance_config" { - type = object({ - enable_integrity_monitoring = bool - enable_secure_boot = bool - enable_vtpm = bool - }) - description = <<-EOD - Shielded VM configuration for the instance. Note: not used unless - enable_shielded_vm is 'true'. - - enable_integrity_monitoring : Compare the most recent boot measurements to the - integrity policy baseline and return a pair of pass/fail results depending on - whether they match or not. - - enable_secure_boot : Verify the digital signature of all boot components, and - halt the boot process if signature verification fails. - - enable_vtpm : Use a virtualized trusted platform module, which is a - specialized computer chip you can use to encrypt objects like keys and - certificates. - EOD - default = { - enable_integrity_monitoring = true - enable_secure_boot = true - enable_vtpm = true - } -} - -variable "enable_confidential_vm" { - type = bool - description = "Enable the Confidential VM configuration. Note: the instance image must support option." - default = false -} - -variable "enable_shielded_vm" { - type = bool - description = "Enable the Shielded VM configuration. Note: the instance image must support option." 
- default = false -} - -variable "preemptible" { - type = bool - description = "Allow the instance to be preempted." - default = false -} - -variable "on_host_maintenance" { - type = string - description = "Instance availability Policy." - default = "MIGRATE" -} - -variable "enable_oslogin" { - type = bool - description = <<-EOD - Enables Google Cloud os-login for user login and authentication for VMs. - See https://cloud.google.com/compute/docs/oslogin - EOD - default = true -} - -variable "num_instances" { - type = number - description = "Number of instances to create. This value is ignored if static_ips is provided." - default = 1 -} - -variable "startup_script" { - description = "Startup script that will be used by the login node VM." - type = string - default = "" -} - -variable "instance_template" { - description = <<-EOD - Self link to a custom instance template. If set, other VM definition - variables such as machine_type and instance_image will be ignored in favor - of the provided instance template. - - For more information on creating custom images for the instance template - that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section - in docs/vm-images.md. - EOD - type = string - default = null -} - -variable "instance_image" { - description = <<-EOD - Defines the image that will be used in the Slurm login node VM instances. - - Expected Fields: - name: The name of the image. Mutually exclusive with family. - family: The image family to use. Mutually exclusive with name. - project: The project where the image is hosted. - - For more information on creating custom images that comply with Slurm on GCP - see the "Slurm on GCP Custom Images" section in docs/vm-images.md. - EOD - type = map(string) - default = { - project = "schedmd-slurm-public" - family = "slurm-gcp-5-12-hpc-centos-7" - } - - validation { - condition = can(coalesce(var.instance_image.project)) - error_message = "In var.instance_image, the \"project\" field must be a string set to the Cloud project ID." - } - - validation { - condition = can(coalesce(var.instance_image.name)) != can(coalesce(var.instance_image.family)) - error_message = "In var.instance_image, exactly one of \"family\" or \"name\" fields must be set to desired image family or name." - } -} - -variable "instance_image_custom" { - description = <<-EOD - A flag that designates that the user is aware that they are requesting - to use a custom and potentially incompatible image for this Slurm on - GCP module. - - If the field is set to false, only the compatible families and project - names will be accepted. The deployment will fail with any other image - family or name. If set to true, no checks will be done. - - See: https://goo.gle/hpc-slurm-images - EOD - type = bool - default = false -} - - -variable "allow_automatic_updates" { - description = <<-EOT - If false, disables automatic system package updates on the created instances. This feature is - only available on supported images (or images derived from them). For more details, see - https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates - EOT - type = bool - default = true - nullable = false -} - -# tflint-ignore: terraform_unused_declarations -variable "source_image_project" { - type = string - description = "DEPRECATED: Use `instance_image` instead." - default = null - validation { - condition = var.source_image_project == null - error_message = "Variable `source_image_project` is deprecated. Use `instance_image` instead." 
- } -} - -# tflint-ignore: terraform_unused_declarations -variable "source_image_family" { - type = string - description = "DEPRECATED: Use `instance_image` instead." - default = null - validation { - condition = var.source_image_family == null - error_message = "Variable `source_image_family` is deprecated. Use `instance_image` instead." - } -} - -# tflint-ignore: terraform_unused_declarations -variable "source_image" { - type = string - description = "DEPRECATED: Use `instance_image` instead." - default = null - validation { - condition = var.source_image == null - error_message = "Variable `source_image` is deprecated. Use `instance_image` instead." - } -} - -variable "disk_type" { - type = string - description = "Boot disk type." - default = "pd-standard" -} - -variable "disk_size_gb" { - type = number - description = "Boot disk size in GB." - default = 50 -} - -variable "disk_auto_delete" { - type = bool - description = "Whether or not the boot disk should be auto-deleted." - default = true -} - -variable "disk_labels" { - description = "Labels specific to the boot disk. These will be merged with var.labels." - type = map(string) - default = {} -} - -variable "additional_disks" { - type = list(object({ - disk_name = string - device_name = string - disk_type = string - disk_size_gb = number - disk_labels = map(string) - auto_delete = bool - boot = bool - })) - description = "List of maps of disks." - default = [] -} - -variable "enable_reconfigure" { - description = < **_NOTE:_**: Many different potential issues could be indicated by the above -> message, so be sure to verify issue in logs. - -To confirm the issue, ssh onto the controller and call `sudo cat /slurm/scripts/setup.log`. Look for -the following logs: - -```text -google_metadata_script_runner: startup-script: ERROR: [Errno 101] Network is unreachable -google_metadata_script_runner: startup-script: OSError: [Errno 101] Network is unreachable -google_metadata_script_runner: startup-script: ERROR: Aborting setup... -google_metadata_script_runner: startup-script exit status 0 -google_metadata_script_runner: Finished running startup scripts. -``` - -You may also notice mount failure logs on the login node: - -```text -INFO: Waiting for '/usr/local/etc/slurm' to be mounted... -INFO: Waiting for '/home' to be mounted... -INFO: Waiting for '/opt/apps' to be mounted... -INFO: Waiting for '/etc/munge' to be mounted... -ERROR: mount of path '/usr/local/etc/slurm' failed: : Command '['mount', '/usr/local/etc/slurm']' returned non-zero exit status 32. -ERROR: mount of path '/opt/apps' failed: : Command '['mount', '/opt/apps']' returned non-zero exit status 32. -ERROR: mount of path '/home' failed: : Command '['mount', '/home']' returned non-zero exit status 32. -ERROR: mount of path '/etc/munge' failed: : Command '['mount', '/etc/munge']' returned non-zero exit status 32. -``` - -> **_NOTE:_**: The above logs only indicate that something went wrong with the -> startup of the controller. Check logs on the controller to be sure it is a -> network issue. 
-
 ### Failure to Create Auto Scale Nodes (Slurm)
 
 If your deployment succeeds but your jobs fail with the following error:
 
@@ -213,9 +160,9 @@ After creating the service account, it can be set via the
 
 [slurm-on-gcp-con]: community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md
 [slurm-on-gcp-login]: community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md
 
-### Timeout Error / Startup Script Failure (Slurm V5)
+### Timeout Error / Startup Script Failure (Slurm V6)
 
-If you observe failure of startup scripts in version 5 of the Slurm module,
+If you observe failure of startup scripts in version 6 of the Slurm module,
 they may be due to a 300 second maximum timeout on scripts. All startup script
 logging is found in `/slurm/scripts/setup.log` on every node in a Slurm
 cluster. The error will appear similar to:
@@ -237,7 +184,7 @@ to execute scripts of significant duration. This pattern is demonstrated in the
 
 ### Slurm Controller Startup Fails with `exportfs` Error
 
-Example error in `/slurm/scripts/setup.log` (on Slurm V5 controller):
+Example error in `/slurm/scripts/setup.log` (on Slurm V6 controller):
 
 ```text
 exportfs: /****** does not support NFS export
 ```
 
@@ -262,9 +209,9 @@ the `local_mount` and `filestore_share_name`.
 
 ### `local-exec provisioner error` During Terraform Apply
 
-Using the `enable_reconfigure` setting with Slurm v5 modules uses `local-exec`
+The `enable_reconfigure` setting in Slurm v6 modules uses `local-exec`
 provisioners to perform additional cluster configuration. Some common issues
 experienced when using this feature are missing local python requirements and
 incorrectly configured gcloud cli. There is more information about these issues
 and fixes on the
-[`schedmd-slurm-gcp-v5-controller` documentation](../community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md#live-cluster-reconfiguration-enable_reconfigure).
+[`schedmd-slurm-gcp-v6-controller` documentation](../community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md#live-cluster-reconfiguration-enable_reconfigure).
diff --git a/docs/vm-images.md b/docs/vm-images.md
index c56f4ac0ac..c8bf56b644 100644
--- a/docs/vm-images.md
+++ b/docs/vm-images.md
@@ -28,7 +28,7 @@ Please see the [blueprint catalog](https://cloud.google.com/hpc-toolkit/docs/set
 > documentation for any module utilized.
 
 When an Cluster Toolkit blueprint points to a predefined source module (e.g.
-`community/modules/compute/schedmd-slurm-gcp-v5-node-group`), generally the
+`community/modules/compute/schedmd-slurm-gcp-v6-nodeset`), generally the
 module has a default image defined. In order to override this default image, a
 user may specify the `instance_image` setting in the yaml blueprint, within
 either the specific module definition or the global variables. The
@@ -159,15 +150,6 @@ description of our support for Windows images.
-
-
-      Slurm
-      Chrome Remote Desktop
-
-
-
-
-
     Lustre
@@ -260,7 +251,7 @@ Packer modules. For example, images built for version 5.8 are compatible
 with all Terraform modules from 5.8.0 but below 5.9.0. The version of the
 Slurm modules used by your copy of the Toolkit in the local filesystem can be
 inspected by looking for the source line in
-`community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf`.
+`community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf`.
 
 The latest GitHub release supports [these images][slurm-gcp-published-images].
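For reference, the inspection described above can be scripted. This is a minimal sketch, assuming the `?ref=<version>` source format used by the v5 modules removed in this patch, run from the repository root; the exact paths and ref format may differ in your checkout:

```shell
# List the slurm-gcp refs pinned by the local Slurm modules (assumed format,
# e.g. ...slurm-gcp.git//terraform/...?ref=5.12.2 as in the removed v5 files).
grep -rho 'slurm-gcp\.git[^"]*' community/modules | sort -u
```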
@@ -280,21 +271,21 @@ and [Image Builder](../examples/README.md#image-builderyaml-core-badge)
 
 These instructions apply to the following modules:
 
-* [schedmd-slurm-gcp-v5-controller]
-* [schedmd-slurm-gcp-v5-login]
-* [schedmd-slurm-gcp-v5-node-group]
+* [schedmd-slurm-gcp-v6-controller]
+* [schedmd-slurm-gcp-v6-login]
+* [schedmd-slurm-gcp-v6-nodeset]
 
-[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5
-[slurm-gcp-packer]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/v5/packer
+[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/master
+[slurm-gcp-packer]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/master/packer
 [slurm-gcp-images]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md
 [slurm-gcp-published-images]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family
 [gcloud-compute-images]: https://cloud.google.com/sdk/gcloud/reference/compute/images/create
 [vm-instance]: ../modules/compute/vm-instance
 [hpc-toolkit-packer]: ../modules/packer/custom-image
-[schedmd-slurm-gcp-v5-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-controller
-[schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login
-[schedmd-slurm-gcp-v5-node-group]: ../community/modules/compute/schedmd-slurm-gcp-v5-node-group
+[schedmd-slurm-gcp-v6-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v6-controller
+[schedmd-slurm-gcp-v6-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v6-login
+[schedmd-slurm-gcp-v6-nodeset]: ../community/modules/compute/schedmd-slurm-gcp-v6-nodeset
 [batch-job]: ../modules/scheduler/batch-job-template
 [batch-login]: ../modules/scheduler/batch-login-node
 [htcondor-setup]: ../community/modules/scheduler/htcondor-setup
@@ -307,7 +298,6 @@ These instructions apply to the following modules:
 [vm-crd.yaml]: ../tools/validate_configs/os_compatibility_tests/vm-crd.yaml
 [vm-filestore.yaml]: ../tools/validate_configs/os_compatibility_tests/vm-filestore.yaml
 [vm-lustre.yaml]: ../tools/validate_configs/os_compatibility_tests/vm-lustre.yaml
-[hpc-slurm-chromedesktop-v5-legacy.yaml]: ../community/examples/hpc-slurm-chromedesktop.yaml
 [slurm-filestore.yaml]: ../tools/validate_configs/os_compatibility_tests/slurm-filestore.yaml
 [batch-startup.yaml]: ../tools/validate_configs/os_compatibility_tests/batch-startup.yaml
 [batch-filestore.yaml]: ../tools/validate_configs/os_compatibility_tests/batch-filestore.yaml
diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml
index e03daa3d27..afa659d563 100644
--- a/examples/hpc-enterprise-slurm.yaml
+++ b/examples/hpc-enterprise-slurm.yaml
@@ -217,7 +217,7 @@ deployment_groups:
         node_count_dynamic_max: 16
         machine_type: a2-megagpu-16g
         # This makes this nodeset look for machines in any of the following zones
-        # https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/develop/community/modules/compute/schedmd-slurm-gcp-v5-partition#compute-vm-zone-policies
+        # https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/develop/community/modules/compute/schedmd-slurm-gcp-v6-nodeset#compute-vm-zone-policies
zones: $(vars.gpu_zones) bandwidth_tier: gvnic_enabled instance_image: $(vars.slurm_image) diff --git a/modules/README.md b/modules/README.md index 1f89107f0b..f326bda54f 100644 --- a/modules/README.md +++ b/modules/README.md @@ -35,10 +35,6 @@ Modules that are still in development and less stable are labeled with the ### Compute * **[vm-instance]** ![core-badge] : Creates one or more VM instances. -* **[schedmd-slurm-gcp-v5-partition]** ![community-badge] ![deprecated-badge] : - Creates a partition to be used by a [slurm-controller][schedmd-slurm-gcp-v5-controller]. -* **[schedmd-slurm-gcp-v5-node-group]** ![community-badge] ![deprecated-badge]: - Creates a node group to be used by the [schedmd-slurm-gcp-v5-partition] module. * **[schedmd-slurm-gcp-v6-partition]** ![core-badge] : Creates a partition to be used by a [slurm-controller][schedmd-slurm-gcp-v6-controller]. * **[schedmd-slurm-gcp-v6-nodeset]** ![core-badge] : @@ -65,8 +61,6 @@ Modules that are still in development and less stable are labeled with the [gke-node-pool]: ../modules/compute/gke-node-pool/README.md [resource-policy]: ../modules/compute/resource-policy/README.md [gke-job-template]: ../modules/compute/gke-job-template/README.md -[schedmd-slurm-gcp-v5-partition]: ../community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md -[schedmd-slurm-gcp-v5-node-group]: ../community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md [schedmd-slurm-gcp-v6-partition]: ../community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md [schedmd-slurm-gcp-v6-nodeset]: ../community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md [schedmd-slurm-gcp-v6-nodeset-tpu]: ../community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -188,12 +182,6 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca * **[gke-cluster]** ![core-badge] ![experimental-badge] : Creates a Kubernetes cluster using GKE. * **[pre-existing-gke-cluster]** ![core-badge] ![experimental-badge] : Retrieves an existing GKE cluster. Substitute for ([gke-cluster]) module. -* **[schedmd-slurm-gcp-v5-controller]** ![community-badge] ![deprecated-badge] : - Creates a Slurm controller node using [slurm-gcp-version-5]. -* **[schedmd-slurm-gcp-v5-login]** ![community-badge] ![deprecated-badge] : - Creates a Slurm login node using [slurm-gcp-version-5]. -* **[schedmd-slurm-gcp-v5-hybrid]** ![community-badge] ![experimental-badge] ![deprecated-badge] : - Creates hybrid Slurm partition configuration files using [slurm-gcp-version-5]. * **[schedmd-slurm-gcp-v6-controller]** ![core-badge] : Creates a Slurm controller node using [slurm-gcp-version-6]. * **[schedmd-slurm-gcp-v6-login]** ![core-badge] : @@ -220,10 +208,6 @@ Pub/Sub subscription. 
Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca
 [htcondor-access-point]: ../community/modules/scheduler/htcondor-access-point/README.md
 [schedmd-slurm-gcp-v6-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md
 [schedmd-slurm-gcp-v6-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md
-[schedmd-slurm-gcp-v5-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md
-[schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md
-[schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md
-[slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2
 [slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.6
 [pbspro-client]: ../community/modules/scheduler/pbspro-client/README.md
 [pbspro-server]: ../community/modules/scheduler/pbspro-server/README.md
@@ -278,10 +262,14 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca
 [spack-execute]: ../community/modules/scripts/spack-execute/README.md
 [wait-for-startup]: ../community/modules/scripts/wait-for-startup/README.md
 
-> **_NOTE:_** Slurm V4 is deprecated. In case, you want to use V4 modules, please use
+> **_NOTE:_** Slurm-GCP V4 is deprecated. If you want to use V4 modules, use the
 [ghpc-v1.27.0](https://github.com/GoogleCloudPlatform/hpc-toolkit/releases/tag/v1.27.0)
 source code and build ghpc binary from this. This source code also contains
 deprecated examples using V4 modules for your reference.
 
+> **_NOTE:_** Slurm-GCP V5 is deprecated. If you want to use V5 modules, use the
+[ghpc-v1.44.1](https://github.com/GoogleCloudPlatform/hpc-toolkit/releases/tag/v1.44.1)
+source code and build the ghpc binary from it. That source code also contains
+deprecated examples using V5 modules for your reference.
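As an illustration of the two notes above, the checkout-and-build flow looks roughly like this; a hedged sketch assuming the standard `make`-based Toolkit build with a local Go toolchain installed:

```shell
# Build ghpc from a tagged release (substitute the tag the relevant note points to).
git clone --branch v1.44.1 https://github.com/GoogleCloudPlatform/hpc-toolkit.git
cd hpc-toolkit
make             # builds the ghpc binary in the repository root
./ghpc --version # confirm the build succeeded
```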
 ## Module Fields
diff --git a/pkg/modulereader/metadata_legacy.go b/pkg/modulereader/metadata_legacy.go
index 7f4e22c1ec..37d5c1fb0b 100644
--- a/pkg/modulereader/metadata_legacy.go
+++ b/pkg/modulereader/metadata_legacy.go
@@ -49,9 +49,6 @@ func defaultAPIList(source string) []string {
 		"compute.googleapis.com",
 		"storage.googleapis.com",
 	},
-	"community/modules/compute/schedmd-slurm-gcp-v5-partition": {
-		"compute.googleapis.com",
-	},
 	"community/modules/database/slurm-cloudsql-federation": {
 		"bigqueryconnection.googleapis.com",
 		"sqladmin.googleapis.com",
@@ -115,19 +112,6 @@ func defaultAPIList(source string) []string {
 		"compute.googleapis.com",
 		"storage.googleapis.com",
 	},
-	"community/modules/scheduler/schedmd-slurm-gcp-v5-controller": {
-		"compute.googleapis.com",
-		"iam.googleapis.com",
-		"pubsub.googleapis.com",
-		"secretmanager.googleapis.com",
-	},
-	"community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid": {
-		"compute.googleapis.com",
-		"pubsub.googleapis.com",
-	},
-	"community/modules/scheduler/schedmd-slurm-gcp-v5-login": {
-		"compute.googleapis.com",
-	},
 	"community/modules/scripts/htcondor-install": {},
 	"community/modules/scripts/omnia-install": {},
 	"community/modules/scripts/pbspro-preinstall": {
diff --git a/tools/cloud-build/project-cleanup-slurm.yaml b/tools/cloud-build/project-cleanup-slurm.yaml
index dbd986fd89..e45d4fa8a3 100644
--- a/tools/cloud-build/project-cleanup-slurm.yaml
+++ b/tools/cloud-build/project-cleanup-slurm.yaml
@@ -36,9 +36,8 @@ steps:
       fi
 
-      # look only for tests that either use Slurm5, or Slurm6
-      # v5: clean project metadata
-      # v5+v6: clean resource policies
-      builds_filter="tags=m.schedmd-slurm-gcp-v6-controller OR tags=m.schedmd-slurm-gcp-v5-controller"
+      # look only for tests that use Slurm6
+      # v6: clean resource policies
+      builds_filter="tags=m.schedmd-slurm-gcp-v6-controller"
       builds_format="value(substitutions.TRIGGER_NAME,logUrl)"
       active_builds=$(gcloud builds list --project "${PROJECT_ID}" --filter="${builds_filter}" --format="${builds_format}" --ongoing 2>/dev/null)
       if [[ -n "$active_builds" ]]; then
diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py
index dd221801ac..a1ab586ef3 100644
--- a/tools/duplicate-diff.py
+++ b/tools/duplicate-diff.py
@@ -35,21 +35,11 @@
         "modules/scheduler/batch-job-template/startup_from_network_storage.tf",
         "modules/compute/vm-instance/startup_from_network_storage.tf",
     ],
-    [
-        "community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf",
-        "community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf",
-        "community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf",
-    ],
     [
         "modules/compute/gke-node-pool/threads_per_core_calc.tf",
         "modules/compute/vm-instance/threads_per_core_calc.tf",
     ],
-    [ # Slurm V5
-        "community/modules/compute/schedmd-slurm-gcp-v5-node-group/source_image_logic.tf",
-        "community/modules/scheduler/schedmd-slurm-gcp-v5-controller/source_image_logic.tf",
-        "community/modules/scheduler/schedmd-slurm-gcp-v5-login/source_image_logic.tf",
-    ],
-    [ # Slurm V6
+    [
         "community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf",
         "community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf",
         "community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf",
@@ -68,7 +58,6 @@
         "community/modules/scripts/ramble-setup/scripts/install_ramble_deps.yml",
     ],
     [
-        "community/modules/scheduler/schedmd-slurm-gcp-v5-controller/etc/long-prolog-slurm.conf.tpl",
"community/modules/scheduler/schedmd-slurm-gcp-v6-controller/etc/long-prolog-slurm.conf.tpl", ], [ diff --git a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml index 711c9f72e0..8ca6f6994d 100644 --- a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml +++ b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml @@ -217,7 +217,7 @@ deployment_groups: node_count_dynamic_max: 16 machine_type: a2-megagpu-16g # This makes this nodeset look for machines in any of the following zones - # https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/develop/community/modules/compute/schedmd-slurm-gcp-v5-partition#compute-vm-zone-policies // !!! + # https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/develop/community/modules/compute/schedmd-slurm-gcp-v6-nodeset#compute-vm-zone-policies // !!! zones: $(vars.gpu_zones) bandwidth_tier: gvnic_enabled instance_image: $(vars.slurm_image) diff --git a/tools/validate_configs/validate_configs.sh b/tools/validate_configs/validate_configs.sh index 57929a4ded..b00a2e36b6 100755 --- a/tools/validate_configs/validate_configs.sh +++ b/tools/validate_configs/validate_configs.sh @@ -121,12 +121,20 @@ check_background() { } CONFIGS=$(find examples/ community/examples/ tools/validate_configs/test_configs/ docs/tutorials/ docs/videos/build-your-own-blueprint/ -name "*.yaml" -type f -not -path 'examples/machine-learning/a3-megagpu-8g/*' -not -path 'examples/machine-learning/a3-ultragpu-8g/*' -not -path 'examples/gke-a3-ultragpu/*' -not -path 'examples/hypercompute_clusters/*') +# Exclude blueprints that use v5 modules. +declare -A EXCLUDE_EXAMPLE +EXCLUDE_EXAMPLE["tools/validate_configs/test_configs/two-clusters-sql.yaml"]= cwd=$(pwd) NPROCS=${NPROCS:-$(nproc)} echo "Running tests in $NPROCS processes" pids=() for example in $CONFIGS; do + if [[ ${EXCLUDE_EXAMPLE[$example]+_} ]]; then + echo "Skipping example: $example" + continue + fi + JNUM=$(jobs | wc -l) # echo "$JNUM jobs running" if [ "$JNUM" -ge "$NPROCS" ]; then