From 3c43c83543be0b723ad56395ff7a06e5e98e2916 Mon Sep 17 00:00:00 2001 From: David Costa Date: Thu, 21 Dec 2023 15:35:27 +0000 Subject: [PATCH] feat!: autoscaler with scaling schedules Add the ability to use an autoscaler to scale down to zero outside the defined schedules. Only non-stateful MIGs can be used with autoscalers, so this commit also removes the responsibility of creating the home folder disk (atlantis-disk-0) from the MIG, effectively making it a stateless MIG. Nonetheless, destroying the group will not destroy the disk. Add resources for the disk and the autoscaler, and a usage example. Update the README. BREAKING CHANGE: the 50GB stateful disk is no longer created by the mig, which makes the mig no longer stateful. Additionally, if terraform destroy is executed, the disk is destroyed. --- README.md | 6 ++ examples/autoscaling/README.md | 35 +++++++++ examples/autoscaling/main.tf | 93 +++++++++++++++++++++++ examples/autoscaling/server-atlantis.yaml | 6 ++ main.tf | 77 +++++++++++++------ variables.tf | 15 ++++ 6 files changed, 208 insertions(+), 24 deletions(-) create mode 100644 examples/autoscaling/README.md create mode 100644 examples/autoscaling/main.tf create mode 100644 examples/autoscaling/server-atlantis.yaml diff --git a/README.md b/README.md index 7d4b6b9..fecefe0 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,8 @@ This Terraform module deploys various resources to run Atlantis on Google Comput - **Confidential VM** - A Confidential VM is a type of Compute Engine VM that ensures that your data and applications stay private and encrypted even while in use. You can use a Confidential VM as part of your security strategy so you do not expose sensitive data or workloads during processing. Note that Confidential VM [does not support live migration](https://cloud.google.com/confidential-computing/confidential-vm/docs/error-messages#live_migration_isnt_supported), so if this feature is enabled, `onHostMaintenance` will be set to `TERMINATE`. 
+- **Scale to zero** - Use [scaling schedules](https://cloud.google.com/compute/docs/autoscaler/scaling-schedules#schedule_configuration_options) so that the instance group only scales up when configured, and down to zero otherwise. Useful to minimize costs. + ## Prerequisites This module expects that you already own or create the below resources yourself. @@ -66,6 +68,7 @@ Here are some examples to choose from. Look at the prerequisites above to find o - [Secure Environment Variables](https://github.com/runatlantis/terraform-gce-atlantis/tree/master/examples/secure-env-vars) - [Cloud Armor](https://github.com/runatlantis/terraform-gce-atlantis/tree/master/examples/cloud-armor) - [Shared VPC](https://github.com/runatlantis/terraform-gce-atlantis/tree/master/examples/shared-vpc) +- [Scale to zero](https://github.com/runatlantis/atlantis-on-gcp-vm/tree/master/examples/autoscaling) ```hcl module "atlantis" { @@ -211,8 +214,10 @@ You can check the status of the certificate in the Google Cloud Console. 
| Name | Type | |------|------| | [google-beta_google_compute_instance_group_manager.default](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_compute_instance_group_manager) | resource | +| [google_compute_autoscaler.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_autoscaler) | resource | | [google_compute_backend_service.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_backend_service) | resource | | [google_compute_backend_service.iap](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_backend_service) | resource | +| [google_compute_disk.persistent](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_disk) | resource | | [google_compute_firewall.lb_health_check](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_firewall) | resource | | [google_compute_global_address.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_global_address) | resource | | [google_compute_global_forwarding_rule.https](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_global_forwarding_rule) | resource | @@ -232,6 +237,7 @@ You can check the status of the certificate in the Google Cloud Console. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [autoscaling](#input\_autoscaling) | Set schedules so that the instance group only scales up when configured |
object({
schedules = list(object({
name = string
description = string
schedule = string
time_zone = string
duration_sec = number
}))
})
| `null` | no | | [block\_project\_ssh\_keys\_enabled](#input\_block\_project\_ssh\_keys\_enabled) | Blocks the use of project-wide publich SSH keys | `bool` | `false` | no | | [default\_backend\_security\_policy](#input\_default\_backend\_security\_policy) | Name of the security policy to apply to the default backend service | `string` | `null` | no | | [disk\_kms\_key\_self\_link](#input\_disk\_kms\_key\_self\_link) | The self link of the encryption key that is stored in Google Cloud KMS | `string` | `null` | no | diff --git a/examples/autoscaling/README.md b/examples/autoscaling/README.md new file mode 100644 index 0000000..c42c717 --- /dev/null +++ b/examples/autoscaling/README.md @@ -0,0 +1,35 @@ +# Example usage + +This example uses [scaling schedules](https://cloud.google.com/compute/docs/autoscaler/scaling-schedules#schedule_configuration_options) to only deploy Atlantis during business hours. + +The schedules follow the syntax [described in the documentation](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_autoscaler#nested_scaling_schedules), but in short: + +- The time zone must be a time zone name from the [tz database](https://en.wikipedia.org/wiki/Tz_database) +- The schedule field uses the extended cron format + +> [!NOTE] +> It takes 2 to 3 minutes from the beginning of the scheduled time for the instance to be ready to serve requests. After the scheduled end time, it takes approximately 10 minutes for the instance to be destroyed. + +Read through the below before you deploy this module. + +- [Prerequisites](#prerequisites) +- [How to deploy](#how-to-deploy) +- [After it's successfully deployed](#after-its-successfully-deployed) + +## Prerequisites + +This module expects that you already own or create the below resources yourself. 
+ +- Google network, subnetwork and a Cloud NAT +- Service account, [specifics can be found here](../../README.md#service-account) +- Domain, [specifics can be found here](../../README.md#dns-record) + +If you prefer an example that includes the above resources, see [`complete example`](https://github.com/runatlantis/atlantis-on-gcp-vm/tree/master/examples/complete). + +## How to deploy + +See [`main.tf`](https://github.com/runatlantis/atlantis-on-gcp-vm/tree/master/examples/autoscaling/main.tf) and the [`server-atlantis.yaml`](https://github.com/runatlantis/atlantis-on-gcp-vm/tree/master/examples/autoscaling/server-atlantis.yaml). + +## After it's successfully deployed + +Once you're done, see [Configuring Webhooks for Atlantis](https://www.runatlantis.io/docs/configuring-webhooks.html#configuring-webhooks) diff --git a/examples/autoscaling/main.tf b/examples/autoscaling/main.tf new file mode 100644 index 0000000..6d0aa6a --- /dev/null +++ b/examples/autoscaling/main.tf @@ -0,0 +1,93 @@ +locals { + project_id = "" + network = "" + subnetwork = "" + region = "" + zone = "" + domain = "" + managed_zone = "" + + github_repo_allow_list = "github.com/example/*" + github_user = "" + github_token = "" + github_webhook_secret = "" +} + +# Create a service account and attach the required Cloud Logging permissions to it. 
+resource "google_service_account" "atlantis" { + account_id = "atlantis" + display_name = "Service Account for Atlantis" + project = local.project_id +} + +resource "google_project_iam_member" "atlantis_log_writer" { + role = "roles/logging.logWriter" + member = "serviceAccount:${google_service_account.atlantis.email}" + project = local.project_id +} + +resource "google_project_iam_member" "atlantis_metric_writer" { + role = "roles/monitoring.metricWriter" + member = "serviceAccount:${google_service_account.atlantis.email}" + project = local.project_id +} + +module "atlantis" { + source = "bschaatsbergen/atlantis/gce" + name = "atlantis" + network = local.network + subnetwork = local.subnetwork + region = local.region + zone = local.zone + service_account = { + email = google_service_account.atlantis.email + scopes = ["cloud-platform"] + } + # Note: environment variables are shown in the Google Cloud UI + # See the `examples/secure-env-vars` if you want to protect sensitive information + env_vars = { + ATLANTIS_GH_USER = local.github_user + ATLANTIS_GH_TOKEN = local.github_token + ATLANTIS_GH_WEBHOOK_SECRET = local.github_webhook_secret + ATLANTIS_REPO_ALLOWLIST = local.github_repo_allow_list + ATLANTIS_ATLANTIS_URL = "https://${local.domain}" + ATLANTIS_REPO_CONFIG_JSON = jsonencode(yamldecode(file("${path.module}/server-atlantis.yaml"))) + } + + autoscaling = { + schedules = [ + # Monday through Friday, between 7h30 and 19h30 + { + name = "business-hours" + description = "Deploy during business hours" + schedule = "30 07 * * 1-5" + time_zone = "Europe/London" + duration_sec = 12 * 60 * 60 + }, + # Monday through Friday, all day + # { + # name = "mon-fri" + # description = "Deploy during weekdays" + # schedule = "00 00 * * 1-5" + # time_zone = "Europe/London" + # duration_sec = 24 * 60 * 60 + # }, + ] + } + + domain = local.domain + project = local.project_id +} + +# As your DNS records might be managed at another registrar's site, we create the DNS record 
outside of the module. +# This record is mandatory in order to provision the managed SSL certificate successfully. +resource "google_dns_record_set" "default" { + name = "${local.domain}." + type = "A" + ttl = 60 + managed_zone = local.managed_zone + rrdatas = [ + module.atlantis.ip_address + ] + project = local.project_id +} diff --git a/examples/autoscaling/server-atlantis.yaml b/examples/autoscaling/server-atlantis.yaml new file mode 100644 index 0000000..71ec5f7 --- /dev/null +++ b/examples/autoscaling/server-atlantis.yaml @@ -0,0 +1,6 @@ +repos: +- id: /.*/ + apply_requirements: [mergeable] + allowed_overrides: [apply_requirements, workflow] + allow_custom_workflows: true + delete_source_branch_on_merge: true diff --git a/main.tf b/main.tf index a6201be..0b659ba 100644 --- a/main.tf +++ b/main.tf @@ -171,24 +171,10 @@ resource "google_compute_instance_template" "default" { # Persistent disk for Atlantis disk { - device_name = "atlantis-disk-0" - disk_type = var.persistent_disk_type - mode = "READ_WRITE" - disk_size_gb = var.persistent_disk_size_gb - auto_delete = false - labels = merge( - local.atlantis_labels, - { - "disk-type" = "data" - }, - ) - - dynamic "disk_encryption_key" { - for_each = var.disk_kms_key_self_link != null ? [1] : [] - content { - kms_key_self_link = var.disk_kms_key_self_link - } - } + device_name = "atlantis-disk-0" + mode = "READ_WRITE" + source = google_compute_disk.persistent.name + auto_delete = false } network_interface { @@ -222,6 +208,28 @@ resource "google_compute_instance_template" "default" { } } +resource "google_compute_disk" "persistent" { + name = var.name + type = var.persistent_disk_type + size = var.persistent_disk_size_gb + zone = var.zone + project = var.project # Create the disk in the module's project rather than the provider default + labels = merge( + local.atlantis_labels, + { + "disk-type" = "data" + }, + ) + + dynamic "disk_encryption_key" { + for_each = var.disk_kms_key_self_link != null ? 
[1] : [] + content { + kms_key_self_link = var.disk_kms_key_self_link + } + } + +} + resource "google_compute_health_check" "default" { name = var.name check_interval_sec = 1 @@ -268,17 +276,13 @@ resource "google_compute_instance_group_manager" "default" { port = local.atlantis_port } - stateful_disk { - device_name = "atlantis-disk-0" - delete_rule = "NEVER" - } - auto_healing_policies { health_check = google_compute_health_check.default_instance_group_manager.id initial_delay_sec = 30 } - target_size = 1 + # We cannot set target_size when using an autoscaler + target_size = var.autoscaling == null ? 1 : null update_policy { type = "PROACTIVE" @@ -292,6 +296,33 @@ resource "google_compute_instance_group_manager" "default" { provider = google-beta } +resource "google_compute_autoscaler" "default" { + count = var.autoscaling == null ? 0 : 1 + + name = var.name + zone = var.zone + project = var.project # Keep the autoscaler in the module's project rather than the provider default + target = google_compute_instance_group_manager.default.id + + autoscaling_policy { + max_replicas = 1 # Allow at most one instance + min_replicas = 0 # Allow scaling down to zero + cooldown_period = 60 + + dynamic "scaling_schedules" { + for_each = var.autoscaling.schedules == null ? 
[] : var.autoscaling.schedules + content { + name = scaling_schedules.value.name + description = scaling_schedules.value.description + min_required_replicas = 1 + schedule = scaling_schedules.value.schedule + time_zone = scaling_schedules.value.time_zone + duration_sec = scaling_schedules.value.duration_sec + } + } + } +} + resource "google_compute_global_address" "default" { name = var.name project = var.project diff --git a/variables.tf b/variables.tf index 5c864d5..c8406aa 100644 --- a/variables.tf +++ b/variables.tf @@ -206,4 +206,19 @@ variable "persistent_disk_type" { type = string description = "The type of persistent disk that Atlantis uses to store its data on" default = "pd-ssd" + +} + +variable "autoscaling" { + description = "Set schedules so that the instance group only scales up when configured" + type = object({ + schedules = list(object({ + name = string + description = string + schedule = string + time_zone = string + duration_sec = number + })) + }) + default = null }