feat: add autoscaler_policy_overrides support to module (#95)

castai · Jun 11, 2024 · 2e6eeba · 2e6eeba
1 parent 3a0e76d
commit 2e6eeba
Show file tree

Hide file tree

Showing 4 changed files with 279 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -97,6 +97,53 @@ module "castai-eks-cluster" {
       }
     }
   }
+
+  autoscaler_settings = {
+    enabled                                 = true
+    node_templates_partial_matching_enabled = false
+
+    unschedulable_pods = {
+      enabled = true
+
+      headroom = {
+        enabled           = true
+        cpu_percentage    = 10
+        memory_percentage = 10
+      }
+
+      headroom_spot = {
+        enabled           = true
+        cpu_percentage    = 10
+        memory_percentage = 10
+      }
+    }
+
+    node_downscaler = {
+      enabled = true
+
+      empty_nodes = {
+        enabled = true
+      }
+
+      evictor = {
+        aggressive_mode           = false
+        cycle_interval            = "5m10s"
+        dry_run                   = false
+        enabled                   = true
+        node_grace_period_minutes = 10
+        scoped_mode               = false
+      }
+    }
+
+    cluster_limits = {
+      enabled = true
+
+      cpu = {
+        max_cores = 20
+        min_cores = 1
+      }
+    }
+  }
 }
 ```
 
@@ -311,6 +358,88 @@ module "castai-eks-cluster" {
 }
 ```
 
+Migrating from 9.x.x to 9.3.x
+---------------------------
+
+Version 9.3.x changed:
+* Deprecated `autoscaler_policies_json` attribute. Use `autoscaler_settings` instead.
+
+Old configuration:
+```hcl
+module "castai-eks-cluster" {
+  autoscaler_policies_json = <<-EOT
+    {
+        "enabled": true,
+        "unschedulablePods": {
+            "enabled": true
+        },
+        "nodeDownscaler": {
+            "enabled": true,
+            "emptyNodes": {
+                "enabled": true
+            },
+            "evictor": {
+                "aggressiveMode": false,
+                "cycleInterval": "5m10s",
+                "dryRun": false,
+                "enabled": true,
+                "nodeGracePeriodMinutes": 10,
+                "scopedMode": false
+            }
+        },
+        "nodeTemplatesPartialMatchingEnabled": false,
+        "clusterLimits": {
+            "cpu": {
+                "maxCores": 20,
+                "minCores": 1
+            },
+            "enabled": true
+        }
+    }
+  EOT
+}
+```
+
+New configuration:
+```hcl
+module "castai-eks-cluster" {
+  autoscaler_settings = {
+    enabled                                 = true
+    node_templates_partial_matching_enabled = false
+
+    unschedulable_pods = {
+      enabled = true
+    }
+
+    node_downscaler = {
+      enabled = true
+
+      empty_nodes = {
+        enabled = true
+      }
+
+      evictor = {
+        aggressive_mode           = false
+        cycle_interval            = "5m10s"
+        dry_run                   = false
+        enabled                   = true
+        node_grace_period_minutes = 10
+        scoped_mode               = false
+      }
+    }
+
+    cluster_limits = {
+      enabled = true
+
+      cpu = {
+        max_cores = 20
+        min_cores = 1
+      }
+    }
+  }
+}
+```
+
 # Examples
 
 Usage examples are located in [terraform provider repo](https://github.com/castai/terraform-provider-castai/tree/master/examples/eks)
@@ -324,20 +453,20 @@ terraform-docs markdown table . --output-file README.md
 <!-- BEGIN_TF_DOCS -->
 ## Requirements
 
-| Name | Version |
-|------|---------|
-| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 0.13 |
-| <a name="requirement_aws"></a> [aws](#requirement\_aws) | >= 2.49 |
-| <a name="requirement_castai"></a> [castai](#requirement\_castai) | ~> 7.3.0 |
+| Name | Version  |
+|------|----------|
+| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 0.13  |
+| <a name="requirement_aws"></a> [aws](#requirement\_aws) | >= 2.49  |
+| <a name="requirement_castai"></a> [castai](#requirement\_castai) | ~> 7.4.0 |
 | <a name="requirement_helm"></a> [helm](#requirement\_helm) | >= 2.0.0 |
 
 ## Providers
 
 | Name | Version |
 |------|---------|
-| <a name="provider_castai"></a> [castai](#provider\_castai) | ~> 7.3.0 |
-| <a name="provider_helm"></a> [helm](#provider\_helm) | >= 2.0.0 |
-| <a name="provider_null"></a> [null](#provider\_null) | n/a |
+| <a name="provider_castai"></a> [castai](#provider\_castai) | 7.4.0   |
+| <a name="provider_helm"></a> [helm](#provider\_helm) | 2.13.2  |
+| <a name="provider_null"></a> [null](#provider\_null) | 3.2.2   |
 
 ## Modules
 
@@ -380,7 +509,8 @@ No modules.
 | <a name="input_agent_version"></a> [agent\_version](#input\_agent\_version) | Version of castai-agent helm chart. Default latest | `string` | `null` | no |
 | <a name="input_api_grpc_addr"></a> [api\_grpc\_addr](#input\_api\_grpc\_addr) | CAST AI GRPC API address | `string` | `"api-grpc.cast.ai:443"` | no |
 | <a name="input_api_url"></a> [api\_url](#input\_api\_url) | URL of alternative CAST AI API to be used during development or testing | `string` | `"https://api.cast.ai"` | no |
-| <a name="input_autoscaler_policies_json"></a> [autoscaler\_policies\_json](#input\_autoscaler\_policies\_json) | Optional json object to override CAST AI cluster autoscaler policies | `string` | `null` | no |
+| <a name="input_autoscaler_policies_json"></a> [autoscaler\_policies\_json](#input\_autoscaler\_policies\_json) | Optional json object to override CAST AI cluster autoscaler policies. Deprecated, use `autoscaler_settings` instead. | `string` | `null` | no |
+| <a name="input_autoscaler_settings"></a> [autoscaler\_settings](#input\_autoscaler\_settings) | Optional Autoscaler policy definitions to override current autoscaler settings | `any` | `null` | no |
 | <a name="input_aws_account_id"></a> [aws\_account\_id](#input\_aws\_account\_id) | ID of AWS account the cluster is located in. | `string` | n/a | yes |
 | <a name="input_aws_assume_role_arn"></a> [aws\_assume\_role\_arn](#input\_aws\_assume\_role\_arn) | Arn of the role to be used by CAST AI for IAM access | `string` | `null` | no |
 | <a name="input_aws_cluster_name"></a> [aws\_cluster\_name](#input\_aws\_cluster\_name) | Name of the cluster to be connected to CAST AI. | `string` | n/a | yes |

diff --git a/main.tf b/main.tf
@@ -811,8 +811,139 @@ resource "helm_release" "castai_kvisor_self_managed" {
 }
 
 resource "castai_autoscaler" "castai_autoscaler_policies" {
+  cluster_id = castai_eks_cluster.my_castai_cluster.id
+
   autoscaler_policies_json = var.autoscaler_policies_json
-  cluster_id               = castai_eks_cluster.my_castai_cluster.id
+
+  dynamic "autoscaler_settings" {
+    for_each = var.autoscaler_settings != null ? [var.autoscaler_settings] : []
+
+    content {
+      enabled                                 = try(autoscaler_settings.value.enabled, null)
+      is_scoped_mode                          = try(autoscaler_settings.value.is_scoped_mode, null)
+      node_templates_partial_matching_enabled = try(autoscaler_settings.value.node_templates_partial_matching_enabled, null)
+
+      dynamic "unschedulable_pods" {
+        for_each = try([autoscaler_settings.value.unschedulable_pods], [])
+
+        content {
+          enabled                  = try(unschedulable_pods.value.enabled, null)
+          custom_instances_enabled = try(unschedulable_pods.value.custom_instances_enabled, null)
+
+          dynamic "headroom" {
+            for_each = try([unschedulable_pods.value.headroom], [])
+
+            content {
+              enabled           = try(headroom.value.enabled, null)
+              cpu_percentage    = try(headroom.value.cpu_percentage, null)
+              memory_percentage = try(headroom.value.memory_percentage, null)
+            }
+          }
+
+          dynamic "headroom_spot" {
+            for_each = try([unschedulable_pods.value.headroom_spot], [])
+
+            content {
+              enabled           = try(headroom_spot.value.enabled, null)
+              cpu_percentage    = try(headroom_spot.value.cpu_percentage, null)
+              memory_percentage = try(headroom_spot.value.memory_percentage, null)
+            }
+          }
+
+          dynamic "node_constraints" {
+            for_each = try([unschedulable_pods.value.node_constraints], [])
+
+            content {
+              enabled       = try(node_constraints.value.enabled, null)
+              min_cpu_cores = try(node_constraints.value.min_cpu_cores, null)
+              max_cpu_cores = try(node_constraints.value.max_cpu_cores, null)
+              min_ram_mib   = try(node_constraints.value.min_ram_mib, null)
+              max_ram_mib   = try(node_constraints.value.max_ram_mib, null)
+            }
+          }
+        }
+      }
+
+      dynamic "cluster_limits" {
+        for_each = try([autoscaler_settings.value.cluster_limits], [])
+
+        content {
+          enabled = try(cluster_limits.value.enabled, null)
+
+
+          dynamic "cpu" {
+            for_each = try([cluster_limits.value.cpu], [])
+
+            content {
+              min_cores = try(cpu.value.min_cores, null)
+              max_cores = try(cpu.value.max_cores, null)
+            }
+          }
+        }
+      }
+
+      dynamic "spot_instances" {
+        for_each = try([autoscaler_settings.value.spot_instances], [])
+
+        content {
+          enabled                             = try(spot_instances.value.enabled, null)
+          max_reclaim_rate                    = try(spot_instances.value.max_reclaim_rate, null)
+          spot_diversity_enabled              = try(spot_instances.value.spot_diversity_enabled, null)
+          spot_diversity_price_increase_limit = try(spot_instances.value.spot_diversity_price_increase_limit, null)
+
+          dynamic "spot_backups" {
+            for_each = try([spot_instances.value.spot_backups], [])
+
+            content {
+              enabled                          = try(spot_backups.value.enabled, null)
+              spot_backup_restore_rate_seconds = try(spot_backups.value.spot_backup_restore_rate_seconds, null)
+            }
+          }
+
+          dynamic "spot_interruption_predictions" {
+            for_each = try([spot_instances.value.spot_interruption_predictions], [])
+
+            content {
+              enabled                            = try(spot_interruption_predictions.value.enabled, null)
+              spot_interruption_predictions_type = try(spot_interruption_predictions.value.spot_interruption_predictions_type, null)
+            }
+          }
+        }
+      }
+
+      dynamic "node_downscaler" {
+        for_each = try([autoscaler_settings.value.node_downscaler], [])
+
+        content {
+          enabled = try(node_downscaler.value.enabled, null)
+
+          dynamic "empty_nodes" {
+            for_each = try([node_downscaler.value.empty_nodes], [])
+
+            content {
+              enabled       = try(empty_nodes.value.enabled, null)
+              delay_seconds = try(empty_nodes.value.delay_seconds, null)
+            }
+          }
+
+          dynamic "evictor" {
+            for_each = try([node_downscaler.value.evictor], [])
+
+            content {
+              enabled                                = try(evictor.value.enabled, null)
+              dry_run                                = try(evictor.value.dry_run, null)
+              aggressive_mode                        = try(evictor.value.aggressive_mode, null)
+              scoped_mode                            = try(evictor.value.scoped_mode, null)
+              cycle_interval                         = try(evictor.value.cycle_interval, null)
+              node_grace_period_minutes              = try(evictor.value.node_grace_period_minutes, null)
+              pod_eviction_failure_back_off_interval = try(evictor.value.pod_eviction_failure_back_off_interval, null)
+              ignore_pod_disruption_budgets          = try(evictor.value.ignore_pod_disruption_budgets, null)
+            }
+          }
+        }
+      }
+    }
+  }
 
   depends_on = [helm_release.castai_agent, helm_release.castai_evictor, helm_release.castai_evictor_ext]
 }
diff --git a/variables.tf b/variables.tf
@@ -40,7 +40,13 @@ variable "api_grpc_addr" {
 
 variable "autoscaler_policies_json" {
   type        = string
-  description = "Optional json object to override CAST AI cluster autoscaler policies"
+  description = "Optional json object to override CAST AI cluster autoscaler policies. Deprecated, use `autoscaler_settings` instead."
+  default     = null
+}
+
+variable "autoscaler_settings" {
+  type        = any
+  description = "Optional Autoscaler policy definitions to override current autoscaler settings"
   default     = null
 }
 

diff --git a/versions.tf b/versions.tf
@@ -8,7 +8,7 @@ terraform {
     }
     castai = {
       source  = "castai/castai"
-      version = "~> 7.3.0"
+      version = "~> 7.4.0"
     }
     helm = {
       source  = "hashicorp/helm"