From 25e9a3d3e3f1588c70c45c43163f523036828a44 Mon Sep 17 00:00:00 2001 From: Nuru Date: Mon, 8 Jul 2024 09:06:27 -0700 Subject: [PATCH] [eks/karpenter] Add support for `kubelet` config, fix IAM support for `v1alpha` cleanup (#1076) --- modules/eks/karpenter-node-pool/CHANGELOG.md | 15 +++++++++ modules/eks/karpenter-node-pool/README.md | 2 +- modules/eks/karpenter-node-pool/main.tf | 12 +++++-- modules/eks/karpenter-node-pool/variables.tf | 10 ++++-- modules/eks/karpenter/CHANGELOG.md | 32 ++++++++++++++++++- modules/eks/karpenter/README.md | 2 ++ .../karpenter/controller-policy-v1alpha.tf | 30 ++++++++++++++--- modules/eks/karpenter/main.tf | 2 +- 8 files changed, 94 insertions(+), 11 deletions(-) create mode 100644 modules/eks/karpenter-node-pool/CHANGELOG.md diff --git a/modules/eks/karpenter-node-pool/CHANGELOG.md b/modules/eks/karpenter-node-pool/CHANGELOG.md new file mode 100644 index 000000000..2a110392c --- /dev/null +++ b/modules/eks/karpenter-node-pool/CHANGELOG.md @@ -0,0 +1,15 @@ +## Components [PR #1076](https://github.com/cloudposse/terraform-aws-components/pull/1076) + +- Allow specifying elements of `spec.template.spec.kubelet` +- Make taint values optional + +The `var.node_pools` map now includes a `kubelet` field that allows specifying elements of `spec.template.spec.kubelet`. +This is useful for configuring the kubelet to use custom settings, such as reserving resources for system daemons. + +For more information, see: + +- [Karpenter documentation](https://karpenter.sh/docs/concepts/nodepools/#spectemplatespeckubelet) +- [Kubernetes documentation](https://kubernetes.io/docs/reference/config-api/kubelet-config.v1beta1/) + +The `value` fields of the `taints` and `startup_taints` lists in the `var.node_pools` map are now optional. This is in +alignment with the Kubernetes API, where `key` and `effect` are required, but the `value` field is optional. 
diff --git a/modules/eks/karpenter-node-pool/README.md b/modules/eks/karpenter-node-pool/README.md index 1d2c16355..449fb589d 100644 --- a/modules/eks/karpenter-node-pool/README.md +++ b/modules/eks/karpenter-node-pool/README.md @@ -203,7 +203,7 @@ components: | [labels\_as\_tags](#input\_labels\_as\_tags) | Set of labels (ID elements) to include as tags in the `tags` output.
Default is to include all labels.
Tags with empty values will not be included in the `tags` output.
Set to `[]` to suppress all generated tags.
**Notes:**
The value of the `name` tag, if included, will be the `id`, not the `name`.
Unlike other `null-label` inputs, the initial setting of `labels_as_tags` cannot be
changed in later chained modules. Attempts to change it will be silently ignored. | `set(string)` |
[
"default"
]
| no | | [name](#input\_name) | ID element. Usually the component or solution name, e.g. 'app' or 'jenkins'.
This is the only ID element not also included as a `tag`.
The "name" tag is set to the full `id` string. There is no tag with the value of the `name` input. | `string` | `null` | no | | [namespace](#input\_namespace) | ID element. Usually an abbreviation of your organization name, e.g. 'eg' or 'cp', to help ensure generated IDs are globally unique | `string` | `null` | no | -| [node\_pools](#input\_node\_pools) | Configuration for node pools. See code for details. |
map(object({
# The name of the Karpenter provisioner. The map key is used if this is not set.
name = optional(string)
# Whether to place EC2 instances launched by Karpenter into VPC private subnets. Set it to `false` to use public subnets.
private_subnets_enabled = bool
# The Disruption spec controls how Karpenter scales down the node group.
# See the example (sadly not the specific `spec.disruption` documentation) at https://karpenter.sh/docs/concepts/nodepools/ for details
disruption = optional(object({
# Describes which types of Nodes Karpenter should consider for consolidation.
# If using 'WhenUnderutilized', Karpenter will consider all nodes for consolidation and attempt to remove or
# replace Nodes when it discovers that the Node is underutilized and could be changed to reduce cost.
# If using `WhenEmpty`, Karpenter will only consider nodes for consolidation that contain no workload pods.
consolidation_policy = optional(string, "WhenUnderutilized")

# The amount of time Karpenter should wait after discovering a consolidation decision (`go` duration string, s, m, or h).
# This value can currently (v0.36.0) only be set when the consolidationPolicy is 'WhenEmpty'.
# You can choose to disable consolidation entirely by setting the string value 'Never' here.
# Earlier versions of Karpenter called this field `ttl_seconds_after_empty`.
consolidate_after = optional(string)

# The amount of time a Node can live on the cluster before being removed (`go` duration string, s, m, or h).
# You can choose to disable expiration entirely by setting the string value 'Never' here.
# This module sets a default of 336 hours (14 days), while the Karpenter default is 720 hours (30 days).
# Note that Karpenter calls this field "expiresAfter", and earlier versions called it `ttl_seconds_until_expired`,
# but we call it "max_instance_lifetime" to match the corresponding field in EC2 Auto Scaling Groups.
max_instance_lifetime = optional(string, "336h")

# Budgets control the the maximum number of NodeClaims owned by this NodePool that can be terminating at once.
# See https://karpenter.sh/docs/concepts/disruption/#disruption-budgets for details.
# A percentage is the percentage of the total number of active, ready nodes not being deleted, rounded up.
# If there are multiple active budgets, Karpenter uses the most restrictive value.
# If left undefined, this will default to one budget with a value of nodes: 10%.
# Note that budgets do not prevent or limit involuntary terminations.
# Example:
# On Weekdays during business hours, don't do any deprovisioning.
# budgets = {
# schedule = "0 9 * * mon-fri"
# duration = 8h
# nodes = "0"
# }
budgets = optional(list(object({
# The schedule specifies when a budget begins being active, using extended cronjob syntax.
# See https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#schedule-syntax for syntax details.
# Timezones are not supported. This field is required if Duration is set.
schedule = optional(string)
# Duration determines how long a Budget is active after each Scheduled start.
# If omitted, the budget is always active. This is required if Schedule is set.
# Must be a whole number of minutes and hours, as cron does not work in seconds,
# but since Go's `duration.String()` always adds a "0s" at the end, that is allowed.
duration = optional(string)
# The percentage or number of nodes that Karpenter can scale down during the budget.
nodes = string
})), [])
}), {})
# Karpenter provisioner total CPU limit for all pods running on the EC2 instances launched by Karpenter
total_cpu_limit = string
# Karpenter provisioner total memory limit for all pods running on the EC2 instances launched by Karpenter
total_memory_limit = string
# Set a weight for this node pool.
# See https://karpenter.sh/docs/concepts/scheduling/#weighted-nodepools
weight = optional(number, 50)
labels = optional(map(string))
annotations = optional(map(string))
# Karpenter provisioner taints configuration. See https://aws.github.io/aws-eks-best-practices/karpenter/#create-provisioners-that-are-mutually-exclusive for more details
taints = optional(list(object({
key = string
effect = string
value = string
})))
startup_taints = optional(list(object({
key = string
effect = string
value = string
})))
# Karpenter node metadata options. See https://karpenter.sh/docs/concepts/nodeclasses/#specmetadataoptions for more details
metadata_options = optional(object({
httpEndpoint = optional(string, "enabled")
httpProtocolIPv6 = optional(string, "disabled")
httpPutResponseHopLimit = optional(number, 2)
# httpTokens can be either "required" or "optional"
httpTokens = optional(string, "required")
}), {})
# The AMI used by Karpenter provisioner when provisioning nodes. Based on the value set for amiFamily, Karpenter will automatically query for the appropriate EKS optimized AMI via AWS Systems Manager (SSM)
ami_family = string
# Karpenter nodes block device mappings. Controls the Elastic Block Storage volumes that Karpenter attaches to provisioned nodes.
# Karpenter uses default block device mappings for the AMI Family specified.
# For example, the Bottlerocket AMI Family defaults with two block device mappings,
# and normally you only want to scale `/dev/xvdb` where Containers and there storage are stored.
# Most other AMIs only have one device mapping at `/dev/xvda`.
# See https://karpenter.sh/docs/concepts/nodeclasses/#specblockdevicemappings for more details
block_device_mappings = list(object({
deviceName = string
ebs = optional(object({
volumeSize = string
volumeType = string
deleteOnTermination = optional(bool, true)
encrypted = optional(bool, true)
iops = optional(number)
kmsKeyID = optional(string, "alias/aws/ebs")
snapshotID = optional(string)
throughput = optional(number)
}))
}))
# Set acceptable (In) and unacceptable (Out) Kubernetes and Karpenter values for node provisioning based on Well-Known Labels and cloud-specific settings. These can include instance types, zones, computer architecture, and capacity type (such as AWS spot or on-demand). See https://karpenter.sh/v0.18.0/provisioner/#specrequirements for more details
requirements = list(object({
key = string
operator = string
# Operators like "Exists" and "DoesNotExist" do not require a value
values = optional(list(string))
}))
}))
| n/a | yes | +| [node\_pools](#input\_node\_pools) | Configuration for node pools. See code for details. |
map(object({
# The name of the Karpenter provisioner. The map key is used if this is not set.
name = optional(string)
# Whether to place EC2 instances launched by Karpenter into VPC private subnets. Set it to `false` to use public subnets.
private_subnets_enabled = bool
# The Disruption spec controls how Karpenter scales down the node group.
# See the example (sadly not the specific `spec.disruption` documentation) at https://karpenter.sh/docs/concepts/nodepools/ for details
disruption = optional(object({
# Describes which types of Nodes Karpenter should consider for consolidation.
# If using 'WhenUnderutilized', Karpenter will consider all nodes for consolidation and attempt to remove or
# replace Nodes when it discovers that the Node is underutilized and could be changed to reduce cost.
# If using `WhenEmpty`, Karpenter will only consider nodes for consolidation that contain no workload pods.
consolidation_policy = optional(string, "WhenUnderutilized")

# The amount of time Karpenter should wait after discovering a consolidation decision (`go` duration string, s, m, or h).
# This value can currently (v0.36.0) only be set when the consolidationPolicy is 'WhenEmpty'.
# You can choose to disable consolidation entirely by setting the string value 'Never' here.
# Earlier versions of Karpenter called this field `ttl_seconds_after_empty`.
consolidate_after = optional(string)

# The amount of time a Node can live on the cluster before being removed (`go` duration string, s, m, or h).
# You can choose to disable expiration entirely by setting the string value 'Never' here.
# This module sets a default of 336 hours (14 days), while the Karpenter default is 720 hours (30 days).
# Note that Karpenter calls this field "expiresAfter", and earlier versions called it `ttl_seconds_until_expired`,
# but we call it "max_instance_lifetime" to match the corresponding field in EC2 Auto Scaling Groups.
max_instance_lifetime = optional(string, "336h")

# Budgets control the maximum number of NodeClaims owned by this NodePool that can be terminating at once.
# See https://karpenter.sh/docs/concepts/disruption/#disruption-budgets for details.
# A percentage is the percentage of the total number of active, ready nodes not being deleted, rounded up.
# If there are multiple active budgets, Karpenter uses the most restrictive value.
# If left undefined, this will default to one budget with a value of nodes: 10%.
# Note that budgets do not prevent or limit involuntary terminations.
# Example:
# On Weekdays during business hours, don't do any deprovisioning.
# budgets = [{
# schedule = "0 9 * * mon-fri"
# duration = "8h"
# nodes = "0"
# }]
budgets = optional(list(object({
# The schedule specifies when a budget begins being active, using extended cronjob syntax.
# See https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#schedule-syntax for syntax details.
# Timezones are not supported. This field is required if Duration is set.
schedule = optional(string)
# Duration determines how long a Budget is active after each Scheduled start.
# If omitted, the budget is always active. This is required if Schedule is set.
# Must be a whole number of minutes and hours, as cron does not work in seconds,
# but since Go's `duration.String()` always adds a "0s" at the end, that is allowed.
duration = optional(string)
# The percentage or number of nodes that Karpenter can scale down during the budget.
nodes = string
})), [])
}), {})
# Karpenter provisioner total CPU limit for all pods running on the EC2 instances launched by Karpenter
total_cpu_limit = string
# Karpenter provisioner total memory limit for all pods running on the EC2 instances launched by Karpenter
total_memory_limit = string
# Set a weight for this node pool.
# See https://karpenter.sh/docs/concepts/scheduling/#weighted-nodepools
weight = optional(number, 50)
labels = optional(map(string))
annotations = optional(map(string))
# Karpenter provisioner taints configuration. See https://aws.github.io/aws-eks-best-practices/karpenter/#create-provisioners-that-are-mutually-exclusive for more details
taints = optional(list(object({
key = string
effect = string
value = optional(string)
})))
startup_taints = optional(list(object({
key = string
effect = string
value = optional(string)
})))
# Karpenter node metadata options. See https://karpenter.sh/docs/concepts/nodeclasses/#specmetadataoptions for more details
metadata_options = optional(object({
httpEndpoint = optional(string, "enabled")
httpProtocolIPv6 = optional(string, "disabled")
httpPutResponseHopLimit = optional(number, 2)
# httpTokens can be either "required" or "optional"
httpTokens = optional(string, "required")
}), {})
# The AMI used by Karpenter provisioner when provisioning nodes. Based on the value set for amiFamily, Karpenter will automatically query for the appropriate EKS optimized AMI via AWS Systems Manager (SSM)
ami_family = string
# Karpenter nodes block device mappings. Controls the Elastic Block Storage volumes that Karpenter attaches to provisioned nodes.
# Karpenter uses default block device mappings for the AMI Family specified.
# For example, the Bottlerocket AMI Family defaults with two block device mappings,
# and normally you only want to scale `/dev/xvdb` where Containers and their storage are stored.
# Most other AMIs only have one device mapping at `/dev/xvda`.
# See https://karpenter.sh/docs/concepts/nodeclasses/#specblockdevicemappings for more details
block_device_mappings = list(object({
deviceName = string
ebs = optional(object({
volumeSize = string
volumeType = string
deleteOnTermination = optional(bool, true)
encrypted = optional(bool, true)
iops = optional(number)
kmsKeyID = optional(string, "alias/aws/ebs")
snapshotID = optional(string)
throughput = optional(number)
}))
}))
# Set acceptable (In) and unacceptable (Out) Kubernetes and Karpenter values for node provisioning based on Well-Known Labels and cloud-specific settings. These can include instance types, zones, computer architecture, and capacity type (such as AWS spot or on-demand). See https://karpenter.sh/v0.18.0/provisioner/#specrequirements for more details
requirements = list(object({
key = string
operator = string
# Operators like "Exists" and "DoesNotExist" do not require a value
values = optional(list(string))
}))
# Any values for spec.template.spec.kubelet allowed by Karpenter.
# Not fully specified, because they are subject to change.
# See:
# https://karpenter.sh/docs/concepts/nodepools/#spectemplatespeckubelet
# https://kubernetes.io/docs/reference/config-api/kubelet-config.v1beta1/
kubelet = optional(any, {})
}))
| n/a | yes | | [regex\_replace\_chars](#input\_regex\_replace\_chars) | Terraform regular expression (regex) string.
Characters matching the regex will be removed from the ID elements.
If not set, `"/[^a-zA-Z0-9-]/"` is used to remove all characters other than hyphens, letters and digits. | `string` | `null` | no | | [region](#input\_region) | AWS Region | `string` | n/a | yes | | [stage](#input\_stage) | ID element. Usually used to indicate role, e.g. 'prod', 'staging', 'source', 'build', 'test', 'deploy', 'release' | `string` | `null` | no | diff --git a/modules/eks/karpenter-node-pool/main.tf b/modules/eks/karpenter-node-pool/main.tf index 67c5b57b9..d43d8d2ac 100644 --- a/modules/eks/karpenter-node-pool/main.tf +++ b/modules/eks/karpenter-node-pool/main.tf @@ -8,6 +8,11 @@ locals { public_subnet_ids = module.vpc.outputs.public_subnet_ids node_pools = { for k, v in var.node_pools : k => v if local.enabled } + kubelets_specs_filtered = { for k, v in local.node_pools : k => { + for kk, vv in v.kubelet : kk => vv if vv != null + } + } + kubelet_specs = { for k, v in local.kubelets_specs_filtered : k => v if length(v) > 0 } } # https://karpenter.sh/docs/concepts/nodepools/ @@ -40,8 +45,8 @@ resource "kubernetes_manifest" "node_pool" { ) template = { metadata = { - labels = each.value.labels - annotations = each.value.annotations + labels = coalesce(each.value.labels, {}) + annotations = coalesce(each.value.annotations, {}) } spec = merge({ nodeClassRef = { @@ -64,6 +69,9 @@ resource "kubernetes_manifest" "node_pool" { }, try(length(each.value.startup_taints), 0) == 0 ? {} : { startupTaints = each.value.startup_taints + }, + try(local.kubelet_specs[each.key], null) == null ? 
{} : { + kubelet = local.kubelet_specs[each.key] } ) } diff --git a/modules/eks/karpenter-node-pool/variables.tf b/modules/eks/karpenter-node-pool/variables.tf index ae319b02e..522e79e77 100644 --- a/modules/eks/karpenter-node-pool/variables.tf +++ b/modules/eks/karpenter-node-pool/variables.tf @@ -77,12 +77,12 @@ variable "node_pools" { taints = optional(list(object({ key = string effect = string - value = string + value = optional(string) }))) startup_taints = optional(list(object({ key = string effect = string - value = string + value = optional(string) }))) # Karpenter node metadata options. See https://karpenter.sh/docs/concepts/nodeclasses/#specmetadataoptions for more details metadata_options = optional(object({ @@ -120,6 +120,12 @@ variable "node_pools" { # Operators like "Exists" and "DoesNotExist" do not require a value values = optional(list(string)) })) + # Any values for spec.template.spec.kubelet allowed by Karpenter. + # Not fully specified, because they are subject to change. + # See: + # https://karpenter.sh/docs/concepts/nodepools/#spectemplatespeckubelet + # https://kubernetes.io/docs/reference/config-api/kubelet-config.v1beta1/ + kubelet = optional(any, {}) })) description = "Configuration for node pools. See code for details." nullable = false diff --git a/modules/eks/karpenter/CHANGELOG.md b/modules/eks/karpenter/CHANGELOG.md index 6304d8034..6d7a8f2d2 100644 --- a/modules/eks/karpenter/CHANGELOG.md +++ b/modules/eks/karpenter/CHANGELOG.md @@ -1,6 +1,36 @@ +## Components [PR #1076](https://github.com/cloudposse/terraform-aws-components/pull/1076) + +#### Bugfix + +- Fixed issues with IAM Policy support for cleaning up `v1alpha` resources. + +With the previous release of this component, we encouraged users to delete their `v1alpha` Karpenter resources before +upgrading to `v1beta`. However, certain things, such as EC2 Instance Profiles, would not be deleted by Terraform because +they were created or modified by the Karpenter controller. 
+ +To enable the `v1beta` Karpenter controller to clean up these resources, we added a second IAM Policy to the official +Karpenter IAM Policy document. This second policy allows the Karpenter controller to delete the `v1alpha` resources. +However, there were 2 problems with that. + +First, the policy was subtly incorrect, and did not, in fact, allow the Karpenter controller to delete all the +resources. This has been fixed. + +Second, a long EKS cluster name could cause the Karpenter IRSA's policy to exceed the maximum character limit for an IAM +Policy. This has also been fixed by making the `v1alpha` policy a separate managed policy attached to the Karpenter +controller's role, rather than merging the statements into the `v1beta` policy. This change also avoids potential +conflicts with policy SIDs. + +:::note Innocuous Changes + +Terraform will show IAM Policy changes, including deletion of statements from the existing policy and creation of a new +policy. This is expected and innocuous. The IAM Policy has been split into 2 to avoid exceeding length limits, but the +current (`v1beta`) policy remains the same and the now separate (`v1alpha`) policy has been corrected. 
+ +::: + ## Version 1.445.0 -Components PR #1039 +Components [PR #1039](https://github.com/cloudposse/terraform-aws-components/pull/1039) :::warning Major Breaking Changes diff --git a/modules/eks/karpenter/README.md b/modules/eks/karpenter/README.md index e0afff396..5732ce94c 100644 --- a/modules/eks/karpenter/README.md +++ b/modules/eks/karpenter/README.md @@ -362,6 +362,8 @@ For more details on the CRDs, see: |------|------| | [aws_cloudwatch_event_rule.interruption_handler](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_target.interruption_handler](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_iam_policy.v1alpha](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | +| [aws_iam_role_policy_attachment.v1alpha](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_sqs_queue.interruption_handler](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue) | resource | | [aws_sqs_queue_policy.interruption_handler](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/sqs_queue_policy) | resource | | [aws_eks_cluster_auth.eks](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source | diff --git a/modules/eks/karpenter/controller-policy-v1alpha.tf b/modules/eks/karpenter/controller-policy-v1alpha.tf index 0c4d010d8..d2c5f6b29 100644 --- a/modules/eks/karpenter/controller-policy-v1alpha.tf +++ b/modules/eks/karpenter/controller-policy-v1alpha.tf @@ -13,8 +13,10 @@ # v1alpha API tag "karpenter.sh/provisioner-name" and to manage the EC2 Instance Profile # created by the EKS cluster component. 
# -# WARNING: it is important that the SID values do not conflict with the SID values in the -# controller-policy.tf file, otherwise they will be overwritten. +# We create a separate policy and attach it separately to the Karpenter controller role +# because the main policy is near the 6,144 character limit for an IAM policy, and +# adding this to it can push it over. See: +# https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_iam-quotas.html#reference_iam-quotas-entities # locals { @@ -35,10 +37,10 @@ locals { ], "Condition": { "StringEquals": { - "aws:ResourceTag/kubernetes.io/cluster/${local.eks_cluster_id}": "owned" + "ec2:ResourceTag/karpenter.k8s.aws/cluster": "${local.eks_cluster_id}" }, "StringLike": { - "aws:ResourceTag/karpenter.sh/provisioner-name": "*" + "ec2:ResourceTag/karpenter.sh/provisioner-name": "*" } } }, @@ -65,3 +67,23 @@ locals { } EndOfPolicy } + +# We create a separate policy and attach it separately to the Karpenter controller role +# because the main policy is near the 6,144 character limit for an IAM policy, and +# adding this to it can push it over. See: +# https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_iam-quotas.html#reference_iam-quotas-entities +resource "aws_iam_policy" "v1alpha" { + count = local.enabled ? 1 : 0 + + name = "${module.this.id}-v1alpha" + description = "Legacy Karpenter controller policy for v1alpha workloads" + policy = local.controller_policy_v1alpha_json + tags = module.this.tags +} + +resource "aws_iam_role_policy_attachment" "v1alpha" { + count = local.enabled ? 
1 : 0 + + role = module.karpenter.service_account_role_name + policy_arn = one(aws_iam_policy.v1alpha[*].arn) +} diff --git a/modules/eks/karpenter/main.tf b/modules/eks/karpenter/main.tf index a038b645b..3d930b117 100644 --- a/modules/eks/karpenter/main.tf +++ b/modules/eks/karpenter/main.tf @@ -77,7 +77,7 @@ module "karpenter" { service_account_role_arn_annotation_enabled = true iam_role_enabled = true - iam_source_policy_documents = [local.controller_policy_v1alpha_json, local.controller_policy_json] + iam_source_policy_documents = [local.controller_policy_json] values = compact([ yamlencode({