From 902271c5fea3c454cefb2f3cad1c581abe6a7307 Mon Sep 17 00:00:00 2001 From: Nuru Date: Sat, 1 Jun 2024 14:21:33 -0700 Subject: [PATCH] [eks/actions-runner-controller] Add ability to dynamically annotate pods once they start a job (#1055) --- .../eks/actions-runner-controller/README.md | 64 ++++--- .../charts/actions-runner/Chart.yaml | 2 +- .../templates/runnerdeployment.yaml | 178 ++++++++++++------ .../charts/actions-runner/values.yaml | 31 ++- modules/eks/actions-runner-controller/main.tf | 17 +- .../resources/values.yaml | 11 +- .../actions-runner-controller/variables.tf | 106 +++++------ 7 files changed, 238 insertions(+), 171 deletions(-) diff --git a/modules/eks/actions-runner-controller/README.md b/modules/eks/actions-runner-controller/README.md index 7ea596356..b9adc5f1a 100644 --- a/modules/eks/actions-runner-controller/README.md +++ b/modules/eks/actions-runner-controller/README.md @@ -26,7 +26,7 @@ components: name: "actions-runner" # avoids hitting name length limit on IAM role chart: "actions-runner-controller" chart_repository: "https://actions-runner-controller.github.io/actions-runner-controller" - chart_version: "0.22.0" + chart_version: "0.23.7" kubernetes_namespace: "actions-runner-system" create_namespace: true kubeconfig_exec_auth_api_version: "client.authentication.k8s.io/v1beta1" @@ -79,12 +79,11 @@ components: image: summerwind/actions-runner-dind # `scope` is org name for Organization runners, repo name for Repository runners scope: "org/infra" - # We can trade the fast-start behavior of min_replicas > 0 for the better guarantee - # that Karpenter will not terminate the runner while it is running a job. - # # Tell Karpenter not to evict this pod. This is only safe when min_replicas is 0. - # # If we do not set this, Karpenter will feel free to terminate the runner while it is running a job. - # pod_annotations: - # karpenter.sh/do-not-evict: "true" + # Tell Karpenter not to evict this pod while it is running a job. 
+ # If we do not set this, Karpenter will feel free to terminate the runner while it is running a job, + # as part of its consolidation efforts, even when using "on demand" instances. + running_pod_annotations: + karpenter.sh/do-not-disrupt: "true" min_replicas: 1 max_replicas: 20 scale_down_delay_seconds: 100 @@ -96,7 +95,14 @@ components: cpu: 100m memory: 128Mi webhook_driven_scaling_enabled: true - webhook_startup_timeout: "30m" + # The name `webhook_startup_timeout` is misleading. + # It is actually the duration after which a job will be considered completed, + # (and the runner killed) even if the webhook has not received a "job completed" event. + # This is to ensure that if an event is missed, it does not leave the runner running forever. + # Set it long enough to cover the longest job you expect to run and then some. + # See https://github.com/actions/actions-runner-controller/blob/9afd93065fa8b1f87296f0dcdf0c2753a0548cb7/docs/automatically-scaling-runners.md?plain=1#L264-L268 + webhook_startup_timeout: "90m" + # Pull-driven scaling is obsolete and should not be used. pull_driven_scaling_enabled: false # Labels are not case-sensitive to GitHub, but *are* case-sensitive # to the webhook based autoscaler, which requires exact matches @@ -134,11 +140,12 @@ components: # # `scope` is org name for Organization runners, repo name for Repository runners # scope: "org/infra" # group: "ArmRunners" - # # Tell Karpenter not to evict this pod. This is only safe when min_replicas is 0. - # # If we do not set this, Karpenter will feel free to terminate the runner while it is running a job. - # pod_annotations: - # karpenter.sh/do-not-evict: "true" - # min_replicas: 0 + # # Tell Karpenter not to evict this pod while it is running a job. + # # If we do not set this, Karpenter will feel free to terminate the runner while it is running a job, + # # as part of its consolidation efforts, even when using "on demand" instances. 
+ # running_pod_annotations: + # karpenter.sh/do-not-disrupt: "true" + # min_replicas: 0 # Set to so that no ARM instance is running idle, set to 1 for faster startups # max_replicas: 20 # scale_down_delay_seconds: 100 # resources: @@ -149,7 +156,7 @@ components: # cpu: 100m # memory: 128Mi # webhook_driven_scaling_enabled: true - # webhook_startup_timeout: "30m" + # webhook_startup_timeout: "90m" # pull_driven_scaling_enabled: false # # Labels are not case-sensitive to GitHub, but *are* case-sensitive # # to the webhook based autoscaler, which requires exact matches @@ -315,8 +322,10 @@ can assign one or more Runner pools (from the `runners` map) to groups (only one ### Using Webhook Driven Autoscaling (recommended) -We recommend using Webhook Driven Autoscaling until GitHub releases their own autoscaling solution (said to be "in the -works" as of April 2023). +We recommend using Webhook Driven Autoscaling until GitHub's own autoscaling solution is as capable as the Summerwind +solution this component deploys. See +[this discussion](https://github.com/actions/actions-runner-controller/discussions/3340) for some perspective on why the +Summerwind solution is currently (summer 2024) considered superior. To use the Webhook Driven Autoscaling, in addition to setting `webhook_driven_scaling_enabled` to `true`, you must also install the GitHub organization-level webhook after deploying the component (specifically, the webhook server). The URL @@ -424,7 +433,7 @@ spec: template: metadata: annotations: - karpenter.sh/do-not-evict: "true" + karpenter.sh/do-not-disrupt: "true" ``` When you set this annotation on the Pod, Karpenter will not evict it. This means that the Pod will stay on the Node it @@ -437,14 +446,14 @@ Since the Runner Pods terminate at the end of the job, this is not a problem for However, if you have set `minReplicas > 0`, then you have some Pods that are just idling, waiting for jobs to be assigned to them. 
These Pods are exactly the kind of Pods you want terminated and moved when the cluster is underutilized. Therefore, when you set `minReplicas > 0`, you should **NOT** set `karpenter.sh/do-not-evict: "true"` on -the Pod. +the Pod via the `pod_annotations` attribute of the `runners` input. (**But wait**, _there is good news_!) We have [requested a feature](https://github.com/actions/actions-runner-controller/issues/2562) that will allow you to -set `karpenter.sh/do-not-evict: "true"` and `minReplicas > 0` at the same time by only annotating Pods running jobs. -Meanwhile, another option is to set `minReplicas = 0` on a schedule using an ARC Autoscaler -[scheduled override](https://github.com/actions/actions-runner-controller/blob/master/docs/automatically-scaling-runners.md#scheduled-overrides). -At present, this component does not support that option, but it could be added in the future if our preferred solution -is not implemented. +set `karpenter.sh/do-not-disrupt: "true"` and `minReplicas > 0` at the same time by only annotating Pods running jobs. +Meanwhile, **we have implemented this for you** using a job startup hook. This hook will set annotations on the Pod when +the job starts. When the job finishes, the Pod will be deleted by the controller, so the annotations will not need to be +removed. Configure annotations that apply only to Pods running jobs in the `running_pod_annotations` attribute of the +`runners` input. ### Updating CRDs @@ -485,8 +494,8 @@ documentation for further details. 
| Name | Source | Version | |------|--------|---------| -| [actions\_runner](#module\_actions\_runner) | cloudposse/helm-release/aws | 0.10.0 | -| [actions\_runner\_controller](#module\_actions\_runner\_controller) | cloudposse/helm-release/aws | 0.10.0 | +| [actions\_runner](#module\_actions\_runner) | cloudposse/helm-release/aws | 0.10.1 | +| [actions\_runner\_controller](#module\_actions\_runner\_controller) | cloudposse/helm-release/aws | 0.10.1 | | [eks](#module\_eks) | cloudposse/stack-config/yaml//modules/remote-state | 1.5.0 | | [iam\_roles](#module\_iam\_roles) | ../../account-map/modules/iam-roles | n/a | | [this](#module\_this) | cloudposse/label/null | 0.25.0 | @@ -515,6 +524,7 @@ documentation for further details. | [cleanup\_on\_fail](#input\_cleanup\_on\_fail) | Allow deletion of new resources created in this upgrade when upgrade fails. | `bool` | `true` | no | | [context](#input\_context) | Single object for setting entire context at once.
See description of individual variables for details.
Leave string and numeric variables as `null` to use default value.
Individual variable settings (non-null) override settings in context object,
except for attributes, tags, and additional\_tag\_map, which are merged. | `any` |
{
"additional_tag_map": {},
"attributes": [],
"delimiter": null,
"descriptor_formats": {},
"enabled": true,
"environment": null,
"id_length_limit": null,
"label_key_case": null,
"label_order": [],
"label_value_case": null,
"labels_as_tags": [
"unset"
],
"name": null,
"namespace": null,
"regex_replace_chars": null,
"stage": null,
"tags": {},
"tenant": null
}
| no | | [context\_tags\_enabled](#input\_context\_tags\_enabled) | Whether or not to include all context tags as labels for each runner | `bool` | `false` | no | +| [controller\_replica\_count](#input\_controller\_replica\_count) | The number of replicas of the runner-controller to run. | `number` | `2` | no | | [create\_namespace](#input\_create\_namespace) | Create the namespace if it does not yet exist. Defaults to `false`. | `bool` | `null` | no | | [delimiter](#input\_delimiter) | Delimiter to be used between ID elements.
Defaults to `-` (hyphen). Set to `""` to use no delimiter at all. | `string` | `null` | no | | [descriptor\_formats](#input\_descriptor\_formats) | Describe additional descriptors to be output in the `descriptors` output map.
Map of maps. Keys are names of descriptors. Values are maps of the form
`{
format = string
labels = list(string)
}`
(Type is `any` so the map values can later be enhanced to provide additional options.)
`format` is a Terraform format string to be passed to the `format()` function.
`labels` is a list of labels, in order, to pass to `format()` function.
Label values will be normalized before being passed to `format()` so they will be
identical to how they appear in `id`.
Default is `{}` (`descriptors` output will be empty). | `any` | `{}` | no | @@ -549,7 +559,7 @@ documentation for further details. | [regex\_replace\_chars](#input\_regex\_replace\_chars) | Terraform regular expression (regex) string.
Characters matching the regex will be removed from the ID elements.
If not set, `"/[^a-zA-Z0-9-]/"` is used to remove all characters other than hyphens, letters and digits. | `string` | `null` | no | | [region](#input\_region) | AWS Region. | `string` | n/a | yes | | [resources](#input\_resources) | The cpu and memory of the deployment's limits and requests. |
object({
limits = object({
cpu = string
memory = string
})
requests = object({
cpu = string
memory = string
})
})
| n/a | yes | -| [runners](#input\_runners) | Map of Action Runner configurations, with the key being the name of the runner. Please note that the name must be in
kebab-case.

For example:
hcl
organization_runner = {
type = "organization" # can be either 'organization' or 'repository'
dind_enabled: false # A Docker sidecar container will be deployed
image: summerwind/actions-runner # If dind_enabled=true, set this to 'summerwind/actions-runner-dind'
scope = "ACME" # org name for Organization runners, repo name for Repository runners
group = "core-automation" # Optional. Assigns the runners to a runner group, for access control.
scale_down_delay_seconds = 300
min_replicas = 1
max_replicas = 5
busy_metrics = {
scale_up_threshold = 0.75
scale_down_threshold = 0.25
scale_up_factor = 2
scale_down_factor = 0.5
}
labels = [
"Ubuntu",
"core-automation",
]
}
|
map(object({
type = string
scope = string
group = optional(string, null)
image = optional(string, "")
dind_enabled = bool
node_selector = optional(map(string), {})
pod_annotations = optional(map(string), {})
tolerations = optional(list(object({
key = string
operator = string
value = optional(string, null)
effect = string
})), [])
scale_down_delay_seconds = number
min_replicas = number
max_replicas = number
busy_metrics = optional(object({
scale_up_threshold = string
scale_down_threshold = string
scale_up_adjustment = optional(string)
scale_down_adjustment = optional(string)
scale_up_factor = optional(string)
scale_down_factor = optional(string)
}))
webhook_driven_scaling_enabled = bool
webhook_startup_timeout = optional(string, null)
pull_driven_scaling_enabled = bool
labels = list(string)
storage = optional(string, null)
pvc_enabled = optional(bool, false)
resources = object({
limits = object({
cpu = string
memory = string
ephemeral_storage = optional(string, null)
})
requests = object({
cpu = string
memory = string
})
})
}))
| n/a | yes | +| [runners](#input\_runners) | Map of Action Runner configurations, with the key being the name of the runner. Please note that the name must be in
kebab-case.

For example:
hcl
organization_runner = {
type = "organization" # can be either 'organization' or 'repository'
dind_enabled = true # A Docker daemon will be started in the runner Pod
image = "summerwind/actions-runner-dind" # If dind_enabled=false, set this to "summerwind/actions-runner"
scope = "ACME" # org name for Organization runners, repo name for Repository runners
group = "core-automation" # Optional. Assigns the runners to a runner group, for access control.
scale_down_delay_seconds = 300
min_replicas = 1
max_replicas = 5
labels = [
"Ubuntu",
"core-automation",
]
}
|
map(object({
type = string
scope = string
group = optional(string, null)
image = optional(string, "summerwind/actions-runner-dind")
dind_enabled = optional(bool, true)
node_selector = optional(map(string), {})
pod_annotations = optional(map(string), {})

# running_pod_annotations are only applied to the pods once they start running a job
running_pod_annotations = optional(map(string), {})

# affinity is too complex to model. Whatever you assign to affinity will be copied
# to the runner Pod spec.
affinity = optional(any)

tolerations = optional(list(object({
key = string
operator = string
value = optional(string, null)
effect = string
})), [])
scale_down_delay_seconds = optional(number, 300)
min_replicas = number
max_replicas = number
busy_metrics = optional(object({
scale_up_threshold = string
scale_down_threshold = string
scale_up_adjustment = optional(string)
scale_down_adjustment = optional(string)
scale_up_factor = optional(string)
scale_down_factor = optional(string)
}))
webhook_driven_scaling_enabled = optional(bool, true)
# The name `webhook_startup_timeout` is misleading.
# It is actually the duration after which a job will be considered completed
# (and the runner killed), even if the webhook has not received a "job completed" event.
# This is to ensure that if an event is missed, it does not leave the runner running forever.
# Set it long enough to cover the longest job you expect to run and then some.
# See https://github.com/actions/actions-runner-controller/blob/9afd93065fa8b1f87296f0dcdf0c2753a0548cb7/docs/automatically-scaling-runners.md?plain=1#L264-L268
webhook_startup_timeout = optional(string, "1h")
pull_driven_scaling_enabled = optional(bool, false)
labels = optional(list(string), [])
docker_storage = optional(string, null)
# storage is deprecated in favor of docker_storage, since it is only storage for the Docker daemon
storage = optional(string, null)
pvc_enabled = optional(bool, false)
resources = optional(object({
limits = optional(object({
cpu = optional(string, "1")
memory = optional(string, "1Gi")
ephemeral_storage = optional(string, "10Gi")
}), {})
requests = optional(object({
cpu = optional(string, "500m")
memory = optional(string, "256Mi")
ephemeral_storage = optional(string, "1Gi")
}), {})
}), {})
}))
| n/a | yes | | [s3\_bucket\_arns](#input\_s3\_bucket\_arns) | List of ARNs of S3 Buckets to which the runners will have read-write access to. | `list(string)` | `[]` | no | | [ssm\_docker\_config\_json\_path](#input\_ssm\_docker\_config\_json\_path) | SSM path to the Docker config JSON | `string` | `null` | no | | [ssm\_github\_secret\_path](#input\_ssm\_github\_secret\_path) | The path in SSM to the GitHub app private key file contents or GitHub PAT token. | `string` | `""` | no | @@ -559,7 +569,7 @@ documentation for further details. | [tenant](#input\_tenant) | ID element \_(Rarely used, not included by default)\_. A customer identifier, indicating who this instance of a resource is for | `string` | `null` | no | | [timeout](#input\_timeout) | Time in seconds to wait for any individual kubernetes operation (like Jobs for hooks). Defaults to `300` seconds | `number` | `null` | no | | [wait](#input\_wait) | Will wait until all resources are in a ready state before marking the release as successful. It will wait for as long as `timeout`. Defaults to `true`. | `bool` | `null` | no | -| [webhook](#input\_webhook) | Configuration for the GitHub Webhook Server.
`hostname_template` is the `format()` string to use to generate the hostname via `format(var.hostname_template, var.tenant, var.stage, var.environment)`"
Typically something like `"echo.%[3]v.%[2]v.example.com"`.
`queue_limit` is the maximum number of webhook events that can be queued up processing by the autoscaler.
When the queue gets full, webhook events will be dropped (status 500). |
object({
enabled = bool
hostname_template = string
queue_limit = optional(number, 100)
})
|
{
"enabled": false,
"hostname_template": null,
"queue_limit": 100
}
| no | +| [webhook](#input\_webhook) | Configuration for the GitHub Webhook Server.
`hostname_template` is the `format()` string to use to generate the hostname via `format(var.hostname_template, var.tenant, var.stage, var.environment)`
Typically something like `"echo.%[3]v.%[2]v.example.com"`.
`queue_limit` is the maximum number of webhook events that can be queued up for processing by the autoscaler.
When the queue gets full, webhook events will be dropped (status 500). |
object({
enabled = bool
hostname_template = string
queue_limit = optional(number, 1000)
})
|
{
"enabled": false,
"hostname_template": null,
"queue_limit": 1000
}
| no | ## Outputs diff --git a/modules/eks/actions-runner-controller/charts/actions-runner/Chart.yaml b/modules/eks/actions-runner-controller/charts/actions-runner/Chart.yaml index b5c10525b..1ec5333d2 100644 --- a/modules/eks/actions-runner-controller/charts/actions-runner/Chart.yaml +++ b/modules/eks/actions-runner-controller/charts/actions-runner/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.2 +version: 0.2.0 # This chart only deploys Resources for actions-runner-controller, so app version does not really apply. # We use Resource API version instead. diff --git a/modules/eks/actions-runner-controller/charts/actions-runner/templates/runnerdeployment.yaml b/modules/eks/actions-runner-controller/charts/actions-runner/templates/runnerdeployment.yaml index a44658dec..1321f22c8 100644 --- a/modules/eks/actions-runner-controller/charts/actions-runner/templates/runnerdeployment.yaml +++ b/modules/eks/actions-runner-controller/charts/actions-runner/templates/runnerdeployment.yaml @@ -1,34 +1,3 @@ -{{- if .Values.pvc_enabled }} ---- -# Persistent Volumes can be used for image caching -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: {{ .Values.release_name }} -spec: - accessModes: - - ReadWriteMany - # StorageClassName comes from efs-controller and must be deployed first. - storageClassName: efs-sc - resources: - requests: - # EFS is not actually storage constrained, but this storage request is - # required. 100Gi is a ballpark for how much we initially request, but this - # may grow. We are responsible for docker pruning this periodically to - # save space. 
- storage: 100Gi -{{- end }} -{{- if .Values.docker_config_json_enabled }} ---- -apiVersion: v1 -kind: Secret -metadata: - name: {{ .Values.release_name }}-regcred -type: kubernetes.io/dockerconfigjson -data: - .dockerconfigjson: {{ .Values.docker_config_json }} -{{- end }} ---- apiVersion: actions.summerwind.dev/v1alpha1 kind: RunnerDeployment metadata: @@ -38,13 +7,13 @@ spec: # See https://github.com/actions-runner-controller/actions-runner-controller/issues/206#issuecomment-748601907 # replicas: 1 template: - {{- with index .Values "pod_annotations" }} + {{- with .Values.pod_annotations }} metadata: annotations: {{- toYaml . | nindent 8 }} {{- end }} spec: - {{- if .Values.docker_config_json_enabled }} + {{- if .Values.docker_config_json_enabled }} # secrets volumeMount are always mounted readOnly so config.json has to be copied to the correct directory # https://github.com/kubernetes/kubernetes/issues/62099 # https://github.com/actions/actions-runner-controller/issues/2123#issuecomment-1527077517 @@ -82,14 +51,41 @@ spec: # - effect: NoSchedule # key: node-role.kubernetes.io/actions-runner # operator: Exists + {{- with .Values.node_selector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . 
| nindent 8 }} + {{- end }} + + {{- with .Values.running_pod_annotations }} + # Run a pre-run hook to set pod annotations + # See https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job#triggering-the-scripts + containers: + - name: runner + # ARC (Summerwind) has its own pre-run hook, so we do not want to set + # env: + # - name: ACTIONS_RUNNER_HOOK_JOB_STARTED + # value: /hooks/pre-run.sh # triggers when a job is started, and sets the pod to NOT safe-to-evict + # Instead, its pre-run hook runs scripts in /etc/arc/hooks/job-started.d/ + volumeMounts: + - name: hooks + mountPath: /etc/arc/hooks/job-started.d/ + {{- end }} - {{ if eq .Values.type "organization" }} + {{- if eq .Values.type "organization" }} organization: {{ .Values.scope }} {{- end }} - {{ if eq .Values.type "repository" }} + {{- if eq .Values.type "repository" }} repository: {{ .Values.scope }} {{- end }} - {{ if index .Values "group" }} + {{- if index .Values "group" }} group: {{ .Values.group }} {{- end }} # You can use labels to create subsets of runners. @@ -103,14 +99,6 @@ spec: {{- range .Values.labels }} - {{ . | quote }} {{- end }} - {{- if gt ( len (index .Values "node_selector") ) 0 }} - nodeSelector: - {{- toYaml .Values.node_selector | nindent 8 }} - {{- end }} - {{- if gt ( len (index .Values "tolerations") ) 0 }} - tolerations: - {{- toYaml .Values.tolerations | nindent 8 }} - {{- end }} # dockerdWithinRunnerContainer = false means access to a Docker daemon is provided by a sidecar container. 
dockerdWithinRunnerContainer: {{ .Values.dind_enabled }} image: {{ .Values.image | quote }} @@ -133,7 +121,7 @@ spec: {{- if index .Values.resources.requests "ephemeral_storage" }} ephemeral-storage: {{ .Values.resources.requests.ephemeral_storage }} {{- end }} - {{- if and .Values.dind_enabled .Values.storage }} + {{- if and .Values.dind_enabled .Values.docker_storage }} dockerVolumeMounts: - mountPath: /var/lib/docker name: docker-volume @@ -150,10 +138,10 @@ spec: - mountPath: /home/runner/.docker name: docker-config-volume {{- end }} - {{- end }} - {{- if or (and .Values.dind_enabled .Values.storage) (.Values.pvc_enabled) (.Values.docker_config_json_enabled) }} + {{- end }}{{/* End of volumeMounts */}} + {{- if or (and .Values.dind_enabled .Values.docker_storage) (.Values.pvc_enabled) (.Values.docker_config_json_enabled) (not (empty .Values.running_pod_annotations)) }} volumes: - {{- if and .Values.dind_enabled .Values.storage }} + {{- if and .Values.dind_enabled .Values.docker_storage }} - name: docker-volume ephemeral: volumeClaimTemplate: @@ -161,13 +149,13 @@ spec: accessModes: [ "ReadWriteOnce" ] # Only 1 pod can connect at a time resources: requests: - storage: {{ .Values.storage }} - {{- end }} - {{- if .Values.pvc_enabled }} + storage: {{ .Values.docker_storage }} + {{- end }} + {{- if .Values.pvc_enabled }} - name: shared-volume persistentVolumeClaim: claimName: {{ .Values.release_name }} - {{- end }} + {{- end }} {{- if .Values.docker_config_json_enabled }} - name: docker-secret secret: @@ -178,4 +166,88 @@ spec: - name: docker-config-volume emptyDir: {{- end }} - {{- end }} + {{- with .Values.running_pod_annotations }} + - name: hooks + configMap: + name: runner-hooks + defaultMode: 0755 # Set execute permissions for all files + {{- end }} + {{- end }}{{/* End of volumes */}} +{{- if .Values.pvc_enabled }} +--- +# Persistent Volumes can be used for image caching +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Values.release_name 
}} +spec: + accessModes: + - ReadWriteMany + # StorageClassName comes from efs-controller and must be deployed first. + storageClassName: efs-sc + resources: + requests: + # EFS is not actually storage constrained, but this storage request is + # required. 100Gi is a ballpark for how much we initially request, but this + # may grow. We are responsible for docker pruning this periodically to + # save space. + storage: 100Gi +{{- end }} +{{- if .Values.docker_config_json_enabled }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Values.release_name }}-regcred +type: kubernetes.io/dockerconfigjson +data: + .dockerconfigjson: {{ .Values.docker_config_json }} +{{- end }} +{{- with .Values.running_pod_annotations }} +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: runner-hooks +data: + annotate.sh: | + #!/bin/bash + + # If we had kubectl and a KUBECONFIG, we could do this: + # kubectl annotate pod $HOSTNAME 'karpenter.sh/do-not-evict="true"' --overwrite + # kubectl annotate pod $HOSTNAME 'karpenter.sh/do-not-disrupt="true"' --overwrite + + # This is the same thing, the hard way + + # Metadata about the pod + NAMESPACE=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace) + POD_NAME=$(hostname) + + # Kubernetes API URL + API_URL="https://kubernetes.default.svc" + + # Read the service account token + TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token) + + # Content type + CONTENT_TYPE="application/merge-patch+json" + + PATCH_JSON=$(cat <