From 902271c5fea3c454cefb2f3cad1c581abe6a7307 Mon Sep 17 00:00:00 2001
From: Nuru <Nuru@users.noreply.github.com>
Date: Sat, 1 Jun 2024 14:21:33 -0700
Subject: [PATCH] [eks/actions-runner-controller] Add ability to dynamically
 annotate pods once they start a job (#1055)

---
 .../eks/actions-runner-controller/README.md   |  64 ++++---
 .../charts/actions-runner/Chart.yaml          |   2 +-
 .../templates/runnerdeployment.yaml           | 178 ++++++++++++------
 .../charts/actions-runner/values.yaml         |  31 ++-
 modules/eks/actions-runner-controller/main.tf |  17 +-
 .../resources/values.yaml                     |  11 +-
 .../actions-runner-controller/variables.tf    | 106 +++++------
 7 files changed, 238 insertions(+), 171 deletions(-)

diff --git a/modules/eks/actions-runner-controller/README.md b/modules/eks/actions-runner-controller/README.md
index 7ea596356..b9adc5f1a 100644
--- a/modules/eks/actions-runner-controller/README.md
+++ b/modules/eks/actions-runner-controller/README.md
@@ -26,7 +26,7 @@ components:
         name: "actions-runner" # avoids hitting name length limit on IAM role
         chart: "actions-runner-controller"
         chart_repository: "https://actions-runner-controller.github.io/actions-runner-controller"
-        chart_version: "0.22.0"
+        chart_version: "0.23.7"
         kubernetes_namespace: "actions-runner-system"
         create_namespace: true
         kubeconfig_exec_auth_api_version: "client.authentication.k8s.io/v1beta1"
@@ -79,12 +79,11 @@ components:
             image: summerwind/actions-runner-dind
             # `scope` is org name for Organization runners, repo name for Repository runners
             scope: "org/infra"
-            # We can trade the fast-start behavior of min_replicas > 0 for the better guarantee
-            # that Karpenter will not terminate the runner while it is running a job.
-            #  # Tell Karpenter not to evict this pod. This is only safe when min_replicas is 0.
-            #  # If we do not set this, Karpenter will feel free to terminate the runner while it is running a job.
-            #  pod_annotations:
-            #    karpenter.sh/do-not-evict: "true"
+            # Tell Karpenter not to evict this pod while it is running a job.
+            # If we do not set this, Karpenter will feel free to terminate the runner while it is running a job,
+            # as part of its consolidation efforts, even when using "on demand" instances.
+            running_pod_annotations:
+              karpenter.sh/do-not-disrupt: "true"
             min_replicas: 1
             max_replicas: 20
             scale_down_delay_seconds: 100
@@ -96,7 +95,14 @@ components:
                 cpu: 100m
                 memory: 128Mi
             webhook_driven_scaling_enabled: true
-            webhook_startup_timeout: "30m"
+            # The name `webhook_startup_timeout` is misleading.
+            # It is actually the duration after which a job will be considered completed
+            # (and the runner killed), even if the webhook has not received a "job completed" event.
+            # This is to ensure that if an event is missed, it does not leave the runner running forever.
+            # Set it long enough to cover the longest job you expect to run and then some.
+            # See https://github.com/actions/actions-runner-controller/blob/9afd93065fa8b1f87296f0dcdf0c2753a0548cb7/docs/automatically-scaling-runners.md?plain=1#L264-L268
+            webhook_startup_timeout: "90m"
+            # Pull-driven scaling is obsolete and should not be used.
             pull_driven_scaling_enabled: false
             # Labels are not case-sensitive to GitHub, but *are* case-sensitive
             # to the webhook based autoscaler, which requires exact matches
@@ -134,11 +140,12 @@ components:
           #  # `scope` is org name for Organization runners, repo name for Repository runners
           #  scope: "org/infra"
           #  group: "ArmRunners"
-          #  # Tell Karpenter not to evict this pod. This is only safe when min_replicas is 0.
-          #  # If we do not set this, Karpenter will feel free to terminate the runner while it is running a job.
-          #  pod_annotations:
-          #    karpenter.sh/do-not-evict: "true"
-          #  min_replicas: 0
+          #  # Tell Karpenter not to evict this pod while it is running a job.
+          #  # If we do not set this, Karpenter will feel free to terminate the runner while it is running a job,
+          #  # as part of its consolidation efforts, even when using "on demand" instances.
+          #  running_pod_annotations:
+          #    karpenter.sh/do-not-disrupt: "true"
+          #  min_replicas: 0 # Set to 0 so that no ARM instance is running idle; set to 1 for faster startups
           #  max_replicas: 20
           #  scale_down_delay_seconds: 100
           #  resources:
@@ -149,7 +156,7 @@ components:
           #      cpu: 100m
           #      memory: 128Mi
           #  webhook_driven_scaling_enabled: true
-          #  webhook_startup_timeout: "30m"
+          #  webhook_startup_timeout: "90m"
           #  pull_driven_scaling_enabled: false
           #  # Labels are not case-sensitive to GitHub, but *are* case-sensitive
           #  # to the webhook based autoscaler, which requires exact matches
@@ -315,8 +322,10 @@ can assign one or more Runner pools (from the `runners` map) to groups (only one
 
 ### Using Webhook Driven Autoscaling (recommended)
 
-We recommend using Webhook Driven Autoscaling until GitHub releases their own autoscaling solution (said to be "in the
-works" as of April 2023).
+We recommend using Webhook Driven Autoscaling until GitHub's own autoscaling solution is as capable as the Summerwind
+solution this component deploys. See
+[this discussion](https://github.com/actions/actions-runner-controller/discussions/3340) for some perspective on why the
+Summerwind solution is currently (summer 2024) considered superior.
 
 To use the Webhook Driven Autoscaling, in addition to setting `webhook_driven_scaling_enabled` to `true`, you must also
 install the GitHub organization-level webhook after deploying the component (specifically, the webhook server). The URL
@@ -424,7 +433,7 @@ spec:
   template:
     metadata:
       annotations:
-        karpenter.sh/do-not-evict: "true"
+        karpenter.sh/do-not-disrupt: "true"
 ```
 
 When you set this annotation on the Pod, Karpenter will not evict it. This means that the Pod will stay on the Node it
@@ -437,14 +446,14 @@ Since the Runner Pods terminate at the end of the job, this is not a problem for
 However, if you have set `minReplicas > 0`, then you have some Pods that are just idling, waiting for jobs to be
 assigned to them. These Pods are exactly the kind of Pods you want terminated and moved when the cluster is
 underutilized. Therefore, when you set `minReplicas > 0`, you should **NOT** set `karpenter.sh/do-not-evict: "true"` on
-the Pod.
+the Pod via the `pod_annotations` attribute of the `runners` input. (**But wait**, _there is good news_!)
 
 We have [requested a feature](https://github.com/actions/actions-runner-controller/issues/2562) that will allow you to
-set `karpenter.sh/do-not-evict: "true"` and `minReplicas > 0` at the same time by only annotating Pods running jobs.
-Meanwhile, another option is to set `minReplicas = 0` on a schedule using an ARC Autoscaler
-[scheduled override](https://github.com/actions/actions-runner-controller/blob/master/docs/automatically-scaling-runners.md#scheduled-overrides).
-At present, this component does not support that option, but it could be added in the future if our preferred solution
-is not implemented.
+set `karpenter.sh/do-not-disrupt: "true"` and `minReplicas > 0` at the same time by only annotating Pods running jobs.
+Meanwhile, **we have implemented this for you** using a job startup hook. This hook will set annotations on the Pod when
+the job starts. When the job finishes, the Pod will be deleted by the controller, so the annotations will not need to be
+removed. Configure annotations that apply only to Pods running jobs in the `running_pod_annotations` attribute of the
+`runners` input.
 
 ### Updating CRDs
 
@@ -485,8 +494,8 @@ documentation for further details.
 
 | Name | Source | Version |
 |------|--------|---------|
-| <a name="module_actions_runner"></a> [actions\_runner](#module\_actions\_runner) | cloudposse/helm-release/aws | 0.10.0 |
-| <a name="module_actions_runner_controller"></a> [actions\_runner\_controller](#module\_actions\_runner\_controller) | cloudposse/helm-release/aws | 0.10.0 |
+| <a name="module_actions_runner"></a> [actions\_runner](#module\_actions\_runner) | cloudposse/helm-release/aws | 0.10.1 |
+| <a name="module_actions_runner_controller"></a> [actions\_runner\_controller](#module\_actions\_runner\_controller) | cloudposse/helm-release/aws | 0.10.1 |
 | <a name="module_eks"></a> [eks](#module\_eks) | cloudposse/stack-config/yaml//modules/remote-state | 1.5.0 |
 | <a name="module_iam_roles"></a> [iam\_roles](#module\_iam\_roles) | ../../account-map/modules/iam-roles | n/a |
 | <a name="module_this"></a> [this](#module\_this) | cloudposse/label/null | 0.25.0 |
@@ -515,6 +524,7 @@ documentation for further details.
 | <a name="input_cleanup_on_fail"></a> [cleanup\_on\_fail](#input\_cleanup\_on\_fail) | Allow deletion of new resources created in this upgrade when upgrade fails. | `bool` | `true` | no |
 | <a name="input_context"></a> [context](#input\_context) | Single object for setting entire context at once.<br>See description of individual variables for details.<br>Leave string and numeric variables as `null` to use default value.<br>Individual variable settings (non-null) override settings in context object,<br>except for attributes, tags, and additional\_tag\_map, which are merged. | `any` | <pre>{<br>  "additional_tag_map": {},<br>  "attributes": [],<br>  "delimiter": null,<br>  "descriptor_formats": {},<br>  "enabled": true,<br>  "environment": null,<br>  "id_length_limit": null,<br>  "label_key_case": null,<br>  "label_order": [],<br>  "label_value_case": null,<br>  "labels_as_tags": [<br>    "unset"<br>  ],<br>  "name": null,<br>  "namespace": null,<br>  "regex_replace_chars": null,<br>  "stage": null,<br>  "tags": {},<br>  "tenant": null<br>}</pre> | no |
 | <a name="input_context_tags_enabled"></a> [context\_tags\_enabled](#input\_context\_tags\_enabled) | Whether or not to include all context tags as labels for each runner | `bool` | `false` | no |
+| <a name="input_controller_replica_count"></a> [controller\_replica\_count](#input\_controller\_replica\_count) | The number of replicas of the runner-controller to run. | `number` | `2` | no |
 | <a name="input_create_namespace"></a> [create\_namespace](#input\_create\_namespace) | Create the namespace if it does not yet exist. Defaults to `false`. | `bool` | `null` | no |
 | <a name="input_delimiter"></a> [delimiter](#input\_delimiter) | Delimiter to be used between ID elements.<br>Defaults to `-` (hyphen). Set to `""` to use no delimiter at all. | `string` | `null` | no |
 | <a name="input_descriptor_formats"></a> [descriptor\_formats](#input\_descriptor\_formats) | Describe additional descriptors to be output in the `descriptors` output map.<br>Map of maps. Keys are names of descriptors. Values are maps of the form<br>`{<br>   format = string<br>   labels = list(string)<br>}`<br>(Type is `any` so the map values can later be enhanced to provide additional options.)<br>`format` is a Terraform format string to be passed to the `format()` function.<br>`labels` is a list of labels, in order, to pass to `format()` function.<br>Label values will be normalized before being passed to `format()` so they will be<br>identical to how they appear in `id`.<br>Default is `{}` (`descriptors` output will be empty). | `any` | `{}` | no |
@@ -549,7 +559,7 @@ documentation for further details.
 | <a name="input_regex_replace_chars"></a> [regex\_replace\_chars](#input\_regex\_replace\_chars) | Terraform regular expression (regex) string.<br>Characters matching the regex will be removed from the ID elements.<br>If not set, `"/[^a-zA-Z0-9-]/"` is used to remove all characters other than hyphens, letters and digits. | `string` | `null` | no |
 | <a name="input_region"></a> [region](#input\_region) | AWS Region. | `string` | n/a | yes |
 | <a name="input_resources"></a> [resources](#input\_resources) | The cpu and memory of the deployment's limits and requests. | <pre>object({<br>    limits = object({<br>      cpu    = string<br>      memory = string<br>    })<br>    requests = object({<br>      cpu    = string<br>      memory = string<br>    })<br>  })</pre> | n/a | yes |
-| <a name="input_runners"></a> [runners](#input\_runners) | Map of Action Runner configurations, with the key being the name of the runner. Please note that the name must be in<br>kebab-case.<br><br>For example:<pre>hcl<br>organization_runner = {<br>  type = "organization" # can be either 'organization' or 'repository'<br>  dind_enabled: false # A Docker sidecar container will be deployed<br>  image: summerwind/actions-runner # If dind_enabled=true, set this to 'summerwind/actions-runner-dind'<br>  scope = "ACME"  # org name for Organization runners, repo name for Repository runners<br>  group = "core-automation" # Optional. Assigns the runners to a runner group, for access control.<br>  scale_down_delay_seconds = 300<br>  min_replicas = 1<br>  max_replicas = 5<br>  busy_metrics = {<br>    scale_up_threshold = 0.75<br>    scale_down_threshold = 0.25<br>    scale_up_factor = 2<br>    scale_down_factor = 0.5<br>  }<br>  labels = [<br>    "Ubuntu",<br>    "core-automation",<br>  ]<br>}</pre> | <pre>map(object({<br>    type            = string<br>    scope           = string<br>    group           = optional(string, null)<br>    image           = optional(string, "")<br>    dind_enabled    = bool<br>    node_selector   = optional(map(string), {})<br>    pod_annotations = optional(map(string), {})<br>    tolerations = optional(list(object({<br>      key      = string<br>      operator = string<br>      value    = optional(string, null)<br>      effect   = string<br>    })), [])<br>    scale_down_delay_seconds = number<br>    min_replicas             = number<br>    max_replicas             = number<br>    busy_metrics = optional(object({<br>      scale_up_threshold    = string<br>      scale_down_threshold  = string<br>      scale_up_adjustment   = optional(string)<br>      scale_down_adjustment = optional(string)<br>      scale_up_factor       = optional(string)<br>      scale_down_factor     = optional(string)<br>    }))<br>    webhook_driven_scaling_enabled = 
bool<br>    webhook_startup_timeout        = optional(string, null)<br>    pull_driven_scaling_enabled    = bool<br>    labels                         = list(string)<br>    storage                        = optional(string, null)<br>    pvc_enabled                    = optional(bool, false)<br>    resources = object({<br>      limits = object({<br>        cpu               = string<br>        memory            = string<br>        ephemeral_storage = optional(string, null)<br>      })<br>      requests = object({<br>        cpu    = string<br>        memory = string<br>      })<br>    })<br>  }))</pre> | n/a | yes |
+| <a name="input_runners"></a> [runners](#input\_runners) | Map of Action Runner configurations, with the key being the name of the runner. Please note that the name must be in<br>kebab-case.<br><br>For example:<pre>hcl<br>organization_runner = {<br>  type = "organization" # can be either 'organization' or 'repository'<br>  dind_enabled: true # A Docker daemon will be started in the runner Pod<br>  image: summerwind/actions-runner-dind # If dind_enabled=false, set this to 'summerwind/actions-runner'<br>  scope = "ACME"  # org name for Organization runners, repo name for Repository runners<br>  group = "core-automation" # Optional. Assigns the runners to a runner group, for access control.<br>  scale_down_delay_seconds = 300<br>  min_replicas = 1<br>  max_replicas = 5<br>  labels = [<br>    "Ubuntu",<br>    "core-automation",<br>  ]<br>}</pre> | <pre>map(object({<br>    type            = string<br>    scope           = string<br>    group           = optional(string, null)<br>    image           = optional(string, "summerwind/actions-runner-dind")<br>    dind_enabled    = optional(bool, true)<br>    node_selector   = optional(map(string), {})<br>    pod_annotations = optional(map(string), {})<br><br>    # running_pod_annotations are only applied to the pods once they start running a job<br>    running_pod_annotations = optional(map(string), {})<br><br>    # affinity is too complex to model. 
Whatever you assigned affinity will be copied<br>    # to the runner Pod spec.<br>    affinity = optional(any)<br><br>    tolerations = optional(list(object({<br>      key      = string<br>      operator = string<br>      value    = optional(string, null)<br>      effect   = string<br>    })), [])<br>    scale_down_delay_seconds = optional(number, 300)<br>    min_replicas             = number<br>    max_replicas             = number<br>    busy_metrics = optional(object({<br>      scale_up_threshold    = string<br>      scale_down_threshold  = string<br>      scale_up_adjustment   = optional(string)<br>      scale_down_adjustment = optional(string)<br>      scale_up_factor       = optional(string)<br>      scale_down_factor     = optional(string)<br>    }))<br>    webhook_driven_scaling_enabled = optional(bool, true)<br>    # The name `webhook_startup_timeout` is misleading.<br>    # It is actually the duration after which a job will be considered completed,<br>    # (and the runner killed) even if the webhook has not received a "job completed" event.<br>    # This is to ensure that if an event is missed, it does not leave the runner running forever.<br>    # Set it long enough to cover the longest job you expect to run and then some.<br>    # See https://github.com/actions/actions-runner-controller/blob/9afd93065fa8b1f87296f0dcdf0c2753a0548cb7/docs/automatically-scaling-runners.md?plain=1#L264-L268<br>    webhook_startup_timeout     = optional(string, "1h")<br>    pull_driven_scaling_enabled = optional(bool, false)<br>    labels                      = optional(list(string), [])<br>    docker_storage              = optional(string, null)<br>    # storage is deprecated in favor of docker_storage, since it is only storage for the Docker daemon<br>    storage     = optional(string, null)<br>    pvc_enabled = optional(bool, false)<br>    resources = optional(object({<br>      limits = optional(object({<br>        cpu               = optional(string, "1")<br>        
memory            = optional(string, "1Gi")<br>        ephemeral_storage = optional(string, "10Gi")<br>      }), {})<br>      requests = optional(object({<br>        cpu               = optional(string, "500m")<br>        memory            = optional(string, "256Mi")<br>        ephemeral_storage = optional(string, "1Gi")<br>      }), {})<br>    }), {})<br>  }))</pre> | n/a | yes |
 | <a name="input_s3_bucket_arns"></a> [s3\_bucket\_arns](#input\_s3\_bucket\_arns) | List of ARNs of S3 Buckets to which the runners will have read-write access to. | `list(string)` | `[]` | no |
 | <a name="input_ssm_docker_config_json_path"></a> [ssm\_docker\_config\_json\_path](#input\_ssm\_docker\_config\_json\_path) | SSM path to the Docker config JSON | `string` | `null` | no |
 | <a name="input_ssm_github_secret_path"></a> [ssm\_github\_secret\_path](#input\_ssm\_github\_secret\_path) | The path in SSM to the GitHub app private key file contents or GitHub PAT token. | `string` | `""` | no |
@@ -559,7 +569,7 @@ documentation for further details.
 | <a name="input_tenant"></a> [tenant](#input\_tenant) | ID element \_(Rarely used, not included by default)\_. A customer identifier, indicating who this instance of a resource is for | `string` | `null` | no |
 | <a name="input_timeout"></a> [timeout](#input\_timeout) | Time in seconds to wait for any individual kubernetes operation (like Jobs for hooks). Defaults to `300` seconds | `number` | `null` | no |
 | <a name="input_wait"></a> [wait](#input\_wait) | Will wait until all resources are in a ready state before marking the release as successful. It will wait for as long as `timeout`. Defaults to `true`. | `bool` | `null` | no |
-| <a name="input_webhook"></a> [webhook](#input\_webhook) | Configuration for the GitHub Webhook Server.<br>`hostname_template` is the `format()` string to use to generate the hostname via `format(var.hostname_template, var.tenant, var.stage, var.environment)`"<br>Typically something like `"echo.%[3]v.%[2]v.example.com"`.<br>`queue_limit` is the maximum number of webhook events that can be queued up processing by the autoscaler.<br>When the queue gets full, webhook events will be dropped (status 500). | <pre>object({<br>    enabled           = bool<br>    hostname_template = string<br>    queue_limit       = optional(number, 100)<br>  })</pre> | <pre>{<br>  "enabled": false,<br>  "hostname_template": null,<br>  "queue_limit": 100<br>}</pre> | no |
+| <a name="input_webhook"></a> [webhook](#input\_webhook) | Configuration for the GitHub Webhook Server.<br>`hostname_template` is the `format()` string to use to generate the hostname via `format(var.hostname_template, var.tenant, var.stage, var.environment)`"<br>Typically something like `"echo.%[3]v.%[2]v.example.com"`.<br>`queue_limit` is the maximum number of webhook events that can be queued up for processing by the autoscaler.<br>When the queue gets full, webhook events will be dropped (status 500). | <pre>object({<br>    enabled           = bool<br>    hostname_template = string<br>    queue_limit       = optional(number, 1000)<br>  })</pre> | <pre>{<br>  "enabled": false,<br>  "hostname_template": null,<br>  "queue_limit": 1000<br>}</pre> | no |
 
 ## Outputs
 
diff --git a/modules/eks/actions-runner-controller/charts/actions-runner/Chart.yaml b/modules/eks/actions-runner-controller/charts/actions-runner/Chart.yaml
index b5c10525b..1ec5333d2 100644
--- a/modules/eks/actions-runner-controller/charts/actions-runner/Chart.yaml
+++ b/modules/eks/actions-runner-controller/charts/actions-runner/Chart.yaml
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.1.2
+version: 0.2.0
 
 # This chart only deploys Resources for actions-runner-controller, so app version does not really apply.
 # We use Resource API version instead.
diff --git a/modules/eks/actions-runner-controller/charts/actions-runner/templates/runnerdeployment.yaml b/modules/eks/actions-runner-controller/charts/actions-runner/templates/runnerdeployment.yaml
index a44658dec..1321f22c8 100644
--- a/modules/eks/actions-runner-controller/charts/actions-runner/templates/runnerdeployment.yaml
+++ b/modules/eks/actions-runner-controller/charts/actions-runner/templates/runnerdeployment.yaml
@@ -1,34 +1,3 @@
-{{- if .Values.pvc_enabled }}
----
-# Persistent Volumes can be used for image caching
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: {{ .Values.release_name }}
-spec:
-  accessModes:
-    - ReadWriteMany
-  # StorageClassName comes from efs-controller and must be deployed first.
-  storageClassName: efs-sc
-  resources:
-    requests:
-      # EFS is not actually storage constrained, but this storage request is
-      # required. 100Gi is a ballpark for how much we initially request, but this
-      # may grow. We are responsible for docker pruning this periodically to
-      # save space.
-      storage: 100Gi
-{{- end }}
-{{- if .Values.docker_config_json_enabled }}
----
-apiVersion: v1
-kind: Secret
-metadata:
-  name: {{ .Values.release_name }}-regcred
-type: kubernetes.io/dockerconfigjson
-data:
-  .dockerconfigjson: {{ .Values.docker_config_json }}
-{{- end }}
----
 apiVersion: actions.summerwind.dev/v1alpha1
 kind: RunnerDeployment
 metadata:
@@ -38,13 +7,13 @@ spec:
   # See https://github.com/actions-runner-controller/actions-runner-controller/issues/206#issuecomment-748601907
   # replicas: 1
   template:
-    {{- with index .Values "pod_annotations" }}
+    {{- with .Values.pod_annotations }}
     metadata:
       annotations:
         {{- toYaml . | nindent 8 }}
     {{- end }}
     spec:
-      {{- if  .Values.docker_config_json_enabled }}
+      {{- if .Values.docker_config_json_enabled }}
       # secrets volumeMount are always mounted readOnly so config.json has to be copied to the correct directory
       # https://github.com/kubernetes/kubernetes/issues/62099
       # https://github.com/actions/actions-runner-controller/issues/2123#issuecomment-1527077517
@@ -82,14 +51,41 @@ spec:
       #  - effect: NoSchedule
       #    key: node-role.kubernetes.io/actions-runner
       #    operator: Exists
+      {{- with .Values.node_selector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+
+      {{- with .Values.running_pod_annotations }}
+      # Run a pre-run hook to set pod annotations
+      # See https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job#triggering-the-scripts
+      containers:
+        - name: runner
+          # ARC (Summerwind) has its own pre-run hook, so we do not want to set
+          #  env:
+          #  - name: ACTIONS_RUNNER_HOOK_JOB_STARTED
+          #    value: /hooks/pre-run.sh # triggers when a job is started, and sets the pod to NOT safe-to-evict
+          # Instead, its pre-run hook runs scripts in /etc/arc/hooks/job-started.d/
+          volumeMounts:
+            - name: hooks
+              mountPath: /etc/arc/hooks/job-started.d/
+      {{- end }}
 
-      {{ if eq .Values.type "organization" }}
+      {{- if eq .Values.type "organization" }}
       organization: {{ .Values.scope }}
       {{- end }}
-      {{ if eq .Values.type "repository" }}
+      {{- if eq .Values.type "repository" }}
       repository: {{ .Values.scope }}
       {{- end }}
-      {{ if index .Values "group" }}
+      {{- if index .Values "group" }}
       group: {{ .Values.group }}
       {{- end }}
       # You can use labels to create subsets of runners.
@@ -103,14 +99,6 @@ spec:
       {{- range .Values.labels }}
         - {{ . | quote }}
       {{- end }}
-      {{- if gt ( len (index .Values "node_selector") ) 0 }}
-      nodeSelector:
-        {{- toYaml .Values.node_selector | nindent 8 }}
-      {{- end }}
-      {{- if gt ( len (index .Values "tolerations") ) 0 }}
-      tolerations:
-        {{- toYaml .Values.tolerations | nindent 8 }}
-      {{- end }}
       # dockerdWithinRunnerContainer = false means access to a Docker daemon is provided by a sidecar container.
       dockerdWithinRunnerContainer: {{ .Values.dind_enabled }}
       image: {{ .Values.image | quote }}
@@ -133,7 +121,7 @@ spec:
           {{- if index .Values.resources.requests "ephemeral_storage" }}
           ephemeral-storage: {{ .Values.resources.requests.ephemeral_storage }}
           {{- end }}
-      {{- if and .Values.dind_enabled .Values.storage }}
+      {{- if and .Values.dind_enabled .Values.docker_storage }}
       dockerVolumeMounts:
         - mountPath: /var/lib/docker
           name: docker-volume
@@ -150,10 +138,10 @@ spec:
         - mountPath: /home/runner/.docker
           name: docker-config-volume
         {{- end }}
-      {{- end }}
-      {{- if or (and .Values.dind_enabled .Values.storage) (.Values.pvc_enabled) (.Values.docker_config_json_enabled) }}
+      {{- end }}{{/* End of volumeMounts */}}
+      {{- if or (and .Values.dind_enabled .Values.docker_storage) (.Values.pvc_enabled) (.Values.docker_config_json_enabled) (not (empty .Values.running_pod_annotations)) }}
       volumes:
-      {{- if and .Values.dind_enabled .Values.storage }}
+        {{- if and .Values.dind_enabled .Values.docker_storage }}
         - name: docker-volume
           ephemeral:
             volumeClaimTemplate:
@@ -161,13 +149,13 @@ spec:
                 accessModes: [ "ReadWriteOnce" ] # Only 1 pod can connect at a time
                 resources:
                   requests:
-                    storage: {{ .Values.storage }}
-      {{- end }}
-      {{- if .Values.pvc_enabled }}
+                    storage: {{ .Values.docker_storage }}
+        {{- end }}
+        {{- if .Values.pvc_enabled }}
         - name: shared-volume
           persistentVolumeClaim:
             claimName: {{ .Values.release_name }}
-      {{- end }}
+        {{- end }}
         {{- if .Values.docker_config_json_enabled }}
         - name: docker-secret
           secret:
@@ -178,4 +166,88 @@ spec:
         - name: docker-config-volume
           emptyDir:
         {{- end }}
-      {{- end }}
+        {{- with .Values.running_pod_annotations }}
+        - name: hooks
+          configMap:
+            name: runner-hooks
+            defaultMode: 0755  # Set execute permissions for all files
+        {{- end }}
+      {{- end }}{{/* End of volumes */}}
+{{- if .Values.pvc_enabled }}
+---
+# Persistent Volumes can be used for image caching
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ .Values.release_name }}
+spec:
+  accessModes:
+    - ReadWriteMany
+  # StorageClassName comes from efs-controller and must be deployed first.
+  storageClassName: efs-sc
+  resources:
+    requests:
+      # EFS is not actually storage constrained, but this storage request is
+      # required. 100Gi is a ballpark for how much we initially request, but this
+      # may grow. We are responsible for docker pruning this periodically to
+      # save space.
+      storage: 100Gi
+{{- end }}
+{{- if .Values.docker_config_json_enabled }}
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: {{ .Values.release_name }}-regcred
+type: kubernetes.io/dockerconfigjson
+data:
+  .dockerconfigjson: {{ .Values.docker_config_json }}
+{{- end }}
+{{- with .Values.running_pod_annotations }}
+---
+# Hook script that applies running_pod_annotations to the runner Pod; rendered only when that value is non-empty.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: runner-hooks
+data:
+  annotate.sh: |
+    #!/bin/bash
+
+    # Applies the configured annotations to this Pod via the Kubernetes API. With kubectl
+    # and a KUBECONFIG, the equivalent for a single annotation would be, for example:
+    #   kubectl annotate pod "$HOSTNAME" --overwrite karpenter.sh/do-not-disrupt=true
+
+    # In-cluster credentials that Kubernetes mounts for the Pod's service account
+    SA_DIR="/var/run/secrets/kubernetes.io/serviceaccount"
+    NAMESPACE=$(cat "$SA_DIR/namespace")
+    TOKEN=$(cat "$SA_DIR/token")
+    # NOTE(review): assumes the Pod's hostname equals its name (the Kubernetes default)
+    POD_NAME=$(hostname)
+
+    # Kubernetes API URL
+    API_URL="https://kubernetes.default.svc"
+
+    # JSON merge patch: only the annotations listed in the payload are added or overwritten
+    CONTENT_TYPE="application/merge-patch+json"
+
+    # Quoted delimiter: the Helm-rendered JSON is used verbatim, with no shell expansion
+    PATCH_JSON=$(cat <<'EOF'
+    {
+      "metadata": {
+        "annotations":
+         {{- . | toJson | nindent 10 }}
+      }
+    }
+    EOF
+    )
+
+    # Patch the pod, verifying the API server against the cluster CA instead of using -k
+    curl -sS --cacert "$SA_DIR/ca.crt" -X PATCH \
+      -H "Authorization: Bearer $TOKEN" \
+      -H "Content-Type: $CONTENT_TYPE" \
+      -H "Accept: application/json" \
+      -d "$PATCH_JSON" \
+      "$API_URL/api/v1/namespaces/$NAMESPACE/pods/$POD_NAME" | jq .metadata.annotations
+
+{{ end }}
diff --git a/modules/eks/actions-runner-controller/charts/actions-runner/values.yaml b/modules/eks/actions-runner-controller/charts/actions-runner/values.yaml
index c5a34270b..44a5e14b4 100644
--- a/modules/eks/actions-runner-controller/charts/actions-runner/values.yaml
+++ b/modules/eks/actions-runner-controller/charts/actions-runner/values.yaml
@@ -2,32 +2,29 @@ type: "repository" # can be either 'organization' or 'repository'
 dind_enabled: true # If `true`, a Docker sidecar container will be deployed
 # To run Docker in Docker (dind), change image from summerwind/actions-runner to summerwind/actions-runner-dind
 image: summerwind/actions-runner-dind
-node_selector:
-  kubernetes.io/os: "linux"
-  kubernetes.io/arch: "amd64"
+
 #scope: "example/app"
-scale_down_delay_seconds: 300
-min_replicas: 1
-max_replicas: 2
+#scale_down_delay_seconds: 300
+#min_replicas: 1
+#max_replicas: 2
 #busy_metrics:
 #  scale_up_threshold: 0.75
 #  scale_down_threshold: 0.25
 #  scale_up_factor: 2
 #  scale_down_factor: 0.5
-resources:
-  limits:
-    cpu: 1.5
-    memory: 4Gi
-    # ephemeral_storage: "10Gi"
-  requests:
-    cpu: 0.5
-    memory: 1Gi
-    # ephemeral_storage: "10Gi"
+#resources:
+#  limits:
+#    cpu: 1
+#    memory: 1Gi
+#    ephemeral_storage: "10Gi"
+#  requests:
+#    cpu: 500m
+#    memory: 512Mi
+#    ephemeral_storage: "1Gi"
 
-storage: "10Gi"
 pvc_enabled: false
 webhook_driven_scaling_enabled: true
-webhook_startup_timeout: "30m"
+webhook_startup_timeout: "90m"
 pull_driven_scaling_enabled: false
 #labels:
 #  - "Ubuntu"
diff --git a/modules/eks/actions-runner-controller/main.tf b/modules/eks/actions-runner-controller/main.tf
index dcf795dad..ba4d25e70 100644
--- a/modules/eks/actions-runner-controller/main.tf
+++ b/modules/eks/actions-runner-controller/main.tf
@@ -111,7 +111,7 @@ data "aws_ssm_parameter" "docker_config_json" {
 
 module "actions_runner_controller" {
   source  = "cloudposse/helm-release/aws"
-  version = "0.10.0"
+  version = "0.10.1"
 
   name            = "" # avoids hitting length restrictions on IAM Role names
   chart           = var.chart
@@ -140,14 +140,15 @@ module "actions_runner_controller" {
     file("${path.module}/resources/values.yaml"),
     # standard k8s object settings
     yamlencode({
-      fullnameOverride = module.this.name,
+      fullnameOverride = module.this.name
       serviceAccount = {
         name = module.this.name
-      },
+      }
       resources = var.resources
       rbac = {
         create = var.rbac_enabled
       }
+      replicaCount = var.controller_replica_count
       githubWebhookServer = {
         enabled                   = var.webhook.enabled
         queueLimit                = var.webhook.queue_limit
@@ -166,7 +167,7 @@ module "actions_runner_controller" {
             }
           ]
         }
-      },
+      }
       authSecret = {
         enabled = true
         create  = local.create_secret
@@ -201,7 +202,7 @@ module "actions_runner" {
   for_each = local.enabled ? var.runners : {}
 
   source  = "cloudposse/helm-release/aws"
-  version = "0.10.0"
+  version = "0.10.1"
 
   name  = each.key
   chart = "${path.module}/charts/actions-runner"
@@ -215,7 +216,8 @@ module "actions_runner" {
   values = compact([
     yamlencode({
       release_name                   = each.key
-      pod_annotations                = lookup(each.value, "pod_annotations", "")
+      pod_annotations                = each.value.pod_annotations
+      running_pod_annotations        = each.value.running_pod_annotations
       service_account_name           = module.actions_runner_controller.service_account_name
       type                           = each.value.type
       scope                          = each.value.scope
@@ -223,7 +225,7 @@ module "actions_runner" {
       dind_enabled                   = each.value.dind_enabled
       service_account_role_arn       = module.actions_runner_controller.service_account_role_arn
       resources                      = each.value.resources
-      storage                        = each.value.storage
+      docker_storage                 = each.value.docker_storage != null ? each.value.docker_storage : each.value.storage
       labels                         = concat(each.value.labels, local.context_labels)
       scale_down_delay_seconds       = each.value.scale_down_delay_seconds
       min_replicas                   = each.value.min_replicas
@@ -233,6 +235,7 @@ module "actions_runner" {
       pull_driven_scaling_enabled    = each.value.pull_driven_scaling_enabled
       pvc_enabled                    = each.value.pvc_enabled
       node_selector                  = each.value.node_selector
+      affinity                       = each.value.affinity
       tolerations                    = each.value.tolerations
       docker_config_json_enabled     = local.docker_config_json_enabled
       docker_config_json             = local.docker_config_json
diff --git a/modules/eks/actions-runner-controller/resources/values.yaml b/modules/eks/actions-runner-controller/resources/values.yaml
index fe4f6cc45..6132ae7a5 100644
--- a/modules/eks/actions-runner-controller/resources/values.yaml
+++ b/modules/eks/actions-runner-controller/resources/values.yaml
@@ -1,7 +1,6 @@
 authSecret:
   create: false
   name: controller-manager
-replicaCount: 1
 scope:
   # If true, the controller will only watch custom resources in a single namespace,
   # which by default is the namespace the controller is in.
@@ -12,6 +11,7 @@ syncPeriod: 120s
 
 githubWebhookServer:
   enabled: false
+  syncPeriod: 120s
   secret:
     # Webhook secret, used to authenticate incoming webhook events from GitHub
     # When using Sops, stored in same SopsSecret as authSecret under key `github_webhook_secret_token`
@@ -23,16 +23,11 @@ githubWebhookServer:
     enabled: false
     annotations:
       alb.ingress.kubernetes.io/backend-protocol: HTTP
-      alb.ingress.kubernetes.io/group.name: common
+      # Use the default ingress, or uncomment and set the group name to use a different one
+      # alb.ingress.kubernetes.io/group.name: common
       alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80},{"HTTPS":443}]'
-      alb.ingress.kubernetes.io/load-balancer-name: k8s-common
       alb.ingress.kubernetes.io/scheme: internet-facing
       alb.ingress.kubernetes.io/ssl-redirect: '443'
       alb.ingress.kubernetes.io/target-type: ip
-      kubernetes.io/ingress.class: alb
     podDisruptionBudget:
       maxUnavailable: "60%"
-
-nodeSelector:
-  kubernetes.io/os: "linux"
-  kubernetes.io/arch: "amd64"
diff --git a/modules/eks/actions-runner-controller/variables.tf b/modules/eks/actions-runner-controller/variables.tf
index 9aa0f7354..57117c334 100644
--- a/modules/eks/actions-runner-controller/variables.tf
+++ b/modules/eks/actions-runner-controller/variables.tf
@@ -25,6 +25,12 @@ variable "chart_version" {
   default     = null
 }
 
+variable "controller_replica_count" {
+  type        = number
+  description = "The number of replicas of the runner-controller to run."
+  default     = 2
+}
+
 variable "resources" {
   type = object({
     limits = object({
@@ -86,33 +92,6 @@ variable "rbac_enabled" {
   description = "Service Account for pods."
 }
 
-# Runner-specific settings
-
-/*
-variable "account_map_environment_name" {
-  type        = string
-  description = "The name of the environment where `account_map` is provisioned"
-  default     = "gbl"
-}
-
-variable "account_map_stage_name" {
-  type        = string
-  description = "The name of the stage where `account_map` is provisioned"
-  default     = "root"
-}
-
-variable "account_map_tenant_name" {
-  type        = string
-  description = <<-EOT
-  The name of the tenant where `account_map` is provisioned.
-
-  If the `tenant` label is not used, leave this as `null`.
-  EOT
-  default     = "core"
-}
-
-*/
-
 variable "existing_kubernetes_secret_name" {
   type        = string
   description = <<-EOT
@@ -153,19 +132,13 @@ variable "runners" {
   ```hcl
   organization_runner = {
     type = "organization" # can be either 'organization' or 'repository'
-    dind_enabled: false # A Docker sidecar container will be deployed
-    image: summerwind/actions-runner # If dind_enabled=true, set this to 'summerwind/actions-runner-dind'
+    dind_enabled: true # A Docker daemon will be started in the runner Pod
+    image: summerwind/actions-runner-dind # If dind_enabled=false, set this to 'summerwind/actions-runner'
     scope = "ACME"  # org name for Organization runners, repo name for Repository runners
     group = "core-automation" # Optional. Assigns the runners to a runner group, for access control.
     scale_down_delay_seconds = 300
     min_replicas = 1
     max_replicas = 5
-    busy_metrics = {
-      scale_up_threshold = 0.75
-      scale_down_threshold = 0.25
-      scale_up_factor = 2
-      scale_down_factor = 0.5
-    }
     labels = [
       "Ubuntu",
       "core-automation",
@@ -178,17 +151,25 @@ variable "runners" {
     type            = string
     scope           = string
     group           = optional(string, null)
-    image           = optional(string, "")
-    dind_enabled    = bool
+    image           = optional(string, "summerwind/actions-runner-dind")
+    dind_enabled    = optional(bool, true)
     node_selector   = optional(map(string), {})
     pod_annotations = optional(map(string), {})
+
+    # running_pod_annotations are only applied to the pods once they start running a job
+    running_pod_annotations = optional(map(string), {})
+
+    # affinity is too complex to model. Whatever you assign to affinity will be copied
+    # to the runner Pod spec.
+    affinity = optional(any)
+
     tolerations = optional(list(object({
       key      = string
       operator = string
       value    = optional(string, null)
       effect   = string
     })), [])
-    scale_down_delay_seconds = number
+    scale_down_delay_seconds = optional(number, 300)
     min_replicas             = number
     max_replicas             = number
     busy_metrics = optional(object({
@@ -199,23 +180,32 @@ variable "runners" {
       scale_up_factor       = optional(string)
       scale_down_factor     = optional(string)
     }))
-    webhook_driven_scaling_enabled = bool
-    webhook_startup_timeout        = optional(string, null)
-    pull_driven_scaling_enabled    = bool
-    labels                         = list(string)
-    storage                        = optional(string, null)
-    pvc_enabled                    = optional(bool, false)
-    resources = object({
-      limits = object({
-        cpu               = string
-        memory            = string
-        ephemeral_storage = optional(string, null)
-      })
-      requests = object({
-        cpu    = string
-        memory = string
-      })
-    })
+    webhook_driven_scaling_enabled = optional(bool, true)
+    # The name `webhook_startup_timeout` is misleading.
+    # It is actually the duration after which a job will be considered completed,
+    # (and the runner killed) even if the webhook has not received a "job completed" event.
+    # This is to ensure that if an event is missed, it does not leave the runner running forever.
+    # Set it long enough to cover the longest job you expect to run and then some.
+    # See https://github.com/actions/actions-runner-controller/blob/9afd93065fa8b1f87296f0dcdf0c2753a0548cb7/docs/automatically-scaling-runners.md?plain=1#L264-L268
+    webhook_startup_timeout     = optional(string, "1h")
+    pull_driven_scaling_enabled = optional(bool, false)
+    labels                      = optional(list(string), [])
+    docker_storage              = optional(string, null)
+    # storage is deprecated in favor of docker_storage, since it is only storage for the Docker daemon
+    storage     = optional(string, null)
+    pvc_enabled = optional(bool, false)
+    resources = optional(object({
+      limits = optional(object({
+        cpu               = optional(string, "1")
+        memory            = optional(string, "1Gi")
+        ephemeral_storage = optional(string, "10Gi")
+      }), {})
+      requests = optional(object({
+        cpu               = optional(string, "500m")
+        memory            = optional(string, "256Mi")
+        ephemeral_storage = optional(string, "1Gi")
+      }), {})
+    }), {})
   }))
 }
 
@@ -223,19 +213,19 @@ variable "webhook" {
   type = object({
     enabled           = bool
     hostname_template = string
-    queue_limit       = optional(number, 100)
+    queue_limit       = optional(number, 1000)
   })
   description = <<-EOT
     Configuration for the GitHub Webhook Server.
     `hostname_template` is the `format()` string to use to generate the hostname via `format(var.hostname_template, var.tenant, var.stage, var.environment)`"
     Typically something like `"echo.%[3]v.%[2]v.example.com"`.
-    `queue_limit` is the maximum number of webhook events that can be queued up processing by the autoscaler.
+    `queue_limit` is the maximum number of webhook events that can be queued up for processing by the autoscaler.
     When the queue gets full, webhook events will be dropped (status 500).
   EOT
   default = {
     enabled           = false
     hostname_template = null
-    queue_limit       = 100
+    queue_limit       = 1000
   }
 }