From f9bdf44d3fd5188d4c37323903aeb4544af86c32 Mon Sep 17 00:00:00 2001 From: akash1810 Date: Wed, 18 Sep 2024 18:01:59 +0100 Subject: [PATCH 1/2] fix(experimental-ec2-pattern): Add buffer to rolling update timeout If we consider the health check grace period to be the time it takes the "normal" user data to run, the rolling update should be configured to be a little longer to cover the additional time spent polling the target group. A buffer of 1 minute is somewhat arbitrarily chosen. Too high a value, then we increase the time it takes to automatically rollback from a failing health check. Too low a value, then we risk flaky deploys. --- .../__snapshots__/ec2-app.test.ts.snap | 4 +-- src/experimental/patterns/ec2-app.test.ts | 9 ++--- src/experimental/patterns/ec2-app.ts | 34 ++++++++++++++----- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/src/experimental/patterns/__snapshots__/ec2-app.test.ts.snap b/src/experimental/patterns/__snapshots__/ec2-app.test.ts.snap index 11b0ea2800..c171d8842f 100644 --- a/src/experimental/patterns/__snapshots__/ec2-app.test.ts.snap +++ b/src/experimental/patterns/__snapshots__/ec2-app.test.ts.snap @@ -106,7 +106,7 @@ exports[`The GuEc2AppExperimental pattern matches the snapshot 1`] = ` }, "ResourceSignal": { "Count": 1, - "Timeout": "PT2M", + "Timeout": "PT3M", }, }, "Properties": { @@ -180,7 +180,7 @@ exports[`The GuEc2AppExperimental pattern matches the snapshot 1`] = ` "MaxBatchSize": 2, "MinInstancesInService": 1, "MinSuccessfulInstancesPercent": 100, - "PauseTime": "PT2M", + "PauseTime": "PT3M", "SuspendProcesses": [ "AlarmNotification", ], diff --git a/src/experimental/patterns/ec2-app.test.ts b/src/experimental/patterns/ec2-app.test.ts index a65857ef32..676f3695ee 100644 --- a/src/experimental/patterns/ec2-app.test.ts +++ b/src/experimental/patterns/ec2-app.test.ts @@ -9,7 +9,7 @@ import { GuUserData } from "../../constructs/autoscaling"; import { GuStack } from "../../constructs/core"; import { simpleGuStackForTesting } from "../../utils/test"; import type { GuEc2AppExperimentalProps } from "./ec2-app"; -import { GuEc2AppExperimental } from "./ec2-app"; +import { GuEc2AppExperimental, RollingUpdateDurations } from "./ec2-app"; /** * `Aspects` appear to run only at synth time. @@ -102,11 +102,12 @@ describe("The GuEc2AppExperimental pattern", () => { }); }); - it("should have a PauseTime equal to the ASG healthcheck grace period", () => { + it("should have a PauseTime higher than the ASG healthcheck grace period", () => { const stack = simpleGuStackForTesting(); const { autoScalingGroup } = new GuEc2AppExperimental(stack, initialProps(stack)); const tenMinutes = Duration.minutes(10); + const tenMinutesPlusBuffer = tenMinutes.plus(RollingUpdateDurations.buffer); const cfnAsg = autoScalingGroup.node.defaultChild as CfnAutoScalingGroup; cfnAsg.healthCheckGracePeriod = tenMinutes.toSeconds(); @@ -119,12 +120,12 @@ describe("The GuEc2AppExperimental pattern", () => { }, CreationPolicy: { ResourceSignal: { - Timeout: tenMinutes.toIsoString(), + Timeout: tenMinutesPlusBuffer.toIsoString(), }, }, UpdatePolicy: { AutoScalingRollingUpdate: { - PauseTime: tenMinutes.toIsoString(), + PauseTime: tenMinutesPlusBuffer.toIsoString(), }, }, }); diff --git a/src/experimental/patterns/ec2-app.ts b/src/experimental/patterns/ec2-app.ts index cf1b1aad1c..256016c356 100644 --- a/src/experimental/patterns/ec2-app.ts +++ b/src/experimental/patterns/ec2-app.ts @@ -9,10 +9,29 @@ import type { GuEc2AppProps } from "../../patterns"; import { GuEc2App } from "../../patterns"; import { isSingletonPresentInStack } from "../../utils/singleton"; +interface AutoScalingRollingUpdateDurations { + /** + * The time between each check of an instance's health within the target group. + */ + sleep: Duration; + + /** + * Additional time to cover the time spent polling an instance's health within the target group before sending the CloudFormation signal. + */ + buffer: Duration; +} + +export const RollingUpdateDurations: AutoScalingRollingUpdateDurations = { + sleep: Duration.seconds(5), + buffer: Duration.minutes(1), +}; + /** * Ensures the `AutoScalingRollingUpdate` of an AutoScaling Group has a `PauseTime` matching the healthcheck grace period. * It also ensures the `CreationPolicy` resource signal `Timeout` matches the healthcheck grace period. * + * @internal + * * @privateRemarks * The ASG healthcheck grace period is hard-coded by {@link GuEc2App}. * Customisation of this value is performed via an escape hatch. @@ -40,14 +59,11 @@ class AutoScalingRollingUpdateTimeout implements IAspect { const currentRollingUpdate = construct.cfnOptions.updatePolicy?.autoScalingRollingUpdate; const currentCreationPolicy = construct.cfnOptions.creationPolicy; - /** - * The type of `healthCheckGracePeriod` is `number | undefined`. - * In reality, it will always be set as we set it in {@link GuEc2App}. - * The right-hand side to appease the compiler; 5 minutes is the default value used by AWS. - * - * @see https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-attribute-updatepolicy.html#cfn-attributes-updatepolicy-rollingupdate-pausetime - */ - const signalTimeoutSeconds = construct.healthCheckGracePeriod ?? Duration.minutes(5).toSeconds(); + if (!construct.healthCheckGracePeriod) { + throw new Error(`The healthcheck grace period not set for autoscaling group ${construct.node.id}.`); + } + + const signalTimeoutSeconds = construct.healthCheckGracePeriod + RollingUpdateDurations.buffer.toSeconds(); if (currentRollingUpdate) { construct.cfnOptions.updatePolicy = { @@ -312,7 +328,7 @@ export class GuEc2AppExperimental extends GuEc2App { until [ "$STATE" == "\\"healthy\\"" ]; do echo "Instance not yet healthy within target group. Current state $STATE. Sleeping..." - sleep 5 + sleep ${RollingUpdateDurations.sleep.toSeconds()} STATE=$(aws elbv2 describe-target-health \ --target-group-arn ${targetGroup.targetGroupArn} \ --region ${region} \ From fed2598a8643728dc208a3e65af7d49e9368347b Mon Sep 17 00:00:00 2001 From: akash1810 Date: Wed, 18 Sep 2024 18:25:52 +0100 Subject: [PATCH 2/2] chore: Add changeset --- .changeset/breezy-waves-develop.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 .changeset/breezy-waves-develop.md diff --git a/.changeset/breezy-waves-develop.md b/.changeset/breezy-waves-develop.md new file mode 100644 index 0000000000..3a6a576844 --- /dev/null +++ b/.changeset/breezy-waves-develop.md @@ -0,0 +1,12 @@ +--- +"@guardian/cdk": patch +--- + +fix(experimental-ec2-pattern): Add buffer to rolling update timeout + +If we consider the health check grace period to be the time it takes the "normal" user data to run, +the rolling update should be configured to be a little longer to cover the additional time spent polling the target group. + +A buffer of 1 minute is somewhat arbitrarily chosen. +Too high a value, then we increase the time it takes to automatically rollback from a failing healthcheck. +Too low a value, then we risk flaky deploys.