Skip to content

Commit

Permalink
Merge pull request #2462 from guardian/aa/rolling-update-duration
Browse files Browse the repository at this point in the history
fix(experimental-ec2-pattern): Add buffer to rolling update timeout
  • Loading branch information
akash1810 authored Sep 19, 2024
2 parents b9544a1 + fed2598 commit 2daaea1
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 15 deletions.
12 changes: 12 additions & 0 deletions .changeset/breezy-waves-develop.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
"@guardian/cdk": patch
---

fix(experimental-ec2-pattern): Add buffer to rolling update timeout

If we consider the health check grace period to be the time it takes the "normal" user data to run,
the rolling update should be configured to be a little longer to cover the additional time spent polling the target group.

A buffer of 1 minute is somewhat arbitrarily chosen.
Too high a value, then we increase the time it takes to automatically rollback from a failing healthcheck.
Too low a value, then we risk flaky deploys.
4 changes: 2 additions & 2 deletions src/experimental/patterns/__snapshots__/ec2-app.test.ts.snap
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ exports[`The GuEc2AppExperimental pattern matches the snapshot 1`] = `
},
"ResourceSignal": {
"Count": 1,
"Timeout": "PT2M",
"Timeout": "PT3M",
},
},
"Properties": {
Expand Down Expand Up @@ -180,7 +180,7 @@ exports[`The GuEc2AppExperimental pattern matches the snapshot 1`] = `
"MaxBatchSize": 2,
"MinInstancesInService": 1,
"MinSuccessfulInstancesPercent": 100,
"PauseTime": "PT2M",
"PauseTime": "PT3M",
"SuspendProcesses": [
"AlarmNotification",
],
Expand Down
9 changes: 5 additions & 4 deletions src/experimental/patterns/ec2-app.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import { GuUserData } from "../../constructs/autoscaling";
import { GuStack } from "../../constructs/core";
import { simpleGuStackForTesting } from "../../utils/test";
import type { GuEc2AppExperimentalProps } from "./ec2-app";
import { GuEc2AppExperimental } from "./ec2-app";
import { GuEc2AppExperimental, RollingUpdateDurations } from "./ec2-app";

/**
* `Aspects` appear to run only at synth time.
Expand Down Expand Up @@ -102,11 +102,12 @@ describe("The GuEc2AppExperimental pattern", () => {
});
});

it("should have a PauseTime equal to the ASG healthcheck grace period", () => {
it("should have a PauseTime higher than the ASG healthcheck grace period", () => {
const stack = simpleGuStackForTesting();
const { autoScalingGroup } = new GuEc2AppExperimental(stack, initialProps(stack));

const tenMinutes = Duration.minutes(10);
const tenMinutesPlusBuffer = tenMinutes.plus(RollingUpdateDurations.buffer);

const cfnAsg = autoScalingGroup.node.defaultChild as CfnAutoScalingGroup;
cfnAsg.healthCheckGracePeriod = tenMinutes.toSeconds();
Expand All @@ -119,12 +120,12 @@ describe("The GuEc2AppExperimental pattern", () => {
},
CreationPolicy: {
ResourceSignal: {
Timeout: tenMinutes.toIsoString(),
Timeout: tenMinutesPlusBuffer.toIsoString(),
},
},
UpdatePolicy: {
AutoScalingRollingUpdate: {
PauseTime: tenMinutes.toIsoString(),
PauseTime: tenMinutesPlusBuffer.toIsoString(),
},
},
});
Expand Down
34 changes: 25 additions & 9 deletions src/experimental/patterns/ec2-app.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,29 @@ import type { GuEc2AppProps } from "../../patterns";
import { GuEc2App } from "../../patterns";
import { isSingletonPresentInStack } from "../../utils/singleton";

interface AutoScalingRollingUpdateDurations {
/**
* The time between each check of an instance's health within the target group.
*/
sleep: Duration;

/**
* Additional time to cover the time spent polling an instance's health within the target group before sending the CloudFormation signal.
*/
buffer: Duration;
}

export const RollingUpdateDurations: AutoScalingRollingUpdateDurations = {
sleep: Duration.seconds(5),
buffer: Duration.minutes(1),
};

/**
* Ensures the `AutoScalingRollingUpdate` of an AutoScaling Group has a `PauseTime` matching the healthcheck grace period.
* It also ensures the `CreationPolicy` resource signal `Timeout` matches the healthcheck grace period.
*
* @internal
*
* @privateRemarks
* The ASG healthcheck grace period is hard-coded by {@link GuEc2App}.
* Customisation of this value is performed via an escape hatch.
Expand Down Expand Up @@ -40,14 +59,11 @@ class AutoScalingRollingUpdateTimeout implements IAspect {
const currentRollingUpdate = construct.cfnOptions.updatePolicy?.autoScalingRollingUpdate;
const currentCreationPolicy = construct.cfnOptions.creationPolicy;

/**
* The type of `healthCheckGracePeriod` is `number | undefined`.
* In reality, it will always be set as we set it in {@link GuEc2App}.
* The right-hand side to appease the compiler; 5 minutes is the default value used by AWS.
*
* @see https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-attribute-updatepolicy.html#cfn-attributes-updatepolicy-rollingupdate-pausetime
*/
const signalTimeoutSeconds = construct.healthCheckGracePeriod ?? Duration.minutes(5).toSeconds();
if (!construct.healthCheckGracePeriod) {
throw new Error(`The healthcheck grace period not set for autoscaling group ${construct.node.id}.`);
}

const signalTimeoutSeconds = construct.healthCheckGracePeriod + RollingUpdateDurations.buffer.toSeconds();

if (currentRollingUpdate) {
construct.cfnOptions.updatePolicy = {
Expand Down Expand Up @@ -312,7 +328,7 @@ export class GuEc2AppExperimental extends GuEc2App {
until [ "$STATE" == "\\"healthy\\"" ]; do
echo "Instance not yet healthy within target group. Current state $STATE. Sleeping..."
sleep 5
sleep ${RollingUpdateDurations.sleep.toSeconds()}
STATE=$(aws elbv2 describe-target-health \
--target-group-arn ${targetGroup.targetGroupArn} \
--region ${region} \
Expand Down

0 comments on commit 2daaea1

Please sign in to comment.