From 9d5460f63482aa4b016459ce63016a716bd6cc1f Mon Sep 17 00:00:00 2001 From: Jean Schmidt <4520845+jeanschmidt@users.noreply.github.com> Date: Wed, 1 Feb 2023 15:34:29 +0100 Subject: [PATCH] =?UTF-8?q?improve=20retry=20mechanism,=20avoiding=20rely?= =?UTF-8?q?=20on=20AWS=20SQS=20retry=20and=20instead=20o=E2=80=A6=20(#1999?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …nly using self retry mechanism --- terraform-aws-github-runner/main.tf | 4 +- .../lambdas/runners/src/lambda.test.ts | 16 +- .../runners/lambdas/runners/src/lambda.ts | 136 ++++++++++------ .../runners/src/scale-runners/metrics.ts | 43 +++-- .../src/scale-runners/scale-up.test.ts | 22 +-- .../runners/src/scale-runners/scale-up.ts | 149 ++++++++---------- terraform-aws-github-runner/variables.tf | 8 +- 7 files changed, 224 insertions(+), 154 deletions(-) diff --git a/terraform-aws-github-runner/main.tf b/terraform-aws-github-runner/main.tf index 797aa307d1..e5d726e5fc 100644 --- a/terraform-aws-github-runner/main.tf +++ b/terraform-aws-github-runner/main.tf @@ -37,7 +37,7 @@ resource "aws_sqs_queue" "queued_builds" { fifo_queue = true content_based_deduplication = true max_message_size = 2048 - message_retention_seconds = var.runners_scale_up_sqs_max_retry * var.runners_scale_up_sqs_visibility_timeout + 100 + message_retention_seconds = var.runners_scale_up_sqs_message_ret_s redrive_policy = jsonencode({ deadLetterTargetArn = aws_sqs_queue.queued_builds_dead_letter.arn maxReceiveCount = var.runners_scale_up_sqs_max_retry @@ -57,7 +57,7 @@ resource "aws_sqs_queue" "queued_builds_retry" { name = "${var.environment}-queued-builds-retry" visibility_timeout_seconds = var.runners_scale_up_sqs_visibility_timeout max_message_size = 2048 - message_retention_seconds = var.runners_scale_up_sqs_max_retry * var.runners_scale_up_sqs_visibility_timeout + 100 + message_retention_seconds = var.runners_scale_up_sqs_message_ret_s redrive_policy = jsonencode({ deadLetterTargetArn = aws_sqs_queue.queued_builds_retry_dead_letter.arn maxReceiveCount = var.runners_scale_up_sqs_max_retry diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.test.ts index 7f84fcceff..41543b161d 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.test.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.test.ts @@ -6,17 +6,26 @@ import { Context, SQSEvent, ScheduledEvent } from 'aws-lambda'; import { mocked } from 'ts-jest/utils'; import { scaleDown } from './scale-runners/scale-down'; import { scaleUp, RetryableScalingError } from './scale-runners/scale-up'; +import * as MetricsModule from './scale-runners/metrics'; +const mockCloudWatch = { + putMetricData: jest.fn().mockImplementation(() => { + return { promise: jest.fn().mockResolvedValue(true) }; + }), +}; const mockSQS = { sendMessage: jest.fn().mockReturnValue({ promise: jest.fn() }), }; jest.mock('aws-sdk', () => ({ SQS: jest.fn().mockImplementation(() => mockSQS), + CloudWatch: jest.fn().mockImplementation(() => mockCloudWatch), })); jest.mock('./scale-runners/scale-down'); jest.mock('./scale-runners/scale-up'); +const metrics = new MetricsModule.ScaleUpMetrics(); + beforeEach(() => { jest.resetModules(); jest.clearAllMocks(); @@ -27,6 +36,7 @@ beforeEach(() => { describe('scaleUp', () => { beforeEach(() => { jest.spyOn(global.Math, 'random').mockReturnValue(1.0); + jest.spyOn(MetricsModule, 'ScaleUpMetrics').mockReturnValue(metrics); }); afterEach(() => { @@ -47,8 +57,8 @@ describe('scaleUp', () => { callback, ); expect(mockedScaleUp).toBeCalledTimes(2); - expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 1 }); - expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 2 }); + expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 1 }, metrics); + expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 2 }, metrics); expect(callback).toBeCalledTimes(1); expect(callback).toBeCalledWith(null); }); @@ -67,7 +77,7 @@ describe('scaleUp', () => { callback, ); expect(mockedScaleUp).toBeCalledTimes(1); - expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 1 }); + expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 1 }, metrics); expect(callback).toBeCalledTimes(1); expect(callback).toBeCalledWith('Failed handling SQS event'); }); diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts index daba2dfff1..02e0ad4cf3 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts @@ -1,71 +1,117 @@ -import { SQS } from 'aws-sdk'; +import { ActionRequestMessage, RetryableScalingError, scaleUp as scaleUpR } from './scale-runners/scale-up'; import { Context, SQSEvent, SQSRecord, ScheduledEvent } from 'aws-lambda'; import { Config } from './scale-runners/config'; -import { scaleDown as scaleDownR } from './scale-runners/scale-down'; -import { scaleUp as scaleUpR, RetryableScalingError, ActionRequestMessage } from './scale-runners/scale-up'; +import { SQS } from 'aws-sdk'; +import { ScaleUpMetrics, sendMetricsAtTimeout, sendMetricsTimeoutVars } from './scale-runners/metrics'; import { getDelayWithJitterRetryCount } from './scale-runners/utils'; +import { scaleDown as scaleDownR } from './scale-runners/scale-down'; + +async function sendRetryEvents(evtFailed: Array<[SQSRecord, boolean]>, metrics: ScaleUpMetrics) { + console.error(`Detected ${evtFailed.length} errors when processing messages, will retry relevant messages.`); + metrics.exception(); + + const sqs: SQS = new SQS(); + + for (const [evt, retryable] of evtFailed) { + const body: ActionRequestMessage = JSON.parse(evt.body); + const retryCount = body?.retryCount ?? 0; + + if ( + retryCount < Config.Instance.maxRetryScaleUpRecord && + (Config.Instance.retryScaleUpRecordQueueUrl?.length ?? 0) > 0 + ) { + if (retryable) { + metrics.scaleUpFailureRetryable(retryCount); + } else { + metrics.scaleUpFailureNonRetryable(retryCount); + } + + body.retryCount = retryCount + 1; + body.delaySeconds = Math.min( + 900, + getDelayWithJitterRetryCount( + retryCount, + Math.max(Config.Instance.retryScaleUpRecordDelayS, 20), + Config.Instance.retryScaleUpRecordJitterPct, + ), + ); + + const sqsPayload: SQS.SendMessageRequest = { + DelaySeconds: body.delaySeconds, + MessageBody: JSON.stringify(body), + QueueUrl: Config.Instance.retryScaleUpRecordQueueUrl as string, + }; + + await sqs.sendMessage(sqsPayload).promise(); + console.warn(`Sent message: ${evt.body}`); + } else { + console.error(`Permanently abandoning message: ${evt.body}`); + } + } +} // eslint-disable-next-line @typescript-eslint/no-explicit-any export async function scaleUp(event: SQSEvent, context: Context, callback: any) { - console.dir(event, { depth: 5 }); + let success = false; + const metrics = new ScaleUpMetrics(); + + const sndMetricsTimout: sendMetricsTimeoutVars = { + metrics: metrics, + }; + sndMetricsTimout.setTimeout = setTimeout( + sendMetricsAtTimeout(sndMetricsTimout), + (Config.Instance.lambdaTimeout - 10) * 1000, + ); + try { - const evtFailed: Array = []; + const evtFailed: Array<[SQSRecord, boolean]> = []; + + recordsIterProcess: for (let i = 0; i < event.Records.length; i += 1) { + const evt = event.Records[i]; - for (const evt of event.Records) { try { - await scaleUpR(evt.eventSource, JSON.parse(evt.body)); + await scaleUpR(evt.eventSource, JSON.parse(evt.body), metrics); + metrics.scaleUpSuccess(); } catch (e) { if (e instanceof RetryableScalingError) { console.error(`Retryable error thrown: "${e.message}"`); - evtFailed.push(evt); + evtFailed.push([evt, true]); } else { - throw e; + console.error(`Non-retryable error during request: "${e.message}"`); + console.error(`All remaning '${event.Records.length - i}' messages will be scheduled to retry`); + for (let ii = i; ii < event.Records.length; ii += 1) { + evtFailed.push([event.Records[ii], false]); + } + break recordsIterProcess; } } } if (evtFailed.length > 0) { - console.error(`Detected ${evtFailed.length} errors when processing messages, will retry relevant messages.`); - - const sqs: SQS = new SQS(); - - for (const evt of evtFailed) { - const body: ActionRequestMessage = JSON.parse(evt.body); - const retryCount = body?.retryCount ?? 0; - - if ( - retryCount < Config.Instance.maxRetryScaleUpRecord && - (Config.Instance.retryScaleUpRecordQueueUrl?.length ?? 0) > 0 - ) { - body.retryCount = retryCount + 1; - body.delaySeconds = Math.min( - 900, - getDelayWithJitterRetryCount( - retryCount, - Math.max(Config.Instance.retryScaleUpRecordDelayS, 20), - Config.Instance.retryScaleUpRecordJitterPct, - ), - ); - - const sqsPayload: SQS.SendMessageRequest = { - DelaySeconds: body.delaySeconds, - MessageBody: JSON.stringify(body), - QueueUrl: Config.Instance.retryScaleUpRecordQueueUrl as string, - }; - - await sqs.sendMessage(sqsPayload).promise(); - console.warn(`Sent message: ${evt.body}`); - } else { - console.error(`Permanently abandoning message: ${evt.body}`); - } - } + await sendRetryEvents(evtFailed, metrics); } - return callback(null); + success = evtFailed.every((i) => { + return i[1]; + }); } catch (e) { console.error(e); - return callback('Failed handling SQS event'); + } finally { + try { + clearTimeout(sndMetricsTimout.setTimeout); + sndMetricsTimout.metrics = undefined; + sndMetricsTimout.setTimeout = undefined; + metrics.sendMetrics(); + } catch (e) { + console.error(`Error sending metrics: ${e}`); + } + + if (success) { + callback(null); + } else { + callback('Failed handling SQS event'); + } } } diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts index b39aab4425..e17a62b5b5 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts @@ -220,6 +220,16 @@ export class Metrics { } } + /* istanbul ignore next */ + run() { + this.countEntry('run.count'); + } + + /* istanbul ignore next */ + exception() { + this.countEntry('run.exceptions_count'); + } + // GitHub API CALLS /* istanbul ignore next */ createAppAuthGHCallSuccess(ms: number) { @@ -690,6 +700,29 @@ export class ScaleUpMetrics extends Metrics { this.countEntry('run.skip', 1, this.getRepoDim(repo)); } + /* istanbul ignore next */ + scaleUpSuccess() { + this.countEntry('run.scaleup.success'); + } + + /* istanbul ignore next */ + scaleUpFailureRetryable(retries: number) { + this.countEntry('run.scaleup.failure.total.count'); + this.addEntry('run.scaleup.failure.total.retries', retries); + + this.countEntry('run.scaleup.failure.retryable.count'); + this.addEntry('run.scaleup.failure.retryable.retries', retries); + } + + /* istanbul ignore next */ + scaleUpFailureNonRetryable(retries: number) { + this.countEntry('run.scaleup.failure.total.count'); + this.addEntry('run.scaleup.failure.total.retries', retries); + + this.countEntry('run.scaleup.failure.nonretryable.count'); + this.addEntry('run.scaleup.failure.nonretryable.retries', retries); + } + /* istanbul ignore next */ ghRunnersRepoStats(repo: Repo, runnerType: string, total: number, labeled: number, busy: number) { const dimensions = this.getRepoDim(repo); @@ -794,16 +827,6 @@ export class ScaleDownMetrics extends Metrics { super('scaleDown'); } - /* istanbul ignore next */ - run() { - this.countEntry('run.count'); - } - - /* istanbul ignore next */ - exception() { - this.countEntry('run.exceptions_count'); - } - /* istanbul ignore next */ runnerLessMinimumTime(ec2Runner: RunnerInfo) { this.countEntry(`run.ec2runners.notMinTime`); diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts index 4a372e4f58..9a3513a828 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.test.ts @@ -50,7 +50,7 @@ describe('scaleUp', () => { installationId: 2, runnerLabels: [], }; - await expect(scaleUp('other', payload)).rejects.toThrow('Cannot handle non-SQS events!'); + await expect(scaleUp('other', payload, metrics)).rejects.toThrow('Cannot handle non-SQS events!'); }); it('provides runnerLabels that aren`t present on runnerTypes', async () => { @@ -80,7 +80,7 @@ describe('scaleUp', () => { ); const mockedListGithubRunners = mocked(listGithubRunnersRepo); - await scaleUp('aws:sqs', payload); + await scaleUp('aws:sqs', payload, metrics); expect(mockedGetRunnerTypes).toBeCalledTimes(1); expect(mockedGetRunnerTypes).toBeCalledWith({ repo: 'repo', owner: 'owner' }, metrics); @@ -189,7 +189,7 @@ describe('scaleUp', () => { ]); const mockedCreateRegistrationTokenForRepo = mocked(createRegistrationTokenRepo); - await scaleUp('aws:sqs', payload); + await scaleUp('aws:sqs', payload, metrics); expect(mockedGetRunnerTypes).toBeCalledTimes(1); expect(mockedGetRunnerTypes).toBeCalledWith(repo, metrics); @@ -261,7 +261,7 @@ describe('scaleUp', () => { const mockedCreateRegistrationTokenForOrg = mocked(createRegistrationTokenOrg).mockResolvedValue(token); const mockedCreateRunner = mocked(createRunner); - await scaleUp('aws:sqs', payload); + await scaleUp('aws:sqs', payload, metrics); expect(mockedListGithubRunnersOrg).toBeCalledWith(repo.owner, metrics); expect(mockedCreateRunner).toBeCalledTimes(1); @@ -344,7 +344,7 @@ describe('scaleUp', () => { const mockedCreateRegistrationTokenForRepo = mocked(createRegistrationTokenRepo).mockResolvedValue(token); const mockedCreateRunner = mocked(createRunner); - await scaleUp('aws:sqs', payload); + await scaleUp('aws:sqs', payload, metrics); expect(mockedCreateRunner).toBeCalledTimes(1); expect(mockedCreateRunner).toBeCalledWith( @@ -426,7 +426,7 @@ describe('scaleUp', () => { const mockedCreateRegistrationTokenForRepo = mocked(createRegistrationTokenRepo).mockResolvedValue(token); const mockedCreateRunner = mocked(createRunner); - await scaleUp('aws:sqs', payload); + await scaleUp('aws:sqs', payload, metrics); expect(mockedCreateRunner).toBeCalledTimes(1); expect(mockedCreateRunner).toBeCalledWith( @@ -508,7 +508,7 @@ describe('scaleUp', () => { const mockedCreateRegistrationTokenForRepo = mocked(createRegistrationTokenRepo).mockResolvedValue(token); const mockedCreateRunner = mocked(createRunner); - await scaleUp('aws:sqs', payload); + await scaleUp('aws:sqs', payload, metrics); expect(mockedCreateRunner).toBeCalledTimes(1); expect(mockedCreateRunner).toBeCalledWith( @@ -573,7 +573,7 @@ describe('scaleUp', () => { ]); const mockedCreateRegistrationTokenForRepo = mocked(createRegistrationTokenRepo); - await scaleUp('aws:sqs', payload); + await scaleUp('aws:sqs', payload, metrics); expect(mockedCreateRegistrationTokenForRepo).not.toBeCalled(); }); @@ -623,7 +623,7 @@ describe('scaleUp', () => { mocked(createRegistrationTokenRepo).mockResolvedValue(token); const mockedCreateRunner = mocked(createRunner); - await scaleUp('aws:sqs', payload); + await scaleUp('aws:sqs', payload, metrics); expect(mockedCreateRunner).toBeCalledWith( { @@ -668,7 +668,7 @@ describe('scaleUp', () => { mockedGetRepoIssuesWithLabel.mockResolvedValueOnce([{ something: 1 }] as unknown as GhIssues); mockedGetRepoIssuesWithLabel.mockResolvedValueOnce([]); - await scaleUp('aws:sqs', payload); + await scaleUp('aws:sqs', payload, metrics); expect(mockedGetRunnerTypes).not.toBeCalled(); expect(mockedGetRepoIssuesWithLabel).toBeCalledTimes(2); expect(mockedGetRepoIssuesWithLabel).toBeCalledWith(repo, config.mustHaveIssuesLabels[0], metrics); @@ -701,7 +701,7 @@ describe('scaleUp', () => { mockedGetRepoIssuesWithLabel.mockResolvedValueOnce([]); mockedGetRepoIssuesWithLabel.mockResolvedValueOnce([{ something: 1 }] as unknown as GhIssues); - await scaleUp('aws:sqs', payload); + await scaleUp('aws:sqs', payload, metrics); expect(mockedGetRunnerTypes).not.toBeCalled(); expect(mockedGetRepoIssuesWithLabel).toBeCalledTimes(2); expect(mockedGetRepoIssuesWithLabel).toBeCalledWith(repo, config.cantHaveIssuesLabels[0], metrics); diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts index 4bee77c8a4..227ac64557 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up.ts @@ -1,4 +1,4 @@ -import { Metrics, ScaleUpMetrics, sendMetricsAtTimeout, sendMetricsTimeoutVars } from './metrics'; +import { Metrics, ScaleUpMetrics } from './metrics'; import { Repo, getRepoKey } from './utils'; import { RunnerType, RunnerInputParameters, createRunner } from './runners'; import { @@ -30,17 +30,14 @@ export class RetryableScalingError extends Error { } } -export async function scaleUp(eventSource: string, payload: ActionRequestMessage): Promise { - if (eventSource !== 'aws:sqs') throw Error('Cannot handle non-SQS events!'); - - const metrics = new ScaleUpMetrics(); - const sndMetricsTimout: sendMetricsTimeoutVars = { - metrics: metrics, - }; - sndMetricsTimout.setTimeout = setTimeout( - sendMetricsAtTimeout(sndMetricsTimout), - (Config.Instance.lambdaTimeout - 10) * 1000, - ); +export async function scaleUp( + eventSource: string, + payload: ActionRequestMessage, + metrics: ScaleUpMetrics, +): Promise { + if (eventSource !== 'aws:sqs') { + throw Error('Cannot handle non-SQS events!'); + } const repo: Repo = { owner: payload.repositoryOwner, @@ -49,81 +46,69 @@ export async function scaleUp(eventSource: string, payload: ActionRequestMessage const errors = []; - try { - if (await shouldSkipForRepo(repo, metrics)) { - metrics.skipRepo(repo); - return; - } + if (await shouldSkipForRepo(repo, metrics)) { + metrics.skipRepo(repo); + return; + } - metrics.runRepo(repo); + metrics.runRepo(repo); + metrics.run(); - const runnerTypes = await getRunnerTypes( - { - owner: repo.owner, - repo: Config.Instance.enableOrganizationRunners ? Config.Instance.scaleConfigRepo : repo.repo, - }, - metrics, - ); - /* istanbul ignore next */ - const runnerLabels = payload?.runnerLabels ?? Array.from(runnerTypes.keys()); - - // ideally we should only have one label specfied but loop so we can go through them all if there are multiple - // if no labels are found this should just be a no-op - for (const runnerLabel of runnerLabels) { - const runnerType = runnerTypes.get(runnerLabel); - if (runnerType === undefined) { - console.info(`Runner label '${runnerLabel}' was not found in config for ` + `${repo.owner}/${repo.repo}`); - continue; - } - if ( - await allRunnersBusy( - runnerType.runnerTypeName, - repo, - runnerType.is_ephemeral, - runnerType.max_available, - metrics, - ) - ) { - try { - const createRunnerParams: RunnerInputParameters = { - environment: Config.Instance.environment, - runnerConfig: (awsRegion: string) => { - return createRunnerConfigArgument(runnerType, repo, payload.installationId, metrics, awsRegion); - }, - runnerType: runnerType, - }; - if (Config.Instance.enableOrganizationRunners) { - createRunnerParams.orgName = repo.owner; - } else { - createRunnerParams.repoName = getRepoKey(repo); - } - const awsRegion = await createRunner(createRunnerParams, metrics); - if (Config.Instance.enableOrganizationRunners) { - metrics.runnersOrgCreate(repo.owner, runnerType.runnerTypeName, awsRegion); - } else { - metrics.runnersRepoCreate(repo, runnerType.runnerTypeName, awsRegion); - } - } catch (e) { - errors.push(e); - - /* istanbul ignore next */ - if (Config.Instance.enableOrganizationRunners) { - metrics.runnersOrgCreateFail(repo.owner, runnerType.runnerTypeName); - } else { - metrics.runnersRepoCreateFail(repo, runnerType.runnerTypeName); - } - /* istanbul ignore next */ - console.error(`Error spinning up instance of type ${runnerType.runnerTypeName}: ${e}`); + const runnerTypes = await getRunnerTypes( + { + owner: repo.owner, + repo: Config.Instance.enableOrganizationRunners ? Config.Instance.scaleConfigRepo : repo.repo, + }, + metrics, + ); + /* istanbul ignore next */ + const runnerLabels = payload?.runnerLabels ?? Array.from(runnerTypes.keys()); + + // ideally we should only have one label specfied but loop so we can go through them all if there are multiple + // if no labels are found this should just be a no-op + for (const runnerLabel of runnerLabels) { + const runnerType = runnerTypes.get(runnerLabel); + if (runnerType === undefined) { + console.info(`Runner label '${runnerLabel}' was not found in config for ` + `${repo.owner}/${repo.repo}`); + continue; + } + if ( + await allRunnersBusy(runnerType.runnerTypeName, repo, runnerType.is_ephemeral, runnerType.max_available, metrics) + ) { + try { + const createRunnerParams: RunnerInputParameters = { + environment: Config.Instance.environment, + runnerConfig: (awsRegion: string) => { + return createRunnerConfigArgument(runnerType, repo, payload.installationId, metrics, awsRegion); + }, + runnerType: runnerType, + }; + if (Config.Instance.enableOrganizationRunners) { + createRunnerParams.orgName = repo.owner; + } else { + createRunnerParams.repoName = getRepoKey(repo); } - } else { - console.info('There are available runners, no new runners will be created'); + const awsRegion = await createRunner(createRunnerParams, metrics); + if (Config.Instance.enableOrganizationRunners) { + metrics.runnersOrgCreate(repo.owner, runnerType.runnerTypeName, awsRegion); + } else { + metrics.runnersRepoCreate(repo, runnerType.runnerTypeName, awsRegion); + } + } catch (e) { + errors.push(e); + + /* istanbul ignore next */ + if (Config.Instance.enableOrganizationRunners) { + metrics.runnersOrgCreateFail(repo.owner, runnerType.runnerTypeName); + } else { + metrics.runnersRepoCreateFail(repo, runnerType.runnerTypeName); + } + /* istanbul ignore next */ + console.error(`Error spinning up instance of type ${runnerType.runnerTypeName}: ${e}`); } + } else { + console.info('There are available runners, no new runners will be created'); } - } finally { - clearTimeout(sndMetricsTimout.setTimeout); - sndMetricsTimout.metrics = undefined; - sndMetricsTimout.setTimeout = undefined; - metrics.sendMetrics(); } if (errors.length > 0) { diff --git a/terraform-aws-github-runner/variables.tf b/terraform-aws-github-runner/variables.tf index 0ca91b9dca..da07ab9cc2 100644 --- a/terraform-aws-github-runner/variables.tf +++ b/terraform-aws-github-runner/variables.tf @@ -90,7 +90,13 @@ variable "runners_lambda_zip" { variable "runners_scale_up_sqs_max_retry" { description = "max retry count for messages in the scale up sqs." type = number - default = 3 + default = 1 +} + +variable "runners_scale_up_sqs_message_ret_s" { + description = "scale up SQS message retention timeout (seconds)" + type = number + default = 7200 } variable "runners_scale_up_sqs_visibility_timeout" {