diff --git a/build-logic/src/main/kotlin/clusterless.java-common-conventions.gradle.kts b/build-logic/src/main/kotlin/clusterless.java-common-conventions.gradle.kts index 573e5485..a6d8b415 100644 --- a/build-logic/src/main/kotlin/clusterless.java-common-conventions.gradle.kts +++ b/build-logic/src/main/kotlin/clusterless.java-common-conventions.gradle.kts @@ -152,7 +152,7 @@ dependencies { implementationAndTestFixture("javax.annotation:javax.annotation-api:1.3.2") - implementationAndTestFixture("io.github.resilience4j:resilience4j-retry:2.0.2") + implementationAndTestFixture("io.github.resilience4j:resilience4j-retry:2.1.0") testImplementationAndTestFixture("org.mockito:mockito-core:5.7.0") diff --git a/clusterless-main-common/src/main/java/clusterless/cls/process/ProcessExec.java b/clusterless-main-common/src/main/java/clusterless/cls/process/ProcessExec.java index cf578b8f..a68f011d 100644 --- a/clusterless-main-common/src/main/java/clusterless/cls/process/ProcessExec.java +++ b/clusterless-main-common/src/main/java/clusterless/cls/process/ProcessExec.java @@ -65,9 +65,7 @@ protected int executeProcess(Map environment, List args) return process(environment, args); } - if (retry()) { - LOG.info("enabled retrying command: {} {} times", args, retries); - } + LOG.info("enabled retrying command: {} {} times", args, retries); RetryConfig config = RetryConfig.custom() .maxAttempts(retries) diff --git a/clusterless-substrate-aws-common/build.gradle.kts b/clusterless-substrate-aws-common/build.gradle.kts index cfc55176..47354b02 100644 --- a/clusterless-substrate-aws-common/build.gradle.kts +++ b/clusterless-substrate-aws-common/build.gradle.kts @@ -22,6 +22,8 @@ dependencies { implementation("com.fasterxml.jackson.datatype:jackson-datatype-joda") implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310") + implementation("io.github.resilience4j:resilience4j-retry") + api("com.jayway.jsonpath:json-path") implementation("software.amazon.awssdk:s3") diff --git a/clusterless-substrate-aws-common/src/main/java/clusterless/cls/substrate/aws/sdk/ClientRetry.java b/clusterless-substrate-aws-common/src/main/java/clusterless/cls/substrate/aws/sdk/ClientRetry.java new file mode 100644 index 00000000..f5576f88 --- /dev/null +++ b/clusterless-substrate-aws-common/src/main/java/clusterless/cls/substrate/aws/sdk/ClientRetry.java @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2023 Chris K Wensel . All Rights Reserved. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +package clusterless.cls.substrate.aws.sdk; + +import io.github.resilience4j.core.IntervalFunction; +import io.github.resilience4j.retry.Retry; +import io.github.resilience4j.retry.RetryConfig; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import software.amazon.awssdk.awscore.AwsClient; + +import java.time.Duration; +import java.util.function.Predicate; +import java.util.function.Supplier; + +public class ClientRetry { + private static final Logger LOG = LoggerFactory.getLogger(ClientRetry.class); + private final RetryConfig config; + private final String client; + + public ClientRetry(String client, int maxAttempts, Predicate.Response> predicate) { + this.client = client; + this.config = RetryConfig..Response>custom() + .maxAttempts(maxAttempts) + .intervalFunction(IntervalFunction.ofExponentialBackoff(Duration.ofSeconds(30), 2.0, Duration.ofMinutes(5))) + .consumeResultBeforeRetryAttempt((attempt, response) -> LOG.warn("got: {}, for retry attempt: {} of {}", response.errorMessage(), attempt, maxAttempts)) + .retryOnResult(predicate) + .failAfterMaxAttempts(true) + .build(); + } + + public ClientBase.Response invoke(Supplier.Response> checkedSupplier) { + return Retry.of(client, config) + .executeSupplier(checkedSupplier); + } +} diff --git a/clusterless-substrate-aws-lambda-common/build.gradle.kts b/clusterless-substrate-aws-lambda-common/build.gradle.kts index 365a555d..62c79f24 100644 --- a/clusterless-substrate-aws-lambda-common/build.gradle.kts +++ b/clusterless-substrate-aws-lambda-common/build.gradle.kts @@ -56,6 +56,4 @@ dependencies { testFixturesImplementation("uk.org.webcompere:system-stubs-core") testFixturesImplementation("uk.org.webcompere:system-stubs-jupiter") testFixturesImplementation("org.mockito:mockito-inline") - -// implementation("io.github.resilience4j:resilience4j-retry") } diff --git a/clusterless-substrate-aws-lambda-transform/src/main/java/clusterless/aws/lambda/activity/cloudwatch/CloudWatchExportActivityHandler.java b/clusterless-substrate-aws-lambda-transform/src/main/java/clusterless/aws/lambda/activity/cloudwatch/CloudWatchExportActivityHandler.java index bd07df72..d53632a0 100644 --- a/clusterless-substrate-aws-lambda-transform/src/main/java/clusterless/aws/lambda/activity/cloudwatch/CloudWatchExportActivityHandler.java +++ b/clusterless-substrate-aws-lambda-transform/src/main/java/clusterless/aws/lambda/activity/cloudwatch/CloudWatchExportActivityHandler.java @@ -10,6 +10,7 @@ import clusterless.aws.lambda.EventHandler; import clusterless.aws.lambda.transform.json.event.AWSEvent; +import clusterless.cls.substrate.aws.sdk.ClientRetry; import clusterless.cls.substrate.aws.sdk.CloudWatchLogs; import clusterless.cls.util.Env; import clusterless.commons.temporal.IntervalBuilder; @@ -17,6 +18,8 @@ import com.google.common.base.Stopwatch; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import software.amazon.awssdk.services.cloudwatchlogs.CloudWatchLogsClient; +import software.amazon.awssdk.services.cloudwatchlogs.model.LimitExceededException; import java.net.URI; import java.time.Duration; @@ -31,7 +34,9 @@ public class CloudWatchExportActivityHandler extends EventHandler retryClient = new ClientRetry<>("cloudwatchlogs", 5, r -> r.exception() instanceof LimitExceededException); protected final IntervalBuilder intervalBuilder = new IntervalBuilder(activityProps.interval); @@ -90,8 +95,11 @@ public void handleEvent(AWSEvent event, Context context, CloudWatchExportActivit return; } + // there is a service quota here: https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/cloudwatch_limits_cwl.html + // Export task: One active (running or pending) export task at a time, per account. This quota can't be changed. + // Caused by: software.amazon.awssdk.services.cloudwatchlogs.model.LimitExceededException: Resource limit exceeded. (Service: CloudWatchLogs, Status Code: 400, Request ID: 79069256-8403-4435-a85c-17013c7d7c4c) getStopwatch.start(); - CloudWatchLogs.Response response = cloudWatchLogs.createExportLogGroupTask(taskName, logGroupName, logStreamPrefix, destination, startTimeInclusive, endTimeInclusive); + CloudWatchLogs.Response response = retryClient.invoke(() -> cloudWatchLogs.createExportLogGroupTask(taskName, logGroupName, logStreamPrefix, destination, startTimeInclusive, endTimeInclusive)); getStopwatch.stop(); response.isSuccessOrThrowRuntime(