WIP: Feature/composite alarm scaling #246

Closed
285 changes: 254 additions & 31 deletions infra/modules/sagemaker_deployment/cloudwatch.tf
@@ -1,44 +1,267 @@
resource "aws_cloudwatch_metric_alarm" "cloudwatch_alarm" {
count = length(var.alarms)
resource "aws_cloudwatch_composite_alarm" "scale_up_from_n_to_np1" {
alarm_name = "scale_up_from_n_to_np1"
alarm_description = "Where there exists a high backlog and a high state of any of CPU, GPU, RAM, HardDisk (i.e. live instances are insufficient for the tasks being performed)"

alarm_name = "${var.alarms[count.index].alarm_name_prefix}-${aws_sagemaker_endpoint.main.name}"
alarm_description = var.alarms[count.index].alarm_description
metric_name = var.alarms[count.index].metric_name
namespace = var.alarms[count.index].namespace
comparison_operator = var.alarms[count.index].comparison_operator
threshold = var.alarms[count.index].threshold
evaluation_periods = var.alarms[count.index].evaluation_periods
datapoints_to_alarm = var.alarms[count.index].datapoints_to_alarm
period = var.alarms[count.index].period
statistic = var.alarms[count.index].statistic
alarm_actions = concat(var.alarms[count.index].alarm_actions, [aws_sns_topic.alarmstate[count.index].arn])
ok_actions = concat(var.alarms[count.index].ok_actions, [aws_sns_topic.okstate[count.index].arn])
dimensions = (count.index == 0 || count.index == 1 || count.index == 2) ? { # TODO: this logic is brittle as it assumes "backlog" has index [0,1,2]; it would be better to have a logic that rests on the specific name of that metric
EndpointName = aws_sagemaker_endpoint.main.name # Only EndpointName is used in this case
} : {
EndpointName = aws_sagemaker_endpoint.main.name, # Both EndpointName and VariantName are used in all other cases
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name # Note this logic would not work if there were ever more than one production variant deployed for an LLM
}
alarm_actions = [aws_appautoscaling_policy.scale_up_from_n_to_np1.arn]
ok_actions = []

alarm_rule = "ALARM(${aws_cloudwatch_metric_alarm.backlog_high.alarm_name}) AND (ALARM(${aws_cloudwatch_metric_alarm.cpu_high.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.gpu_high.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.ram_high.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.harddisk_high.alarm_name}))"
}

resource "aws_cloudwatch_composite_alarm" "scale_up_from_0_to_1" {
alarm_name = "scale_up_from_0_to_1"
alarm_description = "Where there exists a high backlog and there exists a state of insufficient data for any of CPU, GPU, RAM, HardDisk (i.e. there are tasks to do but no instance is live to perform it)"

alarm_actions = [aws_appautoscaling_policy.scale_up_from_0_to_1.arn]
ok_actions = []

alarm_rule = "ALARM(${aws_cloudwatch_metric_alarm.backlog_high.alarm_name}) AND (INSUFFICIENT_DATA(${aws_cloudwatch_metric_alarm.cpu_high.alarm_name}) OR INSUFFICIENT_DATA(${aws_cloudwatch_metric_alarm.gpu_high.alarm_name}) OR INSUFFICIENT_DATA(${aws_cloudwatch_metric_alarm.ram_high.alarm_name}) OR INSUFFICIENT_DATA(${aws_cloudwatch_metric_alarm.harddisk_high.alarm_name}))"
}


resource "aws_cloudwatch_composite_alarm" "scale_down_from_n_to_nm1" {
alarm_name = "scale_down_from_n_to_nm1"
alarm_description = "Where there exists a high backlog and a low state of any of CPU, GPU, RAM, HardDisk (i.e. live instances are excessive for the current tasks)"

alarm_actions = [aws_appautoscaling_policy.scale_down_from_n_to_nm1.arn]
ok_actions = []

alarm_rule = "ALARM(${aws_cloudwatch_metric_alarm.backlog_high.alarm_name}) AND (ALARM(${aws_cloudwatch_metric_alarm.cpu_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.gpu_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.ram_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.harddisk_low.alarm_name}))"
}


resource "aws_cloudwatch_composite_alarm" "scale_down_from_n_to_0" {
alarm_name = "example-composite-alarm"
alarm_description = "Where there exists a low backlog and a low state of any of CPU, GPU, RAM, HardDisk (i.e. there is no task to come and live instances are excessive for any tasks currently in process)"

alarm_actions = [aws_appautoscaling_policy.scale_down_from_n_to_0.arn]
ok_actions = []

alarm_rule = "ALARM(${aws_cloudwatch_metric_alarm.backlog_low.alarm_name}) AND (ALARM(${aws_cloudwatch_metric_alarm.cpu_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.gpu_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.ram_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.harddisk_low.alarm_name}))"
}


resource "aws_cloudwatch_metric_alarm" "backlog_high" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-backlog-high"
alarm_description = "Alarm when in high Backlog Usage"
metric_name = "ApproximateBacklogSize"
namespace = "AWS/SageMaker"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = var.backlog_threshold_high
evaluation_periods = var.evaluation_periods_high
datapoints_to_alarm = var.datapoints_to_alarm_high
period = 60
statistic = "Maximum"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "backlog_low" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-backlog-low"
alarm_description = "Alarm when in low Backlog Usage"
metric_name = "ApproximateBacklogSize"
namespace = "AWS/SageMaker"
comparison_operator = "LessThanThreshold"
threshold = var.backlog_threshold_low
evaluation_periods = var.evaluation_periods_low
datapoints_to_alarm = var.datapoints_to_alarm_low
period = 60
statistic = "Maximum"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "cpu_high" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-cpu-high"
alarm_description = "Alarm when in high vCPU Usage"
metric_name = "CPUUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = var.cpu_threshold_high
evaluation_periods = var.evaluation_periods_high
datapoints_to_alarm = var.datapoints_to_alarm_high
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "cpu_low" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-cpu-low"
alarm_description = "Alarm when in low vCPU Usage"
metric_name = "CPUUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "LessThanOrEqualToThreshold"
threshold = var.cpu_threshold_low
evaluation_periods = var.evaluation_periods_low
datapoints_to_alarm = var.datapoints_to_alarm_low
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "gpu_high" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-gpu-high"
alarm_description = "Alarm when in high GPU Usage"
metric_name = "GPUUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = var.gpu_threshold_high
evaluation_periods = var.evaluation_periods_high
datapoints_to_alarm = var.datapoints_to_alarm_high
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "gpu_low" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-gpu-low"
alarm_description = "Alarm when in low GPU Usage"
metric_name = "GPUUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "LessThanOrEqualToThreshold"
threshold = var.gpu_threshold_low
evaluation_periods = var.evaluation_periods_low
datapoints_to_alarm = var.datapoints_to_alarm_low
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "ram_high" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-ram-high"
alarm_description = "Alarm when in high RAM Usage"
metric_name = "MemoryUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = var.ram_threshold_high
evaluation_periods = var.evaluation_periods_high
datapoints_to_alarm = var.datapoints_to_alarm_high
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "ram_low" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-ram-low"
alarm_description = "Alarm when in low RAM Usage"
metric_name = "MemoryUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "LessThanOrEqualToThreshold"
threshold = var.ram_threshold_low
evaluation_periods = var.evaluation_periods_low
datapoints_to_alarm = var.datapoints_to_alarm_low
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "null_resource" "wait_for_metric_alarms" {
# Aggregates metric alarm dependencies so composite alarms are created or deleted only after the metric alarms, preventing cyclic dependency issues.
depends_on = [aws_cloudwatch_metric_alarm.cloudwatch_alarm]
resource "aws_cloudwatch_metric_alarm" "harddisk_high" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-harddisk-high"
alarm_description = "Alarm when in high HardDisk Usage"
metric_name = "DiskUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = var.harddisk_threshold_high
evaluation_periods = var.evaluation_periods_high
datapoints_to_alarm = var.datapoints_to_alarm_high
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_composite_alarm" "composite_alarm" {
count = length(var.alarm_composites)
resource "aws_cloudwatch_metric_alarm" "harddisk_low" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-harddisk-low"
alarm_description = "Alarm when in low RAM Usage"
metric_name = "MemoryUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "LessThanOrEqualToThreshold"
threshold = var.ram_threshold_low
evaluation_periods = var.evaluation_periods_low
datapoints_to_alarm = var.datapoints_to_alarm_low
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

alarm_name = "${var.alarm_composites[count.index].alarm_name}-${aws_sagemaker_endpoint.main.name}"
alarm_description = var.alarm_composites[count.index].alarm_description
alarm_rule = var.alarm_composites[count.index].alarm_rule
alarm_actions = concat(var.alarm_composites[count.index].alarm_actions, [aws_sns_topic.alarm_composite_notifications[count.index].arn], [aws_sns_topic.composite_alarmstate[count.index].arn])
ok_actions = var.alarm_composites[count.index].ok_actions
depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarm_composite_notifications, aws_sns_topic.composite_alarmstate, null_resource.wait_for_metric_alarms]

resource "aws_cloudwatch_metric_alarm" "unauthorized_operations" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-unauthorized-operations"
alarm_description = "Alarm when unauthorized operations are detected in the CloudTrail Logs"
metric_name = "UnauthorizedOperationsCount"
namespace = "CloudTrailMetrics"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = 1
evaluation_periods = 1
datapoints_to_alarm = 1
period = 60
statistic = "Maximum"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "errors_4xx" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-errors-4XX"
alarm_description = "4XX errors are detected in the CloudTrail Logs"
metric_name = "Invocation4XXErrors"
namespace = "AWS/SageMaker"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = 1
evaluation_periods = 1
datapoints_to_alarm = 1
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}
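
A note on the scaling side: the alarm_actions of the composite alarms above reference aws_appautoscaling_policy resources (scale_up_from_n_to_np1, scale_up_from_0_to_1, scale_down_from_n_to_nm1, scale_down_from_n_to_0) that are not included in this diff. As a rough sketch, one such policy could be a StepScaling policy attached to a SageMaker variant scalable target, along the lines below; the resource names, capacity bounds, cooldown, and adjustment values are illustrative assumptions rather than code from this PR.

# Hypothetical sketch of the step-scaling target and policy a composite alarm invokes.
# Names, capacities, cooldown, and adjustments are assumptions, not taken from this PR.
resource "aws_appautoscaling_target" "sagemaker_variant" {
  max_capacity       = 4
  min_capacity       = 0
  resource_id        = "endpoint/${aws_sagemaker_endpoint.main.name}/variant/${aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name}"
  scalable_dimension = "sagemaker:variant:DesiredInstanceCount"
  service_namespace  = "sagemaker"
}

resource "aws_appautoscaling_policy" "scale_up_from_n_to_np1" {
  name               = "scale_up_from_n_to_np1"
  policy_type        = "StepScaling"
  resource_id        = aws_appautoscaling_target.sagemaker_variant.resource_id
  scalable_dimension = aws_appautoscaling_target.sagemaker_variant.scalable_dimension
  service_namespace  = aws_appautoscaling_target.sagemaker_variant.service_namespace

  step_scaling_policy_configuration {
    adjustment_type         = "ChangeInCapacity"
    cooldown                = 300
    metric_aggregation_type = "Maximum"

    # Add one instance each time the composite alarm fires.
    step_adjustment {
      metric_interval_lower_bound = 0
      scaling_adjustment          = 1
    }
  }
}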
29 changes: 5 additions & 24 deletions infra/modules/sagemaker_deployment/lambda.tf
@@ -14,45 +14,26 @@ resource "aws_lambda_function" "slack_alert_function" {
runtime = "python3.12"
timeout = 30

environment {
variables = {
SNS_TO_WEBHOOK_JSON = jsonencode(local.sns_to_webhook_mapping),
ADDRESS = "arn:aws:sns:eu-west-2:${var.aws_account_id}:"
}
}
}


resource "aws_lambda_permission" "allow_sns_composite" {
count = length(var.alarm_composites)
resource "aws_lambda_permission" "allow_sns_okstate" {

statement_id = "AllowSNS-composite-${count.index}"
statement_id = "AllowSNS-ok"
action = "lambda:InvokeFunction"
function_name = aws_lambda_function.slack_alert_function.function_name
principal = "sns.amazonaws.com"
source_arn = aws_sns_topic.composite_alarmstate[count.index].arn
source_arn = aws_sns_topic.okstate.arn
}


resource "aws_lambda_permission" "allow_sns_alarmstate" {
count = length(var.alarms)

statement_id = "AllowSNS-alarm-${count.index}"
action = "lambda:InvokeFunction"
function_name = aws_lambda_function.slack_alert_function.function_name
principal = "sns.amazonaws.com"
source_arn = aws_sns_topic.alarmstate[count.index].arn
}


resource "aws_lambda_permission" "allow_sns_okstate" {
count = length(var.alarms)

statement_id = "AllowSNS-ok-${count.index}"
statement_id = "AllowSNS-alarm"
action = "lambda:InvokeFunction"
function_name = aws_lambda_function.slack_alert_function.function_name
principal = "sns.amazonaws.com"
source_arn = aws_sns_topic.okstate[count.index].arn
source_arn = aws_sns_topic.alarmstate.arn
}
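
For context, the two permissions above assume single alarmstate and okstate topics (no per-alarm indexing). The subscriptions that would actually route those topics to the Slack Lambda are not shown in this diff; a minimal sketch, assuming the topic and function names used above:

# Hypothetical sketch of the SNS-to-Lambda wiring implied by the permissions above;
# the subscription resource names are assumptions and do not appear in this PR.
resource "aws_sns_topic_subscription" "alarmstate_to_slack" {
  topic_arn = aws_sns_topic.alarmstate.arn
  protocol  = "lambda"
  endpoint  = aws_lambda_function.slack_alert_function.arn
}

resource "aws_sns_topic_subscription" "okstate_to_slack" {
  topic_arn = aws_sns_topic.okstate.arn
  protocol  = "lambda"
  endpoint  = aws_lambda_function.slack_alert_function.arn
}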

