diff --git a/infra/modules/sagemaker_deployment/cloudwatch.tf b/infra/modules/sagemaker_deployment/cloudwatch.tf
index 5235fd1..fdafd55 100644
--- a/infra/modules/sagemaker_deployment/cloudwatch.tf
+++ b/infra/modules/sagemaker_deployment/cloudwatch.tf
@@ -1,44 +1,267 @@
-resource "aws_cloudwatch_metric_alarm" "cloudwatch_alarm" {
-  count = length(var.alarms)
+resource "aws_cloudwatch_composite_alarm" "scale_up_from_n_to_np1" {
+  # Prefix with the endpoint name so alarm names stay unique when this module is instantiated more than once
+  alarm_name        = "${aws_sagemaker_endpoint.main.name}-scale-up-from-n-to-np1"
+  alarm_description = "Where there exists a high backlog and a high state of any of CPU, GPU, RAM, HardDisk (i.e. live instances are insufficient for the tasks being performed)"
 
-  alarm_name          = "${var.alarms[count.index].alarm_name_prefix}-${aws_sagemaker_endpoint.main.name}"
-  alarm_description   = var.alarms[count.index].alarm_description
-  metric_name         = var.alarms[count.index].metric_name
-  namespace           = var.alarms[count.index].namespace
-  comparison_operator = var.alarms[count.index].comparison_operator
-  threshold           = var.alarms[count.index].threshold
-  evaluation_periods  = var.alarms[count.index].evaluation_periods
-  datapoints_to_alarm = var.alarms[count.index].datapoints_to_alarm
-  period              = var.alarms[count.index].period
-  statistic           = var.alarms[count.index].statistic
-  alarm_actions       = concat(var.alarms[count.index].alarm_actions, [aws_sns_topic.alarmstate[count.index].arn])
-  ok_actions          = concat(var.alarms[count.index].ok_actions, [aws_sns_topic.okstate[count.index].arn])
-  dimensions = (count.index == 0 || count.index == 1 || count.index == 2) ? { # TODO: this logic is brittle as it assumes "backlog" has index [0,1,2]; it would be better to have a logic that rests on the specific name of that metric
-    EndpointName = aws_sagemaker_endpoint.main.name # Only EndpointName is used in this case
-    } : {
-    EndpointName = aws_sagemaker_endpoint.main.name,                                             # Both EndpointName and VariantName are used in all other cases
-    VariantName  = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name # Note this logic would not work if there were ever more than one production variant deployed for an LLM
-  }
+  alarm_actions = [aws_appautoscaling_policy.scale_up_from_n_to_np1.arn]
+  ok_actions    = []
+
+  alarm_rule = "ALARM(${aws_cloudwatch_metric_alarm.backlog_high.alarm_name}) AND (ALARM(${aws_cloudwatch_metric_alarm.cpu_high.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.gpu_high.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.ram_high.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.harddisk_high.alarm_name}))"
+}
+
+resource "aws_cloudwatch_composite_alarm" "scale_up_from_0_to_1" {
+  alarm_name        = "${aws_sagemaker_endpoint.main.name}-scale-up-from-0-to-1"
+  alarm_description = "Where there exists a high backlog and there exists a state of insufficient data for any of CPU, GPU, RAM, HardDisk (i.e. there are tasks to do but no instance is live to perform them)"
+
+  alarm_actions = [aws_appautoscaling_policy.scale_up_from_0_to_1.arn]
+  ok_actions    = []
+
+  alarm_rule = "ALARM(${aws_cloudwatch_metric_alarm.backlog_high.alarm_name}) AND (INSUFFICIENT_DATA(${aws_cloudwatch_metric_alarm.cpu_high.alarm_name}) OR INSUFFICIENT_DATA(${aws_cloudwatch_metric_alarm.gpu_high.alarm_name}) OR INSUFFICIENT_DATA(${aws_cloudwatch_metric_alarm.ram_high.alarm_name}) OR INSUFFICIENT_DATA(${aws_cloudwatch_metric_alarm.harddisk_high.alarm_name}))"
+}
+
+
+resource "aws_cloudwatch_composite_alarm" "scale_down_from_n_to_nm1" {
+  alarm_name        = "${aws_sagemaker_endpoint.main.name}-scale-down-from-n-to-nm1"
+  alarm_description = "Where there exists a high backlog and a low state of any of CPU, GPU, RAM, HardDisk (i.e. live instances are excessive for the current tasks)"
+
+  alarm_actions = [aws_appautoscaling_policy.scale_down_from_n_to_nm1.arn]
+  ok_actions    = []
+
+  alarm_rule = "ALARM(${aws_cloudwatch_metric_alarm.backlog_high.alarm_name}) AND (ALARM(${aws_cloudwatch_metric_alarm.cpu_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.gpu_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.ram_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.harddisk_low.alarm_name}))"
+}
+
+
+resource "aws_cloudwatch_composite_alarm" "scale_down_from_n_to_0" {
+  alarm_name        = "${aws_sagemaker_endpoint.main.name}-scale-down-from-n-to-0"
+  alarm_description = "Where there exists a low backlog and a low state of any of CPU, GPU, RAM, HardDisk (i.e. no further tasks are queued and live instances are excessive for any tasks currently in process)"
+
+  alarm_actions = [aws_appautoscaling_policy.scale_down_from_n_to_0.arn]
+  ok_actions    = []
+
+  alarm_rule = "ALARM(${aws_cloudwatch_metric_alarm.backlog_low.alarm_name}) AND (ALARM(${aws_cloudwatch_metric_alarm.cpu_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.gpu_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.ram_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.harddisk_low.alarm_name}))"
+}
+
+
+resource "aws_cloudwatch_metric_alarm" "backlog_high" {
+
+  alarm_name          = "${aws_sagemaker_endpoint.main.name}-backlog-high"
+  alarm_description   = "Alarm when the backlog is high"
+  metric_name         = "ApproximateBacklogSize"
+  namespace           = "AWS/SageMaker"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  threshold           = var.backlog_threshold_high
+  evaluation_periods  = var.evaluation_periods_high
+  datapoints_to_alarm = var.datapoints_to_alarm_high
+  period              = 60
+  statistic           = "Maximum"
+  dimensions          = { EndpointName = aws_sagemaker_endpoint.main.name }
+
+  depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
+}
+
+
+resource "aws_cloudwatch_metric_alarm" "backlog_low" {
+
+  alarm_name          = "${aws_sagemaker_endpoint.main.name}-backlog-low"
+  alarm_description   = "Alarm when the backlog is low"
+  metric_name         = "ApproximateBacklogSize"
+  namespace           = "AWS/SageMaker"
+  comparison_operator = "LessThanThreshold"
+  threshold           = var.backlog_threshold_low
+  evaluation_periods  = var.evaluation_periods_low
+  datapoints_to_alarm = var.datapoints_to_alarm_low
+  period              = 60
+  statistic           = "Maximum"
+  dimensions          = { EndpointName = aws_sagemaker_endpoint.main.name }
+
+  depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
+}
+
+
+resource "aws_cloudwatch_metric_alarm" "cpu_high" {
+
+  alarm_name          = "${aws_sagemaker_endpoint.main.name}-cpu-high"
+  alarm_description   = "Alarm when in high vCPU Usage"
+  metric_name         = "CPUUtilization"
+  namespace           = "/aws/sagemaker/Endpoints"
+  
comparison_operator = "GreaterThanOrEqualToThreshold" + threshold = var.cpu_threshold_high + evaluation_periods = var.evaluation_periods_high + datapoints_to_alarm = var.datapoints_to_alarm_high + period = 60 + statistic = "Average" + dimensions = { EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name } + + depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate] +} + + +resource "aws_cloudwatch_metric_alarm" "cpu_low" { + + alarm_name = "${aws_sagemaker_endpoint.main.name}-cpu-low" + alarm_description = "Alarm when in low vCPU Usage" + metric_name = "CPUUtilization" + namespace = "/aws/sagemaker/Endpoints" + comparison_operator = "LessThanOrEqualToThreshold" + threshold = var.cpu_threshold_low + evaluation_periods = var.evaluation_periods_low + datapoints_to_alarm = var.datapoints_to_alarm_low + period = 60 + statistic = "Average" + dimensions = { EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name } + + depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate] +} + + +resource "aws_cloudwatch_metric_alarm" "gpu_high" { + + alarm_name = "${aws_sagemaker_endpoint.main.name}-gpu-high" + alarm_description = "Alarm when in high GPU Usage" + metric_name = "GPUUtilization" + namespace = "/aws/sagemaker/Endpoints" + comparison_operator = "GreaterThanOrEqualToThreshold" + threshold = var.gpu_threshold_high + evaluation_periods = var.evaluation_periods_high + datapoints_to_alarm = var.datapoints_to_alarm_high + period = 60 + statistic = "Average" + dimensions = { EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name } + + depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate] +} + + +resource "aws_cloudwatch_metric_alarm" "gpu_low" { + + alarm_name = "${aws_sagemaker_endpoint.main.name}-gpu-low" + alarm_description = "Alarm when in low GPU Usage" + metric_name = "GPUUtilization" + namespace = "/aws/sagemaker/Endpoints" + comparison_operator = "LessThanOrEqualToThreshold" + threshold = var.gpu_threshold_low + evaluation_periods = var.evaluation_periods_low + datapoints_to_alarm = var.datapoints_to_alarm_low + period = 60 + statistic = "Average" + dimensions = { EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name } + + depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate] +} + + +resource "aws_cloudwatch_metric_alarm" "ram_high" { + + alarm_name = "${aws_sagemaker_endpoint.main.name}-ram-high" + alarm_description = "Alarm when in high RAM Usage" + metric_name = "MemoryUtilization" + namespace = "/aws/sagemaker/Endpoints" + comparison_operator = "GreaterThanOrEqualToThreshold" + threshold = var.ram_threshold_high + evaluation_periods = var.evaluation_periods_high + datapoints_to_alarm = var.datapoints_to_alarm_high + period = 60 + statistic = "Average" + dimensions = { EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name } + + depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate] +} + + +resource "aws_cloudwatch_metric_alarm" "ram_low" { + + alarm_name = 
"${aws_sagemaker_endpoint.main.name}-ram-low" + alarm_description = "Alarm when in low RAM Usage" + metric_name = "MemoryUtilization" + namespace = "/aws/sagemaker/Endpoints" + comparison_operator = "LessThanOrEqualToThreshold" + threshold = var.ram_threshold_low + evaluation_periods = var.evaluation_periods_low + datapoints_to_alarm = var.datapoints_to_alarm_low + period = 60 + statistic = "Average" + dimensions = { EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name } depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate] } -resource "null_resource" "wait_for_metric_alarms" { - # Aggregating metric alarms dependencies so we wait for them to be deleted/created before composite alarms are created or deleted. This prevents cyclic dependency issues. - depends_on = [aws_cloudwatch_metric_alarm.cloudwatch_alarm] +resource "aws_cloudwatch_metric_alarm" "harddisk_high" { + + alarm_name = "${aws_sagemaker_endpoint.main.name}-harddisk-high" + alarm_description = "Alarm when in high HardDisk Usage" + metric_name = "DiskUtilization" + namespace = "/aws/sagemaker/Endpoints" + comparison_operator = "GreaterThanOrEqualToThreshold" + threshold = var.harddisk_threshold_high + evaluation_periods = var.evaluation_periods_high + datapoints_to_alarm = var.datapoints_to_alarm_high + period = 60 + statistic = "Average" + dimensions = { EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name } + + depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate] } -resource "aws_cloudwatch_composite_alarm" "composite_alarm" { - count = length(var.alarm_composites) +resource "aws_cloudwatch_metric_alarm" "harddisk_low" { + + alarm_name = "${aws_sagemaker_endpoint.main.name}-harddisk-low" + alarm_description = "Alarm when in low RAM Usage" + metric_name = "MemoryUtilization" + namespace = "/aws/sagemaker/Endpoints" + comparison_operator = "LessThanOrEqualToThreshold" + threshold = var.ram_threshold_low + evaluation_periods = var.evaluation_periods_low + datapoints_to_alarm = var.datapoints_to_alarm_low + period = 60 + statistic = "Average" + dimensions = { EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name } - alarm_name = "${var.alarm_composites[count.index].alarm_name}-${aws_sagemaker_endpoint.main.name}" - alarm_description = var.alarm_composites[count.index].alarm_description - alarm_rule = var.alarm_composites[count.index].alarm_rule - alarm_actions = concat(var.alarm_composites[count.index].alarm_actions, [aws_sns_topic.alarm_composite_notifications[count.index].arn], [aws_sns_topic.composite_alarmstate[count.index].arn]) - ok_actions = var.alarm_composites[count.index].ok_actions + depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate] +} - depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarm_composite_notifications, aws_sns_topic.composite_alarmstate, null_resource.wait_for_metric_alarms] +resource "aws_cloudwatch_metric_alarm" "unauthorized_operations" { + + alarm_name = "${aws_sagemaker_endpoint.main.name}-unauthorized-operations" + alarm_description = "Alarm when unauthorized operations are detected in the CloudTrail Logs" + metric_name = "UnauthorizedOperationsCount" + namespace = "CloudTrailMetrics" + comparison_operator = 
"GreaterThanOrEqualToThreshold" + threshold = 1 + evaluation_periods = 1 + datapoints_to_alarm = 1 + period = 60 + statistic = "Maximum" + dimensions = { EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name } + + depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate] +} + + +resource "aws_cloudwatch_metric_alarm" "errors_4xx" { + + alarm_name = "${aws_sagemaker_endpoint.main.name}-errors-4XX" + alarm_description = "4XX errors are detected in the CloudTrail Logs" + metric_name = "Invocation4XXErrors" + namespace = "AWS/SageMaker" + comparison_operator = "GreaterThanOrEqualToThreshold" + threshold = 1 + evaluation_periods = 1 + datapoints_to_alarm = 1 + period = 60 + statistic = "Average" + dimensions = { EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name } + + depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate] } diff --git a/infra/modules/sagemaker_deployment/lambda.tf b/infra/modules/sagemaker_deployment/lambda.tf index a63ee0c..9618128 100644 --- a/infra/modules/sagemaker_deployment/lambda.tf +++ b/infra/modules/sagemaker_deployment/lambda.tf @@ -14,45 +14,26 @@ resource "aws_lambda_function" "slack_alert_function" { runtime = "python3.12" timeout = 30 - environment { - variables = { - SNS_TO_WEBHOOK_JSON = jsonencode(local.sns_to_webhook_mapping), - ADDRESS = "arn:aws:sns:eu-west-2:${var.aws_account_id}:" - } - } } -resource "aws_lambda_permission" "allow_sns_composite" { - count = length(var.alarm_composites) +resource "aws_lambda_permission" "allow_sns_okstate" { - statement_id = "AllowSNS-composite-${count.index}" + statement_id = "AllowSNS-ok" action = "lambda:InvokeFunction" function_name = aws_lambda_function.slack_alert_function.function_name principal = "sns.amazonaws.com" - source_arn = aws_sns_topic.composite_alarmstate[count.index].arn + source_arn = aws_sns_topic.okstate.arn } resource "aws_lambda_permission" "allow_sns_alarmstate" { - count = length(var.alarms) - - statement_id = "AllowSNS-alarm-${count.index}" - action = "lambda:InvokeFunction" - function_name = aws_lambda_function.slack_alert_function.function_name - principal = "sns.amazonaws.com" - source_arn = aws_sns_topic.alarmstate[count.index].arn -} - - -resource "aws_lambda_permission" "allow_sns_okstate" { - count = length(var.alarms) - statement_id = "AllowSNS-ok-${count.index}" + statement_id = "AllowSNS-alarm" action = "lambda:InvokeFunction" function_name = aws_lambda_function.slack_alert_function.function_name principal = "sns.amazonaws.com" - source_arn = aws_sns_topic.okstate[count.index].arn + source_arn = aws_sns_topic.alarmstate.arn } diff --git a/infra/modules/sagemaker_deployment/main.tf b/infra/modules/sagemaker_deployment/main.tf index fcfceef..d88aafa 100644 --- a/infra/modules/sagemaker_deployment/main.tf +++ b/infra/modules/sagemaker_deployment/main.tf @@ -65,7 +65,7 @@ resource "aws_appautoscaling_target" "main" { } -resource "aws_appautoscaling_policy" "scale_up_to_n_policy" { +resource "aws_appautoscaling_policy" "scale_up_from_n_to_np1" { name = "scale-up-to-n-policy-${var.model_name}" policy_type = "StepScaling" @@ -87,7 +87,7 @@ resource "aws_appautoscaling_policy" "scale_up_to_n_policy" { } -resource "aws_appautoscaling_policy" "scale_down_to_n_policy" { +resource "aws_appautoscaling_policy" "scale_down_from_n_to_nm1" { name = 
"scale-down-to-n-policy-${var.model_name}" policy_type = "StepScaling" @@ -109,7 +109,7 @@ resource "aws_appautoscaling_policy" "scale_down_to_n_policy" { } -resource "aws_appautoscaling_policy" "scale_up_to_one_policy" { +resource "aws_appautoscaling_policy" "scale_up_from_0_to_1" { name = "scale-up-to-one-policy-${var.model_name}" policy_type = "StepScaling" @@ -131,7 +131,7 @@ resource "aws_appautoscaling_policy" "scale_up_to_one_policy" { } -resource "aws_appautoscaling_policy" "scale_down_to_zero_policy" { +resource "aws_appautoscaling_policy" "scale_down_from_n_to_0" { name = "scale-down-to-zero-policy-${var.model_name}" policy_type = "StepScaling" @@ -153,22 +153,6 @@ resource "aws_appautoscaling_policy" "scale_down_to_zero_policy" { } -# Mapping SNS topic ARNs to Slack webhook URLs -locals { - sns_to_webhook_mapping = merge({ - for idx, alarm in var.alarms : - replace(aws_sns_topic.alarmstate[idx].arn, "arn:aws:sns:eu-west-2:${var.aws_account_id}:", "") => alarm.slack_webhook_url - }, { - for idx, alarm in var.alarms : - replace(aws_sns_topic.okstate[idx].arn, "arn:aws:sns:eu-west-2:${var.aws_account_id}:", "") => alarm.slack_webhook_url - }, { - for idx, alarm_composite in var.alarm_composites : - replace(aws_sns_topic.composite_alarmstate[idx].arn, "arn:aws:sns:eu-west-2:${var.aws_account_id}:", "") => alarm_composite.slack_webhook_url - } - ) -} - - resource "aws_cloudwatch_log_metric_filter" "unauthorized_operations" { name = "unauthorized-operations-filter" log_group_name = "/aws/sagemaker/Endpoints/${aws_sagemaker_endpoint.main.name}" diff --git a/infra/modules/sagemaker_deployment/outputs.tf b/infra/modules/sagemaker_deployment/outputs.tf index 0fda7d4..a96f9a7 100644 --- a/infra/modules/sagemaker_deployment/outputs.tf +++ b/infra/modules/sagemaker_deployment/outputs.tf @@ -6,23 +6,3 @@ output "model_name" { output "endpoint_name" { value = aws_sagemaker_endpoint.main.name } - - -output "scale_up_to_one_policy_arn" { - value = aws_appautoscaling_policy.scale_up_to_one_policy.arn -} - - -output "scale_down_to_zero_policy_arn" { - value = aws_appautoscaling_policy.scale_down_to_zero_policy.arn -} - - -output "scale_up_to_n_policy_arn" { - value = aws_appautoscaling_policy.scale_up_to_n_policy.arn -} - - -output "scale_down_to_n_policy_arn" { - value = aws_appautoscaling_policy.scale_down_to_n_policy.arn -} diff --git a/infra/modules/sagemaker_deployment/sns.tf b/infra/modules/sagemaker_deployment/sns.tf index 61248ff..12fcb40 100644 --- a/infra/modules/sagemaker_deployment/sns.tf +++ b/infra/modules/sagemaker_deployment/sns.tf @@ -1,7 +1,6 @@ resource "aws_sns_topic" "alarmstate" { - count = length(var.alarms) - name = "alarm-alarmstate-${var.alarms[count.index].alarm_name_prefix}-${aws_sagemaker_endpoint.main.name}" + name = "alarm-alarmstate-${aws_sagemaker_endpoint.main.name}" policy = jsonencode({ Version = "2012-10-17", Statement = [ @@ -19,9 +18,8 @@ resource "aws_sns_topic" "alarmstate" { resource "aws_sns_topic" "okstate" { - count = length(var.alarms) - name = "alarm-okstate-${var.alarms[count.index].alarm_name_prefix}-${aws_sagemaker_endpoint.main.name}" + name = "alarm-okstate-${aws_sagemaker_endpoint.main.name}" policy = jsonencode({ Version = "2012-10-17", @@ -39,99 +37,17 @@ resource "aws_sns_topic" "okstate" { } -resource "aws_sns_topic" "composite_alarmstate" { - count = length(var.alarm_composites) - - name = "alarm-alarm-composite-lambda-${var.alarm_composites[count.index].alarm_name}-${aws_sagemaker_endpoint.main.name}-topic" - - policy = jsonencode({ - 
Version = "2012-10-17", - Statement = [ - { - Effect = "Allow", - Principal = { - Service = "cloudwatch.amazonaws.com" - }, - Action = "sns:Publish", - Resource = "*" - } - ] - }) -} - - -resource "aws_sns_topic" "alarm_composite_notifications" { - count = length(var.alarm_composites) - name = "alarm-composite-${var.alarm_composites[count.index].alarm_name}-${aws_sagemaker_endpoint.main.name}-sns-topic" -} - - -resource "aws_sns_topic_policy" "composite_sns_topic_policy" { - count = length(var.alarm_composites) - - arn = aws_sns_topic.alarm_composite_notifications[count.index].arn - policy = jsonencode({ - Version = "2012-10-17", - Statement = [ - { - Sid = "AllowPublishFromCloudWatch" - Effect = "Allow", - Principal = { - Service = "cloudwatch.amazonaws.com" - }, - Action = "SNS:Publish", - Resource = aws_sns_topic.alarm_composite_notifications[count.index].arn - }, - { - Sid = "AllowSubscriptionActions" - Effect = "Allow", - Principal = "*", - Action = [ - "sns:Subscribe", - "sns:Receive" - ], - Resource = aws_sns_topic.alarm_composite_notifications[count.index].arn - } - ] - }) -} - - resource "aws_sns_topic_subscription" "sns_lambda_subscription_okstate" { - count = length(var.alarms) - topic_arn = aws_sns_topic.okstate[count.index].arn + topic_arn = aws_sns_topic.okstate.arn protocol = "lambda" endpoint = aws_lambda_function.slack_alert_function.arn } resource "aws_sns_topic_subscription" "sns_lambda_subscription_alarmstate" { - count = length(var.alarms) - topic_arn = aws_sns_topic.alarmstate[count.index].arn + topic_arn = aws_sns_topic.alarmstate.arn protocol = "lambda" endpoint = aws_lambda_function.slack_alert_function.arn } - -resource "aws_sns_topic_subscription" "sns_lambda_subscription_composite" { - count = length(var.alarm_composites) - - topic_arn = aws_sns_topic.composite_alarmstate[count.index].arn - protocol = "lambda" - endpoint = aws_lambda_function.slack_alert_function.arn -} - - -resource "aws_sns_topic_subscription" "email_subscription" { - count = length(var.alarm_composites) - topic_arn = aws_sns_topic.alarm_composite_notifications[count.index].arn - protocol = "email" - endpoint = flatten([ - for variables in var.alarm_composites : - [ - for email in variables.emails : - email - ] - ])[count.index] -} diff --git a/infra/modules/sagemaker_deployment/variables.tf b/infra/modules/sagemaker_deployment/variables.tf index 14be800..14794e4 100644 --- a/infra/modules/sagemaker_deployment/variables.tf +++ b/infra/modules/sagemaker_deployment/variables.tf @@ -87,38 +87,87 @@ variable "scale_down_cooldown" { description = "Cooldown period for scale down" } +variable "backlog_threshold_high" { + type = number + description = "Threshold for high backlog alarm" +} + + +variable "backlog_threshold_low" { + type = number + description = "Threshold for low backlog alarm" +} + + +variable "cpu_threshold_high" { + type = number + description = "Threshold for high CPU alarm (NOTE this varies based on number of vCPU)" +} + + +variable "cpu_threshold_low" { + type = number + description = "Threshold for low CPU alarm (NOTE this varies based on number of vCPU)" +} + + +variable "gpu_threshold_high" { + type = number + description = "Threshold for high GPU alarm (NOTE this varies based on number of GPU)" +} + + +variable "gpu_threshold_low" { + type = number + description = "Threshold for low GPU alarm (NOTE this varies based on number of GPU)" +} + + +variable "ram_threshold_high" { + type = number + description = "Threshold for high RAM alarm" +} -variable "alarms" { - type = 
list(object({
-    alarm_name_prefix   = string
-    alarm_description   = string
-    metric_name         = string
-    namespace           = string
-    comparison_operator = string
-    threshold           = number
-    evaluation_periods  = number
-    datapoints_to_alarm = number
-    period              = number
-    statistic           = string
-    slack_webhook_url   = string
-    alarm_actions       = list(string)
-    ok_actions          = list(string)
-  }))
-  description = "List of CloudWatch alarms to be created"
-}
-
-
-variable "alarm_composites" {
-  type = list(object({
-    alarm_name        = string
-    alarm_description = string
-    alarm_rule        = string
-    alarm_actions     = list(string)
-    ok_actions        = list(string)
-    slack_webhook_url = string
-    emails            = list(string)
-  }))
-  description = "List of CloudWatch composite alarms to be created utilizing pre-existing alarms"
+
+variable "ram_threshold_low" {
+  type        = number
+  description = "Threshold for low RAM alarm"
+}
+
+
+variable "harddisk_threshold_high" {
+  type        = number
+  description = "Threshold for high HardDisk alarm"
+}
+
+
+variable "harddisk_threshold_low" {
+  type        = number
+  description = "Threshold for low HardDisk alarm"
+}
+
+
+variable "evaluation_periods_high" {
+  type        = number
+  description = "Number of evaluation periods to consider for high alarm states"
+}
+
+
+variable "datapoints_to_alarm_high" {
+  type        = number
+  description = "Number of datapoints within an evaluation period to require for high alarm states"
+}
+
+
+variable "evaluation_periods_low" {
+  type        = number
+  description = "Number of evaluation periods to consider for low alarm states"
+}
+
+
+variable "datapoints_to_alarm_low" {
+  type        = number
+  description = "Number of datapoints within an evaluation period to require for low alarm states"
 }
diff --git a/infra/sagemaker_llm_resources.tf b/infra/sagemaker_llm_resources.tf
index a4023df..8aa2424 100644
--- a/infra/sagemaker_llm_resources.tf
+++ b/infra/sagemaker_llm_resources.tf
@@ -33,256 +33,20 @@ module "gpt_neo_125m_deployment" {
     "SAGEMAKER_PROGRAM" : "inference.py",
     "SM_NUM_GPUS" : "1"
   }
-
-  alarms = [
-    {
-      alarm_name_prefix   = "nonzero-backlog" # TODO: backlog is currently required to have index [0,1] which is brittle
-      alarm_description   = "Scale up based on existence of backlog"
-      metric_name         = "ApproximateBacklogSize"
-      namespace           = "AWS/SageMaker"
-      comparison_operator = "GreaterThanOrEqualToThreshold"
-      threshold           = 1
-      evaluation_periods  = 1
-      datapoints_to_alarm = 1
-      period              = 60
-      statistic           = "Maximum"
-      slack_webhook_url   = var.slack_webhook_backlog_alerts
-      alarm_actions       = [module.gpt_neo_125m_deployment.scale_up_to_one_policy_arn]
-      ok_actions          = []
-    },
-    {
-      alarm_name_prefix   = "zero-backlog" # TODO: backlog is currently required to have index [0,1] which is brittle
-      alarm_description   = "Scale down based on non-existence of backlog"
-      metric_name         = "ApproximateBacklogSize"
-      namespace           = "AWS/SageMaker"
-      comparison_operator = "LessThanThreshold"
-      threshold           = 1
-      evaluation_periods  = 15
-      datapoints_to_alarm = 15
-      period              = 60
-      statistic           = "Maximum"
-      slack_webhook_url   = var.slack_webhook_backlog_alerts
-      alarm_actions       = [module.gpt_neo_125m_deployment.scale_down_to_zero_policy_arn]
-      ok_actions          = []
-    },
-    {
-      alarm_name_prefix   = "backlog-composite-alarm" # TODO: backlog is currently required to have index 0, which is brittle
-      alarm_description   = "Detect if queries in backlog for extended time period"
-      metric_name         = "ApproximateBacklogSize"
-      namespace           = "AWS/SageMaker"
-      comparison_operator = "LessThanOrEqualToThreshold"
-      threshold           = 0
-      evaluation_periods  = 3
-      datapoints_to_alarm = 3
-      period              = 
3600 - statistic = "Average" - slack_webhook_url = var.slack_webhook_backlog_alerts - alarm_actions = [] - ok_actions = [] - }, - { - alarm_name_prefix = "high-cpu" - alarm_description = "Scale up when CPU usage is heavy" - metric_name = "CPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 * 8 # TODO: we must manually multiply by vCPU count as Normalized metric not available - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_cpu_alerts - alarm_actions = [module.gpt_neo_125m_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-cpu" - alarm_description = "Scale down when CPU usage is light" - metric_name = "CPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 * 8 # TODO: we must manually multiply by vCPU count as Normalized metric not available - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_cpu_alerts - alarm_actions = [module.gpt_neo_125m_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "high-gpu" - alarm_description = "Scale up when GPU usage is heavy" - metric_name = "GPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 * 1 # TODO: we must manually multiply by GPU count as Normalized metric not available - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_gpu_alerts - alarm_actions = [module.gpt_neo_125m_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-gpu" - alarm_description = "Scale down when GPU usage is light" - metric_name = "GPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 * 1 # TODO: we must manually multiply by GPU count as Normalized metric not available - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_gpu_alerts - alarm_actions = [module.gpt_neo_125m_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "high-ram" - alarm_description = "Scale up when RAM usage is heavy" - metric_name = "MemoryUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.gpt_neo_125m_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-ram" - alarm_description = "Scale down when RAM usage is light" - metric_name = "MemoryUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.gpt_neo_125m_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "high-hard-disk" - alarm_description = "Scale up when Hard Disk usage is heavy" - metric_name = "DiskUtilization" - namespace = 
"/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.gpt_neo_125m_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-hard-disk" - alarm_description = "Scale down when Hard Disk usage is light" - metric_name = "DiskUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.gpt_neo_125m_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "unauthorized-operations" - alarm_description = "Unauthorized operations are detected in the CloudTrail Logs" - metric_name = "UnauthorizedOperationsCount" - namespace = "CloudTrailMetrics" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 1 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Sum" - slack_webhook_url = var.slack_webhook_security_alerts - alarm_actions = [] # SNS to give alert to developers - ok_actions = [] - }, - { - alarm_name_prefix = "errors-4XX" - alarm_description = "4XX errors are detected in the CloudTrail Logs" - metric_name = "Invocation4XXErrors" - namespace = "AWS/SageMaker" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 1 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Sum" - slack_webhook_url = var.slack_webhook_security_alerts - alarm_actions = [] # SNS to give alert to developers - ok_actions = [] - }, - { - alarm_name_prefix = "elevated-cpu-composite" - alarm_description = "Detect CPU activity above idle for extended time period" - metric_name = "CPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 20 * 8 # TODO: we must manually multiply by CPU count as Normalized metric not available - evaluation_periods = 3 - datapoints_to_alarm = 3 - period = 3600 - statistic = "Average" - slack_webhook_url = var.slack_webhook_cpu_alerts - alarm_actions = [] - ok_actions = [] - }, - { - alarm_name_prefix = "low-gpu-composite" - alarm_description = "Scale down when GPU usage is light" - metric_name = "GPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 20 * 1 # TODO: we must manually multiply by GPU count as Normalized metric not available - evaluation_periods = 3 - datapoints_to_alarm = 3 - period = 3600 - statistic = "Average" - slack_webhook_url = var.slack_webhook_gpu_alerts - alarm_actions = [] - ok_actions = [] - } - ] - - # Note that we have cyclic dependency issues now and you have to destroy to get this to work - alarm_composites = [ - { - alarm_name = "ElevatedCPUUtilizationNoBackLog" - alarm_description = "Triggered when CPU util is above idle and no backlog query exists for an extended time" - alarm_rule = "ALARM(elevated-cpu-composite-${module.gpt_neo_125m_deployment.model_name}-endpoint) AND ALARM(backlog-composite-alarm-${module.gpt_neo_125m_deployment.model_name}-endpoint)" - alarm_actions = [] - ok_actions = [] - slack_webhook_url = var.slack_webhook_backlog_alerts - emails = var.sagemaker_budget_emails - }, - { - alarm_name = 
"ElevatedGPUUtilizationNoBackLog" - alarm_description = "Triggered when GPU util is above idle and no backlog query exists for an extended time" - alarm_rule = "ALARM(low-gpu-composite-${module.gpt_neo_125m_deployment.model_name}-endpoint) AND ALARM(backlog-composite-alarm-${module.gpt_neo_125m_deployment.model_name}-endpoint)" - alarm_actions = [] - ok_actions = [] - slack_webhook_url = var.slack_webhook_backlog_alerts - emails = var.sagemaker_budget_emails - } - ] + backlog_threshold_high = 1 + backlog_threshold_low = 1 + cpu_threshold_high = 80 * 8 # 8 vCPUs + cpu_threshold_low = 20 * 8 # 8 vCPUs + gpu_threshold_high = 80 * 1 # 1 GPU + gpu_threshold_low = 20 * 1 # 1 GPU + ram_threshold_high = 80 + ram_threshold_low = 20 + harddisk_threshold_high = 80 + harddisk_threshold_low = 20 + evaluation_periods_high = 1 + datapoints_to_alarm_high = 1 + evaluation_periods_low = 20 + datapoints_to_alarm_low = 15 # These variables do not change between LLMs source = "./modules/sagemaker_deployment" @@ -317,256 +81,20 @@ module "phi_2_3b_deployment" { "SAGEMAKER_MODEL_SERVER_WORKERS" : "1", "SAGEMAKER_PROGRAM" : "inference.py" } - - alarms = [ - { - alarm_name_prefix = "nonzero-backlog" # TODO: backlog is currently required to have index [0,1] which is brittle - alarm_description = "Scale up based on existence of backlog" - metric_name = "ApproximateBacklogSize" - namespace = "AWS/SageMaker" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 1 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_backlog_alerts - alarm_actions = [module.phi_2_3b_deployment.scale_up_to_one_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "zero-backlog" # TODO: backlog is currently required to have index [0,1] which is brittle - alarm_description = "Scale down based on non-existence of backlog" - metric_name = "ApproximateBacklogSize" - namespace = "AWS/SageMaker" - comparison_operator = "LessThanThreshold" - threshold = 1 - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_backlog_alerts - alarm_actions = [module.phi_2_3b_deployment.scale_down_to_zero_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "backlog-composite-alarm" - alarm_description = "Detect if queries in backlog for extended time period" - metric_name = "ApproximateBacklogSize" - namespace = "AWS/SageMaker" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 0 - evaluation_periods = 3 - datapoints_to_alarm = 3 - period = 3600 - statistic = "Average" - slack_webhook_url = var.slack_webhook_backlog_alerts - alarm_actions = [] - ok_actions = [] - }, - { - alarm_name_prefix = "high-cpu" - alarm_description = "Scale up when CPU usage is heavy" - metric_name = "CPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 * 4 # TODO: we must manually multiply by vCPU count as Normalized metric not available - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_cpu_alerts - alarm_actions = [module.phi_2_3b_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-cpu" - alarm_description = "Scale down when CPU usage is light" - metric_name = "CPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 * 4 # TODO: we 
must manually multiply by vCPU count as Normalized metric not available - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_cpu_alerts - alarm_actions = [module.phi_2_3b_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "high-gpu" - alarm_description = "Scale up when GPU usage is heavy" - metric_name = "GPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 * 1 # TODO: we must manually multiply by GPU count as Normalized metric not available - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_gpu_alerts - alarm_actions = [module.phi_2_3b_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-gpu" - alarm_description = "Scale down when GPU usage is light" - metric_name = "GPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 * 1 # TODO: we must manually multiply by GPU count as Normalized metric not available - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_gpu_alerts - alarm_actions = [module.phi_2_3b_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "high-ram" - alarm_description = "Scale up when RAM usage is heavy" - metric_name = "MemoryUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.phi_2_3b_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-ram" - alarm_description = "Scale down when RAM usage is light" - metric_name = "MemoryUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.phi_2_3b_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "high-hard-disk" - alarm_description = "Scale up when Hard Disk usage is heavy" - metric_name = "DiskUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.phi_2_3b_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-hard-disk" - alarm_description = "Scale down when Hard Disk usage is light" - metric_name = "DiskUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.phi_2_3b_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "unauthorized-operations" - alarm_description = "Unauthorized operations are detected in the CloudTrail Logs" - metric_name = 
"UnauthorizedOperationsCount" - namespace = "CloudTrailMetrics" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 1 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Sum" - slack_webhook_url = var.slack_webhook_security_alerts - alarm_actions = [] # SNS to give alert to developers - ok_actions = [] - }, - { - alarm_name_prefix = "errors-4XX" - alarm_description = "4XX errors are detected in the CloudTrail Logs" - metric_name = "Invocation4XXErrors" - namespace = "AWS/SageMaker" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 1 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Sum" - slack_webhook_url = var.slack_webhook_security_alerts - alarm_actions = [] # SNS to give alert to developers - ok_actions = [] - }, - { - alarm_name_prefix = "elevated-cpu-composite" - alarm_description = "Detect CPU activity above idle for extended time period" - metric_name = "CPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 20 * 4 # TODO: we must manually multiply by CPU count as Normalized metric not available - evaluation_periods = 3 - datapoints_to_alarm = 3 - period = 3600 - statistic = "Average" - slack_webhook_url = var.slack_webhook_cpu_alerts - alarm_actions = [] - ok_actions = [] - }, - { - alarm_name_prefix = "low-gpu-composite" - alarm_description = "Scale down when GPU usage is light" - metric_name = "GPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 20 * 1 # TODO: we must manually multiply by GPU count as Normalized metric not available - evaluation_periods = 3 - datapoints_to_alarm = 3 - period = 3600 - statistic = "Average" - slack_webhook_url = var.slack_webhook_gpu_alerts - alarm_actions = [] - ok_actions = [] - } - ] - - alarm_composites = [ - { - alarm_name = "ElevatedCPUUtilizationNoBackLog" - alarm_description = "Triggered when CPU util is above idle and no backlog query exists for an extended time" - alarm_rule = "ALARM(elevated-cpu-composite-${module.phi_2_3b_deployment.model_name}-endpoint) AND ALARM(backlog-composite-alarm-${module.phi_2_3b_deployment.model_name}-endpoint)" - alarm_actions = [] - ok_actions = [] - slack_webhook_url = var.slack_webhook_backlog_alerts - emails = var.sagemaker_budget_emails - }, - { - alarm_name = "ElevatedGPUUtilizationNoBackLog" - alarm_description = "Triggered when GPU util is above idle and no backlog query exists for an extended time" - alarm_rule = "ALARM(low-gpu-composite-${module.phi_2_3b_deployment.model_name}-endpoint) AND ALARM(backlog-composite-alarm-${module.phi_2_3b_deployment.model_name}-endpoint)" - alarm_actions = [] - ok_actions = [] - slack_webhook_url = var.slack_webhook_backlog_alerts - emails = var.sagemaker_budget_emails - } - - ] + backlog_threshold_high = 1 + backlog_threshold_low = 1 + cpu_threshold_high = 80 * 4 # 4 vCPUs + cpu_threshold_low = 20 * 4 # 4 vCPUs + gpu_threshold_high = 80 * 1 # 1 GPU + gpu_threshold_low = 20 * 1 # 1 GPU + ram_threshold_high = 80 + ram_threshold_low = 20 + harddisk_threshold_high = 80 + harddisk_threshold_low = 20 + evaluation_periods_high = 1 + datapoints_to_alarm_high = 1 + evaluation_periods_low = 20 + datapoints_to_alarm_low = 15 # These variables do not change between LLMs source = "./modules/sagemaker_deployment" @@ -606,256 +134,20 @@ module "llama_3_3b_deployment" { "SAGEMAKER_MODEL_SERVER_WORKERS" : "1", "SAGEMAKER_PROGRAM" : 
"inference.py" } - - alarms = [ - { - alarm_name_prefix = "nonzero-backlog" # TODO: backlog is currently required to have index [0,1] which is brittle - alarm_description = "Scale up based on existence of backlog" - metric_name = "ApproximateBacklogSize" - namespace = "AWS/SageMaker" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 1 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_backlog_alerts - alarm_actions = [module.llama_3_3b_deployment.scale_up_to_one_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "zero-backlog" # TODO: backlog is currently required to have index [0,1] which is brittle - alarm_description = "Scale down based on non-existence of backlog" - metric_name = "ApproximateBacklogSize" - namespace = "AWS/SageMaker" - comparison_operator = "LessThanThreshold" - threshold = 1 - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_backlog_alerts - alarm_actions = [module.llama_3_3b_deployment.scale_down_to_zero_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "backlog-composite-alarm" - alarm_description = "Detect if queries in backlog for extended time period" - metric_name = "ApproximateBacklogSize" - namespace = "AWS/SageMaker" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 0 - evaluation_periods = 3 - datapoints_to_alarm = 3 - period = 3600 - statistic = "Average" - slack_webhook_url = var.slack_webhook_backlog_alerts - alarm_actions = [] - ok_actions = [] - }, - { - alarm_name_prefix = "high-cpu" - alarm_description = "Scale up when CPU usage is heavy" - metric_name = "CPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 * 4 # TODO: we must manually multiply by vCPU count as Normalized metric not available - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_cpu_alerts - alarm_actions = [module.llama_3_3b_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-cpu" - alarm_description = "Scale down when CPU usage is light" - metric_name = "CPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 * 4 # TODO: we must manually multiply by vCPU count as Normalized metric not available - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_cpu_alerts - alarm_actions = [module.llama_3_3b_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "high-gpu" - alarm_description = "Scale up when GPU usage is heavy" - metric_name = "GPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 * 1 # TODO: we must manually multiply by GPU count as Normalized metric not available - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_gpu_alerts - alarm_actions = [module.llama_3_3b_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-gpu" - alarm_description = "Scale down when GPU usage is light" - metric_name = "GPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold 
= 20 * 1 # TODO: we must manually multiply by GPU count as Normalized metric not available - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_gpu_alerts - alarm_actions = [module.llama_3_3b_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "high-ram" - alarm_description = "Scale up when RAM usage is heavy" - metric_name = "MemoryUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.llama_3_3b_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-ram" - alarm_description = "Scale down when RAM usage is light" - metric_name = "MemoryUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.llama_3_3b_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "high-hard-disk" - alarm_description = "Scale up when Hard Disk usage is heavy" - metric_name = "DiskUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.llama_3_3b_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-hard-disk" - alarm_description = "Scale down when Hard Disk usage is light" - metric_name = "DiskUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.llama_3_3b_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "unauthorized-operations" - alarm_description = "Unauthorized operations are detected in the CloudTrail Logs" - metric_name = "UnauthorizedOperationsCount" - namespace = "CloudTrailMetrics" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 1 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Sum" - slack_webhook_url = var.slack_webhook_security_alerts - alarm_actions = [] # SNS to give alert to developers - ok_actions = [] - }, - { - alarm_name_prefix = "errors-4XX" - alarm_description = "4XX errors are detected in the CloudTrail Logs" - metric_name = "Invocation4XXErrors" - namespace = "AWS/SageMaker" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 1 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Sum" - slack_webhook_url = var.slack_webhook_security_alerts - alarm_actions = [] # SNS to give alert to developers - ok_actions = [] - }, - { - alarm_name_prefix = "elevated-cpu-composite" - alarm_description = "Detect CPU activity above idle for extended time period" - metric_name = "CPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 20 
* 4 # TODO: we must manually multiply by CPU count as Normalized metric not available
-      evaluation_periods = 3
-      datapoints_to_alarm = 3
-      period = 3600
-      statistic = "Average"
-      slack_webhook_url = var.slack_webhook_cpu_alerts
-      alarm_actions = []
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "low-gpu-composite"
-      alarm_description = "Scale down when GPU usage is light"
-      metric_name = "GPUUtilization"
-      namespace = "/aws/sagemaker/Endpoints"
-      comparison_operator = "GreaterThanOrEqualToThreshold"
-      threshold = 20 * 1 # TODO: we must manually multiply by GPU count as Normalized metric not available
-      evaluation_periods = 3
-      datapoints_to_alarm = 3
-      period = 3600
-      statistic = "Average"
-      slack_webhook_url = var.slack_webhook_gpu_alerts
-      alarm_actions = []
-      ok_actions = []
-    }
-  ]
-
-  alarm_composites = [
-    {
-      alarm_name = "ElevatedCPUUtilizationNoBackLog"
-      alarm_description = "Triggered when CPU util is above idle and no backlog query exists for an extended time"
-      alarm_rule = "ALARM(elevated-cpu-composite-${module.phi_2_3b_deployment.model_name}-endpoint) AND ALARM(backlog-composite-alarm-${module.phi_2_3b_deployment.model_name}-endpoint)"
-      alarm_actions = []
-      ok_actions = []
-      slack_webhook_url = var.slack_webhook_backlog_alerts
-      emails = var.sagemaker_budget_emails
-    },
-    {
-      alarm_name = "ElevatedGPUUtilizationNoBackLog"
-      alarm_description = "Triggered when GPU util is above idle and no backlog query exists for an extended time"
-      alarm_rule = "ALARM(low-gpu-composite-${module.phi_2_3b_deployment.model_name}-endpoint) AND ALARM(backlog-composite-alarm-${module.phi_2_3b_deployment.model_name}-endpoint)"
-      alarm_actions = []
-      ok_actions = []
-      slack_webhook_url = var.slack_webhook_backlog_alerts
-      emails = var.sagemaker_budget_emails
-    }
-
-  ]
+  backlog_threshold_high = 1
+  backlog_threshold_low = 1
+  cpu_threshold_high = 80 * 4 # 4 vCPUs
+  cpu_threshold_low = 20 * 4 # 4 vCPUs
+  gpu_threshold_high = 80 * 1 # 1 GPU
+  gpu_threshold_low = 20 * 1 # 1 GPU
+  ram_threshold_high = 80
+  ram_threshold_low = 20
+  harddisk_threshold_high = 80
+  harddisk_threshold_low = 20
+  evaluation_periods_high = 1
+  datapoints_to_alarm_high = 1
+  evaluation_periods_low = 20
+  datapoints_to_alarm_low = 15 # These variables do not change between LLMs

   source = "./modules/sagemaker_deployment"

@@ -894,256 +186,20 @@ module "llama_3_3b_instruct_deployment" {
     "SAGEMAKER_MODEL_SERVER_WORKERS" : "1",
     "SAGEMAKER_PROGRAM" : "inference.py"
   }
-
-  alarms = [
-    {
-      alarm_name_prefix = "nonzero-backlog" # TODO: backlog is currently required to have index [0,1] which is brittle
-      alarm_description = "Scale up based on existence of backlog"
-      metric_name = "ApproximateBacklogSize"
-      namespace = "AWS/SageMaker"
-      comparison_operator = "GreaterThanOrEqualToThreshold"
-      threshold = 1
-      evaluation_periods = 1
-      datapoints_to_alarm = 1
-      period = 60
-      statistic = "Maximum"
-      slack_webhook_url = var.slack_webhook_backlog_alerts
-      alarm_actions = [module.llama_3_3b_instruct_deployment.scale_up_to_one_policy_arn]
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "zero-backlog" # TODO: backlog is currently required to have index [0,1] which is brittle
-      alarm_description = "Scale down based on non-existence of backlog"
-      metric_name = "ApproximateBacklogSize"
-      namespace = "AWS/SageMaker"
-      comparison_operator = "LessThanThreshold"
-      threshold = 1
-      evaluation_periods = 15
-      datapoints_to_alarm = 15
-      period = 60
-      statistic = "Maximum"
-      slack_webhook_url = var.slack_webhook_backlog_alerts
-      alarm_actions = [module.llama_3_3b_instruct_deployment.scale_down_to_zero_policy_arn]
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "backlog-composite-alarm"
-      alarm_description = "Detect if queries in backlog for extended time period"
-      metric_name = "ApproximateBacklogSize"
-      namespace = "AWS/SageMaker"
-      comparison_operator = "LessThanOrEqualToThreshold"
-      threshold = 0
-      evaluation_periods = 3
-      datapoints_to_alarm = 3
-      period = 3600
-      statistic = "Average"
-      slack_webhook_url = var.slack_webhook_backlog_alerts
-      alarm_actions = []
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "high-cpu"
-      alarm_description = "Scale up when CPU usage is heavy"
-      metric_name = "CPUUtilization"
-      namespace = "/aws/sagemaker/Endpoints"
-      comparison_operator = "GreaterThanOrEqualToThreshold"
-      threshold = 80 * 4 # TODO: we must manually multiply by vCPU count as Normalized metric not available
-      evaluation_periods = 1
-      datapoints_to_alarm = 1
-      period = 60
-      statistic = "Maximum"
-      slack_webhook_url = var.slack_webhook_cpu_alerts
-      alarm_actions = [module.llama_3_3b_instruct_deployment.scale_up_to_n_policy_arn]
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "low-cpu"
-      alarm_description = "Scale down when CPU usage is light"
-      metric_name = "CPUUtilization"
-      namespace = "/aws/sagemaker/Endpoints"
-      comparison_operator = "LessThanOrEqualToThreshold"
-      threshold = 20 * 4 # TODO: we must manually multiply by vCPU count as Normalized metric not available
-      evaluation_periods = 15
-      datapoints_to_alarm = 15
-      period = 60
-      statistic = "Maximum"
-      slack_webhook_url = var.slack_webhook_cpu_alerts
-      alarm_actions = [module.llama_3_3b_instruct_deployment.scale_down_to_n_policy_arn]
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "high-gpu"
-      alarm_description = "Scale up when GPU usage is heavy"
-      metric_name = "GPUUtilization"
-      namespace = "/aws/sagemaker/Endpoints"
-      comparison_operator = "GreaterThanOrEqualToThreshold"
-      threshold = 80 * 1 # TODO: we must manually multiply by GPU count as Normalized metric not available
-      evaluation_periods = 1
-      datapoints_to_alarm = 1
-      period = 60
-      statistic = "Maximum"
-      slack_webhook_url = var.slack_webhook_gpu_alerts
-      alarm_actions = [module.llama_3_3b_instruct_deployment.scale_up_to_n_policy_arn]
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "low-gpu"
-      alarm_description = "Scale down when GPU usage is light"
-      metric_name = "GPUUtilization"
-      namespace = "/aws/sagemaker/Endpoints"
-      comparison_operator = "LessThanOrEqualToThreshold"
-      threshold = 20 * 1 # TODO: we must manually multiply by GPU count as Normalized metric not available
-      evaluation_periods = 15
-      datapoints_to_alarm = 15
-      period = 60
-      statistic = "Maximum"
-      slack_webhook_url = var.slack_webhook_gpu_alerts
-      alarm_actions = [module.llama_3_3b_instruct_deployment.scale_down_to_n_policy_arn]
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "high-ram"
-      alarm_description = "Scale up when RAM usage is heavy"
-      metric_name = "MemoryUtilization"
-      namespace = "/aws/sagemaker/Endpoints"
-      comparison_operator = "GreaterThanOrEqualToThreshold"
-      threshold = 80
-      evaluation_periods = 1
-      datapoints_to_alarm = 1
-      period = 60
-      statistic = "Maximum"
-      slack_webhook_url = var.slack_webhook_resource_alerts
-      alarm_actions = [module.llama_3_3b_instruct_deployment.scale_up_to_n_policy_arn]
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "low-ram"
-      alarm_description = "Scale down when RAM usage is light"
-      metric_name = "MemoryUtilization"
-      namespace = "/aws/sagemaker/Endpoints"
-      comparison_operator = "LessThanOrEqualToThreshold"
-      threshold = 20
-      evaluation_periods = 15
-      datapoints_to_alarm = 15
-      period = 60
-      statistic = "Maximum"
-      slack_webhook_url = var.slack_webhook_resource_alerts
-      alarm_actions = [module.llama_3_3b_instruct_deployment.scale_down_to_n_policy_arn]
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "high-hard-disk"
-      alarm_description = "Scale up when Hard Disk usage is heavy"
-      metric_name = "DiskUtilization"
-      namespace = "/aws/sagemaker/Endpoints"
-      comparison_operator = "GreaterThanOrEqualToThreshold"
-      threshold = 80
-      evaluation_periods = 1
-      datapoints_to_alarm = 1
-      period = 60
-      statistic = "Maximum"
-      slack_webhook_url = var.slack_webhook_resource_alerts
-      alarm_actions = [module.llama_3_3b_instruct_deployment.scale_up_to_n_policy_arn]
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "low-hard-disk"
-      alarm_description = "Scale down when Hard Disk usage is light"
-      metric_name = "DiskUtilization"
-      namespace = "/aws/sagemaker/Endpoints"
-      comparison_operator = "LessThanOrEqualToThreshold"
-      threshold = 20
-      evaluation_periods = 15
-      datapoints_to_alarm = 15
-      period = 60
-      statistic = "Maximum"
-      slack_webhook_url = var.slack_webhook_resource_alerts
-      alarm_actions = [module.llama_3_3b_instruct_deployment.scale_down_to_n_policy_arn]
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "unauthorized-operations"
-      alarm_description = "Unauthorized operations are detected in the CloudTrail Logs"
-      metric_name = "UnauthorizedOperationsCount"
-      namespace = "CloudTrailMetrics"
-      comparison_operator = "GreaterThanOrEqualToThreshold"
-      threshold = 1
-      evaluation_periods = 1
-      datapoints_to_alarm = 1
-      period = 60
-      statistic = "Sum"
-      slack_webhook_url = var.slack_webhook_security_alerts
-      alarm_actions = [] # SNS to give alert to developers
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "errors-4XX"
-      alarm_description = "4XX errors are detected in the CloudTrail Logs"
-      metric_name = "Invocation4XXErrors"
-      namespace = "AWS/SageMaker"
-      comparison_operator = "GreaterThanOrEqualToThreshold"
-      threshold = 1
-      evaluation_periods = 1
-      datapoints_to_alarm = 1
-      period = 60
-      statistic = "Sum"
-      slack_webhook_url = var.slack_webhook_security_alerts
-      alarm_actions = [] # SNS to give alert to developers
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "elevated-cpu-composite"
-      alarm_description = "Detect CPU activity above idle for extended time period"
-      metric_name = "CPUUtilization"
-      namespace = "/aws/sagemaker/Endpoints"
-      comparison_operator = "GreaterThanOrEqualToThreshold"
-      threshold = 20 * 4 # TODO: we must manually multiply by CPU count as Normalized metric not available
-      evaluation_periods = 3
-      datapoints_to_alarm = 3
-      period = 3600
-      statistic = "Average"
-      slack_webhook_url = var.slack_webhook_cpu_alerts
-      alarm_actions = []
-      ok_actions = []
-    },
-    {
-      alarm_name_prefix = "low-gpu-composite"
-      alarm_description = "Scale down when GPU usage is light"
-      metric_name = "GPUUtilization"
-      namespace = "/aws/sagemaker/Endpoints"
-      comparison_operator = "GreaterThanOrEqualToThreshold"
-      threshold = 20 * 1 # TODO: we must manually multiply by GPU count as Normalized metric not available
-      evaluation_periods = 3
-      datapoints_to_alarm = 3
-      period = 3600
-      statistic = "Average"
-      slack_webhook_url = var.slack_webhook_gpu_alerts
-      alarm_actions = []
-      ok_actions = []
-    }
-  ]
-
-  alarm_composites = [
-    {
-      alarm_name = "ElevatedCPUUtilizationNoBackLog"
-      alarm_description = "Triggered when CPU util is above idle and no backlog query exists for an extended time"
- alarm_rule = "ALARM(elevated-cpu-composite-${module.phi_2_3b_deployment.model_name}-endpoint) AND ALARM(backlog-composite-alarm-${module.phi_2_3b_deployment.model_name}-endpoint)" - alarm_actions = [] - ok_actions = [] - slack_webhook_url = var.slack_webhook_backlog_alerts - emails = var.sagemaker_budget_emails - }, - { - alarm_name = "ElevatedGPUUtilizationNoBackLog" - alarm_description = "Triggered when GPU util is above idle and no backlog query exists for an extended time" - alarm_rule = "ALARM(low-gpu-composite-${module.phi_2_3b_deployment.model_name}-endpoint) AND ALARM(backlog-composite-alarm-${module.phi_2_3b_deployment.model_name}-endpoint)" - alarm_actions = [] - ok_actions = [] - slack_webhook_url = var.slack_webhook_backlog_alerts - emails = var.sagemaker_budget_emails - } - - ] + backlog_threshold_high = 1 + backlog_threshold_low = 1 + cpu_threshold_high = 80 * 4 # 4 vCPUs + cpu_threshold_low = 20 * 4 # 4 vCPUs + gpu_threshold_high = 80 * 1 # 1 GPU + gpu_threshold_low = 20 * 1 # 1 GPU + ram_threshold_high = 80 + ram_threshold_low = 20 + harddisk_threshold_high = 80 + harddisk_threshold_low = 20 + evaluation_periods_high = 1 + datapoints_to_alarm_high = 1 + evaluation_periods_low = 20 + datapoints_to_alarm_low = 15 # These variables do not change between LLMs source = "./modules/sagemaker_deployment" @@ -1180,242 +236,20 @@ module "mistral_7b_instruct_deployment" { "SAGEMAKER_MODEL_SERVER_WORKERS" : "1", "SAGEMAKER_PROGRAM" : "inference.py", } - - alarms = [ - { - alarm_name_prefix = "nonzero-backlog" # TODO: backlog is currently required to have index [0,1] which is brittle - alarm_description = "Scale up based on existence of backlog" - metric_name = "ApproximateBacklogSize" - namespace = "AWS/SageMaker" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 1 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_backlog_alerts - alarm_actions = [module.mistral_7b_instruct_deployment.scale_up_to_one_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "zero-backlog" # TODO: backlog is currently required to have index [0,1] which is brittle - alarm_description = "Scale down based on non-existence of backlog" - metric_name = "ApproximateBacklogSize" - namespace = "AWS/SageMaker" - comparison_operator = "LessThanThreshold" - threshold = 1 - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_backlog_alerts - alarm_actions = [module.mistral_7b_instruct_deployment.scale_down_to_zero_policy_arn] - ok_actions = [] - }, - - { - alarm_name_prefix = "backlog-composite-alarm" # TODO: backlog is currently required to have index 0, which is brittle - alarm_description = "Detect if queries in backlog for extended time period" - metric_name = "ApproximateBacklogSize" - namespace = "AWS/SageMaker" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 0 - evaluation_periods = 3 - datapoints_to_alarm = 3 - period = 3600 - statistic = "Average" - slack_webhook_url = var.slack_webhook_backlog_alerts - alarm_actions = [] - ok_actions = [] - }, - { - alarm_name_prefix = "low-cpu" - alarm_description = "Scale down when CPU usage is light" - metric_name = "CPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 * 48 # TODO: we must manually multiply by vCPU count as Normalized metric not available - evaluation_periods = 15 - 
datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_cpu_alerts - alarm_actions = [module.mistral_7b_instruct_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "high-gpu" - alarm_description = "Scale up when GPU usage is heavy" - metric_name = "GPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 * 4 # TODO: we must manually multiply by GPU count as Normalized metric not available - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_gpu_alerts - alarm_actions = [module.mistral_7b_instruct_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-gpu" - alarm_description = "Scale down when GPU usage is light" - metric_name = "GPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 * 4 # TODO: we must manually multiply by GPU count as Normalized metric not available - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_gpu_alerts - alarm_actions = [module.mistral_7b_instruct_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "high-ram" - alarm_description = "Scale up when RAM usage is heavy" - metric_name = "MemoryUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.mistral_7b_instruct_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-ram" - alarm_description = "Scale down when RAM usage is light" - metric_name = "MemoryUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.mistral_7b_instruct_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "high-hard-disk" - alarm_description = "Scale up when Hard Disk usage is heavy" - metric_name = "DiskUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 80 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.mistral_7b_instruct_deployment.scale_up_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "low-hard-disk" - alarm_description = "Scale down when Hard Disk usage is light" - metric_name = "DiskUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "LessThanOrEqualToThreshold" - threshold = 20 - evaluation_periods = 15 - datapoints_to_alarm = 15 - period = 60 - statistic = "Maximum" - slack_webhook_url = var.slack_webhook_resource_alerts - alarm_actions = [module.mistral_7b_instruct_deployment.scale_down_to_n_policy_arn] - ok_actions = [] - }, - { - alarm_name_prefix = "unauthorized-operations" - alarm_description = "Unauthorized operations are detected in the CloudTrail Logs" - metric_name = 
"UnauthorizedOperationsCount" - namespace = "CloudTrailMetrics" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 1 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Sum" - slack_webhook_url = var.slack_webhook_security_alerts - alarm_actions = [] # SNS to give alert to developers - ok_actions = [] - }, - { - alarm_name_prefix = "errors-4XX" - alarm_description = "4XX errors are detected in the CloudTrail Logs" - metric_name = "Invocation4XXErrors" - namespace = "AWS/SageMaker" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 1 - evaluation_periods = 1 - datapoints_to_alarm = 1 - period = 60 - statistic = "Sum" - slack_webhook_url = var.slack_webhook_security_alerts - alarm_actions = [] # SNS to give alert to developers - ok_actions = [] - }, - { - alarm_name_prefix = "elevated-cpu-composite" - alarm_description = "Detect CPU activity above idle for extended time period" - metric_name = "CPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 20 * 48 # TODO: we must manually multiply by CPU count as Normalized metric not available - evaluation_periods = 3 - datapoints_to_alarm = 3 - period = 3600 - statistic = "Average" - slack_webhook_url = var.slack_webhook_cpu_alerts - alarm_actions = [] - ok_actions = [] - }, - { - alarm_name_prefix = "low-gpu-composite" - alarm_description = "Scale down when GPU usage is light" - metric_name = "GPUUtilization" - namespace = "/aws/sagemaker/Endpoints" - comparison_operator = "GreaterThanOrEqualToThreshold" - threshold = 20 * 4 # TODO: we must manually multiply by GPU count as Normalized metric not available - evaluation_periods = 3 - datapoints_to_alarm = 3 - period = 3600 - statistic = "Average" - slack_webhook_url = var.slack_webhook_gpu_alerts - alarm_actions = [] - ok_actions = [] - } - ] - - alarm_composites = [ - { - alarm_name = "ElevatedCPUUtilizationNoBackLog" - alarm_description = "Triggered when CPU util is above idle and no backlog query exists for an extended time" - alarm_rule = "ALARM(elevated-cpu-composite-${module.mistral_7b_instruct_deployment.model_name}-endpoint) AND ALARM(backlog-composite-alarm-${module.mistral_7b_instruct_deployment.model_name}-endpoint)" - alarm_actions = [] - ok_actions = [] - slack_webhook_url = var.slack_webhook_backlog_alerts - emails = var.sagemaker_budget_emails - }, - { - alarm_name = "ElevatedGPUUtilizationNoBackLog" - alarm_description = "Triggered when GPU util is above idle and no backlog query exists for an extended time" - alarm_rule = "ALARM(low-gpu-composite-${module.mistral_7b_instruct_deployment.model_name}-endpoint) AND ALARM(backlog-composite-alarm-${module.mistral_7b_instruct_deployment.model_name}-endpoint)" - alarm_actions = [] - ok_actions = [] - slack_webhook_url = var.slack_webhook_backlog_alerts - emails = var.sagemaker_budget_emails - } - - ] + backlog_threshold_high = 1 + backlog_threshold_low = 1 + cpu_threshold_high = 80 * 48 # 48 vCPUs + cpu_threshold_low = 20 * 48 # 48 vCPUs + gpu_threshold_high = 80 * 4 # 4 GPUs + gpu_threshold_low = 20 * 4 # 4 GPUs + ram_threshold_high = 80 + ram_threshold_low = 20 + harddisk_threshold_high = 80 + harddisk_threshold_low = 20 + evaluation_periods_high = 1 + datapoints_to_alarm_high = 1 + evaluation_periods_low = 15 + datapoints_to_alarm_low = 15 # These variables do not change between LLMs source = "./modules/sagemaker_deployment"