WIP: Feature/composite alarm scaling #246

Closed
285 changes: 254 additions & 31 deletions infra/modules/sagemaker_deployment/cloudwatch.tf
@@ -1,44 +1,267 @@
resource "aws_cloudwatch_metric_alarm" "cloudwatch_alarm" {
count = length(var.alarms)
resource "aws_cloudwatch_composite_alarm" "scale_up_from_n_to_np1" {
alarm_name = "scale_up_from_n_to_np1"
alarm_description = "Where there exists a high backlog and a high state of any of CPU, GPU, RAM, HardDisk (i.e. live instances are insufficient for the tasks being performed)"

alarm_name = "${var.alarms[count.index].alarm_name_prefix}-${aws_sagemaker_endpoint.main.name}"
alarm_description = var.alarms[count.index].alarm_description
metric_name = var.alarms[count.index].metric_name
namespace = var.alarms[count.index].namespace
comparison_operator = var.alarms[count.index].comparison_operator
threshold = var.alarms[count.index].threshold
evaluation_periods = var.alarms[count.index].evaluation_periods
datapoints_to_alarm = var.alarms[count.index].datapoints_to_alarm
period = var.alarms[count.index].period
statistic = var.alarms[count.index].statistic
alarm_actions = concat(var.alarms[count.index].alarm_actions, [aws_sns_topic.alarmstate[count.index].arn])
ok_actions = concat(var.alarms[count.index].ok_actions, [aws_sns_topic.okstate[count.index].arn])
dimensions = (count.index == 0 || count.index == 1 || count.index == 2) ? { # TODO: this logic is brittle as it assumes "backlog" has index [0,1,2]; it would be better to have a logic that rests on the specific name of that metric
EndpointName = aws_sagemaker_endpoint.main.name # Only EndpointName is used in this case
} : {
EndpointName = aws_sagemaker_endpoint.main.name, # Both EndpointName and VariantName are used in all other cases
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name # Note this logic would not work if there were ever more than one production variant deployed for an LLM
}
alarm_actions = [aws_appautoscaling_policy.scale_up_from_n_to_np1.arn]
ok_actions = []

alarm_rule = "ALARM(${aws_cloudwatch_metric_alarm.backlog_high.alarm_name}) AND (ALARM(${aws_cloudwatch_metric_alarm.cpu_high.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.gpu_high.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.ram_high.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.harddisk_high.alarm_name}))"
}

resource "aws_cloudwatch_composite_alarm" "scale_up_from_0_to_1" {
alarm_name = "scale_up_from_0_to_1"
alarm_description = "Where there exists a high backlog and there exists a state of insufficient data for any of CPU, GPU, RAM, HardDisk (i.e. there are tasks to do but no instance is live to perform it)"

alarm_actions = [aws_appautoscaling_policy.scale_up_from_0_to_1.arn]
ok_actions = []

alarm_rule = "ALARM(${aws_cloudwatch_metric_alarm.backlog_high.alarm_name}) AND (INSUFFICIENT_DATA(${aws_cloudwatch_metric_alarm.cpu_high.alarm_name}) OR INSUFFICIENT_DATA(${aws_cloudwatch_metric_alarm.gpu_high.alarm_name}) OR INSUFFICIENT_DATA(${aws_cloudwatch_metric_alarm.ram_high.alarm_name}) OR INSUFFICIENT_DATA(${aws_cloudwatch_metric_alarm.harddisk_high.alarm_name}))"
}


resource "aws_cloudwatch_composite_alarm" "scale_down_from_n_to_nm1" {
alarm_name = "scale_down_from_n_to_nm1"
alarm_description = "Where there exists a high backlog and a low state of any of CPU, GPU, RAM, HardDisk (i.e. live instances are excessive for the current tasks)"

alarm_actions = [aws_appautoscaling_policy.scale_down_from_n_to_nm1.arn]
ok_actions = []

alarm_rule = "ALARM(${aws_cloudwatch_metric_alarm.backlog_high.alarm_name}) AND (ALARM(${aws_cloudwatch_metric_alarm.cpu_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.gpu_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.ram_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.harddisk_low.alarm_name}))"
}


resource "aws_cloudwatch_composite_alarm" "scale_down_from_n_to_0" {
alarm_name = "example-composite-alarm"
alarm_description = "Where there exists a low backlog and a low state of any of CPU, GPU, RAM, HardDisk (i.e. there is no task to come and live instances are excessive for any tasks currently in process)"

alarm_actions = [aws_appautoscaling_policy.scale_down_from_n_to_0.arn]
ok_actions = []

alarm_rule = "ALARM(${aws_cloudwatch_metric_alarm.backlog_low.alarm_name}) AND (ALARM(${aws_cloudwatch_metric_alarm.cpu_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.gpu_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.ram_low.alarm_name}) OR ALARM(${aws_cloudwatch_metric_alarm.harddisk_low.alarm_name}))"
}


resource "aws_cloudwatch_metric_alarm" "backlog_high" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-backlog-high"
alarm_description = "Alarm when in high Backlog Usage"
metric_name = "ApproximateBacklogSize"
namespace = "AWS/SageMaker"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = var.backlog_threshold_high
evaluation_periods = var.evaluation_periods_high
datapoints_to_alarm = var.datapoints_to_alarm_high
period = 60
statistic = "Maximum"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "backlog_low" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-backlog-low"
alarm_description = "Alarm when in low Backlog Usage"
metric_name = "ApproximateBacklogSize"
namespace = "AWS/SageMaker"
comparison_operator = "LessThanThreshold"
threshold = var.backlog_threshold_low
evaluation_periods = var.evaluation_periods_low
datapoints_to_alarm = var.datapoints_to_alarm_low
period = 60
statistic = "Maximum"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "cpu_high" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-cpu-high"
alarm_description = "Alarm when in high vCPU Usage"
metric_name = "CPUUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = var.cpu_threshold_high
evaluation_periods = var.evaluation_periods_high
datapoints_to_alarm = var.datapoints_to_alarm_high
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "cpu_low" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-cpu-low"
alarm_description = "Alarm when in low vCPU Usage"
metric_name = "CPUUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "LessThanOrEqualToThreshold"
threshold = var.cpu_threshold_low
evaluation_periods = var.evaluation_periods_low
datapoints_to_alarm = var.datapoints_to_alarm_low
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "gpu_high" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-gpu-high"
alarm_description = "Alarm when in high GPU Usage"
metric_name = "GPUUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = var.gpu_threshold_high
evaluation_periods = var.evaluation_periods_high
datapoints_to_alarm = var.datapoints_to_alarm_high
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "gpu_low" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-gpu-low"
alarm_description = "Alarm when in low GPU Usage"
metric_name = "GPUUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "LessThanOrEqualToThreshold"
threshold = var.gpu_threshold_low
evaluation_periods = var.evaluation_periods_low
datapoints_to_alarm = var.datapoints_to_alarm_low
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "ram_high" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-ram-high"
alarm_description = "Alarm when in high RAM Usage"
metric_name = "MemoryUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = var.ram_threshold_high
evaluation_periods = var.evaluation_periods_high
datapoints_to_alarm = var.datapoints_to_alarm_high
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "ram_low" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-ram-low"
alarm_description = "Alarm when in low RAM Usage"
metric_name = "MemoryUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "LessThanOrEqualToThreshold"
threshold = var.ram_threshold_low
evaluation_periods = var.evaluation_periods_low
datapoints_to_alarm = var.datapoints_to_alarm_low
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "null_resource" "wait_for_metric_alarms" {
# Aggregates metric alarm dependencies so composite alarms are created or deleted only after the metric alarms, preventing cyclic dependency issues.
depends_on = [aws_cloudwatch_metric_alarm.cloudwatch_alarm]
resource "aws_cloudwatch_metric_alarm" "harddisk_high" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-harddisk-high"
alarm_description = "Alarm when in high HardDisk Usage"
metric_name = "DiskUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = var.harddisk_threshold_high
evaluation_periods = var.evaluation_periods_high
datapoints_to_alarm = var.datapoints_to_alarm_high
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_composite_alarm" "composite_alarm" {
count = length(var.alarm_composites)
resource "aws_cloudwatch_metric_alarm" "harddisk_low" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-harddisk-low"
alarm_description = "Alarm when in low RAM Usage"
metric_name = "MemoryUtilization"
namespace = "/aws/sagemaker/Endpoints"
comparison_operator = "LessThanOrEqualToThreshold"
threshold = var.ram_threshold_low
evaluation_periods = var.evaluation_periods_low
datapoints_to_alarm = var.datapoints_to_alarm_low
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

alarm_name = "${var.alarm_composites[count.index].alarm_name}-${aws_sagemaker_endpoint.main.name}"
alarm_description = var.alarm_composites[count.index].alarm_description
alarm_rule = var.alarm_composites[count.index].alarm_rule
alarm_actions = concat(var.alarm_composites[count.index].alarm_actions, [aws_sns_topic.alarm_composite_notifications[count.index].arn], [aws_sns_topic.composite_alarmstate[count.index].arn])
ok_actions = var.alarm_composites[count.index].ok_actions
depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarm_composite_notifications, aws_sns_topic.composite_alarmstate, null_resource.wait_for_metric_alarms]

resource "aws_cloudwatch_metric_alarm" "unauthorized_operations" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-unauthorized-operations"
alarm_description = "Alarm when unauthorized operations are detected in the CloudTrail Logs"
metric_name = "UnauthorizedOperationsCount"
namespace = "CloudTrailMetrics"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = 1
evaluation_periods = 1
datapoints_to_alarm = 1
period = 60
statistic = "Maximum"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}


resource "aws_cloudwatch_metric_alarm" "errors_4xx" {

alarm_name = "${aws_sagemaker_endpoint.main.name}-errors-4XX"
alarm_description = "4XX errors are detected in the CloudTrail Logs"
metric_name = "Invocation4XXErrors"
namespace = "AWS/SageMaker"
comparison_operator = "GreaterThanOrEqualToThreshold"
threshold = 1
evaluation_periods = 1
datapoints_to_alarm = 1
period = 60
statistic = "Average"
dimensions = { EndpointName = aws_sagemaker_endpoint.main.name,
VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name }

depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.alarmstate, aws_sns_topic.okstate]
}
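
A note on the scaling side: the alarm_actions of the composite alarms above reference aws_appautoscaling_policy resources (scale_up_from_n_to_np1, scale_up_from_0_to_1, scale_down_from_n_to_nm1, scale_down_from_n_to_0) that are not included in this diff. As a rough sketch, one such policy could be a StepScaling policy attached to a SageMaker variant scalable target, along the lines below; the resource names, capacity bounds, cooldown, and adjustment values are illustrative assumptions rather than code from this PR.

# Hypothetical sketch of the step-scaling target and policy a composite alarm invokes.
# Names, capacities, cooldown, and adjustments are assumptions, not taken from this PR.
resource "aws_appautoscaling_target" "sagemaker_variant" {
  max_capacity       = 4
  min_capacity       = 0
  resource_id        = "endpoint/${aws_sagemaker_endpoint.main.name}/variant/${aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name}"
  scalable_dimension = "sagemaker:variant:DesiredInstanceCount"
  service_namespace  = "sagemaker"
}

resource "aws_appautoscaling_policy" "scale_up_from_n_to_np1" {
  name               = "scale_up_from_n_to_np1"
  policy_type        = "StepScaling"
  resource_id        = aws_appautoscaling_target.sagemaker_variant.resource_id
  scalable_dimension = aws_appautoscaling_target.sagemaker_variant.scalable_dimension
  service_namespace  = aws_appautoscaling_target.sagemaker_variant.service_namespace

  step_scaling_policy_configuration {
    adjustment_type         = "ChangeInCapacity"
    cooldown                = 300
    metric_aggregation_type = "Maximum"

    # Add one instance each time the composite alarm fires.
    step_adjustment {
      metric_interval_lower_bound = 0
      scaling_adjustment          = 1
    }
  }
}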
29 changes: 5 additions & 24 deletions infra/modules/sagemaker_deployment/lambda.tf
@@ -14,45 +14,26 @@ resource "aws_lambda_function" "slack_alert_function" {
runtime = "python3.12"
timeout = 30

environment {
variables = {
SNS_TO_WEBHOOK_JSON = jsonencode(local.sns_to_webhook_mapping),
ADDRESS = "arn:aws:sns:eu-west-2:${var.aws_account_id}:"
}
}
}


resource "aws_lambda_permission" "allow_sns_composite" {
count = length(var.alarm_composites)
resource "aws_lambda_permission" "allow_sns_okstate" {

statement_id = "AllowSNS-composite-${count.index}"
statement_id = "AllowSNS-ok"
action = "lambda:InvokeFunction"
function_name = aws_lambda_function.slack_alert_function.function_name
principal = "sns.amazonaws.com"
source_arn = aws_sns_topic.composite_alarmstate[count.index].arn
source_arn = aws_sns_topic.okstate.arn
}


resource "aws_lambda_permission" "allow_sns_alarmstate" {
count = length(var.alarms)

statement_id = "AllowSNS-alarm-${count.index}"
action = "lambda:InvokeFunction"
function_name = aws_lambda_function.slack_alert_function.function_name
principal = "sns.amazonaws.com"
source_arn = aws_sns_topic.alarmstate[count.index].arn
}


resource "aws_lambda_permission" "allow_sns_okstate" {
count = length(var.alarms)

statement_id = "AllowSNS-ok-${count.index}"
statement_id = "AllowSNS-alarm"
action = "lambda:InvokeFunction"
function_name = aws_lambda_function.slack_alert_function.function_name
principal = "sns.amazonaws.com"
source_arn = aws_sns_topic.okstate[count.index].arn
source_arn = aws_sns_topic.alarmstate.arn
}
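
For context, the two permissions above assume single alarmstate and okstate topics (no per-alarm indexing). The subscriptions that would actually route those topics to the Slack Lambda are not shown in this diff; a minimal sketch, assuming the topic and function names used above:

# Hypothetical sketch of the SNS-to-Lambda wiring implied by the permissions above;
# the subscription resource names are assumptions and do not appear in this PR.
resource "aws_sns_topic_subscription" "alarmstate_to_slack" {
  topic_arn = aws_sns_topic.alarmstate.arn
  protocol  = "lambda"
  endpoint  = aws_lambda_function.slack_alert_function.arn
}

resource "aws_sns_topic_subscription" "okstate_to_slack" {
  topic_arn = aws_sns_topic.okstate.arn
  protocol  = "lambda"
  endpoint  = aws_lambda_function.slack_alert_function.arn
}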

