diff --git a/.github/workflows/lint-terraform.yml b/.github/workflows/lint-terraform-python.yml similarity index 77% rename from .github/workflows/lint-terraform.yml rename to .github/workflows/lint-terraform-python.yml index 9a23c54..1ac9bfc 100644 --- a/.github/workflows/lint-terraform.yml +++ b/.github/workflows/lint-terraform-python.yml @@ -2,17 +2,18 @@ name: Lint Terraform on: push: - branches: [ "main" ] + branches: [ "main", "feat/sagemaker-llms" ] pull_request: - branches: [ "main" ] + branches: [ "main", "feat/sagemaker-llms" ] jobs: - test: + lint: name: Lint Terraform runs-on: ubuntu-20.04 steps: - name: "Checkout" uses: "actions/checkout@v4" + - name: "Install Terraform" # From https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli, # but with the addition of programatically verifying the package signing key, which was @@ -39,6 +40,20 @@ jobs: sudo tee /etc/apt/sources.list.d/hashicorp.list sudo apt update sudo apt-get install terraform - - name: "Run linting" + + - name: Set up Python 3.12 + uses: actions/setup-python@v3 + with: + python-version: "3.12" + + - name: Install uv + uses: astral-sh/setup-uv@v3 + + - name: Run linting on terraform run: | terraform fmt -check -recursive -diff + + - name: Run linting on Python code for lambda functions + run: | + cd infra/modules/ + uv run ruff check . diff --git a/infra/ecr.tf b/infra/ecr.tf index 6c9c0de..5550a3c 100644 --- a/infra/ecr.tf +++ b/infra/ecr.tf @@ -249,6 +249,11 @@ data "aws_ecr_lifecycle_policy_document" "expire_untagged_after_one_day" { } } +resource "aws_ecr_repository" "sagemaker" { + count = var.sagemaker_on ? 1 : 0 + name = "${var.prefix}-sagemaker" +} + data "aws_ecr_lifecycle_policy_document" "expire_preview_and_untagged_after_one_day" { # Match *--prod images, but expire them in 1000 years... rule { diff --git a/infra/ecs_notebooks_notebook.tf b/infra/ecs_notebooks_notebook.tf index 1553502..c9dbbce 100644 --- a/infra/ecs_notebooks_notebook.tf +++ b/infra/ecs_notebooks_notebook.tf @@ -111,6 +111,8 @@ resource "aws_iam_policy" "notebook_task_execution" { } data "aws_iam_policy_document" "notebook_task_execution" { + + statement { actions = [ "logs:CreateLogStream", @@ -122,6 +124,41 @@ data "aws_iam_policy_document" "notebook_task_execution" { ] } + dynamic "statement" { + + for_each = var.sagemaker_on ? [1] : [] + + content { + actions = [ + "sagemaker:DescribeEndpoint", + "sagemaker:DescribeEndpointConfig", + "sagemaker:DescribeModel", + "sagemaker:InvokeEndpointAsync", + "sagemaker:ListEndpoints", + "sagemaker:ListEndpointConfigs", + "sagemaker:ListModels", + ] + + resources = [ + "*", + ] + } + } + + dynamic "statement" { + + for_each = var.sagemaker_on ? [1] : [] + + content { + actions = [ + "ec2:*VpcEndpoint*" + ] + resources = [ + "*", + ] + } + } + statement { actions = [ "ecr:GetAuthorizationToken", @@ -234,6 +271,41 @@ data "aws_iam_policy_document" "notebook_s3_access_template" { "${aws_efs_file_system.notebooks.arn}", ] } + + dynamic "statement" { + + for_each = var.sagemaker_on ? [1] : [] + + content { + actions = [ + "sagemaker:DescribeEndpoint", + "sagemaker:DescribeEndpointConfig", + "sagemaker:DescribeModel", + "sagemaker:InvokeEndpointAsync", + "sagemaker:ListEndpoints", + "sagemaker:ListEndpointConfigs", + "sagemaker:ListModels", + ] + + resources = [ + "*", + ] + } + } + + dynamic "statement" { + + for_each = var.sagemaker_on ? 
[1] : [] + + content { + actions = [ + "ec2:*VpcEndpoint*" + ] + resources = [ + "*", + ] + } + } } resource "aws_vpc_endpoint" "s3" { @@ -345,6 +417,34 @@ data "aws_iam_policy_document" "aws_vpc_endpoint_s3_notebooks" { "arn:aws:s3:::amazonlinux.*.amazonaws.com/*", ] } + + dynamic "statement" { + + for_each = var.sagemaker_on ? [1] : [] + + content { + + principals { + type = "AWS" + identifiers = ["*"] + } + + actions = [ + "s3:ListBucket", + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:GetBucketLocation", + ] + + resources = [ + "arn:aws:s3:::jumpstart-cache-prod-eu-west-2/*", + "arn:aws:s3:::jumpstart-private-cache-prod-eu-west-2/*", + "arn:aws:s3:::jumpstart-cache-prod-eu-west-2", + "arn:aws:s3:::jumpstart-private-cache-prod-eu-west-2", + ] + } + } } resource "aws_iam_policy" "notebook_task_boundary" { @@ -375,6 +475,28 @@ data "aws_iam_policy_document" "jupyterhub_notebook_task_boundary" { ] } + # Allow all tools users to access SageMaker endpoints + dynamic "statement" { + + for_each = var.sagemaker_on ? [1] : [] + + content { + actions = [ + "sagemaker:DescribeEndpoint", + "sagemaker:DescribeEndpointConfig", + "sagemaker:DescribeModel", + "sagemaker:InvokeEndpointAsync", + "sagemaker:ListEndpoints", + "sagemaker:ListEndpointConfigs", + "sagemaker:ListModels", + ] + + resources = [ + "*", + ] + } + } + statement { actions = [ "s3:ListBucket", diff --git a/infra/main.tf b/infra/main.tf index 0565c84..7bc8bec 100644 --- a/infra/main.tf +++ b/infra/main.tf @@ -41,6 +41,8 @@ variable "subnets_num_bits" {} variable "vpc_notebooks_cidr" {} variable "vpc_notebooks_subnets_num_bits" {} variable "vpc_datasets_cidr" {} +variable "vpc_sagemaker_cidr" {} +variable "vpc_sagemaker_subnets_num_bits" {} variable "aws_route53_zone" {} variable "admin_domain" {} @@ -48,9 +50,7 @@ variable "appstream_domain" {} variable "support_domain" {} variable "admin_db_instance_class" {} -variable "admin_db_instance_version" { - default = "10.15" -} +variable "admin_db_instance_version" {} variable "admin_db_instance_allocated_storage" { type = number default = 200 @@ -274,38 +274,94 @@ variable "s3_prefixes_for_external_role_copy" { default = ["import-data", "export-data"] } +variable "sagemaker_example_inference_image" { default = "" } + +variable "sagemaker_models_folder" { default = "" } +variable "hugging_face_model_image" { default = "" } +variable "sagemaker_default_bucket" { default = "" } +variable "teams_webhook_url" { default = "" } +variable "sagemaker_budget_emails" { default = [""] } +variable "slack_webhook_resource_alerts" { default = [""] } +variable "slack_webhook_cpu_alerts" { default = [""] } +variable "slack_webhook_gpu_alerts" { default = [""] } +variable "slack_webhook_security_alerts" { default = [""] } +variable "slack_webhook_backlog_alerts" { default = [""] } + +variable "sagemaker_on" { + type = bool + default = false +} + +variable "sagemaker_gpt_neo_125m" { + type = bool + default = false +} + +variable "sagemaker_flan_t5_780m" { + type = bool + default = false +} + +variable "sagemaker_phi_2_3b" { + type = bool + default = false +} + +variable "sagemaker_llama_3_3b" { + type = bool + default = false +} + +variable "sagemaker_llama_3_3b_instruct" { + type = bool + default = false +} + +variable "sagemaker_mistral_7b_instruct" { + type = bool + default = false +} + variable "matchbox_on" { type = bool default = false } + variable "matchbox_dev_mode_on" { type = bool default = false } + variable "vpc_matchbox_cidr" { type = string default = "" } + variable 
"matchbox_instances" { type = list(string) default = [] } + variable "matchbox_instances_long" { type = list(string) default = [] } + variable "matchbox_db_instance_class" { type = string default = "" } + variable "vpc_matchbox_subnets_num_bits" { type = string default = "" } + variable "matchbox_s3_cache" { type = string default = "" } + variable "matchbox_s3_dev_artefacts" { type = string default = "" diff --git a/infra/modules/.bumpversion.toml b/infra/modules/.bumpversion.toml new file mode 100644 index 0000000..95bc062 --- /dev/null +++ b/infra/modules/.bumpversion.toml @@ -0,0 +1,21 @@ +[tool.bumpversion] +current_version = "0.1.19" +parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)" +serialize = ["{major}.{minor}.{patch}"] +search = "{current_version}" +replace = "{new_version}" +regex = false +ignore_missing_version = false +ignore_missing_files = false +tag = false +sign_tags = false +tag_name = "v{new_version}" +tag_message = "Bump version: {current_version} → {new_version}" +allow_dirty = false +commit = false +message = "Bump version: {current_version} → {new_version}" +moveable_tags = [] +commit_args = "" +setup_hooks = [] +pre_commit_hooks = [] +post_commit_hooks = [] diff --git a/infra/modules/Makefile b/infra/modules/Makefile new file mode 100644 index 0000000..39f0741 --- /dev/null +++ b/infra/modules/Makefile @@ -0,0 +1,26 @@ +.DEFAULT_GOAL := help +SHELL := /bin/bash + +.PHONY: help +help: ## Show all available commands + @awk 'BEGIN {FS = ":.*##"; printf "Usage: make \033[36m\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-13s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST); + +.PHONY: bump +bump: format # You must install uv https://docs.astral.sh/uv/getting-started/installation/ + @echo "You must have committed your changes before running this" + @uv run bump-my-version bump patch; + @echo "Upgraded version to $(shell bump-my-version show current_version)" + @git add .bumpversion.toml + @git commit -m "Upgraded version to $(shell bump-my-version show current_version)" + +.PHONY: format +format: # You must install terraform https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli + @echo "Enforcing formatting" + @terraform fmt -recursive; + @uv run ruff format .; + @uv run ruff check --fix-only --unsafe-fixes .; + +.PHONY: quality +quality: + @uv run ruff check .; + @terraform fmt -check -recursive -diff; diff --git a/infra/modules/README.md b/infra/modules/README.md new file mode 100644 index 0000000..9c55048 --- /dev/null +++ b/infra/modules/README.md @@ -0,0 +1,25 @@ +### Modules + +Modules folders allows us to separate code rather than the flat structure utilised in the rest of this repository. Some standards are set here which it is suggested could be applied to the rest of the repository also. + + +### Makefile + +The Makefile allows for the storing of complex commands with a simple and easy-to-remember interface - + +``` +make format +``` +will run required formatting with terraform and python code and + +``` +make bump +``` +will bump the version number (patch level). + + + + +### Required software + +You must install [terraform](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli) and [uv](https://docs.astral.sh/uv/getting-started/installation/). 
diff --git a/infra/modules/cost_monitoring/budgets/main.tf b/infra/modules/cost_monitoring/budgets/main.tf new file mode 100644 index 0000000..28ac935 --- /dev/null +++ b/infra/modules/cost_monitoring/budgets/main.tf @@ -0,0 +1,36 @@ + +resource "aws_budgets_budget" "monthly_cost_budget" { + name = "${var.budget_name}-monthly-cost-budget" + budget_type = "COST" + limit_amount = var.budget_limit + limit_unit = "USD" + time_unit = "MONTHLY" + + cost_filter { + values = [var.cost_filter_service] + name = "Service" + } + + notification { + notification_type = "ACTUAL" + threshold_type = "PERCENTAGE" + comparison_operator = "GREATER_THAN" + threshold = 80 + + subscriber_email_addresses = var.notification_email # Secrets to be passed + subscriber_sns_topic_arns = [var.sns_topic_arn] + } + + notification { + notification_type = "ACTUAL" + threshold_type = "PERCENTAGE" + comparison_operator = "GREATER_THAN" + threshold = 100 + + subscriber_email_addresses = var.notification_email # Secrets to be passed + subscriber_sns_topic_arns = [var.sns_topic_arn] + } +} + + + diff --git a/infra/modules/cost_monitoring/budgets/output.tf b/infra/modules/cost_monitoring/budgets/output.tf new file mode 100644 index 0000000..37306e2 --- /dev/null +++ b/infra/modules/cost_monitoring/budgets/output.tf @@ -0,0 +1,3 @@ +output "budget_name" { + value = aws_budgets_budget.monthly_cost_budget.name +} \ No newline at end of file diff --git a/infra/modules/cost_monitoring/budgets/variables.tf b/infra/modules/cost_monitoring/budgets/variables.tf new file mode 100644 index 0000000..aabdfb0 --- /dev/null +++ b/infra/modules/cost_monitoring/budgets/variables.tf @@ -0,0 +1,39 @@ +variable "budget_name" { + type = string + description = "AWS Budget name" +} + +variable "budget_limit" { + type = string + default = null + description = "Optional monthly cost limit in USD for the budget" +} + +variable "time_unit" { + description = "Budget time unit, e.g. 
Monthly, etc" + type = string + default = "MONTHLY" +} + +variable "notification_thresholds" { + type = list(number) + default = [80, 100] + description = "list of notification thresholds in %" +} + +variable "notification_email" { + type = list(string) + description = "email for who recieves budget alerts" +} + +variable "sns_topic_arn" { + type = string + description = "ARN of SNS topic for budget alerts" + +} + +variable "cost_filter_service" { + type = string + description = "service to apply cost filter on" + default = "Amazon SageMaker" +} diff --git a/infra/modules/cost_monitoring/cloudwatch_dashboard/main.tf b/infra/modules/cost_monitoring/cloudwatch_dashboard/main.tf new file mode 100644 index 0000000..23268a8 --- /dev/null +++ b/infra/modules/cost_monitoring/cloudwatch_dashboard/main.tf @@ -0,0 +1,60 @@ +resource "aws_cloudwatch_dashboard" "cost_dashboard" { + dashboard_name = var.dashboard_name + + dashboard_body = jsonencode({ + widgets = [ + { + "type" : "metric", + "x" : 0, + "y" : 0, + "width" : 12, + "height" : 6, + "properties" : { + "metrics" : [ + ["AWS/Billing", "EstimatedCharges", "Currency", "USD"] + ], + "period" : 86400, + "stat" : "Maximum", + "region" : "us-east-1", + "title" : "Monthly AWS Costs" + } + }, + { + "type" : "metric", + "x" : 0, + "y" : 7, + "width" : 12, + "height" : 6, + "properties" : { + "metrics" : [ + ["AWS/Billing", "EstimatedCharges", "ServiceName", "AmazonSageMaker", "Currency", "USD"], + ["AWS/Billing", "EstimatedCharges", "ServiceName", "AmazonEC2", "Currency", "USD"], + ["AWS/Billing", "EstimatedCharges", "ServiceName", "AmazonS3", "Currency", "USD"] + ], + "period" : 86400, + "stat" : "Maximum", + "region" : "us-east-1", + "title" : "Service-Level Costs (SageMaker, EC2, S3)" + } + }, + { + "type" : "metric", + "x" : 13, + "y" : 0, + "width" : 12, + "height" : 6, + "properties" : { + "metrics" : [ + ["AWS/Billing", "EstimatedCharges", "Currency", "USD", { "stat" : "Average" }] + ], + "period" : 3600, + "stat" : "Average", + "region" : "us-east-1", + "title" : "Hourly AWS Costs" + } + } + ] + }) +} + + diff --git a/infra/modules/cost_monitoring/cloudwatch_dashboard/output.tf b/infra/modules/cost_monitoring/cloudwatch_dashboard/output.tf new file mode 100644 index 0000000..e69de29 diff --git a/infra/modules/cost_monitoring/cloudwatch_dashboard/variables.tf b/infra/modules/cost_monitoring/cloudwatch_dashboard/variables.tf new file mode 100644 index 0000000..fdb5750 --- /dev/null +++ b/infra/modules/cost_monitoring/cloudwatch_dashboard/variables.tf @@ -0,0 +1,10 @@ +variable "dashboard_name" { + description = "Name of the CloudWatch dashboard" + type = string +} + +variable "services_to_monitor" { + description = "List of AWS services to monitor costs from" + type = list(string) + default = ["AmazonSageMaker", "AmazonEC2", "AmazonS3"] +} \ No newline at end of file diff --git a/infra/modules/cost_monitoring/sns/main.tf b/infra/modules/cost_monitoring/sns/main.tf new file mode 100644 index 0000000..66c7260 --- /dev/null +++ b/infra/modules/cost_monitoring/sns/main.tf @@ -0,0 +1,18 @@ +resource "aws_sns_topic" "budget_alert_topic" { + name = "${var.prefix}-budget-alert-topic" + policy = data.aws_iam_policy_document.budget_publish_policy.json +} + +data "aws_iam_policy_document" "budget_publish_policy" { + statement { + actions = ["SNS:Publish"] + effect = "Allow" + principals { + type = "Service" + identifiers = ["budgets.amazonaws.com"] + } + resources = [ + "arn:aws:sns:eu-west-2:${var.account_id}:${var.prefix}-budget-alert-topic" + ] + } +} 
diff --git a/infra/modules/cost_monitoring/sns/outputs.tf b/infra/modules/cost_monitoring/sns/outputs.tf new file mode 100644 index 0000000..6419b2e --- /dev/null +++ b/infra/modules/cost_monitoring/sns/outputs.tf @@ -0,0 +1,3 @@ +output "sns_topic_arn" { + value = aws_sns_topic.budget_alert_topic.arn +} diff --git a/infra/modules/cost_monitoring/sns/variables.tf b/infra/modules/cost_monitoring/sns/variables.tf new file mode 100644 index 0000000..b316fe0 --- /dev/null +++ b/infra/modules/cost_monitoring/sns/variables.tf @@ -0,0 +1,9 @@ +variable "prefix" { + type = string + description = "Prefix for SNS topic name" +} + +variable "account_id" { + type = string + description = "account ID for the SNS topic" +} diff --git a/infra/modules/pyproject.toml b/infra/modules/pyproject.toml new file mode 100644 index 0000000..621fafc --- /dev/null +++ b/infra/modules/pyproject.toml @@ -0,0 +1,17 @@ +[project] +name = "lambda_functions" +version = "0.1.0" +requires-python = ">=3.12" +dependencies = ["boto3"] # strictly speaking boto3-stubs but this may not be installed in the lambda + +[tool.uv] +dev-dependencies = ["bump-my-version", "ruff"] + +[tool.mypy] +pretty = true +strict = true +show_error_codes = true +warn_unreachable = true + +[tool.ruff.lint] +select = ["A", "B", "BLE", "E", "ERA", "F", "I", "RUF", "T", "T10", "T20", "W"] diff --git a/infra/modules/sagemaker_deployment/cloudwatch_alarms.tf b/infra/modules/sagemaker_deployment/cloudwatch_alarms.tf new file mode 100644 index 0000000..6a0ca71 --- /dev/null +++ b/infra/modules/sagemaker_deployment/cloudwatch_alarms.tf @@ -0,0 +1,394 @@ + +resource "aws_cloudwatch_metric_alarm" "scale_up_from_0_to_1" { + + alarm_name = "${aws_sagemaker_endpoint.main.name}-scale-up-from-0-to-1" + alarm_description = "Where there exists a high backlog and there exists a state of insufficient data for any of CPU, GPU, RAM (i.e. 
there are tasks to do but no instance is live to perform it)" + evaluation_periods = var.evaluation_periods_high + datapoints_to_alarm = var.datapoints_to_alarm_high + comparison_operator = "GreaterThanOrEqualToThreshold" + threshold = 0.5 # boolean comparison operator does not exist so this uses TRUE=1 and FALSE=0 instead + alarm_actions = [aws_appautoscaling_policy.scale_up_from_0_to_1.arn, aws_sns_topic.scale_up_from_0_to_1.arn] + ok_actions = [aws_sns_topic.scale_up_from_0_to_1.arn] + + metric_query { + id = "result" + expression = "ABS(backlog>=${var.backlog_threshold_high}) AND (FILL(cpu, 0)==0 OR FILL(gpu,0)==0 OR FILL(ram,0)==0)" + return_data = "true" + period = 60 + + } + + metric_query { + id = "backlog" + + metric { + metric_name = "ApproximateBacklogSize" + namespace = "AWS/SageMaker" + period = 60 + stat = "Maximum" + unit = "Count" + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name + } + } + } + + metric_query { + id = "cpu" + + metric { + metric_name = "CPUUtilization" + namespace = "/aws/sagemaker/Endpoints" + period = 60 + stat = "Average" + unit = "Percent" # NOTE: 100% for each vCPU available + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name + } + } + } + metric_query { + id = "gpu" + + metric { + metric_name = "GPUUtilization" + namespace = "/aws/sagemaker/Endpoints" + period = 60 + stat = "Average" + unit = "Percent" # NOTE: 100% for each GPU available + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name + } + } + } + metric_query { + id = "ram" + + metric { + metric_name = "MemoryUtilization" + namespace = "/aws/sagemaker/Endpoints" + period = 60 + stat = "Average" + unit = "Percent" # NOTE: 100% is total in this case + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name + } + } + } + + depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.scale_up_from_0_to_1] +} + + +resource "aws_cloudwatch_metric_alarm" "scale_down_from_n_to_nm1" { + + alarm_name = "${aws_sagemaker_endpoint.main.name}-scale-down-from-n-to-nm1" + alarm_description = "Where there exists a high backlog and a low state of any of CPU, GPU, RAM (i.e. 
live instances are excessive for the current tasks)" + evaluation_periods = var.evaluation_periods_low + datapoints_to_alarm = var.datapoints_to_alarm_low + comparison_operator = "GreaterThanOrEqualToThreshold" + threshold = 0.5 # boolean comparison operator does not exist so this uses TRUE=1 and FALSE=0 instead + alarm_actions = [aws_appautoscaling_policy.scale_down_from_n_to_nm1.arn, aws_sns_topic.scale_down_from_n_to_nm1.arn] + ok_actions = [aws_sns_topic.scale_down_from_n_to_nm1.arn] + + metric_query { + id = "result" + expression = "ABS(backlog>=${var.backlog_threshold_high} AND (cpu<=${var.cpu_threshold_low} OR gpu<=${var.gpu_threshold_low} OR ram<=${var.ram_threshold_low}))" + return_data = "true" + period = 60 + + } + + metric_query { + id = "backlog" + + metric { + metric_name = "ApproximateBacklogSize" + namespace = "AWS/SageMaker" + period = 60 + stat = "Maximum" + unit = "Count" + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name + } + } + } + + metric_query { + id = "cpu" + + metric { + metric_name = "CPUUtilization" + namespace = "/aws/sagemaker/Endpoints" + period = 60 + stat = "Average" + unit = "Percent" # NOTE: 100% for each vCPU available + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name + } + } + } + metric_query { + id = "gpu" + + metric { + metric_name = "GPUUtilization" + namespace = "/aws/sagemaker/Endpoints" + period = 60 + stat = "Average" + unit = "Percent" # NOTE: 100% for each GPU available + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name + } + } + } + metric_query { + id = "ram" + + metric { + metric_name = "MemoryUtilization" + namespace = "/aws/sagemaker/Endpoints" + period = 60 + stat = "Average" + unit = "Percent" # NOTE: 100% is total in this case + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name + } + } + } + + depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.scale_down_from_n_to_nm1] +} + + + +resource "aws_cloudwatch_metric_alarm" "scale_down_from_n_to_0" { + + alarm_name = "${aws_sagemaker_endpoint.main.name}-scale-down-from-n-to-0" + alarm_description = "Where there exists a low backlog and a low state of any of CPU, GPU, RAM (i.e. 
there is no task to come and live instances are excessive for any tasks currently in process)" + evaluation_periods = var.evaluation_periods_low + datapoints_to_alarm = var.datapoints_to_alarm_low + comparison_operator = "GreaterThanOrEqualToThreshold" + threshold = 0.5 # boolean comparison operator does not exist so this uses TRUE=1 and FALSE=0 instead + alarm_actions = [aws_appautoscaling_policy.scale_down_from_n_to_0.arn, aws_sns_topic.scale_down_from_n_to_0.arn] + ok_actions = [aws_sns_topic.scale_down_from_n_to_0.arn] + + metric_query { + id = "result" + expression = "ABS(backlog<${var.backlog_threshold_low} AND (cpu<=${var.cpu_threshold_low} OR gpu<=${var.gpu_threshold_low} OR ram<=${var.ram_threshold_low}))" + return_data = "true" + period = 60 + + } + + metric_query { + id = "backlog" + + metric { + metric_name = "ApproximateBacklogSize" + namespace = "AWS/SageMaker" + period = 60 + stat = "Maximum" + unit = "Count" + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name + } + } + } + + metric_query { + id = "cpu" + + metric { + metric_name = "CPUUtilization" + namespace = "/aws/sagemaker/Endpoints" + period = 60 + stat = "Average" + unit = "Percent" # NOTE: 100% for each vCPU available + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name + } + } + } + metric_query { + id = "gpu" + + metric { + metric_name = "GPUUtilization" + namespace = "/aws/sagemaker/Endpoints" + period = 60 + stat = "Average" + unit = "Percent" # NOTE: 100% for each GPU available + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name + } + } + } + metric_query { + id = "ram" + + metric { + metric_name = "MemoryUtilization" + namespace = "/aws/sagemaker/Endpoints" + period = 60 + stat = "Average" + unit = "Percent" # NOTE: 100% is total in this case + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name + } + } + } + depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.scale_down_from_n_to_0] +} + + + +resource "aws_cloudwatch_metric_alarm" "scale_up_from_n_to_np1" { + + alarm_name = "${aws_sagemaker_endpoint.main.name}-scale-up-from-n-to-np1" + alarm_description = "Where there exists a high backlog and a high state of any of CPU, GPU, RAM (i.e. 
live instances are insufficient for the tasks being performed)" + evaluation_periods = var.evaluation_periods_high + datapoints_to_alarm = var.datapoints_to_alarm_high + comparison_operator = "GreaterThanOrEqualToThreshold" + threshold = 0.5 # boolean comparison operator does not exist so this uses TRUE=1 and FALSE=0 instead + alarm_actions = [aws_appautoscaling_policy.scale_up_from_n_to_np1.arn, aws_sns_topic.scale_up_from_n_to_np1.arn] + ok_actions = [aws_sns_topic.scale_up_from_n_to_np1.arn] + + metric_query { + id = "result" + expression = "ABS(backlog>=${var.backlog_threshold_high} AND (cpu>=${var.cpu_threshold_high} OR gpu>=${var.gpu_threshold_high} OR ram>=${var.ram_threshold_high}))" + return_data = "true" + period = 60 + + } + + metric_query { + id = "backlog" + + metric { + metric_name = "ApproximateBacklogSize" + namespace = "AWS/SageMaker" + period = 60 + stat = "Maximum" + unit = "Count" + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name + } + } + } + + metric_query { + id = "cpu" + + metric { + metric_name = "CPUUtilization" + namespace = "/aws/sagemaker/Endpoints" + period = 60 + stat = "Average" + unit = "Percent" # NOTE: 100% for each vCPU available + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name + } + } + } + metric_query { + id = "gpu" + + metric { + metric_name = "GPUUtilization" + namespace = "/aws/sagemaker/Endpoints" + period = 60 + stat = "Average" + unit = "Percent" # NOTE: 100% for each GPU available + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name + } + } + } + metric_query { + id = "ram" + + metric { + metric_name = "MemoryUtilization" + namespace = "/aws/sagemaker/Endpoints" + period = 60 + stat = "Average" + unit = "Percent" # NOTE: 100% is total in this case + + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name + } + } + } + + depends_on = [aws_sagemaker_endpoint.main, aws_sns_topic.scale_up_from_n_to_np1] +} + + +resource "aws_cloudwatch_metric_alarm" "unauthorized_operations" { + + alarm_name = "${aws_sagemaker_endpoint.main.name}-unauthorized-operations" + alarm_description = "Alarm when unauthorized operations are detected in the CloudTrail Logs" + metric_name = "UnauthorizedOperationsCount" + namespace = "CloudTrailMetrics" + comparison_operator = "GreaterThanOrEqualToThreshold" + threshold = 1 + evaluation_periods = 1 + datapoints_to_alarm = 1 + period = 60 + statistic = "Maximum" + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name + } + + depends_on = [aws_sagemaker_endpoint.main] +} + + +resource "aws_cloudwatch_metric_alarm" "errors_4xx" { + + alarm_name = "${aws_sagemaker_endpoint.main.name}-errors-4XX" + alarm_description = "4XX errors are detected in the CloudTrail Logs" + metric_name = "Invocation4XXErrors" + namespace = "AWS/SageMaker" + comparison_operator = "GreaterThanOrEqualToThreshold" + threshold = 1 + evaluation_periods = 1 + datapoints_to_alarm = 1 + period = 60 + statistic = "Average" + dimensions = { + EndpointName = aws_sagemaker_endpoint.main.name, + VariantName = aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name + } + + depends_on = 
[aws_sagemaker_endpoint.main] +} diff --git a/infra/modules/sagemaker_deployment/lambda.tf b/infra/modules/sagemaker_deployment/lambda.tf new file mode 100644 index 0000000..c91fbbb --- /dev/null +++ b/infra/modules/sagemaker_deployment/lambda.tf @@ -0,0 +1,124 @@ +data "archive_file" "lambda_payload" { + type = "zip" + source_file = "${path.module}/lambda_function/sns_to_microsoft_teams.py" + output_path = "${path.module}/lambda_function/payload.zip" +} + + +resource "aws_sns_topic_subscription" "sns_lambda_subscription_scale_up_from_0_to_1" { + + topic_arn = aws_sns_topic.scale_up_from_0_to_1.arn + protocol = "lambda" + endpoint = aws_lambda_function.teams_alert.arn +} + + +resource "aws_sns_topic_subscription" "sns_lambda_subscription_scale_up_from_n_to_np1" { + + topic_arn = aws_sns_topic.scale_up_from_n_to_np1.arn + protocol = "lambda" + endpoint = aws_lambda_function.teams_alert.arn +} + + +resource "aws_sns_topic_subscription" "sns_lambda_subscription_scale_down_from_n_to_nm1" { + + topic_arn = aws_sns_topic.scale_down_from_n_to_nm1.arn + protocol = "lambda" + endpoint = aws_lambda_function.teams_alert.arn +} + + +resource "aws_sns_topic_subscription" "sns_lambda_subscription_scale_down_from_n_to_0" { + + topic_arn = aws_sns_topic.scale_down_from_n_to_0.arn + protocol = "lambda" + endpoint = aws_lambda_function.teams_alert.arn +} + + +resource "aws_lambda_function" "teams_alert" { + filename = data.archive_file.lambda_payload.output_path + source_code_hash = data.archive_file.lambda_payload.output_base64sha256 + function_name = "${var.model_name}-teams-alert" + role = aws_iam_role.teams_lambda.arn + handler = "sns_to_microsoft_teams.lambda_handler" + runtime = "python3.12" + timeout = 30 + environment { + variables = { + TEAMS_WEBHOOK_URL = var.teams_webhook_url + } + } +} + + +resource "aws_lambda_permission" "allow_sns_scale_up_from_0_to_1" { + + statement_id = "AllowSNS-scale-up-from-0-to-1" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.teams_alert.function_name + principal = "sns.amazonaws.com" + source_arn = aws_sns_topic.scale_up_from_0_to_1.arn +} + + + +resource "aws_lambda_permission" "allow_sns_scale_down_from_n_to_nm1" { + + statement_id = "AllowSNS-scale-down-from-n-to-nm1" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.teams_alert.function_name + principal = "sns.amazonaws.com" + source_arn = aws_sns_topic.scale_down_from_n_to_nm1.arn + +} + + +resource "aws_iam_role" "teams_lambda" { + name = "${var.model_name}-teams-lambda-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "lambda.amazonaws.com" + }, + Action = "sts:AssumeRole" + } + ] + }) +} + + +resource "aws_iam_policy" "teams_lambda" { + name = "${var.model_name}-teams-lambda-policy" + + policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Action = [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents" + ], + Resource = "arn:aws:logs:*:*:*" + }, + { + Effect = "Allow", + Action = "sns:Publish", + Resource = "*" + } + ] + }) +} + + +resource "aws_iam_role_policy_attachment" "teams_lambda" { + role = aws_iam_role.teams_lambda.name + policy_arn = aws_iam_policy.teams_lambda.arn +} diff --git a/infra/modules/sagemaker_deployment/lambda_function/payload.zip b/infra/modules/sagemaker_deployment/lambda_function/payload.zip new file mode 100644 index 0000000..933d17c Binary files /dev/null and 
b/infra/modules/sagemaker_deployment/lambda_function/payload.zip differ diff --git a/infra/modules/sagemaker_deployment/lambda_function/sns_to_microsoft_teams.py b/infra/modules/sagemaker_deployment/lambda_function/sns_to_microsoft_teams.py new file mode 100644 index 0000000..c1aa4b9 --- /dev/null +++ b/infra/modules/sagemaker_deployment/lambda_function/sns_to_microsoft_teams.py @@ -0,0 +1,63 @@ +import json +import logging +import os +from datetime import datetime + +import urllib3 + +logger = logging.getLogger() +logger.setLevel("INFO") +http = urllib3.PoolManager() + + +def lambda_handler(event, context): + webhook_url = os.getenv("TEAMS_WEBHOOK_URL") + message_str = event["Records"][0]["Sns"]["Message"] + alarm_name = json.loads(message_str)["AlarmName"] + dimensions_list = json.loads(message_str)["Trigger"]["Metrics"][0]["MetricStat"][ + "Metric" + ]["Dimensions"] + endpoint_name = next( + x["value"] for x in dimensions_list if x["name"] == "EndpointName" + ) + new_state = json.loads(message_str)["NewStateValue"] + timestamp_str = json.loads(message_str)["StateChangeTime"] + timestamp_dt = datetime.strptime(timestamp_str, "%Y-%m-%dT%H:%M:%S.%f%z") + readable_date_str = str(timestamp_dt.date()) + readable_time_str = str(timestamp_dt.strftime("%H:%M:%S")) + region = "eu-west-2" # it was easier to hard-code this (apologies for inelegance) + + alarm_url = f"https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}#alarmsV2:alarm/{alarm_name}" + + if new_state == "ALARM": + colour = "FF0000" + elif new_state == "OK": + colour = "00FF00" + else: + colour = "0000FF" + + message_card = { + "@type": "MessageCard", + "@context": "http://schema.org/extensions", + "themeColor": colour, + "title": f"Transition to {new_state} on {endpoint_name} for alarm {alarm_name}", + "text": f"Triggered at {readable_time_str} on {readable_date_str}", + "potentialAction": [ + { + "@type": "OpenUri", + "name": "View Alarm", + "targets": [{"os": "default", "uri": alarm_url}], + } + ], + } + headers = {"Content-Type": "application/json"} + encoded_message_card = json.dumps(message_card) + response = http.request( + method="POST", + url=webhook_url, + body=encoded_message_card, + headers=headers, + ) + logger.info( + f"Completed with code {response.status} and full response {response.data}" + ) diff --git a/infra/modules/sagemaker_deployment/main.tf b/infra/modules/sagemaker_deployment/main.tf new file mode 100644 index 0000000..ce4d7d8 --- /dev/null +++ b/infra/modules/sagemaker_deployment/main.tf @@ -0,0 +1,166 @@ +resource "aws_sagemaker_model" "main" { + name = var.model_name + execution_role_arn = var.execution_role_arn + + primary_container { + image = var.container_image + environment = var.environment_variables + + model_data_source { + s3_data_source { + s3_uri = var.model_uri + s3_data_type = "S3Prefix" + compression_type = var.model_uri_compression + model_access_config { + accept_eula = true + } + } + } + } + + vpc_config { + security_group_ids = var.security_group_ids + subnets = var.subnets + } +} + + +resource "aws_sagemaker_endpoint_configuration" "main" { + name = "${aws_sagemaker_model.main.name}-endpoint-config" + + production_variants { + variant_name = "AllTraffic" + model_name = aws_sagemaker_model.main.name + instance_type = var.instance_type + initial_instance_count = 1 + } + + async_inference_config { + output_config { + s3_output_path = var.s3_output_path + notification_config { + include_inference_response_in = ["SUCCESS_NOTIFICATION_TOPIC"] + success_topic = 
var.sns_success_topic_arn + } + } + } +} + + +resource "aws_sagemaker_endpoint" "main" { + name = "${aws_sagemaker_model.main.name}-endpoint" + + endpoint_config_name = aws_sagemaker_endpoint_configuration.main.name + depends_on = [aws_sagemaker_endpoint_configuration.main, var.sns_success_topic_arn] +} + + +resource "aws_appautoscaling_target" "main" { + max_capacity = var.max_capacity + min_capacity = var.min_capacity + resource_id = "endpoint/${aws_sagemaker_endpoint.main.name}/variant/${aws_sagemaker_endpoint_configuration.main.production_variants[0].variant_name}" # Note this logic would not work if there were ever more than one production variant deployed for an LLM + scalable_dimension = "sagemaker:variant:DesiredInstanceCount" + service_namespace = "sagemaker" + depends_on = [aws_sagemaker_endpoint.main, aws_sagemaker_endpoint_configuration.main] +} + + +resource "aws_appautoscaling_policy" "scale_up_from_n_to_np1" { + name = "scale-up-to-n-policy-${var.model_name}" + + policy_type = "StepScaling" + resource_id = aws_appautoscaling_target.main.resource_id + scalable_dimension = aws_appautoscaling_target.main.scalable_dimension + service_namespace = aws_appautoscaling_target.main.service_namespace + depends_on = [aws_appautoscaling_target.main] + + step_scaling_policy_configuration { + adjustment_type = "ChangeInCapacity" + cooldown = var.scale_up_cooldown + + step_adjustment { + scaling_adjustment = 1 # means add 1 + metric_interval_lower_bound = 0 + metric_interval_upper_bound = null + } + } +} + + +resource "aws_appautoscaling_policy" "scale_down_from_n_to_nm1" { + name = "scale-down-to-n-policy-${var.model_name}" + + policy_type = "StepScaling" + resource_id = aws_appautoscaling_target.main.resource_id + scalable_dimension = aws_appautoscaling_target.main.scalable_dimension + service_namespace = aws_appautoscaling_target.main.service_namespace + depends_on = [aws_appautoscaling_target.main] + + step_scaling_policy_configuration { + adjustment_type = "ChangeInCapacity" + cooldown = var.scale_down_cooldown + + step_adjustment { + scaling_adjustment = -1 # mean subtract 1 + metric_interval_lower_bound = 0 + metric_interval_upper_bound = null + } + } +} + + +resource "aws_appautoscaling_policy" "scale_up_from_0_to_1" { + name = "scale-up-to-one-policy-${var.model_name}" + + policy_type = "StepScaling" + resource_id = aws_appautoscaling_target.main.resource_id + scalable_dimension = aws_appautoscaling_target.main.scalable_dimension + service_namespace = aws_appautoscaling_target.main.service_namespace + depends_on = [aws_appautoscaling_target.main] + + step_scaling_policy_configuration { + adjustment_type = "ExactCapacity" + cooldown = var.scale_up_cooldown + + step_adjustment { + scaling_adjustment = 1 # means set =1 (NOT add or subtract) + metric_interval_lower_bound = 0 + metric_interval_upper_bound = null + } + } +} + + +resource "aws_appautoscaling_policy" "scale_down_from_n_to_0" { + name = "scale-down-to-zero-policy-${var.model_name}" + + policy_type = "StepScaling" + resource_id = aws_appautoscaling_target.main.resource_id + scalable_dimension = aws_appautoscaling_target.main.scalable_dimension + service_namespace = aws_appautoscaling_target.main.service_namespace + depends_on = [aws_appautoscaling_target.main] + + step_scaling_policy_configuration { + adjustment_type = "ExactCapacity" + cooldown = var.scale_down_cooldown + + step_adjustment { + scaling_adjustment = 0 # means set =0 (NOT add or subtract) + metric_interval_lower_bound = 0 + metric_interval_upper_bound = null + 
} + } +} + + +resource "aws_cloudwatch_log_metric_filter" "unauthorized_operations" { + name = "unauthorized-operations-filter" + log_group_name = "/aws/sagemaker/Endpoints/${aws_sagemaker_endpoint.main.name}" + pattern = "{ $.errorCode = \"UnauthorizedOperation\" || $.errorCode = \"AccessDenied\" }" + + metric_transformation { + name = "UnauthorizedOperationsCount" + namespace = "CloudTrailMetrics" + value = "1" + } +} diff --git a/infra/modules/sagemaker_deployment/outputs.tf b/infra/modules/sagemaker_deployment/outputs.tf new file mode 100644 index 0000000..a96f9a7 --- /dev/null +++ b/infra/modules/sagemaker_deployment/outputs.tf @@ -0,0 +1,8 @@ +output "model_name" { + value = aws_sagemaker_model.main.name +} + + +output "endpoint_name" { + value = aws_sagemaker_endpoint.main.name +} diff --git a/infra/modules/sagemaker_deployment/sns.tf b/infra/modules/sagemaker_deployment/sns.tf new file mode 100644 index 0000000..39d8ca9 --- /dev/null +++ b/infra/modules/sagemaker_deployment/sns.tf @@ -0,0 +1,74 @@ +resource "aws_sns_topic" "scale_up_from_0_to_1" { + + name = "${aws_sagemaker_endpoint.main.name}-scale-up-from-0-to-1" + policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "cloudwatch.amazonaws.com" + }, + Action = "sns:Publish", + Resource = "*" + } + ] + }) +} + + +resource "aws_sns_topic" "scale_down_from_n_to_0" { + + name = "${aws_sagemaker_endpoint.main.name}-scale-down-from-n-to-0" + policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "cloudwatch.amazonaws.com" + }, + Action = "sns:Publish", + Resource = "*" + } + ] + }) +} + + +resource "aws_sns_topic" "scale_down_from_n_to_nm1" { + + name = "${aws_sagemaker_endpoint.main.name}-scale-down-from-n-to-nm1" + policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "cloudwatch.amazonaws.com" + }, + Action = "sns:Publish", + Resource = "*" + } + ] + }) +} + + +resource "aws_sns_topic" "scale_up_from_n_to_np1" { + + name = "${aws_sagemaker_endpoint.main.name}-scale-up-from-n-to-np1" + policy = jsonencode({ + Version = "2012-10-17", + Statement = [ + { + Effect = "Allow", + Principal = { + Service = "cloudwatch.amazonaws.com" + }, + Action = "sns:Publish", + Resource = "*" + } + ] + }) +} diff --git a/infra/modules/sagemaker_deployment/variables.tf b/infra/modules/sagemaker_deployment/variables.tf new file mode 100644 index 0000000..d40bd68 --- /dev/null +++ b/infra/modules/sagemaker_deployment/variables.tf @@ -0,0 +1,169 @@ +variable "sns_success_topic_arn" { + type = string + description = "ARN of the SNS topic for Sagemaker successful async outputs" +} + + +variable "model_name" { + type = string + description = "Name of the SageMaker model" +} + + +variable "s3_output_path" { + type = string + description = "Where the async output of the model is sent" +} + + +variable "execution_role_arn" { + type = string + description = "Execution role ARN for SageMaker" +} + + +variable "container_image" { + type = string + description = "Container image for the model" +} + + +variable "model_uri" { + type = string + description = "S3 URL where the model data is located" +} + + +variable "model_uri_compression" { + type = string + description = "Whether the model weights are stored compressed and if so what compression type" +} + + +variable "environment_variables" { + type = map(string) + description = "Environment variables for the container" +} + + 
+variable "security_group_ids" { + type = list(string) + description = "List of security group IDs for the SageMaker model" +} + + +variable "subnets" { + type = list(string) + description = "List of subnets for the SageMaker model" +} + + +variable "instance_type" { + type = string + description = "Instance type for the endpoint" +} + + +variable "max_capacity" { + type = number + description = "Maximum capacity for autoscaling" +} + + +variable "min_capacity" { + type = number + description = "Minimum capacity for autoscaling" +} + + +variable "scale_up_cooldown" { + type = number + description = "Cooldown period for scale up" +} + + +variable "scale_down_cooldown" { + type = number + description = "Cooldown period for scale down" +} + +variable "backlog_threshold_high" { + type = number + description = "Threshold for high backlog alarm" +} + + +variable "backlog_threshold_low" { + type = number + description = "Threshold for low backlog alarm" +} + + +variable "cpu_threshold_high" { + type = number + description = "Threshold for high CPU alarm (NOTE this varies based on number of vCPU)" +} + + +variable "cpu_threshold_low" { + type = number + description = "Threshold for low CPU alarm (NOTE this varies based on number of vCPU)" +} + + +variable "gpu_threshold_high" { + type = number + description = "Threshold for high GPU alarm (NOTE this varies based on number of GPU)" +} + + +variable "gpu_threshold_low" { + type = number + description = "Threshold for low GPU alarm (NOTE this varies based on number of GPU)" +} + + +variable "ram_threshold_high" { + type = number + description = "Threshold for high RAM alarm" +} + + +variable "ram_threshold_low" { + type = number + description = "Threshold for low RAM alarm" +} + + +variable "evaluation_periods_high" { + type = number + description = "Number of evaluation periods to consider for high alarm states" +} + + +variable "datapoints_to_alarm_high" { + type = number + description = "Number of datapoints within an evaluation period to require for low alarm states" +} + + +variable "evaluation_periods_low" { + type = number + description = "Number of evaluation periods to consider for low alarm states" +} + + +variable "datapoints_to_alarm_low" { + type = number + description = "Number of datapoints within an evaluation period to require for low alarm states" +} + + +variable "aws_account_id" { + type = string +} + + +variable "teams_webhook_url" { + type = string +} diff --git a/infra/modules/sagemaker_init/domain/main.tf b/infra/modules/sagemaker_init/domain/main.tf new file mode 100644 index 0000000..5e7fb0d --- /dev/null +++ b/infra/modules/sagemaker_init/domain/main.tf @@ -0,0 +1,11 @@ +resource "aws_sagemaker_domain" "sagemaker" { + domain_name = var.domain_name + auth_mode = "IAM" + vpc_id = var.vpc_id + subnet_ids = var.subnet_ids + app_network_access_type = "VpcOnly" + + default_user_settings { + execution_role = var.execution_role_arn + } +} \ No newline at end of file diff --git a/infra/modules/sagemaker_init/domain/outputs.tf b/infra/modules/sagemaker_init/domain/outputs.tf new file mode 100644 index 0000000..bcdb4ca --- /dev/null +++ b/infra/modules/sagemaker_init/domain/outputs.tf @@ -0,0 +1,4 @@ +output "sagemaker_domain_id" { + description = "The ID of the SageMaker Domain" + value = aws_sagemaker_domain.sagemaker.id +} \ No newline at end of file diff --git a/infra/modules/sagemaker_init/domain/variables.tf b/infra/modules/sagemaker_init/domain/variables.tf new file mode 100644 index 0000000..c3adbea --- /dev/null +++ 
b/infra/modules/sagemaker_init/domain/variables.tf @@ -0,0 +1,19 @@ +variable "domain_name" { + type = string + description = "The Domain name of the service, i.e. SageMaker" +} + +variable "vpc_id" { + type = string + description = "VPC ID" +} + +variable "subnet_ids" { + type = any + description = "subnet ids" +} + +variable "execution_role_arn" { + type = string + description = "The execution role" +} diff --git a/infra/modules/sagemaker_init/iam/main.tf b/infra/modules/sagemaker_init/iam/main.tf new file mode 100644 index 0000000..53eb8a8 --- /dev/null +++ b/infra/modules/sagemaker_init/iam/main.tf @@ -0,0 +1,172 @@ +# Use the data source to get the bucket ARN from the bucket name +data "aws_s3_bucket" "sagemaker_default_bucket" { + bucket = var.sagemaker_default_bucket_name +} + + +# Assume Role Policy for SageMaker Execution Role +data "aws_iam_policy_document" "sagemaker_assume_role" { + statement { + actions = ["sts:AssumeRole"] + + + principals { + type = "Service" + identifiers = ["sagemaker.amazonaws.com"] + } + } +} + + +# SageMaker Execution Role +resource "aws_iam_role" "sagemaker" { + name = "${var.prefix}-sagemaker" + path = "/" + assume_role_policy = data.aws_iam_policy_document.sagemaker_assume_role.json +} + + +# Assume Role Policy for SageMaker Inference Role +data "aws_iam_policy_document" "assume_inference_role" { + statement { + actions = ["sts:AssumeRole"] + + + principals { + type = "Service" + identifiers = ["sagemaker.amazonaws.com"] + } + } +} + + +# SageMaker Inference Role +resource "aws_iam_role" "inference_role" { + name = "${var.prefix}-sagemaker-inference-role" + assume_role_policy = data.aws_iam_policy_document.assume_inference_role.json +} + + +# Policy Document for SageMaker Permissions +data "aws_iam_policy_document" "sagemaker_inference_policy_document" { + statement { + actions = [ + "s3:ListBucket", + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:GetBucketLocation", + ] + resources = [ + "arn:aws:s3:::*sagemaker*", + "${var.aws_s3_bucket_notebook.arn}/*", + "arn:aws:s3:::jumpstart-cache-prod-eu-west-2/*", + "arn:aws:s3:::jumpstart-private-cache-prod-eu-west-2/*", + "arn:aws:s3:::jumpstart-cache-prod-eu-west-2", + "arn:aws:s3:::jumpstart-private-cache-prod-eu-west-2", + ] + } + + statement { + actions = [ + "sns:Publish", + ] + resources = ["arn:aws:sns:eu-west-2:${var.account_id}:async-sagemaker-success-topic"] + } + + statement { + actions = [ + "ecr:BatchGetImage", + "ecr:DescribeImages", + "ecr:GetDownloadUrlForLayer", + "ecr:BatchCheckLayerAvailability", + "ecr:GetAuthorizationToken" + ] + resources = ["*"] + } + + + statement { + actions = [ + "cloudwatch:DeleteAlarms", + "cloudwatch:DescribeAlarms", + "cloudwatch:GetMetricData", + "cloudwatch:GetMetricStatistics", + "cloudwatch:ListMetrics", + "cloudwatch:PutMetricAlarm", + "cloudwatch:PutMetricData"] + resources = ["*"] + } + statement { + actions = [ + "application-autoscaling:DeleteScalingPolicy", + "application-autoscaling:DeleteScheduledAction", + "application-autoscaling:DeregisterScalableTarget", + "application-autoscaling:DescribeScalableTargets", + "application-autoscaling:DescribeScalingActivities", + "application-autoscaling:DescribeScalingPolicies", + "application-autoscaling:DescribeScheduledActions", + "application-autoscaling:PutScalingPolicy", + "application-autoscaling:PutScheduledAction", + "application-autoscaling:RegisterScalableTarget", + ] + resources = ["*", ] + } + + statement { + actions = [ + "ec2:CreateNetworkInterface", + 
"ec2:CreateNetworkInterfacePermission", + "ec2:CreateVpcEndpoint", + "ec2:DeleteNetworkInterface", + "ec2:DeleteNetworkInterfacePermission", + "ec2:DescribeDhcpOptions", + "ec2:DescribeNetworkInterfaces", + "ec2:DescribeRouteTables", + "ec2:DescribeSecurityGroups", + "ec2:DescribeSubnets", + "ec2:DescribeVpcEndpoints", + "ec2:DescribeVpcs", + ] + resources = ["*", ] + } + + statement { + actions = [ + "logs:CreateLogDelivery", + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:DeleteLogDelivery", + "logs:Describe*", + "logs:GetLogDelivery", + "logs:GetLogEvents", + "logs:ListLogDeliveries", + "logs:PutLogEvents", + "logs:PutResourcePolicy", + "logs:UpdateLogDelivery", + ] + resources = ["*", ] + } +} + + +# Create IAM Policy for SageMaker Permissions +resource "aws_iam_policy" "sagemaker_access_policy" { + name = "${var.prefix}-sagemaker-domain" + policy = data.aws_iam_policy_document.sagemaker_inference_policy_document.json +} + + +# Attach Policy to SageMaker Role +resource "aws_iam_role_policy_attachment" "sagemaker_managed_policy" { + role = aws_iam_role.sagemaker.name + policy_arn = aws_iam_policy.sagemaker_access_policy.arn +} + + +# Attach Policy to Inference Role +resource "aws_iam_role_policy_attachment" "sagemaker_inference_role_policy" { + role = aws_iam_role.inference_role.name + policy_arn = aws_iam_policy.sagemaker_access_policy.arn +} + diff --git a/infra/modules/sagemaker_init/iam/outputs.tf b/infra/modules/sagemaker_init/iam/outputs.tf new file mode 100644 index 0000000..2ef0cc2 --- /dev/null +++ b/infra/modules/sagemaker_init/iam/outputs.tf @@ -0,0 +1,14 @@ +output "execution_role" { + description = "ARN of the sagemaker execution role" + value = aws_iam_role.sagemaker.arn +} + +output "inference_role" { + description = "ARN of the sagemaker inference role" + value = aws_iam_role.inference_role.arn +} + +output "default_sagemaker_bucket" { + description = "Default sagemaker bucket data object" + value = data.aws_s3_bucket.sagemaker_default_bucket +} diff --git a/infra/modules/sagemaker_init/iam/variables.tf b/infra/modules/sagemaker_init/iam/variables.tf new file mode 100644 index 0000000..e9fe801 --- /dev/null +++ b/infra/modules/sagemaker_init/iam/variables.tf @@ -0,0 +1,19 @@ +variable "prefix" { + type = string + description = "Prefix for naming IAM resources" +} + +variable "sagemaker_default_bucket_name" { + type = string + description = "name of the default S3 bucket used by sagemaker" +} + +variable "aws_s3_bucket_notebook" { + type = any + description = "S3 bucket for notebooks" +} + +variable "account_id" { + type = string + description = "account ID for the AWS account, dyanmic" +} diff --git a/infra/modules/sagemaker_output_mover/lambda_function/payload.zip b/infra/modules/sagemaker_output_mover/lambda_function/payload.zip new file mode 100644 index 0000000..a0cc6e0 Binary files /dev/null and b/infra/modules/sagemaker_output_mover/lambda_function/payload.zip differ diff --git a/infra/modules/sagemaker_output_mover/lambda_function/s3_move_output.py b/infra/modules/sagemaker_output_mover/lambda_function/s3_move_output.py new file mode 100644 index 0000000..fe830b7 --- /dev/null +++ b/infra/modules/sagemaker_output_mover/lambda_function/s3_move_output.py @@ -0,0 +1,46 @@ +import ast +import logging + +import boto3 + +logger = logging.getLogger() +logger.setLevel("INFO") + + +def lambda_handler(event, context): + for record in event["Records"]: + process_message(record) + + +def process_message(record): + try: + message_str = record["Sns"]["Message"] + s3 
= boto3.resource("s3") + message_dict = ast.literal_eval(message_str) + + endpoint_name = message_dict["requestParameters"]["endpointName"] + input_file_uri = message_dict["requestParameters"]["inputLocation"] + input_file_bucket = input_file_uri.split("/user/federated/")[0].split("s3://")[ + 1 + ] + federated_user_id = input_file_uri.split("/user/federated/")[1].split("/")[0] + + output_file_uri = message_dict["responseParameters"]["outputLocation"] + output_file_bucket = ( + output_file_uri.split("https://")[1] + .split("/")[0] + .split(".s3.eu-west-2.amazonaws.com")[0] + ) + output_file_key = output_file_uri.split("https://")[1].split("/")[1] + + copy_source = {"Bucket": output_file_bucket, "Key": output_file_key} + s3_filepath_output = ( + f"user/federated/{federated_user_id}/sagemaker/outputs/{output_file_key}" + ) + s3.meta.client.copy(copy_source, input_file_bucket, s3_filepath_output) + logger.info( + f"Output frm {endpoint_name} with id:{federated_user_id} mvd to usr's files" + ) + except Exception as e: + logger.error(e) + raise e diff --git a/infra/modules/sagemaker_output_mover/main.tf b/infra/modules/sagemaker_output_mover/main.tf new file mode 100644 index 0000000..d7b3191 --- /dev/null +++ b/infra/modules/sagemaker_output_mover/main.tf @@ -0,0 +1,104 @@ + +resource "aws_iam_role" "iam_for_lambda_s3_move" { + name = "iam_for_lambda_s3_move" + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "lambda.amazonaws.com" + } + }] }) +} + +resource "aws_iam_role_policy" "policy_for_lambda_s3_move" { + name = "policy_for_lambda_s3_move" + role = aws_iam_role.iam_for_lambda_s3_move.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Action = ["SNS:Receive", "SNS:Subscribe"] + Effect = "Allow" + Resource = aws_sns_topic.async-sagemaker-success-topic.arn + }, + { + Action = ["s3:GetObject"] + Effect = "Allow" + Resource = "arn:aws:s3:::*sagemaker*" + }, + { + Action = ["s3:PutObject"] + Effect = "Allow" + Resource = "${var.s3_bucket_notebooks_arn}*" + }, + { + Action = ["logs:CreateLogGroup", "logs:CreateLogStream", "logs:PutLogEvents", "logs:DescribeLogStreams"] + Effect = "Allow" + Resource = "arn:aws:logs:*:*:*" + } + ] + }) +} + +data "archive_file" "lambda_payload" { + type = "zip" + source_file = "${path.module}/lambda_function/s3_move_output.py" + output_path = "${path.module}/lambda_function/payload.zip" +} + +resource "aws_lambda_function" "lambda_s3_move_output" { + filename = data.archive_file.lambda_payload.output_path + source_code_hash = data.archive_file.lambda_payload.output_base64sha256 + function_name = "lambda_s3_move_output" + role = aws_iam_role.iam_for_lambda_s3_move.arn + handler = "s3_move_output.lambda_handler" + runtime = "python3.12" + timeout = 30 +} + + +resource "aws_sns_topic" "async-sagemaker-success-topic" { + name = "async-sagemaker-success-topic" + policy = data.aws_iam_policy_document.sns_publish_and_read_policy.json +} + +resource "aws_sns_topic_subscription" "topic_lambda" { + topic_arn = aws_sns_topic.async-sagemaker-success-topic.arn + protocol = "lambda" + endpoint = aws_lambda_function.lambda_s3_move_output.arn +} + +resource "aws_lambda_permission" "with_sns" { + statement_id = "AllowExecutionFromSNS" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.lambda_s3_move_output.function_name + principal = "sns.amazonaws.com" + source_arn = aws_sns_topic.async-sagemaker-success-topic.arn +} + +data 
"aws_iam_policy_document" "sns_publish_and_read_policy" { + statement { + sid = "sns_publish_and_read_policy_1" + actions = ["SNS:Publish"] + effect = "Allow" + principals { + type = "Service" + identifiers = ["sagemaker.amazonaws.com"] + } + resources = ["arn:aws:sns:${var.aws_region}:${var.account_id}:async-sagemaker-success-topic"] + } + statement { + sid = "sns_publish_and_read_policy_2" + actions = ["SNS:Receive", "SNS:Subscribe"] + effect = "Allow" + principals { + type = "Service" + identifiers = ["lambda.amazonaws.com"] + } + resources = ["arn:aws:sns:${var.aws_region}:${var.account_id}:async-sagemaker-success-topic"] + } +} diff --git a/infra/modules/sagemaker_output_mover/outputs.tf b/infra/modules/sagemaker_output_mover/outputs.tf new file mode 100644 index 0000000..5bd1164 --- /dev/null +++ b/infra/modules/sagemaker_output_mover/outputs.tf @@ -0,0 +1,3 @@ +output "sns_success_topic_arn" { + value = aws_sns_topic.async-sagemaker-success-topic.arn +} diff --git a/infra/modules/sagemaker_output_mover/variables.tf b/infra/modules/sagemaker_output_mover/variables.tf new file mode 100644 index 0000000..248f5fa --- /dev/null +++ b/infra/modules/sagemaker_output_mover/variables.tf @@ -0,0 +1,14 @@ +variable "account_id" { + type = string + description = "AWS Account ID" +} + +variable "aws_region" { + type = string + description = "AWS Region in format e.g. us-west-1" +} + +variable "s3_bucket_notebooks_arn" { + type = string + description = "S3 Bucket for notebook user data storage" +} diff --git a/infra/modules/uv.lock b/infra/modules/uv.lock new file mode 100644 index 0000000..39a37a4 --- /dev/null +++ b/infra/modules/uv.lock @@ -0,0 +1,382 @@ +version = 1 +requires-python = ">=3.12" + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 }, +] + +[[package]] +name = "boto3" +version = "1.36.13" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, + { name = "jmespath" }, + { name = "s3transfer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/de/3c35089f97f6068beb852b51b9eede70e8f7e39a6c8ddff68f3bcabafe3e/boto3-1.36.13.tar.gz", hash = "sha256:c8031aa1c4a7c331081b2d86c49a362654b86e0b89d0a41fa166a68b226f4aba", size = 111027 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/d9/d0e741995fedf458e99f71856ae725c201f4cbd69ba6c92fd7498fe71a16/boto3-1.36.13-py3-none-any.whl", hash = "sha256:20d97739cea1b0f549e9096c453ac727a350da28bd0451098714260b655a85ea", size = 139176 }, +] + +[[package]] +name = "botocore" +version = "1.36.13" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jmespath" }, + { name = "python-dateutil" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7e/0b/87dcaaa03a7b5bf3e06abfeccb2af328a436a97fd7b6015f174f1350a284/botocore-1.36.13.tar.gz", hash = "sha256:50a3ff292f8dfdde21074b5c916afe847b01e074ab16d9c9fe71b34960c77134", size = 13500731 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/24/4d/dc5d65588601cec5f243a73c8c16bc22e485d2845eccb2f924ba883df4e4/botocore-1.36.13-py3-none-any.whl", hash = "sha256:d644a814440bf8d55f4e29b1c0e6f021e2573b7784e0c91f55f4d9d689e08005", size = 13331110 }, +] + +[[package]] +name = "bracex" +version = "2.5.post1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/6c/57418c4404cd22fe6275b8301ca2b46a8cdaa8157938017a9ae0b3edf363/bracex-2.5.post1.tar.gz", hash = "sha256:12c50952415bfa773d2d9ccb8e79651b8cdb1f31a42f6091b804f6ba2b4a66b6", size = 26641 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/02/8db98cdc1a58e0abd6716d5e63244658e6e63513c65f469f34b6f1053fd0/bracex-2.5.post1-py3-none-any.whl", hash = "sha256:13e5732fec27828d6af308628285ad358047cec36801598368cb28bc631dbaf6", size = 11558 }, +] + +[[package]] +name = "bump-my-version" +version = "0.31.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "pydantic" }, + { name = "pydantic-settings" }, + { name = "questionary" }, + { name = "rich" }, + { name = "rich-click" }, + { name = "tomlkit" }, + { name = "wcmatch" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ab/1c/11efd01de6eaa730519af987362b5cf7783a74657798e591fc3a98c91c4a/bump_my_version-0.31.1.tar.gz", hash = "sha256:83962dbd593b3edb426661a4c2276a0842a7eaa5dee896543b771c358ac78915", size = 1021738 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f0/7d/af9ad0729d0e64ca79bf76c7b564c890d1471d3d140777b762b66484ac24/bump_my_version-0.31.1-py3-none-any.whl", hash = "sha256:3b9f496eb5554208d91f84fcb781628bdd4549e055fd3282804959453ebd1857", size = 55594 }, +] + +[[package]] +name = "click" +version = "8.1.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, +] + +[[package]] +name = "jmespath" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/00/2a/e867e8531cf3e36b41201936b7fa7ba7b5702dbef42922193f05c8976cd6/jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe", size = 25843 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256 }, +] + +[[package]] +name = "lambda-functions" +version = "0.1.0" +source = { virtual = "." } +dependencies = [ + { name = "boto3" }, +] + +[package.dev-dependencies] +dev = [ + { name = "bump-my-version" }, + { name = "ruff" }, +] + +[package.metadata] +requires-dist = [{ name = "boto3" }] + +[package.metadata.requires-dev] +dev = [ + { name = "bump-my-version" }, + { name = "ruff" }, +] + +[[package]] +name = "markdown-it-py" +version = "3.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/38/71/3b932df36c1a044d397a1f92d1cf91ee0a503d91e470cbd670aa66b07ed0/markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb", size = 74596 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528 }, +] + +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 }, +] + +[[package]] +name = "prompt-toolkit" +version = "3.0.50" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/e1/bd15cb8ffdcfeeb2bdc215de3c3cffca11408d829e4b8416dcfe71ba8854/prompt_toolkit-3.0.50.tar.gz", hash = "sha256:544748f3860a2623ca5cd6d2795e7a14f3d0e1c3c9728359013f79877fc89bab", size = 429087 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e4/ea/d836f008d33151c7a1f62caf3d8dd782e4d15f6a43897f64480c2b8de2ad/prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198", size = 387816 }, +] + +[[package]] +name = "pydantic" +version = "2.10.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/ae/d5220c5c52b158b1de7ca89fc5edb72f304a70a4c540c84c8844bf4008de/pydantic-2.10.6.tar.gz", hash = "sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236", size = 761681 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/3c/8cc1cc84deffa6e25d2d0c688ebb80635dfdbf1dbea3e30c541c8cf4d860/pydantic-2.10.6-py3-none-any.whl", hash = "sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584", size = 431696 }, +] + +[[package]] +name = "pydantic-core" +version = "2.27.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/fc/01/f3e5ac5e7c25833db5eb555f7b7ab24cd6f8c322d3a3ad2d67a952dc0abc/pydantic_core-2.27.2.tar.gz", hash = "sha256:eb026e5a4c1fee05726072337ff51d1efb6f59090b7da90d30ea58625b1ffb39", size = 413443 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d6/74/51c8a5482ca447871c93e142d9d4a92ead74de6c8dc5e66733e22c9bba89/pydantic_core-2.27.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9e0c8cfefa0ef83b4da9588448b6d8d2a2bf1a53c3f1ae5fca39eb3061e2f0b0", size = 1893127 }, + { url = "https://files.pythonhosted.org/packages/d3/f3/c97e80721735868313c58b89d2de85fa80fe8dfeeed84dc51598b92a135e/pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83097677b8e3bd7eaa6775720ec8e0405f1575015a463285a92bfdfe254529ef", size = 1811340 }, + { url = "https://files.pythonhosted.org/packages/9e/91/840ec1375e686dbae1bd80a9e46c26a1e0083e1186abc610efa3d9a36180/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:172fce187655fece0c90d90a678424b013f8fbb0ca8b036ac266749c09438cb7", size = 1822900 }, + { url = "https://files.pythonhosted.org/packages/f6/31/4240bc96025035500c18adc149aa6ffdf1a0062a4b525c932065ceb4d868/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:519f29f5213271eeeeb3093f662ba2fd512b91c5f188f3bb7b27bc5973816934", size = 1869177 }, + { url = "https://files.pythonhosted.org/packages/fa/20/02fbaadb7808be578317015c462655c317a77a7c8f0ef274bc016a784c54/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05e3a55d124407fffba0dd6b0c0cd056d10e983ceb4e5dbd10dda135c31071d6", size = 2038046 }, + { url = "https://files.pythonhosted.org/packages/06/86/7f306b904e6c9eccf0668248b3f272090e49c275bc488a7b88b0823444a4/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c3ed807c7b91de05e63930188f19e921d1fe90de6b4f5cd43ee7fcc3525cb8c", size = 2685386 }, + { url = "https://files.pythonhosted.org/packages/8d/f0/49129b27c43396581a635d8710dae54a791b17dfc50c70164866bbf865e3/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fb4aadc0b9a0c063206846d603b92030eb6f03069151a625667f982887153e2", size = 1997060 }, + { url = "https://files.pythonhosted.org/packages/0d/0f/943b4af7cd416c477fd40b187036c4f89b416a33d3cc0ab7b82708a667aa/pydantic_core-2.27.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28ccb213807e037460326424ceb8b5245acb88f32f3d2777427476e1b32c48c4", size = 2004870 }, + { url = "https://files.pythonhosted.org/packages/35/40/aea70b5b1a63911c53a4c8117c0a828d6790483f858041f47bab0b779f44/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:de3cd1899e2c279b140adde9357c4495ed9d47131b4a4eaff9052f23398076b3", size = 1999822 }, + { url = "https://files.pythonhosted.org/packages/f2/b3/807b94fd337d58effc5498fd1a7a4d9d59af4133e83e32ae39a96fddec9d/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:220f892729375e2d736b97d0e51466252ad84c51857d4d15f5e9692f9ef12be4", size = 2130364 }, + { url = "https://files.pythonhosted.org/packages/fc/df/791c827cd4ee6efd59248dca9369fb35e80a9484462c33c6649a8d02b565/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a0fcd29cd6b4e74fe8ddd2c90330fd8edf2e30cb52acda47f06dd615ae72da57", size = 2158303 }, + { url = 
"https://files.pythonhosted.org/packages/9b/67/4e197c300976af185b7cef4c02203e175fb127e414125916bf1128b639a9/pydantic_core-2.27.2-cp312-cp312-win32.whl", hash = "sha256:1e2cb691ed9834cd6a8be61228471d0a503731abfb42f82458ff27be7b2186fc", size = 1834064 }, + { url = "https://files.pythonhosted.org/packages/1f/ea/cd7209a889163b8dcca139fe32b9687dd05249161a3edda62860430457a5/pydantic_core-2.27.2-cp312-cp312-win_amd64.whl", hash = "sha256:cc3f1a99a4f4f9dd1de4fe0312c114e740b5ddead65bb4102884b384c15d8bc9", size = 1989046 }, + { url = "https://files.pythonhosted.org/packages/bc/49/c54baab2f4658c26ac633d798dab66b4c3a9bbf47cff5284e9c182f4137a/pydantic_core-2.27.2-cp312-cp312-win_arm64.whl", hash = "sha256:3911ac9284cd8a1792d3cb26a2da18f3ca26c6908cc434a18f730dc0db7bfa3b", size = 1885092 }, + { url = "https://files.pythonhosted.org/packages/41/b1/9bc383f48f8002f99104e3acff6cba1231b29ef76cfa45d1506a5cad1f84/pydantic_core-2.27.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7d14bd329640e63852364c306f4d23eb744e0f8193148d4044dd3dacdaacbd8b", size = 1892709 }, + { url = "https://files.pythonhosted.org/packages/10/6c/e62b8657b834f3eb2961b49ec8e301eb99946245e70bf42c8817350cbefc/pydantic_core-2.27.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:82f91663004eb8ed30ff478d77c4d1179b3563df6cdb15c0817cd1cdaf34d154", size = 1811273 }, + { url = "https://files.pythonhosted.org/packages/ba/15/52cfe49c8c986e081b863b102d6b859d9defc63446b642ccbbb3742bf371/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71b24c7d61131bb83df10cc7e687433609963a944ccf45190cfc21e0887b08c9", size = 1823027 }, + { url = "https://files.pythonhosted.org/packages/b1/1c/b6f402cfc18ec0024120602bdbcebc7bdd5b856528c013bd4d13865ca473/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa8e459d4954f608fa26116118bb67f56b93b209c39b008277ace29937453dc9", size = 1868888 }, + { url = "https://files.pythonhosted.org/packages/bd/7b/8cb75b66ac37bc2975a3b7de99f3c6f355fcc4d89820b61dffa8f1e81677/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce8918cbebc8da707ba805b7fd0b382816858728ae7fe19a942080c24e5b7cd1", size = 2037738 }, + { url = "https://files.pythonhosted.org/packages/c8/f1/786d8fe78970a06f61df22cba58e365ce304bf9b9f46cc71c8c424e0c334/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eda3f5c2a021bbc5d976107bb302e0131351c2ba54343f8a496dc8783d3d3a6a", size = 2685138 }, + { url = "https://files.pythonhosted.org/packages/a6/74/d12b2cd841d8724dc8ffb13fc5cef86566a53ed358103150209ecd5d1999/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8086fa684c4775c27f03f062cbb9eaa6e17f064307e86b21b9e0abc9c0f02e", size = 1997025 }, + { url = "https://files.pythonhosted.org/packages/a0/6e/940bcd631bc4d9a06c9539b51f070b66e8f370ed0933f392db6ff350d873/pydantic_core-2.27.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8d9b3388db186ba0c099a6d20f0604a44eabdeef1777ddd94786cdae158729e4", size = 2004633 }, + { url = "https://files.pythonhosted.org/packages/50/cc/a46b34f1708d82498c227d5d80ce615b2dd502ddcfd8376fc14a36655af1/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7a66efda2387de898c8f38c0cf7f14fca0b51a8ef0b24bfea5849f1b3c95af27", size = 1999404 }, + { url = 
"https://files.pythonhosted.org/packages/ca/2d/c365cfa930ed23bc58c41463bae347d1005537dc8db79e998af8ba28d35e/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:18a101c168e4e092ab40dbc2503bdc0f62010e95d292b27827871dc85450d7ee", size = 2130130 }, + { url = "https://files.pythonhosted.org/packages/f4/d7/eb64d015c350b7cdb371145b54d96c919d4db516817f31cd1c650cae3b21/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ba5dd002f88b78a4215ed2f8ddbdf85e8513382820ba15ad5ad8955ce0ca19a1", size = 2157946 }, + { url = "https://files.pythonhosted.org/packages/a4/99/bddde3ddde76c03b65dfd5a66ab436c4e58ffc42927d4ff1198ffbf96f5f/pydantic_core-2.27.2-cp313-cp313-win32.whl", hash = "sha256:1ebaf1d0481914d004a573394f4be3a7616334be70261007e47c2a6fe7e50130", size = 1834387 }, + { url = "https://files.pythonhosted.org/packages/71/47/82b5e846e01b26ac6f1893d3c5f9f3a2eb6ba79be26eef0b759b4fe72946/pydantic_core-2.27.2-cp313-cp313-win_amd64.whl", hash = "sha256:953101387ecf2f5652883208769a79e48db18c6df442568a0b5ccd8c2723abee", size = 1990453 }, + { url = "https://files.pythonhosted.org/packages/51/b2/b2b50d5ecf21acf870190ae5d093602d95f66c9c31f9d5de6062eb329ad1/pydantic_core-2.27.2-cp313-cp313-win_arm64.whl", hash = "sha256:ac4dbfd1691affb8f48c2c13241a2e3b60ff23247cbcf981759c768b6633cf8b", size = 1885186 }, +] + +[[package]] +name = "pydantic-settings" +version = "2.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "python-dotenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/73/7b/c58a586cd7d9ac66d2ee4ba60ca2d241fa837c02bca9bea80a9a8c3d22a9/pydantic_settings-2.7.1.tar.gz", hash = "sha256:10c9caad35e64bfb3c2fbf70a078c0e25cc92499782e5200747f942a065dec93", size = 79920 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b4/46/93416fdae86d40879714f72956ac14df9c7b76f7d41a4d68aa9f71a0028b/pydantic_settings-2.7.1-py3-none-any.whl", hash = "sha256:590be9e6e24d06db33a4262829edef682500ef008565a969c73d39d5f8bfb3fd", size = 29718 }, +] + +[[package]] +name = "pygments" +version = "2.19.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7c/2d/c3338d48ea6cc0feb8446d8e6937e1408088a72a39937982cc6111d17f84/pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f", size = 4968581 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892 }, +] + +[[package]] +name = "python-dotenv" +version = "1.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/bc/57/e84d88dfe0aec03b7a2d4327012c1627ab5f03652216c63d49846d7a6c58/python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca", size = 39115 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863 }, +] + +[[package]] +name = "questionary" +version = "2.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "prompt-toolkit" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a8/b8/d16eb579277f3de9e56e5ad25280fab52fc5774117fb70362e8c2e016559/questionary-2.1.0.tar.gz", hash = "sha256:6302cdd645b19667d8f6e6634774e9538bfcd1aad9be287e743d96cacaf95587", size = 26775 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ad/3f/11dd4cd4f39e05128bfd20138faea57bec56f9ffba6185d276e3107ba5b2/questionary-2.1.0-py3-none-any.whl", hash = "sha256:44174d237b68bc828e4878c763a9ad6790ee61990e0ae72927694ead57bab8ec", size = 36747 }, +] + +[[package]] +name = "rich" +version = "13.9.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ab/3a/0316b28d0761c6734d6bc14e770d85506c986c85ffb239e688eeaab2c2bc/rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098", size = 223149 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/19/71/39c7c0d87f8d4e6c020a393182060eaefeeae6c01dab6a84ec346f2567df/rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90", size = 242424 }, +] + +[[package]] +name = "rich-click" +version = "1.8.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "rich" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9a/31/103501e85e885e3e202c087fa612cfe450693210372766552ce1ab5b57b9/rich_click-1.8.5.tar.gz", hash = "sha256:a3eebe81da1c9da3c32f3810017c79bd687ff1b3fa35bfc9d8a3338797f1d1a1", size = 38229 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/aa/0b/e2de98c538c0ee9336211d260f88b7e69affab44969750aaca0b48a697c8/rich_click-1.8.5-py3-none-any.whl", hash = "sha256:0fab7bb5b66c15da17c210b4104277cd45f3653a7322e0098820a169880baee0", size = 35081 }, +] + +[[package]] +name = "ruff" +version = "0.9.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c0/17/529e78f49fc6f8076f50d985edd9a2cf011d1dbadb1cdeacc1d12afc1d26/ruff-0.9.4.tar.gz", hash = "sha256:6907ee3529244bb0ed066683e075f09285b38dd5b4039370df6ff06041ca19e7", size = 3599458 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/f8/3fafb7804d82e0699a122101b5bee5f0d6e17c3a806dcbc527bb7d3f5b7a/ruff-0.9.4-py3-none-linux_armv6l.whl", hash = "sha256:64e73d25b954f71ff100bb70f39f1ee09e880728efb4250c632ceed4e4cdf706", size = 11668400 }, + { url = "https://files.pythonhosted.org/packages/2e/a6/2efa772d335da48a70ab2c6bb41a096c8517ca43c086ea672d51079e3d1f/ruff-0.9.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6ce6743ed64d9afab4fafeaea70d3631b4d4b28b592db21a5c2d1f0ef52934bf", size = 11628395 }, + { url = 
"https://files.pythonhosted.org/packages/dc/d7/cd822437561082f1c9d7225cc0d0fbb4bad117ad7ac3c41cd5d7f0fa948c/ruff-0.9.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:54499fb08408e32b57360f6f9de7157a5fec24ad79cb3f42ef2c3f3f728dfe2b", size = 11090052 }, + { url = "https://files.pythonhosted.org/packages/9e/67/3660d58e893d470abb9a13f679223368ff1684a4ef40f254a0157f51b448/ruff-0.9.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37c892540108314a6f01f105040b5106aeb829fa5fb0561d2dcaf71485021137", size = 11882221 }, + { url = "https://files.pythonhosted.org/packages/79/d1/757559995c8ba5f14dfec4459ef2dd3fcea82ac43bc4e7c7bf47484180c0/ruff-0.9.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:de9edf2ce4b9ddf43fd93e20ef635a900e25f622f87ed6e3047a664d0e8f810e", size = 11424862 }, + { url = "https://files.pythonhosted.org/packages/c0/96/7915a7c6877bb734caa6a2af424045baf6419f685632469643dbd8eb2958/ruff-0.9.4-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:87c90c32357c74f11deb7fbb065126d91771b207bf9bfaaee01277ca59b574ec", size = 12626735 }, + { url = "https://files.pythonhosted.org/packages/0e/cc/dadb9b35473d7cb17c7ffe4737b4377aeec519a446ee8514123ff4a26091/ruff-0.9.4-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:56acd6c694da3695a7461cc55775f3a409c3815ac467279dfa126061d84b314b", size = 13255976 }, + { url = "https://files.pythonhosted.org/packages/5f/c3/ad2dd59d3cabbc12df308cced780f9c14367f0321e7800ca0fe52849da4c/ruff-0.9.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e0c93e7d47ed951b9394cf352d6695b31498e68fd5782d6cbc282425655f687a", size = 12752262 }, + { url = "https://files.pythonhosted.org/packages/c7/17/5f1971e54bd71604da6788efd84d66d789362b1105e17e5ccc53bba0289b/ruff-0.9.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1d4c8772670aecf037d1bf7a07c39106574d143b26cfe5ed1787d2f31e800214", size = 14401648 }, + { url = "https://files.pythonhosted.org/packages/30/24/6200b13ea611b83260501b6955b764bb320e23b2b75884c60ee7d3f0b68e/ruff-0.9.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfc5f1d7afeda8d5d37660eeca6d389b142d7f2b5a1ab659d9214ebd0e025231", size = 12414702 }, + { url = "https://files.pythonhosted.org/packages/34/cb/f5d50d0c4ecdcc7670e348bd0b11878154bc4617f3fdd1e8ad5297c0d0ba/ruff-0.9.4-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:faa935fc00ae854d8b638c16a5f1ce881bc3f67446957dd6f2af440a5fc8526b", size = 11859608 }, + { url = "https://files.pythonhosted.org/packages/d6/f4/9c8499ae8426da48363bbb78d081b817b0f64a9305f9b7f87eab2a8fb2c1/ruff-0.9.4-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:a6c634fc6f5a0ceae1ab3e13c58183978185d131a29c425e4eaa9f40afe1e6d6", size = 11485702 }, + { url = "https://files.pythonhosted.org/packages/18/59/30490e483e804ccaa8147dd78c52e44ff96e1c30b5a95d69a63163cdb15b/ruff-0.9.4-py3-none-musllinux_1_2_i686.whl", hash = "sha256:433dedf6ddfdec7f1ac7575ec1eb9844fa60c4c8c2f8887a070672b8d353d34c", size = 12067782 }, + { url = "https://files.pythonhosted.org/packages/3d/8c/893fa9551760b2f8eb2a351b603e96f15af167ceaf27e27ad873570bc04c/ruff-0.9.4-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:d612dbd0f3a919a8cc1d12037168bfa536862066808960e0cc901404b77968f0", size = 12483087 }, + { url = "https://files.pythonhosted.org/packages/23/15/f6751c07c21ca10e3f4a51ea495ca975ad936d780c347d9808bcedbd7182/ruff-0.9.4-py3-none-win32.whl", hash = 
"sha256:db1192ddda2200671f9ef61d9597fcef89d934f5d1705e571a93a67fb13a4402", size = 9852302 }, + { url = "https://files.pythonhosted.org/packages/12/41/2d2d2c6a72e62566f730e49254f602dfed23019c33b5b21ea8f8917315a1/ruff-0.9.4-py3-none-win_amd64.whl", hash = "sha256:05bebf4cdbe3ef75430d26c375773978950bbf4ee3c95ccb5448940dc092408e", size = 10850051 }, + { url = "https://files.pythonhosted.org/packages/c6/e6/3d6ec3bc3d254e7f005c543a661a41c3e788976d0e52a1ada195bd664344/ruff-0.9.4-py3-none-win_arm64.whl", hash = "sha256:585792f1e81509e38ac5123492f8875fbc36f3ede8185af0a26df348e5154f41", size = 10078251 }, +] + +[[package]] +name = "s3transfer" +version = "0.11.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/62/45/2323b5928f86fd29f9afdcef4659f68fa73eaa5356912b774227f5cf46b5/s3transfer-0.11.2.tar.gz", hash = "sha256:3b39185cb72f5acc77db1a58b6e25b977f28d20496b6e58d6813d75f464d632f", size = 147885 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/ac/e7dc469e49048dc57f62e0c555d2ee3117fa30813d2a1a2962cce3a2a82a/s3transfer-0.11.2-py3-none-any.whl", hash = "sha256:be6ecb39fadd986ef1701097771f87e4d2f821f27f6071c872143884d2950fbc", size = 84151 }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050 }, +] + +[[package]] +name = "tomlkit" +version = "0.13.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b1/09/a439bec5888f00a54b8b9f05fa94d7f901d6735ef4e55dcec9bc37b5d8fa/tomlkit-0.13.2.tar.gz", hash = "sha256:fff5fe59a87295b278abd31bec92c15d9bc4a06885ab12bcea52c71119392e79", size = 192885 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f9/b6/a447b5e4ec71e13871be01ba81f5dfc9d0af7e473da256ff46bc0e24026f/tomlkit-0.13.2-py3-none-any.whl", hash = "sha256:7a974427f6e119197f670fbbbeae7bef749a6c14e793db934baefc1b5f03efde", size = 37955 }, +] + +[[package]] +name = "typing-extensions" +version = "4.12.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 }, +] + +[[package]] +name = "urllib3" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/63/e53da845320b757bf29ef6a9062f5c669fe997973f966045cb019c3f4b66/urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d", size = 307268 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c8/19/4ec628951a74043532ca2cf5d97b7b14863931476d117c471e8e2b1eb39f/urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df", size = 128369 }, +] + +[[package]] +name = "wcmatch" +version = "10.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "bracex" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/41/ab/b3a52228538ccb983653c446c1656eddf1d5303b9cb8b9aef6a91299f862/wcmatch-10.0.tar.gz", hash = "sha256:e72f0de09bba6a04e0de70937b0cf06e55f36f37b3deb422dfaf854b867b840a", size = 115578 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/df/4ee467ab39cc1de4b852c212c1ed3becfec2e486a51ac1ce0091f85f38d7/wcmatch-10.0-py3-none-any.whl", hash = "sha256:0dd927072d03c0a6527a20d2e6ad5ba8d0380e60870c383bc533b71744df7b7a", size = 39347 }, +] + +[[package]] +name = "wcwidth" +version = "0.2.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/63/53559446a878410fc5a5974feb13d31d78d752eb18aeba59c7fef1af7598/wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5", size = 101301 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 }, +] diff --git a/infra/mwaa.tf b/infra/mwaa.tf index aef92b5..6d5c893 100644 --- a/infra/mwaa.tf +++ b/infra/mwaa.tf @@ -190,9 +190,8 @@ resource "aws_iam_role_policy_attachment" "mwaa_execution_role_policy_attachment # S3 resource "aws_s3_bucket" "mwaa_source_bucket" { - count = var.mwaa_environment_name != "" ? 1 : 0 - bucket = var.mwaa_source_bucket_name - force_destroy = "false" + count = var.mwaa_environment_name != "" ? 1 : 0 + bucket = var.mwaa_source_bucket_name } resource "aws_s3_bucket_versioning" "mwaa_source_bucket" { diff --git a/infra/route_53.tf b/infra/route_53.tf index 9743edf..6995b2d 100644 --- a/infra/route_53.tf +++ b/infra/route_53.tf @@ -288,4 +288,4 @@ resource "aws_acm_certificate_validation" "airflow_webserver" { # resource "aws_acm_certificate_validation" "jupyterhub" { # certificate_arn = "${aws_acm_certificate.jupyterhub.arn}" -# } +# } \ No newline at end of file diff --git a/infra/s3_notebooks.tf b/infra/s3_notebooks.tf index 8bf300e..3b82e08 100644 --- a/infra/s3_notebooks.tf +++ b/infra/s3_notebooks.tf @@ -76,8 +76,34 @@ data "aws_iam_policy_document" "notebooks" { test = "StringEquals" variable = "aws:SourceVpce" values = [ - aws_vpc_endpoint.s3.id + aws_vpc_endpoint.s3.id, ] } } + + dynamic "statement" { + + for_each = var.sagemaker_on ? [1] : [] + + content { + effect = "Allow" + principals { + type = "*" + identifiers = ["*"] + } + actions = [ + "s3:GetObject", + ] + resources = [ + "arn:aws:s3:::${aws_s3_bucket.notebooks.id}/shared/*", + ] + condition { + test = "StringEquals" + variable = "aws:SourceVpce" + values = [ + aws_vpc_endpoint.sagemaker_s3[0].id, + ] + } + } + } } diff --git a/infra/sagemaker.tf b/infra/sagemaker.tf new file mode 100644 index 0000000..bf94eab --- /dev/null +++ b/infra/sagemaker.tf @@ -0,0 +1,315 @@ +module "sagemaker_domain" { + + count = var.sagemaker_on ? 
1 : 0 + + source = "./modules/sagemaker_init/domain" + domain_name = "SageMaker" + vpc_id = aws_vpc.notebooks.id + subnet_ids = aws_subnet.private_without_egress.*.id + execution_role_arn = module.iam[0].execution_role +} + +# IAM Roles and Policies for SageMaker +module "iam" { + + count = var.sagemaker_on ? 1 : 0 + + source = "./modules/sagemaker_init/iam" + prefix = var.prefix + sagemaker_default_bucket_name = var.sagemaker_default_bucket + aws_s3_bucket_notebook = aws_s3_bucket.notebooks + account_id = data.aws_caller_identity.aws_caller_identity.account_id +} + +resource "aws_security_group" "sagemaker_vpc_endpoints_main" { + + count = var.sagemaker_on ? 1 : 0 + + name = "${var.prefix}-sagemaker-vpc-endpoints-main" + description = "${var.prefix}-sagemaker-vpc-endpoints-main" + vpc_id = aws_vpc.main.id + + tags = { + Name = "${var.prefix}-sagemaker-vpc-endpoints-main" + } + + lifecycle { + create_before_destroy = true + } +} + +resource "aws_security_group_rule" "ingress_sagemaker_vpc_endpoint_notebooks_vpc" { + + count = var.sagemaker_on ? 1 : 0 + + description = "endpoint-ingress-from-notebooks-vpc" + + security_group_id = aws_security_group.sagemaker_vpc_endpoints_main[0].id + cidr_blocks = [aws_vpc.notebooks.cidr_block] + + type = "ingress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + +resource "aws_security_group_rule" "egress_sagemaker_vpc_endpoint_notebooks_vpc" { + + count = var.sagemaker_on ? 1 : 0 + + description = "endpoint-ingress-from-notebooks-vpc" + + security_group_id = aws_security_group.sagemaker_vpc_endpoints_main[0].id + cidr_blocks = [aws_vpc.notebooks.cidr_block] + + type = "egress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + +resource "aws_security_group_rule" "ingress_sagemaker_vpc_endpoint_sagemaker_vpc" { + + count = var.sagemaker_on ? 1 : 0 + + description = "endpoint-ingress-from-sagemaker-vpc" + + security_group_id = aws_security_group.sagemaker_vpc_endpoints_main[0].id + cidr_blocks = [aws_vpc.sagemaker[0].cidr_block] + + type = "ingress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + +resource "aws_security_group_rule" "egress_sagemaker_vpc_endpoints_sagemaker_vpc" { + + count = var.sagemaker_on ? 1 : 0 + + description = "endpoint-ingress-from-sagemaker-vpc" + + security_group_id = aws_security_group.sagemaker_vpc_endpoints_main[0].id + cidr_blocks = [aws_vpc.sagemaker[0].cidr_block] + + type = "egress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + +############################### +## To test new SageMaker VPC ## +############################### + +resource "aws_security_group" "sagemaker_endpoints" { + + count = var.sagemaker_on ? 1 : 0 + + name = "${var.prefix}-sagemaker-endpoints" + description = "${var.prefix}-sagemaker-endpoints" + vpc_id = aws_vpc.sagemaker[0].id + + tags = { + Name = "${var.prefix}-sagemaker-endpoints" + } + + lifecycle { + create_before_destroy = true + } +} + +resource "aws_security_group_rule" "notebooks_endpoint_ingress_sagemaker_test" { + + count = var.sagemaker_on ? 1 : 0 + + description = "endpoint-ingress-sagemaker-to-notebooks-vpc" + + security_group_id = aws_security_group.sagemaker_endpoints[0].id + cidr_blocks = [aws_vpc.notebooks.cidr_block] + + type = "ingress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + +resource "aws_security_group_rule" "notebooks_endpoint_egress_sagemaker_test" { + + count = var.sagemaker_on ? 
1 : 0 + + description = "endpoint-egress-notebooks-to-sagemaker-vpc" + + security_group_id = aws_security_group.sagemaker_endpoints[0].id + cidr_blocks = [aws_vpc.notebooks.cidr_block] + + type = "egress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + +resource "aws_security_group_rule" "sagemaker_vpc_endpoint_egress" { + + count = var.sagemaker_on ? 1 : 0 + + description = "endpoint-egress-notebooks-to-sagemaker-vpc" + + security_group_id = aws_security_group.sagemaker_endpoints[0].id + cidr_blocks = ["0.0.0.0/0"] + + type = "egress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + + + +resource "aws_security_group" "main_to_sagemaker" { + + count = var.sagemaker_on ? 1 : 0 + + name = "${var.prefix}-main-to-sagemaker-endpoints" + description = "${var.prefix}sagemaker-access-VPC-endpoints-in-main" + vpc_id = aws_vpc.main.id + + tags = { + Name = "${var.prefix}-sagemaker-endpoints-main" + } + + lifecycle { + create_before_destroy = true + } +} + +resource "aws_security_group_rule" "sagemaker_to_main_ingress" { + + count = var.sagemaker_on ? 1 : 0 + + description = "endpoint-ingress-sagemaker-to-main-vpc" + + security_group_id = aws_security_group.main_to_sagemaker[0].id + cidr_blocks = [aws_vpc.sagemaker[0].cidr_block] + + type = "ingress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + +resource "aws_security_group_rule" "sagemaker_to_main_egress" { + + count = var.sagemaker_on ? 1 : 0 + + description = "endpoint-egress-sagemaker-to-main-vpc" + + security_group_id = aws_security_group.main_to_sagemaker[0].id + cidr_blocks = [aws_vpc.sagemaker[0].cidr_block] + + type = "egress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + + +#### Used to allow access to VPC endpoints in Main + +resource "aws_security_group_rule" "main_ingress_sagemaker_endpoints" { + + count = var.sagemaker_on ? 1 : 0 + + description = "endpoint-ingress-sagemaker-to-main-vpc" + + security_group_id = aws_security_group.sagemaker_endpoints[0].id + cidr_blocks = [aws_vpc.main.cidr_block] + + type = "ingress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + +resource "aws_security_group_rule" "sagemaker_endpoints_egress_main" { + + count = var.sagemaker_on ? 1 : 0 + + description = "endpoint-egress-notebooks-to-main-vpc" + + security_group_id = aws_security_group.sagemaker_endpoints[0].id + cidr_blocks = [aws_vpc.main.cidr_block] + + type = "egress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + +# SageMaker Execution Role Output +output "execution_role" { + value = module.iam[*].execution_role + description = "The ARN of the SageMaker execution role" +} + + +# SageMaker Inference Role Output +output "inference_role" { + value = module.iam[*].inference_role + description = "The ARN of the SageMaker inference role" +} + +# SageMaker Domain Output +output "sagemaker_domain_id" { + value = module.sagemaker_domain[*].sagemaker_domain_id + description = "The ID of the SageMaker domain" +} + +output "default_sagemaker_bucket" { + value = module.iam[*].default_sagemaker_bucket +} + +module "cost_monitoring_dashboard" { + + count = var.sagemaker_on ? 1 : 0 + + source = "./modules/cost_monitoring/cloudwatch_dashboard" + dashboard_name = "aws-cost-monitoring-dashboard" + services_to_monitor = [ + "AmazonSageMaker", + "AmazonEC2", + "AmazonS3" + ] +} + +module "sns" { + + count = var.sagemaker_on ? 
1 : 0 + + source = "./modules/cost_monitoring/sns" + prefix = "data-workspace-sagemaker" + account_id = data.aws_caller_identity.aws_caller_identity.account_id + #notification_email = var.sagemaker_budget_emails +} + +module "sagemaker_output_mover" { + + count = var.sagemaker_on ? 1 : 0 + + source = "./modules/sagemaker_output_mover" + account_id = data.aws_caller_identity.aws_caller_identity.account_id + aws_region = data.aws_region.aws_region.name + s3_bucket_notebooks_arn = aws_s3_bucket.notebooks.arn +} + +module "budgets" { + + count = var.sagemaker_on ? 1 : 0 + + source = "./modules/cost_monitoring/budgets" + budget_limit = "1000" + cost_filter_service = "Amazon SageMaker" + budget_name = "sagemaker-budget" + sns_topic_arn = module.sns[0].sns_topic_arn + notification_email = var.sagemaker_budget_emails +} diff --git a/infra/sagemaker_llm_resources.tf b/infra/sagemaker_llm_resources.tf new file mode 100644 index 0000000..6df788d --- /dev/null +++ b/infra/sagemaker_llm_resources.tf @@ -0,0 +1,310 @@ +################ +# GPT Neo 125m +############### +module "gpt_neo_125m_deployment" { + + count = (var.sagemaker_on && var.sagemaker_gpt_neo_125m) ? 1 : 0 + + model_name = "gpt-neo-125m" + container_image = "763104351884.dkr.ecr.eu-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi1.4.0-gpu-py310-cu121-ubuntu20.04" + model_uri = "s3://jumpstart-cache-prod-eu-west-2/huggingface-textgeneration1/huggingface-textgeneration1-gpt-neo-125m/artifacts/inference-prepack/v2.0.0/" + model_uri_compression = "None" + instance_type = "ml.g5.2xlarge" # 8 vCPU and 1 GPU and 32 GB-RAM + max_capacity = 2 + min_capacity = 0 + scale_up_cooldown = 900 + scale_down_cooldown = 0 + environment_variables = { + "ENDPOINT_SERVER_TIMEOUT" : "3600", + "HF_MODEL_ID" : "/opt/ml/model", + "MAX_INPUT_LENGTH" : "1024", + "MAX_TOTAL_TOKENS" : "2048", + "MODEL_CACHE_ROOT" : "/opt/ml/model", + "SAGEMAKER_ENV" : "1", + "SAGEMAKER_MODEL_SERVER_WORKERS" : "1", + "SAGEMAKER_PROGRAM" : "inference.py", + "SM_NUM_GPUS" : "1" + } + backlog_threshold_high = 1 + backlog_threshold_low = 1 + cpu_threshold_high = 80 * 8 # 8 vCPUs + cpu_threshold_low = 20 * 8 # 8 vCPUs + gpu_threshold_high = 80 * 1 # 1 GPU + gpu_threshold_low = 20 * 1 # 1 GPU + ram_threshold_high = 80 + ram_threshold_low = 20 + evaluation_periods_high = 1 + datapoints_to_alarm_high = 1 + evaluation_periods_low = 15 + datapoints_to_alarm_low = 15 + + # These variables do not change between LLMs + source = "./modules/sagemaker_deployment" + security_group_ids = [aws_security_group.sagemaker[0].id, aws_security_group.sagemaker_endpoints[0].id] + subnets = aws_subnet.sagemaker_private_without_egress.*.id + s3_output_path = "https://${module.iam[0].default_sagemaker_bucket.bucket_regional_domain_name}" + aws_account_id = data.aws_caller_identity.aws_caller_identity.account_id + sns_success_topic_arn = module.sagemaker_output_mover[0].sns_success_topic_arn + execution_role_arn = module.iam[0].inference_role + teams_webhook_url = var.teams_webhook_url +} + +################ +# Flan T5 780m (Large) +############### +module "flan_t5_780m_deployment" { + + count = (var.sagemaker_on && var.sagemaker_flan_t5_780m) ? 
1 : 0 + + model_name = "flan-t5-780m" + container_image = "763104351884.dkr.ecr.eu-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi1.4.0-gpu-py310-cu121-ubuntu20.04" + model_uri = "s3://jumpstart-cache-prod-eu-west-2/huggingface-text2text/huggingface-text2text-flan-t5-large/artifacts/inference-prepack/v2.0.0/" + model_uri_compression = "None" + instance_type = "ml.g5.2xlarge" # 8 vCPU and 1 GPU and 32 GB-RAM + max_capacity = 2 + min_capacity = 0 + scale_up_cooldown = 900 + scale_down_cooldown = 0 + environment_variables = { + "ENDPOINT_SERVER_TIMEOUT" : "3600", + "HF_MODEL_ID" : "/opt/ml/model", + "MAX_INPUT_LENGTH" : "1024", + "MAX_TOTAL_TOKENS" : "2048", + "MODEL_CACHE_ROOT" : "/opt/ml/model", + "SAGEMAKER_ENV" : "1", + "SAGEMAKER_MODEL_SERVER_WORKERS" : "1", + "SAGEMAKER_PROGRAM" : "inference.py", + "SM_NUM_GPUS" : "1" + } + backlog_threshold_high = 1 + backlog_threshold_low = 1 + cpu_threshold_high = 80 * 8 # 8 vCPUs + cpu_threshold_low = 20 * 8 # 8 vCPUs + gpu_threshold_high = 80 * 1 # 1 GPU + gpu_threshold_low = 20 * 1 # 1 GPU + ram_threshold_high = 80 + ram_threshold_low = 20 + evaluation_periods_high = 1 + datapoints_to_alarm_high = 1 + evaluation_periods_low = 15 + datapoints_to_alarm_low = 15 + + # These variables do not change between LLMs + source = "./modules/sagemaker_deployment" + security_group_ids = [aws_security_group.sagemaker[0].id, aws_security_group.sagemaker_endpoints[0].id] + subnets = aws_subnet.sagemaker_private_without_egress.*.id + s3_output_path = "https://${module.iam[0].default_sagemaker_bucket.bucket_regional_domain_name}" + aws_account_id = data.aws_caller_identity.aws_caller_identity.account_id + sns_success_topic_arn = module.sagemaker_output_mover[0].sns_success_topic_arn + execution_role_arn = module.iam[0].inference_role + teams_webhook_url = var.teams_webhook_url +} + + +############### +# Phi 2 3b +############### +module "phi_2_3b_deployment" { + + count = (var.sagemaker_on && var.sagemaker_phi_2_3b) ? 
1 : 0 + + model_name = "phi-2-3b" + container_image = "763104351884.dkr.ecr.eu-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi1.4.2-gpu-py310-cu121-ubuntu22.04" + model_uri = "s3://jumpstart-cache-prod-eu-west-2/huggingface-llm/huggingface-llm-phi-2/artifacts/inference-prepack/v1.0.0/" + model_uri_compression = "None" + instance_type = "ml.g5.xlarge" # 4 vCPU and 1 GPU and 16 GB-RAM + max_capacity = 2 + min_capacity = 0 + scale_up_cooldown = 900 + scale_down_cooldown = 0 + environment_variables = { + "ENDPOINT_SERVER_TIMEOUT" : "3600", + "HF_MODEL_ID" : "/opt/ml/model", + "MAX_INPUT_LENGTH" : "2047", + "MAX_TOTAL_TOKENS" : "2048", + "MODEL_CACHE_ROOT" : "/opt/ml/model", + "SAGEMAKER_ENV" : "1", + "SAGEMAKER_MODEL_SERVER_WORKERS" : "1", + "SAGEMAKER_PROGRAM" : "inference.py" + } + backlog_threshold_high = 1 + backlog_threshold_low = 1 + cpu_threshold_high = 80 * 4 # 4 vCPUs + cpu_threshold_low = 20 * 4 # 4 vCPUs + gpu_threshold_high = 80 * 1 # 1 GPU + gpu_threshold_low = 20 * 1 # 1 GPU + ram_threshold_high = 80 + ram_threshold_low = 20 + evaluation_periods_high = 1 + datapoints_to_alarm_high = 1 + evaluation_periods_low = 15 + datapoints_to_alarm_low = 15 + + # These variables do not change between LLMs + source = "./modules/sagemaker_deployment" + security_group_ids = [aws_security_group.sagemaker[0].id, aws_security_group.sagemaker_endpoints[0].id] + subnets = aws_subnet.sagemaker_private_without_egress.*.id + s3_output_path = "https://${module.iam[0].default_sagemaker_bucket.bucket_regional_domain_name}" + aws_account_id = data.aws_caller_identity.aws_caller_identity.account_id + sns_success_topic_arn = module.sagemaker_output_mover[0].sns_success_topic_arn + execution_role_arn = module.iam[0].inference_role + teams_webhook_url = var.teams_webhook_url +} + + +############### +# Llama 3.2 3b +############### +module "llama_3_3b_deployment" { + + count = (var.sagemaker_on && var.sagemaker_llama_3_3b) ? 
1 : 0 + + model_name = "llama-3-3b" + container_image = "763104351884.dkr.ecr.eu-west-2.amazonaws.com/djl-inference:0.31.0-lmi13.0.0-cu124" + model_uri = "s3://jumpstart-private-cache-prod-eu-west-2/meta-textgeneration/meta-textgeneration-llama-3-2-3b/artifacts/inference-prepack/v1.0.0/" + model_uri_compression = "None" + instance_type = "ml.g6.xlarge" # 4 vCPU and 1 GPU and 16 GB-RAM + max_capacity = 2 + min_capacity = 0 + scale_up_cooldown = 900 * 4 + scale_down_cooldown = 0 + environment_variables = { + "ENDPOINT_SERVER_TIMEOUT" : "3600", + "HF_MODEL_ID" : "/opt/ml/model", + "MODEL_CACHE_ROOT" : "/opt/ml/model", + "OPTION_ENFORCE_EAGER" : "false", + "OPTION_GPU_MEMORY_UTILIZATION" : "0.95", + "OPTION_MAX_ROLLING_BATCH_SIZE" : "8", + "OPTION_TENSOR_PARALLEL_DEGREE" : "1", + "SAGEMAKER_ENV" : "1", + "SAGEMAKER_MODEL_SERVER_WORKERS" : "1", + "SAGEMAKER_PROGRAM" : "inference.py" + } + backlog_threshold_high = 1 + backlog_threshold_low = 1 + cpu_threshold_high = 80 * 4 # 4 vCPUs + cpu_threshold_low = 20 * 4 # 4 vCPUs + gpu_threshold_high = 80 * 1 # 1 GPU + gpu_threshold_low = 20 * 1 # 1 GPU + ram_threshold_high = 80 + ram_threshold_low = 20 + evaluation_periods_high = 1 + datapoints_to_alarm_high = 1 + evaluation_periods_low = 15 * 4 + datapoints_to_alarm_low = 15 * 4 + + # These variables do not change between LLMs + source = "./modules/sagemaker_deployment" + security_group_ids = [aws_security_group.sagemaker[0].id, aws_security_group.sagemaker_endpoints[0].id] + subnets = aws_subnet.sagemaker_private_without_egress.*.id + s3_output_path = "https://${module.iam[0].default_sagemaker_bucket.bucket_regional_domain_name}" + aws_account_id = data.aws_caller_identity.aws_caller_identity.account_id + sns_success_topic_arn = module.sagemaker_output_mover[0].sns_success_topic_arn + execution_role_arn = module.iam[0].inference_role + teams_webhook_url = var.teams_webhook_url +} + + +############### +# Llama 3.2 3b-instruct +############### +module "llama_3_3b_instruct_deployment" { + + count = (var.sagemaker_on && var.sagemaker_llama_3_3b_instruct) ? 
1 : 0 + + model_name = "llama-3-3b-instruct" + container_image = "763104351884.dkr.ecr.eu-west-2.amazonaws.com/djl-inference:0.31.0-lmi13.0.0-cu124" + model_uri = "s3://jumpstart-private-cache-prod-eu-west-2/meta-textgeneration/meta-textgeneration-llama-3-2-3b-instruct/artifacts/inference-prepack/v1.0.0/" + model_uri_compression = "None" + instance_type = "ml.g6.xlarge" # 4 vCPU and 1 GPU and 16 GB-RAM + max_capacity = 2 + min_capacity = 0 + scale_up_cooldown = 900 * 4 + scale_down_cooldown = 0 + environment_variables = { + "ENDPOINT_SERVER_TIMEOUT" : "3600", + "HF_MODEL_ID" : "/opt/ml/model", + "MODEL_CACHE_ROOT" : "/opt/ml/model", + "OPTION_ENFORCE_EAGER" : "false", + "OPTION_GPU_MEMORY_UTILIZATION" : "0.95", + "OPTION_MAX_ROLLING_BATCH_SIZE" : "8", + "OPTION_TENSOR_PARALLEL_DEGREE" : "1", + "SAGEMAKER_ENV" : "1", + "SAGEMAKER_MODEL_SERVER_WORKERS" : "1", + "SAGEMAKER_PROGRAM" : "inference.py" + } + backlog_threshold_high = 1 + backlog_threshold_low = 1 + cpu_threshold_high = 80 * 4 # 4 vCPUs + cpu_threshold_low = 20 * 4 # 4 vCPUs + gpu_threshold_high = 80 * 1 # 1 GPU + gpu_threshold_low = 20 * 1 # 1 GPU + ram_threshold_high = 80 + ram_threshold_low = 20 + evaluation_periods_high = 1 + datapoints_to_alarm_high = 1 + evaluation_periods_low = 15 * 4 + datapoints_to_alarm_low = 15 * 4 + + # These variables do not change between LLMs + source = "./modules/sagemaker_deployment" + security_group_ids = [aws_security_group.sagemaker[0].id, aws_security_group.sagemaker_endpoints[0].id] + subnets = aws_subnet.sagemaker_private_without_egress.*.id + s3_output_path = "https://${module.iam[0].default_sagemaker_bucket.bucket_regional_domain_name}" + aws_account_id = data.aws_caller_identity.aws_caller_identity.account_id + sns_success_topic_arn = module.sagemaker_output_mover[0].sns_success_topic_arn + execution_role_arn = module.iam[0].inference_role + teams_webhook_url = var.teams_webhook_url +} + + +############### +# Mistral 7b-instruct +############### +module "mistral_7b_instruct_deployment" { + + count = (var.sagemaker_on && var.sagemaker_mistral_7b_instruct) ? 
1 : 0 + + model_name = "mistral-7b-instruct" + container_image = "763104351884.dkr.ecr.eu-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.3.0-tgi2.0.3-gpu-py310-cu121-ubuntu22.04" + model_uri = "s3://jumpstart-cache-prod-eu-west-2/huggingface-llm/huggingface-llm-mistral-7b-instruct-v3/artifacts/inference-prepack/v1.0.0/" + model_uri_compression = "None" + instance_type = "ml.g5.12xlarge" # 48 vCPU and 4 GPU and 192 GB-RAM + max_capacity = 2 + min_capacity = 0 + scale_up_cooldown = 900 * 4 + scale_down_cooldown = 0 + environment_variables = { + "ENDPOINT_SERVER_TIMEOUT" : "3600", + "HF_MODEL_ID" : "/opt/ml/model", + "MAX_BATCH_PREFILL_TOKENS" : "8191", + "MAX_INPUT_LENGTH" : "8191", + "MAX_TOTAL_TOKENS" : "8192", + "MODEL_CACHE_ROOT" : "/opt/ml/model", + "SAGEMAKER_ENV" : "1", + "SAGEMAKER_MODEL_SERVER_WORKERS" : "1", + "SAGEMAKER_PROGRAM" : "inference.py", + } + backlog_threshold_high = 1 + backlog_threshold_low = 1 + cpu_threshold_high = 80 * 48 # 48 vCPUs + cpu_threshold_low = 20 * 48 # 48 vCPUs + gpu_threshold_high = 80 * 4 # 4 GPUs + gpu_threshold_low = 20 * 4 # 4 GPUs + ram_threshold_high = 80 + ram_threshold_low = 20 + evaluation_periods_high = 1 + datapoints_to_alarm_high = 1 + evaluation_periods_low = 15 * 4 + datapoints_to_alarm_low = 15 * 4 + + # These variables do not change between LLMs + source = "./modules/sagemaker_deployment" + security_group_ids = [aws_security_group.sagemaker[0].id, aws_security_group.sagemaker_endpoints[0].id] + subnets = aws_subnet.sagemaker_private_without_egress.*.id + s3_output_path = "https://${module.iam[0].default_sagemaker_bucket.bucket_regional_domain_name}" + aws_account_id = data.aws_caller_identity.aws_caller_identity.account_id + sns_success_topic_arn = module.sagemaker_output_mover[0].sns_success_topic_arn + execution_role_arn = module.iam[0].inference_role + teams_webhook_url = var.teams_webhook_url +} diff --git a/infra/security_groups.tf b/infra/security_groups.tf index bc43b92..5762f54 100644 --- a/infra/security_groups.tf +++ b/infra/security_groups.tf @@ -560,6 +560,164 @@ resource "aws_security_group_rule" "notebooks_egress_arango_lb" { protocol = "tcp" } +resource "aws_security_group_rule" "sagemaker_endpoint_ingress_to_notebooks" { + + count = var.sagemaker_on ? 1 : 0 + + description = "ingress-from-sagemaker-endpoints" + + security_group_id = aws_security_group.notebooks.id + source_security_group_id = aws_security_group.sagemaker_endpoints[0].id + + type = "ingress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + +resource "aws_security_group_rule" "sagemaker_endpoint_egress_to_notebooks" { + + count = var.sagemaker_on ? 1 : 0 + + description = "egress-to-sagemaker-endpoints" + + security_group_id = aws_security_group.notebooks.id + source_security_group_id = aws_security_group.sagemaker_endpoints[0].id + + type = "egress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + +resource "aws_security_group_rule" "notebooks_egress_sagemaker_vpc" { + + count = var.sagemaker_on ? 1 : 0 + + description = "egress-sagemaker-vpc" + + security_group_id = aws_security_group.notebooks.id + cidr_blocks = ["0.0.0.0/0"] + + type = "egress" + from_port = "0" + to_port = "65535" + protocol = "TCP" +} + +resource "aws_security_group_rule" "notebooks_ingress_sagemaker_vpc" { + + count = var.sagemaker_on ? 
1 : 0 + + description = "egress-sagemaker-vpc" + + security_group_id = aws_security_group.notebooks.id + cidr_blocks = [aws_vpc.sagemaker[0].cidr_block] + + type = "ingress" + from_port = "443" + to_port = "443" + protocol = "TCP" +} + + +########################### +## To test SageMaker VPC ## +########################### + +resource "aws_security_group" "sagemaker" { + + count = var.sagemaker_on ? 1 : 0 + + name = "${var.prefix}-sagemaker" + description = "${var.prefix}-sagemaker" + vpc_id = aws_vpc.sagemaker[0].id + + tags = { + Name = "${var.prefix}-sagemaker" + } + + lifecycle { + create_before_destroy = true + } +} + +resource "aws_security_group_rule" "sagemaker_endpoint_ingress_to_sagemaker_vpc" { + + count = var.sagemaker_on ? 1 : 0 + + description = "ingress-from-sagemaker-endpoints" + + security_group_id = aws_security_group.sagemaker[0].id + source_security_group_id = aws_security_group.sagemaker_endpoints[0].id + + type = "ingress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + +resource "aws_security_group_rule" "sagemaker_endpoint_egress_to_sagemaker_vpc" { + + count = var.sagemaker_on ? 1 : 0 + + description = "egress-to-sagemaker-endpoints" + + security_group_id = aws_security_group.sagemaker[0].id + source_security_group_id = aws_security_group.sagemaker_endpoints[0].id + + type = "egress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + +resource "aws_security_group_rule" "egress_sagemaker_vpc" { + + count = var.sagemaker_on ? 1 : 0 + + description = "egress-sagemaker-vpc" + + security_group_id = aws_security_group.sagemaker[0].id + cidr_blocks = [aws_vpc.main.cidr_block] + + type = "egress" + from_port = "0" + to_port = "65535" + protocol = "tcp" +} + +resource "aws_security_group_rule" "ingress_notebooks_vpc" { + + count = var.sagemaker_on ? 1 : 0 + + description = "egress-sagemaker-vpc" + + security_group_id = aws_security_group.sagemaker[0].id + cidr_blocks = [aws_vpc.notebooks.cidr_block] + + type = "ingress" + from_port = "443" + to_port = "443" + protocol = "tcp" +} + +resource "aws_security_group_rule" "ingress_main_vpc" { + + count = var.sagemaker_on ? 1 : 0 + + description = "ingress-main-sagemaker-vpc" + + security_group_id = aws_security_group.sagemaker[0].id + cidr_blocks = [aws_vpc.main.cidr_block] + + type = "ingress" + from_port = "443" + to_port = "443" + protocol = "tcp" +} + +####################### resource "aws_security_group" "cloudwatch" { name = "${var.prefix}-cloudwatch" diff --git a/infra/vpc.tf b/infra/vpc.tf index ad2b27c..bcd0db3 100644 --- a/infra/vpc.tf +++ b/infra/vpc.tf @@ -286,6 +286,15 @@ resource "aws_route" "private_without_egress_to_jupyterhub" { vpc_peering_connection_id = aws_vpc_peering_connection.jupyterhub.id } +resource "aws_route" "pcx_notebooks_to_sagemaker_endpoints" { + + count = var.sagemaker_on ? length(var.aws_availability_zones) : 0 + + route_table_id = aws_route_table.private_without_egress.id + destination_cidr_block = aws_subnet.sagemaker_private_without_egress.*.cidr_block[count.index] + vpc_peering_connection_id = aws_vpc_peering_connection.sagemaker_to_notebooks[0].id +} + resource "aws_route" "private_without_egress_to_matchbox" { count = var.matchbox_on ? 
@@ -545,7 +554,7 @@ data "aws_iam_policy_document" "datasets_s3_endpoint" {
     }
 
     actions = [
-      "s3:GetObject",
+      "s3:GetObject"
     ]
 
     resources = [
@@ -841,6 +850,331 @@ data "aws_iam_policy_document" "aws_datasets_endpoint_ecr" {
   }
 }
 
+######################################
+### New VPC & Subnet for SageMaker ###
+######################################
+
+resource "aws_vpc" "sagemaker" {
+  count = var.sagemaker_on ? 1 : 0
+
+  cidr_block           = var.vpc_sagemaker_cidr
+  enable_dns_support   = true
+  enable_dns_hostnames = true
+
+  tags = {
+    Name = "${var.prefix}-sagemaker"
+  }
+
+  lifecycle {
+    create_before_destroy = true
+  }
+}
+
+
+resource "aws_subnet" "sagemaker_private_without_egress" {
+  count = var.sagemaker_on ? length(var.aws_availability_zones) : 0
+
+  vpc_id            = aws_vpc.sagemaker[0].id
+  cidr_block        = cidrsubnet(aws_vpc.sagemaker[0].cidr_block, var.vpc_sagemaker_subnets_num_bits, count.index)
+  availability_zone = var.aws_availability_zones[count.index]
+
+  tags = {
+    Name = "${var.prefix}-sagemaker-private-without-egress-${var.aws_availability_zones_short[count.index]}"
+  }
+
+  lifecycle {
+    create_before_destroy = true
+  }
+}
+
+##################################################################
+### VPC Peering Connections from SageMaker to Main & Notebooks ###
+##################################################################
+
+resource "aws_vpc_peering_connection" "main_to_sagemaker" {
+
+  count = var.sagemaker_on ? 1 : 0
+
+  peer_vpc_id = aws_vpc.sagemaker[0].id
+  vpc_id      = aws_vpc.main.id
+  auto_accept = true
+
+  accepter {
+    allow_remote_vpc_dns_resolution = true
+  }
+
+  requester {
+    allow_remote_vpc_dns_resolution = true
+  }
+
+  tags = {
+    Name = "${var.prefix}-main-to-sagemaker"
+  }
+}
+
+# To enable connectivity between tools in the notebooks VPC and SageMaker
+resource "aws_vpc_peering_connection" "sagemaker_to_notebooks" {
+
+  count = var.sagemaker_on ? 1 : 0
+
+  peer_vpc_id = aws_vpc.notebooks.id
+  vpc_id      = aws_vpc.sagemaker[0].id
+  auto_accept = true
+
+  accepter {
+    allow_remote_vpc_dns_resolution = false
+  }
+
+  requester {
+    allow_remote_vpc_dns_resolution = false
+  }
+
+  tags = {
+    Name = "${var.prefix}-sagemaker-to-notebooks"
+  }
+}
+
+resource "aws_route_table" "sagemaker" {
+
+  count = var.sagemaker_on ? 1 : 0
+
+  vpc_id = aws_vpc.sagemaker[0].id
+  tags = {
+    Name = "${var.prefix}-sagemaker"
+  }
+}
+
+resource "aws_main_route_table_association" "sagemaker" {
+  count          = var.sagemaker_on ? 1 : 0
+  vpc_id         = aws_vpc.sagemaker[0].id
+  route_table_id = aws_route_table.sagemaker[0].id
+}
+
+resource "aws_route_table_association" "private_without_egress_sagemaker" {
+  count          = var.sagemaker_on ? length(var.aws_availability_zones) : 0
+  subnet_id      = aws_subnet.sagemaker_private_without_egress.*.id[count.index]
+  route_table_id = aws_route_table.sagemaker[0].id
+}
+
+resource "aws_route" "main_private_with_egress_to_sagemaker" {
+  count = var.sagemaker_on ? length(var.aws_availability_zones) : 0
+
+  route_table_id            = aws_route_table.private_with_egress.id
+  destination_cidr_block    = aws_subnet.sagemaker_private_without_egress.*.cidr_block[count.index]
+  vpc_peering_connection_id = aws_vpc_peering_connection.main_to_sagemaker[0].id
+}
+
+resource "aws_route" "sagemaker_to_main_private_with_egress" {
+  count = var.sagemaker_on ? length(var.aws_availability_zones) : 0
+
+  route_table_id            = aws_route_table.sagemaker[0].id
+  destination_cidr_block    = aws_subnet.private_with_egress.*.cidr_block[count.index]
+  vpc_peering_connection_id = aws_vpc_peering_connection.main_to_sagemaker[0].id
+}
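The per-AZ destination_cidr_block values that the SageMaker-bound routes use come from the cidrsubnet(...) call in aws_subnet.sagemaker_private_without_egress above, which carves one subnet per availability zone out of var.vpc_sagemaker_cidr. A minimal sketch of that arithmetic, assuming a hypothetical /16 CIDR and 8 extra subnet bits (the real values come from each environment's variables, not from this change):

locals {
  example_sagemaker_cidr = "10.200.0.0/16" # hypothetical value for illustration

  # cidrsubnet(prefix, newbits, netnum) extends the prefix length by `newbits`
  # and selects the `netnum`-th resulting network, one per availability zone.
  example_sagemaker_subnets = [
    cidrsubnet(local.example_sagemaker_cidr, 8, 0), # "10.200.0.0/24" for the first AZ
    cidrsubnet(local.example_sagemaker_cidr, 8, 1), # "10.200.1.0/24" for the second AZ
    cidrsubnet(local.example_sagemaker_cidr, 8, 2), # "10.200.2.0/24" for the third AZ
  ]
}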
+
+resource "aws_route" "pcx_sagemaker_to_notebooks" {
+  count                     = var.sagemaker_on ? 1 : 0
+  route_table_id            = aws_route_table.sagemaker[0].id
+  destination_cidr_block    = aws_vpc.notebooks.cidr_block
+  vpc_peering_connection_id = aws_vpc_peering_connection.sagemaker_to_notebooks[0].id
+}
+
+resource "aws_vpc_endpoint_route_table_association" "s3_sagemaker" {
+  count           = var.sagemaker_on ? 1 : 0
+  vpc_endpoint_id = aws_vpc_endpoint.sagemaker_s3[0].id
+  route_table_id  = aws_route_table.sagemaker[0].id
+}
+
+############################################
+### CloudWatch Logging for SageMaker VPC ###
+############################################
+
+resource "aws_flow_log" "sagemaker" {
+  count           = var.sagemaker_on ? 1 : 0
+  log_destination = aws_cloudwatch_log_group.vpc_sagemaker_flow_log[0].arn
+  iam_role_arn    = aws_iam_role.vpc_sagemaker_flow_log[0].arn
+  vpc_id          = aws_vpc.sagemaker[0].id
+  traffic_type    = "ALL"
+}
+
+resource "aws_cloudwatch_log_group" "vpc_sagemaker_flow_log" {
+  count             = var.sagemaker_on ? 1 : 0
+  name              = "${var.prefix}-vpc-sagemaker-flow-log"
+  retention_in_days = "3653"
+}
+
+resource "aws_iam_role" "vpc_sagemaker_flow_log" {
+  count              = var.sagemaker_on ? 1 : 0
+  name               = "${var.prefix}-vpc-sagemaker-flow-log"
+  assume_role_policy = data.aws_iam_policy_document.vpc_sagemaker_flow_log_vpc_flow_logs_assume_role[0].json
+}
+
+data "aws_iam_policy_document" "vpc_sagemaker_flow_log_vpc_flow_logs_assume_role" {
+  count = var.sagemaker_on ? 1 : 0
+  statement {
+    actions = ["sts:AssumeRole"]
+    principals {
+      type        = "Service"
+      identifiers = ["vpc-flow-logs.amazonaws.com"]
+    }
+  }
+}
+
+#########################################################
+## VPC Endpoints in Main VPC (SageMaker Runtime & API) ##
+#########################################################
+
+resource "aws_vpc_endpoint" "sagemaker_runtime_endpoint_main" {
+  count              = var.sagemaker_on ? 1 : 0
+  vpc_id             = aws_vpc.main.id
+  service_name       = "com.amazonaws.eu-west-2.sagemaker.runtime"
+  vpc_endpoint_type  = "Interface"
+  subnet_ids         = aws_subnet.private_with_egress.*.id
+  security_group_ids = [aws_security_group.sagemaker_vpc_endpoints_main[0].id]
+  tags = {
+    Environment = var.prefix
+    Name        = "main-sagemaker-runtime-endpoint"
+  }
+  private_dns_enabled = true
+  policy              = data.aws_iam_policy_document.sagemaker_vpc_endpoint_policy[0].json
+}
+
+resource "aws_vpc_endpoint" "sagemaker_api_endpoint_main" {
+  count              = var.sagemaker_on ? 1 : 0
+  vpc_id             = aws_vpc.main.id
+  service_name       = "com.amazonaws.eu-west-2.sagemaker.api"
+  vpc_endpoint_type  = "Interface"
+  subnet_ids         = aws_subnet.private_with_egress.*.id
+  security_group_ids = [aws_security_group.sagemaker_vpc_endpoints_main[0].id]
+  tags = {
+    Environment = var.prefix
+    Name        = "main-sagemaker-api-endpoint"
+  }
+  private_dns_enabled = true
+  policy              = data.aws_iam_policy_document.sagemaker_vpc_endpoint_policy[0].json
+}
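Because both interface endpoints above set private_dns_enabled = true, the standard regional SageMaker API and Runtime hostnames resolve to these endpoints from inside the main VPC, so clients need no endpoint-URL overrides, and every call is then constrained by the endpoint policy defined below. If the private DNS entries ever need to be surfaced for verification, an output along the following lines could be added (the output name is an assumption for illustration, not something defined by this change):

output "sagemaker_endpoints_main_dns" {
  value = var.sagemaker_on ? {
    runtime = aws_vpc_endpoint.sagemaker_runtime_endpoint_main[0].dns_entry
    api     = aws_vpc_endpoint.sagemaker_api_endpoint_main[0].dns_entry
  } : null
}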
+data "aws_iam_policy_document" "sagemaker_vpc_endpoint_policy" {
+  count = var.sagemaker_on ? 1 : 0
+  statement {
+    principals {
+      type        = "AWS"
+      identifiers = ["*"]
+    }
+    actions = [
+      "sagemaker:DescribeEndpoint",
+      "sagemaker:DescribeEndpointConfig",
+      "sagemaker:DescribeModel",
+      "sagemaker:InvokeEndpointAsync",
+      "sagemaker:ListEndpoints",
+      "sagemaker:ListEndpointConfigs",
+      "sagemaker:ListModels",
+    ]
+    resources = [
+      "*"
+    ]
+  }
+}
+
+###################################################
+## VPC Endpoints in SageMaker VPC (SNS, S3, ECR) ##
+###################################################
+
+resource "aws_vpc_endpoint" "sagemaker_s3" {
+  count             = var.sagemaker_on ? 1 : 0
+  vpc_id            = aws_vpc.sagemaker[0].id
+  service_name      = "com.amazonaws.${data.aws_region.aws_region.name}.s3"
+  vpc_endpoint_type = "Gateway"
+  route_table_ids   = [aws_route_table.sagemaker[0].id]
+}
+
+resource "aws_vpc_endpoint" "sagemaker_ecr_api_endpoint" {
+  count              = var.sagemaker_on ? 1 : 0
+  vpc_id             = aws_vpc.sagemaker[0].id
+  service_name       = "com.amazonaws.eu-west-2.ecr.api"
+  vpc_endpoint_type  = "Interface"
+  subnet_ids         = aws_subnet.sagemaker_private_without_egress.*.id
+  security_group_ids = [aws_security_group.sagemaker_endpoints[0].id]
+  tags = {
+    Environment = var.prefix
+    Name        = "sagemaker-ecr-api-endpoint"
+  }
+  private_dns_enabled = true
+  policy              = data.aws_iam_policy_document.aws_sagemaker_endpoint_ecr[0].json
+}
+
+resource "aws_vpc_endpoint" "sagemaker_ecr_dkr_endpoint" {
+  count              = var.sagemaker_on ? 1 : 0
+  vpc_id             = aws_vpc.sagemaker[0].id
+  service_name       = "com.amazonaws.eu-west-2.ecr.dkr"
+  vpc_endpoint_type  = "Interface"
+  subnet_ids         = aws_subnet.sagemaker_private_without_egress.*.id
+  security_group_ids = [aws_security_group.sagemaker_endpoints[0].id]
+  tags = {
+    Environment = var.prefix
+    Name        = "sagemaker-ecr-dkr-endpoint"
+  }
+  private_dns_enabled = true
+  policy              = data.aws_iam_policy_document.aws_sagemaker_endpoint_ecr[0].json
+}
+
+
+data "aws_iam_policy_document" "aws_sagemaker_endpoint_ecr" {
+  count = var.sagemaker_on ? 1 : 0
+  # Shared policy for both the ECR API and ECR DKR endpoints, as recommended
+
+  statement {
+    principals {
+      type        = "AWS"
+      identifiers = ["*"]
+    }
+
+    actions = [
+      "ecr:GetAuthorizationToken",
+      "ecr:BatchGetImage",
+      "ecr:GetDownloadUrlForLayer"
+    ]
+
+    resources = [
+      "*",
+    ]
+  }
+}
+
+resource "aws_vpc_endpoint" "sns_endpoint_sagemaker" {
+  count              = var.sagemaker_on ? 1 : 0
+  vpc_id             = aws_vpc.sagemaker[0].id
+  service_name       = "com.amazonaws.eu-west-2.sns"
+  vpc_endpoint_type  = "Interface"
+  subnet_ids         = aws_subnet.sagemaker_private_without_egress.*.id
+  security_group_ids = [aws_security_group.sagemaker_endpoints[0].id, aws_security_group.sagemaker[0].id]
+  tags = {
+    Environment = var.prefix
+    Name        = "sns-endpoint"
+  }
+  private_dns_enabled = true
+  policy              = data.aws_iam_policy_document.sns_endpoint_policy[0].json
+}
+
+
+data "aws_iam_policy_document" "sns_endpoint_policy" {
+  count = var.sagemaker_on ? 1 : 0
+  statement {
+    principals {
+      type        = "AWS"
+      identifiers = ["*"]
+    }
+    actions = [
+      "SNS:Subscribe",
+      "SNS:Receive",
+      "SNS:Publish",
+    ]
+    resources = [
+      "*"
+    ]
+  }
+}
+
 resource "aws_vpc" "matchbox" {
   count = var.matchbox_on ? 1 : 0