From cab4c662c7e7e9f659749848d1692e874ac7b62f Mon Sep 17 00:00:00 2001 From: Rocket Date: Thu, 16 May 2024 14:34:11 -0700 Subject: [PATCH] Add support for ECS Exec for debugging (#594) --- docs/infra/service-command-execution.md | 69 ++++++++++++++++++++ infra/app/app-config/dev.tf | 5 ++ infra/app/app-config/env-config/outputs.tf | 15 +++-- infra/app/app-config/env-config/variables.tf | 6 ++ infra/app/app-config/prod.tf | 5 ++ infra/app/app-config/staging.tf | 5 ++ infra/app/service/main.tf | 7 +- infra/modules/network/variables.tf | 6 ++ infra/modules/network/vpc-endpoints.tf | 3 + infra/modules/service/command-execution.tf | 31 +++++++++ infra/modules/service/main.tf | 13 ++-- infra/modules/service/variables.tf | 5 ++ infra/networks/main.tf | 9 +++ 13 files changed, 163 insertions(+), 16 deletions(-) create mode 100644 docs/infra/service-command-execution.md create mode 100644 infra/modules/service/command-execution.tf diff --git a/docs/infra/service-command-execution.md b/docs/infra/service-command-execution.md new file mode 100644 index 000000000..df2ebdfb9 --- /dev/null +++ b/docs/infra/service-command-execution.md @@ -0,0 +1,69 @@ +# Running commands on the service + +The infrastructure supports developer access to a running application's service container using [ECS Exec](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-exec.html). You can run commands in or get a shell to an actively running container, allowing you to quickly debug issues or to use the container to access an attached database. Once you create an interactive shell, you will be operating with the same permissions as the container (e.g. you may access any database the container has access to, but you cannot access databases within the same account that the container does not have access to). 
+ +⚠️ **Warning: It is not recommended to enable service access in a production environment!** + +## Prerequisites + +* You'll need to have [set up infrastructure tools](./set-up-infrastructure-tools.md), like Terraform, AWS CLI, and AWS authentication +* You'll need to have set up the [app environments](./set-up-app-env.md) +* You'll need to have [installed the Session Manager plugin for the AWS CLI](https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-working-with-install-plugin.html) + +## Instructions + +### 1. Make sure you're authenticated into the AWS account that the ECS container is running in + +This takes effect in whatever account you're authenticated into. To see which account that is, run + +```bash +aws sts get-caller-identity +``` + +To see a more human readable account alias instead of the account, run + +```bash +aws iam list-account-aliases +``` + +### 2. Enable service execution access + +Within the `app-config` directory (e.g. `infra/<APP_NAME>/app-config`), each environment has its own config file named after the environment. For example, if the application has three environments `dev`, `staging`, and `prod`, it should have corresponding `dev.tf`, `staging.tf`, and `prod.tf` files. + +In the environment config file for the environment in which you want to enable service access, set `enable_command_execution` to `true`. + +### 3. Update the network + +To enable service execution access, the VPC requires an additional VPC endpoint. Update the network by running + +```bash +make infra-update-network NETWORK_NAME=<NETWORK_NAME> +``` + +`NETWORK_NAME` needs to be the name of the network that the application environment is running in. + +### 4. Update the application service + +To enable service execution access, some configuration changes need to be applied to the ECS Task Definition. 
Update the service by running + +```bash +make infra-update-app-service APP_NAME=<APP_NAME> ENVIRONMENT=<ENVIRONMENT> +``` + +`APP_NAME` needs to be the name of the application folder within the `infra` folder. + +`ENVIRONMENT` needs to be the name of the environment to update. + +### 5. Execute commands + +To create an interactive shell, run + +```bash +aws ecs execute-command --cluster <CLUSTER_NAME> \ + --task <TASK_ID> \ + --container <CONTAINER_NAME> \ + --interactive \ + --command "/bin/sh" +``` + +To run other commands, modify the `--command` flag to execute the command, rather than starting a shell. diff --git a/infra/app/app-config/dev.tf b/infra/app/app-config/dev.tf index d18c7e7f5..8cf29e4b5 100644 --- a/infra/app/app-config/dev.tf +++ b/infra/app/app-config/dev.tf @@ -9,4 +9,9 @@ module "dev_config" { enable_https = false has_database = local.has_database has_incident_management_service = local.has_incident_management_service + + # Enables ECS Exec access for debugging or jump access. + # See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-exec.html + # Defaults to `false`. Uncomment the next line to enable. 
+ # enable_command_execution = true } diff --git a/infra/app/app-config/env-config/outputs.tf b/infra/app/app-config/env-config/outputs.tf index db4c07d3b..360bc7e85 100644 --- a/infra/app/app-config/env-config/outputs.tf +++ b/infra/app/app-config/env-config/outputs.tf @@ -16,13 +16,14 @@ output "network_name" { output "service_config" { value = { - service_name = "${local.prefix}${var.app_name}-${var.environment}" - domain_name = var.domain_name - enable_https = var.enable_https - region = var.default_region - cpu = var.service_cpu - memory = var.service_memory - desired_instance_count = var.service_desired_instance_count + service_name = "${local.prefix}${var.app_name}-${var.environment}" + domain_name = var.domain_name + enable_https = var.enable_https + region = var.default_region + cpu = var.service_cpu + memory = var.service_memory + desired_instance_count = var.service_desired_instance_count + enable_command_execution = var.enable_command_execution extra_environment_variables = merge( local.default_extra_environment_variables, diff --git a/infra/app/app-config/env-config/variables.tf b/infra/app/app-config/env-config/variables.tf index 0dec37e80..1241f087d 100644 --- a/infra/app/app-config/env-config/variables.tf +++ b/infra/app/app-config/env-config/variables.tf @@ -70,3 +70,9 @@ variable "service_override_extra_environment_variables" { EOT default = {} } + +variable "enable_command_execution" { + type = bool + description = "Enables the ability to manually execute commands on running service containers using AWS ECS Exec" + default = false +} diff --git a/infra/app/app-config/prod.tf b/infra/app/app-config/prod.tf index c452531f2..3cc9bc786 100644 --- a/infra/app/app-config/prod.tf +++ b/infra/app/app-config/prod.tf @@ -16,4 +16,9 @@ module "prod_config" { service_cpu = 1024 service_memory = 4096 service_desired_instance_count = 3 + + # Enables ECS Exec access for debugging or jump access. + # Defaults to `false`. Uncomment the next line to enable. 
+ # ⚠️ Warning! It is not recommended to enable this in a production environment. + # enable_command_execution = true } diff --git a/infra/app/app-config/staging.tf b/infra/app/app-config/staging.tf index 8205c5205..dad61db0e 100644 --- a/infra/app/app-config/staging.tf +++ b/infra/app/app-config/staging.tf @@ -9,4 +9,9 @@ module "staging_config" { enable_https = false has_database = local.has_database has_incident_management_service = local.has_incident_management_service + + # Enables ECS Exec access for debugging or jump access. + # See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-exec.html + # Defaults to `false`. Uncomment the next line to enable. + # enable_command_execution = true } diff --git a/infra/app/service/main.tf b/infra/app/service/main.tf index 99c331448..198372a91 100644 --- a/infra/app/service/main.tf +++ b/infra/app/service/main.tf @@ -128,9 +128,10 @@ module "service" { hosted_zone_id = local.service_config.domain_name != null ? data.aws_route53_zone.zone[0].zone_id : null certificate_arn = local.service_config.enable_https ? data.aws_acm_certificate.certificate[0].arn : null - cpu = local.service_config.cpu - memory = local.service_config.memory - desired_instance_count = local.service_config.desired_instance_count + cpu = local.service_config.cpu + memory = local.service_config.memory + desired_instance_count = local.service_config.desired_instance_count + enable_command_execution = local.service_config.enable_command_execution aws_services_security_group_id = data.aws_security_groups.aws_services.ids[0] diff --git a/infra/modules/network/variables.tf b/infra/modules/network/variables.tf index d57661b8c..ce4db19a5 100644 --- a/infra/modules/network/variables.tf +++ b/infra/modules/network/variables.tf @@ -24,3 +24,9 @@ variable "has_external_non_aws_service" { description = "Whether the application(s) in this network need to call external non-AWS services. Determines whether or not to create NAT gateways." 
default = false } + +variable "enable_command_execution" { + type = bool + description = "Whether the application(s) in this network need ECS Exec access. Determines whether to create VPC endpoints needed by ECS Exec." + default = false +} diff --git a/infra/modules/network/vpc-endpoints.tf b/infra/modules/network/vpc-endpoints.tf index 701d57d37..8ac8a720f 100644 --- a/infra/modules/network/vpc-endpoints.tf +++ b/infra/modules/network/vpc-endpoints.tf @@ -15,6 +15,9 @@ locals { # AWS services used by the database's role manager var.has_database ? ["ssm", "kms", "secretsmanager"] : [], + + # AWS services used by ECS Exec + var.enable_command_execution ? ["ssmmessages"] : [], ) # S3 and DynamoDB use Gateway VPC endpoints. All other services use Interface VPC endpoints diff --git a/infra/modules/service/command-execution.tf b/infra/modules/service/command-execution.tf new file mode 100644 index 000000000..8210a9c73 --- /dev/null +++ b/infra/modules/service/command-execution.tf @@ -0,0 +1,31 @@ +#----------------- +# ECS Exec Access +# See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-exec.html +#----------------- +resource "aws_iam_policy" "ecs_exec" { + name = "${var.service_name}-ecs-exec" + description = "Allow access to SSM Messages to support ECS Exec" + policy = data.aws_iam_policy_document.ecs_exec.json +} + +data "aws_iam_policy_document" "ecs_exec" { + # Allow ECS to access SSM Messages so that ECS Exec works + # See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-exec.html + statement { + sid = "SSMAccess" + effect = "Allow" + actions = [ + "ssmmessages:CreateControlChannel", + "ssmmessages:CreateDataChannel", + "ssmmessages:OpenControlChannel", + "ssmmessages:OpenDataChannel", + ] + resources = ["*"] + } +} + +resource "aws_iam_role_policy_attachment" "ecs_exec" { + count = var.enable_command_execution ? 
1 : 0 + role = aws_iam_role.app_service.name + policy_arn = aws_iam_policy.ecs_exec.arn +} diff --git a/infra/modules/service/main.tf b/infra/modules/service/main.tf index 05c89c0b2..9513087f8 100644 --- a/infra/modules/service/main.tf +++ b/infra/modules/service/main.tf @@ -41,11 +41,12 @@ locals { #------------------- resource "aws_ecs_service" "app" { - name = var.service_name - cluster = aws_ecs_cluster.cluster.arn - launch_type = "FARGATE" - task_definition = aws_ecs_task_definition.app.arn - desired_count = var.desired_instance_count + name = var.service_name + cluster = aws_ecs_cluster.cluster.arn + launch_type = "FARGATE" + task_definition = aws_ecs_task_definition.app.arn + desired_count = var.desired_instance_count + enable_execute_command = var.enable_command_execution ? true : null # Allow changes to the desired_count without differences in terraform plan. # This allows autoscaling to manage the desired count for us. @@ -79,7 +80,7 @@ resource "aws_ecs_task_definition" "app" { cpu = var.cpu, networkMode = "awsvpc", essential = true, - readonlyRootFilesystem = true, + readonlyRootFilesystem = !var.enable_command_execution, # Need to define all parameters in the healthCheck block even if we want # to use AWS's defaults, otherwise the terraform plan will show a diff diff --git a/infra/modules/service/variables.tf b/infra/modules/service/variables.tf index 737a8ed34..bea653cec 100644 --- a/infra/modules/service/variables.tf +++ b/infra/modules/service/variables.tf @@ -52,6 +52,11 @@ variable "memory" { description = "Amount (in MiB) of memory used by the task. e.g. 
2048" } +variable "enable_command_execution" { + type = bool + default = false + description = "Whether the service should enable ECS Exec, such as for debugging" +} variable "container_port" { type = number diff --git a/infra/networks/main.tf b/infra/networks/main.tf index 554f0decb..76761b96a 100644 --- a/infra/networks/main.tf +++ b/infra/networks/main.tf @@ -28,6 +28,14 @@ locals { # Whether any of the applications in the network have dependencies on an external non-AWS service has_external_non_aws_service = anytrue([for app in local.apps_in_network : app.has_external_non_aws_service]) + + # Whether any of the applications in the network has an environment that needs container execution access + enable_command_execution = anytrue([ + for app in local.apps_in_network : + anytrue([ + for environment_config in app.environment_configs : true if environment_config.service_config.enable_command_execution == true && environment_config.network_name == var.network_name + ]) + ]) } terraform { @@ -67,6 +75,7 @@ module "network" { database_subnet_group_name = local.network_config.database_subnet_group_name has_database = local.has_database has_external_non_aws_service = local.has_external_non_aws_service + enable_command_execution = local.enable_command_execution } module "domain" {