Skip to content

Commit

Permalink
BFD-3701: Update BFD Server load balancing to support Blue/Green Depl…
Browse files Browse the repository at this point in the history
…oyments (#2546)

Co-authored-by: Michael J Burling <[email protected]>
  • Loading branch information
malessi and mjburling authored Feb 13, 2025
1 parent a5482a7 commit e41bea6
Show file tree
Hide file tree
Showing 17 changed files with 336 additions and 152 deletions.
4 changes: 2 additions & 2 deletions .github/scripts/pre-commit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ runShellCheckForCommitFiles() {
filename=$(basename -- "$file")
extension="${filename##*.}"

# Skip binary formats
# Skip binary formats and groovy files
case "$extension" in
"zip" | "p12" | "pfx" | "cer" | "pem" | "png" | "jpg")
"zip" | "p12" | "pfx" | "cer" | "pem" | "png" | "jpg" | "groovy")
continue ;;
*) ;;
esac
Expand Down
5 changes: 4 additions & 1 deletion apps/utils/locust_tests/lambda/server-regression/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ def handler(event, context):
cert = get_ssm_parameter(
f"/bfd/{environment}/server/sensitive/server_regression_cert", with_decrypt=True
)
green_port = get_ssm_parameter(
f"/bfd/{environment}/server/nonsensitive/lb_green_ingress_port"
)
except ValueError as exc:
send_pipeline_signal(
signal_queue_url=signal_queue_url,
Expand Down Expand Up @@ -191,7 +194,7 @@ def handler(event, context):
[
"locust",
f"--locustfile=/var/task/{invoke_event.suite_version}/{locust_file}",
f"--host={invoke_event.host}",
f"--host={invoke_event.host}:{green_port}",
f"--users={invoke_event.users}",
f"--spawn-rate={invoke_event.spawn_rate}",
f"--spawned-runtime={invoke_event.spawned_runtime}",
Expand Down
3 changes: 2 additions & 1 deletion ops/jenkins/global-pipeline-libraries/vars/awsElb.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
// awsElb.groovy contains methods that wrap awscli elb subcommands

// Returns the Elastic Load Balancer's DNSName for the given environment
// See ops/terraform/services/server/modules/bfd_server_asg/main.tf for NLB definition and naming scheme
String getElbDnsName(String environment) {
    // Resolve the DNS name of the environment's Network Load Balancer through the elbv2 API.
    // The describe-load-balancers response keeps its results under the plural `LoadBalancers`
    // key, so the JMESPath query must use that key to pick the first (and only) match.
    // The variable is declared locally so it does not leak into the pipeline script binding.
    String elbDnsName = sh(
        returnStdout: true,
        script: "aws elbv2 describe-load-balancers --names bfd-${environment}-fhir-nlb --query 'LoadBalancers[0].DNSName' --output text"
    ).trim()
    return elbDnsName
}
4 changes: 2 additions & 2 deletions ops/terraform/services/base/values/ephemeral.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@
/bfd/${env}/server/nonsensitive/pac/claim_source_types: fiss,mcs
/bfd/${env}/server/nonsensitive/c4dic/enabled: "false"
/bfd/${env}/server/nonsensitive/lb_is_public: false
/bfd/${env}/server/nonsensitive/lb_ingress_port: 443
/bfd/${env}/server/nonsensitive/lb_egress_port: 7443
/bfd/${env}/server/nonsensitive/lb_blue_ingress_port: 443
/bfd/${env}/server/nonsensitive/lb_green_ingress_port: 7443
/bfd/${env}/server/nonsensitive/launch_template_volume_iops: 3000
/bfd/${env}/server/nonsensitive/launch_template_volume_size_gb: 60
/bfd/${env}/server/nonsensitive/launch_template_volume_throughput: 250
Expand Down
4 changes: 2 additions & 2 deletions ops/terraform/services/base/values/prod-sbx.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@
/bfd/${env}/server/nonsensitive/heathcheck/testing_bene_id: "-88888888888888"
/bfd/${env}/server/nonsensitive/paths/files/war: UNDEFINED
/bfd/${env}/server/nonsensitive/lb_is_public: "true"
/bfd/${env}/server/nonsensitive/lb_ingress_port: "443"
/bfd/${env}/server/nonsensitive/lb_egress_port: "7443"
/bfd/${env}/server/nonsensitive/lb_blue_ingress_port: "443"
/bfd/${env}/server/nonsensitive/lb_green_ingress_port: "7443"
/bfd/${env}/server/nonsensitive/lb_vpc_peerings_json: '[ "bfd-prod-sbx-to-ab2d-dev", "bfd-prod-sbx-to-ab2d-impl", "bfd-prod-sbx-to-ab2d-sbx", "bfd-prod-sbx-to-bcda-dev", "bfd-prod-sbx-to-bcda-test", "bfd-prod-sbx-to-bcda-sbx", "bfd-prod-sbx-to-bcda-opensbx", "bfd-prod-sbx-vpc-to-bluebutton-impl", "bfd-prod-sbx-vpc-to-bluebutton-test", "bfd-prod-sbx-vpc-to-dpc-prod-sbx-vpc", "bfd-prod-sbx-vpc-to-dpc-test-vpc", "bfd-prod-sbx-vpc-to-dpc-dev-vpc" ]'
/bfd/${env}/server/nonsensitive/asg_min_instance_count: "3"
/bfd/${env}/server/nonsensitive/asg_max_instance_count: "12"
Expand Down
4 changes: 2 additions & 2 deletions ops/terraform/services/base/values/prod.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,8 @@
/bfd/${env}/server/nonsensitive/heathcheck/testing_bene_id: "-88888888888888"
/bfd/${env}/server/nonsensitive/paths/files/war: UNDEFINED
/bfd/${env}/server/nonsensitive/lb_is_public: "false"
/bfd/${env}/server/nonsensitive/lb_ingress_port: "443"
/bfd/${env}/server/nonsensitive/lb_egress_port: "7443"
/bfd/${env}/server/nonsensitive/lb_blue_ingress_port: "443"
/bfd/${env}/server/nonsensitive/lb_green_ingress_port: "7443"
/bfd/${env}/server/nonsensitive/lb_vpc_peerings_json: '[ "bfd-prod-vpc-to-dpc-prod-vpc", "bfd-prod-vpc-to-bluebutton-prod", "bfd-prod-vpc-to-bcda-prod-vpc", "bfd-prod-to-ab2d-prod" ]'
/bfd/${env}/server/nonsensitive/asg_min_instance_count: "3"
/bfd/${env}/server/nonsensitive/asg_max_instance_count: "12"
Expand Down
4 changes: 2 additions & 2 deletions ops/terraform/services/base/values/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,8 @@
/bfd/${env}/server/nonsensitive/heathcheck/testing_bene_id: "-88888888888888"
/bfd/${env}/server/nonsensitive/paths/files/war: UNDEFINED
/bfd/${env}/server/nonsensitive/lb_is_public: "false"
/bfd/${env}/server/nonsensitive/lb_ingress_port: "443"
/bfd/${env}/server/nonsensitive/lb_egress_port: "7443"
/bfd/${env}/server/nonsensitive/lb_blue_ingress_port: "443"
/bfd/${env}/server/nonsensitive/lb_green_ingress_port: "7443"
/bfd/${env}/server/nonsensitive/lb_vpc_peerings_json: '[ "bfd-test-vpc-to-bluebutton-test" ]'
/bfd/${env}/server/nonsensitive/asg_min_instance_count: "3"
/bfd/${env}/server/nonsensitive/asg_max_instance_count: "12"
Expand Down
12 changes: 11 additions & 1 deletion ops/terraform/services/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,16 @@ terraform apply

**NOTE** the above double-invocation of terraform is correct. Two executions of `terraform apply` are necessary to achieve the desired state as of BFD-2558.

## Blue/Green Workflow

This Terraservice implements the logic and resources necessary to support a Blue/Green Deployment strategy for the BFD Server.

Blue (`blue`) refers to the "active" or _production_ infrastructure that serves traffic to our consumers. Resources in `blue` are considered to be "known-good" resources. Green (`green`) refers to _incoming_, new infrastructure for a _new_ version of the BFD Server that needs to be verified as good before being promoted to `blue` and made available to serve traffic to our consumers.

This Terraservice achieves a Blue/Green Deployment strategy by utilizing two AutoScaling Groups, two Target Groups and two Load Balancer Listeners on ports `443` and `7443` that route to the aforementioned Target Groups on different ports. The Listener on port `443` (the reserved HTTPS port) is associated with the `blue` Target Group and the Listener on `7443` is associated with `green`. This way, clients using the default HTTPS port will reach the `blue` BFD Server Instances only, while our automation can reach the `green` Instances by using port `7443`.

The Terraservice logic decides which AutoScaling Group is associated with the `blue`/`green` Target Group by looking at the oddness/evenness of the _latest_ Launch Template version number _iff_ the Launch Template is changing upon the `terraform apply`. Correspondingly, the ASGs are suffixed with `-odd` and `-even`. Given the latest Launch Template version number, if it is _odd_, the ASG suffixed with `-odd` will be chosen as `green`, whereas if it is _even_, the ASG suffixed with `-even` will be chosen as `green`. In this scenario, we expect no changes to the existing `blue` ASG nor its Target Group so that it continues to serve traffic uninterrupted.

<!-- BEGIN_TF_DOCS -->
<!-- GENERATED WITH `terraform-docs .`
Manually updating the README.md will be overwritten.
Expand Down Expand Up @@ -61,13 +71,13 @@ terraform apply
| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
| [aws_ec2_managed_prefix_list.jenkins](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ec2_managed_prefix_list) | data source |
| [aws_ec2_managed_prefix_list.vpn](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ec2_managed_prefix_list) | data source |
| [aws_s3_bucket.logs](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/s3_bucket) | data source |
| [aws_security_group.remote](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/security_group) | data source |
| [aws_security_group.tools](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/security_group) | data source |
| [aws_security_group.vpn](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/security_group) | data source |
| [aws_security_groups.aurora_cluster](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/security_groups) | data source |
| [aws_ssm_parameters_by_path.nonsensitive_common](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ssm_parameters_by_path) | data source |
| [aws_ssm_parameters_by_path.nonsensitive_service](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ssm_parameters_by_path) | data source |
| [aws_ssm_parameters_by_path.sensitive_service](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ssm_parameters_by_path) | data source |
| [aws_vpc.main](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/vpc) | data source |
| [aws_vpc.mgmt](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/vpc) | data source |
| [aws_vpc_peering_connection.peers](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/vpc_peering_connection) | data source |
Expand Down
10 changes: 5 additions & 5 deletions ops/terraform/services/server/data-sources.tf
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,6 @@ data "aws_ami" "main" {
}
}

# s3 buckets
data "aws_s3_bucket" "logs" {
bucket = "bfd-${local.env}-logs-${data.aws_caller_identity.current.account_id}"
}

# aurora security group
data "aws_security_groups" "aurora_cluster" {
filter {
Expand Down Expand Up @@ -114,3 +109,8 @@ data "aws_ssm_parameters_by_path" "nonsensitive_common" {
data "aws_ssm_parameters_by_path" "nonsensitive_service" {
path = "/bfd/${local.env}/${local.service}/nonsensitive"
}

# Looks up the SSM parameters stored under this service's "sensitive" path for the current
# environment (/bfd/<env>/<service>/sensitive). with_decryption is enabled so that encrypted
# parameter values are returned in decrypted form.
data "aws_ssm_parameters_by_path" "sensitive_service" {
path = "/bfd/${local.env}/${local.service}/sensitive"
with_decryption = true
}
60 changes: 49 additions & 11 deletions ops/terraform/services/server/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,15 @@ locals {
for key, value in local.nonsensitive_service_map
: split("/", key)[5] => value
}
sensitive_service_map = zipmap(
data.aws_ssm_parameters_by_path.sensitive_service.names,
nonsensitive(data.aws_ssm_parameters_by_path.sensitive_service.values)
)
sensitive_service_config = {
for key, value in local.sensitive_service_map
: split("/", key)[5] => value
}


enterprise_tools_security_group = local.nonsensitive_common_config["enterprise_tools_security_group"]
management_security_group = local.nonsensitive_common_config["management_security_group"]
Expand All @@ -43,10 +52,10 @@ locals {
ssh_key_pair = local.nonsensitive_common_config["key_pair"]
vpc_name = local.nonsensitive_common_config["vpc_name"]

lb_is_public = local.nonsensitive_service_config["lb_is_public"]
lb_ingress_port = local.nonsensitive_service_config["lb_ingress_port"]
lb_egress_port = local.nonsensitive_service_config["lb_egress_port"]
lb_vpc_peerings = jsondecode(local.nonsensitive_service_config["lb_vpc_peerings_json"])
lb_is_public = tobool(local.nonsensitive_service_config["lb_is_public"])
lb_blue_ingress_port = local.nonsensitive_service_config["lb_blue_ingress_port"]
lb_green_ingress_port = local.nonsensitive_service_config["lb_green_ingress_port"]
lb_vpc_peerings = jsondecode(local.nonsensitive_service_config["lb_vpc_peerings_json"])

asg_min_instance_count = local.nonsensitive_service_config["asg_min_instance_count"]
asg_max_instance_count = local.nonsensitive_service_config["asg_max_instance_count"]
Expand All @@ -60,6 +69,8 @@ locals {
launch_template_volume_throughput = local.nonsensitive_service_config["launch_template_volume_throughput"]
launch_template_volume_type = local.nonsensitive_service_config["launch_template_volume_type"]

service_port = local.sensitive_service_config["service_port"]

env_config = {
default_tags = local.default_tags,
vpc_id = data.aws_vpc.main.id,
Expand Down Expand Up @@ -92,40 +103,49 @@ module "fhir_iam" {

## NLB for the FHIR server (SSL terminated by the FHIR server)
#
# TODO: Remove bfd_server_lb module in BFD-3878
# TODO: Remove below code in BFD-3878
module "fhir_lb" {
count = !local.is_ephemeral_env ? 1 : 0
source = "./modules/bfd_server_lb"

env_config = local.env_config
role = local.legacy_service
layer = "dmz"
log_bucket = data.aws_s3_bucket.logs.id
is_public = local.lb_is_public

ingress = local.lb_is_public ? {
description = "Public Internet access"
port = local.lb_ingress_port
port = local.lb_blue_ingress_port
cidr_blocks = ["0.0.0.0/0"]
prefix_list_ids = []
} : {
description = "From VPN, VPC peerings, the MGMT VPC, and self"
port = local.lb_ingress_port
port = local.lb_blue_ingress_port
cidr_blocks = concat(data.aws_vpc_peering_connection.peers[*].peer_cidr_block, [data.aws_vpc.mgmt.cidr_block, data.aws_vpc.main.cidr_block])
prefix_list_ids = [data.aws_ec2_managed_prefix_list.vpn.id, data.aws_ec2_managed_prefix_list.jenkins.id]
}

egress = {
description = "To VPC instances"
port = local.lb_egress_port
port = local.service_port
cidr_blocks = [data.aws_vpc.main.cidr_block]
}
}

moved {
from = module.fhir_lb
to = module.fhir_lb[0]
}
# TODO: Remove above code in BFD-3878

# TODO: Update this module with new NLB metrics in BFD-3885
module "lb_alarms" {
count = local.create_server_lb_alarms ? 1 : 0

source = "./modules/bfd_server_lb_alarms"

load_balancer_name = module.fhir_lb.name
load_balancer_name = one(module.fhir_lb[*].legacy_clb_name)
app = "bfd"

# NLBs only have this metric to alarm on
Expand All @@ -136,7 +156,6 @@ module "lb_alarms" {
}
}


## Autoscale group for the FHIR server
#
module "fhir_asg" {
Expand All @@ -146,9 +165,13 @@ module "fhir_asg" {
env_config = local.env_config
role = local.legacy_service
layer = "app"
lb_config = module.fhir_lb.lb_config
seed_env = local.seed_env

# TODO: Remove below code in BFD-3878
legacy_clb_name = one(module.fhir_lb[*].legacy_clb_name)
legacy_sg_id = one(module.fhir_lb[*].legacy_sg_id)
# TODO: Remove above code in BFD-3878

# Initial size is one server per AZ
asg_config = {
min = local.asg_min_instance_count
Expand Down Expand Up @@ -186,6 +209,21 @@ module "fhir_asg" {
remote_sg = data.aws_security_group.remote.id
ci_cidrs = [data.aws_vpc.mgmt.cidr_block]
}

lb_config = {
is_public = local.lb_is_public
enable_deletion_protection = !local.is_ephemeral_env
ingress = {
blue_port = local.lb_blue_ingress_port
green_port = local.lb_green_ingress_port
cidr_blocks = !local.lb_is_public ? concat(data.aws_vpc_peering_connection.peers[*].peer_cidr_block, [data.aws_vpc.mgmt.cidr_block, data.aws_vpc.main.cidr_block]) : ["0.0.0.0/0"]
prefix_list_ids = !local.lb_is_public ? [data.aws_ec2_managed_prefix_list.vpn.id, data.aws_ec2_managed_prefix_list.jenkins.id] : []
}
egress = {
cidr_blocks = [data.aws_vpc.main.cidr_block]
}
server_listen_port = local.service_port
}
}

## FHIR server logs
Expand Down
Loading

0 comments on commit e41bea6

Please sign in to comment.