From 87f33968979ae2c51c708ce448c9e8cd59302e77 Mon Sep 17 00:00:00 2001 From: Amir Alavi Date: Mon, 6 Sep 2021 18:05:54 -0400 Subject: [PATCH 1/4] Add CF Stack + Change Scheduler to use Cloudformation Stack output --- Hummingbird/AWS/cloudformation.yaml | 313 ++++++++++++++++++++ Hummingbird/AWS/compute_environment.json | 9 +- Hummingbird/AWS/job-definition.json | 2 +- Hummingbird/AWS/launch-template-data.json | 2 +- Hummingbird/conf/examples/bwa.aws.conf.json | 3 +- Hummingbird/scheduler.py | 46 ++- Hummingbird/test/test_scheduler.py | 31 +- docs/EditConf.md | 7 +- docs/GettingStarted.md | 30 +- 9 files changed, 403 insertions(+), 40 deletions(-) create mode 100644 Hummingbird/AWS/cloudformation.yaml diff --git a/Hummingbird/AWS/cloudformation.yaml b/Hummingbird/AWS/cloudformation.yaml new file mode 100644 index 0000000..8e402b3 --- /dev/null +++ b/Hummingbird/AWS/cloudformation.yaml @@ -0,0 +1,313 @@ +Description: This template deploys an environment for running Hummingbird. + Resources include VPC, with a pair of public and private subnets spread + across two Availability Zones, IAM roles for AWS Batch/ECS. + It deploys an internet gateway, with a default route on the public subnets. + It deploys a pair of NAT gateways (one in each AZ), and default routes for + them in the private subnets. + +Parameters: + EnvironmentName: + Description: An environment name that is prefixed to resource names (e.g. hummingbird) + Type: String + AllowedPattern: '[-_a-zA-Z0-9]*' + ConstraintDescription: Can contain only alphanumeric characters, dashes and underscores. + + VpcCIDR: + Description: Please enter the IP range (CIDR notation) for this VPC + Type: String + Default: 10.192.0.0/16 + + PublicSubnet1CIDR: + Description: Please enter the IP range (CIDR notation) for the public subnet in the first Availability Zone + Type: String + Default: 10.192.10.0/24 + + PublicSubnet2CIDR: + Description: Please enter the IP range (CIDR notation) for the public subnet in the second Availability Zone + Type: String + Default: 10.192.11.0/24 + + PrivateSubnet1CIDR: + Description: Please enter the IP range (CIDR notation) for the private subnet in the first Availability Zone + Type: String + Default: 10.192.20.0/24 + + PrivateSubnet2CIDR: + Description: Please enter the IP range (CIDR notation) for the private subnet in the second Availability Zone + Type: String + Default: 10.192.21.0/24 + +Resources: + VPC: + Type: AWS::EC2::VPC + Properties: + CidrBlock: !Ref VpcCIDR + EnableDnsSupport: true + EnableDnsHostnames: true + Tags: + - Key: Name + Value: !Ref EnvironmentName + + InternetGateway: + Type: AWS::EC2::InternetGateway + Properties: + Tags: + - Key: Name + Value: !Ref EnvironmentName + + InternetGatewayAttachment: + Type: AWS::EC2::VPCGatewayAttachment + Properties: + InternetGatewayId: !Ref InternetGateway + VpcId: !Ref VPC + + PublicSubnet1: + Type: AWS::EC2::Subnet + Properties: + VpcId: !Ref VPC + AvailabilityZone: !Select [ 0, !GetAZs '' ] + CidrBlock: !Ref PublicSubnet1CIDR + MapPublicIpOnLaunch: true + Tags: + - Key: Name + Value: !Sub ${EnvironmentName} Public Subnet (AZ1) + + PublicSubnet2: + Type: AWS::EC2::Subnet + Properties: + VpcId: !Ref VPC + AvailabilityZone: !Select [ 1, !GetAZs '' ] + CidrBlock: !Ref PublicSubnet2CIDR + MapPublicIpOnLaunch: true + Tags: + - Key: Name + Value: !Sub ${EnvironmentName} Public Subnet (AZ2) + + PrivateSubnet1: + Type: AWS::EC2::Subnet + Properties: + VpcId: !Ref VPC + AvailabilityZone: !Select [ 0, !GetAZs '' ] + CidrBlock: !Ref PrivateSubnet1CIDR + MapPublicIpOnLaunch: false + Tags: + - Key: Name + Value: !Sub ${EnvironmentName} Private Subnet (AZ1) + + PrivateSubnet2: + Type: AWS::EC2::Subnet + Properties: + VpcId: !Ref VPC + AvailabilityZone: !Select [ 1, !GetAZs '' ] + CidrBlock: !Ref PrivateSubnet2CIDR + MapPublicIpOnLaunch: false + Tags: + - Key: Name + Value: !Sub ${EnvironmentName} Private Subnet (AZ2) + + NatGateway1EIP: + Type: AWS::EC2::EIP + DependsOn: InternetGatewayAttachment + Properties: + Domain: vpc + + NatGateway2EIP: + Type: AWS::EC2::EIP + DependsOn: InternetGatewayAttachment + Properties: + Domain: vpc + + NatGateway1: + Type: AWS::EC2::NatGateway + Properties: + AllocationId: !GetAtt NatGateway1EIP.AllocationId + SubnetId: !Ref PublicSubnet1 + + NatGateway2: + Type: AWS::EC2::NatGateway + Properties: + AllocationId: !GetAtt NatGateway2EIP.AllocationId + SubnetId: !Ref PublicSubnet2 + + PublicRouteTable: + Type: AWS::EC2::RouteTable + Properties: + VpcId: !Ref VPC + Tags: + - Key: Name + Value: !Sub ${EnvironmentName} Public Routes + + DefaultPublicRoute: + Type: AWS::EC2::Route + DependsOn: InternetGatewayAttachment + Properties: + RouteTableId: !Ref PublicRouteTable + DestinationCidrBlock: 0.0.0.0/0 + GatewayId: !Ref InternetGateway + + PublicSubnet1RouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + RouteTableId: !Ref PublicRouteTable + SubnetId: !Ref PublicSubnet1 + + PublicSubnet2RouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + RouteTableId: !Ref PublicRouteTable + SubnetId: !Ref PublicSubnet2 + + + PrivateRouteTable1: + Type: AWS::EC2::RouteTable + Properties: + VpcId: !Ref VPC + Tags: + - Key: Name + Value: !Sub ${EnvironmentName} Private Routes (AZ1) + + DefaultPrivateRoute1: + Type: AWS::EC2::Route + Properties: + RouteTableId: !Ref PrivateRouteTable1 + DestinationCidrBlock: 0.0.0.0/0 + NatGatewayId: !Ref NatGateway1 + + PrivateSubnet1RouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + RouteTableId: !Ref PrivateRouteTable1 + SubnetId: !Ref PrivateSubnet1 + + PrivateRouteTable2: + Type: AWS::EC2::RouteTable + Properties: + VpcId: !Ref VPC + Tags: + - Key: Name + Value: !Sub ${EnvironmentName} Private Routes (AZ2) + + DefaultPrivateRoute2: + Type: AWS::EC2::Route + Properties: + RouteTableId: !Ref PrivateRouteTable2 + DestinationCidrBlock: 0.0.0.0/0 + NatGatewayId: !Ref NatGateway2 + + PrivateSubnet2RouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Properties: + RouteTableId: !Ref PrivateRouteTable2 + SubnetId: !Ref PrivateSubnet2 + + BatchEC2SecurityGroup: + Type: AWS::EC2::SecurityGroup + Properties: + GroupDescription: "AWS Batch EC2 Security Group" + GroupName: !Sub ${EnvironmentName}-sg + SecurityGroupEgress: + - IpProtocol: tcp + FromPort: 0 + ToPort: 0 + CidrIp: 0.0.0.0/0 + Tags: + - Key: Name + Value: !Sub ${EnvironmentName}-SG + VpcId: !Ref VPC + + # https://docs.aws.amazon.com/batch/latest/userguide/instance_IAM_role.html + ECSInstanceRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Effect: Allow + Principal: + Service: [ ec2.amazonaws.com ] + Action: [ 'sts:AssumeRole' ] + Path: / + ManagedPolicyArns: + - 'arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role' + - 'arn:aws:iam::aws:policy/AmazonS3FullAccess' # TODO narrow down to S3 bucket Read/Write + Tags: + - Key: Name + Value: !Sub ${EnvironmentName}-ECSInstanceRole + + # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task_execution_IAM_role.html + ECSTaskExecutionRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Effect: Allow + Principal: + Service: [ ecs-tasks.amazonaws.com ] + Action: [ 'sts:AssumeRole' ] + Path: / + ManagedPolicyArns: + - 'arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy' + Tags: + - Key: Name + Value: !Sub ${EnvironmentName}-ECSTaskExecutionRole + + # https://docs.aws.amazon.com/batch/latest/userguide/service_IAM_role.html + BatchServiceRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Statement: + - Effect: Allow + Principal: + Service: [ batch.amazonaws.com ] + Action: [ 'sts:AssumeRole' ] + Path: / + ManagedPolicyArns: + - 'arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole' + Tags: + - Key: Name + Value: !Sub ${EnvironmentName}-BatchServiceRole + +Outputs: + VPC: + Description: A reference to the created VPC + Value: !Ref VPC + + PublicSubnets: + Description: A list of the public subnets + Value: !Join [ ",", [ !Ref PublicSubnet1, !Ref PublicSubnet2 ] ] + + PrivateSubnets: + Description: A list of the private subnets + Value: !Join [ ",", [ !Ref PrivateSubnet1, !Ref PrivateSubnet2 ] ] + + PublicSubnet1: + Description: A reference to the public subnet in the 1st Availability Zone + Value: !Ref PublicSubnet1 + + PublicSubnet2: + Description: A reference to the public subnet in the 2nd Availability Zone + Value: !Ref PublicSubnet2 + + PrivateSubnet1: + Description: A reference to the private subnet in the 1st Availability Zone + Value: !Ref PrivateSubnet1 + + PrivateSubnet2: + Description: A reference to the private subnet in the 2nd Availability Zone + Value: !Ref PrivateSubnet2 + + BatchEC2SecurityGroup: + Description: Security Group for AWS Batch EC2 + Value: !Ref BatchEC2SecurityGroup + + ECSInstanceRole: + Description: ECS Instance Role + Value: !Ref ECSInstanceRole + + ECSTaskExecutionRole: + Description: ECS Task Execution Role + Value: !Ref ECSTaskExecutionRole + + BatchServiceRole: + Description: AWS Batch Service Role + Value: !Ref BatchServiceRole diff --git a/Hummingbird/AWS/compute_environment.json b/Hummingbird/AWS/compute_environment.json index 76e3ff3..ed3bb92 100644 --- a/Hummingbird/AWS/compute_environment.json +++ b/Hummingbird/AWS/compute_environment.json @@ -1,5 +1,5 @@ { - "computeEnvironmentName": "hummingbird-env", + "computeEnvironmentName": "", "type": "MANAGED", "state": "ENABLED", "computeResources": { @@ -14,15 +14,14 @@ ], "securityGroupIds": [ ], - "ec2KeyPair": "", - "instanceRole": "ecsInstanceRole", + "instanceRole": "", "tags": { "KeyName": "hummingbird" }, "launchTemplate": { - "launchTemplateName": "hummingbird_disk_launch_template", + "launchTemplateName": "hummingbird_launch_template", "version": "$Latest" } }, - "serviceRole": "" + "serviceRole": "" } diff --git a/Hummingbird/AWS/job-definition.json b/Hummingbird/AWS/job-definition.json index 66743e2..c85aab3 100644 --- a/Hummingbird/AWS/job-definition.json +++ b/Hummingbird/AWS/job-definition.json @@ -10,7 +10,7 @@ "-c", "/usr/local/bin/aws_fetch_and_run.sh" ], - "jobRoleArn": "", + "jobRoleArn": "", "volumes": [ ], "environment": [ diff --git a/Hummingbird/AWS/launch-template-data.json b/Hummingbird/AWS/launch-template-data.json index 283a855..7e0084c 100644 --- a/Hummingbird/AWS/launch-template-data.json +++ b/Hummingbird/AWS/launch-template-data.json @@ -1,5 +1,5 @@ { - "LaunchTemplateName": "hummingbird_disk_launch_template", + "LaunchTemplateName": "hummingbird_launch_template", "LaunchTemplateData": { "EbsOptimized": true, "BlockDeviceMappings": [ diff --git a/Hummingbird/conf/examples/bwa.aws.conf.json b/Hummingbird/conf/examples/bwa.aws.conf.json index b0c6f4b..d36e14d 100644 --- a/Hummingbird/conf/examples/bwa.aws.conf.json +++ b/Hummingbird/conf/examples/bwa.aws.conf.json @@ -3,7 +3,8 @@ "service": "aws", "project": "", "regions": "us-west-2", - "bucket": "" + "bucket": "", + "cloudformation_stack_name": "hummingbird" }, "Downsample": { "input": { diff --git a/Hummingbird/scheduler.py b/Hummingbird/scheduler.py index 3082c79..3e1501c 100644 --- a/Hummingbird/scheduler.py +++ b/Hummingbird/scheduler.py @@ -59,6 +59,8 @@ def __init__(self, conf, machine, disk_size, script, **kwargs): self.batch_client = boto3.client('batch', region_name=self.region) self.ec2_client = boto3.client('ec2', region_name=self.region) self.s3_bucket = boto3.resource('s3').Bucket(self.conf[PLATFORM]['bucket']) + self.cf_client = boto3.resource('cloudformation') + self.cf_stack_name = conf[PLATFORM]['cloudformation_stack_name'] super(AWSBatchScheduler, self).__init__() def create_or_update_launch_template(self): @@ -79,22 +81,26 @@ def create_or_update_launch_template(self): logging.info('Creating a new version for launch template %s', data['LaunchTemplateName']) self.ec2_client.create_launch_template_version(**data) - def create_or_update_compute_environment(self): + def create_or_update_compute_environment(self, cf_output): with open('AWS/compute_environment.json') as f: data = json.load(f) - compute_env_prefix = data.get('computeEnvironmentName', self.compute_env_prefix) - compute_env_name = compute_env_prefix + self.machine.name.replace('.', '_') + '-' + str(self.disk_size) - + compute_env_name = self.cf_stack_name + '-' + self.machine.name.replace('.', '_') + '-' + str(self.disk_size) desc_json = self.batch_client.describe_compute_environments(computeEnvironments=[compute_env_name]) if desc_json['computeEnvironments']: logging.info('Skipping creation of AWS Batch Compute environment %s as it already exists', compute_env_name) return compute_env_name + compute_resources = data['computeResources'] data['computeEnvironmentName'] = compute_env_name - data['computeResources']['instanceTypes'].append(self.machine.name) - if 'ec2KeyPair' in data['computeResources'] and not data['computeResources']['ec2KeyPair']: - del data['computeResources']['ec2KeyPair'] # if there is an empty keypair name, don't provide it + compute_resources['instanceTypes'].append(self.machine.name) + if 'EC2KeyPair' in cf_output and cf_output['EC2KeyPair']: + compute_resources['ec2KeyPair'] = cf_output + + data['serviceRole'] = cf_output['BatchServiceRole'] + compute_resources['subnets'] = [cf_output['PrivateSubnet1'], cf_output['PrivateSubnet2']] + compute_resources['securityGroupIds'] = cf_output['BatchEC2SecurityGroup'] + compute_resources['instanceRole'] = cf_output['ECSInstanceRole'] data['tags'] = {'Name': compute_env_name} logging.info('Attempting to create AWS Batch Compute environment: %s', compute_env_name) @@ -172,13 +178,14 @@ def create_or_update_job_queue(self, env_name): return job_queue_name - def register_job_definition(self, compute_env_name, job_queue_name): + def register_job_definition(self, cf_output, compute_env_name, job_queue_name): with open('AWS/job-definition.json') as f: data = json.load(f) data['containerProperties']['vcpus'] = self.machine.cpu data['containerProperties']['memory'] = int(self.machine.mem) * 1024 if self.image: data['containerProperties']['image'] = self.image + data['jobRoleArn'] = cf_output['ECSTaskExecutionRole'] job_definition_name = data.get('jobDefinitionName', self.job_def_name) data.setdefault('tags', {}) data['tags'].update({'Name': job_definition_name, 'ComputeEnvironment': compute_env_name, 'JobQueue': job_queue_name}) @@ -186,11 +193,30 @@ def register_job_definition(self, compute_env_name, job_queue_name): logging.info('Successfully registered AWS Batch Job Definition: %s', job_definition_name) return job_definition_name + def get_cf_stack_output(self): + logging.info('Attempting to query Cloudformation Stack: %s', self.cf_stack_name) + response = self.cf_client.describe_stacks(StackName=self.cf_stack_name) + stacks = response['Stacks'] + if not stacks or 'Outputs' not in stacks[0] or not stacks[0]['Outputs']: + msg = f"Unable to query Cloudformation Stack {self.cf_stack_name}" + logging.exception(msg) + raise SchedulerException(msg) + + output = stacks[0]['Outputs'][-1] + for key in ['PrivateSubnet1', 'PrivateSubnet2', 'BatchEC2SecurityGroup', 'ECSInstanceRole', 'ECSTaskExecutionRole', 'BatchServiceRole']: + if key not in output or not output[key]: + msg = f"Cloudformation stack {self.cf_stack_name} is missing required output: {key}" + logging.exception(msg) + raise SchedulerException(msg) + + return output + def submit_job(self, tries=1): + cf_output = self.get_cf_stack_output() self.create_or_update_launch_template() - compute_env_name = self.create_or_update_compute_environment() + compute_env_name = self.create_or_update_compute_environment(cf_output) job_queue_name = self.create_or_update_job_queue(compute_env_name) - job_definition_name = self.register_job_definition(compute_env_name, job_queue_name) + job_definition_name = self.register_job_definition(cf_output, compute_env_name, job_queue_name) jobname = os.path.basename(self.script) s3_path = 'script/' + jobname + '.sh' diff --git a/Hummingbird/test/test_scheduler.py b/Hummingbird/test/test_scheduler.py index 29af832..d5a92b8 100644 --- a/Hummingbird/test/test_scheduler.py +++ b/Hummingbird/test/test_scheduler.py @@ -9,8 +9,16 @@ class TestAWSScheduler(unittest.TestCase): - conf = {PLATFORM: {'regions': 'us-west-2', 'bucket': 'local-bucket'}} + conf = {PLATFORM: {'regions': 'us-west-2', 'bucket': 'local-bucket', 'cloudformation_stack_name': 'test'}} jobs = ['some-job-id'] + cf_stack_output = { + 'PrivateSubnet1': 'subnet1', + 'PrivateSubnet2': 'subnet2', + 'BatchEC2SecurityGroup': 'sg-test', + 'ECSInstanceRole': 'ecsInstanceRole', + 'ECSTaskExecutionRole': 'taskExecutionRole', + 'BatchServiceRole': 'awsBatchServiceRole' + } launch_template = """ { "LaunchTemplateName": "hummingbird_launch_template", @@ -100,3 +108,24 @@ def test_create_or_update_launch_template_create_version(self, _, client_mock): self.instance.create_or_update_launch_template() client_mock.create_launch_template_version.assert_called_once() + + @patch('boto3.client', return_value=MagicMock()) + def test_get_cf_stack_output(self, client_mock): + self.instance.cf_client = client_mock + client_mock.describe_stacks.return_value = {'Stacks': [{'StackName': 'test', 'Outputs': [self.cf_stack_output]}]} + + self.instance.get_cf_stack_output() + + client_mock.describe_stacks.assert_called_once_with(StackName='test') + + @patch('boto3.client', return_value=MagicMock()) + @patch('logging.exception') + def test_get_cf_stack_output_missing_key(self, _, client_mock): + self.instance.cf_client = client_mock + + for key in self.cf_stack_output: + output = self.cf_stack_output.copy() + del output[key] + client_mock.describe_stacks.return_value = {'Stacks': [{'StackName': 'test', 'Outputs': [output]}]} + + self.assertRaises(SchedulerException, self.instance.get_cf_stack_output) diff --git a/docs/EditConf.md b/docs/EditConf.md index 983a132..e10dba4 100644 --- a/docs/EditConf.md +++ b/docs/EditConf.md @@ -5,7 +5,12 @@ Hummingbird has a `conf` folder which contains configuration files for all teste 1. `Platform` Specifies information about the cloud computing platform. - `service` The cloud computing service. Specify `gcp` for Google Cloud, `aws` for AWS, 'azure' for Azure. - - **gcp/aws**: + - **aws**: + - `project` The cloud project ID. Make sure the project has access to all needed functionalities and APIs. + - `regions` The region where the computing resource is hosted. + - `bucket` The name of the cloud storage bucket where all the log and output files generated by Hummingbird will be stored. + - `cloudformation_stack_name` The name of the cloudformation stack created in the target AWS account and region. + - **gcp**: - `project` The cloud project ID. Make sure the project has access to all needed functionalities and APIs. - `regions` The region where the computing resource is hosted. - `bucket` The name of the cloud storage bucket where all the log and output files generated by Hummingbird will be stored. diff --git a/docs/GettingStarted.md b/docs/GettingStarted.md index cb08247..0b14a27 100644 --- a/docs/GettingStarted.md +++ b/docs/GettingStarted.md @@ -12,32 +12,22 @@ gcloud auth application-default login ``` ### Getting started on AWS Batch -1. Install [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html) and configure: - ``` - aws configure - ``` - - It will ask for `Access key ID` and `Secret access key`. This credential will be used for all resources on AWS. - See more instructions [here](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html). -2. Follow instructions [here](https://docs.aws.amazon.com/batch/latest/userguide/service_IAM_role.html) to create the -AWS Batch Service Role. +1. Create the Hummingbird Cloudformation Stack in the target AWS Account and Region. -3. Follow instructions [here](https://docs.aws.amazon.com/batch/latest/userguide/instance_IAM_role.html) to create the -AWS ECS Instance Role. Additionally, make sure that the instance has read/write access to the Input/Output buckets. + [![Launch Stack](https://cdn.rawgit.com/buildkite/cloudformation-launch-stack-button-svg/master/launch-stack.svg)]( + https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=hummingbird&templateURL=https://cf-templates-gvgta4w56y1c-us-west-2.s3.us-west-2.amazonaws.com/hummingbird-cloudformation.template) -4. Follow instructions [here](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task_execution_IAM_role.html) -to create the AWS ECS Task Execution Role. +2. Install [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html) and configure: + ``` + aws configure + ``` -5. Edit [compute_environment.json](./Hummingbird/AWS/compute_environment.json) to update the following sections: - - `subnets` to a list of subnets in the target VPC that the AWS Batch instance can be provisioned in. - - `securityGroupIds` to one or more Security Groups that define the ingress/egress rules. Hummingbird, will not require any ingress rules but may require TCP `0.0.0.0/0` on egress rules. - - `instanceRole` to be replaced by the value from step #3 (IAM Role Name). - - `` to be replaced by value from step #2 (full IAM Role ARN). -6. Edit [job-definition.json](./Hummingbird/AWS/job-definition.json) to update the following sections: - - `jobRoleArn` to be replaced by the value from step #4 (full IAM Role ARN). + It will ask for `Access key ID` and `Secret access key`. This credential will be used for all resources on AWS. See + more instructions [here](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html). ### Getting started on Azure Batch + Install [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli) and login: ```bash az login From 35a3a268901dbaf58a3f4d8c87e639ed1b49fd5f Mon Sep 17 00:00:00 2001 From: Amir Alavi Date: Mon, 6 Sep 2021 19:18:18 -0400 Subject: [PATCH 2/4] Add region name to cloudformation stack --- Hummingbird/scheduler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Hummingbird/scheduler.py b/Hummingbird/scheduler.py index 3e1501c..1062f33 100644 --- a/Hummingbird/scheduler.py +++ b/Hummingbird/scheduler.py @@ -59,7 +59,7 @@ def __init__(self, conf, machine, disk_size, script, **kwargs): self.batch_client = boto3.client('batch', region_name=self.region) self.ec2_client = boto3.client('ec2', region_name=self.region) self.s3_bucket = boto3.resource('s3').Bucket(self.conf[PLATFORM]['bucket']) - self.cf_client = boto3.resource('cloudformation') + self.cf_client = boto3.resource('cloudformation', region_name=self.region) self.cf_stack_name = conf[PLATFORM]['cloudformation_stack_name'] super(AWSBatchScheduler, self).__init__() @@ -209,6 +209,7 @@ def get_cf_stack_output(self): logging.exception(msg) raise SchedulerException(msg) + logging.info('Successfully queried Cloudformation Stack: %s', self.cf_stack_name) return output def submit_job(self, tries=1): From 278d747d34c8ea99ebb1b290e8c36322a64694bb Mon Sep 17 00:00:00 2001 From: Amir Alavi Date: Mon, 6 Sep 2021 23:18:18 -0400 Subject: [PATCH 3/4] Change output fields to use instance profile role arn --- ...ml => hummingbird-cloudformation.template} | 30 +++++++++++++++++++ Hummingbird/AWS/job-definition.json | 2 -- Hummingbird/scheduler.py | 24 +++++++++------ Hummingbird/test/test_scheduler.py | 25 ++++++++-------- 4 files changed, 57 insertions(+), 24 deletions(-) rename Hummingbird/AWS/{cloudformation.yaml => hummingbird-cloudformation.template} (90%) diff --git a/Hummingbird/AWS/cloudformation.yaml b/Hummingbird/AWS/hummingbird-cloudformation.template similarity index 90% rename from Hummingbird/AWS/cloudformation.yaml rename to Hummingbird/AWS/hummingbird-cloudformation.template index 8e402b3..0cad0e8 100644 --- a/Hummingbird/AWS/cloudformation.yaml +++ b/Hummingbird/AWS/hummingbird-cloudformation.template @@ -219,6 +219,7 @@ Resources: ECSInstanceRole: Type: AWS::IAM::Role Properties: + Description: "Allows EC2 instances in an ECS cluster to access ECS." AssumeRolePolicyDocument: Statement: - Effect: Allow @@ -233,10 +234,18 @@ Resources: - Key: Name Value: !Sub ${EnvironmentName}-ECSInstanceRole + ECSInstanceProfileRole: + Type: AWS::IAM::InstanceProfile + Properties: + Path: / + Roles: + - Ref: 'ECSInstanceRole' + # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task_execution_IAM_role.html ECSTaskExecutionRole: Type: AWS::IAM::Role Properties: + Description: "Allows ECS tasks to call AWS services on your behalf." AssumeRolePolicyDocument: Statement: - Effect: Allow @@ -254,6 +263,7 @@ Resources: BatchServiceRole: Type: AWS::IAM::Role Properties: + Description: "Allows Batch to create and manage AWS resources on your behalf." AssumeRolePolicyDocument: Statement: - Effect: Allow @@ -304,10 +314,30 @@ Outputs: Description: ECS Instance Role Value: !Ref ECSInstanceRole + ECSInstanceRoleARN: + Description: ECS Instance Role ARN + Value: !GetAtt ECSInstanceRole.Arn + + ECSInstanceProfileRole: + Description: ECS Instance Profile Role + Value: !Ref ECSInstanceProfileRole + + ECSInstanceProfileRoleARN: + Description: ECS Instance Profile Role ARN + Value: !GetAtt ECSInstanceProfileRole.Arn + ECSTaskExecutionRole: Description: ECS Task Execution Role Value: !Ref ECSTaskExecutionRole + ECSTaskExecutionRoleARN: + Description: ECS Task Execution Role ARN + Value: !GetAtt ECSTaskExecutionRole.Arn + BatchServiceRole: Description: AWS Batch Service Role Value: !Ref BatchServiceRole + + BatchServiceRoleARN: + Description: AWS Batch Service Role ARN + Value: !GetAtt BatchServiceRole.Arn diff --git a/Hummingbird/AWS/job-definition.json b/Hummingbird/AWS/job-definition.json index c85aab3..0932d09 100644 --- a/Hummingbird/AWS/job-definition.json +++ b/Hummingbird/AWS/job-definition.json @@ -18,8 +18,6 @@ "mountPoints": [ ], "ulimits": [ - ], - "resourceRequirements": [ ] }, "retryStrategy": { diff --git a/Hummingbird/scheduler.py b/Hummingbird/scheduler.py index 1062f33..ad1728d 100644 --- a/Hummingbird/scheduler.py +++ b/Hummingbird/scheduler.py @@ -59,7 +59,7 @@ def __init__(self, conf, machine, disk_size, script, **kwargs): self.batch_client = boto3.client('batch', region_name=self.region) self.ec2_client = boto3.client('ec2', region_name=self.region) self.s3_bucket = boto3.resource('s3').Bucket(self.conf[PLATFORM]['bucket']) - self.cf_client = boto3.resource('cloudformation', region_name=self.region) + self.cf_client = boto3.client('cloudformation', region_name=self.region) self.cf_stack_name = conf[PLATFORM]['cloudformation_stack_name'] super(AWSBatchScheduler, self).__init__() @@ -97,10 +97,12 @@ def create_or_update_compute_environment(self, cf_output): if 'EC2KeyPair' in cf_output and cf_output['EC2KeyPair']: compute_resources['ec2KeyPair'] = cf_output - data['serviceRole'] = cf_output['BatchServiceRole'] + data['serviceRole'] = cf_output['BatchServiceRoleARN'] compute_resources['subnets'] = [cf_output['PrivateSubnet1'], cf_output['PrivateSubnet2']] - compute_resources['securityGroupIds'] = cf_output['BatchEC2SecurityGroup'] - compute_resources['instanceRole'] = cf_output['ECSInstanceRole'] + compute_resources['securityGroupIds'] = [cf_output['BatchEC2SecurityGroup']] + compute_resources['instanceRole'] = cf_output['ECSInstanceProfileRoleARN'] + + print(json.dumps(data)) data['tags'] = {'Name': compute_env_name} logging.info('Attempting to create AWS Batch Compute environment: %s', compute_env_name) @@ -183,9 +185,9 @@ def register_job_definition(self, cf_output, compute_env_name, job_queue_name): data = json.load(f) data['containerProperties']['vcpus'] = self.machine.cpu data['containerProperties']['memory'] = int(self.machine.mem) * 1024 + data['containerProperties']['jobRoleArn'] = cf_output['ECSTaskExecutionRoleARN'] if self.image: data['containerProperties']['image'] = self.image - data['jobRoleArn'] = cf_output['ECSTaskExecutionRole'] job_definition_name = data.get('jobDefinitionName', self.job_def_name) data.setdefault('tags', {}) data['tags'].update({'Name': job_definition_name, 'ComputeEnvironment': compute_env_name, 'JobQueue': job_queue_name}) @@ -202,15 +204,19 @@ def get_cf_stack_output(self): logging.exception(msg) raise SchedulerException(msg) - output = stacks[0]['Outputs'][-1] - for key in ['PrivateSubnet1', 'PrivateSubnet2', 'BatchEC2SecurityGroup', 'ECSInstanceRole', 'ECSTaskExecutionRole', 'BatchServiceRole']: - if key not in output or not output[key]: + cf_output = {} + for key in ['PrivateSubnet1', 'PrivateSubnet2', 'BatchEC2SecurityGroup', 'ECSInstanceProfileRoleARN', 'ECSTaskExecutionRoleARN', 'BatchServiceRoleARN']: + for kv in stacks[0]['Outputs']: + if kv['OutputKey'] == key: + cf_output[key] = kv['OutputValue'] + + if key not in cf_output: msg = f"Cloudformation stack {self.cf_stack_name} is missing required output: {key}" logging.exception(msg) raise SchedulerException(msg) logging.info('Successfully queried Cloudformation Stack: %s', self.cf_stack_name) - return output + return cf_output def submit_job(self, tries=1): cf_output = self.get_cf_stack_output() diff --git a/Hummingbird/test/test_scheduler.py b/Hummingbird/test/test_scheduler.py index d5a92b8..3130906 100644 --- a/Hummingbird/test/test_scheduler.py +++ b/Hummingbird/test/test_scheduler.py @@ -11,14 +11,14 @@ class TestAWSScheduler(unittest.TestCase): conf = {PLATFORM: {'regions': 'us-west-2', 'bucket': 'local-bucket', 'cloudformation_stack_name': 'test'}} jobs = ['some-job-id'] - cf_stack_output = { - 'PrivateSubnet1': 'subnet1', - 'PrivateSubnet2': 'subnet2', - 'BatchEC2SecurityGroup': 'sg-test', - 'ECSInstanceRole': 'ecsInstanceRole', - 'ECSTaskExecutionRole': 'taskExecutionRole', - 'BatchServiceRole': 'awsBatchServiceRole' - } + cf_stack_output = [ + {'OutputKey': 'PrivateSubnet1', 'OutputValue': 'subnet1'}, + {'OutputKey': 'PrivateSubnet2', 'OutputValue': 'subnet2'}, + {'OutputKey': 'BatchEC2SecurityGroup', 'OutputValue': 'sg-test'}, + {'OutputKey': 'ECSInstanceProfileRoleARN', 'OutputValue': 'ecsInstanceRole'}, + {'OutputKey': 'ECSTaskExecutionRoleARN', 'OutputValue': 'taskExecutionRole'}, + {'OutputKey': 'BatchServiceRoleARN', 'OutputValue': 'awsBatchServiceRole'} + ] launch_template = """ { "LaunchTemplateName": "hummingbird_launch_template", @@ -112,7 +112,7 @@ def test_create_or_update_launch_template_create_version(self, _, client_mock): @patch('boto3.client', return_value=MagicMock()) def test_get_cf_stack_output(self, client_mock): self.instance.cf_client = client_mock - client_mock.describe_stacks.return_value = {'Stacks': [{'StackName': 'test', 'Outputs': [self.cf_stack_output]}]} + client_mock.describe_stacks.return_value = {'Stacks': [{'StackName': 'test', 'Outputs': self.cf_stack_output}]} self.instance.get_cf_stack_output() @@ -123,9 +123,8 @@ def test_get_cf_stack_output(self, client_mock): def test_get_cf_stack_output_missing_key(self, _, client_mock): self.instance.cf_client = client_mock - for key in self.cf_stack_output: - output = self.cf_stack_output.copy() - del output[key] - client_mock.describe_stacks.return_value = {'Stacks': [{'StackName': 'test', 'Outputs': [output]}]} + for kv in self.cf_stack_output: + output = [item for item in self.cf_stack_output if item != kv] + client_mock.describe_stacks.return_value = {'Stacks': [{'StackName': 'test', 'Outputs': output}]} self.assertRaises(SchedulerException, self.instance.get_cf_stack_output) From daeb49e4dfd26e92b647dae27eee880c0948407c Mon Sep 17 00:00:00 2001 From: Amir Alavi Date: Mon, 13 Sep 2021 21:56:36 -0400 Subject: [PATCH 4/4] Modify CF Stack to include all outgoing traffic --- Hummingbird/AWS/compute_environment.json | 2 +- .../AWS/hummingbird-cloudformation.template | 34 +++++++++++-------- Hummingbird/AWS/launch-template-data.json | 23 +++++++++++-- Hummingbird/scheduler.py | 4 +-- 4 files changed, 41 insertions(+), 22 deletions(-) diff --git a/Hummingbird/AWS/compute_environment.json b/Hummingbird/AWS/compute_environment.json index ed3bb92..48cb762 100644 --- a/Hummingbird/AWS/compute_environment.json +++ b/Hummingbird/AWS/compute_environment.json @@ -19,7 +19,7 @@ "KeyName": "hummingbird" }, "launchTemplate": { - "launchTemplateName": "hummingbird_launch_template", + "launchTemplateName": "hummingbird", "version": "$Latest" } }, diff --git a/Hummingbird/AWS/hummingbird-cloudformation.template b/Hummingbird/AWS/hummingbird-cloudformation.template index 0cad0e8..6da9ba4 100644 --- a/Hummingbird/AWS/hummingbird-cloudformation.template +++ b/Hummingbird/AWS/hummingbird-cloudformation.template @@ -206,10 +206,9 @@ Resources: GroupDescription: "AWS Batch EC2 Security Group" GroupName: !Sub ${EnvironmentName}-sg SecurityGroupEgress: - - IpProtocol: tcp - FromPort: 0 - ToPort: 0 + - IpProtocol: "-1" CidrIp: 0.0.0.0/0 + Description: Allow all outbound traffic Tags: - Key: Name Value: !Sub ${EnvironmentName}-SG @@ -224,12 +223,14 @@ Resources: Statement: - Effect: Allow Principal: - Service: [ ec2.amazonaws.com ] - Action: [ 'sts:AssumeRole' ] - Path: / + Service: + - 'ec2.amazonaws.com' + Action: + - 'sts:AssumeRole' ManagedPolicyArns: - 'arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role' - - 'arn:aws:iam::aws:policy/AmazonS3FullAccess' # TODO narrow down to S3 bucket Read/Write + - 'arn:aws:iam::aws:policy/AmazonS3FullAccess' + - 'arn:aws:iam::aws:policy/service-role/AmazonEC2RoleForSSM' Tags: - Key: Name Value: !Sub ${EnvironmentName}-ECSInstanceRole @@ -237,9 +238,9 @@ Resources: ECSInstanceProfileRole: Type: AWS::IAM::InstanceProfile Properties: - Path: / + InstanceProfileName: !Ref ECSInstanceRole Roles: - - Ref: 'ECSInstanceRole' + - !Ref ECSInstanceRole # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task_execution_IAM_role.html ECSTaskExecutionRole: @@ -250,11 +251,13 @@ Resources: Statement: - Effect: Allow Principal: - Service: [ ecs-tasks.amazonaws.com ] - Action: [ 'sts:AssumeRole' ] - Path: / + Service: + - 'ecs-tasks.amazonaws.com' + Action: + - 'sts:AssumeRole' ManagedPolicyArns: - 'arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy' + - 'arn:aws:iam::aws:policy/AmazonS3FullAccess' Tags: - Key: Name Value: !Sub ${EnvironmentName}-ECSTaskExecutionRole @@ -268,9 +271,10 @@ Resources: Statement: - Effect: Allow Principal: - Service: [ batch.amazonaws.com ] - Action: [ 'sts:AssumeRole' ] - Path: / + Service: + - 'batch.amazonaws.com' + Action: + - 'sts:AssumeRole' ManagedPolicyArns: - 'arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole' Tags: diff --git a/Hummingbird/AWS/launch-template-data.json b/Hummingbird/AWS/launch-template-data.json index 7e0084c..8aa79cd 100644 --- a/Hummingbird/AWS/launch-template-data.json +++ b/Hummingbird/AWS/launch-template-data.json @@ -1,16 +1,33 @@ { - "LaunchTemplateName": "hummingbird_launch_template", + "LaunchTemplateName": "hummingbird", "LaunchTemplateData": { "EbsOptimized": true, "BlockDeviceMappings": [ { + "DeviceName": "/dev/xvda", + "Ebs": { + "DeleteOnTermination": true, + "VolumeType": "gp3", + "VolumeSize": 50 + } + }, + { + "DeviceName": "/dev/xvdcz", + "Ebs": { + "DeleteOnTermination": true, + "VolumeType": "gp3", + "VolumeSize": 22, + "Encrypted": true + } + }, + { + "DeviceName": "/dev/xvdba", "Ebs": { "DeleteOnTermination": true, "VolumeType": "gp3", "VolumeSize": 100, "Encrypted": true - }, - "DeviceName": "/dev/xvda" + } } ] } diff --git a/Hummingbird/scheduler.py b/Hummingbird/scheduler.py index ad1728d..3a6d737 100644 --- a/Hummingbird/scheduler.py +++ b/Hummingbird/scheduler.py @@ -66,7 +66,7 @@ def __init__(self, conf, machine, disk_size, script, **kwargs): def create_or_update_launch_template(self): with open('AWS/launch-template-data.json') as f: data = json.load(f) - data['LaunchTemplateData']['BlockDeviceMappings'][0]['Ebs']['VolumeSize'] = int(self.disk_size) + data['LaunchTemplateData']['BlockDeviceMappings'][-1]['Ebs']['VolumeSize'] = int(self.disk_size) from botocore.exceptions import ClientError try: @@ -102,8 +102,6 @@ def create_or_update_compute_environment(self, cf_output): compute_resources['securityGroupIds'] = [cf_output['BatchEC2SecurityGroup']] compute_resources['instanceRole'] = cf_output['ECSInstanceProfileRoleARN'] - print(json.dumps(data)) - data['tags'] = {'Name': compute_env_name} logging.info('Attempting to create AWS Batch Compute environment: %s', compute_env_name) self.batch_client.create_compute_environment(**data)