diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json new file mode 100644 index 0000000..5553409 --- /dev/null +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json @@ -0,0 +1,703 @@ +{ + "Description": "Launch an Alces HPC environment with a single SGE master node together with 16 initial compute nodes using EC2 spot.", + "Parameters": { + "KeyPair": { + "Description": "Choose an existing AWS key for administrator access", + "Type": "AWS::EC2::KeyPair::KeyName" + }, + "SubnetId": { + "Description": "Enter the ID of your existing subnet. If you wish to have one created for you, select None. The SecurityGroup field must also be set to None.", + "Type": "String", + "Default": "None" + }, + "SecurityGroup": { + "Description": "Enter the ID of your existing security group. If you wish to have one created for you, select None. The Subnet ID field must also be set to None.", + "Type": "String", + "Default": "None" + }, + "SpotPrice": { + "Description": "Enter your maximum bid per hour for each compute instance. View the Spot Request calculator for information on spot pricing.", + "Type": "String", + "Default": "0.50" + }, + "SystemDiskSize": { + "Description": "Select the size in GB of root system disk to use for all deployed nodes in the environment", + "Type": "String", + "Default": "500" + }, + "LoginType": { + "Description": "Select the login node instance type to deploy - this defines the number of cores and memory available", + "Type": "String", + "Default": "t2.large", + "AllowedValues": [ + "t2.large", + "c3.large", + "c4.large" + ] + }, + "NetworkCIDR": { + "Description": "Enter an address range that is permitted to access the Clusterware master node. 
Leave blank if unknown", + "Type": "String", + "Default": "0.0.0.0/0", + "AllowedPattern": "[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}/[0-9]{1,2}", + "ConstraintDescription": "Please specify a valid IP range, e.g. 101.21.2.0/16" + } + }, + "Mappings": { + "AWSRegionArch2AMI": { + "eu-west-1": { + "centos7": "ami-3758e244" + } + } + }, + "Conditions": { + "CreateNetwork": { + "Fn::Equals": [ + { + "Ref": "SubnetId" + }, + "None" + ] + } + }, + "Resources": { + "ClusterwareVPC": { + "Type": "AWS::EC2::VPC", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { + "Ref": "AWS::StackName" + } + } + ], + "CidrBlock": "10.75.0.0/16", + "EnableDnsSupport": "true", + "EnableDnsHostnames": "true" + }, + "Condition": "CreateNetwork" + }, + "ClusterwareRouteTable": { + "Type": "AWS::EC2::RouteTable", + "DependsOn": "ClusterwareVPC", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "Tags": [ + { + "Key": "Application", + "Value": { + "Ref": "AWS::StackId" + } + } + ] + }, + "Condition": "CreateNetwork" + }, + "ClusterwarePublicNet": { + "Type": "AWS::EC2::Subnet", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "CidrBlock": "10.75.0.0/24" + }, + "Condition": "CreateNetwork" + }, + "ClusterwareGateway": { + "Type": "AWS::EC2::InternetGateway", + "DependsOn": "ClusterwareVPC", + "Condition": "CreateNetwork" + }, + "ClusterwareAttachGW": { + "Type": "AWS::EC2::VPCGatewayAttachment", + "DependsOn": "ClusterwareVPC", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "InternetGatewayId": { + "Ref": "ClusterwareGateway" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareRoute": { + "Type": "AWS::EC2::Route", + "DependsOn": "ClusterwareAttachGW", + "Properties": { + "RouteTableId": { + "Ref": "ClusterwareRouteTable" + }, + "DestinationCidrBlock": "0.0.0.0/0", + "GatewayId": { + "Ref": "ClusterwareGateway" + } + }, + "Condition": "CreateNetwork" + }, + "SubnetToRouteTable": { + "Type": 
"AWS::EC2::SubnetRouteTableAssociation", + "DependsOn": "ClusterwareRouteTable", + "Properties": { + "SubnetId": { + "Ref": "ClusterwarePublicNet" + }, + "RouteTableId": { + "Ref": "ClusterwareRouteTable" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareNetACL": { + "Type": "AWS::EC2::NetworkAcl", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + } + }, + "Condition": "CreateNetwork" + }, + "InboundSSHACLEntry": { + "Type": "AWS::EC2::NetworkAclEntry", + "Properties": { + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + }, + "RuleNumber": "100", + "Protocol": "-1", + "RuleAction": "allow", + "Egress": "false", + "CidrBlock": { + "Ref": "NetworkCIDR" + }, + "PortRange": { + "From": "1", + "To": "65535" + } + }, + "Condition": "CreateNetwork" + }, + "OutboundACLEntry": { + "Type": "AWS::EC2::NetworkAclEntry", + "Properties": { + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + }, + "RuleNumber": "101", + "Protocol": "-1", + "RuleAction": "allow", + "Egress": "true", + "CidrBlock": "0.0.0.0/0", + "PortRange": { + "From": "1", + "To": "65535" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareSubnetLink": { + "Type": "AWS::EC2::SubnetNetworkAclAssociation", + "Properties": { + "SubnetId": { + "Ref": "ClusterwarePublicNet" + }, + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + } + }, + "Condition": "CreateNetwork" + }, + "AlcesClusterwareSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { + "Ref": "AWS::StackName" + } + } + ], + "GroupDescription": "Enable SSH access to the Alces Clusterware master node", + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "SecurityGroupIngress": [ + { + "IpProtocol": "tcp", + "FromPort": "22", + "ToPort": "22", + "CidrIp": { + "Ref": "NetworkCIDR" + } + }, + { + "IpProtocol": "-1", + "FromPort": "0", + "ToPort": "65535", + "CidrIp": { + "Ref": "NetworkCIDR" + } + } + ], + "SecurityGroupEgress": [ + { + "IpProtocol": "-1", + "FromPort": "0", + 
"ToPort": "65535", + "CidrIp": "0.0.0.0/0" + } + ] + }, + "Condition": "CreateNetwork" + }, + "MasterIAM": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Statement": [ { + "Effect": "Allow", + "Principal": { + "Service": [ + "ec2.amazonaws.com" + ] + }, + "Action": [ + "sts:AssumeRole" + ] + }] + } + } + }, + "MasterIAMProfile": { + "Type": "AWS::IAM::InstanceProfile", + "Properties": { + "Path": "/", + "Roles": [ + { "Ref": "MasterIAM" + } + ] + } + }, + "MetricData": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyName": "MetricData", + "PolicyDocument": { + "Version": "2012-10-17", + "Statement": [ { + "Effect": "Allow", + "Action": [ + "cloudwatch:PutMetricData" + ], + "Resource": [ + "*" + ] + }] + }, + "Roles": [ { "Ref": "MasterIAM" } ] + } + }, + "AlcesClusterwareMasterNode": { + "Type": "AWS::EC2::Instance", + "DependsOn": "ClusterwarePublicNet", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { "Fn::Join": [ "", [ {"Ref": "AWS::StackName"}, "-login1" ] ]} + } + ], + "ImageId": { + "Fn::FindInMap": [ + "AWSRegionArch2AMI", + { + "Ref": "AWS::Region" + }, + "centos7" + ] + }, + "IamInstanceProfile": { + "Ref": "MasterIAMProfile" + }, + "NetworkInterfaces": [ + { + "AssociatePublicIpAddress": "True", + "DeviceIndex": "0", + "GroupSet": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] + } + ], + "SubnetId": { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] + } + } + ], + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/sda1", + "Ebs": { + "VolumeSize": { "Ref": "SystemDiskSize" } + } + }, + { + "DeviceName": "/dev/sdb", + "VirtualName": "ephemeral0" + }, + { + "DeviceName": "/dev/sdc", + "VirtualName": "ephemeral1" + } + ], + "InstanceType": { "Ref": "LoginType" }, + "KeyName": { + "Ref": "KeyPair" + }, + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + 
"#cloud-config", + "\n", + "hostname: login1", + "\n", + "write_files:", + "\n", + "- content: |", + "\n", + " #!/bin/bash -l", + "\n", + " computegroup=\"INSERTGROUP\"", + "\n", + " timestamp=$(date + '%Y-%m-%d'T'%T'.000Z)", + "\n", + " queued=$(qstat -u \\* | awk 'NR > 2{print $5}' | wc -l)", + "\n", + " /opt/clusterware/opt/aws/bin/aws --region \"eu-west-1\" cloudwatch put-metric-data --metric-name Queued --namespace \"ALCES/SGE\" --dimensions \"AutoScalingGroupName=${computegroup}\" --value $queued --timestamp $timestamp", + "\n", + " path: /opt/alces-sge-cloudwatch.sh", + "\n", + " permissions: '0700'", + "\n", + "- content: |", + "\n", + " cluster:", + "\n", + " uuid: '11111111-2222-3333-444444444444'", + "\n", + " token: '1A0a1aaAA1aAAA/aaa1aAA=='", + "\n", + " name: ", + { + "Ref": "AWS::StackName" + }, + "\n", + " role: 'master'", + "\n", + " tags:", + "\n", + " scheduler_roles: ':master:'", + "\n", + " owner: root:root", + "\n", + " path: /opt/clusterware/etc/config.yml", + "\n", + " permissions: '0640'", + "\n", + "runcmd:", + "\n", + " - echo \"5 * * * * alces /opt/alces-sge-cloudwatch.sh\" >> /etc/crontab" + ] + ] + } + } + } + }, + "ComputeConfig": { + "Type": "AWS::AutoScaling::LaunchConfiguration", + "Properties": { + "AssociatePublicIpAddress": "True", + "KeyName": { + "Ref": "KeyPair" + }, + "ImageId": { + "Fn::FindInMap": [ + "AWSRegionArch2AMI", + { + "Ref": "AWS::Region" + }, + "centos7" + ] + }, + "InstanceType": "c4.8xlarge", + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/sda1", + "Ebs": { + "VolumeSize": { "Ref": "SystemDiskSize" } + } + }, + { + "DeviceName": "ephemeral0" + } + ], + "SpotPrice": { + "Ref": "SpotPrice" + }, + "SecurityGroups": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] + } + ], + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "#cloud-config", + "\n", + "write_files:", + "\n", + "- content: |", + "\n", + " cluster:", + "\n", 
+ " uuid: '11111111-2222-3333-444444444444'", + "\n", + " token: '1A0a1aaAA1aAAA/aaa1aAA=='", + "\n", + " name: ", + { + "Ref": "AWS::StackName" + }, + "\n", + " role: 'slave'", + "\n", + " master: ", + { + "Fn::GetAtt": [ + "AlcesClusterwareMasterNode", + "PrivateIp" + ] + }, + "\n", + " tags:", + "\n", + " scheduler_roles: ':compute:'", + "\n", + " owner: root:root", + "\n", + " path: /opt/clusterware/etc/config.yml", + "\n", + " permissions: '0640'", + "\n" + ] + ] + } + } + } + }, + "PlacementGroup": { + "Type": "AWS::EC2::PlacementGroup", + "Properties": { + "Strategy": "cluster" + } + }, + "ComputeGroup": { + "Type": "AWS::AutoScaling::AutoScalingGroup", + "DependsOn": "AlcesClusterwareMasterNode", + "Properties": { + "DesiredCapacity": "16", + "LaunchConfigurationName": { + "Ref": "ComputeConfig" + }, + "PlacementGroup": { + "Ref": "PlacementGroup" + }, + "Tags": [ + { + "Key": "Name", + "Value": { "Fn::Join": [ "", [ {"Ref": "AWS::StackName"}, "-compute" ] ]}, + "PropagateAtLaunch": "true" + } + ], + "MinSize": "1", + "MaxSize": "100", + "VPCZoneIdentifier": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] + } + ] + } + }, + "ScaleUp10": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "10" + } + }, + "ScaleDown10": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "-10" + } + }, + "ScaleUp25": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "25" + } + }, + "ScaleDown25": { + "Type": 
"AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "-25" + } + }, + "CPUAlarmHigh10": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-up by 10% if currently queued jobs > 50 for 20 minutes", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "20", + "Threshold": "90", + "AlarmActions": [ + { + "Ref": "ScaleUp10" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "GreaterThanThreshold" + } + }, + "CPUAlarmLow10": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-down if CPU < 10% for 55 minutes", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "55", + "Threshold": "10", + "AlarmActions": [ + { + "Ref": "ScaleDown10" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "LessThanThreshold" + } + }, + "CPUAlarmHigh25": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-up by 25% if currently queued jobs > 100 for 20 minutes", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "20", + "Threshold": "100", + "AlarmActions": [ + { + "Ref": "ScaleUp25" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "GreaterThanThreshold" + } + } + }, + "Outputs": { + "AccessIP": { + "Value": { + "Fn::GetAtt": [ + "AlcesClusterwareMasterNode", + "PublicIp" + ] + } + } + } +} diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json 
b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json new file mode 100644 index 0000000..315783d --- /dev/null +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json @@ -0,0 +1,795 @@ +{ + "Description": "Launch an Alces HPC environment with a single SGE master node together with 32 initial compute nodes using EC2 spot.", + "Parameters": { + "KeyPair": { + "Description": "Choose an existing AWS key for administrator access", + "Type": "AWS::EC2::KeyPair::KeyName" + }, + "SubnetId": { + "Description": "Enter the ID of your existing subnet. If you wish to have one created for you, select None. The SecurityGroup field must also be set to None.", + "Type": "String", + "Default": "None" + }, + "SecurityGroup": { + "Description": "Enter the ID of your existing security group. If you wish to have one created for you, select None. The Subnet ID field must also be set to None.", + "Type": "String", + "Default": "None" + }, + "SpotPrice": { + "Description": "Enter your maximum bid per hour for each compute instance. View the Spot Request calculator for information on spot pricing.", + "Type": "String", + "Default": "0.50" + }, + "SystemDiskSize": { + "Description": "Select the size in GB of root system disk to use for all deployed nodes in the environment", + "Type": "String", + "Default": "500" + }, + "LoginType": { + "Description": "Select the login node instance type to deploy - this defines the number of cores and memory available", + "Type": "String", + "Default": "t2.large", + "AllowedValues": [ + "t2.large", + "c3.large", + "c4.large" + ] + }, + "SharedDataSize": { + "Description": "Enter the size in GB of the shared NFS area to deploy. If no shared area is required, enter None", + "Type": "String", + "Default": "5000" + }, + "NetworkCIDR": { + "Description": "Enter an address range that is permitted to access the Clusterware master node.
Leave blank if unknown", + "Type": "String", + "Default": "0.0.0.0/0", + "AllowedPattern": "[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}/[0-9]{1,2}", + "ConstraintDescription": "Please specify a valid IP range, e.g. 101.21.2.0/16" + } + }, + "Conditions": { + "CreateNetwork": { + "Fn::Equals": [ + { + "Ref": "SubnetId" + }, + "None" + ] + }, + "CreateSharedData": { + "Fn::Not": [ + { + "Fn::Equals": [ + { + "Ref": "SharedDataSize" + }, + "None" + ] + } + ] + } + }, + "Mappings": { + "AWSRegionArch2AMI": { + "eu-west-1": { + "centos7": "ami-3758e244" + } + } + }, + "Resources": { + "ClusterwareVPC": { + "Type": "AWS::EC2::VPC", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { + "Ref": "AWS::StackName" + } + } + ], + "CidrBlock": "10.75.0.0/16", + "EnableDnsSupport": "true", + "EnableDnsHostnames": "true" + }, + "Condition": "CreateNetwork" + }, + "ClusterwareRouteTable": { + "Type": "AWS::EC2::RouteTable", + "DependsOn": "ClusterwareVPC", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "Tags": [ + { + "Key": "Application", + "Value": { + "Ref": "AWS::StackId" + } + } + ] + }, + "Condition": "CreateNetwork" + }, + "ClusterwarePublicNet": { + "Type": "AWS::EC2::Subnet", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "CidrBlock": "10.75.0.0/24" + }, + "Condition": "CreateNetwork" + }, + "ClusterwareGateway": { + "Type": "AWS::EC2::InternetGateway", + "DependsOn": "ClusterwareVPC", + "Condition": "CreateNetwork" + }, + "ClusterwareAttachGW": { + "Type": "AWS::EC2::VPCGatewayAttachment", + "DependsOn": "ClusterwareVPC", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "InternetGatewayId": { + "Ref": "ClusterwareGateway" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareRoute": { + "Type": "AWS::EC2::Route", + "DependsOn": "ClusterwareAttachGW", + "Properties": { + "RouteTableId": { + "Ref": "ClusterwareRouteTable" + }, + "DestinationCidrBlock": "0.0.0.0/0", + "GatewayId": { + "Ref": 
"ClusterwareGateway" + } + }, + "Condition": "CreateNetwork" + }, + "SubnetToRouteTable": { + "Type": "AWS::EC2::SubnetRouteTableAssociation", + "DependsOn": "ClusterwareRouteTable", + "Properties": { + "SubnetId": { + "Ref": "ClusterwarePublicNet" + }, + "RouteTableId": { + "Ref": "ClusterwareRouteTable" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareNetACL": { + "Type": "AWS::EC2::NetworkAcl", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + } + }, + "Condition": "CreateNetwork" + }, + "InboundSSHACLEntry": { + "Type": "AWS::EC2::NetworkAclEntry", + "Properties": { + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + }, + "RuleNumber": "100", + "Protocol": "-1", + "RuleAction": "allow", + "Egress": "false", + "CidrBlock": { + "Ref": "NetworkCIDR" + }, + "PortRange": { + "From": "1", + "To": "65535" + } + }, + "Condition": "CreateNetwork" + }, + "OutboundACLEntry": { + "Type": "AWS::EC2::NetworkAclEntry", + "Properties": { + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + }, + "RuleNumber": "101", + "Protocol": "-1", + "RuleAction": "allow", + "Egress": "true", + "CidrBlock": "0.0.0.0/0", + "PortRange": { + "From": "1", + "To": "65535" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareSubnetLink": { + "Type": "AWS::EC2::SubnetNetworkAclAssociation", + "Properties": { + "SubnetId": { + "Ref": "ClusterwarePublicNet" + }, + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + } + }, + "Condition": "CreateNetwork" + }, + "AlcesClusterwareSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { + "Ref": "AWS::StackName" + } + } + ], + "GroupDescription": "Enable SSH access to the Alces Clusterware master node", + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "SecurityGroupIngress": [ + { + "IpProtocol": "tcp", + "FromPort": "22", + "ToPort": "22", + "CidrIp": { + "Ref": "NetworkCIDR" + } + }, + { + "IpProtocol": "-1", + "FromPort": "0", + "ToPort": "65535", + "CidrIp": { + "Ref": 
"NetworkCIDR" + } + } + ], + "SecurityGroupEgress": [ + { + "IpProtocol": "-1", + "FromPort": "0", + "ToPort": "65535", + "CidrIp": "0.0.0.0/0" + } + ] + }, + "Condition": "CreateNetwork" + }, + "MasterIAM": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Statement": [ { + "Effect": "Allow", + "Principal": { + "Service": [ + "ec2.amazonaws.com" + ] + }, + "Action": [ + "sts:AssumeRole" + ] + }] + } + } + }, + "MasterIAMProfile": { + "Type": "AWS::IAM::InstanceProfile", + "Properties": { + "Path": "/", + "Roles": [ + { "Ref": "MasterIAM" + } + ] + } + }, + "MetricData": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyName": "MetricData", + "PolicyDocument": { + "Version": "2012-10-17", + "Statement": [ { + "Effect": "Allow", + "Action": [ + "cloudwatch:PutMetricData" + ], + "Resource": [ + "*" + ] + }] + }, + "Roles": [ { "Ref": "MasterIAM" } ] + } + }, + "SharedData": { + "Type": "AWS::EC2::Volume", + "Properties": { + "VolumeType": "io1", + "Size": { "Ref": "SharedDataSize" }, + "Iops": "1000", + "AvailabilityZone": { "Fn::GetAtt": [ "AlcesClusterwareMasterNode", "AvailabilityZone" ] } + }, + "Condition": "CreateSharedData" + }, + "AttachSharedData": { + "Type": "AWS::EC2::VolumeAttachment", + "Properties": { + "Device": "xvdb", + "InstanceId": { "Ref": "AlcesClusterwareMasterNode" }, + "VolumeId": { "Ref": "SharedData" } + }, + "Condition": "CreateSharedData" + }, + "AlcesClusterwareMasterNode": { + "Type": "AWS::EC2::Instance", + "DependsOn": "ClusterwarePublicNet", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { "Fn::Join": [ "", [ {"Ref": "AWS::StackName"}, "-login1" ] ]} + } + ], + "ImageId": { + "Fn::FindInMap": [ + "AWSRegionArch2AMI", + { + "Ref": "AWS::Region" + }, + "centos7" + ] + }, + "IamInstanceProfile": { + "Ref": "MasterIAMProfile" + }, + "NetworkInterfaces": [ + { + "AssociatePublicIpAddress": "True", + "DeviceIndex": "0", + "GroupSet": [ + { + "Fn::If": [ + "CreateNetwork", + { + 
"Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] + } + ], + "SubnetId": { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] + } + } + ], + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/sda1", + "Ebs": { + "VolumeSize": { "Ref": "SystemDiskSize" } + } + } + ], + "InstanceType": { "Ref": "LoginType" }, + "KeyName": { + "Ref": "KeyPair" + }, + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "#cloud-config", + "\n", + "hostname: login1", + "\n", + "write_files:", + "\n", + "- content: |", + "\n", + " #!/bin/bash -l", + "\n", + " computegroup=\"INSERTGROUP\"", + "\n", + " timestamp=$(date + '%Y-%m-%d'T'%T'.000Z)", + "\n", + " queued=$(qstat -u \\* | awk 'NR > 2{print $5}' | wc -l)", + "\n", + " /opt/clusterware/opt/aws/bin/aws --region \"eu-west-1\" cloudwatch put-metric-data --metric-name Queued --namespace \"ALCES/SGE\" --dimensions \"AutoScalingGroupName=${computegroup}\" --value $queued --timestamp $timestamp", + "\n", + " path: /opt/alces-sge-cloudwatch.sh", + "\n", + " permissions: '0700'", + "\n", + "- content: |", + "\n", + " #!/bin/bash", + "\n", + " checkdisk=$(fdisk -l | grep xvdb)", + "\n", + " if [ -z \"$checkdisk\" ]; then", + "\n", + " mkfs -t xfs /dev/xvdb", + "\n", + " mkdir /sharedscratch", + "\n", + " echo '/dev/xvdb /sharedscratch xfs defaults 0 0' >> etc/fstab", + "\n", + " mount -a", + "\n", + " echo '/sharedscratch 10.75.0.0/255.255.0.0(rw,no_root_squash,no_subtree_check,async)' >> /etc/exports", + "\n", + " systemctl enable nfs-server && systemctl start nfs-server", + "\n", + " chmod 0777 /sharedscratch/", + "\n", + " rm -rf /tmp/disk-format", + "\n", + " else", + "\n", + " echo \"No disk detected\"", + "\n", + " fi", + "\n", + " path: /tmp/disk-format", + "\n", + " permissions: '0700'", + "\n", + "- content: |", + "\n", + " cluster:", + "\n", + " uuid: '11111111-2222-3333-444444444444'", + "\n", + " token: '1A0a1aaAA1aAAA/aaa1aAA=='", + "\n", 
+ " name: ", + { + "Ref": "AWS::StackName" + }, + "\n", + " role: 'master'", + "\n", + " tags:", + "\n", + " scheduler_roles: ':master:'", + "\n", + " owner: root:root", + "\n", + " path: /opt/clusterware/etc/config.yml", + "\n", + " permissions: '0640'", + "\n", + "runcmd:", + "\n", + " - /tmp/disk-format", + "\n", + " - echo \"5 * * * * /opt/alces-sge-cloudwatch.sh\" >> /etc/crontab" + ] + ] + } + } + } + }, + "ComputeConfig": { + "Type": "AWS::AutoScaling::LaunchConfiguration", + "Properties": { + "AssociatePublicIpAddress": "True", + "KeyName": { + "Ref": "KeyPair" + }, + "ImageId": { + "Fn::FindInMap": [ + "AWSRegionArch2AMI", + { + "Ref": "AWS::Region" + }, + "centos7" + ] + }, + "InstanceType": "c4.8xlarge", + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/sda1", + "Ebs": { + "VolumeSize": { "Ref": "SystemDiskSize" } + } + } + ], + "SpotPrice": { + "Ref": "SpotPrice" + }, + "SecurityGroups": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] + } + ], + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "#cloud-config", + "\n", + "write_files:", + "\n", + "- content: |", + "\n", + " #!/bin/bash", + "\n", + " diskcheck=$(showmount -e ", + { "Fn::GetAtt": [ "AlcesClusterwareMasterNode", "PrivateIp" ] }, + " | grep sharedscratch)", + "\n", + " if [ -z \"$diskcheck\" ]; then", + "\n", + " mkdir /sharedscratch", + "\n", + " echo '", + { "Fn::GetAtt": [ "AlcesClusterwareMasterNode", "PrivateIp" ] }, + ":/sharedscratch /sharedscratch xfs defaults 0 0' >> etc/fstab ", + "\n", + " mount -a", + "\n", + " rm -rf /tmp/disk-format", + "\n", + " else", + "\n", + " echo \"No disk available\"", + "\n", + " path: /tmp/disk-format", + "\n", + " permissions: '0700'", + "\n", + "- content: |", + "\n", + " cluster:", + "\n", + " uuid: '11111111-2222-3333-444444444444'", + "\n", + " token: '1A0a1aaAA1aAAA/aaa1aAA=='", + "\n", + " name: ", + { + "Ref": "AWS::StackName" + }, + "\n", + " role: 
'slave'", + "\n", + " master: ", + { + "Fn::GetAtt": [ + "AlcesClusterwareMasterNode", + "PrivateIp" + ] + }, + "\n", + " tags:", + "\n", + " scheduler_roles: ':compute:'", + "\n", + " owner: root:root", + "\n", + " path: /opt/clusterware/etc/config.yml", + "\n", + " permissions: '0640'", + "\n", + "runcmd:", + "\n", + " - /tmp/disk-format" + ] + ] + } + } + } + }, + "PlacementGroup": { + "Type": "AWS::EC2::PlacementGroup", + "Properties": { + "Strategy": "cluster" + } + }, + "ComputeGroup": { + "Type": "AWS::AutoScaling::AutoScalingGroup", + "DependsOn": "AlcesClusterwareMasterNode", + "Properties": { + "DesiredCapacity": "32", + "LaunchConfigurationName": { + "Ref": "ComputeConfig" + }, + "PlacementGroup": { + "Ref": "PlacementGroup" + }, + "Tags": [ + { + "Key": "Name", + "Value": { "Fn::Join": [ "", [ {"Ref": "AWS::StackName"}, "-compute" ] ]}, + "PropagateAtLaunch": "true" + } + ], + "MinSize": "1", + "MaxSize": "100", + "VPCZoneIdentifier": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] + } + ] + } + }, + "ScaleUp10": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "10" + } + }, + "ScaleDown10": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "-10" + } + }, + "ScaleUp25": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "25" + } + }, + "ScaleDown25": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": 
"ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "-25" + } + }, + "CPUAlarmHigh10": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-up by 10% if currently queued jobs > 50 for 20 minutes", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "20", + "Threshold": "90", + "AlarmActions": [ + { + "Ref": "ScaleUp10" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "GreaterThanThreshold" + } + }, + "CPUAlarmLow10": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-down if CPU < 10% for 55 minutes", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "55", + "Threshold": "10", + "AlarmActions": [ + { + "Ref": "ScaleDown10" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "LessThanThreshold" + } + }, + "CPUAlarmHigh25": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-up by 25% if currently queued jobs > 100 for 20 minutes", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "20", + "Threshold": "100", + "AlarmActions": [ + { + "Ref": "ScaleUp25" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "GreaterThanThreshold" + } + } + }, + "Outputs": { + "AccessIP": { + "Value": { + "Fn::GetAtt": [ + "AlcesClusterwareMasterNode", + "PublicIp" + ] + } + } + } +} diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json new file mode 100644 index 0000000..cda5715 --- /dev/null +++ 
b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json @@ -0,0 +1,694 @@ +{ + "Description": "Launch an Alces HPC environment with a single SGE master node together with 8 initial compute nodes using EC2 spot.", + "Parameters": { + "KeyPair": { + "Description": "Choose an existing AWS key for administrator access", + "Type": "AWS::EC2::KeyPair::KeyName" + }, + "SubnetId": { + "Description": "Enter the ID of your existing subnet. If you wish to have one created for you, select None. The SecurityGroup field must also be set to None.", + "Type": "String", + "Default": "None" + }, + "SecurityGroup": { + "Description": "Enter the ID of your existing security group. If you wish to have one created for you, select None. The Subnet ID field must also be set to None.", + "Type": "String", + "Default": "None" + }, + "SpotPrice": { + "Description": "Enter your maximum bid per hour for each compute instance. View the Spot Request calculator for information on spot pricing.", + "Type": "String", + "Default": "0.50" + }, + "SystemDiskSize": { + "Description": "Select the size in GB of root system disk to use for all deployed nodes in the environment", + "Type": "String", + "Default": "500" + }, + "LoginType": { + "Description": "Select the login node instance type to deploy - this defines the number of cores and memory available", + "Type": "String", + "Default": "t2.large", + "AllowedValues": [ + "t2.large", + "c3.large", + "c4.large" + ] + }, + "NetworkCIDR": { + "Description": "Enter an address range that is permitted to access the Clusterware master node. Leave blank if unknown", + "Type": "String", + "Default": "0.0.0.0/0", + "AllowedPattern": "[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}/[0-9]{1,2}", + "ConstraintDescription": "Please specify a valid IP range, e.g. 
101.21.2.0/16" + } + }, + "Mappings": { + "AWSRegionArch2AMI": { + "eu-west-1": { + "centos7": "ami-3758e244" + } + }, + "LoginType2SystemDisk": { + "t2.large": { + "rootdisk": "8" + }, + "c3.large": { + "rootdisk": "100" + }, + "c4.large": { + "rootdisk": "100" + } + } + }, + "Conditions": { + "CreateNetwork": { + "Fn::Equals": [ + { + "Ref": "SubnetId" + }, + "None" + ] + } + }, + "Resources": { + "ClusterwareVPC": { + "Type": "AWS::EC2::VPC", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { + "Ref": "AWS::StackName" + } + } + ], + "CidrBlock": "10.75.0.0/16", + "EnableDnsSupport": "true", + "EnableDnsHostnames": "true" + }, + "Condition": "CreateNetwork" + }, + "ClusterwareRouteTable": { + "Type": "AWS::EC2::RouteTable", + "DependsOn": "ClusterwareVPC", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "Tags": [ + { + "Key": "Application", + "Value": { + "Ref": "AWS::StackId" + } + } + ] + }, + "Condition": "CreateNetwork" + }, + "ClusterwarePublicNet": { + "Type": "AWS::EC2::Subnet", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "CidrBlock": "10.75.0.0/24" + }, + "Condition": "CreateNetwork" + }, + "ClusterwareGateway": { + "Type": "AWS::EC2::InternetGateway", + "DependsOn": "ClusterwareVPC", + "Condition": "CreateNetwork" + }, + "ClusterwareAttachGW": { + "Type": "AWS::EC2::VPCGatewayAttachment", + "DependsOn": "ClusterwareVPC", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "InternetGatewayId": { + "Ref": "ClusterwareGateway" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareRoute": { + "Type": "AWS::EC2::Route", + "DependsOn": "ClusterwareAttachGW", + "Properties": { + "RouteTableId": { + "Ref": "ClusterwareRouteTable" + }, + "DestinationCidrBlock": "0.0.0.0/0", + "GatewayId": { + "Ref": "ClusterwareGateway" + } + }, + "Condition": "CreateNetwork" + }, + "SubnetToRouteTable": { + "Type": "AWS::EC2::SubnetRouteTableAssociation", + "DependsOn": "ClusterwareRouteTable", + "Properties": { + 
"SubnetId": { + "Ref": "ClusterwarePublicNet" + }, + "RouteTableId": { + "Ref": "ClusterwareRouteTable" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareNetACL": { + "Type": "AWS::EC2::NetworkAcl", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + } + }, + "Condition": "CreateNetwork" + }, + "InboundSSHACLEntry": { + "Type": "AWS::EC2::NetworkAclEntry", + "Properties": { + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + }, + "RuleNumber": "100", + "Protocol": "-1", + "RuleAction": "allow", + "Egress": "false", + "CidrBlock": { + "Ref": "NetworkCIDR" + }, + "PortRange": { + "From": "1", + "To": "65535" + } + }, + "Condition": "CreateNetwork" + }, + "OutboundACLEntry": { + "Type": "AWS::EC2::NetworkAclEntry", + "Properties": { + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + }, + "RuleNumber": "101", + "Protocol": "-1", + "RuleAction": "allow", + "Egress": "true", + "CidrBlock": "0.0.0.0/0", + "PortRange": { + "From": "1", + "To": "65535" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareSubnetLink": { + "Type": "AWS::EC2::SubnetNetworkAclAssociation", + "Properties": { + "SubnetId": { + "Ref": "ClusterwarePublicNet" + }, + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + } + }, + "Condition": "CreateNetwork" + }, + "AlcesClusterwareSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { + "Ref": "AWS::StackName" + } + } + ], + "GroupDescription": "Enable SSH access to the Alces Clusterware master node", + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "SecurityGroupIngress": [ + { + "IpProtocol": "tcp", + "FromPort": "22", + "ToPort": "22", + "CidrIp": { + "Ref": "NetworkCIDR" + } + }, + { + "IpProtocol": "-1", + "FromPort": "0", + "ToPort": "65535", + "CidrIp": { + "Ref": "NetworkCIDR" + } + } + ], + "SecurityGroupEgress": [ + { + "IpProtocol": "-1", + "FromPort": "0", + "ToPort": "65535", + "CidrIp": "0.0.0.0/0" + } + ] + }, + "Condition": "CreateNetwork" + }, + "MasterIAM": 
{ + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Statement": [ { + "Effect": "Allow", + "Principal": { + "Service": [ + "ec2.amazonaws.com" + ] + }, + "Action": [ + "sts:AssumeRole" + ] + }] + } + } + }, + "MasterIAMProfile": { + "Type": "AWS::IAM::InstanceProfile", + "Properties": { + "Path": "/", + "Roles": [ + { "Ref": "MasterIAM" + } + ] + } + }, + "MetricData": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyName": "MetricData", + "PolicyDocument": { + "Version": "2012-10-17", + "Statement": [ { + "Effect": "Allow", + "Action": [ + "cloudwatch:PutMetricData", + "autoscaling:DescribeAutoScalingGroups" + ], + "Resource": [ + "*" + ] + }] + }, + "Roles": [ { "Ref": "MasterIAM" } ] + } + }, + "AlcesClusterwareMasterNode": { + "Type": "AWS::EC2::Instance", + "DependsOn": "ClusterwarePublicNet", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { "Fn::Join": [ "", [ {"Ref": "AWS::StackName"}, "-login1" ] ]} + } + ], + "ImageId": { + "Fn::FindInMap": [ + "AWSRegionArch2AMI", + { + "Ref": "AWS::Region" + }, + "centos7" + ] + }, + "IamInstanceProfile": { + "Ref": "MasterIAMProfile" + }, + "NetworkInterfaces": [ + { + "AssociatePublicIpAddress": "True", + "DeviceIndex": "0", + "GroupSet": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] + } + ], + "SubnetId": { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] + } + } + ], + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/sda1", + "Ebs": { + "VolumeSize": { "Fn::FindInMap": [ "LoginType2SystemDisk", { "Ref": "LoginType" }, "rootdisk"] } + } + } + ], + "InstanceType": { "Ref": "LoginType" }, + "KeyName": { + "Ref": "KeyPair" + }, + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "#cloud-config", + "\n", + "hostname: login1", + "\n", + "write_files:", + "\n", + "- content: |", + "\n", + " #!/bin/bash -l", + "\n", + " 
computegroup=\"INSERTGROUP\"", + "\n", + " timestamp=$(date -u +'%Y-%m-%dT%T.000Z')", + "\n", + " queued=$(qstat -u \\* | awk 'NR > 2{print $5}' | wc -l)", + "\n", + " /opt/clusterware/opt/aws/bin/aws --region \"eu-west-1\" cloudwatch put-metric-data --metric-name Queued --namespace \"ALCES/SGE\" --dimensions \"AutoScalingGroupName=${computegroup}\" --value $queued --timestamp $timestamp", + "\n", + " path: /opt/alces-sge-cloudwatch.sh", + "\n", + " permissions: '0700'", + "\n", + "- content: |", + "\n", + " cluster:", + "\n", + " uuid: '11111111-2222-3333-444444444444'", + "\n", + " token: '1A0a1aaAA1aAAA/aaa1aAA=='", + "\n", + " name: ", + { + "Ref": "AWS::StackName" + }, + "\n", + " role: 'master'", + "\n", + " tags:", + "\n", + " scheduler_roles: ':master:'", + "\n", + " owner: root:root", + "\n", + " path: /opt/clusterware/etc/config.yml", + "\n", + " permissions: '0640'", + "\n", + "runcmd:", + "\n", + " - echo \"5 * * * * alces /opt/alces-sge-cloudwatch.sh\" >> /etc/crontab" + ] + ] + } + } + } + }, + "ComputeConfig": { + "Type": "AWS::AutoScaling::LaunchConfiguration", + "Properties": { + "AssociatePublicIpAddress": "True", + "KeyName": { + "Ref": "KeyPair" + }, + "ImageId": { + "Fn::FindInMap": [ + "AWSRegionArch2AMI", + { + "Ref": "AWS::Region" + }, + "centos7" + ] + }, + "InstanceType": "c4.large", + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/sda1", + "Ebs": { + "VolumeSize": { "Ref": "SystemDiskSize" } + } + } + ], + "SpotPrice": { + "Ref": "SpotPrice" + }, + "SecurityGroups": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] + } + ], + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "#cloud-config", + "\n", + "write_files:", + "\n", + "- content: |", + "\n", + " cluster:", + "\n", + " uuid: '11111111-2222-3333-444444444444'", + "\n", + " token: '1A0a1aaAA1aAAA/aaa1aAA=='", + "\n", + " name: ", + { + "Ref": "AWS::StackName" + }, + "\n", + " role: 'slave'", +
"\n", + " master: ", + { + "Fn::GetAtt": [ + "AlcesClusterwareMasterNode", + "PrivateIp" + ] + }, + "\n", + " tags:", + "\n", + " scheduler_roles: ':compute:'", + "\n", + " owner: root:root", + "\n", + " path: /opt/clusterware/etc/config.yml", + "\n", + " permissions: '0640'", + "\n" + ] + ] + } + } + } + }, + "ComputeGroup": { + "Type": "AWS::AutoScaling::AutoScalingGroup", + "Properties": { + "DesiredCapacity": "8", + "LaunchConfigurationName": { + "Ref": "ComputeConfig" + }, + "Tags": [ + { + "Key": "Name", + "Value": { "Fn::Join": [ "", [ {"Ref": "AWS::StackName"}, "-compute" ] ]}, + "PropagateAtLaunch": "true" + } + ], + "MinSize": "1", + "MaxSize": "100", + "VPCZoneIdentifier": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] + } + ] + } + }, + "ScaleUp10": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "10" + } + }, + "ScaleDown10": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "-10" + } + }, + "ScaleUp25": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "25" + } + }, + "ScaleDown25": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "-25" + } + }, + "CPUAlarmHigh10": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-up by 10% if currently queued jobs > 50 for 20 minutes", + "MetricName": "Queued", + 
"Namespace": "ALCES/SGE", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "20", + "Threshold": "90", + "AlarmActions": [ + { + "Ref": "ScaleUp10" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "GreaterThanThreshold" + } + }, + "CPUAlarmLow10": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-down if CPU < 10% for 55 minutes", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "55", + "Threshold": "10", + "AlarmActions": [ + { + "Ref": "ScaleDown10" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "LessThanThreshold" + } + }, + "CPUAlarmHigh25": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-up by 25% if currently queued jobs > 100 for 20 minutes", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "20", + "Threshold": "100", + "AlarmActions": [ + { + "Ref": "ScaleUp25" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "GreaterThanThreshold" + } + } + }, + "Outputs": { + "AccessIP": { + "Value": { + "Fn::GetAtt": [ + "AlcesClusterwareMasterNode", + "PublicIp" + ] + } + } + } +} diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/hpc-cluster.json b/aws-cloudformation/aws-tools/templates/all-in-one/hpc-cluster.json index 3a3c6cd..c682b20 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/hpc-cluster.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/hpc-cluster.json @@ -11,6 +11,16 @@ "AllowedPattern": "[0-9]", "Default": "3" }, + "SubnetId": { + "Description": "Enter the ID of your existing subnet. 
If you wish to have one created for you, select None. The SecurityGroup field must also be set to None.", + "Type": "String", + "Default": "None" + }, + "SecurityGroup": { + "Description": "Enter the ID of your existing security group. If you wish to have one created for you, select None. The Subnet ID field must also be set to None.", + "Type": "String", + "Default": "None" + }, "InstanceFlavour": { "Description": "Select the compute node instance flavour", "Type": "String", @@ -48,6 +58,16 @@ } } }, + "Conditions": { + "CreateNetwork": { + "Fn::Equals": [ + { + "Ref": "SubnetId" + }, + "None" + ] + } + }, "Resources": { "ClusterwareVPC": { "Type": "AWS::EC2::VPC", @@ -63,7 +83,8 @@ "CidrBlock": "10.75.0.0/16", "EnableDnsSupport": "true", "EnableDnsHostnames": "true" - } + }, + "Condition": "CreateNetwork" }, "ClusterwareRouteTable": { "Type": "AWS::EC2::RouteTable", @@ -80,7 +101,8 @@ } } ] - } + }, + "Condition": "CreateNetwork" }, "ClusterwarePublicNet": { "Type": "AWS::EC2::Subnet", @@ -89,11 +111,13 @@ "Ref": "ClusterwareVPC" }, "CidrBlock": "10.75.0.0/24" - } + }, + "Condition": "CreateNetwork" }, "ClusterwareGateway": { "Type": "AWS::EC2::InternetGateway", - "DependsOn": "ClusterwareVPC" + "DependsOn": "ClusterwareVPC", + "Condition": "CreateNetwork" }, "ClusterwareAttachGW": { "Type": "AWS::EC2::VPCGatewayAttachment", @@ -105,7 +129,8 @@ "InternetGatewayId": { "Ref": "ClusterwareGateway" } - } + }, + "Condition": "CreateNetwork" }, "ClusterwareRoute": { "Type": "AWS::EC2::Route", @@ -118,7 +143,8 @@ "GatewayId": { "Ref": "ClusterwareGateway" } - } + }, + "Condition": "CreateNetwork" }, "SubnetToRouteTable": { "Type": "AWS::EC2::SubnetRouteTableAssociation", @@ -130,7 +156,8 @@ "RouteTableId": { "Ref": "ClusterwareRouteTable" } - } + }, + "Condition": "CreateNetwork" }, "ClusterwareNetACL": { "Type": "AWS::EC2::NetworkAcl", @@ -138,7 +165,8 @@ "VpcId": { "Ref": "ClusterwareVPC" } - } + }, + "Condition": "CreateNetwork" }, "InboundSSHACLEntry": { "Type": 
"AWS::EC2::NetworkAclEntry", @@ -157,7 +185,8 @@ "From": "1", "To": "65535" } - } + }, + "Condition": "CreateNetwork" }, "OutboundACLEntry": { "Type": "AWS::EC2::NetworkAclEntry", @@ -174,7 +203,8 @@ "From": "1", "To": "65535" } - } + }, + "Condition": "CreateNetwork" }, "ClusterwareSubnetLink": { "Type": "AWS::EC2::SubnetNetworkAclAssociation", @@ -185,7 +215,8 @@ "NetworkAclId": { "Ref": "ClusterwareNetACL" } - } + }, + "Condition": "CreateNetwork" }, "AlcesClusterwareSecurityGroup": { "Type": "AWS::EC2::SecurityGroup", @@ -228,11 +259,11 @@ "CidrIp": "0.0.0.0/0" } ] - } + }, + "Condition": "CreateNetwork" }, "AlcesClusterwareMasterNode": { "Type": "AWS::EC2::Instance", - "DependsOn": "ClusterwareAttachGW", "Properties": { "Tags": [ { @@ -255,11 +286,27 @@ "DeviceIndex": "0", "GroupSet": [ { - "Ref": "AlcesClusterwareSecurityGroup" + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] } ], "SubnetId": { - "Ref": "ClusterwarePublicNet" + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] } } ], @@ -363,7 +410,15 @@ }, "SecurityGroups": [ { - "Ref": "AlcesClusterwareSecurityGroup" + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] } ], "UserData": { @@ -430,7 +485,15 @@ "MaxSize": "100", "VPCZoneIdentifier": [ { - "Ref": "ClusterwarePublicNet" + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] } ] }