From ef09456a6ca5118c3c782003643eff294c33f36f Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Thu, 24 Mar 2016 15:01:47 +0000 Subject: [PATCH 01/18] Initial template for https://trello.com/c/mpDtkIqc - User specifies their own SG/Subnet - If none specified, full network is created --- .../templates/all-in-one/hpc-cluster.json | 108 ++++++++++++++---- 1 file changed, 86 insertions(+), 22 deletions(-) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/hpc-cluster.json b/aws-cloudformation/aws-tools/templates/all-in-one/hpc-cluster.json index 3a3c6cd..fab6568 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/hpc-cluster.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/hpc-cluster.json @@ -11,6 +11,16 @@ "AllowedPattern": "[0-9]", "Default": "3" }, + "SubnetId": { + "Description": "Enter the ID of your existing subnet. If you wish to have one created for you, select None. The SecurityGroup field must also be set to None.", + "Type": "String", + "Default": "None" + }, + "SecurityGroup": { + "Description": "Enter the ID of your existing security group. If you wish to have one created for you, select None. 
The Subnet ID field must also be set to None.", + "Type": "String", + "Default": "None" + }, "InstanceFlavour": { "Description": "Select the compute node instance flavour", "Type": "String", @@ -48,6 +58,16 @@ } } }, + "Conditions": { + "CreateNetwork": { + "Fn::Equals" : [ + { + "Ref" : "SubnetId" + }, + "None" + ] + } + }, "Resources": { "ClusterwareVPC": { "Type": "AWS::EC2::VPC", @@ -63,7 +83,8 @@ "CidrBlock": "10.75.0.0/16", "EnableDnsSupport": "true", "EnableDnsHostnames": "true" - } + }, + "Condition": "CreateNetwork" }, "ClusterwareRouteTable": { "Type": "AWS::EC2::RouteTable", @@ -80,7 +101,8 @@ } } ] - } + }, + "Condition": "CreateNetwork" }, "ClusterwarePublicNet": { "Type": "AWS::EC2::Subnet", @@ -89,11 +111,13 @@ "Ref": "ClusterwareVPC" }, "CidrBlock": "10.75.0.0/24" - } + }, + "Condition": "CreateNetwork" }, "ClusterwareGateway": { "Type": "AWS::EC2::InternetGateway", - "DependsOn": "ClusterwareVPC" + "DependsOn": "ClusterwareVPC", + "Condition": "CreateNetwork" }, "ClusterwareAttachGW": { "Type": "AWS::EC2::VPCGatewayAttachment", @@ -105,7 +129,8 @@ "InternetGatewayId": { "Ref": "ClusterwareGateway" } - } + }, + "Condition": "CreateNetwork" }, "ClusterwareRoute": { "Type": "AWS::EC2::Route", @@ -118,7 +143,8 @@ "GatewayId": { "Ref": "ClusterwareGateway" } - } + }, + "Condition": "CreateNetwork" }, "SubnetToRouteTable": { "Type": "AWS::EC2::SubnetRouteTableAssociation", @@ -130,7 +156,8 @@ "RouteTableId": { "Ref": "ClusterwareRouteTable" } - } + }, + "Condition": "CreateNetwork" }, "ClusterwareNetACL": { "Type": "AWS::EC2::NetworkAcl", @@ -138,7 +165,8 @@ "VpcId": { "Ref": "ClusterwareVPC" } - } + }, + "Condition": "CreateNetwork" }, "InboundSSHACLEntry": { "Type": "AWS::EC2::NetworkAclEntry", @@ -157,7 +185,8 @@ "From": "1", "To": "65535" } - } + }, + "Condition": "CreateNetwork" }, "OutboundACLEntry": { "Type": "AWS::EC2::NetworkAclEntry", @@ -174,7 +203,8 @@ "From": "1", "To": "65535" } - } + }, + "Condition": "CreateNetwork" }, 
"ClusterwareSubnetLink": { "Type": "AWS::EC2::SubnetNetworkAclAssociation", @@ -185,7 +215,8 @@ "NetworkAclId": { "Ref": "ClusterwareNetACL" } - } + }, + "Condition": "CreateNetwork" }, "AlcesClusterwareSecurityGroup": { "Type": "AWS::EC2::SecurityGroup", @@ -228,11 +259,11 @@ "CidrIp": "0.0.0.0/0" } ] - } + }, + "Condition": "CreateNetwork" }, "AlcesClusterwareMasterNode": { "Type": "AWS::EC2::Instance", - "DependsOn": "ClusterwareAttachGW", "Properties": { "Tags": [ { @@ -254,14 +285,31 @@ "AssociatePublicIpAddress": "True", "DeviceIndex": "0", "GroupSet": [ - { + { + "Fn::If": [ + "CreateNetwork", + { "Ref": "AlcesClusterwareSecurityGroup" - } + }, + { + "Ref": "SecurityGroup" + } + ] + } ], - "SubnetId": { - "Ref": "ClusterwarePublicNet" + "SubnetId": + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] + } } - } ], "BlockDeviceMappings": [ { @@ -362,9 +410,17 @@ "Ref": "SpotPrice" }, "SecurityGroups": [ - { + { + "Fn::If": [ + "CreateNetwork", + { "Ref": "AlcesClusterwareSecurityGroup" - } + }, + { + "Ref": "SecurityGroup" + } + ] + } ], "UserData": { "Fn::Base64": { @@ -429,9 +485,17 @@ "MinSize": "1", "MaxSize": "100", "VPCZoneIdentifier": [ - { + { + "Fn::If": [ + "CreateNetwork", + { "Ref": "ClusterwarePublicNet" - } + }, + { + "Ref": "SubnetId" + } + ] + } ] } }, From 6b9ae3e5ab5475e29a94006fdf3363c8b7965431 Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Thu, 24 Mar 2016 15:04:49 +0000 Subject: [PATCH 02/18] Tidy template formatting and spacing --- .../templates/all-in-one/hpc-cluster.json | 127 +++++++++--------- 1 file changed, 63 insertions(+), 64 deletions(-) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/hpc-cluster.json b/aws-cloudformation/aws-tools/templates/all-in-one/hpc-cluster.json index fab6568..c682b20 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/hpc-cluster.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/hpc-cluster.json @@ -12,12 
+12,12 @@ "Default": "3" }, "SubnetId": { - "Description": "Enter the ID of your existing subnet. If you wish to have one created for you, select None. The SecurityGroup field must also be set to None.", + "Description": "Enter the ID of your existing subnet. If you wish to have one created for you, select None. The SecurityGroup field must also be set to None.", "Type": "String", "Default": "None" }, - "SecurityGroup": { - "Description": "Enter the ID of your existing security group. If you wish to have one created for you, select None. The Subnet ID field must also be set to None.", + "SecurityGroup": { + "Description": "Enter the ID of your existing security group. If you wish to have one created for you, select None. The Subnet ID field must also be set to None.", "Type": "String", "Default": "None" }, @@ -60,13 +60,13 @@ }, "Conditions": { "CreateNetwork": { - "Fn::Equals" : [ - { - "Ref" : "SubnetId" - }, - "None" - ] - } + "Fn::Equals": [ + { + "Ref": "SubnetId" + }, + "None" + ] + } }, "Resources": { "ClusterwareVPC": { @@ -102,7 +102,7 @@ } ] }, - "Condition": "CreateNetwork" + "Condition": "CreateNetwork" }, "ClusterwarePublicNet": { "Type": "AWS::EC2::Subnet", @@ -112,12 +112,12 @@ }, "CidrBlock": "10.75.0.0/24" }, - "Condition": "CreateNetwork" + "Condition": "CreateNetwork" }, "ClusterwareGateway": { "Type": "AWS::EC2::InternetGateway", "DependsOn": "ClusterwareVPC", - "Condition": "CreateNetwork" + "Condition": "CreateNetwork" }, "ClusterwareAttachGW": { "Type": "AWS::EC2::VPCGatewayAttachment", @@ -130,7 +130,7 @@ "Ref": "ClusterwareGateway" } }, - "Condition": "CreateNetwork" + "Condition": "CreateNetwork" }, "ClusterwareRoute": { "Type": "AWS::EC2::Route", @@ -144,7 +144,7 @@ "Ref": "ClusterwareGateway" } }, - "Condition": "CreateNetwork" + "Condition": "CreateNetwork" }, "SubnetToRouteTable": { "Type": "AWS::EC2::SubnetRouteTableAssociation", @@ -157,7 +157,7 @@ "Ref": "ClusterwareRouteTable" } }, - "Condition": "CreateNetwork" + "Condition": 
"CreateNetwork" }, "ClusterwareNetACL": { "Type": "AWS::EC2::NetworkAcl", @@ -166,7 +166,7 @@ "Ref": "ClusterwareVPC" } }, - "Condition": "CreateNetwork" + "Condition": "CreateNetwork" }, "InboundSSHACLEntry": { "Type": "AWS::EC2::NetworkAclEntry", @@ -186,7 +186,7 @@ "To": "65535" } }, - "Condition": "CreateNetwork" + "Condition": "CreateNetwork" }, "OutboundACLEntry": { "Type": "AWS::EC2::NetworkAclEntry", @@ -204,7 +204,7 @@ "To": "65535" } }, - "Condition": "CreateNetwork" + "Condition": "CreateNetwork" }, "ClusterwareSubnetLink": { "Type": "AWS::EC2::SubnetNetworkAclAssociation", @@ -216,7 +216,7 @@ "Ref": "ClusterwareNetACL" } }, - "Condition": "CreateNetwork" + "Condition": "CreateNetwork" }, "AlcesClusterwareSecurityGroup": { "Type": "AWS::EC2::SecurityGroup", @@ -260,7 +260,7 @@ } ] }, - "Condition": "CreateNetwork" + "Condition": "CreateNetwork" }, "AlcesClusterwareMasterNode": { "Type": "AWS::EC2::Instance", @@ -285,31 +285,30 @@ "AssociatePublicIpAddress": "True", "DeviceIndex": "0", "GroupSet": [ - { - "Fn::If": [ - "CreateNetwork", - { - "Ref": "AlcesClusterwareSecurityGroup" - }, - { - "Ref": "SecurityGroup" - } - ] - } + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] + } ], - "SubnetId": - { + "SubnetId": { "Fn::If": [ - "CreateNetwork", - { - "Ref": "ClusterwarePublicNet" - }, - { - "Ref": "SubnetId" - } + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } ] - } } + } ], "BlockDeviceMappings": [ { @@ -410,17 +409,17 @@ "Ref": "SpotPrice" }, "SecurityGroups": [ - { - "Fn::If": [ - "CreateNetwork", - { - "Ref": "AlcesClusterwareSecurityGroup" - }, - { - "Ref": "SecurityGroup" - } - ] - } + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] + } ], "UserData": { "Fn::Base64": { @@ -485,17 +484,17 @@ "MinSize": "1", "MaxSize": "100", "VPCZoneIdentifier": [ - { - "Fn::If": [ - 
"CreateNetwork", - { - "Ref": "ClusterwarePublicNet" - }, - { - "Ref": "SubnetId" - } - ] - } + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] + } ] } }, From 438fd2a008b3837ddaab15016a9545dd48d339dd Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Thu, 24 Mar 2016 15:48:37 +0000 Subject: [PATCH 03/18] Base template for https://trello.com/c/lLy4QpQE --- .../templates/all-in-one/compute/8-node.json | 552 ++++++++++++++++++ 1 file changed, 552 insertions(+) create mode 100644 aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json new file mode 100644 index 0000000..2ea0d84 --- /dev/null +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json @@ -0,0 +1,552 @@ +{ + "Description": "Launch an Alces HPC environment with a single SGE master node together with 8 initial compute nodes using EC2 spot.", + "Parameters": { + "KeyPair": { + "Description": "Choose an existing AWS key for administrator access", + "Type": "AWS::EC2::KeyPair::KeyName" + }, + "SubnetId": { + "Description": "Enter the ID of your existing subnet. If you wish to have one created for you, select None. The SecurityGroup field must also be set to None.", + "Type": "String", + "Default": "None" + }, + "SecurityGroup": { + "Description": "Enter the ID of your existing security group. If you wish to have one created for you, select None. The Subnet ID field must also be set to None.", + "Type": "String", + "Default": "None" + }, + "SpotPrice": { + "Description": "Enter your maximum bid per hour for each compute instance. View the Spot Request calculator for information on spot pricing.", + "Type": "String", + "Default": "0.50" + }, + "NetworkCIDR": { + "Description": "Enter an address range that is permitted to access the Clusterware master node. 
Leave the default (0.0.0.0/0) to allow access from any address", + "Type": "String", + "Default": "0.0.0.0/0", + "AllowedPattern": "[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}/[0-9]{1,2}", + "ConstraintDescription": "Please specify a valid IP range, e.g. 101.21.2.0/16" + } + }, + "Mappings": { + "AWSRegionArch2AMI": { + "eu-west-1": { + "centos7": "ami-3758e244" + } + } + }, + "Conditions": { + "CreateNetwork": { + "Fn::Equals": [ + { + "Ref": "SubnetId" + }, + "None" + ] + } + }, + "Resources": { + "ClusterwareVPC": { + "Type": "AWS::EC2::VPC", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { + "Ref": "AWS::StackName" + } + } + ], + "CidrBlock": "10.75.0.0/16", + "EnableDnsSupport": "true", + "EnableDnsHostnames": "true" + }, + "Condition": "CreateNetwork" + }, + "ClusterwareRouteTable": { + "Type": "AWS::EC2::RouteTable", + "DependsOn": "ClusterwareVPC", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "Tags": [ + { + "Key": "Application", + "Value": { + "Ref": "AWS::StackId" + } + } + ] + }, + "Condition": "CreateNetwork" + }, + "ClusterwarePublicNet": { + "Type": "AWS::EC2::Subnet", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "CidrBlock": "10.75.0.0/24" + }, + "Condition": "CreateNetwork" + }, + "ClusterwareGateway": { + "Type": "AWS::EC2::InternetGateway", + "DependsOn": "ClusterwareVPC", + "Condition": "CreateNetwork" + }, + "ClusterwareAttachGW": { + "Type": "AWS::EC2::VPCGatewayAttachment", + "DependsOn": "ClusterwareVPC", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "InternetGatewayId": { + "Ref": "ClusterwareGateway" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareRoute": { + "Type": "AWS::EC2::Route", + "DependsOn": "ClusterwareAttachGW", + "Properties": { + "RouteTableId": { + "Ref": "ClusterwareRouteTable" + }, + "DestinationCidrBlock": "0.0.0.0/0", + "GatewayId": { + "Ref": "ClusterwareGateway" + } + }, + "Condition": "CreateNetwork" + }, + "SubnetToRouteTable": { + "Type": 
"AWS::EC2::SubnetRouteTableAssociation", + "DependsOn": "ClusterwareRouteTable", + "Properties": { + "SubnetId": { + "Ref": "ClusterwarePublicNet" + }, + "RouteTableId": { + "Ref": "ClusterwareRouteTable" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareNetACL": { + "Type": "AWS::EC2::NetworkAcl", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + } + }, + "Condition": "CreateNetwork" + }, + "InboundSSHACLEntry": { + "Type": "AWS::EC2::NetworkAclEntry", + "Properties": { + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + }, + "RuleNumber": "100", + "Protocol": "-1", + "RuleAction": "allow", + "Egress": "false", + "CidrBlock": { + "Ref": "NetworkCIDR" + }, + "PortRange": { + "From": "1", + "To": "65535" + } + }, + "Condition": "CreateNetwork" + }, + "OutboundACLEntry": { + "Type": "AWS::EC2::NetworkAclEntry", + "Properties": { + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + }, + "RuleNumber": "101", + "Protocol": "-1", + "RuleAction": "allow", + "Egress": "true", + "CidrBlock": "0.0.0.0/0", + "PortRange": { + "From": "1", + "To": "65535" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareSubnetLink": { + "Type": "AWS::EC2::SubnetNetworkAclAssociation", + "Properties": { + "SubnetId": { + "Ref": "ClusterwarePublicNet" + }, + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + } + }, + "Condition": "CreateNetwork" + }, + "AlcesClusterwareSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { + "Ref": "AWS::StackName" + } + } + ], + "GroupDescription": "Enable SSH access to the Alces Clusterware master node", + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "SecurityGroupIngress": [ + { + "IpProtocol": "tcp", + "FromPort": "22", + "ToPort": "22", + "CidrIp": { + "Ref": "NetworkCIDR" + } + }, + { + "IpProtocol": "-1", + "FromPort": "0", + "ToPort": "65535", + "CidrIp": { + "Ref": "NetworkCIDR" + } + } + ], + "SecurityGroupEgress": [ + { + "IpProtocol": "-1", + "FromPort": "0", + 
"ToPort": "65535", + "CidrIp": "0.0.0.0/0" + } + ] + }, + "Condition": "CreateNetwork" + }, + "AlcesClusterwareMasterNode": { + "Type": "AWS::EC2::Instance", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { "Fn::Join": [ "", [ {"Ref": "AWS::StackName"}, "-login1" ] ]} + } + ], + "ImageId": { + "Fn::FindInMap": [ + "AWSRegionArch2AMI", + { + "Ref": "AWS::Region" + }, + "centos7" + ] + }, + "NetworkInterfaces": [ + { + "AssociatePublicIpAddress": "True", + "DeviceIndex": "0", + "GroupSet": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] + } + ], + "SubnetId": { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] + } + } + ], + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/sda1", + "Ebs": { + "VolumeSize": "200" + } + } + ], + "InstanceType": "c4.large", + "KeyName": { + "Ref": "KeyPair" + }, + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "#cloud-config", + "\n", + "hostname: login1", + "\n", + "write_files:", + "\n", + "- content: |", + "\n", + " cluster:", + "\n", + " uuid: '11111111-2222-3333-444444444444'", + "\n", + " token: '1A0a1aaAA1aAAA/aaa1aAA=='", + "\n", + " name: ", + { + "Ref": "AWS::StackName" + }, + "\n", + " role: 'master'", + "\n", + " tags:", + "\n", + " scheduler_roles: ':master:'", + "\n", + " owner: root:root", + "\n", + " path: /opt/clusterware/etc/config.yml", + "\n", + " permissions: '0640'", + "\n" + ] + ] + } + } + } + }, + "ComputeConfig": { + "Type": "AWS::AutoScaling::LaunchConfiguration", + "Properties": { + "AssociatePublicIpAddress": "True", + "KeyName": { + "Ref": "KeyPair" + }, + "ImageId": { + "Fn::FindInMap": [ + "AWSRegionArch2AMI", + { + "Ref": "AWS::Region" + }, + "centos7" + ] + }, + "InstanceType": "c4.large", + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/sda1", + "Ebs": { + "VolumeSize": "200" + } + } + ], + "SpotPrice": { + "Ref": "SpotPrice" + }, + 
"SecurityGroups": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] + } + ], + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "#cloud-config", + "\n", + "write_files:", + "\n", + "- content: |", + "\n", + " cluster:", + "\n", + " uuid: '11111111-2222-3333-444444444444'", + "\n", + " token: '1A0a1aaAA1aAAA/aaa1aAA=='", + "\n", + " name: ", + { + "Ref": "AWS::StackName" + }, + "\n", + " role: 'slave'", + "\n", + " master: ", + { + "Fn::GetAtt": [ + "AlcesClusterwareMasterNode", + "PrivateIp" + ] + }, + "\n", + " tags:", + "\n", + " scheduler_roles: ':compute:'", + "\n", + " quorum: 3", + "\n", + " owner: root:root", + "\n", + " path: /opt/clusterware/etc/config.yml", + "\n", + " permissions: '0640'", + "\n" + ] + ] + } + } + } + }, + "ComputeGroup": { + "Type": "AWS::AutoScaling::AutoScalingGroup", + "DependsOn": "AlcesClusterwareMasterNode", + "Properties": { + "DesiredCapacity": "8", + "LaunchConfigurationName": { + "Ref": "ComputeConfig" + }, + "Tags": [ + { + "Key": "Name", + "Value": { "Fn::Join": [ "", [ {"Ref": "AWS::StackName"}, "-compute" ] ]}, + "PropagateAtLaunch": "true" + } + ], + "MinSize": "1", + "MaxSize": "100", + "VPCZoneIdentifier": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] + } + ] + } + }, + "ScaleUp": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "ChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "1" + } + }, + "ScaleDown": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "ChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "-1" + } + }, + "CPUAlarmHigh": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-up if CPU > 90% for 20 minutes", + 
"MetricName": "CPUUtilization", + "Namespace": "AWS/EC2", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "20", + "Threshold": "90", + "AlarmActions": [ + { + "Ref": "ScaleUp" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "GreaterThanThreshold" + } + }, + "CPUAlarmLow": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-down if CPU < 10% for 55 minutes", + "MetricName": "CPUUtilization", + "Namespace": "AWS/EC2", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "55", + "Threshold": "10", + "AlarmActions": [ + { + "Ref": "ScaleDown" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "LessThanThreshold" + } + } + }, + "Outputs": { + "AccessIP": { + "Value": { + "Fn::GetAtt": [ + "AlcesClusterwareMasterNode", + "PublicIp" + ] + } + } + } +} From e8eaff2795d3fb2fa042f3a707dcb925ee684f32 Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Thu, 24 Mar 2016 22:44:06 +0000 Subject: [PATCH 04/18] Initial 16-node template for https://trello.com/c/lLy4QpQE --- .../templates/all-in-one/compute/16-node.json | 555 ++++++++++++++++++ 1 file changed, 555 insertions(+) create mode 100644 aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json new file mode 100644 index 0000000..5232230 --- /dev/null +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json @@ -0,0 +1,555 @@ +{ + "Description": "Launch an Alces HPC environment with a single SGE master node together with 16 initial compute nodes using EC2 spot.", + "Parameters": { + "KeyPair": { + "Description": "Choose an existing AWS key for administrator access", + "Type": 
"AWS::EC2::KeyPair::KeyName" + }, + "SubnetId": { + "Description": "Enter the ID of your existing subnet. If you wish to have one created for you, select None. The SecurityGroup field must also be set to None.", + "Type": "String", + "Default": "None" + }, + "SecurityGroup": { + "Description": "Enter the ID of your existing security group. If you wish to have one created for you, select None. The Subnet ID field must also be set to None.", + "Type": "String", + "Default": "None" + }, + "SpotPrice": { + "Description": "Enter your maximum bid per hour for each compute instance. View the Spot Request calculator for information on spot pricing.", + "Type": "String", + "Default": "0.50" + }, + "NetworkCIDR": { + "Description": "Enter an address range that is permitted to access the Clusterware master node. Leave the default (0.0.0.0/0) to allow access from any address", + "Type": "String", + "Default": "0.0.0.0/0", + "AllowedPattern": "[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}/[0-9]{1,2}", + "ConstraintDescription": "Please specify a valid IP range, e.g. 
101.21.2.0/16" + } + }, + "Mappings": { + "AWSRegionArch2AMI": { + "eu-west-1": { + "centos7": "ami-3758e244" + } + } + }, + "Conditions": { + "CreateNetwork": { + "Fn::Equals": [ + { + "Ref": "SubnetId" + }, + "None" + ] + } + }, + "Resources": { + "ClusterwareVPC": { + "Type": "AWS::EC2::VPC", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { + "Ref": "AWS::StackName" + } + } + ], + "CidrBlock": "10.75.0.0/16", + "EnableDnsSupport": "true", + "EnableDnsHostnames": "true" + }, + "Condition": "CreateNetwork" + }, + "ClusterwareRouteTable": { + "Type": "AWS::EC2::RouteTable", + "DependsOn": "ClusterwareVPC", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "Tags": [ + { + "Key": "Application", + "Value": { + "Ref": "AWS::StackId" + } + } + ] + }, + "Condition": "CreateNetwork" + }, + "ClusterwarePublicNet": { + "Type": "AWS::EC2::Subnet", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "CidrBlock": "10.75.0.0/24" + }, + "Condition": "CreateNetwork" + }, + "ClusterwareGateway": { + "Type": "AWS::EC2::InternetGateway", + "DependsOn": "ClusterwareVPC", + "Condition": "CreateNetwork" + }, + "ClusterwareAttachGW": { + "Type": "AWS::EC2::VPCGatewayAttachment", + "DependsOn": "ClusterwareVPC", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "InternetGatewayId": { + "Ref": "ClusterwareGateway" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareRoute": { + "Type": "AWS::EC2::Route", + "DependsOn": "ClusterwareAttachGW", + "Properties": { + "RouteTableId": { + "Ref": "ClusterwareRouteTable" + }, + "DestinationCidrBlock": "0.0.0.0/0", + "GatewayId": { + "Ref": "ClusterwareGateway" + } + }, + "Condition": "CreateNetwork" + }, + "SubnetToRouteTable": { + "Type": "AWS::EC2::SubnetRouteTableAssociation", + "DependsOn": "ClusterwareRouteTable", + "Properties": { + "SubnetId": { + "Ref": "ClusterwarePublicNet" + }, + "RouteTableId": { + "Ref": "ClusterwareRouteTable" + } + }, + "Condition": "CreateNetwork" + }, + 
"ClusterwareNetACL": { + "Type": "AWS::EC2::NetworkAcl", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + } + }, + "Condition": "CreateNetwork" + }, + "InboundSSHACLEntry": { + "Type": "AWS::EC2::NetworkAclEntry", + "Properties": { + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + }, + "RuleNumber": "100", + "Protocol": "-1", + "RuleAction": "allow", + "Egress": "false", + "CidrBlock": { + "Ref": "NetworkCIDR" + }, + "PortRange": { + "From": "1", + "To": "65535" + } + }, + "Condition": "CreateNetwork" + }, + "OutboundACLEntry": { + "Type": "AWS::EC2::NetworkAclEntry", + "Properties": { + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + }, + "RuleNumber": "101", + "Protocol": "-1", + "RuleAction": "allow", + "Egress": "true", + "CidrBlock": "0.0.0.0/0", + "PortRange": { + "From": "1", + "To": "65535" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareSubnetLink": { + "Type": "AWS::EC2::SubnetNetworkAclAssociation", + "Properties": { + "SubnetId": { + "Ref": "ClusterwarePublicNet" + }, + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + } + }, + "Condition": "CreateNetwork" + }, + "AlcesClusterwareSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { + "Ref": "AWS::StackName" + } + } + ], + "GroupDescription": "Enable SSH access to the Alces Clusterware master node", + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "SecurityGroupIngress": [ + { + "IpProtocol": "tcp", + "FromPort": "22", + "ToPort": "22", + "CidrIp": { + "Ref": "NetworkCIDR" + } + }, + { + "IpProtocol": "-1", + "FromPort": "0", + "ToPort": "65535", + "CidrIp": { + "Ref": "NetworkCIDR" + } + } + ], + "SecurityGroupEgress": [ + { + "IpProtocol": "-1", + "FromPort": "0", + "ToPort": "65535", + "CidrIp": "0.0.0.0/0" + } + ] + }, + "Condition": "CreateNetwork" + }, + "AlcesClusterwareMasterNode": { + "Type": "AWS::EC2::Instance", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { "Fn::Join": [ "", [ {"Ref": 
"AWS::StackName"}, "-login1" ] ]} + } + ], + "ImageId": { + "Fn::FindInMap": [ + "AWSRegionArch2AMI", + { + "Ref": "AWS::Region" + }, + "centos7" + ] + }, + "NetworkInterfaces": [ + { + "AssociatePublicIpAddress": "True", + "DeviceIndex": "0", + "GroupSet": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] + } + ], + "SubnetId": { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] + } + } + ], + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/sda1", + "Ebs": { + "VolumeSize": "1000" + } + } + ], + "InstanceType": "c4.large", + "KeyName": { + "Ref": "KeyPair" + }, + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "#cloud-config", + "\n", + "hostname: login1", + "\n", + "write_files:", + "\n", + "- content: |", + "\n", + " cluster:", + "\n", + " uuid: '11111111-2222-3333-444444444444'", + "\n", + " token: '1A0a1aaAA1aAAA/aaa1aAA=='", + "\n", + " name: ", + { + "Ref": "AWS::StackName" + }, + "\n", + " role: 'master'", + "\n", + " tags:", + "\n", + " scheduler_roles: ':master:'", + "\n", + " owner: root:root", + "\n", + " path: /opt/clusterware/etc/config.yml", + "\n", + " permissions: '0640'", + "\n" + ] + ] + } + } + } + }, + "ComputeConfig": { + "Type": "AWS::AutoScaling::LaunchConfiguration", + "Properties": { + "AssociatePublicIpAddress": "True", + "KeyName": { + "Ref": "KeyPair" + }, + "ImageId": { + "Fn::FindInMap": [ + "AWSRegionArch2AMI", + { + "Ref": "AWS::Region" + }, + "centos7" + ] + }, + "InstanceType": "c4.8xlarge", + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/sda1", + "Ebs": { + "VolumeSize": "1000" + } + }, + { + "DeviceName": "ephemeral0" + } + ], + "SpotPrice": { + "Ref": "SpotPrice" + }, + "SecurityGroups": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] + } + ], + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ 
+ "#cloud-config", + "\n", + "write_files:", + "\n", + "- content: |", + "\n", + " cluster:", + "\n", + " uuid: '11111111-2222-3333-444444444444'", + "\n", + " token: '1A0a1aaAA1aAAA/aaa1aAA=='", + "\n", + " name: ", + { + "Ref": "AWS::StackName" + }, + "\n", + " role: 'slave'", + "\n", + " master: ", + { + "Fn::GetAtt": [ + "AlcesClusterwareMasterNode", + "PrivateIp" + ] + }, + "\n", + " tags:", + "\n", + " scheduler_roles: ':compute:'", + "\n", + " quorum: 3", + "\n", + " owner: root:root", + "\n", + " path: /opt/clusterware/etc/config.yml", + "\n", + " permissions: '0640'", + "\n" + ] + ] + } + } + } + }, + "ComputeGroup": { + "Type": "AWS::AutoScaling::AutoScalingGroup", + "DependsOn": "AlcesClusterwareMasterNode", + "Properties": { + "DesiredCapacity": "16", + "LaunchConfigurationName": { + "Ref": "ComputeConfig" + }, + "Tags": [ + { + "Key": "Name", + "Value": { "Fn::Join": [ "", [ {"Ref": "AWS::StackName"}, "-compute" ] ]}, + "PropagateAtLaunch": "true" + } + ], + "MinSize": "1", + "MaxSize": "100", + "VPCZoneIdentifier": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] + } + ] + } + }, + "ScaleUp": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "ChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "1" + } + }, + "ScaleDown": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "ChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "-1" + } + }, + "CPUAlarmHigh": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-up if CPU > 90% for 20 minutes", + "MetricName": "CPUUtilization", + "Namespace": "AWS/EC2", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "20", + "Threshold": "90", + "AlarmActions": [ + { + "Ref": "ScaleUp" + } + ], + 
"Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "GreaterThanThreshold" + } + }, + "CPUAlarmLow": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-down if CPU < 10% for 55 minutes", + "MetricName": "CPUUtilization", + "Namespace": "AWS/EC2", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "55", + "Threshold": "10", + "AlarmActions": [ + { + "Ref": "ScaleDown" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "LessThanThreshold" + } + } + }, + "Outputs": { + "AccessIP": { + "Value": { + "Fn::GetAtt": [ + "AlcesClusterwareMasterNode", + "PublicIp" + ] + } + } + } +} From ffff0821bc5f3fecf9471e0686ad9776aaa7767c Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Thu, 24 Mar 2016 22:44:58 +0000 Subject: [PATCH 05/18] `quorum` setting isn't required --- .../aws-tools/templates/all-in-one/compute/16-node.json | 2 -- .../aws-tools/templates/all-in-one/compute/8-node.json | 2 -- 2 files changed, 4 deletions(-) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json index 5232230..5820c2f 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json @@ -421,8 +421,6 @@ "\n", " scheduler_roles: ':compute:'", "\n", - " quorum: 3", - "\n", " owner: root:root", "\n", " path: /opt/clusterware/etc/config.yml", diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json index 2ea0d84..57ed65e 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json @@ -418,8 +418,6 @@ 
"\n", " scheduler_roles: ':compute:'", "\n", - " quorum: 3", - "\n", " owner: root:root", "\n", " path: /opt/clusterware/etc/config.yml", From 80ffb5c5fc197e22e85739b4fb4a10a97f1cb519 Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Thu, 24 Mar 2016 22:51:28 +0000 Subject: [PATCH 06/18] Initial 32-node template for https://trello.com/c/lLy4QpQE --- .../templates/all-in-one/compute/32-node.json | 553 ++++++++++++++++++ 1 file changed, 553 insertions(+) create mode 100644 aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json new file mode 100644 index 0000000..0df23dd --- /dev/null +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json @@ -0,0 +1,553 @@ +{ + "Description": "Launch an Alces HPC environment with a single SGE master node together with 16 initial compute nodes using EC2 spot.", + "Parameters": { + "KeyPair": { + "Description": "Choose an existing AWS key for administrator access", + "Type": "AWS::EC2::KeyPair::KeyName" + }, + "SubnetId": { + "Description": "Enter the ID of your existing subnet. If you wish to have one created for you, select None. The SecurityGroup field must also be set to None.", + "Type": "String", + "Default": "None" + }, + "SecurityGroup": { + "Description": "Enter the ID of your existing security group. If you wish to have one created for you, select None. The Subnet ID field must also be set to None.", + "Type": "String", + "Default": "None" + }, + "SpotPrice": { + "Description": "Enter your maximum bid per hour for each compute instance. View the Spot Request calculator for information on spot pricing.", + "Type": "String", + "Default": "0.50" + }, + "NetworkCIDR": { + "Description": "Enter an address range that is permitted to access the Clusterware master node. 
Leave blank if unknown", + "Type": "String", + "Default": "0.0.0.0/0", + "AllowedPattern": "[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}/[0-9]{1,2}", + "ConstraintDescription": "Please specify a valid IP range, e.g. 101.21.2.0/16" + } + }, + "Mappings": { + "AWSRegionArch2AMI": { + "eu-west-1": { + "centos7": "ami-3758e244" + } + } + }, + "Conditions": { + "CreateNetwork": { + "Fn::Equals": [ + { + "Ref": "SubnetId" + }, + "None" + ] + } + }, + "Resources": { + "ClusterwareVPC": { + "Type": "AWS::EC2::VPC", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { + "Ref": "AWS::StackName" + } + } + ], + "CidrBlock": "10.75.0.0/16", + "EnableDnsSupport": "true", + "EnableDnsHostnames": "true" + }, + "Condition": "CreateNetwork" + }, + "ClusterwareRouteTable": { + "Type": "AWS::EC2::RouteTable", + "DependsOn": "ClusterwareVPC", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "Tags": [ + { + "Key": "Application", + "Value": { + "Ref": "AWS::StackId" + } + } + ] + }, + "Condition": "CreateNetwork" + }, + "ClusterwarePublicNet": { + "Type": "AWS::EC2::Subnet", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "CidrBlock": "10.75.0.0/24" + }, + "Condition": "CreateNetwork" + }, + "ClusterwareGateway": { + "Type": "AWS::EC2::InternetGateway", + "DependsOn": "ClusterwareVPC", + "Condition": "CreateNetwork" + }, + "ClusterwareAttachGW": { + "Type": "AWS::EC2::VPCGatewayAttachment", + "DependsOn": "ClusterwareVPC", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "InternetGatewayId": { + "Ref": "ClusterwareGateway" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareRoute": { + "Type": "AWS::EC2::Route", + "DependsOn": "ClusterwareAttachGW", + "Properties": { + "RouteTableId": { + "Ref": "ClusterwareRouteTable" + }, + "DestinationCidrBlock": "0.0.0.0/0", + "GatewayId": { + "Ref": "ClusterwareGateway" + } + }, + "Condition": "CreateNetwork" + }, + "SubnetToRouteTable": { + "Type": 
"AWS::EC2::SubnetRouteTableAssociation", + "DependsOn": "ClusterwareRouteTable", + "Properties": { + "SubnetId": { + "Ref": "ClusterwarePublicNet" + }, + "RouteTableId": { + "Ref": "ClusterwareRouteTable" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareNetACL": { + "Type": "AWS::EC2::NetworkAcl", + "Properties": { + "VpcId": { + "Ref": "ClusterwareVPC" + } + }, + "Condition": "CreateNetwork" + }, + "InboundSSHACLEntry": { + "Type": "AWS::EC2::NetworkAclEntry", + "Properties": { + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + }, + "RuleNumber": "100", + "Protocol": "-1", + "RuleAction": "allow", + "Egress": "false", + "CidrBlock": { + "Ref": "NetworkCIDR" + }, + "PortRange": { + "From": "1", + "To": "65535" + } + }, + "Condition": "CreateNetwork" + }, + "OutboundACLEntry": { + "Type": "AWS::EC2::NetworkAclEntry", + "Properties": { + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + }, + "RuleNumber": "101", + "Protocol": "-1", + "RuleAction": "allow", + "Egress": "true", + "CidrBlock": "0.0.0.0/0", + "PortRange": { + "From": "1", + "To": "65535" + } + }, + "Condition": "CreateNetwork" + }, + "ClusterwareSubnetLink": { + "Type": "AWS::EC2::SubnetNetworkAclAssociation", + "Properties": { + "SubnetId": { + "Ref": "ClusterwarePublicNet" + }, + "NetworkAclId": { + "Ref": "ClusterwareNetACL" + } + }, + "Condition": "CreateNetwork" + }, + "AlcesClusterwareSecurityGroup": { + "Type": "AWS::EC2::SecurityGroup", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { + "Ref": "AWS::StackName" + } + } + ], + "GroupDescription": "Enable SSH access to the Alces Clusterware master node", + "VpcId": { + "Ref": "ClusterwareVPC" + }, + "SecurityGroupIngress": [ + { + "IpProtocol": "tcp", + "FromPort": "22", + "ToPort": "22", + "CidrIp": { + "Ref": "NetworkCIDR" + } + }, + { + "IpProtocol": "-1", + "FromPort": "0", + "ToPort": "65535", + "CidrIp": { + "Ref": "NetworkCIDR" + } + } + ], + "SecurityGroupEgress": [ + { + "IpProtocol": "-1", + "FromPort": "0", + 
"ToPort": "65535", + "CidrIp": "0.0.0.0/0" + } + ] + }, + "Condition": "CreateNetwork" + }, + "AlcesClusterwareMasterNode": { + "Type": "AWS::EC2::Instance", + "Properties": { + "Tags": [ + { + "Key": "Name", + "Value": { "Fn::Join": [ "", [ {"Ref": "AWS::StackName"}, "-login1" ] ]} + } + ], + "ImageId": { + "Fn::FindInMap": [ + "AWSRegionArch2AMI", + { + "Ref": "AWS::Region" + }, + "centos7" + ] + }, + "NetworkInterfaces": [ + { + "AssociatePublicIpAddress": "True", + "DeviceIndex": "0", + "GroupSet": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] + } + ], + "SubnetId": { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] + } + } + ], + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/sda1", + "Ebs": { + "VolumeSize": "1000" + } + } + ], + "InstanceType": "c4.large", + "KeyName": { + "Ref": "KeyPair" + }, + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "#cloud-config", + "\n", + "hostname: login1", + "\n", + "write_files:", + "\n", + "- content: |", + "\n", + " cluster:", + "\n", + " uuid: '11111111-2222-3333-444444444444'", + "\n", + " token: '1A0a1aaAA1aAAA/aaa1aAA=='", + "\n", + " name: ", + { + "Ref": "AWS::StackName" + }, + "\n", + " role: 'master'", + "\n", + " tags:", + "\n", + " scheduler_roles: ':master:'", + "\n", + " owner: root:root", + "\n", + " path: /opt/clusterware/etc/config.yml", + "\n", + " permissions: '0640'", + "\n" + ] + ] + } + } + } + }, + "ComputeConfig": { + "Type": "AWS::AutoScaling::LaunchConfiguration", + "Properties": { + "AssociatePublicIpAddress": "True", + "KeyName": { + "Ref": "KeyPair" + }, + "ImageId": { + "Fn::FindInMap": [ + "AWSRegionArch2AMI", + { + "Ref": "AWS::Region" + }, + "centos7" + ] + }, + "InstanceType": "c4.8xlarge", + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/sda1", + "Ebs": { + "VolumeSize": "1000" + } + }, + { + "DeviceName": "ephemeral0" + } + ], + 
"SpotPrice": { + "Ref": "SpotPrice" + }, + "SecurityGroups": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "AlcesClusterwareSecurityGroup" + }, + { + "Ref": "SecurityGroup" + } + ] + } + ], + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "#cloud-config", + "\n", + "write_files:", + "\n", + "- content: |", + "\n", + " cluster:", + "\n", + " uuid: '11111111-2222-3333-444444444444'", + "\n", + " token: '1A0a1aaAA1aAAA/aaa1aAA=='", + "\n", + " name: ", + { + "Ref": "AWS::StackName" + }, + "\n", + " role: 'slave'", + "\n", + " master: ", + { + "Fn::GetAtt": [ + "AlcesClusterwareMasterNode", + "PrivateIp" + ] + }, + "\n", + " tags:", + "\n", + " scheduler_roles: ':compute:'", + "\n", + " owner: root:root", + "\n", + " path: /opt/clusterware/etc/config.yml", + "\n", + " permissions: '0640'", + "\n" + ] + ] + } + } + } + }, + "ComputeGroup": { + "Type": "AWS::AutoScaling::AutoScalingGroup", + "DependsOn": "AlcesClusterwareMasterNode", + "Properties": { + "DesiredCapacity": "32", + "LaunchConfigurationName": { + "Ref": "ComputeConfig" + }, + "Tags": [ + { + "Key": "Name", + "Value": { "Fn::Join": [ "", [ {"Ref": "AWS::StackName"}, "-compute" ] ]}, + "PropagateAtLaunch": "true" + } + ], + "MinSize": "1", + "MaxSize": "100", + "VPCZoneIdentifier": [ + { + "Fn::If": [ + "CreateNetwork", + { + "Ref": "ClusterwarePublicNet" + }, + { + "Ref": "SubnetId" + } + ] + } + ] + } + }, + "ScaleUp": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "ChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "1" + } + }, + "ScaleDown": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "ChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "-1" + } + }, + "CPUAlarmHigh": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-up if CPU > 90% for 
20 minutes", + "MetricName": "CPUUtilization", + "Namespace": "AWS/EC2", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "20", + "Threshold": "90", + "AlarmActions": [ + { + "Ref": "ScaleUp" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "GreaterThanThreshold" + } + }, + "CPUAlarmLow": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-down if CPU < 10% for 55 minutes", + "MetricName": "CPUUtilization", + "Namespace": "AWS/EC2", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "55", + "Threshold": "10", + "AlarmActions": [ + { + "Ref": "ScaleDown" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "LessThanThreshold" + } + } + }, + "Outputs": { + "AccessIP": { + "Value": { + "Fn::GetAtt": [ + "AlcesClusterwareMasterNode", + "PublicIp" + ] + } + } + } +} From 1d5534ba21d4ef5557a921e590b3f3750785955a Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Thu, 24 Mar 2016 22:53:11 +0000 Subject: [PATCH 07/18] Add 16/32 node template compute nodes into PG --- .../aws-tools/templates/all-in-one/compute/16-node.json | 9 +++++++++ .../aws-tools/templates/all-in-one/compute/32-node.json | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json index 5820c2f..ffb4c15 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json @@ -433,6 +433,12 @@ } } }, + "PlacementGroup": { + "Type": "AWS::EC2::PlacementGroup", + "Properties": { + "Strategy": "cluster" + } + }, "ComputeGroup": { "Type": "AWS::AutoScaling::AutoScalingGroup", "DependsOn": "AlcesClusterwareMasterNode", @@ -441,6 +447,9 @@ 
"LaunchConfigurationName": { "Ref": "ComputeConfig" }, + "PlacementGroup": { + "Ref": "PlacementGroup" + }, "Tags": [ { "Key": "Name", diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json index 0df23dd..38b9beb 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json @@ -433,6 +433,12 @@ } } }, + "PlacementGroup": { + "Type": "AWS::EC2::PlacementGroup", + "Properties": { + "Strategy": "cluster" + } + }, "ComputeGroup": { "Type": "AWS::AutoScaling::AutoScalingGroup", "DependsOn": "AlcesClusterwareMasterNode", @@ -441,6 +447,9 @@ "LaunchConfigurationName": { "Ref": "ComputeConfig" }, + "PlacementGroup": { + "Ref": "PlacementGroup" + }, "Tags": [ { "Key": "Name", From f829637b8094e93d9fe7c926d8b25b7113e23b4f Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Thu, 24 Mar 2016 22:58:30 +0000 Subject: [PATCH 08/18] Add IAM role and provide to master node --- .../templates/all-in-one/compute/16-node.json | 50 +++++++++++++++++++ .../templates/all-in-one/compute/32-node.json | 50 +++++++++++++++++++ .../templates/all-in-one/compute/8-node.json | 50 +++++++++++++++++++ 3 files changed, 150 insertions(+) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json index ffb4c15..ab73ab9 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json @@ -239,6 +239,53 @@ }, "Condition": "CreateNetwork" }, + "MasterIAM": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Statement": [ { + "Effect": "Allow", + "Principal": { + "Service": [ + "ec2.amazonaws.com" + ] + }, + "Action": [ + "sts:AssumeRole" + ] + }] + } + } + }, + "MasterIAMProfile": { + 
"Type": "AWS::IAM::InstanceProfile", + "Properties": { + "Path": "/", + "Roles": [ + { "Ref": "MasterIAM" + } + ] + } + }, + "MetricData": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyName": "MetricData", + "PolicyDocument": { + "Version": "2012-10-17", + "Statement": [ { + "Effect": "Allow", + "Action": [ + "cloudwatch:PutMetricData" + ], + "Resource": [ + "*" + ] + }] + }, + "Roles": [ { "Ref": "MasterIAM" } ] + } + }, "AlcesClusterwareMasterNode": { "Type": "AWS::EC2::Instance", "Properties": { @@ -257,6 +304,9 @@ "centos7" ] }, + "IamInstanceProfile": { + "Ref": "MasterIAMProfile" + }, "NetworkInterfaces": [ { "AssociatePublicIpAddress": "True", diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json index 38b9beb..492eecb 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json @@ -239,6 +239,53 @@ }, "Condition": "CreateNetwork" }, + "MasterIAM": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Statement": [ { + "Effect": "Allow", + "Principal": { + "Service": [ + "ec2.amazonaws.com" + ] + }, + "Action": [ + "sts:AssumeRole" + ] + }] + } + } + }, + "MasterIAMProfile": { + "Type": "AWS::IAM::InstanceProfile", + "Properties": { + "Path": "/", + "Roles": [ + { "Ref": "MasterIAM" + } + ] + } + }, + "MetricData": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyName": "MetricData", + "PolicyDocument": { + "Version": "2012-10-17", + "Statement": [ { + "Effect": "Allow", + "Action": [ + "cloudwatch:PutMetricData" + ], + "Resource": [ + "*" + ] + }] + }, + "Roles": [ { "Ref": "MasterIAM" } ] + } + }, "AlcesClusterwareMasterNode": { "Type": "AWS::EC2::Instance", "Properties": { @@ -257,6 +304,9 @@ "centos7" ] }, + "IamInstanceProfile": { + "Ref": "MasterIAMProfile" + }, "NetworkInterfaces": [ { 
"AssociatePublicIpAddress": "True", diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json index 57ed65e..5fff4b3 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json @@ -239,6 +239,53 @@ }, "Condition": "CreateNetwork" }, + "MasterIAM": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Statement": [ { + "Effect": "Allow", + "Principal": { + "Service": [ + "ec2.amazonaws.com" + ] + }, + "Action": [ + "sts:AssumeRole" + ] + }] + } + } + }, + "MasterIAMProfile": { + "Type": "AWS::IAM::InstanceProfile", + "Properties": { + "Path": "/", + "Roles": [ + { "Ref": "MasterIAM" + } + ] + } + }, + "MetricData": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyName": "MetricData", + "PolicyDocument": { + "Version": "2012-10-17", + "Statement": [ { + "Effect": "Allow", + "Action": [ + "cloudwatch:PutMetricData" + ], + "Resource": [ + "*" + ] + }] + }, + "Roles": [ { "Ref": "MasterIAM" } ] + } + }, "AlcesClusterwareMasterNode": { "Type": "AWS::EC2::Instance", "Properties": { @@ -257,6 +304,9 @@ "centos7" ] }, + "IamInstanceProfile": { + "Ref": "MasterIAMProfile" + }, "NetworkInterfaces": [ { "AssociatePublicIpAddress": "True", From 7ae0e8d08d90402ccf3cee05bb666f80df4c115f Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Mon, 28 Mar 2016 16:00:41 +0200 Subject: [PATCH 09/18] Further work on all-in-one templates: - Login node is now user-defined size - System disk size is now user-defined --- .../templates/all-in-one/compute/16-node.json | 21 ++++++++++++++++--- .../templates/all-in-one/compute/32-node.json | 19 +++++++++++++++-- .../templates/all-in-one/compute/8-node.json | 21 ++++++++++++++++--- 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json 
b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json index ab73ab9..b3a83b6 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json @@ -20,6 +20,21 @@ "Type": "String", "Default": "0.50" }, + "SystemDiskSize": { + "Description": "Select the size in GB of root system disk to use for all deployed nodes in the environment", + "Type": "String", + "Default": "500" + }, + "LoginType": { + "Description": "Select the login node instance type to deploy - this defines the number of cores and memory available", + "Type": "String", + "Default": "t2.large", + "AllowedValues": [ + "t2.large", + "c3.large", + "c4.large" + ] + }, "NetworkCIDR": { "Description": "Enter an address range that is permitted to access the Clusterware master node. Leave blank if unknown", "Type": "String", @@ -341,11 +356,11 @@ { "DeviceName": "/dev/sda1", "Ebs": { - "VolumeSize": "1000" + "VolumeSize": { "Ref": "SystemDiskSize" } } } ], - "InstanceType": "c4.large", + "InstanceType": { "Ref": "LoginType" }, "KeyName": { "Ref": "KeyPair" }, @@ -412,7 +427,7 @@ { "DeviceName": "/dev/sda1", "Ebs": { - "VolumeSize": "1000" + "VolumeSize": { "Ref": "SystemDiskSize" } } }, { diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json index 492eecb..b1f9226 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json @@ -20,6 +20,21 @@ "Type": "String", "Default": "0.50" }, + "SystemDiskSize": { + "Description": "Select the size in GB of root system disk to use for all deployed nodes in the environment", + "Type": "String", + "Default": "500" + }, + "LoginType": { + "Description": "Select the login node instance type to deploy - this defines the number of cores and memory available", + "Type": 
"String", + "Default": "t2.large", + "AllowedValues": [ + "t2.large", + "c3.large", + "c4.large" + ] + }, "NetworkCIDR": { "Description": "Enter an address range that is permitted to access the Clusterware master node. Leave blank if unknown", "Type": "String", @@ -341,11 +356,11 @@ { "DeviceName": "/dev/sda1", "Ebs": { - "VolumeSize": "1000" + "VolumeSize": { "Ref": "SystemDiskSize" } } } ], - "InstanceType": "c4.large", + "InstanceType": { "Ref": "LoginType" }, "KeyName": { "Ref": "KeyPair" }, diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json index 5fff4b3..402308f 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json @@ -20,6 +20,21 @@ "Type": "String", "Default": "0.50" }, + "SystemDiskSize": { + "Description": "Select the size in GB of root system disk to use for all deployed nodes in the environment", + "Type": "String", + "Default": "500" + }, + "LoginType": { + "Description": "Select the login node instance type to deploy - this defines the number of cores and memory available", + "Type": "String", + "Default": "t2.large", + "AllowedValues": [ + "t2.large", + "c3.large", + "c4.large" + ] + }, "NetworkCIDR": { "Description": "Enter an address range that is permitted to access the Clusterware master node. 
Leave blank if unknown", "Type": "String", @@ -341,11 +356,11 @@ { "DeviceName": "/dev/sda1", "Ebs": { - "VolumeSize": "200" + "VolumeSize": { "Ref": "SystemDiskSize" } } } ], - "InstanceType": "c4.large", + "InstanceType": { "Ref": "LoginType" }, "KeyName": { "Ref": "KeyPair" }, @@ -412,7 +427,7 @@ { "DeviceName": "/dev/sda1", "Ebs": { - "VolumeSize": "200" + "VolumeSize": { "Ref": "SystemDiskSize" } } } ], From 1498ceb136531b94e60a4efc511cc70593149b7b Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Mon, 28 Mar 2016 16:18:56 +0200 Subject: [PATCH 10/18] 32 node template now creates user defined shared NFS area --- .../templates/all-in-one/compute/32-node.json | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json index b1f9226..9db77ae 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json @@ -35,6 +35,10 @@ "c4.large" ] }, + "SharedDataSize": { + "Description": "Enter the size in GB of the shared NFS area to deploy. If no shared area is required, enter None", + "Type": "String", + }, "NetworkCIDR": { "Description": "Enter an address range that is permitted to access the Clusterware master node. 
Leave blank if unknown", "Type": "String", @@ -58,6 +62,14 @@ }, "None" ] + }, + "CreateSharedData": { + "Fn::Not": [ + { + "Ref": "SharedDataSize" + }, + "None" + ] } }, "Resources": { @@ -301,6 +313,25 @@ "Roles": [ { "Ref": "MasterIAM" } ] } }, + "SharedData": { + "Type": "AWS::EC2::Volume", + "Properties": { + "VolumeType": "io1", + "Size": { "Ref": "SharedDataSize" }, + "Iops": "1000", + "AvailabilityZone": { "Fn::GetAtt": [ "AlcesClusterwareMasterNode", "AvailabilityZone" ] } + }, + "Conditions": "CreateSharedData" + }, + "AttachSharedData" { + "Type": "AWS::EC2::VolumeAttachment", + "Properties": { + "Device": "xvdb", + "InstanceId": { "Ref": "AlcesClusterwareMasterNode" }, + "VolumeId": { "Ref": "SharedData" } + }, + "Conditions": "CreateSharedData" + }, "AlcesClusterwareMasterNode": { "Type": "AWS::EC2::Instance", "Properties": { From 7ec17216d03bd083130e9ca1a4374592deaa1b2f Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Mon, 28 Mar 2016 16:35:05 +0200 Subject: [PATCH 11/18] - Master now shares `/sharedscratch` - Nodes now mount `/sharedscratch` --- .../templates/all-in-one/compute/32-node.json | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json index 9db77ae..c14dc3c 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json @@ -408,6 +408,30 @@ "\n", "- content: |", "\n", + " #!/bin/bash", + "\n", + " mkfs -t xfs /dev/xvdb", + "\n", + " mkdir /sharedscratch", + "\n", + " echo 'dev/xvdb /sharedscratch xfs defaults 0 0' >> etc/fstab", + "\n", + " mount -a", + "\n", + " echo '/sharedscratch 10.75.0.0/255.255.0.0(rw,no_root_squash,no_subtree_check,async)' >> /etc/exports", + "\n", + " systemctl enable nfs-server && systemctl start nfs-server", + "\n", + " chmod 0777 /sharedscratch/", + "\n", + 
" rm -rf /tmp/disk-format", + "\n", + " path: /tmp/disk-format", + "\n", + " permissions: '0700'", + "\n", + "- content: |", + "\n", " cluster:", "\n", " uuid: '11111111-2222-3333-444444444444'", @@ -431,6 +455,9 @@ "\n", " permissions: '0640'", "\n" + "runcmd:", + "\n", + " - /tmp/disk-format" ] ] } @@ -492,6 +519,24 @@ "\n", "- content: |", "\n", + " #!/bin/bash", + "\n", + " mkdir /sharedscratch", + "\n", + " echo '", + { "Fn::GetAtt": [ "AlcesClusterwareMasterNode", "PrivateIp" ] }, + ":/sharedscratch /sharedscratch ext4 defaults 0 0' >> etc/fstab ", + "\n", + " mount -a", + "\n", + " rm -rf /tmp/disk-format", + "\n", + " path: /tmp/disk-format", + "\n", + " permissions: '0700'", + "\n", + "- content: |", + "\n", " cluster:", "\n", " uuid: '11111111-2222-3333-444444444444'", @@ -523,6 +568,9 @@ "\n", " permissions: '0640'", "\n" + "runcmd:", + "\n", + " - /tmp/disk-format", ] ] } From 0caf3facb8230c552e7d7ba8b9340f580d5424a1 Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Mon, 28 Mar 2016 17:07:46 +0200 Subject: [PATCH 12/18] - Check if shared NFS volume exists before trying to share and mount - This should be replaced by clusterware#90 when available --- .../templates/all-in-one/compute/32-node.json | 44 ++++++++++++++----- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json index c14dc3c..536e1b7 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json @@ -410,21 +410,31 @@ "\n", " #!/bin/bash", "\n", - " mkfs -t xfs /dev/xvdb", + " checkdisk=$(fdisk -l | grep xvdb)", "\n", - " mkdir /sharedscratch", + " if [ -z \"$checkdisk\" ]; then", "\n", - " echo 'dev/xvdb /sharedscratch xfs defaults 0 0' >> etc/fstab", + " mkfs -t xfs /dev/xvdb", "\n", - " mount -a", + " mkdir /sharedscratch", "\n", - " echo 
'/sharedscratch 10.75.0.0/255.255.0.0(rw,no_root_squash,no_subtree_check,async)' >> /etc/exports", + " echo '/dev/xvdb /sharedscratch xfs defaults 0 0' >> etc/fstab", "\n", - " systemctl enable nfs-server && systemctl start nfs-server", + " mount -a", "\n", - " chmod 0777 /sharedscratch/", + " echo '/sharedscratch 10.75.0.0/255.255.0.0(rw,no_root_squash,no_subtree_check,async)' >> /etc/exports", "\n", - " rm -rf /tmp/disk-format", + " systemctl enable nfs-server && systemctl start nfs-server", + "\n", + " chmod 0777 /sharedscratch/", + "\n", + " rm -rf /tmp/disk-format", + "\n", + " else", + "\n", + " echo \"No disk detected\"", + "\n", + " fi" "\n", " path: /tmp/disk-format", "\n", @@ -521,15 +531,25 @@ "\n", " #!/bin/bash", "\n", - " mkdir /sharedscratch", + " diskcheck=$(showmount -e ", + { "Fn::GetAtt": [ "AlcesClusterwareMasterNode", "PrivateIp" ] }, + " | grep sharedscratch)", + "\n", + " if [ -z \"$diskcheck\" ]; then", + "\n", + " mkdir /sharedscratch", "\n", - " echo '", + " echo '", { "Fn::GetAtt": [ "AlcesClusterwareMasterNode", "PrivateIp" ] }, ":/sharedscratch /sharedscratch ext4 defaults 0 0' >> etc/fstab ", "\n", - " mount -a", + " mount -a", + "\n", + " rm -rf /tmp/disk-format", + "\n", + " else", "\n", - " rm -rf /tmp/disk-format", + " echo \"No disk available\"", "\n", " path: /tmp/disk-format", "\n", From 1256c4f832d6f399320d551ff4de0b06e3d19764 Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Mon, 28 Mar 2016 17:16:54 +0200 Subject: [PATCH 13/18] Mount as xfs --- .../aws-tools/templates/all-in-one/compute/32-node.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json index 536e1b7..3dde285 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json @@ -541,7 +541,7 @@ "\n", " echo '", { 
"Fn::GetAtt": [ "AlcesClusterwareMasterNode", "PrivateIp" ] }, - ":/sharedscratch /sharedscratch ext4 defaults 0 0' >> etc/fstab ", + ":/sharedscratch /sharedscratch xfs defaults 0 0' >> etc/fstab ", "\n", " mount -a", "\n", From 6dcfb23c015a3b7735838a1e1c778ef443fdb0a9 Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Mon, 28 Mar 2016 18:39:17 +0200 Subject: [PATCH 14/18] Add SGE monitoring to each template - Additional alarm types for better autoscaling - Increase/decrease ASG by percent, rather than exact capacity --- .../templates/all-in-one/compute/16-node.json | 78 +++++++++++++++---- .../templates/all-in-one/compute/32-node.json | 78 +++++++++++++++---- .../templates/all-in-one/compute/8-node.json | 78 +++++++++++++++---- 3 files changed, 189 insertions(+), 45 deletions(-) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json index b3a83b6..639d85e 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json @@ -539,41 +539,63 @@ ] } }, - "ScaleUp": { + "ScaleUp10": { "Type": "AWS::AutoScaling::ScalingPolicy", "Properties": { - "AdjustmentType": "ChangeInCapacity", + "AdjustmentType": "PercentChangeInCapacity", "AutoScalingGroupName": { "Ref": "ComputeGroup" }, "Cooldown": "3300", - "ScalingAdjustment": "1" + "ScalingAdjustment": "10" } }, - "ScaleDown": { + "ScaleDown10": { "Type": "AWS::AutoScaling::ScalingPolicy", "Properties": { - "AdjustmentType": "ChangeInCapacity", + "AdjustmentType": "PercentChangeInCapacity", "AutoScalingGroupName": { "Ref": "ComputeGroup" }, "Cooldown": "3300", - "ScalingAdjustment": "-1" + "ScalingAdjustment": "-10" } }, - "CPUAlarmHigh": { + "ScaleUp25": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + 
"Cooldown": "3300", + "ScalingAdjustment": "25" + } + }, + "ScaleDown25": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "-25" + } + }, + "CPUAlarmHigh10": { "Type": "AWS::CloudWatch::Alarm", "Properties": { - "AlarmDescription": "Scale-up if CPU > 90% for 20 minutes", - "MetricName": "CPUUtilization", - "Namespace": "AWS/EC2", + "AlarmDescription": "Scale-up by 10% if currently queued jobs > 50 for 20 minutes", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", "Statistic": "Average", "Period": "60", "EvaluationPeriods": "20", "Threshold": "90", "AlarmActions": [ { - "Ref": "ScaleUp" + "Ref": "ScaleUp10" } ], "Dimensions": [ @@ -587,19 +609,19 @@ "ComparisonOperator": "GreaterThanThreshold" } }, - "CPUAlarmLow": { + "CPUAlarmLow10": { "Type": "AWS::CloudWatch::Alarm", "Properties": { "AlarmDescription": "Scale-down if CPU < 10% for 55 minutes", - "MetricName": "CPUUtilization", - "Namespace": "AWS/EC2", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", "Statistic": "Average", "Period": "60", "EvaluationPeriods": "55", "Threshold": "10", "AlarmActions": [ { - "Ref": "ScaleDown" + "Ref": "ScaleDown10" } ], "Dimensions": [ @@ -612,6 +634,32 @@ ], "ComparisonOperator": "LessThanThreshold" } + }, + "CPUAlarmHigh25": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-up by 25% if currently queued jobs > 100 for 20 minutes", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "20", + "Threshold": "100", + "AlarmActions": [ + { + "Ref": "ScaleUp25" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "GreaterThanThreshold" + } } }, "Outputs": { diff --git 
a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json index 3dde285..e3ba35f 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json @@ -638,41 +638,63 @@ ] } }, - "ScaleUp": { + "ScaleUp10": { "Type": "AWS::AutoScaling::ScalingPolicy", "Properties": { - "AdjustmentType": "ChangeInCapacity", + "AdjustmentType": "PercentChangeInCapacity", "AutoScalingGroupName": { "Ref": "ComputeGroup" }, "Cooldown": "3300", - "ScalingAdjustment": "1" + "ScalingAdjustment": "10" } }, - "ScaleDown": { + "ScaleDown10": { "Type": "AWS::AutoScaling::ScalingPolicy", "Properties": { - "AdjustmentType": "ChangeInCapacity", + "AdjustmentType": "PercentChangeInCapacity", "AutoScalingGroupName": { "Ref": "ComputeGroup" }, "Cooldown": "3300", - "ScalingAdjustment": "-1" + "ScalingAdjustment": "-10" } }, - "CPUAlarmHigh": { + "ScaleUp25": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "25" + } + }, + "ScaleDown25": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "-25" + } + }, + "CPUAlarmHigh10": { "Type": "AWS::CloudWatch::Alarm", "Properties": { - "AlarmDescription": "Scale-up if CPU > 90% for 20 minutes", - "MetricName": "CPUUtilization", - "Namespace": "AWS/EC2", + "AlarmDescription": "Scale-up by 10% if currently queued jobs > 90 for 20 minutes", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", "Statistic": "Average", "Period": "60", "EvaluationPeriods": "20", "Threshold": "90", "AlarmActions": [ { - "Ref": "ScaleUp" + "Ref": "ScaleUp10" } ], "Dimensions": [ @@
-686,19 +708,19 @@ "ComparisonOperator": "GreaterThanThreshold" } }, - "CPUAlarmLow": { + "CPUAlarmLow10": { "Type": "AWS::CloudWatch::Alarm", "Properties": { "AlarmDescription": "Scale-down if CPU < 10% for 55 minutes", - "MetricName": "CPUUtilization", - "Namespace": "AWS/EC2", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", "Statistic": "Average", "Period": "60", "EvaluationPeriods": "55", "Threshold": "10", "AlarmActions": [ { - "Ref": "ScaleDown" + "Ref": "ScaleDown10" } ], "Dimensions": [ @@ -711,6 +733,32 @@ ], "ComparisonOperator": "LessThanThreshold" } + }, + "CPUAlarmHigh25": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-up by 25% if currently queued jobs > 100 for 20 minutes", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", + "Statistic": "Average", + "Period": "60", + "EvaluationPeriods": "20", + "Threshold": "100", + "AlarmActions": [ + { + "Ref": "ScaleUp25" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "GreaterThanThreshold" + } } }, "Outputs": { diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json index 402308f..76eddaf 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json @@ -527,41 +527,63 @@ ] } }, - "ScaleUp": { + "ScaleUp10": { "Type": "AWS::AutoScaling::ScalingPolicy", "Properties": { - "AdjustmentType": "ChangeInCapacity", + "AdjustmentType": "PercentChangeInCapacity", "AutoScalingGroupName": { "Ref": "ComputeGroup" }, "Cooldown": "3300", - "ScalingAdjustment": "1" + "ScalingAdjustment": "10" } }, - "ScaleDown": { + "ScaleDown10": { "Type": "AWS::AutoScaling::ScalingPolicy", "Properties": { - "AdjustmentType": "ChangeInCapacity", + "AdjustmentType": "PercentChangeInCapacity", 
"AutoScalingGroupName": { "Ref": "ComputeGroup" }, "Cooldown": "3300", - "ScalingAdjustment": "-1" + "ScalingAdjustment": "-10" } }, - "CPUAlarmHigh": { + "ScaleUp25": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "25" + } + }, + "ScaleDown25": { + "Type": "AWS::AutoScaling::ScalingPolicy", + "Properties": { + "AdjustmentType": "PercentChangeInCapacity", + "AutoScalingGroupName": { + "Ref": "ComputeGroup" + }, + "Cooldown": "3300", + "ScalingAdjustment": "-25" + } + }, + "CPUAlarmHigh10": { "Type": "AWS::CloudWatch::Alarm", "Properties": { - "AlarmDescription": "Scale-up if CPU > 90% for 20 minutes", - "MetricName": "CPUUtilization", - "Namespace": "AWS/EC2", + "AlarmDescription": "Scale-up by 10% if currently queued jobs > 50 for 20 minutes", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", "Statistic": "Average", "Period": "60", "EvaluationPeriods": "20", "Threshold": "90", "AlarmActions": [ { - "Ref": "ScaleUp" + "Ref": "ScaleUp10" } ], "Dimensions": [ @@ -575,19 +597,19 @@ "ComparisonOperator": "GreaterThanThreshold" } }, - "CPUAlarmLow": { + "CPUAlarmLow10": { "Type": "AWS::CloudWatch::Alarm", "Properties": { "AlarmDescription": "Scale-down if CPU < 10% for 55 minutes", - "MetricName": "CPUUtilization", - "Namespace": "AWS/EC2", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", "Statistic": "Average", "Period": "60", "EvaluationPeriods": "55", "Threshold": "10", "AlarmActions": [ { - "Ref": "ScaleDown" + "Ref": "ScaleDown10" } ], "Dimensions": [ @@ -600,6 +622,32 @@ ], "ComparisonOperator": "LessThanThreshold" } + }, + "CPUAlarmHigh25": { + "Type": "AWS::CloudWatch::Alarm", + "Properties": { + "AlarmDescription": "Scale-up by 25% if currently queued jobs > 100 for 20 minutes", + "MetricName": "Queued", + "Namespace": "ALCES/SGE", + "Statistic": "Average", + "Period": "60", + 
"EvaluationPeriods": "20", + "Threshold": "100", + "AlarmActions": [ + { + "Ref": "ScaleUp25" + } + ], + "Dimensions": [ + { + "Name": "AutoScalingGroupName", + "Value": { + "Ref": "ComputeGroup" + } + } + ], + "ComparisonOperator": "GreaterThanThreshold" + } } }, "Outputs": { From 90d5f7294cc204e06ba5009f4853f67d7c283fcc Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Tue, 29 Mar 2016 11:10:34 +0100 Subject: [PATCH 15/18] Automatically set root disk size for small template --- .../templates/all-in-one/compute/8-node.json | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json index 76eddaf..2466cee 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json @@ -48,6 +48,17 @@ "eu-west-1": { "centos7": "ami-3758e244" } + }, + "LoginType2SystemDisk": { + "t2.large": { + "rootdisk": "8" + }, + "c3.large": { + "rootdisk": "100" + }, + "c4.large": { + "rootdisk": "100" + } } }, "Conditions": { @@ -356,7 +367,7 @@ { "DeviceName": "/dev/sda1", "Ebs": { - "VolumeSize": { "Ref": "SystemDiskSize" } + "VolumeSize": { "Fn::FindInMap": [ "LoginType2SystemDisk", { "Ref": "LoginType" }, "rootdisk" ] } } } ], From 6a8dd28d1c6a2680c348dbdedb4e9ccd084610fe Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Tue, 29 Mar 2016 13:11:10 +0100 Subject: [PATCH 16/18] Add instance store volumes to block device mappings --- .../aws-tools/templates/all-in-one/compute/16-node.json | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json index 639d85e..f33d225 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json +++ 
b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json @@ -358,6 +358,14 @@ "Ebs": { "VolumeSize": { "Ref": "SystemDiskSize" } } + }, + { + "DeviceName": "/dev/sdb", + "VirtualName": "ephemeral0" + }, + { + "DeviceName": "/dev/sdc", + "VirtualName": "ephemeral1" } ], "InstanceType": { "Ref": "LoginType" }, From fbf2c8e839df2581d7427cb599ed6a1a360db566 Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Tue, 29 Mar 2016 13:34:06 +0100 Subject: [PATCH 17/18] Updates to templates --- .../templates/all-in-one/compute/16-node.json | 22 ++++++- .../templates/all-in-one/compute/32-node.json | 62 +++++++++++++------ .../templates/all-in-one/compute/8-node.json | 30 +++++++-- 3 files changed, 88 insertions(+), 26 deletions(-) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json index f33d225..5553409 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/16-node.json @@ -303,6 +303,7 @@ }, "AlcesClusterwareMasterNode": { "Type": "AWS::EC2::Instance", + "DependsOn": "ClusterwarePublicNet", "Properties": { "Tags": [ { @@ -385,6 +386,22 @@ "\n", "- content: |", "\n", + " #!/bin/bash -l", + "\n", + " computegroup=\"INSERTGROUP\"", + "\n", + " timestamp=$(date -u +'%Y-%m-%dT%T.000Z')", + "\n", + " queued=$(qstat -u \\* | awk 'NR > 2{print $5}' | wc -l)", + "\n", + " /opt/clusterware/opt/aws/bin/aws --region \"eu-west-1\" cloudwatch put-metric-data --metric-name Queued --namespace \"ALCES/SGE\" --dimensions \"AutoScalingGroupName=${computegroup}\" --value $queued --timestamp $timestamp", + "\n", + " path: /opt/alces-sge-cloudwatch.sh", + "\n", + " permissions: '0755'", + "\n", + "- content: |", + "\n", " cluster:", "\n", " uuid: '11111111-2222-3333-444444444444'", @@ -407,7 +424,10 @@ " path: /opt/clusterware/etc/config.yml", "\n", " permissions: '0640'", - "\n" + "\n",
+ "runcmd:", + "\n", + " - echo \"5 * * * * alces /opt/alces-sge-cloudwatch.sh\" >> /etc/crontab" ] ] } diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json index e3ba35f..fd6cb02 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json @@ -38,6 +38,7 @@ "SharedDataSize": { "Description": "Enter the size in GB of the shared NFS area to deploy. If no shared area is required, enter None", "Type": "String", + "Default": "5000" }, "NetworkCIDR": { "Description": "Enter an address range that is permitted to access the Clusterware master node. Leave blank if unknown", @@ -47,13 +48,6 @@ "ConstraintDescription": "Please specify a valid IP range, e.g. 101.21.2.0/16" } }, - "Mappings": { - "AWSRegionArch2AMI": { - "eu-west-1": { - "centos7": "ami-3758e244" - } - } - }, "Conditions": { "CreateNetwork": { "Fn::Equals": [ @@ -64,12 +58,23 @@ ] }, "CreateSharedData": { - "Fn::Not": [ + "Fn::Not": [ + { + "Fn::Equals": [ { - "Ref": "SharedDataSize" + "Ref": "SharedDataSize" }, "None" - ] + ] + } + ] + } + }, + "Mappings": { + "AWSRegionArch2AMI": { + "eu-west-1": { + "centos7": "ami-3758e244" + } } }, "Resources": { @@ -321,19 +326,20 @@ "Iops": "1000", "AvailabilityZone": { "Fn::GetAtt": [ "AlcesClusterwareMasterNode", "AvailabilityZone" ] } }, - "Conditions": "CreateSharedData" + "Condition": "CreateSharedData" }, - "AttachSharedData" { + "AttachSharedData": { "Type": "AWS::EC2::VolumeAttachment", "Properties": { "Device": "xvdb", "InstanceId": { "Ref": "AlcesClusterwareMasterNode" }, "VolumeId": { "Ref": "SharedData" } }, - "Conditions": "CreateSharedData" + "Condition": "CreateSharedData" }, "AlcesClusterwareMasterNode": { "Type": "AWS::EC2::Instance", + "DependsOn": "ClusterwarePublicNet", "Properties": { "Tags": [ { @@ -408,6 +414,22 @@ "\n", "- content: |", "\n", + " 
#!/bin/bash -l", + "\n", + " computegroup=\"INSERTGROUP\"", + "\n", + " timestamp=$(date + '%Y-%m-%d'T'%T'.000Z)", + "\n", + " queued=$(qstat -u \\* | awk 'NR > 2{print $5}' | wc -l)", + "\n", + " /opt/clusterware/opt/aws/bin/aws --region \"eu-west-1\" cloudwatch put-metric-data --metric-name Queued --namespace \"ALCES/SGE\" --dimensions \"AutoScalingGroupName=${computegroup}\" --value $queued --timestamp $timestamp", + "\n", + " path: /opt/alces-sge-cloudwatch.sh", + "\n", + " permissions: '0700'", + "\n", + "- content: |", + "\n", " #!/bin/bash", "\n", " checkdisk=$(fdisk -l | grep xvdb)", @@ -434,7 +456,7 @@ "\n", " echo \"No disk detected\"", "\n", - " fi" + " fi", "\n", " path: /tmp/disk-format", "\n", @@ -464,10 +486,12 @@ " path: /opt/clusterware/etc/config.yml", "\n", " permissions: '0640'", - "\n" + "\n", "runcmd:", "\n", - " - /tmp/disk-format" + " - /tmp/disk-format", + "\n", + " - echo \"5 * * * * /opt/alces-sge-cloudwatch.sh\" >> /etc/crontab" ] ] } @@ -527,8 +551,6 @@ "\n", "write_files:", "\n", - "- content: |", - "\n", " #!/bin/bash", "\n", " diskcheck=$(showmount -e ", @@ -587,10 +609,10 @@ " path: /opt/clusterware/etc/config.yml", "\n", " permissions: '0640'", - "\n" + "\n", "runcmd:", "\n", - " - /tmp/disk-format", + " - /tmp/disk-format" ] ] } diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json index 2466cee..cda5715 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/8-node.json @@ -302,7 +302,8 @@ "Statement": [ { "Effect": "Allow", "Action": [ - "cloudwatch:PutMetricData" + "cloudwatch:PutMetricData", + "autoscaling:DescribeAutoScalingGroups" ], "Resource": [ "*" @@ -314,6 +315,7 @@ }, "AlcesClusterwareMasterNode": { "Type": "AWS::EC2::Instance", + "DependsOn": "ClusterwarePublicNet", "Properties": { "Tags": [ { @@ -367,7 +369,7 @@ { "DeviceName": 
"/dev/sda1", "Ebs": { - "VolumeSize": { "Fn::FindInMap": [ "LoginType2SystemDisk", { "Ref": "LoginType" }, "rootdisk" ] } + "VolumeSize": { "Fn::FindInMap": [ "LoginType2SystemDisk", { "Ref": "LoginType" }, "rootdisk"] } } } ], @@ -388,6 +390,22 @@ "\n", "- content: |", "\n", + " #!/bin/bash -l", + "\n", + " computegroup=\"INSERTGROUP\"", + "\n", + " timestamp=$(date + '%Y-%m-%d'T'%T'.000Z)", + "\n", + " queued=$(qstat -u \\* | awk 'NR > 2{print $5}' | wc -l)", + "\n", + " /opt/clusterware/opt/aws/bin/aws --region \"eu-west-1\" cloudwatch put-metric-data --metric-name Queued --namespace \"ALCES/SGE\" --dimensions \"AutoScalingGroupName=${computegroup}\" --value $queued --timestamp $timestamp", + "\n", + " path: /opt/alces-sge-cloudwatch.sh", + "\n", + " permissions: '0700'", + "\n", + "- content: |", + "\n", " cluster:", "\n", " uuid: '11111111-2222-3333-444444444444'", @@ -410,7 +428,10 @@ " path: /opt/clusterware/etc/config.yml", "\n", " permissions: '0640'", - "\n" + "\n", + "runcmd:", + "\n", + " - echo \"5 * * * * alces /opt/alces-sge-cloudwatch.sh\" >> /etc/crontab" ] ] } @@ -508,8 +529,7 @@ }, "ComputeGroup": { "Type": "AWS::AutoScaling::AutoScalingGroup", - "DependsOn": "AlcesClusterwareMasterNode", - "Properties": { + "Properties": { "DesiredCapacity": "8", "LaunchConfigurationName": { "Ref": "ComputeConfig" From e036f33069c6d9268204121346914c68593ceae8 Mon Sep 17 00:00:00 2001 From: Vaughan Jones Date: Tue, 29 Mar 2016 14:54:59 +0100 Subject: [PATCH 18/18] Make cloud-init work --- .../aws-tools/templates/all-in-one/compute/32-node.json | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json index fd6cb02..315783d 100644 --- a/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json +++ b/aws-cloudformation/aws-tools/templates/all-in-one/compute/32-node.json @@ -519,11 +519,8 @@ { 
"DeviceName": "/dev/sda1", "Ebs": { - "VolumeSize": "1000" + "VolumeSize": { "Ref": "SystemDiskSize" } } - }, - { - "DeviceName": "ephemeral0" } ], "SpotPrice": { @@ -551,6 +548,8 @@ "\n", "write_files:", "\n", + "- content: |", + "\n", " #!/bin/bash", "\n", " diskcheck=$(showmount -e ",