Skip to content

Commit

Permalink
Add support for ParallelCluster 3.12.0 (#296)
Browse files Browse the repository at this point in the history
Resolves #295

Update the FSxZ security groups to remove outbound security group rules.

Resolves #253

Update the installer to check the status of the ParallelCluster stack after
the config stack update is complete.
Make sure that the stack exists and that it was correctly deployed or else
give an error message.
  • Loading branch information
cartalla authored Jan 2, 2025
1 parent b7bc3d5 commit 2c93c97
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 8 deletions.
4 changes: 4 additions & 0 deletions create-slurm-security-groups/create-slurm-security-groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def main(self):
parser.add_argument("--fsxo-security-group-id", type=str, help="Id of security group attached to FSx for NetApp Ontap file systems.")
parser.add_argument("--fsxz-security-group-id", type=str, help="Id of security group attached to FSx for OpenZfs file systems.")
parser.add_argument("--cdk-cmd", type=str, choices=["deploy", "create", "update", "diff", "ls", "list", "synth", "synthesize", "destroy", "bootstrap"], default="create")
parser.add_argument("--min-pc-version", type=str, default="3.12.0", help="Minimum version of ParallelCluster being used. Used to control security group rules required by PC.")
parser.add_argument("--debug", action='store_const', const=True, default=False, help="Enable CDK debug mode")
args = parser.parse_args()

Expand All @@ -73,6 +74,9 @@ def main(self):
# Must be passed to the stack.
self.stack_parameters['region'] = args.region

logger.debug(f"min pc version: {args.min_pc_version}")
self.stack_parameters['min_parallel_cluster_version'] = args.min_pc_version

# Retrieve the AWS Account ID for CDK
sts_client = boto3.client("sts", region_name=args.region)
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
)
from constructs import Construct
import logging
from packaging.version import parse as parse_version

logger = logging.getLogger(__file__)
logger_formatter = logging.Formatter('%(levelname)s: %(message)s')
Expand All @@ -28,6 +29,10 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
logger.info(f"VpcId: {self.config['VpcId']}")
self.vpc = ec2.Vpc.from_lookup(self, "Vpc", vpc_id = self.config['VpcId'])

self.min_parallel_cluster_version = self.node.try_get_context('min_parallel_cluster_version')
if self.min_parallel_cluster_version:
self.min_parallel_cluster_version = parse_version(self.min_parallel_cluster_version)

security_groups = {}
fsx_client_security_groups = {}
lustre_security_groups = {}
Expand Down Expand Up @@ -105,6 +110,10 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
)
security_groups['SlurmdbdSG'] = slurmdbd_sg

if not self.min_parallel_cluster_version:
logger.info("This is a bootstrap so exiting early.")
exit(0)

# Rules for compute nodes
# Allow mounting of /opt/slurm from the head node.
# This is needed in XIO VMs. ParallelCluster compute nodes have a local copy on their root volume.
Expand Down Expand Up @@ -153,14 +162,15 @@ def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None:
fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.tcp_range(20001, 20003), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS mount, status monitor, and lock daemon")
fsx_client_sg.connections.allow_to(fsx_zfs_sg, ec2.Port.udp_range(20001, 20003), f"{fsx_client_sg_name} to {fsx_zfs_sg_name} NFS mount, status monitor, and lock daemon")
# There is a bug in PC 3.10.1 that requires outbound traffic to be enabled even though ZFS doesn't.
# Remove when bug in PC is fixed.
# This bug was resolved in PC 3.12.0.
# Tracked by https://github.com/aws-samples/aws-eda-slurm-cluster/issues/253
fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp(111), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} rpc for NFS")
fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp(111), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} rpc for NFS")
fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp(2049), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS server daemon")
fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp(2049), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS server daemon")
fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp_range(20001, 20003), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS mount, status monitor, and lock daemon")
fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp_range(20001, 20003), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS mount, status monitor, and lock daemon")
if self.min_parallel_cluster_version < parse_version('3.12.0'):
fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp(111), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} rpc for NFS")
fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp(111), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} rpc for NFS")
fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp(2049), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS server daemon")
fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp(2049), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS server daemon")
fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.tcp_range(20001, 20003), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS mount, status monitor, and lock daemon")
fsx_client_sg.connections.allow_from(fsx_zfs_sg, ec2.Port.udp_range(20001, 20003), f"{fsx_zfs_sg_name} to {fsx_client_sg_name} NFS mount, status monitor, and lock daemon")

for sg_name, sg in security_groups.items():
CfnOutput(self, f"{sg_name}Id",
Expand Down
1 change: 1 addition & 0 deletions create-slurm-security-groups/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ aws-cdk-lib==2.111.0
boto3
colored
constructs>=10.0.0,<11.0.0
packaging
17 changes: 17 additions & 0 deletions source/cdk/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@
# * Disable Pyxis Spack plugin by default
# * Upgrade Python runtime to 3.12
# * Upgrade libjwt to version 1.17.0.
# 3.12.0:
# * OpenZFS security group requirements fixed.
MIN_PARALLEL_CLUSTER_VERSION = parse_version('3.6.0')
# Update source/resources/default_config.yml with latest version when this is updated.
PARALLEL_CLUSTER_VERSIONS = [
Expand All @@ -114,16 +116,19 @@
'3.10.1',
'3.11.0',
'3.11.1',
'3.12.0',
]
PARALLEL_CLUSTER_ENROOT_VERSIONS = {
# This can be found on the head node by running 'yum info enroot'
'3.11.0': '3.4.1', # confirmed
'3.11.1': '3.4.1', # confirmed
'3.12.0': '3.4.1', # confirmed
}
PARALLEL_CLUSTER_PYXIS_VERSIONS = {
# This can be found on the head node at /opt/parallelcluster/sources
'3.11.0': '0.20.0', # confirmed
'3.11.1': '0.20.0', # confirmed
'3.12.0': '0.20.0', # confirmed
}
PARALLEL_CLUSTER_MUNGE_VERSIONS = {
# This can be found on the head node at /opt/parallelcluster/sources
Expand All @@ -142,6 +147,7 @@
'3.10.1': '0.5.16', # confirmed
'3.11.0': '0.5.16', # confirmed
'3.11.1': '0.5.16', # confirmed
'3.12.0': '0.5.16', # confirmed
}
PARALLEL_CLUSTER_PYTHON_VERSIONS = {
# This can be found on the head node at /opt/parallelcluster/pyenv/versions
Expand All @@ -159,6 +165,7 @@
'3.10.1': '3.9.19', # confirmed
'3.11.0': '3.9.20', # confirmed
'3.11.1': '3.9.20', # confirmed
'3.12.0': '3.9.20', # confirmed
}
PARALLEL_CLUSTER_SLURM_VERSIONS = {
# This can be found on the head node at /etc/chef/local-mode-cache/cache/
Expand All @@ -176,6 +183,7 @@
'3.10.1': '23.11.7', # confirmed
'3.11.0': '23.11.10', # confirmed
'3.11.1': '23.11.10', # confirmed
'3.12.0': '23.11.10', # confirmed
}
PARALLEL_CLUSTER_PC_SLURM_VERSIONS = {
# This can be found on the head node at /etc/chef/local-mode-cache/cache/
Expand All @@ -193,6 +201,7 @@
'3.10.1': '23-11-7-1', # confirmed
'3.11.0': '23-11-10-1', # confirmed
'3.11.1': '23-11-10-1', # confirmed
'3.12.0': '23-11-10-1', # confirmed
}
SLURM_REST_API_VERSIONS = {
'23-02-2-1': '0.0.39',
Expand Down Expand Up @@ -329,6 +338,14 @@ def get_PARALLEL_CLUSTER_LAMBDA_RUNTIME(parallel_cluster_version):
else:
return aws_lambda.Runtime.PYTHON_3_12

# Version 3.12.0

def PARALLEL_CLUSTER_REQUIRES_FSXZ_OUTBOUND_SG_RULES(parallel_cluster_version):
    '''
    Return True if this ParallelCluster version needs extra FSx for OpenZFS
    outbound security group rules.

    ParallelCluster versions before 3.12.0 had a bug that required outbound
    rules from the ZFS security group to the client security group even though
    OpenZFS itself doesn't need them; the bug was fixed in 3.12.0.

    Args:
        parallel_cluster_version: A packaging.version.Version (as returned by
            parse_version) for the ParallelCluster release in use.
    Returns:
        bool: True when the workaround rules must be created.
    '''
    # Direct boolean return instead of an if/else returning literal True/False.
    return parallel_cluster_version < parse_version('3.12.0')

# Determine all AWS regions available on the account.
default_region = environ.get("AWS_DEFAULT_REGION", "us-east-1")
ec2_client = boto3.client("ec2", region_name=default_region)
Expand Down
29 changes: 28 additions & 1 deletion source/slurm_installer/installer.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,8 @@ def main(self):
launch_installer = os.system(cmd) # nosec
if cdk_cmd == "deploy":
if int(launch_installer) == 0:
logger.info(f"{fg('green')}SLURM was successfully deployed!{attr('reset')}")
logger.info(f"{fg('green')}SLURM config was successfully deployed!{attr('reset')}")
self.wait_for_slurm_stack()
elif args.cdk_cmd == "destroy":
# Destroy stack if known
cmd_destroy = f"cdk destroy {self.install_parameters['stack_name']} -c {' -c '.join('{}={}'.format(key, val) for (key, val) in self.install_parameters.items() if val is not None)} --require-approval never"
Expand Down Expand Up @@ -458,6 +459,32 @@ def get_config(self, config_file):

return validated_config

def wait_for_slurm_stack(self):
    '''
    Wait for the ParallelCluster (Slurm) stack to be created or updated.

    Polls CloudFormation until the stack named by
    config['slurm']['ClusterName'] reaches a terminal state.
    Exits the installer with status 1 if the stack doesn't exist or if the
    deployment rolled back; logs success otherwise.
    '''
    import time  # Local import to keep this polling helper self-contained.

    stack_name = self.config['slurm']['ClusterName']
    cfn_client = boto3.client("cloudformation", region_name=self.config['Region'])

    valid_states = ['CREATE_COMPLETE', 'UPDATE_COMPLETE']
    invalid_states = ['ROLLBACK_COMPLETE', 'UPDATE_ROLLBACK_COMPLETE']
    stack_status = None
    while stack_status not in (valid_states + invalid_states):
        try:
            stack_info = cfn_client.describe_stacks(StackName=stack_name)['Stacks'][0]
        except cfn_client.exceptions.ClientError:
            # describe_stacks raises ClientError (ValidationError) when the
            # stack doesn't exist. Was a bare except:, which also swallowed
            # KeyboardInterrupt and unrelated errors.
            logger.error(f"ParallelCluster stack ({stack_name}) doesn't exist. Failed to create cluster.")
            exit(1)
        stack_status = stack_info['StackStatus']
        logger.info(f"ParallelCluster stack ({stack_name}) in {stack_status} state.")
        if stack_status not in (valid_states + invalid_states):
            # Throttle the polling loop so we don't hammer the CloudFormation
            # API while the stack is still IN_PROGRESS.
            time.sleep(30)

    if stack_status in invalid_states:
        # Fixed unbalanced parenthesis in the original message.
        logger.error(f"ParallelCluster stack ({stack_name}) deployment failed. State: {stack_status}")
        exit(1)

    logger.info(f"ParallelCluster stack {stack_name} successfully deployed.")

def upload_objects(install_directory, bucket, stack_name):
# Upload required assets to customer S3 bucket
logger.info(f"\n====== Uploading install files to {bucket}/{stack_name} ======\n")
Expand Down

0 comments on commit 2c93c97

Please sign in to comment.