Skip to content

Commit

Permalink
Improved error reporting for missing cloud key parameters; Refactored…
Browse files Browse the repository at this point in the history
… run_job to ensure resources are shut down if an error occurs during initialisation; Added delay when checking for resource accessibility to address SSH connection blocking issue.
  • Loading branch information
jcohen02 committed Dec 23, 2015
1 parent b8fc7f8 commit 15aa315
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 23 deletions.
13 changes: 11 additions & 2 deletions src/main/deployer/config/platform/ec2.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,17 +91,26 @@ def __init__(self, *args, **kwargs):
def access_key(self):
return self._access_key

# TODO: The cloud library returns unexpected errors if the access/secret
# key is set to None here. For now, any attempt to set this value to None
# (or to any other falsy value) stores an empty string instead.
@access_key.setter
def access_key(self, value):
    """Store the access key, substituting '' for any falsy value."""
    self._access_key = value or ''

@property
def secret_key(self):
    """Return the stored secret key."""
    return self._secret_key

@secret_key.setter
def secret_key(self, value):
    """Store the secret key, substituting '' for any falsy value."""
    # None (or any other falsy value) is normalised to an empty string
    # rather than being stored as-is.
    self._secret_key = value or ''

@property
def user_key_name(self):
Expand Down
3 changes: 3 additions & 0 deletions src/main/deployer/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@
class DeployerError(Exception):
    """Base class for the deployer's own exception types."""

class InvalidCredentialsError(DeployerError):
    """Raised when the cloud platform rejects a request as unauthorised."""

class ResourceInitialisationError(DeployerError):
    """Raised when resources cannot be started on the target platform."""

Expand Down
36 changes: 21 additions & 15 deletions src/main/deployer/libhpc_run_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,24 +292,30 @@ def run_job(self, platform_config_input, job_config, software_config=None,
LOG.debug('Deployer instance <%s> obtained and configured '
'successfully...' % d)

# Now that the initial configuration has been done, we can run the job
# Begin by initialising the resources...

resource_info = d.initialise_resources(node_type=job_config.node_type,
num_processes=job_config.num_processes,
processes_per_node=job_config.processes_per_node,
job_id=job_config.job_id,
software_config=software_config)

# If an ip file was specified, write the public IPs of the resources
# to this file. Currently only supports EC2-style cloud platforms
if ip_file and (isinstance(d, JobDeploymentEC2Openstack) or
isinstance(d, JobDeploymentEC2)):
with open(ip_file, 'w') as f:
for node in resource_info:
f.write(node[0].public_ips[0] + '\n')

try:
# Now that the initial configuration has been done, we can run the job
# Begin by initialising the resources...

# This call can raise an exception while waiting for resources
# to become available or accessible. That would leave resources
# running when the call returns, so it needs to go within the
# try/finally block.
resource_info = d.initialise_resources(node_type=job_config.node_type,
num_processes=job_config.num_processes,
processes_per_node=job_config.processes_per_node,
job_id=job_config.job_id,
software_config=software_config)

# If an ip file was specified, write the public IPs of the resources
# to this file. Currently only supports EC2-style cloud platforms
if ip_file and (isinstance(d, JobDeploymentEC2Openstack) or
isinstance(d, JobDeploymentEC2)):
with open(ip_file, 'w') as f:
for node in resource_info:
f.write(node[0].public_ips[0] + '\n')

if software_config:
d.deploy_software(software_config)
else:
Expand Down
28 changes: 22 additions & 6 deletions src/main/deployer/openstack_ec2_deployer.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@
from deployer.config.software.base import SoftwareConfigManager,\
SoftwareConfigFile
from deployer.deployment_interface import JobDeploymentBase
from deployer.exceptions import ResourceInitialisationError, JobError
from deployer.exceptions import ResourceInitialisationError, JobError,\
InvalidCredentialsError
from deployer.utils import generate_instance_id

from libcloud.compute.providers import get_driver
Expand Down Expand Up @@ -113,6 +114,7 @@ def __init__(self, platform_config):
self.driver = EUCA(access_key, secret=secret_key, secure=False,
host=host, port=port, path='/services/Cloud')

LOG.debug('The cloud driver instance is <%s>' % self.driver)

# SAGA Session is pre-created by superclass
# Prepare the job security context and store it - this will allow
Expand Down Expand Up @@ -205,18 +207,28 @@ def initialise_resources(self, prefer_unconfigured=True,
raise ResourceInitialisationError('ERROR contacting the remote '
'cloud platform. Do you have an active network '
'connection? - <%s>' % str(e))
except:
except Exception as e:
LOG.debug('ERROR STRING: %s' % str(e))
img = None
raise ResourceInitialisationError('ERROR: The specified image <%s> '
if str(e).startswith('Unauthorized:'):
raise InvalidCredentialsError('ERROR: Access to the cloud '
'platform at <%s> was not authorised. Are your '
'credentials correct?' %
(self.platform_config.platform_service_host + ':'
+ str(self.platform_config.platform_service_port)))
else:
raise ResourceInitialisationError('ERROR: The specified image <%s> '
'is not present on the target platform, unable '
'to start resources.' % image_id)

sizes = self.driver.list_sizes()
size = next((s for s in sizes if s.id == node_type), None)
if not size:
raise ResourceInitialisationError('ERROR: The specified resource '
'size <%s> is not present on the target platform. '
'Unable to start resources.' % node_type)
'size (node_type) <%s> is not present on the '
'target platform. Unable to start resources. Have '
'you set the node_type parameter in your job spec?'
% node_type)

# Get the keypair name from the configuration
# If we're using an unconfigured resource, we use the admin key pair
Expand Down Expand Up @@ -670,13 +682,17 @@ def _wait_for_node_accessbility(self, *args, **kwargs):
return self._wait_for_node_accessbility_saga(*args, **kwargs)

def _wait_for_node_accessbility_saga(self, node_ip_list, user_id, key_file,
port=22, retries=3):
port=22, retries=3, pre_check_delay=10):
# Using saga to check if remote resources are accessible
#retries = 3
retries = 5
attempts_made = 0
connection_successful = False

LOG.debug('Waiting <%s> seconds to check for resource accessibility.'
% (pre_check_delay))
time.sleep(pre_check_delay)

# Create an empty session with no contexts
self.session = saga.Session(default = False)
if self.admin_ctx:
Expand Down

0 comments on commit 15aa315

Please sign in to comment.