Skip to content

Commit

Permalink
Feature: retry on subset of jobs hosts
Browse files Browse the repository at this point in the history
  • Loading branch information
AlanCoding committed Nov 1, 2017
1 parent f1813c3 commit 0ae9283
Show file tree
Hide file tree
Showing 7 changed files with 176 additions and 4 deletions.
22 changes: 21 additions & 1 deletion awx/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
# AWX
from awx.main.constants import SCHEDULEABLE_PROVIDERS, ANSI_SGR_PATTERN
from awx.main.models import * # noqa
from awx.main.models.unified_jobs import ACTIVE_STATES
from awx.main.access import get_user_capabilities
from awx.main.fields import ImplicitRoleField
from awx.main.utils import (
Expand Down Expand Up @@ -2642,9 +2643,19 @@ class Meta:
class JobRelaunchSerializer(JobSerializer):

passwords_needed_to_start = serializers.SerializerMethodField()
retry_counts = serializers.SerializerMethodField()
hosts = serializers.ChoiceField(
required=False, allow_null=True, default='all',
choices=[
('all', _('No change to job limit')),
('failed', _('All failed and unreachable hosts')),
('unreachable', _('Unreachable hosts'))
],
write_only=True
)

class Meta:
fields = ('passwords_needed_to_start',)
fields = ('passwords_needed_to_start', 'retry_counts', 'hosts',)

def to_internal_value(self, data):
obj = self.context.get('obj')
Expand All @@ -2666,6 +2677,14 @@ def get_passwords_needed_to_start(self, obj):
return obj.passwords_needed_to_start
return ''

def get_retry_counts(self, obj):
    """
    Build a mapping of each host-status choice to the number of hosts
    from this job that match it, for display on the relaunch endpoint.

    While the job is still in an active state the counts are not
    meaningful, so a translated explanatory message is returned instead.
    """
    if obj.status in ACTIVE_STATES:
        return _('Relaunch by host status not available until job finishes running.')
    counts = OrderedDict()
    for status_option in self.fields['hosts'].choices.keys():
        counts[status_option] = obj.retry_qs(status_option).count()
    return counts

def validate_passwords_needed_to_start(self, value):
obj = self.context.get('obj')
data = self.context.get('data')
Expand All @@ -2685,6 +2704,7 @@ def validate(self, attrs):
raise serializers.ValidationError(dict(errors=[_("Job Template Project is missing or undefined.")]))
if obj.inventory is None or obj.inventory.pending_deletion:
raise serializers.ValidationError(dict(errors=[_("Job Template Inventory is missing or undefined.")]))
attrs.pop('hosts', None)
attrs = super(JobRelaunchSerializer, self).validate(attrs)
return attrs

Expand Down
21 changes: 20 additions & 1 deletion awx/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -3834,7 +3834,26 @@ def post(self, request, *args, **kwargs):
if not serializer.is_valid():
return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)

new_job = obj.copy_unified_job()
copy_kwargs = {}
retry_hosts = request.data.get('hosts', None)
if retry_hosts and retry_hosts != 'all':
if obj.status in ACTIVE_STATES:
return Response({'hosts': _(
'Wait until job finishes before retrying on {status_value} hosts.'
).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
host_qs = obj.retry_qs(retry_hosts)
if not obj.job_events.filter(event='playbook_on_stats').exists():
return Response({'hosts': _(
'Cannot retry on {status_value} hosts, playbook stats not available.'
).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
retry_host_list = host_qs.values_list('name', flat=True)
if len(retry_host_list) == 0:
return Response({'hosts': _(
'Cannot relaunch because previous job had 0 {status_value} hosts.'
).format(status_value=retry_hosts)}, status=status.HTTP_400_BAD_REQUEST)
copy_kwargs['limit'] = ','.join(retry_host_list)

new_job = obj.copy_unified_job(**copy_kwargs)
result = new_job.signal_start(**request.data)
if not result:
data = dict(passwords_needed_to_start=new_job.passwords_needed_to_start)
Expand Down
28 changes: 27 additions & 1 deletion awx/main/models/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
from django.utils.translation import ugettext_lazy as _
from django.core.exceptions import ValidationError

# REST Framework
from rest_framework.exceptions import ParseError

# AWX
from awx.api.versioning import reverse
from awx.main.models.base import * # noqa
Expand Down Expand Up @@ -588,10 +591,33 @@ def get_passwords_needed_to_start(self):
return self.passwords_needed_to_start

def _get_hosts(self, **kwargs):
    # Return a Host queryset filtered by the given kwargs, additionally
    # restricted to hosts that have a JobHostSummary row for this job.
    # NOTE(review): the next two lines look like a diff artifact (removed
    # import followed by its replacement); the relation-based lookup below
    # supersedes the direct import — confirm against the committed file.
    from awx.main.models.inventory import Host
    Host = JobHostSummary._meta.get_field('host').related_model
    kwargs['job_host_summaries__job__pk'] = self.pk
    return Host.objects.filter(**kwargs)

def retry_qs(self, status):
    """
    Return the Host queryset used to build the `limit` field when
    relaunching this job against a subset of hosts.

    `status` is one of 'all', 'failed', 'ok', 'changed', or
    'unreachable'; any other value raises a DRF ParseError.
    """
    filter_kwargs = {}
    if status == 'failed':
        # Special case for parity with Ansible .retry files
        filter_kwargs['job_host_summaries__failed'] = True
    elif status in ('ok', 'changed', 'unreachable'):
        # Unreachable hosts are tracked under the 'dark' summary column.
        summary_field = 'dark' if status == 'unreachable' else status
        filter_kwargs['job_host_summaries__{}__gt'.format(summary_field)] = 0
    elif status != 'all':
        # 'all' applies no extra filtering; anything else is a bad request.
        raise ParseError(_(
            '{status_value} is not a valid status option.'
        ).format(status_value=status))
    return self._get_hosts(**filter_kwargs)

@property
def task_impact(self):
# NOTE: We sorta have to assume the host count matches and that forks default to 5
Expand Down
4 changes: 3 additions & 1 deletion awx/main/models/unified_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -734,7 +734,7 @@ def delete(self):
pass
super(UnifiedJob, self).delete()

def copy_unified_job(self):
def copy_unified_job(self, limit=None):
'''
Returns saved object, including related fields.
Create a copy of this unified job for the purpose of relaunch
Expand All @@ -746,6 +746,8 @@ def copy_unified_job(self):
fields = unified_jt_class._get_unified_job_field_names() + [parent_field_name]
unified_job = copy_model_by_class(self, unified_job_class, fields, {})
unified_job.launch_type = 'relaunch'
if limit:
unified_job.limit = limit
unified_job.save()

# Labels copied here
Expand Down
28 changes: 28 additions & 0 deletions awx/main/tests/functional/api/test_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,31 @@ def test_job_relaunch_permission_denied_response(
r = post(reverse('api:job_relaunch', kwargs={'pk':job.pk}), {}, jt_user, expect=403)
assert 'launched with prompted fields' in r.data['detail']
assert 'do not have permission' in r.data['detail']


@pytest.mark.django_db
@pytest.mark.parametrize("status,hosts", [
    ('all', 'host1,host2,host3'),
    ('failed', 'host3'),
])
def test_job_relaunch_on_failed_hosts(post, inventory, project, machine_credential, admin_user, status, hosts):
    """Relaunching by host status sets the new job's limit to the matching hosts."""
    # One host per summary outcome: host1 no-op, host2 changed, host3 failed.
    created_hosts = [inventory.hosts.create(name='host{}'.format(i)) for i in (1, 2, 3)]
    jt = JobTemplate.objects.create(
        name='testjt', inventory=inventory,
        project=project, credential=machine_credential
    )
    job = jt.create_unified_job(_eager_fields={'status': 'failed', 'limit': 'host1,host2,host3'})
    # The relaunch-by-status view requires playbook stats to exist.
    job.job_events.create(event='playbook_on_stats')
    for host, ok_count, changed_count, failure_count in zip(
            created_hosts, (1, 0, 0), (0, 1, 0), (0, 0, 1)):
        job.job_host_summaries.create(
            host=host, failed=False, ok=ok_count, changed=changed_count,
            failures=failure_count, host_name=host.name)

    r = post(
        url=reverse('api:job_relaunch', kwargs={'pk': job.pk}),
        data={'hosts': status},
        user=admin_user,
        expect=201
    )
    assert r.data.get('limit') == hosts
1 change: 1 addition & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,4 @@
`deprovision_node` -> `deprovision_instance`, and `instance_group_remove` -> `remove_from_queue`,
with backward-compatibility support for the 3.1 usage pattern
[[#6915](https://github.com/ansible/ansible-tower/issues/6915)]
* Allow relaunching jobs on a subset of hosts, by status. [[#219](https://github.com/ansible/awx/issues/219)]
76 changes: 76 additions & 0 deletions docs/retry_by_status.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Relaunch on Hosts with Status

This feature allows the user to relaunch a job, targeting only a subset
of hosts that had a particular status in the prior job.

### API Design of Relaunch

#### Basic Relaunch

POST to `/api/v2/jobs/N/relaunch/` without any request data should relaunch
the job with the same `limit` value that the original job used, which
may be an empty string.

#### Relaunch by Status

Providing request data containing `{"hosts": "<status>"}` should change
the `limit` of the relaunched job to target the hosts matching that status
from the previous job (unless the default option of "all" is used).
The options and meanings of `<status>` include:

- all: relaunch without changing the job limit
- ok: relaunch against all hosts with >=1 tasks that returned the "ok" status
- changed: relaunch against all hosts with >=1 tasks that had a changed status
- failed: relaunch against all hosts with >=1 tasks failed plus all unreachable hosts
- unreachable: relaunch against all hosts that were unreachable in >=1 task

These correspond to the playbook summary states from a playbook run, with
the notable exception of "failed" hosts. Ansible does not count an unreachable
event as a failed task, so unreachable hosts can (and often do) have no
associated failed tasks. The "failed" status here will still target both
status types, because Ansible will mark the _host_ as failed and include it
in the retry file if it was unreachable.

### Relaunch Endpoint

Doing a GET to the relaunch endpoint should return additional information
regarding the host summary of the last job. Example response:

```json
{
"passwords_needed_to_start": [],
"retry_counts": {
"all": 30,
"failed": 18,
"ok": 25,
"changed": 4,
"unreachable": 9
}
}
```

If the user launches, providing a status for which there were 0 hosts,
then the request will be rejected.

# Acceptance Criteria

Scenario: user launches a job against host "foobar", and the run fails
against this host. User changes name of host to "foo", and relaunches job
against failed hosts. The `limit` of the relaunched job should reference
"foo" and not "foobar".

The user should be able to provide passwords on relaunch, while also
running against hosts of a particular status.

Not providing the "hosts" key in a POST to the relaunch endpoint should
relaunch the same way that relaunching has previously worked.

If a playbook provisions a host, this feature should behave reasonably
when relaunching against a status that includes these hosts.

Feature should work even if hosts have tricky characters in their names,
like commas.

Also need to consider case where a task `meta: clear_host_errors` is present
inside a playbook, and that the retry subset behavior is the same as Ansible
for this case.

0 comments on commit 0ae9283

Please sign in to comment.