Skip to content

Commit

Permalink
Merge pull request os-autoinst#5025 from Martchus/auto-restart-error-…
Browse files Browse the repository at this point in the history
…handling

Improve error handling when automatically restarting a job
  • Loading branch information
Martchus authored Mar 8, 2023
2 parents f190eeb + 7a468c1 commit 88597fc
Show file tree
Hide file tree
Showing 10 changed files with 158 additions and 16 deletions.
1 change: 1 addition & 0 deletions lib/OpenQA/LiveHandler.pm
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ sub startup ($self) {
push @{$self->plugins->namespaces}, 'OpenQA::LiveHandler::Plugin', 'OpenQA::Shared::Plugin';
$self->plugin('SharedHelpers');
$self->plugin('CSRF');
$self->plugin('Gru'); # to invoke $job->cancel() when stopping developer session

OpenQA::Setup::set_secure_flag_on_cookies_of_https_connection($self);

Expand Down
18 changes: 13 additions & 5 deletions lib/OpenQA/Schema/Result/Jobs.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1753,9 +1753,9 @@ sub store_column ($self, $columnname, $value) {
return $self->SUPER::store_column($columnname, $value);
}

sub enqueue_finalize_job_results ($self, $carried_over = undef) {
my $gru = eval { OpenQA::App->singleton->gru }; # gru might not be present within tests
$gru->enqueue(finalize_job_results => [$self->id, $carried_over], {priority => -10}) if $gru;
sub enqueue_finalize_job_results ($self, $args = [], $options = {}) {
$options->{priority} //= -10;
OpenQA::App->singleton->gru->enqueue(finalize_job_results => [$self->id, @$args], $options);
}

# used to stop jobs with some kind of dependency relationship to another
Expand Down Expand Up @@ -1942,6 +1942,13 @@ sub handle_retry ($self) {
return 1;
}

sub enqueue_restart ($self, $options = {}) {
my $openqa_job_id = $self->id;
my $minion_job_id = OpenQA::App->singleton->gru->enqueue(restart_job => [$openqa_job_id], $options)->{minion_id};
log_debug "Enqueued restarting openQA job $openqa_job_id via Minion job $minion_job_id";
return $minion_job_id;
}

=head2 done
Finalize job by setting it as DONE.
Expand Down Expand Up @@ -2003,10 +2010,11 @@ sub done ($self, %args) {
}
$self->update(\%new_val);
$self->unblock;
$self->auto_duplicate if $restart || ($self->is_ok_to_retry && $self->handle_retry);
my %finalize_opts = (lax => 1);
$finalize_opts{parents} = [$self->enqueue_restart] if $restart || ($self->is_ok_to_retry && $self->handle_retry);
# bugrefs are there to mark reasons of failure - the function checks itself though
my $carried_over = $self->carry_over_bugrefs;
$self->enqueue_finalize_job_results($carried_over);
$self->enqueue_finalize_job_results([$carried_over], \%finalize_opts);

# stop other jobs in the cluster
if (defined $new_val{result} && !grep { $result eq $_ } OK_RESULTS) {
Expand Down
20 changes: 14 additions & 6 deletions lib/OpenQA/Shared/Plugin/Gru.pm
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ sub register_tasks ($self) {
qw(OpenQA::Task::Job::ArchiveResults),
qw(OpenQA::Task::Job::FinalizeResults),
qw(OpenQA::Task::Job::HookScript),
qw(OpenQA::Task::Job::Restart),
qw(OpenQA::Task::Iso::Schedule),
qw(OpenQA::Task::Bug::Limit),
);
Expand Down Expand Up @@ -126,12 +127,19 @@ sub enqueue ($self, $task, $args = [], $options = {}, $jobs = []) {
my $gru_id = $gru->id;
my @ttl = defined $ttl ? (expire => $ttl) : ();
my @notes = defined $notes ? (%$notes) : ();
my $minion_id = $self->app->minion->enqueue(
$task => $args => {
@ttl,
priority => $priority,
delay => $delay,
notes => {gru_id => $gru_id, @notes}});
my $parents = $options->{parents};
my $lax = $options->{lax};
my %minion_options = (
@ttl,
priority => $priority,
delay => $delay,
notes => {gru_id => $gru_id, @notes},
defined $lax ? (lax => $lax) : (),
defined $parents ? (parents => $parents) : (),
);
use Data::Dumper;
print("$task options: " . Dumper(\%minion_options));
my $minion_id = $self->app->minion->enqueue($task => $args => \%minion_options);

return {minion_id => $minion_id, gru_id => $gru_id};
}
Expand Down
13 changes: 8 additions & 5 deletions lib/OpenQA/Task/Job/FinalizeResults.pm
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ sub _finalize_results ($minion_job, $openqa_job_id = undef, $carried_over = unde
return $minion_job->retry({delay => 30})
unless my $guard = $app->minion->guard("process_job_results_for_$openqa_job_id", ONE_DAY);

# try to finalize each
my $openqa_job = $app->schema->resultset('Jobs')->find($openqa_job_id);
return $minion_job->finish("Job $openqa_job_id does not exist.") unless $openqa_job;

# try to finalize each
my %failed_to_finalize;
for my $module ($openqa_job->modules_with_job_prefetched) {
eval { $module->finalize_results; };
Expand All @@ -33,10 +34,12 @@ sub _finalize_results ($minion_job, $openqa_job_id = undef, $carried_over = unde
$minion_job->note(failed_modules => \%failed_to_finalize);
$minion_job->fail("Finalizing results of $count modules failed");
}
return if $openqa_job->state eq CANCELLED;
return if $carried_over;
_run_hook_script($minion_job, $openqa_job, $app, $ensure_task_retry_on_termination_signal_guard);
$app->minion->enqueue($_ => []) for @{$app->config->{minion_task_triggers}->{on_job_done}};

# invoke hook script
if ($openqa_job->state ne CANCELLED && !$carried_over) {
_run_hook_script($minion_job, $openqa_job, $app, $ensure_task_retry_on_termination_signal_guard);
$app->minion->enqueue($_ => []) for @{$app->config->{minion_task_triggers}->{on_job_done}};
}
}

sub _run_hook_script ($minion_job, $openqa_job, $app, $guard) {
Expand Down
49 changes: 49 additions & 0 deletions lib/OpenQA/Task/Job/Restart.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright SUSE LLC
# SPDX-License-Identifier: GPL-2.0-or-later

package OpenQA::Task::Job::Restart;
use Mojo::Base 'Mojolicious::Plugin', -signatures;

use Time::Seconds;

sub register ($self, $app, @args) {
$app->minion->add_task(restart_job => \&_restart_job);
}

sub restart_attempts { $ENV{OPENQA_JOB_RESTART_ATTEMPTS} // 5 }

sub restart_delay { $ENV{OPENQA_JOB_RESTART_DELAY} // 5 }

sub restart_openqa_job ($minion_job, $openqa_job) {
my $cloned_job_or_error = $openqa_job->auto_duplicate;
my $is_ok = ref $cloned_job_or_error || $cloned_job_or_error =~ qr/(already.*cloned|direct parent)/i;
$minion_job->note(
ref $cloned_job_or_error
? (cluster_cloned => $cloned_job_or_error->{cluster_cloned})
: (restart_error => $cloned_job_or_error));
return ($is_ok, $cloned_job_or_error);
}

sub _restart_job ($minion_job, @args) {
my $ensure_task_retry_on_termination_signal_guard = OpenQA::Task::SignalGuard->new($minion_job);

my ($openqa_job_id) = @args;
my $app = $minion_job->app;
return $minion_job->fail('No job ID specified.') unless defined $openqa_job_id;
my $openqa_job = $app->schema->resultset('Jobs')->find($openqa_job_id);
return $minion_job->finish("Job $openqa_job_id does not exist.") unless $openqa_job;

# duplicate job and finish normally if no error was returned or job can not be cloned
my ($is_ok, $cloned_job_or_error) = restart_openqa_job($minion_job, $openqa_job);
return $minion_job->finish(ref $cloned_job_or_error ? undef : $cloned_job_or_error) if $is_ok;

# retry a certain number of times, maybe the transaction failed due to a conflict
my $failures = $minion_job->info->{notes}->{failures};
$failures = $failures ? ($failures + 1) : (1);
my $max_attempts = restart_attempts;
return $minion_job->fail($cloned_job_or_error) if $failures >= $max_attempts;
$minion_job->note(failures => $failures, last_failure => $cloned_job_or_error);
$minion_job->retry({delay => restart_delay});
}

1;
3 changes: 3 additions & 0 deletions t/04-scheduler.t
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ use Test::MockModule;
use Test::Output qw(combined_like);
use Test::Warnings ':report_warnings';
use OpenQA::Schema::Result::Jobs;
use OpenQA::App;
use OpenQA::WebAPI;
use OpenQA::WebAPI::Controller::API::V1::Worker;
use OpenQA::Test::TimeLimit '10';

Expand All @@ -43,6 +45,7 @@ $mock_result->redefine(
my $schema = OpenQA::Test::Database->new->create;
my $jobs = $schema->resultset('Jobs');
my $t = Test::Mojo->new('OpenQA::Scheduler');
OpenQA::App->set_singleton(OpenQA::WebAPI->new);

subtest 'Authentication' => sub {
$t->get_ok('/test')->status_is(404)->content_like(qr/Not found/);
Expand Down
3 changes: 3 additions & 0 deletions t/10-jobs-results.t
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,17 @@ use FindBin;
use lib "$FindBin::Bin/lib", "$FindBin::Bin/../external/os-autoinst-common/lib";
use Mojo::Base -signatures;
use autodie ':all';
use OpenQA::App;
use OpenQA::Jobs::Constants;
use OpenQA::Utils 'resultdir';
use OpenQA::Test::Case;
use OpenQA::Task::SignalGuard;
use OpenQA::Test::TimeLimit '30';
use OpenQA::WebAPI;

my $schema = OpenQA::Test::Case->new->init_data;
my $jobs = $schema->resultset('Jobs');
OpenQA::App->set_singleton(OpenQA::WebAPI->new);

my %settings = (
DISTRI => 'Unicorn',
Expand Down
59 changes: 59 additions & 0 deletions t/10-jobs.t
Original file line number Diff line number Diff line change
Expand Up @@ -803,37 +803,56 @@ subtest 'delete job assigned as last use for asset' => sub {
};

subtest 'job setting based retriggering' => sub {
my $minion = $t->app->minion;
my %_settings = %settings;
$_settings{TEST} = 'no_retry';
my $jobs_nr = $jobs->count;
my $job = _job_create(\%_settings);
is $jobs->count, $jobs_nr + 1, 'one more job';
$job->done(result => FAILED);
perform_minion_jobs($minion);
is $jobs->count, $jobs_nr + 1, 'no additional job triggered (without retry)';
is $job->clone_id, undef, 'no clone';
$jobs_nr = $jobs->count;
$_settings{TEST} = 'retry:2';
$_settings{RETRY} = '2:bug#42';
$job = _job_create(\%_settings);
$job->done(result => PASSED);
perform_minion_jobs($minion);
is $jobs->count, $jobs_nr + 1, 'no additional job retriggered if PASSED (with retry)';
$job->update({state => SCHEDULED, result => NONE});
$job->done(result => USER_CANCELLED);
perform_minion_jobs($minion);
is $jobs->count, $jobs_nr + 1, 'no additional job retriggered if USER_CANCELLED (with retry)';
my $get_jobs = sub ($task) {
$minion->backend->pg->db->query(q{select * from minion_jobs where task = $1 order by id asc}, $task)->hashes;
# note: Querying DB directly as `$minion->jobs({tasks => [$task]})` does not return parents.
};
my $restart_job_count_before = @{$get_jobs->('restart_job')};
my $finalize_job_count_before = @{$get_jobs->('finalize_job_results')};
$job->update({state => SCHEDULED, result => NONE});
$job->done(result => FAILED);
perform_minion_jobs($minion);
is $jobs->count, $jobs_nr + 2, 'job retriggered as it FAILED (with retry)';
$job->update;
$job->discard_changes;
is $job->comments->first->text, 'Restarting because RETRY is set to 2 (and only restarted 0 times so far)',
'comment about retry';
is $jobs->count, $jobs_nr + 2, 'job is automatically retriggered';
my $restart_jobs = $get_jobs->('restart_job');
my $finalize_jobs = $get_jobs->('finalize_job_results');
is @$restart_jobs, $restart_job_count_before + 1, 'one restart job has been triggered';
is @$finalize_jobs, $finalize_job_count_before + 1, 'one finalize job has been triggered';
ok $finalize_jobs->[-1]->{lax}, 'finalize job would also run if restart job fails';
is_deeply $finalize_jobs->[-1]->{parents}, [$restart_jobs->[-1]->{id}], 'finalize job triggered after restart job'
or diag explain $finalize_jobs;
my $next_job_id = $job->id + 1;
for (1 .. 2) {
is $jobs->find({id => $next_job_id - 1})->clone_id, $next_job_id, "clone exists for retry nr. $_";
$job = $jobs->find({id => $next_job_id});
$jobs->find({id => $next_job_id})->done(result => FAILED);
$job->update;
perform_minion_jobs($minion);
$job->discard_changes;
++$next_job_id;
}
Expand Down Expand Up @@ -875,4 +894,44 @@ subtest '"race" between status updates and stale job detection' => sub {

is $t->app->minion->jobs({states => ['failed']})->total, 0, 'No unexpected failed minion background jobs';

subtest 'special cases when restarting job via Minion task' => sub {
local $ENV{OPENQA_JOB_RESTART_ATTEMPTS} = 2;
local $ENV{OPENQA_JOB_RESTART_DELAY} = 1;

my $minion = $t->app->minion;
my $test = sub ($args, $expected_state, $expected_result, $test, $task = 'restart_job') {
subtest $test => sub {
my $job_id = $minion->enqueue($task => $args);
perform_minion_jobs($minion);
my $job_info = $minion->job($job_id)->info;
is $job_info->{state}, $expected_state, 'state';
is $job_info->{result}, $expected_result, 'result';
return $job_id;
};
};
$test->([], 'failed', 'No job ID specified.',
'error without openQA job ID (can happen if job is enqueued via CLI)');
$test->(
[45678], 'finished',
'Job 45678 does not exist.',
'no error if openQA job does not exist (maybe job has already been deleted)'
);
$test->(
[99945], 'finished',
'Specified job 99945 has already been cloned as 99946',
'no error if openQA job already restarted but result still assigned accordingly'
);

# fake a different error
my $job_mock = Test::MockModule->new('OpenQA::Schema::Result::Jobs');
$job_mock->redefine(auto_duplicate => 'some error');

# run into error assuming there's one retry attempt left
$test->([99945], 'inactive', undef, 'retry scheduled if an error occurs and there are attempts left');

# run into error assuming there are no retry attempts left
local $ENV{OPENQA_JOB_RESTART_ATTEMPTS} = 1;
$test->([99945], 'failed', 'some error', 'error if an error occurs and there are no attempts left');
};

done_testing();
4 changes: 4 additions & 0 deletions t/api/04-jobs.t
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use OpenQA::File;
use OpenQA::Parser 'parser';
use OpenQA::Test::Case;
use OpenQA::Test::Client 'client';
use OpenQA::Test::Utils 'perform_minion_jobs';
use OpenQA::Jobs::Constants;
use OpenQA::JobDependencies::Constants;
use OpenQA::Log 'log_debug';
Expand Down Expand Up @@ -1431,6 +1432,7 @@ subtest 'marking job as done' => sub {
$t->post_ok('/api/v1/jobs/99961/set_done?result=failed&reason=test&worker_id=1');
$t->status_is(200, 'set_done accepted with correct worker_id');
$t->json_is('/result' => FAILED, 'post yields failed result (explicitely specified)');
perform_minion_jobs($t->app->minion);
my $job = $jobs->find(99961);
is $job->clone_id, undef, 'job not cloned when reason does not match configured regex';
is $job->result, FAILED, 'result is failure (as passed via param)';
Expand All @@ -1447,6 +1449,7 @@ subtest 'marking job as done' => sub {

$t->post_ok(Mojo::URL->new('/api/v1/jobs/99961/set_done')->query(\%cache_failure_params));
$t->status_is(200, 'set_done accepted without worker_id');
perform_minion_jobs($t->app->minion);
$t->get_ok('/api/v1/jobs/99961')->status_is(200);
$t->json_is('/job/result' => INCOMPLETE, 'result set');
$t->json_is('/job/reason' => $cache_failure_reason, 'reason set');
Expand Down Expand Up @@ -1484,6 +1487,7 @@ subtest 'marking job as done' => sub {
$t->get_ok('/api/v1/jobs/99961')->status_is(200);
$t->post_ok(
Mojo::URL->new('/api/v1/jobs/99961/set_done')->query({result => INCOMPLETE, reason => $incomplete_reason}));
perform_minion_jobs($t->app->minion);
$t->get_ok('/api/v1/jobs/99961')->status_is(200);
$t->json_is('/job/result' => INCOMPLETE, 'the job status is correct');
$t->json_is('/job/reason' => $incomplete_reason, 'the incomplete reason is correct');
Expand Down
4 changes: 4 additions & 0 deletions t/ui/18-tests-details.t
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ my $schema = $test_case->init_data(
);
my $jobs = $schema->resultset('Jobs');

# avoid enqueuing Minion jobs
my $jobs_mock = Test::MockModule->new('OpenQA::Schema::Result::Jobs');
$jobs_mock->noop(qw(enqueue_finalize_job_results enqueue_restart));

# prepare needles dir
my $needle_dir_fixture = $schema->resultset('NeedleDirs')->find(1);
my $needle_dir = prepare_clean_needles_dir;
Expand Down

0 comments on commit 88597fc

Please sign in to comment.