Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve worker load threshold detection
Browse files Browse the repository at this point in the history
With 03b301d we have a worker load limit which helps but there can still
be cases like happened on a PowerNV machine with the load going way above
the configured load limit. The reason was that when the worker was idle
within a short time frame of roughly one minute multiple jobs were
assigned to individual worker instances on the machine. As we only
looked at load15 which was still low at that time all jobs were picked
up by the machine leading to overload only about one minute later.
Further reducing the load limit would not realistically prevent this
situation but only delay until load15 decays sufficiently enough so that
new jobs will be picked up again. Instead this commit changes the
evaluation to look at all three system load values, load1, load5 and
load15, but considering the load evolution over time to react quickly
enough if the load rises but still accept a falling edge to allow to
pick up jobs again when the load decays.

Related progress issue: https://progress.opensuse.org/issues/168244
okurz committed Oct 16, 2024
1 parent 7c2b6c3 commit 519058c
Showing 2 changed files with 35 additions and 14 deletions.
30 changes: 19 additions & 11 deletions lib/OpenQA/Worker.pm
Original file line number Diff line number Diff line change
@@ -34,6 +34,7 @@ BEGIN {
use Fcntl;
use File::Path qw(make_path remove_tree);
use File::Spec::Functions 'catdir';
use List::Util qw(all max min);
use Mojo::IOLoop;
use Mojo::File 'path';
use POSIX;
@@ -750,18 +751,25 @@ sub _handle_job_status_changed ($self, $job, $event_data) {
}
}

sub _load_avg ($field = 2) {
my $value = eval { (split(' ', path($ENV{OPENQA_LOAD_AVG_FILE} // '/proc/loadavg')->slurp))[$field] };
sub _load_avg () {
my @load = eval { split(' ', path($ENV{OPENQA_LOAD_AVG_FILE} // '/proc/loadavg')->slurp) };
splice @load, 3; # remove non-load numbers
log_warning "Unable to determine average load: $@" if $@;
return looks_like_number($value) ? $value : undef;
}

sub _check_system_utilization ($self) {
my $settings = $self->settings->global_settings;
return undef unless my $threshold = $settings->{CRITICAL_LOAD_AVG_THRESHOLD};
my $load_avg = _load_avg;
return "The average load $load_avg is exceeding the configured threshold of $threshold."
if defined $load_avg && $load_avg >= $threshold;
return [] unless all { looks_like_number $_ } @load;
return \@load;
}

sub _check_system_utilization (
$self,
$threshold = $self->settings->global_settings->{CRITICAL_LOAD_AVG_THRESHOLD},
$load = _load_avg())
{
return undef unless $threshold;
return undef unless @$load;
# look at the load evolution over time to react quick enough if the load
# rises but accept a falling edge
return "The average load (@$load) is exceeding the configured threshold of $threshold."
if max(@$load) > $threshold && ($load->[0] > $load->[1] || $load->[0] > $load->[2] || min(@$load) > $threshold);
return undef;
}

19 changes: 16 additions & 3 deletions t/24-worker-overall.t
Original file line number Diff line number Diff line change
@@ -30,8 +30,6 @@ $ENV{OPENQA_CONFIG} = "$FindBin::Bin/data/24-worker-overall";
# file specified via OPENQA_LOGFILE instead of stdout/stderr.
$ENV{OPENQA_LOGFILE} = undef;

my $load_avg_file = simulate_load('0.93 0.95 10.25 2/2207 1212', 'worker-overall-load-avg');

# define fake isotovideo
{
package Test::FakeProcess; # uncoverable statement count:1
@@ -90,6 +88,7 @@ my $dbus_mock = Test::MockModule->new('Net::DBus', no_auto => 1);
$dbus_mock->define(system => sub (@) { Test::FakeDBus->new });
my $cache_service_client_mock = Test::MockModule->new('OpenQA::CacheService::Client');
$cache_service_client_mock->redefine(info => sub { Test::FakeCacheServiceClientInfo->new });
my $load_avg_file = simulate_load('10.93 10.91 10.25 2/2207 1212', 'worker-overall-load-avg');

like(
exception {
@@ -120,6 +119,20 @@ combined_like { $worker->log_setup_info }
qr/.*http:\/\/localhost:9527,https:\/\/remotehost.*qemu_i386,qemu_x86_64.*Errors occurred.*foo.*bar.*/s,
'setup info with parse errors';

subtest 'worker load' => sub {
my $load = OpenQA::Worker::_load_avg();
is scalar @$load, 3, 'expected number of load values';
is $load->[0], 10.93, 'expected load';
is_deeply $load, [10.93, 10.91, 10.25], 'expected computed system load, rising flank';
ok !$worker->_check_system_utilization, 'default threshold not exceeded';
ok $worker->_check_system_utilization(10), 'stricter threshold exceeded by load';
ok !$worker->_check_system_utilization(10, [3, 9, 11]), 'load ok on falling flank';
ok $worker->_check_system_utilization(10, [12, 9, 3]), 'load exceeded on rising flank';
ok $worker->_check_system_utilization(10, [12, 3, 9]), 'load exceeded on rising flank and old load';
ok $worker->_check_system_utilization(10, [11, 13, 12]), 'load still exceeded on short load dip';
ok $worker->_check_system_utilization(10, [11, 12, 13]), 'load still exceeded on falling flank but high';
};

subtest 'delay and exec' => sub {
my $worker_mock = Test::MockModule->new('OpenQA::Worker');
$worker_mock->redefine(init => 42);
@@ -854,7 +867,7 @@ qr/Job 42 from some-host finished - reason: done.*A QEMU instance using.*Skippin
$worker_mock->unmock('is_qemu_running');
$worker->settings->global_settings->{CRITICAL_LOAD_AVG_THRESHOLD} = '10';
is $worker->status->{status}, 'broken', 'worker considered broken when average load exceeds threshold';
like $worker->current_error, qr/load 10\.25.*exceeding.*10/, 'error shows current load and threshold';
like $worker->current_error, qr/load \(.*10\.25.*exceeding.*10/, 'error shows current load and threshold';

# assume the error is gone
$load_avg_file->remove;

0 comments on commit 519058c

Please sign in to comment.