diff --git a/lib/OpenQA/Worker.pm b/lib/OpenQA/Worker.pm
index 281c2e7b4cd3..909649e33b35 100644
--- a/lib/OpenQA/Worker.pm
+++ b/lib/OpenQA/Worker.pm
@@ -34,6 +34,7 @@ BEGIN {
 use Fcntl;
 use File::Path qw(make_path remove_tree);
 use File::Spec::Functions 'catdir';
+use List::Util qw(all max min);
 use Mojo::IOLoop;
 use Mojo::File 'path';
 use POSIX;
@@ -750,18 +751,38 @@ sub _handle_job_status_changed ($self, $job, $event_data) {
     }
 }
 
-sub _load_avg ($field = 2) {
-    my $value = eval { (split(' ', path($ENV{OPENQA_LOAD_AVG_FILE} // '/proc/loadavg')->slurp))[$field] };
-    log_warning "Unable to determine average load: $@" if $@;
-    return looks_like_number($value) ? $value : undef;
-}
-
-sub _check_system_utilization ($self) {
-    my $settings = $self->settings->global_settings;
-    return undef unless my $threshold = $settings->{CRITICAL_LOAD_AVG_THRESHOLD};
-    my $load_avg = _load_avg;
-    return "The average load $load_avg is exceeding the configured threshold of $threshold."
-      if defined $load_avg && $load_avg >= $threshold;
+# Returns an array ref with the first three fields (the load numbers) read
+# from OPENQA_LOAD_AVG_FILE (default: /proc/loadavg) or an empty array ref if
+# the file cannot be read or does not start with three numbers.
+sub _load_avg () {
+    my @load = eval { split(' ', path($ENV{OPENQA_LOAD_AVG_FILE} // '/proc/loadavg')->slurp) };
+    if (my $error = $@) {
+        log_warning "Unable to determine average load: $error";
+        return [];
+    }
+    # bail out before splicing: splice warns if the offset is past the end of
+    # the array and callers index all three values
+    return [] if @load < 3;
+    splice @load, 3;    # remove non-load numbers
+    return [] unless all { looks_like_number $_ } @load;
+    return \@load;
+}
+
+# Returns an error message if the system load is considered critical and
+# undef otherwise. The threshold and the load values can be passed explicitly;
+# by default the CRITICAL_LOAD_AVG_THRESHOLD setting and the current values
+# from _load_avg() are used.
+sub _check_system_utilization (
+    $self,
+    $threshold = $self->settings->global_settings->{CRITICAL_LOAD_AVG_THRESHOLD},
+    $load = _load_avg())
+{
+    return undef unless $threshold;
+    return undef unless @$load;
+    # look at the load evolution over time to react quick enough if the load
+    # rises but accept a falling edge
+    return "The average load (@$load) is exceeding the configured threshold of $threshold."
+      if max(@$load) > $threshold && ($load->[0] > $load->[1] || $load->[0] > $load->[2] || min(@$load) > $threshold);
     return undef;
 }
 
diff --git a/t/24-worker-overall.t b/t/24-worker-overall.t
index df0a3b1acba9..8f3a37046151 100644
--- a/t/24-worker-overall.t
+++ b/t/24-worker-overall.t
@@ -30,8 +30,6 @@ $ENV{OPENQA_CONFIG} = "$FindBin::Bin/data/24-worker-overall";
 # file specified via OPENQA_LOGFILE instead of stdout/stderr.
 $ENV{OPENQA_LOGFILE} = undef;
 
-my $load_avg_file = simulate_load('0.93 0.95 10.25 2/2207 1212', 'worker-overall-load-avg');
-
 # define fake isotovideo
 {
     package Test::FakeProcess;    # uncoverable statement count:1
@@ -90,6 +88,7 @@ my $dbus_mock = Test::MockModule->new('Net::DBus', no_auto => 1);
 $dbus_mock->define(system => sub (@) { Test::FakeDBus->new });
 my $cache_service_client_mock = Test::MockModule->new('OpenQA::CacheService::Client');
 $cache_service_client_mock->redefine(info => sub { Test::FakeCacheServiceClientInfo->new });
+my $load_avg_file = simulate_load('10.93 10.91 10.25 2/2207 1212', 'worker-overall-load-avg');
 
 like(
     exception {
@@ -120,6 +119,20 @@ combined_like { $worker->log_setup_info }
 qr/.*http:\/\/localhost:9527,https:\/\/remotehost.*qemu_i386,qemu_x86_64.*Errors occurred.*foo.*bar.*/s,
   'setup info with parse errors';
 
+subtest 'worker load' => sub {
+    my $load = OpenQA::Worker::_load_avg();
+    is scalar @$load, 3, 'expected number of load values';
+    is $load->[0], 10.93, 'expected load';
+    is_deeply $load, [10.93, 10.91, 10.25], 'expected computed system load, rising flank';
+    ok !$worker->_check_system_utilization, 'default threshold not exceeded';
+    ok $worker->_check_system_utilization(10), 'stricter threshold exceeded by load';
+    ok !$worker->_check_system_utilization(10, [3, 9, 11]), 'load ok on falling flank';
+    ok $worker->_check_system_utilization(10, [12, 9, 3]), 'load exceeded on rising flank';
+    ok $worker->_check_system_utilization(10, [12, 3, 9]), 'load exceeded on rising flank and old load';
+    ok $worker->_check_system_utilization(10, [11, 13, 12]), 'load still exceeded on short load dip';
+    ok $worker->_check_system_utilization(10, [11, 12, 13]), 'load still exceeded on falling flank but high';
+};
+
 subtest 'delay and exec' => sub {
     my $worker_mock = Test::MockModule->new('OpenQA::Worker');
     $worker_mock->redefine(init => 42);
@@ -854,7 +867,7 @@ qr/Job 42 from some-host finished - reason: done.*A QEMU instance using.*Skippin
     $worker_mock->unmock('is_qemu_running');
     $worker->settings->global_settings->{CRITICAL_LOAD_AVG_THRESHOLD} = '10';
     is $worker->status->{status}, 'broken', 'worker considered broken when average load exceeds threshold';
-    like $worker->current_error, qr/load 10\.25.*exceeding.*10/, 'error shows current load and threshold';
+    like $worker->current_error, qr/load \(.*10\.25.*exceeding.*10/, 'error shows current load and threshold';
 
     # assume the error is gone
     $load_avg_file->remove;