From f8b54249fa71ecc79ef2a69e3e7eb2c174ad50ff Mon Sep 17 00:00:00 2001 From: Oliver Kurz Date: Tue, 15 Oct 2024 21:39:25 +0200 Subject: [PATCH] Improve worker load threshold detection With 03b301d1d we have a worker load limit which helps but there can still be cases like happened on a PowerNV machine with the load going way above the configured load limit. The reason was that when the worker was idle within a short time frame of roughly one minute multiple jobs were assigned to individual worker instances on the machine. As we only looked at load15 which was still low at that time all jobs were picked up by the machine leading to overload only about one minute later. Further reducing the load limit would not realistically prevent this situation but only delay until load15 decays sufficiently enough so that new jobs will be picked up again. Instead this commit changes the evaluation to look at all three system load values, load1, load5 and load15, but considering the load evolution over time to react quickly enough if the load rises but still accept a falling edge to allow to pick up jobs again when the load decays. Related progress issue: https://progress.opensuse.org/issues/168244 --- lib/OpenQA/Worker.pm | 29 ++++++++++++++++++----------- t/24-worker-overall.t | 20 +++++++++++++++++--- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/lib/OpenQA/Worker.pm b/lib/OpenQA/Worker.pm index 281c2e7b4cd..4b4b1edb0ab 100644 --- a/lib/OpenQA/Worker.pm +++ b/lib/OpenQA/Worker.pm @@ -34,6 +34,7 @@ BEGIN { use Fcntl; use File::Path qw(make_path remove_tree); use File::Spec::Functions 'catdir'; +use List::Util qw(all max min); use Mojo::IOLoop; use Mojo::File 'path'; use POSIX; @@ -750,18 +751,24 @@ sub _handle_job_status_changed ($self, $job, $event_data) { } } -sub _load_avg ($field = 2) { - my $value = eval { (split(' ', path($ENV{OPENQA_LOAD_AVG_FILE} // '/proc/loadavg')->slurp))[$field] }; +sub _load_avg ($path = $ENV{OPENQA_LOAD_AVG_FILE} // '/proc/loadavg') { + my @load = eval { split(' ', path($path)->slurp) }; log_warning "Unable to determine average load: $@" if $@; - return looks_like_number($value) ? $value : undef; -} - -sub _check_system_utilization ($self) { - my $settings = $self->settings->global_settings; - return undef unless my $threshold = $settings->{CRITICAL_LOAD_AVG_THRESHOLD}; - my $load_avg = _load_avg; - return "The average load $load_avg is exceeding the configured threshold of $threshold." - if defined $load_avg && $load_avg >= $threshold; + splice @load, 3; # remove non-load numbers + log_error "Unable to parse system load from file '$path'" and return [] unless all { looks_like_number $_ } @load; + return \@load; +} + +sub _check_system_utilization ( + $self, + $threshold = $self->settings->global_settings->{CRITICAL_LOAD_AVG_THRESHOLD}, + $load = _load_avg()) +{ + return undef unless $threshold && @$load >= 3; + # look at the load evolution over time to react quick enough if the load + # rises but accept a falling edge + return "The average load (@$load) is exceeding the configured threshold of $threshold." + if max(@$load) > $threshold && ($load->[0] > $load->[1] || $load->[0] > $load->[2] || min(@$load) > $threshold); return undef; } diff --git a/t/24-worker-overall.t b/t/24-worker-overall.t index df0a3b1acba..2e41d2349c9 100644 --- a/t/24-worker-overall.t +++ b/t/24-worker-overall.t @@ -30,8 +30,6 @@ $ENV{OPENQA_CONFIG} = "$FindBin::Bin/data/24-worker-overall"; # file specified via OPENQA_LOGFILE instead of stdout/stderr. $ENV{OPENQA_LOGFILE} = undef; -my $load_avg_file = simulate_load('0.93 0.95 10.25 2/2207 1212', 'worker-overall-load-avg'); - # define fake isotovideo { package Test::FakeProcess; # uncoverable statement count:1 @@ -90,6 +88,7 @@ my $dbus_mock = Test::MockModule->new('Net::DBus', no_auto => 1); $dbus_mock->define(system => sub (@) { Test::FakeDBus->new }); my $cache_service_client_mock = Test::MockModule->new('OpenQA::CacheService::Client'); $cache_service_client_mock->redefine(info => sub { Test::FakeCacheServiceClientInfo->new }); +my $load_avg_file = simulate_load('10.93 10.91 10.25 2/2207 1212', 'worker-overall-load-avg'); like( exception { @@ -120,6 +119,21 @@ combined_like { $worker->log_setup_info } qr/.*http:\/\/localhost:9527,https:\/\/remotehost.*qemu_i386,qemu_x86_64.*Errors occurred.*foo.*bar.*/s, 'setup info with parse errors'; +subtest 'worker load' => sub { + my $load = OpenQA::Worker::_load_avg(); + is scalar @$load, 3, 'expected number of load values'; + is $load->[0], 10.93, 'expected load'; + is_deeply $load, [10.93, 10.91, 10.25], 'expected computed system load, rising flank'; + is_deeply OpenQA::Worker::_load_avg(path($ENV{OPENQA_CONFIG}, 'invalid_loadavg')), [], 'error on invalid load'; + ok !$worker->_check_system_utilization, 'default threshold not exceeded'; + ok $worker->_check_system_utilization(10), 'stricter threshold exceeded by load'; + ok !$worker->_check_system_utilization(10, [3, 9, 11]), 'load ok on falling flank'; + ok $worker->_check_system_utilization(10, [12, 9, 3]), 'load exceeded on rising flank'; + ok $worker->_check_system_utilization(10, [12, 3, 9]), 'load exceeded on rising flank and old load'; + ok $worker->_check_system_utilization(10, [11, 13, 12]), 'load still exceeded on short load dip'; + ok $worker->_check_system_utilization(10, [11, 12, 13]), 'load still exceeded on falling flank but high'; +}; + subtest 'delay and exec' => sub { my $worker_mock = Test::MockModule->new('OpenQA::Worker'); $worker_mock->redefine(init => 42); @@ -854,7 +868,7 @@ qr/Job 42 from some-host finished - reason: done.*A QEMU instance using.*Skippin $worker_mock->unmock('is_qemu_running'); $worker->settings->global_settings->{CRITICAL_LOAD_AVG_THRESHOLD} = '10'; is $worker->status->{status}, 'broken', 'worker considered broken when average load exceeds threshold'; - like $worker->current_error, qr/load 10\.25.*exceeding.*10/, 'error shows current load and threshold'; + like $worker->current_error, qr/load \(.*10\.25.*exceeding.*10/, 'error shows current load and threshold'; # assume the error is gone $load_avg_file->remove;