Skip to content

Commit

Permalink
fix(node): avoid scan entire OS during cpu threshold check
Browse files Browse the repository at this point in the history
Scanning entire OS incur previous FDs to be included
in the later on started up node processes, causing an
accumulated resource over-usage issue.
  • Loading branch information
maqi committed Jan 13, 2025
1 parent cacc5f4 commit 15f045f
Showing 1 changed file with 38 additions and 24 deletions.
62 changes: 38 additions & 24 deletions ant-node/src/bin/antnode/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ use std::{
process::Command,
time::Duration,
};
use sysinfo::{self, System};
use sysinfo::{self, Pid, ProcessRefreshKind, System};
use tokio::{
runtime::Runtime,
sync::{broadcast::error::RecvError, mpsc},
Expand Down Expand Up @@ -413,12 +413,14 @@ You can check your reward balance by running:
}
});
let ctrl_tx_clone_cpu = ctrl_tx.clone();
// Monitor host CPU usage
let pid = Pid::from_u32(std::process::id());
// Monitor process CPU usage
tokio::spawn(async move {
use rand::{thread_rng, Rng};

const CPU_CHECK_INTERVAL: Duration = Duration::from_secs(60);
const CPU_USAGE_THRESHOLD: f32 = 50.0;
// As this is now a per-process CPU usage, it shall never exceeds 10% for too long time.
const CPU_USAGE_THRESHOLD: f64 = 10.0;
const HIGH_CPU_CONSECUTIVE_LIMIT: u8 = 5;
const NODE_STOP_DELAY: Duration = Duration::from_secs(1);
const INITIAL_DELAY_MIN_S: u64 = 10;
Expand All @@ -427,38 +429,50 @@ You can check your reward balance by running:
const JITTER_MIN_S: u64 = 1;
const JITTER_MAX_S: u64 = 15;

let mut sys = System::new_all();

let mut high_cpu_count: u8 = 0;

let process_refresh_kind = ProcessRefreshKind::everything()
.without_disk_usage()
.without_memory();
let mut system = System::new_all();
let physical_core_count = system.physical_core_count();

// Random initial delay between 1 and 5 minutes
let initial_delay =
Duration::from_secs(thread_rng().gen_range(INITIAL_DELAY_MIN_S..=INITIAL_DELAY_MAX_S));
tokio::time::sleep(initial_delay).await;

loop {
sys.refresh_cpu();
let cpu_usage = sys.global_cpu_info().cpu_usage();

if cpu_usage > CPU_USAGE_THRESHOLD {
high_cpu_count += 1;
} else {
high_cpu_count = 0;
}
system.refresh_process_specifics(pid, process_refresh_kind);
if let (Some(process), Some(core_count)) = (system.process(pid), physical_core_count) {
// divide by core_count to get value between 0-100
let cpu_usage =
((process.cpu_usage() as f64 / core_count as f64) * 10000.0).round() / 10000.0;
info!(
"Detected process {pid} CPU usage is {cpu_usage:?} (with {core_count} cores)"
);

if cpu_usage > CPU_USAGE_THRESHOLD {
high_cpu_count += 1;
} else {
high_cpu_count = 0;
}

if high_cpu_count >= HIGH_CPU_CONSECUTIVE_LIMIT {
if let Err(err) = ctrl_tx_clone_cpu
.send(NodeCtrl::Stop {
delay: NODE_STOP_DELAY,
result: StopResult::Success(format!("Excess host CPU %{CPU_USAGE_THRESHOLD} detected for {HIGH_CPU_CONSECUTIVE_LIMIT} consecutive minutes!")),
})
.await
{
error!("Failed to send node control msg to antnode bin main thread: {err}");
if high_cpu_count >= HIGH_CPU_CONSECUTIVE_LIMIT {
if let Err(err) = ctrl_tx_clone_cpu
.send(NodeCtrl::Stop {
delay: NODE_STOP_DELAY,
result: StopResult::Success(format!("Excess host CPU %{CPU_USAGE_THRESHOLD} detected for {HIGH_CPU_CONSECUTIVE_LIMIT} consecutive minutes!")),
})
.await
{
error!("Failed to send node control msg to antnode bin main thread: {err}");
}
break;
}
break;
} else {
error!("Cann't refresh systeminfo of process {pid} with OS core_count of {physical_core_count:?}");
}

// Add jitter to the interval
let jitter = Duration::from_secs(thread_rng().gen_range(JITTER_MIN_S..=JITTER_MAX_S));
tokio::time::sleep(CPU_CHECK_INTERVAL + jitter).await;
Expand Down

0 comments on commit 15f045f

Please sign in to comment.