Skip to content

Commit

Permalink
fix(metrics): avoid intialize un-necessary system resources
Browse files Browse the repository at this point in the history
It turned out that new_all function opens FD for un-necessary system
resources, which result in accumulated memory-usage issue.
Hence initialize with nothing and then refresh only related.
  • Loading branch information
maqi committed Jan 15, 2025
1 parent 2456e5e commit 46aac36
Show file tree
Hide file tree
Showing 10 changed files with 92 additions and 33 deletions.
55 changes: 44 additions & 11 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion ant-logging/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ opentelemetry-semantic-conventions = { version = "0.12.0", optional = true }
rand = { version = "~0.8.5", features = ["small_rng"], optional = true }
serde = { version = "1.0.133", features = ["derive", "rc"] }
serde_json = { version = "1.0" }
sysinfo = { version = "0.30.8", default-features = false, optional = true }
sysinfo = { version = "0.33.1", optional = true }
thiserror = "1.0.23"
tokio = { version = "1.32.0", optional = true }
tracing = { version = "~0.1.26" }
Expand Down
23 changes: 16 additions & 7 deletions ant-logging/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

use serde::Serialize;
use std::time::Duration;
use sysinfo::{self, Networks, Pid, System};
use sysinfo::{self, Networks, Pid, ProcessRefreshKind, ProcessesToUpdate, System};
use tracing::{debug, error};

const UPDATE_INTERVAL: Duration = Duration::from_secs(15);
Expand Down Expand Up @@ -44,9 +44,9 @@ struct ProcessMetrics {
// Obtains the system metrics every UPDATE_INTERVAL and logs it.
// The function should be spawned as a task and should be re-run if our main process is restarted.
pub async fn init_metrics(pid: u32) {
let mut sys = System::new_all();
let mut networks = Networks::new_with_refreshed_list();
let pid = Pid::from_u32(pid);
let mut sys = System::new();

loop {
refresh_metrics(&mut sys, &mut networks, pid);
Expand All @@ -70,10 +70,10 @@ pub async fn init_metrics(pid: u32) {
}
};

let cpu_stat = sys.global_cpu_info();
let system_cpu_usage_percent = sys.global_cpu_usage();
let metrics = Metrics {
physical_cpu_threads: sys.cpus().len(),
system_cpu_usage_percent: cpu_stat.cpu_usage(),
system_cpu_usage_percent,
process,
};
match serde_json::to_string(&metrics) {
Expand All @@ -87,8 +87,17 @@ pub async fn init_metrics(pid: u32) {

// Refreshes only the metrics that we interested in.
fn refresh_metrics(sys: &mut System, networks: &mut Networks, pid: Pid) {
sys.refresh_process(pid);
sys.refresh_memory();
sys.refresh_cpu();
networks.refresh();
sys.refresh_cpu_usage();
networks.refresh(true);

// To refresh only the specific process:
sys.refresh_processes_specifics(
ProcessesToUpdate::Some(&[pid]),
true,
ProcessRefreshKind::nothing()
.with_cpu()
.with_disk_usage()
.with_memory(),
);
}
2 changes: 1 addition & 1 deletion ant-networking/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ rmp-serde = "1.1.1"
serde = { version = "1.0.133", features = ["derive", "rc"] }
sha2 = "0.10"
strum = { version = "0.26.2", features = ["derive"] }
sysinfo = { version = "0.30.8", default-features = false, optional = true }
sysinfo = { version = "0.33.1", optional = true }
thiserror = "1.0.23"
tokio = { version = "1.32.0", features = [
"io-util",
Expand Down
14 changes: 11 additions & 3 deletions ant-networking/src/metrics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use prometheus_client::{
metrics::family::Family,
metrics::{counter::Counter, gauge::Gauge},
};
use sysinfo::{Pid, ProcessRefreshKind, System};
use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};
use tokio::time::Duration;

const UPDATE_INTERVAL: Duration = Duration::from_secs(15);
Expand Down Expand Up @@ -246,12 +246,19 @@ impl NetworkMetricsRecorder {

let pid = Pid::from_u32(std::process::id());
let process_refresh_kind = ProcessRefreshKind::everything().without_disk_usage();
let mut system = System::new_all();

let mut system = System::new();
let physical_core_count = system.physical_core_count();
info!("Detected physical_core_count {physical_core_count:?}");

tokio::spawn(async move {
loop {
system.refresh_process_specifics(pid, process_refresh_kind);
system.refresh_processes_specifics(
ProcessesToUpdate::Some(&[pid]),
true,
process_refresh_kind,
);

if let (Some(process), Some(core_count)) =
(system.process(pid), physical_core_count)
{
Expand All @@ -264,6 +271,7 @@ impl NetworkMetricsRecorder {
/ 10000.0;
let _ = process_cpu_usage_percentage.set(cpu_usage);
}

sleep(UPDATE_INTERVAL).await;
}
});
Expand Down
2 changes: 1 addition & 1 deletion ant-node-manager/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ semver = "1.0.20"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
service-manager = "0.7.0"
sysinfo = "0.30.12"
sysinfo = "0.33.1"
thiserror = "1.0.23"
tokio = { version = "1.26", features = ["full"] }
tracing = { version = "~0.1.26" }
Expand Down
2 changes: 1 addition & 1 deletion ant-node/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ rayon = "1.8.0"
self_encryption = "~0.30.0"
serde = { version = "1.0.133", features = ["derive", "rc"] }
strum = { version = "0.26.2", features = ["derive"] }
sysinfo = { version = "0.30.8", default-features = false }
sysinfo = "0.33.1"
thiserror = "1.0.23"
tokio = { version = "1.32.0", features = [
"io-util",
Expand Down
21 changes: 15 additions & 6 deletions ant-node/src/bin/antnode/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ You can check your reward balance by running:
}
});
let ctrl_tx_clone_cpu = ctrl_tx.clone();
// Monitor host CPU usage
// Monitor Host CPU usage
tokio::spawn(async move {
use rand::{thread_rng, Rng};

Expand All @@ -426,18 +426,28 @@ You can check your reward balance by running:
const JITTER_MIN_S: u64 = 1;
const JITTER_MAX_S: u64 = 15;

let mut sys = System::new_all();

let mut high_cpu_count: u8 = 0;

let mut system = System::new();
system.refresh_cpu_usage();

// Random initial delay between 1 and 5 minutes
let initial_delay =
Duration::from_secs(thread_rng().gen_range(INITIAL_DELAY_MIN_S..=INITIAL_DELAY_MAX_S));
tokio::time::sleep(initial_delay).await;

loop {
sys.refresh_cpu();
let cpu_usage = sys.global_cpu_info().cpu_usage();
system.refresh_cpu_usage();

let cpu_usage = system.global_cpu_usage();

if cpu_usage > CPU_USAGE_THRESHOLD {
high_cpu_count += 1;
} else {
high_cpu_count = 0;
}

info!("Detected Host CPU usage is {cpu_usage:?}.");

if cpu_usage > CPU_USAGE_THRESHOLD {
high_cpu_count += 1;
Expand All @@ -457,7 +467,6 @@ You can check your reward balance by running:
}
break;
}

// Add jitter to the interval
let jitter = Duration::from_secs(thread_rng().gen_range(JITTER_MIN_S..=JITTER_MAX_S));
tokio::time::sleep(CPU_CHECK_INTERVAL + jitter).await;
Expand Down
2 changes: 1 addition & 1 deletion ant-service-management/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
semver = "1.0.20"
service-manager = "0.7.0"
sysinfo = "0.30.12"
sysinfo = "0.33.1"
thiserror = "1.0.23"
tokio = { version = "1.32.0", features = ["time"] }
tonic = { version = "0.6.2" }
Expand Down
2 changes: 1 addition & 1 deletion node-launchpad/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ serde_json = "1.0.107"
signal-hook = "0.3.17"
strip-ansi-escapes = "0.2.0"
strum = { version = "0.26.1", features = ["derive"] }
sysinfo = "0.30.12"
sysinfo = "0.33.1"
throbber-widgets-tui = "0.8.0"
tokio = { version = "1.32.0", features = ["full"] }
tokio-util = "0.7.9"
Expand Down

0 comments on commit 46aac36

Please sign in to comment.