Adaptive collect interval
fraillt committed Nov 28, 2024
1 parent 7058272 commit 7ae7647
Showing 2 changed files with 425 additions and 25 deletions.
diffas: 361 additions & 0 deletions
@@ -0,0 +1,361 @@
diff --git a/stress/Cargo.toml b/stress/Cargo.toml
index 90bb6e28..bb414a02 100644
--- a/stress/Cargo.toml
+++ b/stress/Cargo.toml
@@ -19,6 +19,11 @@ name = "metrics_histogram"
path = "src/metrics_histogram.rs"
doc = false

+[[bin]] # Bin to measure metrics latency in various conditions
+name = "metrics_latency"
+path = "src/metrics_latency.rs"
+doc = false
+
[[bin]] # Bin to run the metrics overflow stress tests
name = "metrics_overflow"
path = "src/metrics_overflow.rs"
diff --git a/stress/src/metrics_latency.rs b/stress/src/metrics_latency.rs
new file mode 100644
index 00000000..82e683b6
--- /dev/null
+++ b/stress/src/metrics_latency.rs
@@ -0,0 +1,339 @@
+use std::{
+ cell::RefCell,
+ collections::{BTreeMap, HashMap},
+ sync::{
+ atomic::{AtomicBool, AtomicU64, Ordering},
+ Arc, Mutex, Weak,
+ },
+ thread::{sleep, JoinHandle},
+ time::{Duration, Instant},
+ u64,
+};
+
+use opentelemetry::{metrics::MeterProvider, KeyValue};
+use opentelemetry_sdk::{
+ metrics::{
+ data::{self, ResourceMetrics},
+ reader::MetricReader,
+ InstrumentKind, ManualReader, MetricError, Pipeline, SdkMeterProvider, Temporality,
+ },
+ Resource,
+};
+use rand::{rngs, Rng, SeedableRng};
+
+thread_local! {
+ /// Store random number generator for each thread
+ static CURRENT_RNG: RefCell<rngs::SmallRng> = RefCell::new(rngs::SmallRng::from_entropy());
+}
+
+// copy/paste from opentelemetry-sdk/benches/metric.rs
+#[derive(Clone, Debug)]
+pub struct SharedReader(Arc<dyn MetricReader>);
+
+impl SharedReader {
+ pub fn new<R>(reader: R) -> Self
+ where
+ R: MetricReader,
+ {
+ Self(Arc::new(reader))
+ }
+}
+
+impl MetricReader for SharedReader {
+ fn register_pipeline(&self, pipeline: Weak<Pipeline>) {
+ self.0.register_pipeline(pipeline)
+ }
+
+ fn collect(&self, rm: &mut ResourceMetrics) -> Result<(), MetricError> {
+ self.0.collect(rm)
+ }
+
+ fn force_flush(&self) -> Result<(), MetricError> {
+ self.0.force_flush()
+ }
+
+ fn shutdown(&self) -> Result<(), MetricError> {
+ self.0.shutdown()
+ }
+
+ fn temporality(&self, kind: InstrumentKind) -> Temporality {
+ self.0.temporality(kind)
+ }
+}
+
+fn main() {
+ let available_threads: usize = std::thread::available_parallelism().map_or(1, |p| p.get());
+
+ for threads_count in [available_threads / 4, available_threads * 4] {
+ println!("*** updates, using {threads_count} threads ***");
+ measure_update_latency(&format!("no attribs"), threads_count, 10000000, |_i, _j| []);
+ measure_update_latency(&format!("1 attrib"), threads_count, 10000000, |_i, _j| {
+ [KeyValue::new("some_key", 1)]
+ });
+
+ measure_update_latency(&format!("9 attribs"), threads_count, 10000000, |_i, _j| {
+ // for http.server.request.duration as defined in https://opentelemetry.io/docs/specs/semconv/http/http-metrics/
+ [
+ KeyValue::new("http.request.method", "GET"),
+ KeyValue::new("url.scheme", "not_found"),
+ KeyValue::new("error.type", 404),
+ KeyValue::new("http.response.status_code", 404),
+ KeyValue::new("http.route", "testing/metrics/latency"),
+ KeyValue::new("network.protocol.name", "http"),
+ KeyValue::new("network.protocol.version", 2),
+ KeyValue::new("server.address", "example.com"),
+ KeyValue::new("server.port", 8080),
+ ]
+ });
+ println!("*** inserts, using {threads_count} threads ***");
+ measure_update_latency(&format!("1 attrib"), threads_count, 1500, |i, j| {
+ [KeyValue::new(format!("some_key{i}"), j as i64)]
+ });
+
+ measure_update_latency(&format!("10 attribs"), threads_count, 1500, |i, j| {
+ [
+ KeyValue::new(format!("random{i}"), j as i64),
+ KeyValue::new("http.request.method", "GET"),
+ KeyValue::new("url.scheme", "not_found"),
+ KeyValue::new("error.type", 404),
+ KeyValue::new("http.response.status_code", 404),
+ KeyValue::new("http.route", "testing/metrics/latency"),
+ KeyValue::new("network.protocol.name", "http"),
+ KeyValue::new("network.protocol.version", 2),
+ KeyValue::new("server.address", "example.com"),
+ KeyValue::new("server.port", 8080),
+ ]
+ });
+ println!("*** mix mostly updates (~10% inserts), using {threads_count} threads ***");
+ measure_update_latency(&format!("10 attribs"), threads_count, 10000, |_i, j| {
+ let randomness: i64 = 20
+ - CURRENT_RNG.with(|rng| {
+ let mut rng = rng.borrow_mut();
+ rng.gen_range(0..20)
+ });
+ [
+ KeyValue::new("random", (j / 10) as i64 + randomness),
+ KeyValue::new("http.request.method", "GET"),
+ KeyValue::new("url.scheme", "not_found"),
+ KeyValue::new("error.type", 404),
+ KeyValue::new("http.response.status_code", 404),
+ KeyValue::new("http.route", "testing/metrics/latency"),
+ KeyValue::new("network.protocol.name", "http"),
+ KeyValue::new("network.protocol.version", 2),
+ KeyValue::new("server.address", "example.com"),
+ KeyValue::new("server.port", 8080),
+ ]
+ });
+ }
+}
+
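+// Spawns `threads_count` worker threads that repeatedly call `add` with attributes from
+// `attribs(thread_idx, iter_idx)` and record each call's latency, while the main thread
+// collects at a cadence tuned so each cycle gathers roughly `collect_around` measurements.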
+fn measure_update_latency<const N: usize>(
+ msg: &str,
+ threads_count: usize,
+ collect_around: u64,
+ attribs: fn(usize, u64) -> [KeyValue; N],
+) {
+ let reader = SharedReader::new(
+ ManualReader::builder()
+ .with_temporality(Temporality::Delta)
+ .build(),
+ );
+ let provider = SdkMeterProvider::builder()
+ .with_reader(reader.clone())
+ .build();
+ let histogram = provider.meter("test").u64_counter("hello").build();
+ let mut threads = Vec::new();
+ let mut stats = Vec::new();
+ stats.resize_with(threads_count, || {
+ Arc::new(Mutex::new(HashMap::<u64, u64>::new()))
+ });
+ let total_iterations = Arc::new(AtomicU64::new(0));
+ let iterate_flag = Arc::new(AtomicBool::new(true));
+ // run multiple threads and measure how long each metric update takes
+ for thread_idx in 0..threads_count {
+ let hist = histogram.clone();
+ let stat = stats[thread_idx].clone();
+ let iterate_flag = iterate_flag.clone();
+ let total_iterations = total_iterations.clone();
+ threads.push(std::thread::spawn(move || {
+ let mut stat = stat.lock().unwrap();
+ let mut iter_idx = 0;
+ while iterate_flag.load(Ordering::Acquire) {
+ let kv = attribs(thread_idx, iter_idx);
+ let start = Instant::now();
+ hist.add(1, &kv);
+ let curr = stat.entry(start.elapsed().as_nanos() as u64).or_default();
+ *curr += 1;
+ iter_idx += 1;
+ }
+ total_iterations.fetch_add(iter_idx, Ordering::AcqRel);
+ }));
+ }
+
+ let total_measurements = collect_measurements(reader, threads, iterate_flag, collect_around);
+
+ assert_eq!(total_measurements, total_iterations.load(Ordering::Acquire));
+
+ print_stats(msg, stats, total_measurements);
+}
+
+fn collect_measurements(
+ reader: SharedReader,
+ threads: Vec<JoinHandle<()>>,
+ iterate_flag: Arc<AtomicBool>,
+ collect_around: u64,
+) -> u64 {
+ let start = Instant::now();
+ let mut total_count = 0;
+ let mut wait_for_next_collect = Duration::from_micros(500);
+ while start.elapsed() < Duration::from_secs(1) {
+ sleep(wait_for_next_collect);
+ let collected = collect_and_return_count(&reader);
+ // adapt the wait interval so the next collect cycle gathers close to `collect_around`
+ // measurements; the ratio is clamped so the interval at most halves or doubles per cycle
+ let ratio = collected as f64 / collect_around as f64;
+ let clamped = if ratio > 2.0 {
+ 2.0
+ } else if ratio < 0.5 {
+ 0.5
+ } else {
+ ratio
+ };
+ wait_for_next_collect =
+ Duration::from_micros((wait_for_next_collect.as_micros() as f64 / clamped) as u64);
+ total_count += collected;
+ }
+ iterate_flag.store(false, Ordering::Release);
+ threads.into_iter().for_each(|t| {
+ t.join().unwrap();
+ });
+ total_count += collect_and_return_count(&reader);
+ total_count
+}
+
+fn print_stats(msg: &str, stats: Vec<Arc<Mutex<HashMap<u64, u64>>>>, total_measurements: u64) {
+ let stats = stats
+ .into_iter()
+ .map(|s| Arc::into_inner(s).unwrap().into_inner().unwrap())
+ .flat_map(|s| s.into_iter())
+ .fold(BTreeMap::<u64, u64>::default(), |mut acc, (time, count)| {
+ *acc.entry(time).or_default() += count;
+ acc
+ });
+
+ let sum = stats.iter().fold(0, |mut acc, (&time, &count)| {
+ acc += time * count;
+ acc
+ });
+
+ println!("{msg}");
+ println!("\titer {}", format_count(total_measurements));
+ println!(
+ "\tp50 {}",
+ format_time(get_percentile_value(total_measurements, &stats, 50))
+ );
+ println!(
+ "\tp95 {}",
+ format_time(get_percentile_value(total_measurements, &stats, 95))
+ );
+ println!(
+ "\tp99 {}",
+ format_time(get_percentile_value(total_measurements, &stats, 99))
+ );
+ println!("\tavg {}", format_time(sum / total_measurements as u64));
+ println!("\tbest {}", format_time(*stats.iter().next().unwrap().0));
+ println!("\tworst {}", format_time(*stats.iter().last().unwrap().0));
+}
+
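+// Collects from the delta-temporality reader and sums the u64 counter data points,
+// i.e. the number of `add` calls recorded since the previous collect.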
+fn collect_and_return_count(reader: &SharedReader) -> u64 {
+ let mut rm = ResourceMetrics {
+ resource: Resource::empty(),
+ scope_metrics: Vec::new(),
+ };
+ reader.collect(&mut rm).unwrap();
+ rm.scope_metrics
+ .into_iter()
+ .flat_map(|sm| sm.metrics.into_iter())
+ .flat_map(|m| {
+ m.data
+ .as_any()
+ .downcast_ref::<data::Sum<u64>>()
+ .unwrap()
+ .data_points
+ .clone()
+ .into_iter()
+ })
+ .map(|dp| dp.value)
+ .sum()
+}
+
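+// Walks the merged (nanoseconds -> count) histogram in order and linearly interpolates
+// between adjacent entries to estimate the requested percentile.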
+fn get_percentile_value(
+ total_measurements: u64,
+ stats: &BTreeMap<u64, u64>,
+ percentile: u64,
+) -> u64 {
+ assert!(percentile > 0 && percentile < 100);
+ let break_point = ((total_measurements as f64 * percentile as f64) / 100.0) as u64;
+ let mut iter = stats.iter().peekable();
+ let mut sum = 0;
+ while let Some(left) = iter.next() {
+ sum += left.1;
+ if let Some(&right) = iter.peek() {
+ let next_sum = sum + right.1;
+ if next_sum > break_point {
+ // interpolate
+ let diff = (next_sum - sum) as f32;
+ let ratio = (break_point - sum) as f32 / diff;
+ let time_diff = (right.0 - left.0) as f32;
+ return *left.0 + (time_diff * ratio) as u64;
+ }
+ }
+ }
+ 0
+}
+
+fn format_count(count: u64) -> String {
+ let count = count as f64;
+ let (val, symbol) = if count > 1000000.0 {
+ (count / 1000000.0, "M")
+ } else if count > 1000.0 {
+ (count / 1000.0, "K")
+ } else {
+ (count, "")
+ };
+ if val > 100.0 {
+ format!("{val:>5.1}{symbol}")
+ } else if val > 10.0 {
+ format!("{val:>5.2}{symbol}")
+ } else {
+ format!("{val:>5.3}{symbol}")
+ }
+}
+
+fn format_time(nanos: u64) -> String {
+ let nanos = nanos as f64;
+ let (val, symbol) = if nanos > 1000000.0 {
+ (nanos / 1000000.0, "ms")
+ } else if nanos > 1000.0 {
+ (nanos / 1000.0, "μs")
+ } else {
+ (nanos, "ns")
+ };
+ if val > 100.0 {
+ format!("{val:>5.1}{symbol}")
+ } else if val > 10.0 {
+ format!("{val:>5.2}{symbol}")
+ } else {
+ format!("{val:>5.3}{symbol}")
+ }
+}
+
+#[test]
+fn test_format_time() {
+ assert_eq!("12.00ns", format_time(12));
+ assert_eq!("123.0ns", format_time(123));
+ assert_eq!("1.234μs", format_time(1234));
+ assert_eq!("12.35μs", format_time(12349));
+ assert_eq!("123.4μs", format_time(123400));
+ assert_eq!("1.235ms", format_time(1234900));
+ assert_eq!("12.34ms", format_time(12340000));
+}