Skip to content

Commit

Permalink
fix: EXC-1765: Fix canister_age metric (#3443)
Browse files Browse the repository at this point in the history
The `scheduler_canister_age_rounds` metric tracks the difference between
the current round and the round in which the canister last fully
executed. By default, this last executed round is set to zero for newly
created canisters. Consequently, the `canister_age` metric for these new
canisters abruptly increases to the current round, which can be in the
billions.

This PR addresses this issue by excluding canisters with a zero 'last
executed round' from the `canister_age` metric observations.

Furthermore, this PR increases the upper bound of the metric's range to
5000 rounds.
  • Loading branch information
berestovskyy authored Jan 16, 2025
1 parent 6b3c844 commit 2f6fcec
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 36 deletions.
39 changes: 21 additions & 18 deletions rs/execution_environment/src/scheduler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -603,24 +603,27 @@ impl SchedulerImpl {
// are not polluted by canisters that haven't had any messages for a long time.
for canister_id in &round_filtered_canisters.active_canister_ids {
let canister_state = state.canister_state(canister_id).unwrap();
let canister_age = current_round.get()
- canister_state
.scheduler_state
.last_full_execution_round
.get();
self.metrics.canister_age.observe(canister_age as f64);
// If `canister_age` > 1 / `compute_allocation` the canister ought to have been
// scheduled.
let allocation = Ratio::new(
canister_state
.scheduler_state
.compute_allocation
.as_percent(),
100,
);
if *allocation.numer() > 0 && Ratio::from_integer(canister_age) > allocation.recip() {
self.metrics.canister_compute_allocation_violation.inc();
}
// Newly created canisters have `last_full_execution_round` set to zero,
// and hence skew the `canister_age` metric.
let last_full_execution_round =
canister_state.scheduler_state.last_full_execution_round;
if last_full_execution_round.get() != 0 {
let canister_age = current_round.get() - last_full_execution_round.get();
self.metrics.canister_age.observe(canister_age as f64);
// If `canister_age` > 1 / `compute_allocation` the canister ought to have been
// scheduled.
let allocation = Ratio::new(
canister_state
.scheduler_state
.compute_allocation
.as_percent(),
100,
);
if *allocation.numer() > 0 && Ratio::from_integer(canister_age) > allocation.recip()
{
self.metrics.canister_compute_allocation_violation.inc();
}
};
}

for (message_id, status) in ingress_execution_results {
Expand Down
4 changes: 2 additions & 2 deletions rs/execution_environment/src/scheduler/scheduler_metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ impl SchedulerMetrics {
canister_age: metrics_registry.histogram(
"scheduler_canister_age_rounds",
"Number of rounds for which a canister was not scheduled.",
// 1, 2, 5, …, 100, 200, 500
decimal_buckets(0, 2),
// 1, 2, 5, …, 1000, 2000, 5000
decimal_buckets(0, 3),
),
canister_compute_allocation_violation: metrics_registry.int_counter(
"scheduler_compute_allocation_violations",
Expand Down
47 changes: 31 additions & 16 deletions rs/execution_environment/src/scheduler/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2685,21 +2685,23 @@ fn can_record_metrics_single_scheduler_thread() {

#[test]
fn can_record_metrics_for_a_round() {
let num_canisters = 3;
let scheduler_cores = num_canisters as usize - 1;
let instructions = 5;
let mut test = SchedulerTestBuilder::new()
.with_scheduler_config(SchedulerConfig {
scheduler_cores: 2,
max_instructions_per_round: NumInstructions::from(25),
max_instructions_per_message: NumInstructions::from(5),
max_instructions_per_message_without_dts: NumInstructions::new(5),
max_instructions_per_slice: NumInstructions::from(5),
scheduler_cores,
max_instructions_per_round: NumInstructions::from(instructions * 2),
max_instructions_per_message: NumInstructions::from(instructions),
max_instructions_per_message_without_dts: NumInstructions::new(instructions),
max_instructions_per_slice: NumInstructions::from(instructions),
instruction_overhead_per_execution: NumInstructions::from(0),
instruction_overhead_per_canister: NumInstructions::from(0),
instruction_overhead_per_canister_for_finalization: NumInstructions::from(0),
..SchedulerConfig::application_subnet()
})
.build();

let num_canisters = 3;
// The first two canisters have an `Allocation` of 45% and the last 9%. We'll be
// forced to execute the first two and then run out of instructions (based on
// the limits) which will result in a violation of third canister's
Expand All @@ -2715,13 +2717,10 @@ fn can_record_metrics_for_a_round() {
None,
);
for _ in 0..5 {
test.send_ingress(canister, ingress(5));
test.send_ingress(canister, ingress(instructions));
}
}

// For allocation violation to happen, the canister age should be more than `100/9 = 11 rounds`
test.advance_to_round(ExecutionRound::from(12));

for canister in test.state_mut().canister_states.values_mut() {
canister.scheduler_state.time_of_last_allocation_charge =
UNIX_EPOCH + Duration::from_secs(1);
Expand All @@ -2744,10 +2743,11 @@ fn can_record_metrics_for_a_round() {

let metrics = &test.scheduler().metrics;
assert_eq!(
metrics.executable_canisters_per_round.get_sample_sum() as i64,
3
metrics.executable_canisters_per_round.get_sample_sum() as u64,
num_canisters
);
assert_eq!(metrics.canister_age.get_sample_sum() as i64, 12);
// The canister age metric is not observed for newly created canisters.
assert_eq!(metrics.canister_age.get_sample_sum() as i64, 0);
assert_eq!(metrics.round_preparation_duration.get_sample_count(), 1);
assert_eq!(metrics.round_preparation_ingress.get_sample_count(), 1);
assert_eq!(metrics.round_scheduling_duration.get_sample_count(), 1);
Expand All @@ -2762,23 +2762,38 @@ fn can_record_metrics_for_a_round() {
);
assert_eq!(metrics.round_finalization_ingress.get_sample_count(), 1);
assert_eq!(metrics.round_finalization_charge.get_sample_count(), 1);
assert_eq!(metrics.canister_compute_allocation_violation.get(), 1);
// Compute allocation violation is not observed for newly created canisters.
assert_eq!(metrics.canister_compute_allocation_violation.get(), 0);
assert_eq!(
metrics.canister_messages_where_cycles_were_charged.get(),
10
scheduler_cores as u64 * 2
);

assert_eq!(
test.state()
.metadata
.subnet_metrics
.update_transactions_total,
10
scheduler_cores as u64 * 2
);
assert_eq!(
test.state().metadata.subnet_metrics.num_canisters,
num_canisters
);

// Bump up the round number.
test.execute_round(ExecutionRoundType::OrdinaryRound);

// For allocation violation to happen, the canister age should be more than `100/9 = 11 rounds`
// plus 2 rounds already executed.
test.advance_to_round(ExecutionRound::from(11 + 2));
test.execute_round(ExecutionRoundType::OrdinaryRound);

let metrics = &test.scheduler().metrics;
// The canister age metric should be observed now.
assert_eq!(metrics.canister_age.get_sample_sum() as i64, 12);
// Compute allocation violation should also be observed now.
assert_eq!(metrics.canister_compute_allocation_violation.get(), 1);
}

/// Check that when a canister is scheduled and can't prepay for execution, the
Expand Down

0 comments on commit 2f6fcec

Please sign in to comment.