Skip to content

Commit

Permalink
including cluster config into alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
gaantunes committed May 20, 2024
1 parent c6f356f commit 73c93b7
Show file tree
Hide file tree
Showing 4 changed files with 169 additions and 109 deletions.
24 changes: 16 additions & 8 deletions operations/alloy-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
local clusterAlerts = (import './alerts/clustering.libsonnet');
local controllerAlerts = (import './alerts/controller.libsonnet');
local openTelemetryAlerts = (import './alerts/opentelemetry.libsonnet');

{
prometheusAlerts+: {
groups+: [
if $._config.enableK8sCluster then
(import './alerts/clustering.libsonnet')
else
{}
+ (import './alerts/controller.libsonnet')
+ (import './alerts/opentelemetry.libsonnet')
],
groups+:
if $._config.enableAlloyCluster then
[
clusterAlerts.newAlloyClusterAlertsGroup($._config.enableK8sCluster),
controllerAlerts.newControllerAlertsGroup($._config.enableK8sCluster),
openTelemetryAlerts.newOpenTelemetryAlertsGroup($._config.enableK8sCluster),
]
else
[
controllerAlerts.newControllerAlertsGroup($._config.enableK8sCluster),
openTelemetryAlerts.newOpenTelemetryAlertsGroup($._config.enableK8sCluster)
],
},
}
150 changes: 90 additions & 60 deletions operations/alloy-mixin/alerts/clustering.libsonnet
Original file line number Diff line number Diff line change
@@ -1,67 +1,97 @@
local alert = import './utils/alert.jsonnet';

alert.newGroup(
'alloy_clustering',
[
// Cluster not converging.
alert.newRule(
'ClusterNotConverging',
'stddev by (cluster, namespace) (sum without (state) (cluster_node_peers)) != 0',
'Cluster is not converging: nodes report different number of peers in the cluster.',
'10m',
),
{
newAlloyClusterAlertsGroup(enableK8sCluster=true)::
alert.newGroup(
'alloy_clustering',
[
// Cluster not converging.
alert.newRule(
'ClusterNotConverging',
if enableK8sCluster then
'stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) != 0'
else
'stddev by (job) (sum without (state) (cluster_node_peers)) != 0',
'Cluster is not converging: nodes report different number of peers in the cluster.',
'10m',
),

alert.newRule(
'ClusterNodeCountMismatch',
// Assert that the number of known peers (regardless of state) reported by each
// Alloy instance matches the number of running Alloy instances in the
// same cluster and namespace as reported by a count of Prometheus
// metrics.
|||
sum without (state) (cluster_node_peers) !=
on (cluster, namespace, job) group_left
count by (cluster, namespace, job) (cluster_node_info)
|||,
'Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.',
'15m',
),
alert.newRule(
'ClusterNodeCountMismatch',
// Assert that the number of known peers (regardless of state) reported by each
// Alloy instance matches the number of running Alloy instances in the
// same cluster and namespace as reported by a count of Prometheus
// metrics.
if enableK8sCluster then
|||
sum without (state) (cluster_node_peers) !=
on (cluster, namespace, job) group_left
count by (cluster, namespace, job) (cluster_node_info)
|||
else
|||
sum without (state) (cluster_node_peers) !=
on (job) group_left
count by (job) (cluster_node_info)
|||
,
'Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.',
'15m',
),

// Nodes health score is not zero.
alert.newRule(
'ClusterNodeUnhealthy',
|||
cluster_node_gossip_health_score > 0
|||,
'Cluster node is reporting a gossip protocol health score > 0.',
'10m',
),
// Nodes health score is not zero.
alert.newRule(
'ClusterNodeUnhealthy',
|||
cluster_node_gossip_health_score > 0
|||,
'Cluster node is reporting a gossip protocol health score > 0.',
'10m',
),

// Node tried to join the cluster with an already-present node name.
alert.newRule(
'ClusterNodeNameConflict',
'sum by (cluster, namespace) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0',
'A node tried to join the cluster with a name conflicting with an existing peer.',
'10m',
),
// Node tried to join the cluster with an already-present node name.
alert.newRule(
'ClusterNodeNameConflict',
if enableK8sCluster then
'sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0'
else
'sum by (job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0'
,
'A node tried to join the cluster with a name conflicting with an existing peer.',
'10m',
),

// Node stuck in Terminating state.
alert.newRule(
'ClusterNodeStuckTerminating',
'sum by (cluster, namespace, instance) (cluster_node_peers{state="terminating"}) > 0',
'Cluster node stuck in Terminating state.',
'10m',
),
// Node stuck in Terminating state.
alert.newRule(
'ClusterNodeStuckTerminating',
if enableK8sCluster then
'sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) > 0'
else
'sum by (job, instance) (cluster_node_peers{state="terminating"}) > 0'
,
'Cluster node stuck in Terminating state.',
'10m',
),

// Nodes are not using the same configuration file.
alert.newRule(
'ClusterConfigurationDrift',
|||
count without (sha256) (
max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace) cluster_node_info)
) > 1
|||,
'Cluster nodes are not using the same configuration file.',
'5m',
),
]
)
// Nodes are not using the same configuration file.
alert.newRule(
'ClusterConfigurationDrift',
if enableK8sCluster then
|||
count without (sha256) (
max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info)
) > 1
|||
else
|||
count without (sha256) (
max by (sha256, job) (alloy_config_hash and on(job) cluster_node_info)
) > 1
|||
,
'Cluster nodes are not using the same configuration file.',
'5m',
),
]
)
}
49 changes: 30 additions & 19 deletions operations/alloy-mixin/alerts/controller.libsonnet
Original file line number Diff line number Diff line change
@@ -1,22 +1,33 @@
local alert = import './utils/alert.jsonnet';

alert.newGroup(
'alloy_controller',
[
// Component evaluations are taking too long, which can lead to e.g. stale targets.
alert.newRule(
'SlowComponentEvaluations',
'sum by (cluster, namespace, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0',
'Component evaluations are taking too long.',
'15m',
),
{
newControllerAlertsGroup(enableK8sCluster=true):
alert.newGroup(
'alloy_controller',
[
// Component evaluations are taking too long, which can lead to e.g. stale targets.
alert.newRule(
'SlowComponentEvaluations',
if enableK8sCluster then
'sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0'
else
'sum by (job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0'
,
'Component evaluations are taking too long.',
'15m',
),

// Unhealthy components detected.
alert.newRule(
'UnhealthyComponents',
'sum by (cluster, namespace) (alloy_component_controller_running_components{health_type!="healthy"}) > 0',
'Unhealthy components detected.',
'15m',
),
]
)
// Unhealthy components detected.
alert.newRule(
'UnhealthyComponents',
if enableK8sCluster then
'sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0'
else
'sum by (job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0'
,
'Unhealthy components detected.',
'15m',
),
]
)
}
55 changes: 33 additions & 22 deletions operations/alloy-mixin/alerts/opentelemetry.libsonnet
Original file line number Diff line number Diff line change
@@ -1,25 +1,36 @@
local alert = import './utils/alert.jsonnet';

alert.newGroup(
'alloy_otelcol',
[
// An otelcol.receiver component could not push some spans to the pipeline.
// This could be due to reaching a limit such as the ones
// imposed by otelcol.processor.memory_limiter.
alert.newRule(
'OtelcolReceiverRefusedSpans',
'sum by (cluster, namespace) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0',
'The receiver could not push some spans to the pipeline.',
'5m',
),
{
newOpenTelemetryAlertsGroup(enableK8sCluster=true):
alert.newGroup(
'alloy_otelcol',
[
// An otelcol.receiver component could not push some spans to the pipeline.
// This could be due to reaching a limit such as the ones
// imposed by otelcol.processor.memory_limiter.
alert.newRule(
'OtelcolReceiverRefusedSpans',
if enableK8sCluster then
'sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0'
else
'sum by (job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0'
,
'The receiver could not push some spans to the pipeline.',
'5m',
),

// The exporter failed to send spans to their destination.
// There could be an issue with the payload or with the destination endpoint.
alert.newRule(
'OtelcolExporterFailedSpans',
'sum by (cluster, namespace) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0',
'The exporter failed to send spans to their destination.',
'5m',
),
]
)
// The exporter failed to send spans to their destination.
// There could be an issue with the payload or with the destination endpoint.
alert.newRule(
'OtelcolExporterFailedSpans',
if enableK8sCluster then
'sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0'
else
'sum by (job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0'
,
'The exporter failed to send spans to their destination.',
'5m',
),
]
)
}

0 comments on commit 73c93b7

Please sign in to comment.