Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alloy-Mixin: allow k8s cluster and alloy cluster disable, add logs dashboard #808

Merged
merged 6 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions operations/alloy-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
@@ -1,9 +1,20 @@
local clusterAlerts = (import './alerts/clustering.libsonnet');
local controllerAlerts = (import './alerts/controller.libsonnet');
local openTelemetryAlerts = (import './alerts/opentelemetry.libsonnet');

{
local alloyClusterAlerts = [clusterAlerts.newAlloyClusterAlertsGroup($._config.enableK8sCluster)],

local otherAlerts = [
controllerAlerts.newControllerAlertsGroup($._config.enableK8sCluster),
openTelemetryAlerts.newOpenTelemetryAlertsGroup($._config.enableK8sCluster)
],

prometheusAlerts+: {
groups+: [
(import './alerts/clustering.libsonnet'),
(import './alerts/controller.libsonnet'),
(import './alerts/opentelemetry.libsonnet'),
],
groups+:
if $._config.enableAlloyCluster then
alloyClusterAlerts + otherAlerts
else
otherAlerts
},
}
144 changes: 84 additions & 60 deletions operations/alloy-mixin/alerts/clustering.libsonnet
Original file line number Diff line number Diff line change
@@ -1,67 +1,91 @@
local alert = import './utils/alert.jsonnet';

alert.newGroup(
'alloy_clustering',
[
// Cluster not converging.
alert.newRule(
'ClusterNotConverging',
'stddev by (cluster, namespace) (sum without (state) (cluster_node_peers)) != 0',
'Cluster is not converging: nodes report different number of peers in the cluster.',
'10m',
),
{
newAlloyClusterAlertsGroup(enableK8sCluster=true)::
alert.newGroup(
'alloy_clustering',
[
// Cluster not converging.
alert.newRule(
'ClusterNotConverging',
if enableK8sCluster then
'stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) != 0'
else
'stddev by (job) (sum without (state) (cluster_node_peers)) != 0',
'Cluster is not converging: nodes report different number of peers in the cluster.',
'10m',
),

alert.newRule(
'ClusterNodeCountMismatch',
// Assert that the number of known peers (regardless of state) reported by each
// Alloy instance matches the number of running Alloy instances in the
// same cluster and namespace as reported by a count of Prometheus
// metrics.
|||
sum without (state) (cluster_node_peers) !=
on (cluster, namespace, job) group_left
count by (cluster, namespace, job) (cluster_node_info)
|||,
'Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.',
'15m',
),
alert.newRule(
'ClusterNodeCountMismatch',
// Assert that the number of known peers (regardless of state) reported by each
// Alloy instance matches the number of running Alloy instances in the
// same cluster and namespace as reported by a count of Prometheus
// metrics.
if enableK8sCluster then |||
sum without (state) (cluster_node_peers) !=
on (cluster, namespace, job) group_left
count by (cluster, namespace, job) (cluster_node_info)
||| else |||
sum without (state) (cluster_node_peers) !=
on (job) group_left
count by (job) (cluster_node_info)
|||
,
'Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.',
'15m',
),

// Nodes health score is not zero.
alert.newRule(
'ClusterNodeUnhealthy',
|||
cluster_node_gossip_health_score > 0
|||,
'Cluster node is reporting a gossip protocol health score > 0.',
'10m',
),
// Nodes health score is not zero.
alert.newRule(
'ClusterNodeUnhealthy',
|||
cluster_node_gossip_health_score > 0
|||,
'Cluster node is reporting a gossip protocol health score > 0.',
'10m',
),

// Node tried to join the cluster with an already-present node name.
alert.newRule(
'ClusterNodeNameConflict',
'sum by (cluster, namespace) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0',
'A node tried to join the cluster with a name conflicting with an existing peer.',
'10m',
),
// Node tried to join the cluster with an already-present node name.
alert.newRule(
'ClusterNodeNameConflict',
if enableK8sCluster then
'sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0'
else
'sum by (job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0'
,
'A node tried to join the cluster with a name conflicting with an existing peer.',
'10m',
),

// Node stuck in Terminating state.
alert.newRule(
'ClusterNodeStuckTerminating',
'sum by (cluster, namespace, instance) (cluster_node_peers{state="terminating"}) > 0',
'Cluster node stuck in Terminating state.',
'10m',
),
// Node stuck in Terminating state.
alert.newRule(
'ClusterNodeStuckTerminating',
if enableK8sCluster then
'sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) > 0'
else
'sum by (job, instance) (cluster_node_peers{state="terminating"}) > 0'
,
'Cluster node stuck in Terminating state.',
'10m',
),

// Nodes are not using the same configuration file.
alert.newRule(
'ClusterConfigurationDrift',
|||
count without (sha256) (
max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace) cluster_node_info)
) > 1
|||,
'Cluster nodes are not using the same configuration file.',
'5m',
),
]
)
// Nodes are not using the same configuration file.
alert.newRule(
'ClusterConfigurationDrift',
if enableK8sCluster then |||
count without (sha256) (
max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info)
) > 1
||| else |||
count without (sha256) (
max by (sha256, job) (alloy_config_hash and on(job) cluster_node_info)
) > 1
|||
,
'Cluster nodes are not using the same configuration file.',
'5m',
),
]
)
}
49 changes: 30 additions & 19 deletions operations/alloy-mixin/alerts/controller.libsonnet
Original file line number Diff line number Diff line change
@@ -1,22 +1,33 @@
local alert = import './utils/alert.jsonnet';

alert.newGroup(
'alloy_controller',
[
// Component evaluations are taking too long, which can lead to e.g. stale targets.
alert.newRule(
'SlowComponentEvaluations',
'sum by (cluster, namespace, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0',
'Component evaluations are taking too long.',
'15m',
),
{
newControllerAlertsGroup(enableK8sCluster=true):
alert.newGroup(
'alloy_controller',
[
// Component evaluations are taking too long, which can lead to e.g. stale targets.
alert.newRule(
'SlowComponentEvaluations',
if enableK8sCluster then
'sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0'
else
'sum by (job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0'
,
'Component evaluations are taking too long.',
'15m',
),

// Unhealthy components detected.
alert.newRule(
'UnhealthyComponents',
'sum by (cluster, namespace) (alloy_component_controller_running_components{health_type!="healthy"}) > 0',
'Unhealthy components detected.',
'15m',
),
]
)
// Unhealthy components detected.
alert.newRule(
'UnhealthyComponents',
if enableK8sCluster then
'sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0'
else
'sum by (job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0'
,
'Unhealthy components detected.',
'15m',
),
]
)
}
55 changes: 33 additions & 22 deletions operations/alloy-mixin/alerts/opentelemetry.libsonnet
Original file line number Diff line number Diff line change
@@ -1,25 +1,36 @@
local alert = import './utils/alert.jsonnet';

alert.newGroup(
'alloy_otelcol',
[
// An otelcol.receiver component could not push some spans to the pipeline.
// This could be due to reaching a limit such as the ones
// imposed by otelcol.processor.memory_limiter.
alert.newRule(
'OtelcolReceiverRefusedSpans',
'sum by (cluster, namespace) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0',
'The receiver could not push some spans to the pipeline.',
'5m',
),
{
newOpenTelemetryAlertsGroup(enableK8sCluster=true):
alert.newGroup(
'alloy_otelcol',
[
// An otelcol.receiver component could not push some spans to the pipeline.
// This could be due to reaching a limit such as the ones
// imposed by otelcol.processor.memory_limiter.
alert.newRule(
'OtelcolReceiverRefusedSpans',
if enableK8sCluster then
'sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0'
else
'sum by (job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0'
,
'The receiver could not push some spans to the pipeline.',
'5m',
),

// The exporter failed to send spans to their destination.
// There could be an issue with the payload or with the destination endpoint.
alert.newRule(
'OtelcolExporterFailedSpans',
'sum by (cluster, namespace) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0',
'The exporter failed to send spans to their destination.',
'5m',
),
]
)
// The exporter failed to send spans to their destination.
// There could be an issue with the payload or with the destination endpoint.
alert.newRule(
'OtelcolExporterFailedSpans',
if enableK8sCluster then
'sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0'
else
'sum by (job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0'
,
'The exporter failed to send spans to their destination.',
'5m',
),
]
)
}
12 changes: 12 additions & 0 deletions operations/alloy-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Default configuration for the Alloy mixin.
//
// `_config` is declared with `+::`, so it is a hidden field that is merged
// into any `_config` already present on the object this file is added to.
// The derived selectors below read their inputs through `self`, so overriding
// one of the inputs also changes the values computed from it.
{
_config+:: {
// When true, the alert expressions aggregate by `cluster` and `namespace` in
// addition to `job`, and the logs dashboard exposes `cluster` and `namespace`
// as labels. When false, only `job`-level (and `instance`) labels are used.
enableK8sCluster: true,
// When true, the clustering alert group and the cluster-node /
// cluster-overview dashboards are included; when false they are left out.
enableAlloyCluster: true,
// When true, the 'alloy-logs.json' dashboard is generated; when false the
// logs dashboard file contributes an empty object instead.
enableLokiLogs: true,
// Label matcher on `job`. It is passed as `filterSelector` to the logs
// dashboard library. NOTE(review): `$job` looks like a Grafana dashboard
// template variable — confirm against the dashboard definitions.
filterSelector: 'job=~"$job"',
// `filterSelector`, prefixed with `k8sClusterSelector` when
// `enableK8sCluster` is true.
groupSelector: if self.enableK8sCluster then self.k8sClusterSelector + ', ' + self.filterSelector else self.filterSelector,
// `groupSelector` narrowed further by an `instance` matcher.
instanceSelector: self.groupSelector + ', instance=~"$instance"',
// Label matchers on `cluster` and `namespace`; only used by `groupSelector`
// in this file, and only when `enableK8sCluster` is true.
k8sClusterSelector: 'cluster=~"$cluster", namespace=~"$namespace"',
// NOTE(review): presumably the tag applied to the generated dashboards; its
// consumers are not visible in this view — confirm.
dashboardTag: 'alloy-mixin'
}
}
28 changes: 20 additions & 8 deletions operations/alloy-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
{
grafanaDashboards+:
(import './dashboards/controller.libsonnet') +
(import './dashboards/resources.libsonnet') +
(import './dashboards/prometheus.libsonnet') +
(import './dashboards/cluster-node.libsonnet') +
(import './dashboards/opentelemetry.libsonnet') +
(import './dashboards/cluster-overview.libsonnet'),
local alloyClusterDashboards =
(import './dashboards/cluster-node.libsonnet') +
(import './dashboards/cluster-overview.libsonnet') +
(import './config.libsonnet');

local otherDashboards =
(import './dashboards/resources.libsonnet') +
(import './dashboards/controller.libsonnet') +
(import './dashboards/prometheus.libsonnet') +
(import './dashboards/opentelemetry.libsonnet') +
(import './config.libsonnet');

(import './dashboards/alloy-logs.libsonnet') +
{
grafanaDashboards+:
if $._config.enableAlloyCluster then
alloyClusterDashboards +
otherDashboards
else
otherDashboards
}
35 changes: 35 additions & 0 deletions operations/alloy-mixin/dashboards/alloy-logs.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-v10.0.0/main.libsonnet';
local logsDashboard = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libsonnet';

{

local labels = if $._config.enableK8sCluster then ['cluster', 'namespace', 'job', 'instance', 'level'] else ['job', 'instance', 'level'],

grafanaDashboards+:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't seem to follow the pattern that other dashboards do; this is the only one with a grafanaDashboards top-level key. Can we change this to be consistent with the other dashboards?

That would also mean moving the logic for whether to enable the dashboard outside of the definition of the dashboard itself, which would be more consistent with how the clustering dashboards are made optional.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is because the library I am using expects a grafanaDashboards variable.
I also don't like how it looks different, but since this is a totally different library which is synced with the latest version of grafonnet, I believe it is an acceptable trade off.

if $._config.enableLokiLogs then {
local alloyLogs =
logsDashboard.new(
'Alloy logs overview',
datasourceName='loki_datasource',
datasourceRegex='',
filterSelector=$._config.filterSelector,
labels=labels,
formatParser=null,
showLogsVolume=true
)
{
panels+:
{
logs+:
// Alloy logs already have timestamp
g.panel.logs.options.withShowTime(false),
},
dashboards+:
{
logs+: g.dashboard.withLinksMixin($.grafanaDashboards['alloy-resources.json'].links)
+ g.dashboard.withRefresh('10s'),
},
},
'alloy-logs.json': alloyLogs.dashboards.logs,
} else {},
}
Loading
Loading