grafana · rfratto · May 24, 2024 · May 6, 2024 · May 8, 2024 · May 20, 2024
@@ -1,9 +1,25 @@
+local clusterAlerts = (import './alerts/clustering.libsonnet');
+local controllerAlerts = (import './alerts/controller.libsonnet');
+local openTelemetryAlerts = (import './alerts/opentelemetry.libsonnet');
+
 {
+  // Every group constructor below receives $._config.enableK8sCluster.
+  local k8s = $._config.enableK8sCluster,
+
+  // Clustering alerts are only included when $._config.enableAlloyCluster is true.
+  local clusteringGroups =
+    if $._config.enableAlloyCluster then
+      [clusterAlerts.newAlloyClusterAlertsGroup(k8s)]
+    else
+      [],
+
+  // Controller and OpenTelemetry alerts are always included.
+  local baseGroups = [
+    controllerAlerts.newControllerAlertsGroup(k8s),
+    openTelemetryAlerts.newOpenTelemetryAlertsGroup(k8s),
+  ],
+
   prometheusAlerts+: {
-    groups+: [
-      (import './alerts/clustering.libsonnet'),
-      (import './alerts/controller.libsonnet'),
-      (import './alerts/opentelemetry.libsonnet'),
-    ],
+    groups+: clusteringGroups + baseGroups,
   },
 }
@@ -1,67 +1,101 @@
 local alert = import './utils/alert.jsonnet';
 
-alert.newGroup(
-  'alloy_clustering',
-  [
-    // Cluster not converging.
-    alert.newRule(
-      'ClusterNotConverging',
-      'stddev by (cluster, namespace) (sum without (state) (cluster_node_peers)) != 0',
-      'Cluster is not converging: nodes report different number of peers in the cluster.',
-      '10m',
-    ),
+{
+  // Returns the 'alloy_clustering' alert group.
+  // With enableK8sCluster=true, expressions that group or match on labels use
+  // cluster and namespace in addition to job; with false they omit those two.
+  newAlloyClusterAlertsGroup(enableK8sCluster=true)::
+    // Nodes report different numbers of peers.
+    local notConvergingExpr =
+      if enableK8sCluster then
+        'stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) != 0'
+      else
+        'stddev by (job) (sum without (state) (cluster_node_peers)) != 0';
 
-    alert.newRule(
-      'ClusterNodeCountMismatch',
-      // Assert that the number of known peers (regardless of state) reported by each
-      // Alloy instance matches the number of running Alloy instances in the
-      // same cluster and namespace as reported by a count of Prometheus
-      // metrics.
-      |||
-        sum without (state) (cluster_node_peers) !=
-        on (cluster, namespace, job) group_left
-        count by (cluster, namespace, job) (cluster_node_info)
-      |||,
-      'Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.',
-      '15m',
-    ),
+    // The number of known peers (regardless of state) reported by each Alloy
+    // instance is compared with the count of cluster_node_info series that
+    // share the same grouping labels.
+    local nodeCountMismatchExpr =
+      if enableK8sCluster then |||
+        sum without (state) (cluster_node_peers) !=
+        on (cluster, namespace, job) group_left
+        count by (cluster, namespace, job) (cluster_node_info)
+      ||| else |||
+        sum without (state) (cluster_node_peers) !=
+        on (job) group_left
+        count by (job) (cluster_node_info)
+      |||;
 
-    // Nodes health score is not zero.
-    alert.newRule(
-      'ClusterNodeUnhealthy',
-      |||
-        cluster_node_gossip_health_score > 0
-      |||,
-      'Cluster node is reporting a gossip protocol health score > 0.',
-      '10m',
-    ),
+    // Gossip health score is not zero; identical in both modes.
+    local nodeUnhealthyExpr = |||
+      cluster_node_gossip_health_score > 0
+    |||;
 
-    // Node tried to join the cluster with an already-present node name.
-    alert.newRule(
-      'ClusterNodeNameConflict',
-      'sum by (cluster, namespace) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0',
-      'A node tried to join the cluster with a name conflicting with an existing peer.',
-      '10m',
-    ),
+    // A node tried to join the cluster with an already-present node name.
+    local nameConflictExpr =
+      if enableK8sCluster then
+        'sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0'
+      else
+        'sum by (job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0';
 
-    // Node stuck in Terminating state.
-    alert.newRule(
-      'ClusterNodeStuckTerminating',
-      'sum by (cluster, namespace, instance) (cluster_node_peers{state="terminating"}) > 0',
-      'Cluster node stuck in Terminating state.',
-      '10m',
-    ),
+    // A peer is reported in the terminating state.
+    local stuckTerminatingExpr =
+      if enableK8sCluster then
+        'sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) > 0'
+      else
+        'sum by (job, instance) (cluster_node_peers{state="terminating"}) > 0';
 
-    // Nodes are not using the same configuration file.
-    alert.newRule(
-      'ClusterConfigurationDrift',
-      |||
-        count without (sha256) (
-            max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace) cluster_node_info)
-        ) > 1
-      |||,
-      'Cluster nodes are not using the same configuration file.',
-      '5m',
-    ),
-  ]
-)
+    // More than one distinct alloy_config_hash sha256 value within a group.
+    local configDriftExpr =
+      if enableK8sCluster then |||
+        count without (sha256) (
+            max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info)
+        ) > 1
+      ||| else |||
+        count without (sha256) (
+            max by (sha256, job) (alloy_config_hash and on(job) cluster_node_info)
+        ) > 1
+      |||;
+
+    alert.newGroup(
+      'alloy_clustering',
+      [
+        alert.newRule(
+          'ClusterNotConverging',
+          notConvergingExpr,
+          'Cluster is not converging: nodes report different number of peers in the cluster.',
+          '10m',
+        ),
+        alert.newRule(
+          'ClusterNodeCountMismatch',
+          nodeCountMismatchExpr,
+          'Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.',
+          '15m',
+        ),
+        alert.newRule(
+          'ClusterNodeUnhealthy',
+          nodeUnhealthyExpr,
+          'Cluster node is reporting a gossip protocol health score > 0.',
+          '10m',
+        ),
+        alert.newRule(
+          'ClusterNodeNameConflict',
+          nameConflictExpr,
+          'A node tried to join the cluster with a name conflicting with an existing peer.',
+          '10m',
+        ),
+        alert.newRule(
+          'ClusterNodeStuckTerminating',
+          stuckTerminatingExpr,
+          'Cluster node stuck in Terminating state.',
+          '10m',
+        ),
+        alert.newRule(
+          'ClusterConfigurationDrift',
+          configDriftExpr,
+          'Cluster nodes are not using the same configuration file.',
+          '5m',
+        ),
+      ]
+    ),
+}
@@ -1,22 +1,36 @@
 local alert = import './utils/alert.jsonnet';
 
-alert.newGroup(
-  'alloy_controller',
-  [
-    // Component evaluations are taking too long, which can lead to e.g. stale targets.
-    alert.newRule(
-      'SlowComponentEvaluations',
-      'sum by (cluster, namespace, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0',
-      'Component evaluations are taking too long.',
-      '15m',
-    ),
+{
+  // Returns the 'alloy_controller' alert group.
+  // With enableK8sCluster=true the expressions aggregate by cluster, namespace
+  // and job; otherwise cluster and namespace are left out of the grouping.
+  // Declared hidden (`::`), like newAlloyClusterAlertsGroup in
+  // clustering.libsonnet, so the function is never part of manifested output.
+  newControllerAlertsGroup(enableK8sCluster=true)::
+    alert.newGroup(
+      'alloy_controller',
+      [
+        // Component evaluations are taking too long, which can lead to e.g. stale targets.
+        alert.newRule(
+          'SlowComponentEvaluations',
+          if enableK8sCluster then
+            'sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0'
+          else
+            'sum by (job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0',
+          'Component evaluations are taking too long.',
+          '15m',
+        ),
 
-    // Unhealthy components detected.
-    alert.newRule(
-      'UnhealthyComponents',
-      'sum by (cluster, namespace) (alloy_component_controller_running_components{health_type!="healthy"}) > 0',
-      'Unhealthy components detected.',
-      '15m',
-    ),
-  ]
-)
+        // Unhealthy components detected.
+        alert.newRule(
+          'UnhealthyComponents',
+          if enableK8sCluster then
+            'sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0'
+          else
+            'sum by (job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0',
+          'Unhealthy components detected.',
+          '15m',
+        ),
+      ]
+    ),
+}
@@ -1,25 +1,39 @@
 local alert = import './utils/alert.jsonnet';
 
-alert.newGroup(
-  'alloy_otelcol',
-  [
-    // An otelcol.exporter component rcould not push some spans to the pipeline.
-    // This could be due to reaching a limit such as the ones
-    // imposed by otelcol.processor.memory_limiter.
-    alert.newRule(
-      'OtelcolReceiverRefusedSpans',
-      'sum by (cluster, namespace) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0',
-      'The receiver could not push some spans to the pipeline.',
-      '5m',
-    ),
+{
+  // Returns the 'alloy_otelcol' alert group.
+  // With enableK8sCluster=true the expressions aggregate by cluster, namespace
+  // and job; otherwise they aggregate by job only.
+  // Declared hidden (`::`), like newAlloyClusterAlertsGroup in
+  // clustering.libsonnet, so the function is never part of manifested output.
+  newOpenTelemetryAlertsGroup(enableK8sCluster=true)::
+    alert.newGroup(
+      'alloy_otelcol',
+      [
+        // An otelcol.receiver component could not push some spans to the pipeline.
+        // This could be due to reaching a limit such as the ones
+        // imposed by otelcol.processor.memory_limiter.
+        alert.newRule(
+          'OtelcolReceiverRefusedSpans',
+          if enableK8sCluster then
+            'sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0'
+          else
+            'sum by (job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0',
+          'The receiver could not push some spans to the pipeline.',
+          '5m',
+        ),
 
-    // The exporter failed to send spans to their destination.
-    // There could be an issue with the payload or with the destination endpoint.
-    alert.newRule(
-      'OtelcolExporterFailedSpans',
-      'sum by (cluster, namespace) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0',
-      'The exporter failed to send spans to their destination.',
-      '5m',
-    ),
-  ]
-)
+        // The exporter failed to send spans to their destination.
+        // There could be an issue with the payload or with the destination endpoint.
+        alert.newRule(
+          'OtelcolExporterFailedSpans',
+          if enableK8sCluster then
+            'sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0'
+          else
+            'sum by (job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0',
+          'The exporter failed to send spans to their destination.',
+          '5m',
+        ),
+      ]
+    ),
+}
@@ -0,0 +1,22 @@
+{
+  // Mixin-wide settings; the field is hidden (`::`) so it is not manifested.
+  _config+:: {
+    // Feature toggles read elsewhere in the mixin via $._config.
+    enableK8sCluster: true,
+    enableAlloyCluster: true,
+    enableLokiLogs: true,
+
+    // NOTE(review): presumably the tag applied to the dashboards; usage is not visible here.
+    dashboardTag: 'alloy-mixin',
+
+    // Label selectors, from least to most specific.
+    k8sClusterSelector: 'cluster=~"$cluster", namespace=~"$namespace"',
+    filterSelector: 'job=~"$job"',
+    groupSelector:
+      if self.enableK8sCluster then
+        self.k8sClusterSelector + ', ' + self.filterSelector
+      else
+        self.filterSelector,
+    instanceSelector: self.groupSelector + ', instance=~"$instance"',
+  },
+}
@@ -1,9 +1,23 @@
-{
-  grafanaDashboards+:
-    (import './dashboards/controller.libsonnet') +
-    (import './dashboards/resources.libsonnet') +
-    (import './dashboards/prometheus.libsonnet') +
-    (import './dashboards/cluster-node.libsonnet') +
-    (import './dashboards/opentelemetry.libsonnet') +
-    (import './dashboards/cluster-overview.libsonnet'),
+// NOTE(review): merged into both dashboard sets below, so they carry their own copy of the config; confirm that overrides of the top-level _config reach them.
+local config = import './config.libsonnet';
+
+// Dashboards included only when $._config.enableAlloyCluster is true.
+local alloyClusterDashboards =
+  (import './dashboards/cluster-node.libsonnet') +
+  (import './dashboards/cluster-overview.libsonnet') +
+  config;
+
+// Dashboards that are always included.
+local otherDashboards =
+  (import './dashboards/resources.libsonnet') +
+  (import './dashboards/controller.libsonnet') +
+  (import './dashboards/prometheus.libsonnet') +
+  (import './dashboards/opentelemetry.libsonnet') +
+  config;
+
+(import './dashboards/alloy-logs.libsonnet') +
+{
+  grafanaDashboards+:
+    (if $._config.enableAlloyCluster then alloyClusterDashboards else {}) +
+    otherDashboards,
 }
@@ -0,0 +1,38 @@
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-v10.0.0/main.libsonnet';
+local logsDashboard = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libsonnet';
+
+{
+  // Labels passed to the logs dashboard; cluster and namespace are only
+  // included when $._config.enableK8sCluster is true.
+  local labels =
+    (if $._config.enableK8sCluster then ['cluster', 'namespace'] else [])
+    + ['job', 'instance', 'level'],
+
+  // Adds 'alloy-logs.json' only when $._config.enableLokiLogs is true.
+  grafanaDashboards+:
+    if !$._config.enableLokiLogs then {} else {
+      local alloyLogs =
+        logsDashboard.new(
+          'Alloy logs overview',
+          datasourceName='loki_datasource',
+          datasourceRegex='',
+          filterSelector=$._config.filterSelector,
+          labels=labels,
+          formatParser=null,
+          showLogsVolume=true
+        )
+        {
+          panels+: {
+            // Alloy logs already have timestamp
+            logs+: g.panel.logs.options.withShowTime(false),
+          },
+          dashboards+: {
+            // Mix in the links of the 'alloy-resources.json' dashboard and set refresh to '10s'.
+            logs+:
+              g.dashboard.withLinksMixin($.grafanaDashboards['alloy-resources.json'].links)
+              + g.dashboard.withRefresh('10s'),
+          },
+        },
+      'alloy-logs.json': alloyLogs.dashboards.logs,
+    },
+}