From b7acf3f5ded9fbb5ada321c19b24be5bca804f2c Mon Sep 17 00:00:00 2001 From: Piotr <17101802+thampiotr@users.noreply.github.com> Date: Wed, 26 Feb 2025 17:54:20 +0000 Subject: [PATCH] mixin: otel metrics and logs support (#2832) * mixin: otel metrics and logs support * otel dashbaord tweaks * otel dashbaords * otel dashbaords * otel dashbaords # Conflicts: # CHANGELOG.md # Conflicts: # CHANGELOG.md * otel dashbaords --- CHANGELOG.md | 2 + example/grafana.yaml | 2 + .../alerts/opentelemetry.libsonnet | 54 +++- .../dashboards/opentelemetry.libsonnet | 237 +++++++++++++++--- operations/alloy-mixin/grizzly.jsonnet | 13 +- 5 files changed, 257 insertions(+), 51 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d44f61be3e..dabff1f792 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ Main (unreleased) - Added additional backwards compatibility metrics to `prometheus.write.queue`. (@mattdurham) +- Added OpenTelemetry logs and metrics support to Alloy mixin's dashboards and alerts. (@thampiotr) + v1.7.1 ----------------- diff --git a/example/grafana.yaml b/example/grafana.yaml index 9d2f7411ee..b63c169bc1 100644 --- a/example/grafana.yaml +++ b/example/grafana.yaml @@ -54,6 +54,8 @@ services: condition: service_completed_successfully environment: - GRAFANA_URL=http://grafana:3000 + - MIMIR_ADDRESS=http://mimir:9009 + - MIMIR_TENANT_ID=fake volumes: - ../operations/alloy-mixin:/etc/alloy-mixin working_dir: /etc/alloy-mixin diff --git a/operations/alloy-mixin/alerts/opentelemetry.libsonnet b/operations/alloy-mixin/alerts/opentelemetry.libsonnet index 48338477ce..63dbb98316 100644 --- a/operations/alloy-mixin/alerts/opentelemetry.libsonnet +++ b/operations/alloy-mixin/alerts/opentelemetry.libsonnet @@ -1,6 +1,10 @@ local alert = import './utils/alert.jsonnet'; { + local successThreshold = 0.95, + local successThresholdText = '95%', + local pendingPeriod = '10m', + local successRateQuery(enableK8sCluster, failed, success) = local sumBy = if enableK8sCluster then "cluster, namespace, job" else "job"; ||| @@ -9,8 +13,8 @@ local alert = import './utils/alert.jsonnet'; / sum by (%s) (rate(%s{}[1m]) + rate(%s{}[1m])) ) - ) < 0.95 - ||| % [sumBy, failed, sumBy, failed, success], + ) < %g + ||| % [sumBy, failed, sumBy, failed, success, successThreshold], newOpenTelemetryAlertsGroup(enableK8sCluster=true): alert.newGroup( @@ -22,19 +26,55 @@ local alert = import './utils/alert.jsonnet'; alert.newRule( 'OtelcolReceiverRefusedSpans', successRateQuery(enableK8sCluster, "otelcol_receiver_refused_spans_total", "otelcol_receiver_accepted_spans_total"), - 'The receiver pushing spans to the pipeline success rate is below 95%.', + 'The receiver pushing spans to the pipeline success rate is below %s.' % successThresholdText, 'The receiver could not push some spans to the pipeline under job {{ $labels.job }}. This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter.', - '10m', + pendingPeriod, + ), + + // Metrics receiver alerts + alert.newRule( + 'OtelcolReceiverRefusedMetrics', + successRateQuery(enableK8sCluster, "otelcol_receiver_refused_metric_points_total", "otelcol_receiver_accepted_metric_points_total"), + 'The receiver pushing metrics to the pipeline success rate is below %s.' % successThresholdText, + 'The receiver could not push some metric points to the pipeline under job {{ $labels.job }}. 
This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter.', + pendingPeriod, + ), + + // Logs receiver alerts + alert.newRule( + 'OtelcolReceiverRefusedLogs', + successRateQuery(enableK8sCluster, "otelcol_receiver_refused_log_records_total", "otelcol_receiver_accepted_log_records_total"), + 'The receiver pushing logs to the pipeline success rate is below %s.' % successThresholdText, + 'The receiver could not push some log records to the pipeline under job {{ $labels.job }}. This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter.', + pendingPeriod, ), - // The exporter success rate is below 95%. + // The exporter success rate is below threshold. // There could be an issue with the payload or with the destination endpoint. alert.newRule( 'OtelcolExporterFailedSpans', successRateQuery(enableK8sCluster, "otelcol_exporter_send_failed_spans_total", "otelcol_exporter_sent_spans_total"), - 'The exporter sending spans success rate is below 95%.', + 'The exporter sending spans success rate is below %s.' % successThresholdText, 'The exporter failed to send spans to their destination under job {{ $labels.job }}. There could be an issue with the payload or with the destination endpoint.', - '10m', + pendingPeriod, + ), + + // Metrics exporter alerts + alert.newRule( + 'OtelcolExporterFailedMetrics', + successRateQuery(enableK8sCluster, "otelcol_exporter_send_failed_metric_points_total", "otelcol_exporter_sent_metric_points_total"), + 'The exporter sending metrics success rate is below %s.' % successThresholdText, + 'The exporter failed to send metric points to their destination under job {{ $labels.job }}. There could be an issue with the payload or with the destination endpoint.', + pendingPeriod, + ), + + // Logs exporter alerts + alert.newRule( + 'OtelcolExporterFailedLogs', + successRateQuery(enableK8sCluster, "otelcol_exporter_send_failed_log_records_total", "otelcol_exporter_sent_log_records_total"), + 'The exporter sending logs success rate is below %s.' % successThresholdText, + 'The exporter failed to send log records to their destination under job {{ $labels.job }}. 
There could be an issue with the payload or with the destination endpoint.', + pendingPeriod, ), ] ) diff --git a/operations/alloy-mixin/dashboards/opentelemetry.libsonnet b/operations/alloy-mixin/dashboards/opentelemetry.libsonnet index 3cf6eda2d6..97c5651302 100644 --- a/operations/alloy-mixin/dashboards/opentelemetry.libsonnet +++ b/operations/alloy-mixin/dashboards/opentelemetry.libsonnet @@ -23,16 +23,80 @@ local stackedPanelMixin = { includeInstance=true, setenceCaseLabels=$._config.useSetenceCaseTemplateLabels), + local panelPosition3Across(row, col) = panel.withPosition({x: col*8, y: row*10, w: 8, h: 10}), + local panelPosition4Across(row, col) = panel.withPosition({x: col*6, y: row*10, w: 6, h: 10}), + local rowPosition(row) = panel.withPosition({h: 1, w: 24, x: 0, y: row*10}), + [filename]: dashboard.new(name='Alloy / OpenTelemetry', tag=$._config.dashboardTag) + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + dashboard.withTemplateVariablesMixin(templateVariables) + dashboard.withPanelsMixin([ - // "Receivers for traces" row + // First row - Metrics and Logs ( - panel.new('Receivers for traces [otelcol.receiver]', 'row') + - panel.withPosition({ h: 1, w: 24, x: 0, y: 0 }) + panel.new('Receivers [otelcol.receiver.*]', 'row') + + rowPosition(0) + ), + ( + panel.new(title='Accepted metric points', type='timeseries') + + panel.withDescription(||| + Number of metric points successfully pushed into the pipeline. + |||) + + stackedPanelMixin + + panelPosition4Across(row=0, col=0) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_receiver_accepted_metric_points_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) + ), + ( + panel.new(title='Refused metric points', type='timeseries') + + panel.withDescription(||| + Number of metric points that could not be pushed into the pipeline. + |||) + + stackedPanelMixin + + panelPosition4Across(row=0, col=1) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_receiver_refused_metric_points_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) + ), + ( + panel.new(title='Accepted logs', type='timeseries') + + panel.withDescription(||| + Number of log records successfully pushed into the pipeline. + |||) + + stackedPanelMixin + + panelPosition4Across(row=0, col=2) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_receiver_accepted_log_records_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) + ), + ( + panel.new(title='Refused logs', type='timeseries') + + panel.withDescription(||| + Number of log records that could not be pushed into the pipeline. + |||) + + stackedPanelMixin + + panelPosition4Across(row=0, col=3) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_receiver_refused_log_records_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) ), ( panel.new(title='Accepted spans', type='timeseries') + @@ -40,44 +104,56 @@ local stackedPanelMixin = { Number of spans successfully pushed into the pipeline. 
|||) + stackedPanelMixin + - panel.withPosition({ x: 0, y: 0, w: 8, h: 10 }) + + panelPosition4Across(row=1, col=0) + panel.withQueries([ panel.newQuery( expr= ||| - rate(otelcol_receiver_accepted_spans_total{%(instanceSelector)s}[$__rate_interval]) + sum by(instance) (rate(otelcol_receiver_accepted_spans_total{%(instanceSelector)s}[$__rate_interval])) ||| % $._config, - //TODO: How will the dashboard look if there is more than one receiver component? The legend is not unique enough? - legendFormat='{{ pod }} / {{ transport }}', ), ]) ), ( panel.new(title='Refused spans', type='timeseries') + - stackedPanelMixin + panel.withDescription(||| Number of spans that could not be pushed into the pipeline. |||) + stackedPanelMixin + - panel.withPosition({ x: 8, y: 0, w: 8, h: 10 }) + + panelPosition4Across(row=1, col=1) + panel.withQueries([ panel.newQuery( expr= ||| - rate(otelcol_receiver_refused_spans_total{%(instanceSelector)s}[$__rate_interval]) + sum by(instance) (rate(otelcol_receiver_refused_spans_total{%(instanceSelector)s}[$__rate_interval])) ||| % $._config, - legendFormat='{{ pod }} / {{ transport }}', ), ]) ), ( panel.newHeatmap('RPC server duration', 'ms') + panel.withDescription(||| - The duration of inbound RPCs. + The duration of inbound RPCs for otelcol.receiver.* components. + |||) + + panelPosition4Across(row=1, col=2) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by (le) (increase(rpc_server_duration_milliseconds_bucket{%(instanceSelector)s, component_id=~"otelcol.receiver.*"}[$__rate_interval])) + ||| % $._config, + format='heatmap', + legendFormat='{{le}}', + ), + ]) + ), + ( + panel.newHeatmap('HTTP server duration', 'ms') + + panel.withDescription(||| + The duration of inbound HTTP requests for otelcol.receiver.* components. 
|||) + - panel.withPosition({ x: 16, y: 0, w: 8, h: 10 }) + + panelPosition4Across(row=1, col=3) + panel.withQueries([ panel.newQuery( expr= ||| - sum by (le) (increase(rpc_server_duration_milliseconds_bucket{%(instanceSelector)s, rpc_service="opentelemetry.proto.collector.trace.v1.TraceService"}[$__rate_interval])) + sum by (le) (increase(http_server_duration_milliseconds_bucket{%(instanceSelector)s, component_id=~"otelcol.receiver.*"}[$__rate_interval])) ||| % $._config, format='heatmap', legendFormat='{{le}}', @@ -87,8 +163,8 @@ local stackedPanelMixin = { // "Batching" row ( - panel.new('Batching of logs, metrics, and traces [otelcol.processor.batch]', 'row') + - panel.withPosition({ h: 1, w: 24, x: 0, y: 10 }) + panel.new('Batching [otelcol.processor.batch]', 'row') + + rowPosition(2) ), ( panel.newHeatmap('Number of units in the batch', 'short') + @@ -96,7 +172,7 @@ local stackedPanelMixin = { panel.withDescription(||| Number of spans, metric datapoints, or log lines in a batch |||) + - panel.withPosition({ x: 0, y: 10, w: 8, h: 10 }) + + panelPosition3Across(row=2, col=0) + panel.withQueries([ panel.newQuery( expr= ||| @@ -115,13 +191,12 @@ local stackedPanelMixin = { panel.withDescription(||| Number of distinct metadata value combinations being processed |||) + - panel.withPosition({ x: 8, y: 10, w: 8, h: 10 }) + + panelPosition3Across(row=2, col=1) + panel.withQueries([ panel.newQuery( expr= ||| - otelcol_processor_batch_metadata_cardinality{%(instanceSelector)s} + sum by(instance) (otelcol_processor_batch_metadata_cardinality{%(instanceSelector)s}) ||| % $._config, - legendFormat='{{ pod }}', ), ]) ), @@ -130,54 +205,142 @@ local stackedPanelMixin = { panel.withDescription(||| Number of times the batch was sent due to a timeout trigger |||) + - panel.withPosition({ x: 16, y: 10, w: 8, h: 10 }) + + panelPosition3Across(row=2, col=2) + panel.withQueries([ panel.newQuery( expr= ||| - rate(otelcol_processor_batch_timeout_trigger_send_total{%(instanceSelector)s}[$__rate_interval]) + sum by(instance) (rate(otelcol_processor_batch_timeout_trigger_send_total{%(instanceSelector)s}[$__rate_interval])) ||| % $._config, - legendFormat='{{ pod }}', ), ]) ), - // "Exporters for traces" row + // "Exporters" row ( - panel.new('Exporters for traces [otelcol.exporter]', 'row') + - panel.withPosition({ h: 1, w: 24, x: 0, y: 20 }) + panel.new('Exporters [otelcol.exporter.*]', 'row') + + rowPosition(3) ), ( - panel.new(title='Exported sent spans', type='timeseries') + + panel.new(title='Exported metric points', type='timeseries') + panel.withDescription(||| - Number of spans successfully sent to destination. + Number of metric points successfully sent to destination. |||) + stackedPanelMixin + - panel.withPosition({ x: 0, y: 20, w: 8, h: 10 }) + + panelPosition4Across(row=3, col=0) + panel.withQueries([ panel.newQuery( expr= ||| - rate(otelcol_exporter_sent_spans_total{%(instanceSelector)s}[$__rate_interval]) + sum by(instance) (rate(otelcol_exporter_sent_metric_points_total{%(instanceSelector)s}[$__rate_interval])) ||| % $._config, - legendFormat='{{ pod }}', ), ]) ), ( - panel.new(title='Exported failed spans', type='timeseries') + + panel.new(title='Failed metric points', type='timeseries') + panel.withDescription(||| - Number of spans in failed attempts to send to destination. + Number of metric points that failed to be sent to destination. 
|||) + stackedPanelMixin + - panel.withPosition({ x: 8, y: 20, w: 8, h: 10 }) + + panelPosition4Across(row=3, col=1) + panel.withQueries([ panel.newQuery( expr= ||| - rate(otelcol_exporter_send_failed_spans_total{%(instanceSelector)s}[$__rate_interval]) + sum by(instance) (rate(otelcol_exporter_send_failed_metric_points_total{%(instanceSelector)s}[$__rate_interval])) ||| % $._config, - legendFormat='{{ pod }}', ), ]) ), - + ( + panel.new(title='Exported logs', type='timeseries') + + panel.withDescription(||| + Number of log records successfully sent to destination. + |||) + + stackedPanelMixin + + panelPosition4Across(row=3, col=2) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_exporter_sent_log_records_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) + ), + ( + panel.new(title='Failed logs', type='timeseries') + + panel.withDescription(||| + Number of log records that failed to be sent to destination. + |||) + + stackedPanelMixin + + panelPosition4Across(row=3, col=3) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_exporter_send_failed_log_records_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) + ), + ( + panel.new(title='Exported spans', type='timeseries') + + panel.withDescription(||| + Number of spans successfully sent to destination. + |||) + + stackedPanelMixin + + panelPosition4Across(row=4, col=0) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_exporter_sent_spans_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) + ), + ( + panel.new(title='Failed spans', type='timeseries') + + panel.withDescription(||| + Number of spans that failed to be sent to destination. + |||) + + stackedPanelMixin + + panelPosition4Across(row=4, col=1) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_exporter_send_failed_spans_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) + ), + ( + panel.newHeatmap('RPC client duration', 'ms') + + panel.withDescription(||| + The duration of outbound RPCs for otelcol.exporter.* components. + |||) + + panelPosition4Across(row=4, col=2) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by (le) (increase(rpc_client_duration_milliseconds_bucket{%(instanceSelector)s, component_id=~"otelcol.exporter.*"}[$__rate_interval])) + ||| % $._config, + format='heatmap', + legendFormat='{{le}}', + ), + ]) + ), + ( + panel.newHeatmap('HTTP client duration', 'ms') + + panel.withDescription(||| + The duration of outbound HTTP requests for otelcol.exporter.* components. + |||) + + panelPosition4Across(row=4, col=3) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by (le) (increase(http_client_duration_milliseconds_bucket{%(instanceSelector)s, component_id=~"otelcol.exporter.*"}[$__rate_interval])) + ||| % $._config, + format='heatmap', + legendFormat='{{le}}', + ), + ]) + ), ]), } diff --git a/operations/alloy-mixin/grizzly.jsonnet b/operations/alloy-mixin/grizzly.jsonnet index d71bedfd58..c98d518cdb 100644 --- a/operations/alloy-mixin/grizzly.jsonnet +++ b/operations/alloy-mixin/grizzly.jsonnet @@ -7,11 +7,10 @@ // mixin and continually deploy all dashboards. // -(import './grizzly/dashboards.jsonnet') +(import './grizzly/dashboards.jsonnet') + -// By default, only dashboards get deployed; not alerts or recording rules. 
-// To deploy alerts and recording rules, set up the environment variables used
-// by cortextool to authenticate with a Prometheus or Alertmanager intance and
-// uncomment the line below.
-
-//+ (import './grizzly/alerts.jsonnet')
+// By default, alerts are also deployed. This should work out-of-the-box when
+// using the example docker-compose environment. If you are using grizzly with
+// a different environment, set up the environment variables as documented in
+// https://grafana.github.io/grizzly/configuration/ or comment out the line below.
++ (import './grizzly/alerts.jsonnet')
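
For reference, successRateQuery renders the same success-rate expression for
every signal; only the failed/accepted metric pair and the "sum by" labels
change. As an illustrative sketch only, assuming enableK8sCluster=true and that
the numerator above this hunk follows the 1 - failed / (failed + accepted)
shape implied by the format arguments, the new OtelcolReceiverRefusedMetrics
rule would render to roughly:

    (1 - (
        sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_metric_points_total{}[1m]))
        /
        sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_metric_points_total{}[1m]) + rate(otelcol_receiver_accepted_metric_points_total{}[1m]))
      )
    ) < 0.95

That is, the alert fires (after the 10m pending period) when fewer than 95% of
received metric points make it into the pipeline, mirroring the existing
OtelcolReceiverRefusedSpans rule.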
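
Similarly, the new dashboard layout helpers just compute Grafana grid
coordinates: the dashboard grid is 24 units wide, so panelPosition4Across
places four 6-unit-wide panels per row and panelPosition3Across places three
8-unit-wide panels per row, with every row 10 units tall. A few values derived
from the helper definitions above:

    panelPosition4Across(row=1, col=2)  // panel.withPosition({ x: 12, y: 10, w: 6, h: 10 })
    panelPosition3Across(row=2, col=1)  // panel.withPosition({ x: 8,  y: 20, w: 8, h: 10 })
    rowPosition(3)                      // panel.withPosition({ h: 1, w: 24, x: 0, y: 30 })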