From b7acf3f5ded9fbb5ada321c19b24be5bca804f2c Mon Sep 17 00:00:00 2001 From: Piotr <17101802+thampiotr@users.noreply.github.com> Date: Wed, 26 Feb 2025 17:54:20 +0000 Subject: [PATCH] mixin: otel metrics and logs support (#2832) * mixin: otel metrics and logs support * otel dashbaord tweaks * otel dashbaords * otel dashbaords * otel dashbaords # Conflicts: # CHANGELOG.md # Conflicts: # CHANGELOG.md * otel dashbaords --- CHANGELOG.md | 2 + example/grafana.yaml | 2 + .../alerts/opentelemetry.libsonnet | 54 +++- .../dashboards/opentelemetry.libsonnet | 237 +++++++++++++++--- operations/alloy-mixin/grizzly.jsonnet | 13 +- 5 files changed, 257 insertions(+), 51 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d44f61be3e..dabff1f792 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ Main (unreleased) - Added additional backwards compatibility metrics to `prometheus.write.queue`. (@mattdurham) +- Added OpenTelemetry logs and metrics support to Alloy mixin's dashboards and alerts. (@thampiotr) + v1.7.1 ----------------- diff --git a/example/grafana.yaml b/example/grafana.yaml index 9d2f7411ee..b63c169bc1 100644 --- a/example/grafana.yaml +++ b/example/grafana.yaml @@ -54,6 +54,8 @@ services: condition: service_completed_successfully environment: - GRAFANA_URL=http://grafana:3000 + - MIMIR_ADDRESS=http://mimir:9009 + - MIMIR_TENANT_ID=fake volumes: - ../operations/alloy-mixin:/etc/alloy-mixin working_dir: /etc/alloy-mixin diff --git a/operations/alloy-mixin/alerts/opentelemetry.libsonnet b/operations/alloy-mixin/alerts/opentelemetry.libsonnet index 48338477ce..63dbb98316 100644 --- a/operations/alloy-mixin/alerts/opentelemetry.libsonnet +++ b/operations/alloy-mixin/alerts/opentelemetry.libsonnet @@ -1,6 +1,10 @@ local alert = import './utils/alert.jsonnet'; { + local successThreshold = 0.95, + local successThresholdText = '95%', + local pendingPeriod = '10m', + local successRateQuery(enableK8sCluster, failed, success) = local sumBy = if enableK8sCluster then "cluster, namespace, job" else "job"; ||| @@ -9,8 +13,8 @@ local alert = import './utils/alert.jsonnet'; / sum by (%s) (rate(%s{}[1m]) + rate(%s{}[1m])) ) - ) < 0.95 - ||| % [sumBy, failed, sumBy, failed, success], + ) < %g + ||| % [sumBy, failed, sumBy, failed, success, successThreshold], newOpenTelemetryAlertsGroup(enableK8sCluster=true): alert.newGroup( @@ -22,19 +26,55 @@ local alert = import './utils/alert.jsonnet'; alert.newRule( 'OtelcolReceiverRefusedSpans', successRateQuery(enableK8sCluster, "otelcol_receiver_refused_spans_total", "otelcol_receiver_accepted_spans_total"), - 'The receiver pushing spans to the pipeline success rate is below 95%.', + 'The receiver pushing spans to the pipeline success rate is below %s.' % successThresholdText, 'The receiver could not push some spans to the pipeline under job {{ $labels.job }}. This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter.', - '10m', + pendingPeriod, + ), + + // Metrics receiver alerts + alert.newRule( + 'OtelcolReceiverRefusedMetrics', + successRateQuery(enableK8sCluster, "otelcol_receiver_refused_metric_points_total", "otelcol_receiver_accepted_metric_points_total"), + 'The receiver pushing metrics to the pipeline success rate is below %s.' % successThresholdText, + 'The receiver could not push some metric points to the pipeline under job {{ $labels.job }}. 
This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter.', + pendingPeriod, + ), + + // Logs receiver alerts + alert.newRule( + 'OtelcolReceiverRefusedLogs', + successRateQuery(enableK8sCluster, "otelcol_receiver_refused_log_records_total", "otelcol_receiver_accepted_log_records_total"), + 'The receiver pushing logs to the pipeline success rate is below %s.' % successThresholdText, + 'The receiver could not push some log records to the pipeline under job {{ $labels.job }}. This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter.', + pendingPeriod, ), - // The exporter success rate is below 95%. + // The exporter success rate is below threshold. // There could be an issue with the payload or with the destination endpoint. alert.newRule( 'OtelcolExporterFailedSpans', successRateQuery(enableK8sCluster, "otelcol_exporter_send_failed_spans_total", "otelcol_exporter_sent_spans_total"), - 'The exporter sending spans success rate is below 95%.', + 'The exporter sending spans success rate is below %s.' % successThresholdText, 'The exporter failed to send spans to their destination under job {{ $labels.job }}. There could be an issue with the payload or with the destination endpoint.', - '10m', + pendingPeriod, + ), + + // Metrics exporter alerts + alert.newRule( + 'OtelcolExporterFailedMetrics', + successRateQuery(enableK8sCluster, "otelcol_exporter_send_failed_metric_points_total", "otelcol_exporter_sent_metric_points_total"), + 'The exporter sending metrics success rate is below %s.' % successThresholdText, + 'The exporter failed to send metric points to their destination under job {{ $labels.job }}. There could be an issue with the payload or with the destination endpoint.', + pendingPeriod, + ), + + // Logs exporter alerts + alert.newRule( + 'OtelcolExporterFailedLogs', + successRateQuery(enableK8sCluster, "otelcol_exporter_send_failed_log_records_total", "otelcol_exporter_sent_log_records_total"), + 'The exporter sending logs success rate is below %s.' % successThresholdText, + 'The exporter failed to send log records to their destination under job {{ $labels.job }}. 
There could be an issue with the payload or with the destination endpoint.', + pendingPeriod, ), ] ) diff --git a/operations/alloy-mixin/dashboards/opentelemetry.libsonnet b/operations/alloy-mixin/dashboards/opentelemetry.libsonnet index 3cf6eda2d6..97c5651302 100644 --- a/operations/alloy-mixin/dashboards/opentelemetry.libsonnet +++ b/operations/alloy-mixin/dashboards/opentelemetry.libsonnet @@ -23,16 +23,80 @@ local stackedPanelMixin = { includeInstance=true, setenceCaseLabels=$._config.useSetenceCaseTemplateLabels), + local panelPosition3Across(row, col) = panel.withPosition({x: col*8, y: row*10, w: 8, h: 10}), + local panelPosition4Across(row, col) = panel.withPosition({x: col*6, y: row*10, w: 6, h: 10}), + local rowPosition(row) = panel.withPosition({h: 1, w: 24, x: 0, y: row*10}), + [filename]: dashboard.new(name='Alloy / OpenTelemetry', tag=$._config.dashboardTag) + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + dashboard.withTemplateVariablesMixin(templateVariables) + dashboard.withPanelsMixin([ - // "Receivers for traces" row + // First row - Metrics and Logs ( - panel.new('Receivers for traces [otelcol.receiver]', 'row') + - panel.withPosition({ h: 1, w: 24, x: 0, y: 0 }) + panel.new('Receivers [otelcol.receiver.*]', 'row') + + rowPosition(0) + ), + ( + panel.new(title='Accepted metric points', type='timeseries') + + panel.withDescription(||| + Number of metric points successfully pushed into the pipeline. + |||) + + stackedPanelMixin + + panelPosition4Across(row=0, col=0) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_receiver_accepted_metric_points_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) + ), + ( + panel.new(title='Refused metric points', type='timeseries') + + panel.withDescription(||| + Number of metric points that could not be pushed into the pipeline. + |||) + + stackedPanelMixin + + panelPosition4Across(row=0, col=1) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_receiver_refused_metric_points_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) + ), + ( + panel.new(title='Accepted logs', type='timeseries') + + panel.withDescription(||| + Number of log records successfully pushed into the pipeline. + |||) + + stackedPanelMixin + + panelPosition4Across(row=0, col=2) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_receiver_accepted_log_records_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) + ), + ( + panel.new(title='Refused logs', type='timeseries') + + panel.withDescription(||| + Number of log records that could not be pushed into the pipeline. + |||) + + stackedPanelMixin + + panelPosition4Across(row=0, col=3) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_receiver_refused_log_records_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) ), ( panel.new(title='Accepted spans', type='timeseries') + @@ -40,44 +104,56 @@ local stackedPanelMixin = { Number of spans successfully pushed into the pipeline. 
|||) + stackedPanelMixin + - panel.withPosition({ x: 0, y: 0, w: 8, h: 10 }) + + panelPosition4Across(row=1, col=0) + panel.withQueries([ panel.newQuery( expr= ||| - rate(otelcol_receiver_accepted_spans_total{%(instanceSelector)s}[$__rate_interval]) + sum by(instance) (rate(otelcol_receiver_accepted_spans_total{%(instanceSelector)s}[$__rate_interval])) ||| % $._config, - //TODO: How will the dashboard look if there is more than one receiver component? The legend is not unique enough? - legendFormat='{{ pod }} / {{ transport }}', ), ]) ), ( panel.new(title='Refused spans', type='timeseries') + - stackedPanelMixin + panel.withDescription(||| Number of spans that could not be pushed into the pipeline. |||) + stackedPanelMixin + - panel.withPosition({ x: 8, y: 0, w: 8, h: 10 }) + + panelPosition4Across(row=1, col=1) + panel.withQueries([ panel.newQuery( expr= ||| - rate(otelcol_receiver_refused_spans_total{%(instanceSelector)s}[$__rate_interval]) + sum by(instance) (rate(otelcol_receiver_refused_spans_total{%(instanceSelector)s}[$__rate_interval])) ||| % $._config, - legendFormat='{{ pod }} / {{ transport }}', ), ]) ), ( panel.newHeatmap('RPC server duration', 'ms') + panel.withDescription(||| - The duration of inbound RPCs. + The duration of inbound RPCs for otelcol.receiver.* components. + |||) + + panelPosition4Across(row=1, col=2) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by (le) (increase(rpc_server_duration_milliseconds_bucket{%(instanceSelector)s, component_id=~"otelcol.receiver.*"}[$__rate_interval])) + ||| % $._config, + format='heatmap', + legendFormat='{{le}}', + ), + ]) + ), + ( + panel.newHeatmap('HTTP server duration', 'ms') + + panel.withDescription(||| + The duration of inbound HTTP requests for otelcol.receiver.* components. 
|||) + - panel.withPosition({ x: 16, y: 0, w: 8, h: 10 }) + + panelPosition4Across(row=1, col=3) + panel.withQueries([ panel.newQuery( expr= ||| - sum by (le) (increase(rpc_server_duration_milliseconds_bucket{%(instanceSelector)s, rpc_service="opentelemetry.proto.collector.trace.v1.TraceService"}[$__rate_interval])) + sum by (le) (increase(http_server_duration_milliseconds_bucket{%(instanceSelector)s, component_id=~"otelcol.receiver.*"}[$__rate_interval])) ||| % $._config, format='heatmap', legendFormat='{{le}}', @@ -87,8 +163,8 @@ local stackedPanelMixin = { // "Batching" row ( - panel.new('Batching of logs, metrics, and traces [otelcol.processor.batch]', 'row') + - panel.withPosition({ h: 1, w: 24, x: 0, y: 10 }) + panel.new('Batching [otelcol.processor.batch]', 'row') + + rowPosition(2) ), ( panel.newHeatmap('Number of units in the batch', 'short') + @@ -96,7 +172,7 @@ local stackedPanelMixin = { panel.withDescription(||| Number of spans, metric datapoints, or log lines in a batch |||) + - panel.withPosition({ x: 0, y: 10, w: 8, h: 10 }) + + panelPosition3Across(row=2, col=0) + panel.withQueries([ panel.newQuery( expr= ||| @@ -115,13 +191,12 @@ local stackedPanelMixin = { panel.withDescription(||| Number of distinct metadata value combinations being processed |||) + - panel.withPosition({ x: 8, y: 10, w: 8, h: 10 }) + + panelPosition3Across(row=2, col=1) + panel.withQueries([ panel.newQuery( expr= ||| - otelcol_processor_batch_metadata_cardinality{%(instanceSelector)s} + sum by(instance) (otelcol_processor_batch_metadata_cardinality{%(instanceSelector)s}) ||| % $._config, - legendFormat='{{ pod }}', ), ]) ), @@ -130,54 +205,142 @@ local stackedPanelMixin = { panel.withDescription(||| Number of times the batch was sent due to a timeout trigger |||) + - panel.withPosition({ x: 16, y: 10, w: 8, h: 10 }) + + panelPosition3Across(row=2, col=2) + panel.withQueries([ panel.newQuery( expr= ||| - rate(otelcol_processor_batch_timeout_trigger_send_total{%(instanceSelector)s}[$__rate_interval]) + sum by(instance) (rate(otelcol_processor_batch_timeout_trigger_send_total{%(instanceSelector)s}[$__rate_interval])) ||| % $._config, - legendFormat='{{ pod }}', ), ]) ), - // "Exporters for traces" row + // "Exporters" row ( - panel.new('Exporters for traces [otelcol.exporter]', 'row') + - panel.withPosition({ h: 1, w: 24, x: 0, y: 20 }) + panel.new('Exporters [otelcol.exporter.*]', 'row') + + rowPosition(3) ), ( - panel.new(title='Exported sent spans', type='timeseries') + + panel.new(title='Exported metric points', type='timeseries') + panel.withDescription(||| - Number of spans successfully sent to destination. + Number of metric points successfully sent to destination. |||) + stackedPanelMixin + - panel.withPosition({ x: 0, y: 20, w: 8, h: 10 }) + + panelPosition4Across(row=3, col=0) + panel.withQueries([ panel.newQuery( expr= ||| - rate(otelcol_exporter_sent_spans_total{%(instanceSelector)s}[$__rate_interval]) + sum by(instance) (rate(otelcol_exporter_sent_metric_points_total{%(instanceSelector)s}[$__rate_interval])) ||| % $._config, - legendFormat='{{ pod }}', ), ]) ), ( - panel.new(title='Exported failed spans', type='timeseries') + + panel.new(title='Failed metric points', type='timeseries') + panel.withDescription(||| - Number of spans in failed attempts to send to destination. + Number of metric points that failed to be sent to destination. 
|||) + stackedPanelMixin + - panel.withPosition({ x: 8, y: 20, w: 8, h: 10 }) + + panelPosition4Across(row=3, col=1) + panel.withQueries([ panel.newQuery( expr= ||| - rate(otelcol_exporter_send_failed_spans_total{%(instanceSelector)s}[$__rate_interval]) + sum by(instance) (rate(otelcol_exporter_send_failed_metric_points_total{%(instanceSelector)s}[$__rate_interval])) ||| % $._config, - legendFormat='{{ pod }}', ), ]) ), - + ( + panel.new(title='Exported logs', type='timeseries') + + panel.withDescription(||| + Number of log records successfully sent to destination. + |||) + + stackedPanelMixin + + panelPosition4Across(row=3, col=2) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_exporter_sent_log_records_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) + ), + ( + panel.new(title='Failed logs', type='timeseries') + + panel.withDescription(||| + Number of log records that failed to be sent to destination. + |||) + + stackedPanelMixin + + panelPosition4Across(row=3, col=3) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_exporter_send_failed_log_records_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) + ), + ( + panel.new(title='Exported spans', type='timeseries') + + panel.withDescription(||| + Number of spans successfully sent to destination. + |||) + + stackedPanelMixin + + panelPosition4Across(row=4, col=0) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_exporter_sent_spans_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) + ), + ( + panel.new(title='Failed spans', type='timeseries') + + panel.withDescription(||| + Number of spans that failed to be sent to destination. + |||) + + stackedPanelMixin + + panelPosition4Across(row=4, col=1) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by(instance) (rate(otelcol_exporter_send_failed_spans_total{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, + ), + ]) + ), + ( + panel.newHeatmap('RPC client duration', 'ms') + + panel.withDescription(||| + The duration of outbound RPCs for otelcol.exporter.* components. + |||) + + panelPosition4Across(row=4, col=2) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by (le) (increase(rpc_client_duration_milliseconds_bucket{%(instanceSelector)s, component_id=~"otelcol.exporter.*"}[$__rate_interval])) + ||| % $._config, + format='heatmap', + legendFormat='{{le}}', + ), + ]) + ), + ( + panel.newHeatmap('HTTP client duration', 'ms') + + panel.withDescription(||| + The duration of outbound HTTP requests for otelcol.exporter.* components. + |||) + + panelPosition4Across(row=4, col=3) + + panel.withQueries([ + panel.newQuery( + expr= ||| + sum by (le) (increase(http_client_duration_milliseconds_bucket{%(instanceSelector)s, component_id=~"otelcol.exporter.*"}[$__rate_interval])) + ||| % $._config, + format='heatmap', + legendFormat='{{le}}', + ), + ]) + ), ]), } diff --git a/operations/alloy-mixin/grizzly.jsonnet b/operations/alloy-mixin/grizzly.jsonnet index d71bedfd58..c98d518cdb 100644 --- a/operations/alloy-mixin/grizzly.jsonnet +++ b/operations/alloy-mixin/grizzly.jsonnet @@ -7,11 +7,10 @@ // mixin and continually deploy all dashboards. // -(import './grizzly/dashboards.jsonnet') +(import './grizzly/dashboards.jsonnet') + -// By default, only dashboards get deployed; not alerts or recording rules. 
-// To deploy alerts and recording rules, set up the environment variables used
-// by cortextool to authenticate with a Prometheus or Alertmanager intance and
-// uncomment the line below.
-
-//+ (import './grizzly/alerts.jsonnet')
+// By default, alerts are also deployed. This should work out-of-the-box when
+// using the example docker-compose environment. If you are using grizzly with
+// a different environment, set up the environment variables as documented in
+// https://grafana.github.io/grizzly/configuration/ or comment out the line below.
++ (import './grizzly/alerts.jsonnet')
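
For reference, successRateQuery renders the same success-rate expression for
every signal; only the failed/accepted metric pair and the "sum by" labels
change. As an illustrative sketch only, assuming enableK8sCluster=true and that
the numerator above this hunk follows the 1 - failed / (failed + accepted)
shape implied by the format arguments, the new OtelcolReceiverRefusedMetrics
rule would render to roughly:

    (1 - (
        sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_metric_points_total{}[1m]))
        /
        sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_metric_points_total{}[1m]) + rate(otelcol_receiver_accepted_metric_points_total{}[1m]))
      )
    ) < 0.95

That is, the alert fires (after the 10m pending period) when fewer than 95% of
received metric points make it into the pipeline, mirroring the existing
OtelcolReceiverRefusedSpans rule.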
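
Similarly, the new dashboard layout helpers just compute Grafana grid
coordinates: the dashboard grid is 24 units wide, so panelPosition4Across
places four 6-unit-wide panels per row and panelPosition3Across places three
8-unit-wide panels per row, with every row 10 units tall. A few values derived
from the helper definitions above:

    panelPosition4Across(row=1, col=2)  // panel.withPosition({ x: 12, y: 10, w: 6, h: 10 })
    panelPosition3Across(row=2, col=1)  // panel.withPosition({ x: 8,  y: 20, w: 8, h: 10 })
    rowPosition(3)                      // panel.withPosition({ h: 1, w: 24, x: 0, y: 30 })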