mixin: otel metrics and logs support (#2832)
* mixin: otel metrics and logs support

* otel dashboard tweaks

* otel dashboards

* otel dashboards

* otel dashboards

# Conflicts:
#	CHANGELOG.md

* otel dashboards
thampiotr authored Feb 26, 2025
1 parent 4f02c75 commit b7acf3f
Showing 5 changed files with 257 additions and 51 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -22,6 +22,8 @@ Main (unreleased)
 
 - Added additional backwards compatibility metrics to `prometheus.write.queue`. (@mattdurham)
 
+- Added OpenTelemetry logs and metrics support to Alloy mixin's dashboards and alerts. (@thampiotr)
+
 v1.7.1
 -----------------
 
2 changes: 2 additions & 0 deletions example/grafana.yaml
@@ -54,6 +54,8 @@ services:
 condition: service_completed_successfully
 environment:
 - GRAFANA_URL=http://grafana:3000
+- MIMIR_ADDRESS=http://mimir:9009
+- MIMIR_TENANT_ID=fake
 volumes:
 - ../operations/alloy-mixin:/etc/alloy-mixin
 working_dir: /etc/alloy-mixin
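The two new environment variables match the ones that mimirtool reads for its connection settings, which suggests this service loads the mixin's rules and alerts into the local Mimir instance. A minimal sketch of the kind of command such a setup typically runs, assuming mimirtool is available and the mixin has been rendered to a rules file (the file name here is illustrative):

    # mimirtool picks up MIMIR_ADDRESS and MIMIR_TENANT_ID from the environment,
    # so no --address / --id flags are needed.
    mimirtool rules load ./rendered/alerts.yaml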
54 changes: 47 additions & 7 deletions operations/alloy-mixin/alerts/opentelemetry.libsonnet
@@ -1,6 +1,10 @@
 local alert = import './utils/alert.jsonnet';
 
 {
+local successThreshold = 0.95,
+local successThresholdText = '95%',
+local pendingPeriod = '10m',
+
 local successRateQuery(enableK8sCluster, failed, success) =
 local sumBy = if enableK8sCluster then "cluster, namespace, job" else "job";
 |||
@@ -9,8 +13,8 @@ local alert = import './utils/alert.jsonnet';
 /
 sum by (%s) (rate(%s{}[1m]) + rate(%s{}[1m]))
 )
-) < 0.95
-||| % [sumBy, failed, sumBy, failed, success],
+) < %g
+||| % [sumBy, failed, sumBy, failed, success, successThreshold],
 
 newOpenTelemetryAlertsGroup(enableK8sCluster=true):
 alert.newGroup(
@@ -22,19 +26,55 @@ local alert = import './utils/alert.jsonnet';
 alert.newRule(
 'OtelcolReceiverRefusedSpans',
 successRateQuery(enableK8sCluster, "otelcol_receiver_refused_spans_total", "otelcol_receiver_accepted_spans_total"),
-'The receiver pushing spans to the pipeline success rate is below 95%.',
+'The receiver pushing spans to the pipeline success rate is below %s.' % successThresholdText,
 'The receiver could not push some spans to the pipeline under job {{ $labels.job }}. This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter.',
-'10m',
+pendingPeriod,
 ),
 
+// Metrics receiver alerts
+alert.newRule(
+'OtelcolReceiverRefusedMetrics',
+successRateQuery(enableK8sCluster, "otelcol_receiver_refused_metric_points_total", "otelcol_receiver_accepted_metric_points_total"),
+'The receiver pushing metrics to the pipeline success rate is below %s.' % successThresholdText,
+'The receiver could not push some metric points to the pipeline under job {{ $labels.job }}. This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter.',
+pendingPeriod,
+),
+
+// Logs receiver alerts
+alert.newRule(
+'OtelcolReceiverRefusedLogs',
+successRateQuery(enableK8sCluster, "otelcol_receiver_refused_log_records_total", "otelcol_receiver_accepted_log_records_total"),
+'The receiver pushing logs to the pipeline success rate is below %s.' % successThresholdText,
+'The receiver could not push some log records to the pipeline under job {{ $labels.job }}. This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter.',
+pendingPeriod,
+),
+
-// The exporter success rate is below 95%.
+// The exporter success rate is below threshold.
 // There could be an issue with the payload or with the destination endpoint.
 alert.newRule(
 'OtelcolExporterFailedSpans',
 successRateQuery(enableK8sCluster, "otelcol_exporter_send_failed_spans_total", "otelcol_exporter_sent_spans_total"),
-'The exporter sending spans success rate is below 95%.',
+'The exporter sending spans success rate is below %s.' % successThresholdText,
 'The exporter failed to send spans to their destination under job {{ $labels.job }}. There could be an issue with the payload or with the destination endpoint.',
-'10m',
+pendingPeriod,
 ),
 
+// Metrics exporter alerts
+alert.newRule(
+'OtelcolExporterFailedMetrics',
+successRateQuery(enableK8sCluster, "otelcol_exporter_send_failed_metric_points_total", "otelcol_exporter_sent_metric_points_total"),
+'The exporter sending metrics success rate is below %s.' % successThresholdText,
+'The exporter failed to send metric points to their destination under job {{ $labels.job }}. There could be an issue with the payload or with the destination endpoint.',
+pendingPeriod,
+),
+
+// Logs exporter alerts
+alert.newRule(
+'OtelcolExporterFailedLogs',
+successRateQuery(enableK8sCluster, "otelcol_exporter_send_failed_log_records_total", "otelcol_exporter_sent_log_records_total"),
+'The exporter sending logs success rate is below %s.' % successThresholdText,
+'The exporter failed to send log records to their destination under job {{ $labels.job }}. There could be an issue with the payload or with the destination endpoint.',
+pendingPeriod,
+),
 ]
 )
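Note: the leading lines of the successRateQuery template are collapsed in this view, so the full expression is not shown. Judging from the visible lines and the format arguments [sumBy, failed, sumBy, failed, success, successThreshold], a rendered rule such as OtelcolExporterFailedMetrics with enableK8sCluster=true should expand to roughly the PromQL below; the opening `1 - (` part is an assumption about the collapsed portion of the template:

    (1 - (
        sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_metric_points_total{}[1m]))
      /
        sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_metric_points_total{}[1m]) + rate(otelcol_exporter_sent_metric_points_total{}[1m]))
      )
    ) < 0.95

In words: the alert fires when the success rate (accepted or sent items as a share of the total) stays below the 95% threshold for the full 10-minute pending period.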