From 4d1607413f1e11e8bb8f9b4efbea4dc997dfde14 Mon Sep 17 00:00:00 2001 From: Gabriel Antunes Date: Mon, 6 May 2024 19:41:03 -0300 Subject: [PATCH 1/6] add logs dashboard and start using config variables for k8s cluster and alloy cluster --- operations/alloy-mixin/alerts.libsonnet | 9 +- .../alloy-mixin/alloy-cluster-node.json | 502 ++++++++++++++ .../alloy-mixin/alloy-cluster-overview.json | 350 ++++++++++ operations/alloy-mixin/alloy-controller.json | 542 +++++++++++++++ operations/alloy-mixin/alloy-logs.json | 306 ++++++++ .../alloy-mixin/alloy-opentelemetry.json | 420 +++++++++++ .../alloy-prometheus-remote-write.json | 655 ++++++++++++++++++ operations/alloy-mixin/alloy-resources.json | 329 +++++++++ operations/alloy-mixin/config.libsonnet | 12 + operations/alloy-mixin/dashboards.libsonnet | 27 +- .../dashboards/alloy-logs.libsonnet | 31 + .../dashboards/cluster-node.libsonnet | 97 ++- .../dashboards/cluster-overview.libsonnet | 49 +- .../dashboards/controller.libsonnet | 90 ++- .../dashboards/opentelemetry.libsonnet | 53 +- .../dashboards/prometheus.libsonnet | 4 +- .../dashboards/resources.libsonnet | 4 +- .../dashboards/utils/dashboard.jsonnet | 8 +- operations/alloy-mixin/jsonnetfile.json | 30 +- operations/alloy-mixin/jsonnetfile.lock.json | 56 ++ operations/alloy-mixin/mixin.libsonnet | 3 +- 21 files changed, 3406 insertions(+), 171 deletions(-) create mode 100644 operations/alloy-mixin/alloy-cluster-node.json create mode 100644 operations/alloy-mixin/alloy-cluster-overview.json create mode 100644 operations/alloy-mixin/alloy-controller.json create mode 100644 operations/alloy-mixin/alloy-logs.json create mode 100644 operations/alloy-mixin/alloy-opentelemetry.json create mode 100644 operations/alloy-mixin/alloy-prometheus-remote-write.json create mode 100644 operations/alloy-mixin/alloy-resources.json create mode 100644 operations/alloy-mixin/config.libsonnet create mode 100644 operations/alloy-mixin/dashboards/alloy-logs.libsonnet create mode 100644 
operations/alloy-mixin/jsonnetfile.lock.json diff --git a/operations/alloy-mixin/alerts.libsonnet b/operations/alloy-mixin/alerts.libsonnet index d8e247fb13..2b28ede772 100644 --- a/operations/alloy-mixin/alerts.libsonnet +++ b/operations/alloy-mixin/alerts.libsonnet @@ -1,9 +1,12 @@ { prometheusAlerts+: { groups+: [ - (import './alerts/clustering.libsonnet'), - (import './alerts/controller.libsonnet'), - (import './alerts/opentelemetry.libsonnet'), + if $._config.enableK8sCluster then + (import './alerts/clustering.libsonnet') + else + {} + + (import './alerts/controller.libsonnet') + + (import './alerts/opentelemetry.libsonnet') ], }, } diff --git a/operations/alloy-mixin/alloy-cluster-node.json b/operations/alloy-mixin/alloy-cluster-node.json new file mode 100644 index 0000000000..12084f1628 --- /dev/null +++ b/operations/alloy-mixin/alloy-cluster-node.json @@ -0,0 +1,502 @@ +{ + "annotations": { + "list": [ + { + "datasource": "$loki_datasource", + "enable": true, + "expr": "{cluster=\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"alloy\" | name_extracted=~\"alloy.*\"", + "iconColor": "rgba(0, 211, 255, 1)", + "instant": false, + "name": "Deployments", + "titleFormat": "{{cluster}}/{{namespace}}" + } + ] + }, + "graphTooltip": 1, + "links": [ + { + "icon": "doc", + "targetBlank": true, + "title": "Documentation", + "tooltip": "Clustering documentation", + "type": "link", + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + }, + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ + "alloy-mixin" + ], + "targetBlank": false, + "title": "Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "datasource": "${datasource}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "title": "Node Info", + "type": "row" + }, + { + "datasource": "${datasource}", + "description": "Information about a specific cluster node.\n\n* Lamport clock 
time: The observed Lamport time on the specific node's clock used to provide partial ordering around gossip messages. Nodes should ideally be observing roughly the same time, meaning they are up-to-date on the cluster state. If a node is falling behind, it means that it has not recently processed the same number of messages and may have an outdated view of its peers.\n\n* Internal cluster state observers: The number of Observer functions that are registered to run whenever the node detects a cluster change.\n\n* Gossip health score: A health score assigned to this node by the memberlist implementation. The lower, the better.\n\n* Gossip protocol version: The protocol version used by nodes to communicate with one another. It should match across all nodes.\n", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(cluster_node_lamport_time{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"})", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "Lamport clock time" + }, + { + "datasource": "${datasource}", + "expr": "sum(cluster_node_update_observers{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"})", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "Internal cluster state observers" + }, + { + "datasource": "${datasource}", + "expr": "sum(cluster_node_gossip_health_score{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"})", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "Gossip health score" + }, + { + "datasource": "${datasource}", + "expr": "sum(cluster_node_gossip_proto_version{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"})", + "format": "table", + "instant": true, + "legendFormat": "__auto", + 
"range": false, + "refId": "Gossip protocol version" + } + ], + "title": "Node Info", + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "Value #(.*)", + "renamePattern": "$1" + } + }, + { + "id": "reduce", + "options": { } + }, + { + "id": "organize", + "options": { + "excludeByName": { }, + "indexByName": { }, + "renameByName": { + "Field": "Metric", + "Max": "Value" + } + } + } + ], + "type": "table" + }, + { + "datasource": "${datasource}", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "rate(cluster_node_gossip_received_events_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])", + "instant": false, + "legendFormat": "{{event}}", + "range": true + } + ], + "title": "Gossip ops/s", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Known peers to the node (including the local node).\n", + "fieldConfig": { + "defaults": { + "unit": "suffix:peers" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(cluster_node_peers{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"})", + "instant": false, + "legendFormat": "__auto", + "range": true + } + ], + "title": "Known peers", + "type": "stat" + }, + { + "datasource": "${datasource}", + "description": "Known peers to the node by state (including the local node).\n", + "fieldConfig": { + "defaults": { + "unit": "suffix:nodes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "cluster_node_peers{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}", + "instant": false, + "legendFormat": "{{state}}", + "range": true + } + ], + "title": "Peers by state", + "type": "timeseries" + }, + { + "datasource": 
"${datasource}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "title": "Gossip Transport", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "axisCenteredZero": true + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "rate(cluster_transport_rx_bytes_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])", + "instant": false, + "legendFormat": "rx", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "-1 * rate(cluster_transport_tx_bytes_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])", + "instant": false, + "legendFormat": "tx", + "range": true + } + ], + "title": "Transport bandwidth", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "1 - (\n rate(cluster_transport_tx_packets_failed_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval]) /\n rate(cluster_transport_tx_packets_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])\n )", + "instant": false, + "legendFormat": "Tx success %", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "1 - ( \\\n rate(cluster_transport_rx_packets_failed_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval]) / \\\n rate(cluster_transport_rx_packets_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval]) \\\n )", + "instant": false, + "legendFormat": "Rx success %", + 
"range": true + } + ], + "title": "Packet write success rate", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "The number of packets enqueued currently to be decoded or encoded and sent during communication with other nodes.\n\nThe incoming and outgoing packet queue should be as empty as possible; a growing queue means that Alloy cannot keep up with the number of messages required to have all nodes informed of cluster changes, and the nodes may not converge in a timely fashion.\n", + "fieldConfig": { + "defaults": { + "unit": "pkts" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "cluster_transport_tx_packet_queue_length{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}", + "instant": false, + "legendFormat": "tx queue", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "cluster_transport_rx_packet_queue_length{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}", + "instant": false, + "legendFormat": "rx queue", + "range": true + } + ], + "title": "Pending packet queue", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "axisCenteredZero": true + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 26 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "rate(cluster_transport_stream_rx_bytes_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])", + "instant": false, + "legendFormat": "rx", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "-1 * rate(cluster_transport_stream_tx_bytes_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])", + "instant": false, + "legendFormat": "tx", + "range": true + } + ], + "title": 
"Stream bandwidth", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 26 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "1 - (\n rate(cluster_transport_stream_tx_packets_failed_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval]) /\n rate(cluster_transport_stream_tx_packets_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])\n )", + "instant": false, + "legendFormat": "Tx success %", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "1 - (\n rate(cluster_transport_stream_rx_packets_failed_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval]) /\n rate(cluster_transport_stream_rx_packets_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])\n )", + "instant": false, + "legendFormat": "Rx success %", + "range": true + } + ], + "title": "Stream write success rate", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "The number of open connections from this node to its peers.\n\nEach node picks up a subset of its peers to continuously gossip messages around cluster status using streaming HTTP/2 connections. 
This panel can be used to detect networking failures that result in cluster communication being disrupted and convergence taking longer than expected or outright failing.\n", + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 26 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "cluster_transport_streams{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}", + "instant": false, + "legendFormat": "Open streams", + "range": true + } + ], + "title": "Open transport streams", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 36, + "tags": [ + "alloy-mixin" + ], + "templating": { + "list": [ + { + "label": "Data Source", + "name": "datasource", + "query": "prometheus", + "refresh": 1, + "sort": 2, + "type": "datasource" + }, + { + "label": "Loki Data Source", + "name": "loki_datasource", + "query": "loki", + "refresh": 1, + "sort": 2, + "type": "datasource" + }, + { + "datasource": "${datasource}", + "label": "job", + "name": "job", + "query": { + "query": "label_values(alloy_component_controller_running_components, job)", + "refId": "job" + }, + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "datasource": "${datasource}", + "label": "cluster", + "name": "cluster", + "query": { + "query": "label_values(alloy_component_controller_running_components{job=\"$job\"}, cluster)", + "refId": "cluster" + }, + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "datasource": "${datasource}", + "label": "namespace", + "name": "namespace", + "query": { + "query": "label_values(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\"}, namespace)", + "refId": "namespace" + }, + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "datasource": "${datasource}", + "label": "instance", + "name": "instance", + "query": { + "query": "label_values(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\"}, instance)", + 
"refId": "instance" + }, + "refresh": 2, + "sort": 2, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d", + "90d" + ] + }, + "timezone": "utc", + "title": "Alloy / Cluster Node", + "uid": "4047e755d822da63c8158cde32ae4dce" + } \ No newline at end of file diff --git a/operations/alloy-mixin/alloy-cluster-overview.json b/operations/alloy-mixin/alloy-cluster-overview.json new file mode 100644 index 0000000000..b3a68e8fcb --- /dev/null +++ b/operations/alloy-mixin/alloy-cluster-overview.json @@ -0,0 +1,350 @@ +{ + "annotations": { + "list": [ + { + "datasource": "$loki_datasource", + "enable": true, + "expr": "{cluster=\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"alloy\" | name_extracted=~\"alloy.*\"", + "iconColor": "rgba(0, 211, 255, 1)", + "instant": false, + "name": "Deployments", + "titleFormat": "{{cluster}}/{{namespace}}" + } + ] + }, + "graphTooltip": 1, + "links": [ + { + "icon": "doc", + "targetBlank": true, + "title": "Documentation", + "tooltip": "Clustering documentation", + "type": "link", + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + }, + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ + "alloy-mixin" + ], + "targetBlank": false, + "title": "Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "datasource": "${datasource}", + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 0 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "count(cluster_node_info{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", })", + "instant": true, + "legendFormat": "__auto", + "range": false + } + ], + "title": "Nodes", + "type": "stat" + }, + { + 
"datasource": "${datasource}", + "description": "Nodes info.\n", + "fieldConfig": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Dashboard" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "1": { + "index": 0, + "text": "Link" + } + }, + "type": "value" + } + ] + }, + { + "id": "links", + "value": [ + { + "targetBlank": false, + "title": "Detail dashboard for node", + "url": "/d/4047e755d822da63c8158cde32ae4dce/alloy-cluster-node?var-instance=${__data.fields.instance}&var-datasource=${datasource}&var-loki_datasource=${loki_datasource}&var-job=${job}&var-cluster=${cluster}&var-namespace=${namespace}" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 16, + "x": 8, + "y": 0 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "cluster_node_info{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", }", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false + } + ], + "title": "Node table", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": false, + "__name__": true, + "cluster": true, + "namespace": true, + "state": false + }, + "indexByName": { }, + "renameByName": { + "Value": "Dashboard", + "instance": "", + "state": "" + } + } + } + ], + "type": "table" + }, + { + "datasource": "${datasource}", + "description": "Whether the cluster state has converged.\n\nIt is normal for the cluster state to be diverged briefly as gossip events propagate. 
It is not normal for the cluster state to be diverged for a long period of time.\n\nThis will show one of the following:\n\n* Converged: Nodes are aware of all other nodes, with the correct states.\n* Not converged: A subset of nodes aren't aware of their peers, or don't have an updated view of peer states.\n", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "1": { + "color": "red", + "index": 1, + "text": "Not converged" + } + }, + "type": "value" + }, + { + "options": { + "match": "null", + "result": { + "color": "green", + "index": 0, + "text": "Converged" + } + }, + "type": "special" + } + ], + "unit": "suffix:nodes" + } + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 9 + }, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "clamp((\n sum(stddev by (state) (cluster_node_peers{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", }) != 0) or\n (sum(abs(sum without (state) (cluster_node_peers{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", })) - scalar(count(cluster_node_info{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", })) != 0))\n ),\n 1, 1\n )", + "format": "time_series", + "instant": true, + "legendFormat": "__auto", + "range": false + } + ], + "title": "Convergence state", + "type": "stat" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 80, + "spanNulls": true + }, + "mappings": [ + { + "options": { + "0": { + "color": "green", + "text": "Yes" + } + }, + "type": "value" + }, + { + "options": { + "1": { + "color": "red", + "text": "No" + } + }, + "type": "value" + } + ], + "max": 1, + "noValue": 0 + } + }, + "gridPos": { + "h": 9, + "w": 16, + "x": 8, + "y": 9 + }, + "options": { + 
"mergeValues": true + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "ceil(clamp((\n sum(stddev by (state) (cluster_node_peers{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", })) or\n (sum(abs(sum without (state) (cluster_node_peers{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", })) - scalar(count(cluster_node_info{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", }))))\n ),\n 0, 1\n ))", + "instant": false, + "legendFormat": "Converged", + "range": true + } + ], + "title": "Convergence state timeline", + "type": "state-timeline" + } + ], + "refresh": "10s", + "schemaVersion": 36, + "tags": [ + "alloy-mixin" + ], + "templating": { + "list": [ + { + "label": "Data Source", + "name": "datasource", + "query": "prometheus", + "refresh": 1, + "sort": 2, + "type": "datasource" + }, + { + "label": "Loki Data Source", + "name": "loki_datasource", + "query": "loki", + "refresh": 1, + "sort": 2, + "type": "datasource" + }, + { + "datasource": "${datasource}", + "label": "job", + "name": "job", + "query": { + "query": "label_values(alloy_component_controller_running_components, job)", + "refId": "job" + }, + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "datasource": "${datasource}", + "label": "cluster", + "name": "cluster", + "query": { + "query": "label_values(alloy_component_controller_running_components{job=\"$job\"}, cluster)", + "refId": "cluster" + }, + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "datasource": "${datasource}", + "label": "namespace", + "name": "namespace", + "query": { + "query": "label_values(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\"}, namespace)", + "refId": "namespace" + }, + "refresh": 2, + "sort": 2, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + 
"5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d", + "90d" + ] + }, + "timezone": "utc", + "title": "Alloy / Cluster Overview", + "uid": "3a6b7020692f53d8e53b49196f7637dd" + } \ No newline at end of file diff --git a/operations/alloy-mixin/alloy-controller.json b/operations/alloy-mixin/alloy-controller.json new file mode 100644 index 0000000000..1385c21d00 --- /dev/null +++ b/operations/alloy-mixin/alloy-controller.json @@ -0,0 +1,542 @@ +{ + "annotations": { + "list": [ + { + "datasource": "$loki_datasource", + "enable": true, + "expr": "{cluster=\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"alloy\" | name_extracted=~\"alloy.*\"", + "iconColor": "rgba(0, 211, 255, 1)", + "instant": false, + "name": "Deployments", + "titleFormat": "{{cluster}}/{{namespace}}" + } + ] + }, + "graphTooltip": 1, + "links": [ + { + "icon": "doc", + "targetBlank": true, + "title": "Documentation", + "tooltip": "Component controller documentation", + "type": "link", + "url": "https://grafana.com/docs/alloy/latest/concepts/component_controller/" + }, + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ + "alloy-mixin" + ], + "targetBlank": false, + "title": "Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "datasource": "${datasource}", + "description": "The number of Alloy instances whose metrics are being sent and reported.\n", + "fieldConfig": { + "defaults": { + "unit": "instances" + } + }, + "gridPos": { + "h": 4, + "w": 10, + "x": 0, + "y": 0 + }, + "options": { + "colorMode": "none", + "graphMode": "none" + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "count(alloy_component_controller_evaluating{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", })", + "instant": false, + "legendFormat": "__auto", + "range": true + } + ], + "title": "Running instances", + "type": "stat" + }, + { + "datasource": "${datasource}", + "description": 
"The number of running components across all running instances.\n", + "fieldConfig": { + "defaults": { + "unit": "components" + } + }, + "gridPos": { + "h": 4, + "w": 10, + "x": 0, + "y": 4 + }, + "options": { + "colorMode": "none", + "graphMode": "none" + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", })", + "instant": false, + "legendFormat": "__auto", + "range": true + } + ], + "title": "Running components", + "type": "stat" + }, + { + "datasource": "${datasource}", + "description": "The percentage of components which are in a healthy state.\n", + "fieldConfig": { + "defaults": { + "max": 1, + "min": 0, + "noValue": "No components", + "unit": "percentunit" + } + }, + "gridPos": { + "h": 4, + "w": 10, + "x": 0, + "y": 8 + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "text": { + "valueSize": 80 + } + }, + "pluginVersion": "9.0.6", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(alloy_component_controller_running_components{' + $._config.groupSelector + ',health_type=\"healthy\"}) /\nsum(alloy_component_controller_running_components{' + $._config.groupSelector + '})\n", + "instant": false, + "legendFormat": "__auto", + "range": true + } + ], + "title": "Overall component health", + "type": "stat" + }, + { + "datasource": "${datasource}", + "description": "Breakdown of components by health across all running instances.\n\n* Healthy: components have been evaluated completely and are reporting themselves as healthy.\n* Unhealthy: Components either could not be evaluated or are reporting themselves as unhealthy.\n* Unknown: A component has been created but has not yet been started.\n* Exited: A component has exited. 
It will not return to the running state.\n\nMore information on a component's health state can be retrieved using\nthe Alloy UI.\n\nNote that components may be in a degraded state even if they report\nthemselves as healthy. Use component-specific dashboards and alerts\nto observe detailed information about the behavior of a component.\n", + "fieldConfig": { + "defaults": { + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Unhealthy" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unknown" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "blue", + "value": 1 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Exited" + }, + "properties": [ + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 1 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 14, + "x": 10, + "y": 0 + }, + "options": { + "orientation": "vertical", + "showUnfilled": true + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , health_type=\"healthy\"}) or vector(0)", + "instant": true, + "legendFormat": "Healthy", + "range": false + }, + { + "datasource": "${datasource}", + "expr": "sum(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , health_type=\"unhealthy\"}) or vector(0)", + "instant": true, + "legendFormat": "Unhealthy", + 
"range": false + }, + { + "datasource": "${datasource}", + "expr": "sum(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , health_type=\"unknown\"}) or vector(0)", + "instant": true, + "legendFormat": "Unknown", + "range": false + }, + { + "datasource": "${datasource}", + "expr": "sum(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , health_type=\"exited\"}) or vector(0)", + "instant": true, + "legendFormat": "Exited", + "range": false + } + ], + "title": "Components by health", + "type": "bargauge" + }, + { + "datasource": "${datasource}", + "description": "The frequency at which components get updated.\n", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "points", + "pointSize": 3 + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 0, + "y": 12 + }, + "options": { + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (instance) (rate(alloy_component_evaluation_seconds_count{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", }[$__rate_interval]))", + "instant": false, + "legendFormat": "__auto", + "range": true + } + ], + "title": "Component evaluation rate", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "The percentiles for how long it takes to complete component evaluations.\n\nComponent evaluations must complete for components to have the latest\narguments. 
The longer the evaluations take, the slower it will be to\nreconcile the state of components.\n\nIf evaluation is taking too long, consider sharding your components to\ndeal with smaller amounts of data and reuse data as much as possible.\n", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 12 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.99, sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval])))\nor\nhistogram_quantile(0.99, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval])))\n", + "instant": false, + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval])))\nor\nhistogram_quantile(0.50, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval])))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "(\n histogram_sum(sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval]))) /\n histogram_count(sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval])))\n)\nor\n(\n sum(rate(alloy_component_evaluation_seconds_sum{' + $._config.groupSelector + '}[$__rate_interval])) /\n sum(rate(alloy_component_evaluation_seconds_count{' + $._config.groupSelector + '}[$__rate_interval]))\n)\n", + "instant": false, + "legendFormat": "Average", + "range": true + } + ], + "title": "Component evaluation time", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "The percentage of time spent evaluating 'slow' components - components that took longer than 1 minute to 
evaluate.\n\nIdeally, no component should take more than 1 minute to evaluate. The components displayed in this chart\nmay be a sign of a problem with the pipeline.\n", + "fieldConfig": { + "defaults": { + "unit": "percentunit" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 16, + "y": 12 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (component_path, component_id) (rate(alloy_component_evaluation_slow_seconds{' + $._config.groupSelector + '}[$__rate_interval]))\n/ scalar(sum(rate(alloy_component_evaluation_seconds_sum{' + $._config.groupSelector + '}[$__rate_interval])))\n", + "instant": false, + "legendFormat": "{{component_path}} {{component_id}}", + "range": true + } + ], + "title": "Slow components evaluation times", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Detailed histogram view of how long component evaluations take.\n\nThe goal is to design your config so that evaluations take as little\ntime as possible; under 100ms is a good goal.\n", + "gridPos": { + "h": 10, + "w": 8, + "x": 0, + "y": 22 + }, + "maxDataPoints": 30, + "options": { + "calculate": false, + "cellGap": 0, + "color": { + "scheme": "Spectral" + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 0.10000000000000001 + }, + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "unit": "s" + } + }, + "pluginVersion": "9.0.6", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(increase(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval]))\nor ignoring (le)\nsum by (le) (increase(alloy_component_evaluation_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval]))\n", + "format": "heatmap", + "instant": false, + "legendFormat": "{{le}}", + "range": true + } + ], + "title": "Component evaluation histogram", + "type": "heatmap" + }, + { + "datasource": "${datasource}", + "description": "Detailed histogram of how long 
components wait to be evaluated after their dependency is updated.\n\nThe goal is to design your config so that most of the time components do not\nqueue for long; under 10ms is a good goal.\n", + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "maxDataPoints": 30, + "options": { + "calculate": false, + "cellGap": 0, + "color": { + "scheme": "Spectral" + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 0.10000000000000001 + }, + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "unit": "s" + } + }, + "pluginVersion": "9.0.6", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(increase(alloy_component_dependencies_wait_seconds{' + $._config.groupSelector + '}[$__rate_interval]))\nor ignoring (le)\nsum by (le) (increase(alloy_component_dependencies_wait_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval]))\n", + "format": "heatmap", + "instant": false, + "legendFormat": "{{le}}", + "range": true + } + ], + "title": "Component dependency wait histogram", + "type": "heatmap" + } + ], + "refresh": "10s", + "schemaVersion": 36, + "tags": [ + "alloy-mixin" + ], + "templating": { + "list": [ + { + "label": "Data Source", + "name": "datasource", + "query": "prometheus", + "refresh": 1, + "sort": 2, + "type": "datasource" + }, + { + "label": "Loki Data Source", + "name": "loki_datasource", + "query": "loki", + "refresh": 1, + "sort": 2, + "type": "datasource" + }, + { + "datasource": "${datasource}", + "label": "cluster", + "name": "cluster", + "query": { + "query": "label_values(alloy_component_controller_running_components, cluster)\n", + "refId": "cluster" + }, + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "datasource": "${datasource}", + "label": "namespace", + "name": "namespace", + "query": { + "query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\"}, namespace)\n", + "refId": "namespace" + }, + "refresh": 2, + "sort": 2, + 
"type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d", + "90d" + ] + }, + "timezone": "utc", + "title": "Alloy / Controller", + "uid": "bf9f456aad7108b2c808dbd9973e386f" + } \ No newline at end of file diff --git a/operations/alloy-mixin/alloy-logs.json b/operations/alloy-mixin/alloy-logs.json new file mode 100644 index 0000000000..037571fc24 --- /dev/null +++ b/operations/alloy-mixin/alloy-logs.json @@ -0,0 +1,306 @@ +{ + "links": [ + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ + "alloy-mixin" + ], + "targetBlank": false, + "title": "Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "description": "Logs volume grouped by \"level\" label.", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "bars", + "fillOpacity": 50, + "stacking": { + "mode": "normal" + } + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "(E|e)merg|(F|f)atal|(A|a)lert|(C|c)rit.*" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(E|e)(rr.*|RR.*)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(W|w)(arn.*|ARN.*|rn|RN)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(N|n)(otice|ote)|(I|i)(nf.*|NF.*)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + 
"mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "dbg.*|DBG.*|(D|d)(EBUG|ebug)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "(T|t)(race|RACE)" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "light-blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "logs" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "text", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 24 + }, + "id": 1, + "interval": "30s", + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v10.0.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "expr": "sum by (level) (count_over_time({job=\"$job\",cluster=~\"$cluster\",namespace=~\"$namespace\",instance=~\"$instance\",level=~\"$level\"}\n|~ \"$regex_search\"\n\n[$__interval]))\n", + "legendFormat": "{{ level }}" + } + ], + "title": "Logs volume", + "transformations": [ + { + "id": "renameByRegex", + "options": { + "regex": "Value", + "renamePattern": "logs" + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 18, + "w": 24 + }, + "id": 2, + "options": { + "dedupStrategy": "exact", + "enableLogDetails": true, + "prettifyLogMessage": true, + "showTime": false, + "wrapLogMessage": true + }, + "pluginVersion": "v10.0.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "expr": "{job=\"$job\",cluster=~\"$cluster\",namespace=~\"$namespace\",instance=~\"$instance\",level=~\"$level\"} \n|~ \"$regex_search\"\n\n\n" + } + ], + "title": "Logs", + "type": "logs" + } + ], + "refresh": "10s", + "schemaVersion": 36, + "templating": { + "list": [ + { + "label": "Loki data source", + 
"name": "loki_datasource", + "query": "loki", + "regex": "", + "type": "datasource" + }, + { + "allValue": ".*", + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "includeAll": true, + "label": "Cluster", + "multi": true, + "name": "cluster", + "query": "label_values({job=\"$job\"}, cluster)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "query": "label_values({job=\"$job\",cluster=~\"$cluster\"}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "includeAll": true, + "label": "Instance", + "multi": true, + "name": "instance", + "query": "label_values({job=\"$job\",cluster=~\"$cluster\",namespace=~\"$namespace\"}, instance)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "datasource": { + "type": "loki", + "uid": "${loki_datasource}" + }, + "includeAll": true, + "label": "Level", + "multi": true, + "name": "level", + "query": "label_values({job=\"$job\",cluster=~\"$cluster\",namespace=~\"$namespace\",instance=~\"$instance\"}, level)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "label": "Regex search", + "name": "regex_search", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "type": "textbox" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "Alloy logs overview", + "uid": "alloy-logs-overview" + } \ No newline at end of file diff --git a/operations/alloy-mixin/alloy-opentelemetry.json b/operations/alloy-mixin/alloy-opentelemetry.json new file mode 100644 index 0000000000..f17f5c0e90 --- /dev/null +++ 
b/operations/alloy-mixin/alloy-opentelemetry.json @@ -0,0 +1,420 @@ +{ + "graphTooltip": 1, + "links": [ + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ + "alloy-mixin" + ], + "targetBlank": false, + "title": "Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "datasource": "${datasource}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "title": "Receivers for traces [otelcol.receiver]", + "type": "row" + }, + { + "datasource": "${datasource}", + "description": "Number of spans successfully pushed into the pipeline.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + } + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 0, + "y": 0 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "rate(receiver_accepted_spans_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n", + "instant": false, + "legendFormat": "{{ pod }} / {{ transport }}", + "range": true + } + ], + "title": "Accepted spans", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Number of spans that could not be pushed into the pipeline.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + } + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 0 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "rate(receiver_refused_spans_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n", + "instant": false, + "legendFormat": "{{ pod }} / {{ transport }}", + "range": true + } + ], + "title": "Refused spans", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "The duration of inbound RPCs.\n", + "gridPos": { + "h": 10, + "w": 8, + "x": 16, + "y": 0 + }, + 
"maxDataPoints": 30, + "options": { + "calculate": false, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "scale": "exponential", + "scheme": "Oranges", + "steps": 65 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1.0000000000000001e-09 + }, + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "unit": "ms" + } + }, + "pluginVersion": "9.0.6", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (le) (increase(rpc_server_duration_milliseconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", rpc_service=\"opentelemetry.proto.collector.trace.v1.TraceService\"}[$__rate_interval]))", + "format": "heatmap", + "instant": false, + "legendFormat": "{{le}}", + "range": true + } + ], + "title": "RPC server duration", + "type": "heatmap" + }, + { + "datasource": "${datasource}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "title": "Batching of logs, metrics, and traces [otelcol.processor.batch]", + "type": "row" + }, + { + "datasource": "${datasource}", + "description": "Number of spans, metric datapoints, or log lines in a batch\n", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 0, + "y": 10 + }, + "maxDataPoints": 30, + "options": { + "calculate": false, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "scale": "exponential", + "scheme": "Oranges", + "steps": 65 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1.0000000000000001e-09 + }, + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "unit": "short" + } + }, + "pluginVersion": "9.0.6", + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))", + "format": 
"heatmap", + "instant": false, + "legendFormat": "{{le}}", + "range": true + } + ], + "title": "Number of units in the batch", + "type": "heatmap" + }, + { + "datasource": "${datasource}", + "description": "Number of distinct metadata value combinations being processed\n", + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 10 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "processor_batch_metadata_cardinality_ratio{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}\n", + "instant": false, + "legendFormat": "{{ pod }}", + "range": true + } + ], + "title": "Distinct metadata values", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Number of times the batch was sent due to a timeout trigger\n", + "gridPos": { + "h": 10, + "w": 8, + "x": 16, + "y": 10 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n", + "instant": false, + "legendFormat": "{{ pod }}", + "range": true + } + ], + "title": "Timeout trigger", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "title": "Exporters for traces [otelcol.exporter]", + "type": "row" + }, + { + "datasource": "${datasource}", + "description": "Number of spans successfully sent to destination.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + } + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 0, + "y": 20 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "rate(exporter_sent_spans_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n", + "instant": false, + "legendFormat": "{{ pod }}", + "range": true + } + ], + "title": "Exported sent spans", + "type": 
"timeseries" + }, + { + "datasource": "${datasource}", + "description": "Number of spans in failed attempts to send to destination.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + } + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 20 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n", + "instant": false, + "legendFormat": "{{ pod }}", + "range": true + } + ], + "title": "Exported failed spans", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 36, + "tags": [ + "alloy-mixin" + ], + "templating": { + "list": [ + { + "label": "Data Source", + "name": "datasource", + "query": "prometheus", + "refresh": 1, + "sort": 2, + "type": "datasource" + }, + { + "label": "Loki Data Source", + "name": "loki_datasource", + "query": "loki", + "refresh": 1, + "sort": 2, + "type": "datasource" + }, + { + "datasource": "${datasource}", + "label": "cluster", + "name": "cluster", + "query": { + "query": "label_values(alloy_component_controller_running_components, cluster)\n", + "refId": "cluster" + }, + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "datasource": "${datasource}", + "label": "namespace", + "name": "namespace", + "query": { + "query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\"}, namespace)\n", + "refId": "namespace" + }, + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "allValue": ".*", + "datasource": "${datasource}", + "includeAll": true, + "label": "instance", + "multi": true, + "name": "instance", + "query": { + "query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\"}, instance)\n", + "refId": "instance" + }, + "refresh": 2, + "sort": 2, + "type": "query" + } + ] + }, + "time": 
{ + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d", + "90d" + ] + }, + "timezone": "utc", + "title": "Alloy / OpenTelemetry", + "uid": "9b6d37c8603e19e8922133984faad93d" + } \ No newline at end of file diff --git a/operations/alloy-mixin/alloy-prometheus-remote-write.json b/operations/alloy-mixin/alloy-prometheus-remote-write.json new file mode 100644 index 0000000000..845b692f0a --- /dev/null +++ b/operations/alloy-mixin/alloy-prometheus-remote-write.json @@ -0,0 +1,655 @@ +{ + "annotations": { + "list": [ + { + "datasource": "$loki_datasource", + "enable": true, + "expr": "{cluster=\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"alloy\" | name_extracted=~\"alloy.*\"", + "iconColor": "rgba(0, 211, 255, 1)", + "instant": false, + "name": "Deployments", + "titleFormat": "{{cluster}}/{{namespace}}" + } + ] + }, + "graphTooltip": 1, + "links": [ + { + "icon": "doc", + "targetBlank": true, + "title": "Documentation", + "tooltip": "Component documentation", + "type": "link", + "url": "https://grafana.com/docs/alloy/latest/reference/components/prometheus.remote_write/" + }, + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ + "alloy-mixin" + ], + "targetBlank": false, + "title": "Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "datasource": "${datasource}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "title": "prometheus.scrape", + "type": "row" + }, + { + "datasource": "${datasource}", + "description": "Percentage of targets successfully scraped by prometheus.scrape\ncomponents.\n\nThis metric is calculated by dividing the number of targets\nsuccessfully scraped by the total number of targets scraped,\nacross all the 
namespaces in the selected cluster.\n\nLow success rates can indicate a problem with scrape targets,\nstale service discovery, or Alloy misconfiguration.\n", + "fieldConfig": { + "defaults": { + "unit": "percentunit" + } + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 1 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(up{cluster=\"$cluster\"})\n/\ncount (up{cluster=\"$cluster\"})\n", + "instant": false, + "legendFormat": "% of targets successfully scraped", + "range": true + } + ], + "title": "Scrape success rate in $cluster", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Duration of successful scrapes by prometheus.scrape components,\nacross all the namespaces in the selected cluster.\n\nThis metric should be below your configured scrape interval.\nHigh durations can indicate a problem with a scrape target or\na performance issue with Alloy.\n", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 1 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "quantile(0.99, scrape_duration_seconds{cluster=\"$cluster\"})\n", + "instant": false, + "legendFormat": "p99", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "quantile(0.95, scrape_duration_seconds{cluster=\"$cluster\"})\n", + "instant": false, + "legendFormat": "p95", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "quantile(0.50, scrape_duration_seconds{cluster=\"$cluster\"})\n", + "instant": false, + "legendFormat": "p50", + "range": true + } + ], + "title": "Scrape duration in $cluster", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": "${datasource}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "title": "prometheus.remote_write", + "type": "row" + }, + { + "datasource": "${datasource}", + "description": "How far behind prometheus.remote_write is from samples recently written\nto the 
WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics to\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the maximum number of shards.\n", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, + "gridPos": { + "h": 10, + "w": 6, + "x": 0, + "y": 12 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "instant": false, + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "range": true + } + ], + "title": "WAL delay", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 6, + "x": 6, + "y": 12 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=\"$cluster\", 
namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "instant": false, + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "range": true + } + ], + "title": "Data write throughput", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "unit": "s" + } + }, + "gridPos": { + "h": 10, + "w": 6, + "x": 12, + "y": 12 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", + "range": 
true + } + ], + "title": "Write latency", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Total number of shards which are concurrently sending samples read\nfrom the Write-Ahead Log.\n\nShards are bound to a minimum and maximum, displayed on the graph.\nThe lowest minimum and the highest maximum across all clients is\nshown.\n\nEach client has its own set of shards, minimum shards, and maximum\nshards; filter to a specific URL to display more granular\ninformation.\n", + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Minimum" + }, + "properties": [ + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 15 + ], + "fill": "dash" + } + }, + { + "id": "custom.showPoints", + "value": "never" + }, + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": false, + "viz": false + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Maximum" + }, + "properties": [ + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 15 + ], + "fill": "dash" + } + }, + { + "id": "custom.showPoints", + "value": "never" + }, + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": false, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 6, + "x": 18, + "y": 12 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum without (remote_name, url) (\n prometheus_remote_storage_shards{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "instant": false, + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "min (\n prometheus_remote_storage_shards_min{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", 
url=~\"$url\"}\n)\n", + "instant": false, + "legendFormat": "Minimum", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "max (\n prometheus_remote_storage_shards_max{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "instant": false, + "legendFormat": "Maximum", + "range": true + } + ], + "title": "Shards", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Total outgoing samples sent by prometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + }, + "unit": "cps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 0, + "y": 22 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum without (url, remote_name) (\n rate(prometheus_remote_storage_samples_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "instant": false, + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "range": true + } + ], + "title": "Sent samples / second", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of samples which prometheus.remote_write could not send due to\nnon-recoverable errors.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + }, + "unit": "cps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum without (url,remote_name) (\n rate(prometheus_remote_storage_samples_failed_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", 
url=~\"$url\"}[$__rate_interval])\n)\n", + "instant": false, + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "range": true + } + ], + "title": "Failed samples / second", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of samples which prometheus.remote_write attempted to resend\nafter receiving a recoverable error.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + }, + "unit": "cps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 16, + "y": 22 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum without (url,remote_name) (\n rate(prometheus_remote_storage_samples_retried_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "instant": false, + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "range": true + } + ], + "title": "Retried samples / second", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Total number of active series across all components.\n\nAn \"active series\" is a series that prometheus.remote_write recently\nreceived a sample for. 
Active series are garbage collected whenever a\ntruncation of the WAL occurs.\n", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 0, + "y": 32 + }, + "options": { + "legend": { + "showLegend": false + } + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum(prometheus_remote_write_wal_storage_active_series{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"})\n", + "instant": false, + "legendFormat": "Series", + "range": true + } + ], + "title": "Active series (total)", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Total number of active series which are currently being tracked by\nprometheus.remote_write components, with separate lines for each Alloy instance.\n\nAn \"active series\" is a series that prometheus.remote_write recently\nreceived a sample for. Active series are garbage collected whenever a\ntruncation of the WAL occurs.\n", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 32 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "prometheus_remote_write_wal_storage_active_series{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_id!=\"\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n", + "instant": false, + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "range": true + } + ], + "title": "Active series (by instance/component)", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Total number of active series which are currently being tracked by\nprometheus.remote_write components, aggregated across all instances.\n\nAn \"active series\" is a series that prometheus.remote_write recently\nreceived a sample for. 
Active series are garbage collected whenever a\ntruncation of the WAL occurs.\n", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 16, + "y": 32 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (component_path, component_id) (prometheus_remote_write_wal_storage_active_series{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_id!=\"\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"})\n", + "instant": false, + "legendFormat": "{{component_path}} {{component_id}}", + "range": true + } + ], + "title": "Active series (by component)", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 36, + "tags": [ + "alloy-mixin" + ], + "templating": { + "list": [ + { + "label": "Data Source", + "name": "datasource", + "query": "prometheus", + "refresh": 1, + "sort": 2, + "type": "datasource" + }, + { + "label": "Loki Data Source", + "name": "loki_datasource", + "query": "loki", + "refresh": 1, + "sort": 2, + "type": "datasource" + }, + { + "datasource": "${datasource}", + "label": "cluster", + "name": "cluster", + "query": { + "query": "label_values(alloy_component_controller_running_components, cluster)\n", + "refId": "cluster" + }, + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "datasource": "${datasource}", + "label": "namespace", + "name": "namespace", + "query": { + "query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\"}, namespace)\n", + "refId": "namespace" + }, + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "allValue": ".*", + "datasource": "${datasource}", + "includeAll": true, + "label": "instance", + "multi": true, + "name": "instance", + "query": { + "query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\"}, instance)\n", + "refId": "instance" + }, + "refresh": 2, + "sort": 2, + "type": 
"query" + }, + { + "allValue": ".*", + "datasource": "${datasource}", + "includeAll": true, + "label": "component_path", + "multi": true, + "name": "component_path", + "query": { + "query": "label_values(prometheus_remote_write_wal_samples_appended_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_id=~\"prometheus\\\\.remote_write\\\\..*\", component_path=~\".*\"}, component_path)\n", + "refId": "component_path" + }, + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "allValue": ".*", + "datasource": "${datasource}", + "includeAll": true, + "label": "component", + "multi": true, + "name": "component", + "query": { + "query": "label_values(prometheus_remote_write_wal_samples_appended_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_id=~\"prometheus\\\\.remote_write\\\\..*\"}, component_id)\n", + "refId": "component" + }, + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "allValue": ".*", + "datasource": "${datasource}", + "includeAll": true, + "label": "url", + "multi": true, + "name": "url", + "query": { + "query": "label_values(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_id=~\"$component\"}, url)\n", + "refId": "url" + }, + "refresh": 2, + "sort": 2, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d", + "90d" + ] + }, + "timezone": "utc", + "title": "Alloy / Prometheus Components", + "uid": "e324cc55567d7f3a8e32860ff8e6d0d9" + } \ No newline at end of file diff --git a/operations/alloy-mixin/alloy-resources.json b/operations/alloy-mixin/alloy-resources.json new file mode 100644 index 0000000000..7aefa1a697 --- /dev/null +++ 
b/operations/alloy-mixin/alloy-resources.json @@ -0,0 +1,329 @@ +{ + "annotations": { + "list": [ + { + "datasource": "$loki_datasource", + "enable": true, + "expr": "{cluster=\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"alloy\" | name_extracted=~\"alloy.*\"", + "iconColor": "rgba(0, 211, 255, 1)", + "instant": false, + "name": "Deployments", + "titleFormat": "{{cluster}}/{{namespace}}" + } + ] + }, + "graphTooltip": 1, + "links": [ + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ + "alloy-mixin" + ], + "targetBlank": false, + "title": "Dashboards", + "type": "dashboards" + } + ], + "panels": [ + { + "datasource": "${datasource}", + "description": "CPU usage of the Alloy process relative to 1 CPU core.\n\nFor example, 100% means using one entire CPU core.\n", + "fieldConfig": { + "defaults": { + "unit": "percentunit" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "rate(alloy_resources_process_cpu_seconds_total{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}[$__rate_interval])", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "CPU usage", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Resident memory size of the Alloy process.\n", + "fieldConfig": { + "defaults": { + "unit": "decbytes" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "alloy_resources_process_resident_memory_bytes{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Memory (RSS)", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate at which the Alloy process performs garbage collections.\n", + 
"fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "points", + "pointSize": 3 + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 8 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "rate(go_gc_duration_seconds_count{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}[5m])\nand on(instance)\nalloy_build_info{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Garbage collections", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Number of goroutines which are running in parallel. An infinitely\ngrowing number of these indicates a goroutine leak.\n", + "fieldConfig": { + "defaults": { + "unit": "none" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 8 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "go_goroutines{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\nand on(instance)\nalloy_build_info{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Goroutines", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Heap memory currently in use by the Alloy process.\n", + "fieldConfig": { + "defaults": { + "unit": "decbytes" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 8 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\nand on(instance)\nalloy_build_info{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Memory (heap inuse)", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": 
"Rate of data received across all network interfaces for the machine\nAlloy is running on.\n\nData shown here is across all running processes and not exclusive to\nthe running Alloy process.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 30, + "gradientMode": "none", + "stacking": { + "mode": "normal" + } + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "rate(alloy_resources_machine_rx_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}[$__rate_interval])\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Network receive bandwidth", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data sent across all network interfaces for the machine\nAlloy is running on.\n\nData shown here is across all running processes and not exclusive to\nthe running Alloy process.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 30, + "gradientMode": "none", + "stacking": { + "mode": "normal" + } + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "rate(alloy_resources_machine_tx_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}[$__rate_interval])\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Network send bandwidth", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 36, + "tags": [ + "alloy-mixin" + ], + "templating": { + "list": [ + { + "label": "Data Source", + "name": "datasource", + "query": "prometheus", + "refresh": 1, + "sort": 2, + "type": "datasource" + }, + { + "label": "Loki Data Source", + "name": "loki_datasource", + "query": "loki", + "refresh": 1, + "sort": 2, + "type": "datasource" + }, + { + "datasource": 
"${datasource}", + "label": "cluster", + "name": "cluster", + "query": { + "query": "label_values(alloy_component_controller_running_components, cluster)\n", + "refId": "cluster" + }, + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "datasource": "${datasource}", + "label": "namespace", + "name": "namespace", + "query": { + "query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\"}, namespace)\n", + "refId": "namespace" + }, + "refresh": 2, + "sort": 2, + "type": "query" + }, + { + "allValue": ".*", + "datasource": "${datasource}", + "includeAll": true, + "label": "instance", + "multi": true, + "name": "instance", + "query": { + "query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\"}, instance)\n", + "refId": "instance" + }, + "refresh": 2, + "sort": 2, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d", + "90d" + ] + }, + "timezone": "utc", + "title": "Alloy / Resources", + "uid": "d6a8574c31f3d7cb8f1345ec84d15a67" + } \ No newline at end of file diff --git a/operations/alloy-mixin/config.libsonnet b/operations/alloy-mixin/config.libsonnet new file mode 100644 index 0000000000..6e1ecace1f --- /dev/null +++ b/operations/alloy-mixin/config.libsonnet @@ -0,0 +1,12 @@ +{ + _config+:: { + enableK8sCluster: true, + enableAlloyCluster: true, + enableLokiLogs: true, + filterSelector: 'job="$job"', + groupSelector: if self.enableK8sCluster then self.filterSelector + ', ' + self.k8sClusterSelector else self.filterSelector, + instanceSelector: self.groupSelector + ', instance="$instance"', + k8sClusterSelector: 'cluster="$cluster", namespace="$namespace", ', + dashboardTag: 'alloy-mixin' + } +} \ No newline at end of file 
diff --git a/operations/alloy-mixin/dashboards.libsonnet b/operations/alloy-mixin/dashboards.libsonnet index 661de183dc..61e4dda7c9 100644 --- a/operations/alloy-mixin/dashboards.libsonnet +++ b/operations/alloy-mixin/dashboards.libsonnet @@ -1,9 +1,20 @@ -{ - grafanaDashboards+: - (import './dashboards/controller.libsonnet') + - (import './dashboards/resources.libsonnet') + - (import './dashboards/prometheus.libsonnet') + - (import './dashboards/cluster-node.libsonnet') + - (import './dashboards/opentelemetry.libsonnet') + - (import './dashboards/cluster-overview.libsonnet'), +local alloyClusterDashboards = + (import './dashboards/controller.libsonnet') + + (import './dashboards/cluster-node.libsonnet') + + (import './dashboards/cluster-overview.libsonnet') + + (import './config.libsonnet'); + +local otherDashboards = (import './dashboards/resources.libsonnet') + + (import './dashboards/prometheus.libsonnet') + + (import './dashboards/opentelemetry.libsonnet') + + (import './config.libsonnet'); + +(import './dashboards/alloy-logs.libsonnet') + +{ + grafanaDashboards+: + if $._config.enableAlloyCluster then + alloyClusterDashboards + + otherDashboards + else + otherDashboards } diff --git a/operations/alloy-mixin/dashboards/alloy-logs.libsonnet b/operations/alloy-mixin/dashboards/alloy-logs.libsonnet new file mode 100644 index 0000000000..dc1183c6b0 --- /dev/null +++ b/operations/alloy-mixin/dashboards/alloy-logs.libsonnet @@ -0,0 +1,31 @@ +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-v10.0.0/main.libsonnet'; +local logsDashboard = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libsonnet'; +{ + grafanaDashboards+: + if $._config.enableLokiLogs then { + local alloyLogs = + logsDashboard.new( + 'Alloy logs overview', + datasourceName='loki_datasource', + datasourceRegex='', + filterSelector=$._config.filterSelector, + labels=['cluster', 'namespace', 'instance', 'level'], + formatParser=null, + showLogsVolume=true + ) + { + panels+: + { + 
logs+: + // Alloy logs already have timestamp + g.panel.logs.options.withShowTime(false), + }, + dashboards+: + { + logs+: g.dashboard.withLinksMixin($.grafanaDashboards['alloy-resources.json'].links) + + g.dashboard.withRefresh('10s'), + }, + }, + 'alloy-logs.json': alloyLogs.dashboards.logs, + } else {}, +} diff --git a/operations/alloy-mixin/dashboards/cluster-node.libsonnet b/operations/alloy-mixin/dashboards/cluster-node.libsonnet index 0e6241afdc..a81320d879 100644 --- a/operations/alloy-mixin/dashboards/cluster-node.libsonnet +++ b/operations/alloy-mixin/dashboards/cluster-node.libsonnet @@ -4,23 +4,26 @@ local filename = 'alloy-cluster-node.json'; { [filename]: - dashboard.new(name='Alloy / Cluster Node') + + dashboard.new(name='Alloy / Cluster Node', tag=$._config.dashboardTag) + dashboard.withDocsLink( url='https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode', desc='Clustering documentation', ) + - dashboard.withDashboardsLink() + + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('cluster', ||| - label_values(alloy_component_controller_running_components, cluster) - |||), - dashboard.newTemplateVariable('namespace', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster"}, namespace) - |||), - dashboard.newTemplateVariable('instance', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace"}, instance) - |||), + dashboard.newTemplateVariable('job', + 'label_values(alloy_component_controller_running_components, job)' + ), + dashboard.newTemplateVariable('cluster', + 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + '}, cluster)' + ), + dashboard.newTemplateVariable('namespace', + 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + ', cluster="$cluster"}, 
namespace)' + ), + dashboard.newTemplateVariable('instance', + 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + ', cluster="$cluster", namespace="$namespace"}, instance)' + ), ]) + // TODO(@tpaschalis) Make the annotation optional. dashboard.withAnnotations([ @@ -49,22 +52,22 @@ local filename = 'alloy-cluster-node.json'; panel.withPosition({ x: 0, y: 1, w: 12, h: 8 }) + panel.withQueries([ panel.newNamedInstantQuery( - expr='sum(cluster_node_lamport_time{instance="$instance", cluster="$cluster", namespace="$namespace"})', + expr='sum(cluster_node_lamport_time{' + $._config.instanceSelector + '})', refId='Lamport clock time', format='table', ), panel.newNamedInstantQuery( - expr='sum(cluster_node_update_observers{instance="$instance", cluster="$cluster", namespace="$namespace"})', + expr='sum(cluster_node_update_observers{' + $._config.instanceSelector + '})', refId='Internal cluster state observers', format='table', ), panel.newNamedInstantQuery( - expr='sum(cluster_node_gossip_health_score{instance="$instance", cluster="$cluster", namespace="$namespace"})', + expr='sum(cluster_node_gossip_health_score{' + $._config.instanceSelector + '})', refId='Gossip health score', format='table', ), panel.newNamedInstantQuery( - expr='sum(cluster_node_gossip_proto_version{instance="$instance", cluster="$cluster", namespace="$namespace"})', + expr='sum(cluster_node_gossip_proto_version{' + $._config.instanceSelector + '})', refId='Gossip protocol version', format='table', ), @@ -100,7 +103,7 @@ local filename = 'alloy-cluster-node.json'; panel.withPosition({ x: 12, y: 1, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='rate(cluster_node_gossip_received_events_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval])', + expr='rate(cluster_node_gossip_received_events_total{' + $._config.instanceSelector + '}[$__rate_interval])', legendFormat='{{event}}' ), ]) @@ -114,7 +117,7 @@ local 
filename = 'alloy-cluster-node.json'; panel.withPosition({ x: 0, y: 9, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='sum(cluster_node_peers{instance="$instance", cluster="$cluster", namespace="$namespace"})', + expr='sum(cluster_node_peers{' + $._config.instanceSelector + '})', ), ]) + panel.withUnit('suffix:peers') @@ -128,7 +131,7 @@ local filename = 'alloy-cluster-node.json'; panel.withPosition({ x: 12, y: 9, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='cluster_node_peers{instance="$instance", cluster="$cluster", namespace="$namespace"}', + expr='cluster_node_peers{' + $._config.instanceSelector + '}', legendFormat='{{state}}', ), ]) + @@ -150,11 +153,11 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr='rate(cluster_transport_rx_bytes_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval])', + expr='rate(cluster_transport_rx_bytes_total{' + $._config.instanceSelector + '}[$__rate_interval])', legendFormat='rx', ), panel.newQuery( - expr='-1 * rate(cluster_transport_tx_bytes_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval])', + expr='-1 * rate(cluster_transport_tx_bytes_total{' + $._config.instanceSelector + '}[$__rate_interval])', legendFormat='tx', ), ]) + @@ -172,21 +175,19 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr=||| - 1 - ( - rate(cluster_transport_tx_packets_failed_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) / - rate(cluster_transport_tx_packets_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) - ) - |||, + expr= + '1 - ( + rate(cluster_transport_tx_packets_failed_total{' + $._config.instanceSelector + '}[$__rate_interval]) / + rate(cluster_transport_tx_packets_total{' + $._config.instanceSelector + '}[$__rate_interval]) + )', legendFormat='Tx success %', ), 
panel.newQuery( - expr=||| - 1 - ( - rate(cluster_transport_rx_packets_failed_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) / - rate(cluster_transport_rx_packets_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) - ) - |||, + expr= + '1 - ( + rate(cluster_transport_rx_packets_failed_total{' + $._config.instanceSelector + '}[$__rate_interval]) / + rate(cluster_transport_rx_packets_total{' + $._config.instanceSelector + '}[$__rate_interval]) + )', legendFormat='Rx success %', ), ]) + @@ -208,11 +209,11 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr='cluster_transport_tx_packet_queue_length{instance="$instance", cluster="$cluster", namespace="$namespace"}', + expr='cluster_transport_tx_packet_queue_length{' + $._config.instanceSelector + '}', legendFormat='tx queue', ), panel.newQuery( - expr='cluster_transport_rx_packet_queue_length{instance="$instance", cluster="$cluster", namespace="$namespace"}', + expr='cluster_transport_rx_packet_queue_length{' + $._config.instanceSelector + '}', legendFormat='rx queue', ), ]) + @@ -229,11 +230,11 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr='rate(cluster_transport_stream_rx_bytes_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval])', + expr='rate(cluster_transport_stream_rx_bytes_total{' + $._config.instanceSelector + '}[$__rate_interval])', legendFormat='rx', ), panel.newQuery( - expr='-1 * rate(cluster_transport_stream_tx_bytes_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval])', + expr='-1 * rate(cluster_transport_stream_tx_bytes_total{' + $._config.instanceSelector + '}[$__rate_interval])', legendFormat='tx', ), ]) + @@ -251,21 +252,19 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr=||| - 1 - ( - 
rate(cluster_transport_stream_tx_packets_failed_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) / - rate(cluster_transport_stream_tx_packets_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) - ) - |||, + expr= + '1 - ( + rate(cluster_transport_stream_tx_packets_failed_total{' + $._config.instanceSelector + '}[$__rate_interval]) / + rate(cluster_transport_stream_tx_packets_total{' + $._config.instanceSelector + '}[$__rate_interval]) + )', legendFormat='Tx success %' ), panel.newQuery( - expr=||| - 1 - ( - rate(cluster_transport_stream_rx_packets_failed_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) / - rate(cluster_transport_stream_rx_packets_total{instance="$instance", cluster="$cluster", namespace="$namespace"}[$__rate_interval]) - ) - |||, + expr= + '1 - ( + rate(cluster_transport_stream_rx_packets_failed_total{' + $._config.instanceSelector + '}[$__rate_interval]) / + rate(cluster_transport_stream_rx_packets_total{' + $._config.instanceSelector + '}[$__rate_interval]) + )', legendFormat='Rx success %' ), ]) + @@ -287,7 +286,7 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr='cluster_transport_streams{instance="$instance", cluster="$cluster", namespace="$namespace"}', + expr='cluster_transport_streams{' + $._config.instanceSelector + '}', legendFormat='Open streams' ), ]) diff --git a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet index 314828cbe4..7a63f238de 100644 --- a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet +++ b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet @@ -5,20 +5,23 @@ local cluster_node_filename = 'alloy-cluster-node.json'; { [filename]: - dashboard.new(name='Alloy / Cluster Overview') + + dashboard.new(name='Alloy / Cluster Overview', tag=$._config.dashboardTag) + 
dashboard.withDocsLink( url='https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode', desc='Clustering documentation', ) + - dashboard.withDashboardsLink() + + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('cluster', ||| - label_values(alloy_component_controller_running_components, cluster) - |||), - dashboard.newTemplateVariable('namespace', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster"}, namespace) - |||), + dashboard.newTemplateVariable('job', + 'label_values(alloy_component_controller_running_components, job)' + ), + dashboard.newTemplateVariable('cluster', + 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + '}, cluster)' + ), + dashboard.newTemplateVariable('namespace', + 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + ', cluster="$cluster"}, namespace)' + ), ]) + // TODO(@tpaschalis) Make the annotation optional. 
dashboard.withAnnotations([ @@ -31,7 +34,7 @@ local cluster_node_filename = 'alloy-cluster-node.json'; panel.withPosition({ h: 9, w: 8, x: 0, y: 0 }) + panel.withQueries([ panel.newInstantQuery( - expr='count(cluster_node_info{cluster="$cluster", namespace="$namespace"})' + expr='count(cluster_node_info{' + $._config.groupSelector + '})' ), ]) ), @@ -44,7 +47,7 @@ local cluster_node_filename = 'alloy-cluster-node.json'; panel.withPosition({ h: 9, w: 16, x: 8, y: 0 }) + panel.withQueries([ panel.newInstantQuery( - expr='cluster_node_info{cluster="$cluster", namespace="$namespace"}', + expr='cluster_node_info{' + $._config.groupSelector + '}', format='table', ), ]) + @@ -97,7 +100,7 @@ local cluster_node_filename = 'alloy-cluster-node.json'; { targetBlank: false, title: 'Detail dashboard for node', - url: '/d/%(uid)s/alloy-cluster-node?var-instance=${__data.fields.instance}&var-datasource=${datasource}&var-loki_datasource=${loki_datasource}&var-cluster=${cluster}&var-namespace=${namespace}' % { uid: std.md5(cluster_node_filename) }, + url: '/d/%(uid)s/alloy-cluster-node?var-instance=${__data.fields.instance}&var-datasource=${datasource}&var-loki_datasource=${loki_datasource}&var-job=${job}&var-cluster=${cluster}&var-namespace=${namespace}' % { uid: std.md5(cluster_node_filename) }, }, ], }, @@ -122,14 +125,14 @@ local cluster_node_filename = 'alloy-cluster-node.json'; panel.withPosition({ h: 9, w: 8, x: 0, y: 9 }) + panel.withQueries([ panel.newInstantQuery( - expr=||| - clamp(( - sum(stddev by (state) (cluster_node_peers{cluster="$cluster", namespace="$namespace"}) != 0) or - (sum(abs(sum without (state) (cluster_node_peers{cluster="$cluster", namespace="$namespace"})) - scalar(count(cluster_node_info{cluster="$cluster", namespace="$namespace"})) != 0)) + expr= + 'clamp(( + sum(stddev by (state) (cluster_node_peers{' + $._config.groupSelector + '}) != 0) or + (sum(abs(sum without (state) (cluster_node_peers{' + $._config.groupSelector + '})) - 
scalar(count(cluster_node_info{' + $._config.groupSelector + '})) != 0)) ), 1, 1 - ) - |||, + )' + , format='time_series' ), ]) + @@ -191,14 +194,14 @@ local cluster_node_filename = 'alloy-cluster-node.json'; panel.withPosition({ h: 9, w: 16, x: 8, y: 9 }) + panel.withQueries([ panel.newQuery( - expr=||| - ceil(clamp(( - sum(stddev by (state) (cluster_node_peers{cluster="$cluster", namespace="$namespace"})) or - (sum(abs(sum without (state) (cluster_node_peers{cluster="$cluster", namespace="$namespace"})) - scalar(count(cluster_node_info{cluster="$cluster", namespace="$namespace"})))) + expr= + 'ceil(clamp(( + sum(stddev by (state) (cluster_node_peers{' + $._config.groupSelector + '})) or + (sum(abs(sum without (state) (cluster_node_peers{' + $._config.groupSelector + '})) - scalar(count(cluster_node_info{' + $._config.groupSelector + '})))) ), 0, 1 - )) - |||, + ))' + , legendFormat='Converged' ), ]) + diff --git a/operations/alloy-mixin/dashboards/controller.libsonnet b/operations/alloy-mixin/dashboards/controller.libsonnet index bd6623e80b..b1711c2281 100644 --- a/operations/alloy-mixin/dashboards/controller.libsonnet +++ b/operations/alloy-mixin/dashboards/controller.libsonnet @@ -4,20 +4,23 @@ local filename = 'alloy-controller.json'; { [filename]: - dashboard.new(name='Alloy / Controller') + + dashboard.new(name='Alloy / Controller', tag=$._config.dashboardTag) + dashboard.withDocsLink( url='https://grafana.com/docs/alloy/latest/concepts/component_controller/', desc='Component controller documentation', ) + - dashboard.withDashboardsLink() + + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('cluster', ||| - label_values(alloy_component_controller_running_components, cluster) - |||), - dashboard.newTemplateVariable('namespace', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster"}, namespace) - |||), + 
dashboard.newTemplateVariable('job', + 'label_values(alloy_component_controller_running_components, job)' + ), + dashboard.newTemplateVariable('cluster', + 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + '}, cluster)' + ), + dashboard.newTemplateVariable('namespace', + 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + ', cluster="$cluster"}, namespace)' + ), ]) + // TODO(@tpaschalis) Make the annotation optional. dashboard.withAnnotations([ @@ -34,7 +37,7 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 0, y: 0, w: 10, h: 4 }) + panel.withQueries([ panel.newQuery( - expr='count(alloy_component_controller_evaluating{cluster="$cluster", namespace="$namespace"})', + expr='count(alloy_component_controller_evaluating{' + $._config.groupSelector + '})', ), ]) ), @@ -49,7 +52,7 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 0, y: 4, w: 10, h: 4 }) + panel.withQueries([ panel.newQuery( - expr='sum(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace"})', + expr='sum(alloy_component_controller_running_components{' + $._config.groupSelector + '})', ), ]) ), @@ -72,10 +75,9 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 0, y: 8, w: 10, h: 4 }) + panel.withQueries([ panel.newQuery( - expr=||| - sum(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace",health_type="healthy"}) / - sum(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace"}) - |||, + expr= + 'sum(alloy_component_controller_running_components{' + $._config.groupSelector + ',health_type="healthy"}) / + sum(alloy_component_controller_running_components{' + $._config.groupSelector + '})', ), ]) ), @@ -157,19 +159,19 @@ local filename = 'alloy-controller.json'; panel.withQueries([ panel.newInstantQuery( legendFormat='Healthy', - 
expr='sum(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace", health_type="healthy"}) or vector(0)', + expr='sum(alloy_component_controller_running_components{' + $._config.groupSelector + ', health_type="healthy"}) or vector(0)', ), panel.newInstantQuery( legendFormat='Unhealthy', - expr='sum(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace", health_type="unhealthy"}) or vector(0)', + expr='sum(alloy_component_controller_running_components{' + $._config.groupSelector + ', health_type="unhealthy"}) or vector(0)', ), panel.newInstantQuery( legendFormat='Unknown', - expr='sum(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace", health_type="unknown"}) or vector(0)', + expr='sum(alloy_component_controller_running_components{' + $._config.groupSelector + ', health_type="unknown"}) or vector(0)', ), panel.newInstantQuery( legendFormat='Exited', - expr='sum(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace", health_type="exited"}) or vector(0)', + expr='sum(alloy_component_controller_running_components{' + $._config.groupSelector + ', health_type="exited"}) or vector(0)', ), ]) ), @@ -194,7 +196,7 @@ local filename = 'alloy-controller.json'; panel.withMultiTooltip() + panel.withQueries([ panel.newQuery( - expr='sum by (instance) (rate(alloy_component_evaluation_seconds_count{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))', + expr='sum by (instance) (rate(alloy_component_evaluation_seconds_count{' + $._config.groupSelector + '}[$__rate_interval]))', ), ]) ), @@ -218,33 +220,30 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 8, y: 12, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - histogram_quantile(0.99, sum(rate(alloy_component_evaluation_seconds{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))) + expr= + 'histogram_quantile(0.99, 
sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval]))) or - histogram_quantile(0.99, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))) - |||, + histogram_quantile(0.99, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval])))', legendFormat='99th percentile', ), panel.newQuery( - expr=||| - histogram_quantile(0.50, sum(rate(alloy_component_evaluation_seconds{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))) + expr= + 'histogram_quantile(0.50, sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval]))) or - histogram_quantile(0.50, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))) - |||, + histogram_quantile(0.50, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval])))', legendFormat='50th percentile', ), panel.newQuery( - expr=||| - ( - histogram_sum(sum(rate(alloy_component_evaluation_seconds{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))) / - histogram_count(sum(rate(alloy_component_evaluation_seconds{cluster="$cluster",namespace="$namespace"}[$__rate_interval]))) + expr= + '( + histogram_sum(sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval]))) / + histogram_count(sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval]))) ) or ( - sum(rate(alloy_component_evaluation_seconds_sum{cluster="$cluster",namespace="$namespace"}[$__rate_interval])) / - sum(rate(alloy_component_evaluation_seconds_count{cluster="$cluster",namespace="$namespace"}[$__rate_interval])) - ) - |||, + sum(rate(alloy_component_evaluation_seconds_sum{' + $._config.groupSelector + '}[$__rate_interval])) / + 
sum(rate(alloy_component_evaluation_seconds_count{' + $._config.groupSelector + '}[$__rate_interval])) + )', legendFormat='Average', ), ]) @@ -263,10 +262,9 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 16, y: 12, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - sum by (component_path, component_id) (rate(alloy_component_evaluation_slow_seconds{cluster="$cluster", namespace="$namespace"}[$__rate_interval])) - / scalar(sum(rate(alloy_component_evaluation_seconds_sum{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))) - |||, + expr= + 'sum by (component_path, component_id) (rate(alloy_component_evaluation_slow_seconds{' + $._config.groupSelector + '}[$__rate_interval])) + / scalar(sum(rate(alloy_component_evaluation_seconds_sum{' + $._config.groupSelector + '}[$__rate_interval])))', legendFormat='{{component path}} {{component_id}}', ), ]) @@ -286,11 +284,10 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 0, y: 22, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - sum(increase(alloy_component_evaluation_seconds{cluster="$cluster", namespace="$namespace"}[$__rate_interval])) + expr= + 'sum(increase(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval])) or ignoring (le) - sum by (le) (increase(alloy_component_evaluation_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval])) - |||, + sum by (le) (increase(alloy_component_evaluation_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval]))', format='heatmap', legendFormat='{{le}}', ), @@ -311,11 +308,10 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 8, y: 22, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - sum(increase(alloy_component_dependencies_wait_seconds{cluster="$cluster", namespace="$namespace"}[$__rate_interval])) + expr= + 'sum(increase(alloy_component_dependencies_wait_seconds{' + $._config.groupSelector + 
'}[$__rate_interval])) or ignoring (le) - sum by (le) (increase(alloy_component_dependencies_wait_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval])) - |||, + sum by (le) (increase(alloy_component_dependencies_wait_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval]))', format='heatmap', legendFormat='{{le}}', ), diff --git a/operations/alloy-mixin/dashboards/opentelemetry.libsonnet b/operations/alloy-mixin/dashboards/opentelemetry.libsonnet index 04aa577186..240558ee8d 100644 --- a/operations/alloy-mixin/dashboards/opentelemetry.libsonnet +++ b/operations/alloy-mixin/dashboards/opentelemetry.libsonnet @@ -16,19 +16,22 @@ local stackedPanelMixin = { { [filename]: - dashboard.new(name='Alloy / OpenTelemetry') + - dashboard.withDashboardsLink() + + dashboard.new(name='Alloy / OpenTelemetry', tag=$._config.dashboardTag) + + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('cluster', ||| - label_values(alloy_component_controller_running_components, cluster) - |||), - dashboard.newTemplateVariable('namespace', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster"}, namespace) - |||), - dashboard.newMultiTemplateVariable('instance', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace"}, instance) - |||), + dashboard.newTemplateVariable('job', + 'label_values(alloy_component_controller_running_components, job)' + ), + dashboard.newTemplateVariable('cluster', + 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + '}, cluster)' + ), + dashboard.newTemplateVariable('namespace', + 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + ', cluster="$cluster"}, namespace)' + ), + dashboard.newTemplateVariable('instance', + 
'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + ', cluster="$cluster", namespace="$namespace"}, instance)' + ), ]) + dashboard.withPanelsMixin([ // "Receivers for traces" row @@ -45,9 +48,7 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 0, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - rate(receiver_accepted_spans_ratio_total{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval]) - |||, + expr='rate(receiver_accepted_spans_ratio_total{' + $._config.instanceSelector + '}[$__rate_interval])', //TODO: How will the dashboard look if there is more than one receiver component? The legend is not unique enough? legendFormat='{{ pod }} / {{ transport }}', ), @@ -63,9 +64,7 @@ local stackedPanelMixin = { panel.withPosition({ x: 8, y: 0, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - rate(receiver_refused_spans_ratio_total{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval]) - |||, + expr='rate(receiver_refused_spans_ratio_total{' + $._config.instanceSelector + '}[$__rate_interval])', legendFormat='{{ pod }} / {{ transport }}', ), ]) @@ -78,7 +77,7 @@ local stackedPanelMixin = { panel.withPosition({ x: 16, y: 0, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr='sum by (le) (increase(rpc_server_duration_milliseconds_bucket{cluster="$cluster", namespace="$namespace", instance=~"$instance", rpc_service="opentelemetry.proto.collector.trace.v1.TraceService"}[$__rate_interval]))', + expr='sum by (le) (increase(rpc_server_duration_milliseconds_bucket{' + $._config.instanceSelector + ', rpc_service="opentelemetry.proto.collector.trace.v1.TraceService"}[$__rate_interval]))', format='heatmap', legendFormat='{{le}}', ), @@ -99,7 +98,7 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 10, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr='sum by (le) 
(increase(processor_batch_batch_send_size_ratio_bucket{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval]))', + expr='sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{' + $._config.instanceSelector + '}[$__rate_interval]))', format='heatmap', legendFormat='{{le}}', ), @@ -116,9 +115,7 @@ local stackedPanelMixin = { panel.withPosition({ x: 8, y: 10, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - processor_batch_metadata_cardinality_ratio{cluster="$cluster", namespace="$namespace", instance=~"$instance"} - |||, + expr='processor_batch_metadata_cardinality_ratio{' + $._config.instanceSelector + '}', legendFormat='{{ pod }}', ), ]) @@ -131,9 +128,7 @@ local stackedPanelMixin = { panel.withPosition({ x: 16, y: 10, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - rate(processor_batch_timeout_trigger_send_ratio_total{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval]) - |||, + expr='rate(processor_batch_timeout_trigger_send_ratio_total{' + $._config.instanceSelector + '}[$__rate_interval])', legendFormat='{{ pod }}', ), ]) @@ -153,9 +148,7 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 20, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - rate(exporter_sent_spans_ratio_total{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval]) - |||, + expr='rate(exporter_sent_spans_ratio_total{' + $._config.instanceSelector + '}[$__rate_interval])', legendFormat='{{ pod }}', ), ]) @@ -169,9 +162,7 @@ local stackedPanelMixin = { panel.withPosition({ x: 8, y: 20, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr=||| - rate(exporter_send_failed_spans_ratio_total{cluster="$cluster", namespace="$namespace", instance=~"$instance"}[$__rate_interval]) - |||, + expr='rate(exporter_send_failed_spans_ratio_total{' + $._config.instanceSelector + '}[$__rate_interval])', legendFormat='{{ pod }}', ), ]) diff 
--git a/operations/alloy-mixin/dashboards/prometheus.libsonnet b/operations/alloy-mixin/dashboards/prometheus.libsonnet index e54b28bd08..439f861ae7 100644 --- a/operations/alloy-mixin/dashboards/prometheus.libsonnet +++ b/operations/alloy-mixin/dashboards/prometheus.libsonnet @@ -389,12 +389,12 @@ local remoteWritePanels(y_offset) = [ { [filename]: - dashboard.new(name='Alloy / Prometheus Components') + + dashboard.new(name='Alloy / Prometheus Components', tag=$._config.dashboardTag) + dashboard.withDocsLink( url='https://grafana.com/docs/alloy/latest/reference/components/prometheus.remote_write/', desc='Component documentation', ) + - dashboard.withDashboardsLink() + + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + dashboard.withTemplateVariablesMixin([ dashboard.newTemplateVariable('cluster', ||| diff --git a/operations/alloy-mixin/dashboards/resources.libsonnet b/operations/alloy-mixin/dashboards/resources.libsonnet index 8d38b7c789..ce30e976eb 100644 --- a/operations/alloy-mixin/dashboards/resources.libsonnet +++ b/operations/alloy-mixin/dashboards/resources.libsonnet @@ -28,8 +28,8 @@ local stackedPanelMixin = { { [filename]: - dashboard.new(name='Alloy / Resources') + - dashboard.withDashboardsLink() + + dashboard.new(name='Alloy / Resources', tag=$._config.dashboardTag) + + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + dashboard.withTemplateVariablesMixin([ dashboard.newTemplateVariable('cluster', ||| diff --git a/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet b/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet index 40b5f85ced..1c61914d19 100644 --- a/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet +++ b/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet @@ -2,13 +2,13 @@ // schemaVersion present in Grafana 9. 
{ - new(name=''):: { + new(name='', tag='alloy-mixin'):: { title: name, timezone: 'utc', refresh: '10s', schemaVersion: 36, graphTooltip: 1, // shared crosshair for all graphs - tags: ['alloy-mixin'], + tags: [tag], templating: { list: [{ name: 'datasource', @@ -114,7 +114,7 @@ }], }, - withDashboardsLink():: { + withDashboardsLink(tag='alloy-mixin'):: { links+: [{ title: 'Dashboards', type: 'dashboards', @@ -122,7 +122,7 @@ icon: 'external link', includeVars: true, keepTime: true, - tags: ['alloy-mixin'], + tags: [tag], targetBlank: false, }], }, diff --git a/operations/alloy-mixin/jsonnetfile.json b/operations/alloy-mixin/jsonnetfile.json index 4388812ca2..fb40c35599 100644 --- a/operations/alloy-mixin/jsonnetfile.json +++ b/operations/alloy-mixin/jsonnetfile.json @@ -1,5 +1,33 @@ { "version": 1, - "dependencies": [], + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v10.0.0" + } + }, + "version": "main" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib/" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib/logs" + } + }, + "version": "master" + } + ], "legacyImports": true } diff --git a/operations/alloy-mixin/jsonnetfile.lock.json b/operations/alloy-mixin/jsonnetfile.lock.json new file mode 100644 index 0000000000..97201a3be0 --- /dev/null +++ b/operations/alloy-mixin/jsonnetfile.lock.json @@ -0,0 +1,56 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v10.0.0" + } + }, + "version": "1c56af39815c4903e47c27194444456f005f65df", + "sum": "xdcrJPJlpkq4+5LpGwN4tPAuheNNLXZjE6tDcyvFjr0=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib/" + } + }, + "version": 
"21526e83f442793d5a0c5969867d123915422b79", + "sum": "IkBo9nj0Qt1eC9w80dO5SI4yvHzmmXcKx5BK8H8U0Mk=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib/logs" + } + }, + "version": "21526e83f442793d5a0c5969867d123915422b79", + "sum": "CemcPbsPzyRUchDLH1TKTxWWgBlg1MRT0jH2X172z6w=" + }, + { + "source": { + "git": { + "remote": "https://github.com/jsonnet-libs/docsonnet.git", + "subdir": "doc-util" + } + }, + "version": "6ac6c69685b8c29c54515448eaca583da2d88150", + "sum": "BrAL/k23jq+xy9oA7TWIhUx07dsA/QLm3g7ktCwe//U=" + }, + { + "source": { + "git": { + "remote": "https://github.com/jsonnet-libs/xtd.git", + "subdir": "" + } + }, + "version": "63d430b69a95741061c2f7fc9d84b1a778511d9c", + "sum": "qiZi3axUSXCVzKUF83zSAxklwrnitMmrDK4XAfjPMdE=" + } + ], + "legacyImports": false +} diff --git a/operations/alloy-mixin/mixin.libsonnet b/operations/alloy-mixin/mixin.libsonnet index 741c943035..cd32e269c6 100644 --- a/operations/alloy-mixin/mixin.libsonnet +++ b/operations/alloy-mixin/mixin.libsonnet @@ -1,3 +1,4 @@ { grafanaDashboardFolder: 'Alloy' } + (import './dashboards.libsonnet') + -(import './alerts.libsonnet') +(import './alerts.libsonnet') + +(import './config.libsonnet') From c6f356f3ce00abda7d193da05d457fbf181ad4e3 Mon Sep 17 00:00:00 2001 From: Gabriel Antunes Date: Wed, 8 May 2024 19:01:32 -0300 Subject: [PATCH 2/6] updating template variables --- .../alloy-mixin/alloy-cluster-node.json | 502 ------------ .../alloy-mixin/alloy-cluster-overview.json | 350 -------- operations/alloy-mixin/alloy-controller.json | 542 ------------- operations/alloy-mixin/alloy-logs.json | 306 ------- .../alloy-mixin/alloy-opentelemetry.json | 420 ---------- .../alloy-prometheus-remote-write.json | 655 --------------- operations/alloy-mixin/alloy-resources.json | 329 -------- operations/alloy-mixin/config.libsonnet | 6 +- .../dashboards/cluster-node.libsonnet | 29 +- .../dashboards/cluster-overview.libsonnet | 24 +- 
.../dashboards/controller.libsonnet | 25 +- .../dashboards/opentelemetry.libsonnet | 29 +- .../dashboards/prometheus.libsonnet | 757 +++++++++--------- .../dashboards/resources.libsonnet | 66 +- .../dashboards/utils/dashboard.jsonnet | 4 +- 15 files changed, 474 insertions(+), 3570 deletions(-) delete mode 100644 operations/alloy-mixin/alloy-cluster-node.json delete mode 100644 operations/alloy-mixin/alloy-cluster-overview.json delete mode 100644 operations/alloy-mixin/alloy-controller.json delete mode 100644 operations/alloy-mixin/alloy-logs.json delete mode 100644 operations/alloy-mixin/alloy-opentelemetry.json delete mode 100644 operations/alloy-mixin/alloy-prometheus-remote-write.json delete mode 100644 operations/alloy-mixin/alloy-resources.json diff --git a/operations/alloy-mixin/alloy-cluster-node.json b/operations/alloy-mixin/alloy-cluster-node.json deleted file mode 100644 index 12084f1628..0000000000 --- a/operations/alloy-mixin/alloy-cluster-node.json +++ /dev/null @@ -1,502 +0,0 @@ -{ - "annotations": { - "list": [ - { - "datasource": "$loki_datasource", - "enable": true, - "expr": "{cluster=\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"alloy\" | name_extracted=~\"alloy.*\"", - "iconColor": "rgba(0, 211, 255, 1)", - "instant": false, - "name": "Deployments", - "titleFormat": "{{cluster}}/{{namespace}}" - } - ] - }, - "graphTooltip": 1, - "links": [ - { - "icon": "doc", - "targetBlank": true, - "title": "Documentation", - "tooltip": "Clustering documentation", - "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" - }, - { - "asDropdown": true, - "icon": "external link", - "includeVars": true, - "keepTime": true, - "tags": [ - "alloy-mixin" - ], - "targetBlank": false, - "title": "Dashboards", - "type": "dashboards" - } - ], - "panels": [ - { - "datasource": "${datasource}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "title": "Node Info", - "type": "row" - }, 
- { - "datasource": "${datasource}", - "description": "Information about a specific cluster node.\n\n* Lamport clock time: The observed Lamport time on the specific node's clock used to provide partial ordering around gossip messages. Nodes should ideally be observing roughly the same time, meaning they are up-to-date on the cluster state. If a node is falling behind, it means that it has not recently processed the same number of messages and may have an outdated view of its peers.\n\n* Internal cluster state observers: The number of Observer functions that are registered to run whenever the node detects a cluster change.\n\n* Gossip health score: A health score assigned to this node by the memberlist implementation. The lower, the better.\n\n* Gossip protocol version: The protocol version used by nodes to communicate with one another. It should match across all nodes.\n", - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 1 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum(cluster_node_lamport_time{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"})", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "Lamport clock time" - }, - { - "datasource": "${datasource}", - "expr": "sum(cluster_node_update_observers{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"})", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "Internal cluster state observers" - }, - { - "datasource": "${datasource}", - "expr": "sum(cluster_node_gossip_health_score{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"})", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "Gossip health score" - }, - { - "datasource": "${datasource}", - "expr": "sum(cluster_node_gossip_proto_version{job=\"$job\", cluster=\"$cluster\", 
namespace=\"$namespace\", , instance=\"$instance\"})", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false, - "refId": "Gossip protocol version" - } - ], - "title": "Node Info", - "transformations": [ - { - "id": "renameByRegex", - "options": { - "regex": "Value #(.*)", - "renamePattern": "$1" - } - }, - { - "id": "reduce", - "options": { } - }, - { - "id": "organize", - "options": { - "excludeByName": { }, - "indexByName": { }, - "renameByName": { - "Field": "Metric", - "Max": "Value" - } - } - } - ], - "type": "table" - }, - { - "datasource": "${datasource}", - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 1 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "rate(cluster_node_gossip_received_events_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])", - "instant": false, - "legendFormat": "{{event}}", - "range": true - } - ], - "title": "Gossip ops/s", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Known peers to the node (including the local node).\n", - "fieldConfig": { - "defaults": { - "unit": "suffix:peers" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 9 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum(cluster_node_peers{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"})", - "instant": false, - "legendFormat": "__auto", - "range": true - } - ], - "title": "Known peers", - "type": "stat" - }, - { - "datasource": "${datasource}", - "description": "Known peers to the node by state (including the local node).\n", - "fieldConfig": { - "defaults": { - "unit": "suffix:nodes" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 9 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "cluster_node_peers{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}", - "instant": false, - 
"legendFormat": "{{state}}", - "range": true - } - ], - "title": "Peers by state", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 17 - }, - "title": "Gossip Transport", - "type": "row" - }, - { - "datasource": "${datasource}", - "fieldConfig": { - "defaults": { - "custom": { - "axisCenteredZero": true - }, - "unit": "Bps" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 18 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "rate(cluster_transport_rx_bytes_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])", - "instant": false, - "legendFormat": "rx", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "-1 * rate(cluster_transport_tx_bytes_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])", - "instant": false, - "legendFormat": "tx", - "range": true - } - ], - "title": "Transport bandwidth", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "fieldConfig": { - "defaults": { - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 18 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "1 - (\n rate(cluster_transport_tx_packets_failed_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval]) /\n rate(cluster_transport_tx_packets_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])\n )", - "instant": false, - "legendFormat": "Tx success %", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "1 - ( \\\n rate(cluster_transport_rx_packets_failed_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval]) / \\\n rate(cluster_transport_rx_packets_total{job=\"$job\", cluster=\"$cluster\", 
namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval]) \\\n )", - "instant": false, - "legendFormat": "Rx success %", - "range": true - } - ], - "title": "Packet write success rate", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "The number of packets enqueued currently to be decoded or encoded and sent during communication with other nodes.\n\nThe incoming and outgoing packet queue should be as empty as possible; a growing queue means that Alloy cannot keep up with the number of messages required to have all nodes informed of cluster changes, and the nodes may not converge in a timely fashion.\n", - "fieldConfig": { - "defaults": { - "unit": "pkts" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 18 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "cluster_transport_tx_packet_queue_length{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}", - "instant": false, - "legendFormat": "tx queue", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "cluster_transport_rx_packet_queue_length{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}", - "instant": false, - "legendFormat": "rx queue", - "range": true - } - ], - "title": "Pending packet queue", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "fieldConfig": { - "defaults": { - "custom": { - "axisCenteredZero": true - }, - "unit": "Bps" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 26 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "rate(cluster_transport_stream_rx_bytes_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])", - "instant": false, - "legendFormat": "rx", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "-1 * rate(cluster_transport_stream_tx_bytes_total{job=\"$job\", cluster=\"$cluster\", 
namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])", - "instant": false, - "legendFormat": "tx", - "range": true - } - ], - "title": "Stream bandwidth", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "fieldConfig": { - "defaults": { - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 26 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "1 - (\n rate(cluster_transport_stream_tx_packets_failed_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval]) /\n rate(cluster_transport_stream_tx_packets_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])\n )", - "instant": false, - "legendFormat": "Tx success %", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "1 - (\n rate(cluster_transport_stream_rx_packets_failed_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval]) /\n rate(cluster_transport_stream_rx_packets_total{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}[$__rate_interval])\n )", - "instant": false, - "legendFormat": "Rx success %", - "range": true - } - ], - "title": "Stream write success rate", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "The number of open connections from this node to its peers.\n\nEach node picks up a subset of its peers to continuously gossip messages around cluster status using streaming HTTP/2 connections. 
This panel can be used to detect networking failures that result in cluster communication being disrupted and convergence taking longer than expected or outright failing.\n", - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 26 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "cluster_transport_streams{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , instance=\"$instance\"}", - "instant": false, - "legendFormat": "Open streams", - "range": true - } - ], - "title": "Open transport streams", - "type": "timeseries" - } - ], - "refresh": "10s", - "schemaVersion": 36, - "tags": [ - "alloy-mixin" - ], - "templating": { - "list": [ - { - "label": "Data Source", - "name": "datasource", - "query": "prometheus", - "refresh": 1, - "sort": 2, - "type": "datasource" - }, - { - "label": "Loki Data Source", - "name": "loki_datasource", - "query": "loki", - "refresh": 1, - "sort": 2, - "type": "datasource" - }, - { - "datasource": "${datasource}", - "label": "job", - "name": "job", - "query": { - "query": "label_values(alloy_component_controller_running_components, job)", - "refId": "job" - }, - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "datasource": "${datasource}", - "label": "cluster", - "name": "cluster", - "query": { - "query": "label_values(alloy_component_controller_running_components{job=\"$job\"}, cluster)", - "refId": "cluster" - }, - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "datasource": "${datasource}", - "label": "namespace", - "name": "namespace", - "query": { - "query": "label_values(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\"}, namespace)", - "refId": "namespace" - }, - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "datasource": "${datasource}", - "label": "instance", - "name": "instance", - "query": { - "query": "label_values(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\"}, instance)", - 
"refId": "instance" - }, - "refresh": 2, - "sort": 2, - "type": "query" - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d", - "90d" - ] - }, - "timezone": "utc", - "title": "Alloy / Cluster Node", - "uid": "4047e755d822da63c8158cde32ae4dce" - } \ No newline at end of file diff --git a/operations/alloy-mixin/alloy-cluster-overview.json b/operations/alloy-mixin/alloy-cluster-overview.json deleted file mode 100644 index b3a68e8fcb..0000000000 --- a/operations/alloy-mixin/alloy-cluster-overview.json +++ /dev/null @@ -1,350 +0,0 @@ -{ - "annotations": { - "list": [ - { - "datasource": "$loki_datasource", - "enable": true, - "expr": "{cluster=\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"alloy\" | name_extracted=~\"alloy.*\"", - "iconColor": "rgba(0, 211, 255, 1)", - "instant": false, - "name": "Deployments", - "titleFormat": "{{cluster}}/{{namespace}}" - } - ] - }, - "graphTooltip": 1, - "links": [ - { - "icon": "doc", - "targetBlank": true, - "title": "Documentation", - "tooltip": "Clustering documentation", - "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" - }, - { - "asDropdown": true, - "icon": "external link", - "includeVars": true, - "keepTime": true, - "tags": [ - "alloy-mixin" - ], - "targetBlank": false, - "title": "Dashboards", - "type": "dashboards" - } - ], - "panels": [ - { - "datasource": "${datasource}", - "gridPos": { - "h": 9, - "w": 8, - "x": 0, - "y": 0 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "count(cluster_node_info{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", })", - "instant": true, - "legendFormat": "__auto", - "range": false - } - ], - "title": "Nodes", - "type": "stat" - }, - { - 
"datasource": "${datasource}", - "description": "Nodes info.\n", - "fieldConfig": { - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Dashboard" - }, - "properties": [ - { - "id": "mappings", - "value": [ - { - "options": { - "1": { - "index": 0, - "text": "Link" - } - }, - "type": "value" - } - ] - }, - { - "id": "links", - "value": [ - { - "targetBlank": false, - "title": "Detail dashboard for node", - "url": "/d/4047e755d822da63c8158cde32ae4dce/alloy-cluster-node?var-instance=${__data.fields.instance}&var-datasource=${datasource}&var-loki_datasource=${loki_datasource}&var-job=${job}&var-cluster=${cluster}&var-namespace=${namespace}" - } - ] - } - ] - } - ] - }, - "gridPos": { - "h": 9, - "w": 16, - "x": 8, - "y": 0 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "cluster_node_info{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", }", - "format": "table", - "instant": true, - "legendFormat": "__auto", - "range": false - } - ], - "title": "Node table", - "transformations": [ - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true, - "Value": false, - "__name__": true, - "cluster": true, - "namespace": true, - "state": false - }, - "indexByName": { }, - "renameByName": { - "Value": "Dashboard", - "instance": "", - "state": "" - } - } - } - ], - "type": "table" - }, - { - "datasource": "${datasource}", - "description": "Whether the cluster state has converged.\n\nIt is normal for the cluster state to be diverged briefly as gossip events propagate. 
It is not normal for the cluster state to be diverged for a long period of time.\n\nThis will show one of the following:\n\n* Converged: Nodes are aware of all other nodes, with the correct states.\n* Not converged: A subset of nodes aren't aware of their peers, or don't have an updated view of peer states.\n", - "fieldConfig": { - "defaults": { - "mappings": [ - { - "options": { - "1": { - "color": "red", - "index": 1, - "text": "Not converged" - } - }, - "type": "value" - }, - { - "options": { - "match": "null", - "result": { - "color": "green", - "index": 0, - "text": "Converged" - } - }, - "type": "special" - } - ], - "unit": "suffix:nodes" - } - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 0, - "y": 9 - }, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "textMode": "auto" - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "clamp((\n sum(stddev by (state) (cluster_node_peers{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", }) != 0) or\n (sum(abs(sum without (state) (cluster_node_peers{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", })) - scalar(count(cluster_node_info{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", })) != 0))\n ),\n 1, 1\n )", - "format": "time_series", - "instant": true, - "legendFormat": "__auto", - "range": false - } - ], - "title": "Convergance state", - "type": "stat" - }, - { - "datasource": "${datasource}", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 80, - "spanNulls": true - }, - "mappings": [ - { - "options": { - "0": { - "color": "green", - "text": "Yes" - } - }, - "type": "value" - }, - { - "options": { - "1": { - "color": "red", - "text": "No" - } - }, - "type": "value" - } - ], - "max": 1, - "noValue": 0 - } - }, - "gridPos": { - "h": 9, - "w": 16, - "x": 8, - "y": 9 - }, - "options": { - 
"mergeValues": true - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "ceil(clamp((\n sum(stddev by (state) (cluster_node_peers{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", })) or\n (sum(abs(sum without (state) (cluster_node_peers{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", })) - scalar(count(cluster_node_info{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", }))))\n ),\n 0, 1\n ))", - "instant": false, - "legendFormat": "Converged", - "range": true - } - ], - "title": "Convergance state timeline", - "type": "state-timeline" - } - ], - "refresh": "10s", - "schemaVersion": 36, - "tags": [ - "alloy-mixin" - ], - "templating": { - "list": [ - { - "label": "Data Source", - "name": "datasource", - "query": "prometheus", - "refresh": 1, - "sort": 2, - "type": "datasource" - }, - { - "label": "Loki Data Source", - "name": "loki_datasource", - "query": "loki", - "refresh": 1, - "sort": 2, - "type": "datasource" - }, - { - "datasource": "${datasource}", - "label": "job", - "name": "job", - "query": { - "query": "label_values(alloy_component_controller_running_components, job)", - "refId": "job" - }, - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "datasource": "${datasource}", - "label": "cluster", - "name": "cluster", - "query": { - "query": "label_values(alloy_component_controller_running_components{job=\"$job\"}, cluster)", - "refId": "cluster" - }, - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "datasource": "${datasource}", - "label": "namespace", - "name": "namespace", - "query": { - "query": "label_values(alloy_component_controller_running_components{job=\"$job\"cluster=\"$cluster\"}, namespace)", - "refId": "namespace" - }, - "refresh": 2, - "sort": 2, - "type": "query" - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - 
"5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d", - "90d" - ] - }, - "timezone": "utc", - "title": "Alloy / Cluster Overview", - "uid": "3a6b7020692f53d8e53b49196f7637dd" - } \ No newline at end of file diff --git a/operations/alloy-mixin/alloy-controller.json b/operations/alloy-mixin/alloy-controller.json deleted file mode 100644 index 1385c21d00..0000000000 --- a/operations/alloy-mixin/alloy-controller.json +++ /dev/null @@ -1,542 +0,0 @@ -{ - "annotations": { - "list": [ - { - "datasource": "$loki_datasource", - "enable": true, - "expr": "{cluster=\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"alloy\" | name_extracted=~\"alloy.*\"", - "iconColor": "rgba(0, 211, 255, 1)", - "instant": false, - "name": "Deployments", - "titleFormat": "{{cluster}}/{{namespace}}" - } - ] - }, - "graphTooltip": 1, - "links": [ - { - "icon": "doc", - "targetBlank": true, - "title": "Documentation", - "tooltip": "Component controller documentation", - "type": "link", - "url": "https://grafana.com/docs/alloy/latest/concepts/component_controller/" - }, - { - "asDropdown": true, - "icon": "external link", - "includeVars": true, - "keepTime": true, - "tags": [ - "alloy-mixin" - ], - "targetBlank": false, - "title": "Dashboards", - "type": "dashboards" - } - ], - "panels": [ - { - "datasource": "${datasource}", - "description": "The number of Alloy instances whose metrics are being sent and reported.\n", - "fieldConfig": { - "defaults": { - "unit": "instances" - } - }, - "gridPos": { - "h": 4, - "w": 10, - "x": 0, - "y": 0 - }, - "options": { - "colorMode": "none", - "graphMode": "none" - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", })", - "instant": false, - "legendFormat": "__auto", - "range": true - } - ], - "title": "Running instances", - "type": "stat" - }, - { - "datasource": "${datasource}", - 
"description": "The number of running components across all running instances.\n", - "fieldConfig": { - "defaults": { - "unit": "components" - } - }, - "gridPos": { - "h": 4, - "w": 10, - "x": 0, - "y": 4 - }, - "options": { - "colorMode": "none", - "graphMode": "none" - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", })", - "instant": false, - "legendFormat": "__auto", - "range": true - } - ], - "title": "Running components", - "type": "stat" - }, - { - "datasource": "${datasource}", - "description": "The percentage of components which are in a healthy state.\n", - "fieldConfig": { - "defaults": { - "max": 1, - "min": 0, - "noValue": "No components", - "unit": "percentunit" - } - }, - "gridPos": { - "h": 4, - "w": 10, - "x": 0, - "y": 8 - }, - "options": { - "colorMode": "value", - "graphMode": "area", - "text": { - "valueSize": 80 - } - }, - "pluginVersion": "9.0.6", - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum(alloy_component_controller_running_components{' + $._config.groupSelector + ',health_type=\"healthy\"}) /\nsum(alloy_component_controller_running_components{' + $._config.groupSelector + '})\n", - "instant": false, - "legendFormat": "__auto", - "range": true - } - ], - "title": "Overall component health", - "type": "stat" - }, - { - "datasource": "${datasource}", - "description": "Breakdown of components by health across all running instances.\n\n* Healthy: components have been evaluated completely and are reporting themselves as healthy.\n* Unhealthy: Components either could not be evaluated or are reporting themselves as unhealthy.\n* Unknown: A component has been created but has not yet been started.\n* Exited: A component has exited. 
It will not return to the running state.\n\nMore information on a component's health state can be retrieved using\nthe Alloy UI.\n\nNote that components may be in a degraded state even if they report\nthemselves as healthy. Use component-specific dashboards and alerts\nto observe detailed information about the behavior of a component.\n", - "fieldConfig": { - "defaults": { - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Unhealthy" - }, - "properties": [ - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 1 - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Unknown" - }, - "properties": [ - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "blue", - "value": 1 - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Exited" - }, - "properties": [ - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 1 - } - ] - } - } - ] - } - ] - }, - "gridPos": { - "h": 12, - "w": 14, - "x": 10, - "y": 0 - }, - "options": { - "orientation": "vertical", - "showUnfilled": true - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , health_type=\"healthy\"}) or vector(0)", - "instant": true, - "legendFormat": "Healthy", - "range": false - }, - { - "datasource": "${datasource}", - "expr": "sum(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , health_type=\"unhealthy\"}) or vector(0)", - "instant": true, - "legendFormat": "Unhealthy", - 
"range": false - }, - { - "datasource": "${datasource}", - "expr": "sum(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , health_type=\"unknown\"}) or vector(0)", - "instant": true, - "legendFormat": "Unknown", - "range": false - }, - { - "datasource": "${datasource}", - "expr": "sum(alloy_component_controller_running_components{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", , health_type=\"exited\"}) or vector(0)", - "instant": true, - "legendFormat": "Exited", - "range": false - } - ], - "title": "Components by health", - "type": "bargauge" - }, - { - "datasource": "${datasource}", - "description": "The frequency at which components get updated.\n", - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "points", - "pointSize": 3 - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 0, - "y": 12 - }, - "options": { - "tooltip": { - "mode": "multi" - } - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum by (instance) (rate(alloy_component_evaluation_seconds_count{job=\"$job\", cluster=\"$cluster\", namespace=\"$namespace\", }[$__rate_interval]))", - "instant": false, - "legendFormat": "__auto", - "range": true - } - ], - "title": "Component evaluation rate", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "The percentiles for how long it takes to complete component evaluations.\n\nComponent evaluations must complete for components to have the latest\narguments. 
The longer the evaluations take, the slower it will be to\nreconcile the state of components.\n\nIf evaluation is taking too long, consider sharding your components to\ndeal with smaller amounts of data and reuse data as much as possible.\n", - "fieldConfig": { - "defaults": { - "unit": "s" - } - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 8, - "y": 12 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval])))\nor\nhistogram_quantile(0.99, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval])))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval])))\nor\nhistogram_quantile(0.50, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval])))\n", - "instant": false, - "legendFormat": "50th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "(\n histogram_sum(sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval]))) /\n histogram_count(sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval])))\n)\nor\n(\n sum(rate(alloy_component_evaluation_seconds_sum{' + $._config.groupSelector + '}[$__rate_interval])) /\n sum(rate(alloy_component_evaluation_seconds_count{' + $._config.groupSelector + '}[$__rate_interval]))\n)\n", - "instant": false, - "legendFormat": "Average", - "range": true - } - ], - "title": "Component evaluation time", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "The percentage of time spent evaluating 'slow' components - components that took longer than 1 minute to 
evaluate.\n\nIdeally, no component should take more than 1 minute to evaluate. The components displayed in this chart\nmay be a sign of a problem with the pipeline.\n", - "fieldConfig": { - "defaults": { - "unit": "percentunit" - } - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 16, - "y": 12 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum by (component_path, component_id) (rate(alloy_component_evaluation_slow_seconds{' + $._config.groupSelector + '}[$__rate_interval]))\n/ scalar(sum(rate(alloy_component_evaluation_seconds_sum{' + $._config.groupSelector + '}[$__rate_interval])))\n", - "instant": false, - "legendFormat": "{{component path}} {{component_id}}", - "range": true - } - ], - "title": "Slow components evaluation times", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Detailed histogram view of how long component evaluations take.\n\nThe goal is to design your config so that evaluations take as little\ntime as possible; under 100ms is a good goal.\n", - "gridPos": { - "h": 10, - "w": 8, - "x": 0, - "y": 22 - }, - "maxDataPoints": 30, - "options": { - "calculate": false, - "cellGap": 0, - "color": { - "scheme": "Spectral" - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 0.10000000000000001 - }, - "tooltip": { - "show": true, - "yHistogram": true - }, - "yAxis": { - "unit": "s" - } - }, - "pluginVersion": "9.0.6", - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum(increase(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval]))\nor ignoring (le)\nsum by (le) (increase(alloy_component_evaluation_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval]))\n", - "format": "heatmap", - "instant": false, - "legendFormat": "{{le}}", - "range": true - } - ], - "title": "Component evaluation histogram", - "type": "heatmap" - }, - { - "datasource": "${datasource}", - "description": "Detailed histogram of how long 
components wait to be evaluated after their dependency is updated.\n\nThe goal is to design your config so that most of the time components do not\nqueue for long; under 10ms is a good goal.\n", - "gridPos": { - "h": 10, - "w": 8, - "x": 8, - "y": 22 - }, - "maxDataPoints": 30, - "options": { - "calculate": false, - "cellGap": 0, - "color": { - "scheme": "Spectral" - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 0.10000000000000001 - }, - "tooltip": { - "show": true, - "yHistogram": true - }, - "yAxis": { - "unit": "s" - } - }, - "pluginVersion": "9.0.6", - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum(increase(alloy_component_dependencies_wait_seconds{' + $._config.groupSelector + '}[$__rate_interval]))\nor ignoring (le)\nsum by (le) (increase(alloy_component_dependencies_wait_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval]))\n", - "format": "heatmap", - "instant": false, - "legendFormat": "{{le}}", - "range": true - } - ], - "title": "Component dependency wait histogram", - "type": "heatmap" - } - ], - "refresh": "10s", - "schemaVersion": 36, - "tags": [ - "alloy-mixin" - ], - "templating": { - "list": [ - { - "label": "Data Source", - "name": "datasource", - "query": "prometheus", - "refresh": 1, - "sort": 2, - "type": "datasource" - }, - { - "label": "Loki Data Source", - "name": "loki_datasource", - "query": "loki", - "refresh": 1, - "sort": 2, - "type": "datasource" - }, - { - "datasource": "${datasource}", - "label": "cluster", - "name": "cluster", - "query": { - "query": "label_values(alloy_component_controller_running_components, cluster)\n", - "refId": "cluster" - }, - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "datasource": "${datasource}", - "label": "namespace", - "name": "namespace", - "query": { - "query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\"}, namespace)\n", - "refId": "namespace" - }, - "refresh": 2, - "sort": 2, - 
"type": "query" - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d", - "90d" - ] - }, - "timezone": "utc", - "title": "Alloy / Controller", - "uid": "bf9f456aad7108b2c808dbd9973e386f" - } \ No newline at end of file diff --git a/operations/alloy-mixin/alloy-logs.json b/operations/alloy-mixin/alloy-logs.json deleted file mode 100644 index 037571fc24..0000000000 --- a/operations/alloy-mixin/alloy-logs.json +++ /dev/null @@ -1,306 +0,0 @@ -{ - "links": [ - { - "asDropdown": true, - "icon": "external link", - "includeVars": true, - "keepTime": true, - "tags": [ - "alloy-mixin" - ], - "targetBlank": false, - "title": "Dashboards", - "type": "dashboards" - } - ], - "panels": [ - { - "datasource": { - "type": "loki", - "uid": "${loki_datasource}" - }, - "description": "Logs volume grouped by \"level\" label.", - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "bars", - "fillOpacity": 50, - "stacking": { - "mode": "normal" - } - }, - "unit": "none" - }, - "overrides": [ - { - "matcher": { - "id": "byRegexp", - "options": "(E|e)merg|(F|f)atal|(A|a)lert|(C|c)rit.*" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "purple", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "(E|e)(rr.*|RR.*)" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "(W|w)(arn.*|ARN.*|rn|RN)" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "orange", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "(N|n)(otice|ote)|(I|i)(nf.*|NF.*)" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - 
"mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "dbg.*|DBG.*|(D|d)(EBUG|ebug)" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "(T|t)(race|RACE)" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "light-blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byRegexp", - "options": "logs" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "text", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 6, - "w": 24 - }, - "id": 1, - "interval": "30s", - "options": { - "tooltip": { - "mode": "multi", - "sort": "desc" - } - }, - "pluginVersion": "v10.0.0", - "targets": [ - { - "datasource": { - "type": "loki", - "uid": "${loki_datasource}" - }, - "expr": "sum by (level) (count_over_time({job=\"$job\",cluster=~\"$cluster\",namespace=~\"$namespace\",instance=~\"$instance\",level=~\"$level\"}\n|~ \"$regex_search\"\n\n[$__interval]))\n", - "legendFormat": "{{ level }}" - } - ], - "title": "Logs volume", - "transformations": [ - { - "id": "renameByRegex", - "options": { - "regex": "Value", - "renamePattern": "logs" - } - } - ], - "type": "timeseries" - }, - { - "datasource": { - "type": "datasource", - "uid": "-- Mixed --" - }, - "gridPos": { - "h": 18, - "w": 24 - }, - "id": 2, - "options": { - "dedupStrategy": "exact", - "enableLogDetails": true, - "prettifyLogMessage": true, - "showTime": false, - "wrapLogMessage": true - }, - "pluginVersion": "v10.0.0", - "targets": [ - { - "datasource": { - "type": "loki", - "uid": "${loki_datasource}" - }, - "expr": "{job=\"$job\",cluster=~\"$cluster\",namespace=~\"$namespace\",instance=~\"$instance\",level=~\"$level\"} \n|~ \"$regex_search\"\n\n\n" - } - ], - "title": "Logs", - "type": "logs" - } - ], - "refresh": "10s", - "schemaVersion": 36, - "templating": { - "list": [ - { - "label": "Loki data source", - 
"name": "loki_datasource", - "query": "loki", - "regex": "", - "type": "datasource" - }, - { - "allValue": ".*", - "datasource": { - "type": "loki", - "uid": "${loki_datasource}" - }, - "includeAll": true, - "label": "Cluster", - "multi": true, - "name": "cluster", - "query": "label_values({job=\"$job\"}, cluster)", - "refresh": 2, - "sort": 1, - "type": "query" - }, - { - "allValue": ".*", - "datasource": { - "type": "loki", - "uid": "${loki_datasource}" - }, - "includeAll": true, - "label": "Namespace", - "multi": true, - "name": "namespace", - "query": "label_values({job=\"$job\",cluster=~\"$cluster\"}, namespace)", - "refresh": 2, - "sort": 1, - "type": "query" - }, - { - "allValue": ".*", - "datasource": { - "type": "loki", - "uid": "${loki_datasource}" - }, - "includeAll": true, - "label": "Instance", - "multi": true, - "name": "instance", - "query": "label_values({job=\"$job\",cluster=~\"$cluster\",namespace=~\"$namespace\"}, instance)", - "refresh": 2, - "sort": 1, - "type": "query" - }, - { - "allValue": ".*", - "datasource": { - "type": "loki", - "uid": "${loki_datasource}" - }, - "includeAll": true, - "label": "Level", - "multi": true, - "name": "level", - "query": "label_values({job=\"$job\",cluster=~\"$cluster\",namespace=~\"$namespace\",instance=~\"$instance\"}, level)", - "refresh": 2, - "sort": 1, - "type": "query" - }, - { - "current": { - "selected": false, - "text": "", - "value": "" - }, - "label": "Regex search", - "name": "regex_search", - "options": [ - { - "selected": true, - "text": "", - "value": "" - } - ], - "query": "", - "type": "textbox" - } - ] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timezone": "utc", - "title": "Alloy logs overview", - "uid": "alloy-logs-overview" - } \ No newline at end of file diff --git a/operations/alloy-mixin/alloy-opentelemetry.json b/operations/alloy-mixin/alloy-opentelemetry.json deleted file mode 100644 index f17f5c0e90..0000000000 --- a/operations/alloy-mixin/alloy-opentelemetry.json +++ 
/dev/null @@ -1,420 +0,0 @@ -{ - "graphTooltip": 1, - "links": [ - { - "asDropdown": true, - "icon": "external link", - "includeVars": true, - "keepTime": true, - "tags": [ - "alloy-mixin" - ], - "targetBlank": false, - "title": "Dashboards", - "type": "dashboards" - } - ], - "panels": [ - { - "datasource": "${datasource}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "title": "Receivers for traces [otelcol.receiver]", - "type": "row" - }, - { - "datasource": "${datasource}", - "description": "Number of spans successfully pushed into the pipeline.\n", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - } - } - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 0, - "y": 0 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n", - "instant": false, - "legendFormat": "{{ pod }} / {{ transport }}", - "range": true - } - ], - "title": "Accepted spans", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Number of spans that could not be pushed into the pipeline.\n", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - } - } - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 8, - "y": 0 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "rate(receiver_refused_spans_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n", - "instant": false, - "legendFormat": "{{ pod }} / {{ transport }}", - "range": true - } - ], - "title": "Refused spans", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "The duration of inbound RPCs.\n", - "gridPos": { - "h": 10, - "w": 8, - "x": 16, - "y": 0 - }, - "maxDataPoints": 30, - "options": { - 
"calculate": false, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "scale": "exponential", - "scheme": "Oranges", - "steps": 65 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1.0000000000000001e-09 - }, - "tooltip": { - "show": true, - "yHistogram": true - }, - "yAxis": { - "unit": "ms" - } - }, - "pluginVersion": "9.0.6", - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum by (le) (increase(rpc_server_duration_milliseconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", rpc_service=\"opentelemetry.proto.collector.trace.v1.TraceService\"}[$__rate_interval]))", - "format": "heatmap", - "instant": false, - "legendFormat": "{{le}}", - "range": true - } - ], - "title": "RPC server duration", - "type": "heatmap" - }, - { - "datasource": "${datasource}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 10 - }, - "title": "Batching of logs, metrics, and traces [otelcol.processor.batch]", - "type": "row" - }, - { - "datasource": "${datasource}", - "description": "Number of spans, metric datapoints, or log lines in a batch\n", - "fieldConfig": { - "defaults": { - "unit": "short" - } - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 0, - "y": 10 - }, - "maxDataPoints": 30, - "options": { - "calculate": false, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "scale": "exponential", - "scheme": "Oranges", - "steps": 65 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1.0000000000000001e-09 - }, - "tooltip": { - "show": true, - "yHistogram": true - }, - "yAxis": { - "unit": "short" - } - }, - "pluginVersion": "9.0.6", - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval]))", - "format": "heatmap", - "instant": false, - 
"legendFormat": "{{le}}", - "range": true - } - ], - "title": "Number of units in the batch", - "type": "heatmap" - }, - { - "datasource": "${datasource}", - "description": "Number of distinct metadata value combinations being processed\n", - "gridPos": { - "h": 10, - "w": 8, - "x": 8, - "y": 10 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}\n", - "instant": false, - "legendFormat": "{{ pod }}", - "range": true - } - ], - "title": "Distinct metadata values", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Number of times the batch was sent due to a timeout trigger\n", - "gridPos": { - "h": 10, - "w": 8, - "x": 16, - "y": 10 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n", - "instant": false, - "legendFormat": "{{ pod }}", - "range": true - } - ], - "title": "Timeout trigger", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 20 - }, - "title": "Exporters for traces [otelcol.exporter]", - "type": "row" - }, - { - "datasource": "${datasource}", - "description": "Number of spans successfully sent to destination.\n", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - } - } - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 0, - "y": 20 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n", - "instant": false, - "legendFormat": "{{ pod }}", - "range": true - } - ], - "title": "Exported sent spans", - "type": "timeseries" - }, - { - "datasource": 
"${datasource}", - "description": "Number of spans in failed attempts to send to destination.\n", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - } - } - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 8, - "y": 20 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\"}[$__rate_interval])\n", - "instant": false, - "legendFormat": "{{ pod }}", - "range": true - } - ], - "title": "Exported failed spans", - "type": "timeseries" - } - ], - "refresh": "10s", - "schemaVersion": 36, - "tags": [ - "alloy-mixin" - ], - "templating": { - "list": [ - { - "label": "Data Source", - "name": "datasource", - "query": "prometheus", - "refresh": 1, - "sort": 2, - "type": "datasource" - }, - { - "label": "Loki Data Source", - "name": "loki_datasource", - "query": "loki", - "refresh": 1, - "sort": 2, - "type": "datasource" - }, - { - "datasource": "${datasource}", - "label": "cluster", - "name": "cluster", - "query": { - "query": "label_values(alloy_component_controller_running_components, cluster)\n", - "refId": "cluster" - }, - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "datasource": "${datasource}", - "label": "namespace", - "name": "namespace", - "query": { - "query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\"}, namespace)\n", - "refId": "namespace" - }, - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "allValue": ".*", - "datasource": "${datasource}", - "includeAll": true, - "label": "instance", - "multi": true, - "name": "instance", - "query": { - "query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\"}, instance)\n", - "refId": "instance" - }, - "refresh": 2, - "sort": 2, - "type": "query" - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - 
}, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d", - "90d" - ] - }, - "timezone": "utc", - "title": "Alloy / OpenTelemetry", - "uid": "9b6d37c8603e19e8922133984faad93d" - } \ No newline at end of file diff --git a/operations/alloy-mixin/alloy-prometheus-remote-write.json b/operations/alloy-mixin/alloy-prometheus-remote-write.json deleted file mode 100644 index 845b692f0a..0000000000 --- a/operations/alloy-mixin/alloy-prometheus-remote-write.json +++ /dev/null @@ -1,655 +0,0 @@ -{ - "annotations": { - "list": [ - { - "datasource": "$loki_datasource", - "enable": true, - "expr": "{cluster=\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"alloy\" | name_extracted=~\"alloy.*\"", - "iconColor": "rgba(0, 211, 255, 1)", - "instant": false, - "name": "Deployments", - "titleFormat": "{{cluster}}/{{namespace}}" - } - ] - }, - "graphTooltip": 1, - "links": [ - { - "icon": "doc", - "targetBlank": true, - "title": "Documentation", - "tooltip": "Component documentation", - "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/components/prometheus.remote_write/" - }, - { - "asDropdown": true, - "icon": "external link", - "includeVars": true, - "keepTime": true, - "tags": [ - "alloy-mixin" - ], - "targetBlank": false, - "title": "Dashboards", - "type": "dashboards" - } - ], - "panels": [ - { - "collapsed": false, - "datasource": "${datasource}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "title": "prometheus.scrape", - "type": "row" - }, - { - "datasource": "${datasource}", - "description": "Percentage of targets successfully scraped by prometheus.scrape\ncomponents.\n\nThis metric is calculated by dividing the number of targets\nsuccessfully scraped by the total number of targets scraped,\nacross all the namespaces in the selected 
cluster.\n\nLow success rates can indicate a problem with scrape targets,\nstale service discovery, or Alloy misconfiguration.\n", - "fieldConfig": { - "defaults": { - "unit": "percentunit" - } - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 0, - "y": 1 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum(up{cluster=\"$cluster\"})\n/\ncount (up{cluster=\"$cluster\"})\n", - "instant": false, - "legendFormat": "% of targets successfully scraped", - "range": true - } - ], - "title": "Scrape success rate in $cluster", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Duration of successful scrapes by prometheus.scrape components,\nacross all the namespaces in the selected cluster.\n\nThis metric should be below your configured scrape interval.\nHigh durations can indicate a problem with a scrape target or\na performance issue with Alloy.\n", - "fieldConfig": { - "defaults": { - "unit": "s" - } - }, - "gridPos": { - "h": 10, - "w": 12, - "x": 12, - "y": 1 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "quantile(0.99, scrape_duration_seconds{cluster=\"$cluster\"})\n", - "instant": false, - "legendFormat": "p99", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "quantile(0.95, scrape_duration_seconds{cluster=\"$cluster\"})\n", - "instant": false, - "legendFormat": "p95", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "quantile(0.50, scrape_duration_seconds{cluster=\"$cluster\"})\n", - "instant": false, - "legendFormat": "p50", - "range": true - } - ], - "title": "Scrape duration in $cluster", - "type": "timeseries" - }, - { - "collapsed": false, - "datasource": "${datasource}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 11 - }, - "title": "prometheus.remote_write", - "type": "row" - }, - { - "datasource": "${datasource}", - "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint 
prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", - "fieldConfig": { - "defaults": { - "unit": "s" - } - }, - "gridPos": { - "h": 10, - "w": 6, - "x": 0, - "y": 12 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", - "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", - "range": true - } - ], - "title": "WAL delay", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" - } - }, - "gridPos": { - "h": 10, - "w": 6, - "x": 6, - "y": 12 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", 
instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", - "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", - "range": true - } - ], - "title": "Data write throughput", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", - "fieldConfig": { - "defaults": { - "unit": "s" - } - }, - "gridPos": { - "h": 10, - "w": 6, - "x": 12, - "y": 12 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "50th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", - "instant": false, - "legendFormat": "Average", - "range": true - } - ], - "title": 
"Write latency", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Total number of shards which are concurrently sending samples read\nfrom the Write-Ahead Log.\n\nShards are bound to a minimum and maximum, displayed on the graph.\nThe lowest minimum and the highest maximum across all clients is\nshown.\n\nEach client has its own set of shards, minimum shards, and maximum\nshards; filter to a specific URL to display more granular\ninformation.\n", - "fieldConfig": { - "defaults": { - "unit": "none" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Minimum" - }, - "properties": [ - { - "id": "custom.lineStyle", - "value": { - "dash": [ - 10, - 15 - ], - "fill": "dash" - } - }, - { - "id": "custom.showPoints", - "value": "never" - }, - { - "id": "custom.hideFrom", - "value": { - "legend": true, - "tooltip": false, - "viz": false - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Maximum" - }, - "properties": [ - { - "id": "custom.lineStyle", - "value": { - "dash": [ - 10, - 15 - ], - "fill": "dash" - } - }, - { - "id": "custom.showPoints", - "value": "never" - }, - { - "id": "custom.hideFrom", - "value": { - "legend": true, - "tooltip": false, - "viz": false - } - } - ] - } - ] - }, - "gridPos": { - "h": 10, - "w": 6, - "x": 18, - "y": 12 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n prometheus_remote_storage_shards{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", - "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "min (\n prometheus_remote_storage_shards_min{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", - 
"instant": false, - "legendFormat": "Minimum", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "max (\n prometheus_remote_storage_shards_max{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", - "instant": false, - "legendFormat": "Maximum", - "range": true - } - ], - "title": "Shards", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Total outgoing samples sent by prometheus.remote_write.\n", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "cps" - } - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 0, - "y": 22 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum without (url, remote_name) (\n rate(prometheus_remote_storage_samples_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", - "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", - "range": true - } - ], - "title": "Sent samples / second", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Rate of samples which prometheus.remote_write could not send due to\nnon-recoverable errors.\n", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "cps" - } - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 8, - "y": 22 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum without (url,remote_name) (\n rate(prometheus_remote_storage_samples_failed_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", - 
"instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", - "range": true - } - ], - "title": "Failed samples / second", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Rate of samples which prometheus.remote_write attempted to resend\nafter receiving a recoverable error.\n", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "cps" - } - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 16, - "y": 22 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum without (url,remote_name) (\n rate(prometheus_remote_storage_samples_retried_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", - "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", - "range": true - } - ], - "title": "Retried samples / second", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Total number of active series across all components.\n\nAn \"active series\" is a series that prometheus.remote_write recently\nreceived a sample for. 
Active series are garbage collected whenever a\ntruncation of the WAL occurs.\n", - "fieldConfig": { - "defaults": { - "unit": "short" - } - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 0, - "y": 32 - }, - "options": { - "legend": { - "showLegend": false - } - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum(prometheus_remote_write_wal_storage_active_series{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"})\n", - "instant": false, - "legendFormat": "Series", - "range": true - } - ], - "title": "Active series (total)", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Total number of active series which are currently being tracked by\nprometheus.remote_write components, with separate lines for each Alloy instance.\n\nAn \"active series\" is a series that prometheus.remote_write recently\nreceived a sample for. Active series are garbage collected whenever a\ntruncation of the WAL occurs.\n", - "fieldConfig": { - "defaults": { - "unit": "short" - } - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 8, - "y": 32 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "prometheus_remote_write_wal_storage_active_series{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_id!=\"\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n", - "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", - "range": true - } - ], - "title": "Active series (by instance/component)", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Total number of active series which are currently being tracked by\nprometheus.remote_write components, aggregated across all instances.\n\nAn \"active series\" is a series that prometheus.remote_write recently\nreceived a sample for. 
Active series are garbage collected whenever a\ntruncation of the WAL occurs.\n", - "fieldConfig": { - "defaults": { - "unit": "short" - } - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 16, - "y": 32 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "sum by (component_path, component_id) (prometheus_remote_write_wal_storage_active_series{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_id!=\"\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"})\n", - "instant": false, - "legendFormat": "{{component_path}} {{component_id}}", - "range": true - } - ], - "title": "Active series (by component)", - "type": "timeseries" - } - ], - "refresh": "10s", - "schemaVersion": 36, - "tags": [ - "alloy-mixin" - ], - "templating": { - "list": [ - { - "label": "Data Source", - "name": "datasource", - "query": "prometheus", - "refresh": 1, - "sort": 2, - "type": "datasource" - }, - { - "label": "Loki Data Source", - "name": "loki_datasource", - "query": "loki", - "refresh": 1, - "sort": 2, - "type": "datasource" - }, - { - "datasource": "${datasource}", - "label": "cluster", - "name": "cluster", - "query": { - "query": "label_values(alloy_component_controller_running_components, cluster)\n", - "refId": "cluster" - }, - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "datasource": "${datasource}", - "label": "namespace", - "name": "namespace", - "query": { - "query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\"}, namespace)\n", - "refId": "namespace" - }, - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "allValue": ".*", - "datasource": "${datasource}", - "includeAll": true, - "label": "instance", - "multi": true, - "name": "instance", - "query": { - "query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\"}, instance)\n", - "refId": "instance" - }, - "refresh": 2, - "sort": 2, - "type": 
"query" - }, - { - "allValue": ".*", - "datasource": "${datasource}", - "includeAll": true, - "label": "component_path", - "multi": true, - "name": "component_path", - "query": { - "query": "label_values(prometheus_remote_write_wal_samples_appended_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_id=~\"prometheus\\\\.remote_write\\\\..*\", component_path=~\".*\"}, component_path)\n", - "refId": "component_path" - }, - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "allValue": ".*", - "datasource": "${datasource}", - "includeAll": true, - "label": "component", - "multi": true, - "name": "component", - "query": { - "query": "label_values(prometheus_remote_write_wal_samples_appended_total{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_id=~\"prometheus\\\\.remote_write\\\\..*\"}, component_id)\n", - "refId": "component" - }, - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "allValue": ".*", - "datasource": "${datasource}", - "includeAll": true, - "label": "url", - "multi": true, - "name": "url", - "query": { - "query": "label_values(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$instance\", component_id=~\"$component\"}, url)\n", - "refId": "url" - }, - "refresh": 2, - "sort": 2, - "type": "query" - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d", - "90d" - ] - }, - "timezone": "utc", - "title": "Alloy / Prometheus Components", - "uid": "e324cc55567d7f3a8e32860ff8e6d0d9" - } \ No newline at end of file diff --git a/operations/alloy-mixin/alloy-resources.json b/operations/alloy-mixin/alloy-resources.json deleted file mode 100644 index 7aefa1a697..0000000000 --- 
a/operations/alloy-mixin/alloy-resources.json +++ /dev/null @@ -1,329 +0,0 @@ -{ - "annotations": { - "list": [ - { - "datasource": "$loki_datasource", - "enable": true, - "expr": "{cluster=\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"alloy\" | name_extracted=~\"alloy.*\"", - "iconColor": "rgba(0, 211, 255, 1)", - "instant": false, - "name": "Deployments", - "titleFormat": "{{cluster}}/{{namespace}}" - } - ] - }, - "graphTooltip": 1, - "links": [ - { - "asDropdown": true, - "icon": "external link", - "includeVars": true, - "keepTime": true, - "tags": [ - "alloy-mixin" - ], - "targetBlank": false, - "title": "Dashboards", - "type": "dashboards" - } - ], - "panels": [ - { - "datasource": "${datasource}", - "description": "CPU usage of the Alloy process relative to 1 CPU core.\n\nFor example, 100% means using one entire CPU core.\n", - "fieldConfig": { - "defaults": { - "unit": "percentunit" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 0 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "rate(alloy_resources_process_cpu_seconds_total{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}[$__rate_interval])", - "instant": false, - "legendFormat": "{{instance}}", - "range": true - } - ], - "title": "CPU usage", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Resident memory size of the Alloy process.\n", - "fieldConfig": { - "defaults": { - "unit": "decbytes" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "alloy_resources_process_resident_memory_bytes{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}", - "instant": false, - "legendFormat": "{{instance}}", - "range": true - } - ], - "title": "Memory (RSS)", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Rate at which the Alloy process performs garbage 
collections.\n", - "fieldConfig": { - "defaults": { - "custom": { - "drawStyle": "points", - "pointSize": 3 - }, - "unit": "ops" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 8 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "rate(go_gc_duration_seconds_count{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}[5m])\nand on(instance)\nalloy_build_info{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\n", - "instant": false, - "legendFormat": "{{instance}}", - "range": true - } - ], - "title": "Garbage collections", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Number of goroutines which are running in parallel. An infinitely\ngrowing number of these indicates a goroutine leak.\n", - "fieldConfig": { - "defaults": { - "unit": "none" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 8 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "go_goroutines{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\nand on(instance)\nalloy_build_info{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\n", - "instant": false, - "legendFormat": "{{instance}}", - "range": true - } - ], - "title": "Goroutines", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Heap memory currently in use by the Alloy process.\n", - "fieldConfig": { - "defaults": { - "unit": "decbytes" - } - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 8 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\nand on(instance)\nalloy_build_info{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}\n", - "instant": false, - "legendFormat": "{{instance}}", - "range": true - } - ], - "title": "Memory (heap inuse)", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - 
"description": "Rate of data received across all network interfaces for the machine\nAlloy is running on.\n\nData shown here is across all running processes and not exclusive to\nthe running Alloy process.\n", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 30, - "gradientMode": "none", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 16 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "rate(alloy_resources_machine_rx_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}[$__rate_interval])\n", - "instant": false, - "legendFormat": "{{instance}}", - "range": true - } - ], - "title": "Network receive bandwidth", - "type": "timeseries" - }, - { - "datasource": "${datasource}", - "description": "Rate of data sent across all network interfaces for the machine\nAlloy is running on.\n\nData shown here is across all running processes and not exclusive to\nthe running Alloy process.\n", - "fieldConfig": { - "defaults": { - "custom": { - "fillOpacity": 30, - "gradientMode": "none", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" - } - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 16 - }, - "targets": [ - { - "datasource": "${datasource}", - "expr": "rate(alloy_resources_machine_tx_bytes_total{cluster=\"$cluster\",namespace=\"$namespace\",instance=~\"$instance\"}[$__rate_interval])\n", - "instant": false, - "legendFormat": "{{instance}}", - "range": true - } - ], - "title": "Network send bandwidth", - "type": "timeseries" - } - ], - "refresh": "10s", - "schemaVersion": 36, - "tags": [ - "alloy-mixin" - ], - "templating": { - "list": [ - { - "label": "Data Source", - "name": "datasource", - "query": "prometheus", - "refresh": 1, - "sort": 2, - "type": "datasource" - }, - { - "label": "Loki Data Source", - "name": "loki_datasource", - "query": "loki", - "refresh": 1, - "sort": 2, - "type": "datasource" - }, - { - 
"datasource": "${datasource}", - "label": "cluster", - "name": "cluster", - "query": { - "query": "label_values(alloy_component_controller_running_components, cluster)\n", - "refId": "cluster" - }, - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "datasource": "${datasource}", - "label": "namespace", - "name": "namespace", - "query": { - "query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\"}, namespace)\n", - "refId": "namespace" - }, - "refresh": 2, - "sort": 2, - "type": "query" - }, - { - "allValue": ".*", - "datasource": "${datasource}", - "includeAll": true, - "label": "instance", - "multi": true, - "name": "instance", - "query": { - "query": "label_values(alloy_component_controller_running_components{cluster=\"$cluster\", namespace=\"$namespace\"}, instance)\n", - "refId": "instance" - }, - "refresh": 2, - "sort": 2, - "type": "query" - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d", - "90d" - ] - }, - "timezone": "utc", - "title": "Alloy / Resources", - "uid": "d6a8574c31f3d7cb8f1345ec84d15a67" - } \ No newline at end of file diff --git a/operations/alloy-mixin/config.libsonnet b/operations/alloy-mixin/config.libsonnet index 6e1ecace1f..6e14c9e052 100644 --- a/operations/alloy-mixin/config.libsonnet +++ b/operations/alloy-mixin/config.libsonnet @@ -3,10 +3,10 @@ enableK8sCluster: true, enableAlloyCluster: true, enableLokiLogs: true, - filterSelector: 'job="$job"', + filterSelector: 'job=~"$job"', groupSelector: if self.enableK8sCluster then self.filterSelector + ', ' + self.k8sClusterSelector else self.filterSelector, - instanceSelector: self.groupSelector + ', instance="$instance"', - k8sClusterSelector: 'cluster="$cluster", namespace="$namespace", ', + instanceSelector: 
self.groupSelector + ', instance=~"$instance"', + k8sClusterSelector: 'cluster=~"$cluster", namespace=~"$namespace"', dashboardTag: 'alloy-mixin' } } \ No newline at end of file diff --git a/operations/alloy-mixin/dashboards/cluster-node.libsonnet b/operations/alloy-mixin/dashboards/cluster-node.libsonnet index a81320d879..5290cd81c2 100644 --- a/operations/alloy-mixin/dashboards/cluster-node.libsonnet +++ b/operations/alloy-mixin/dashboards/cluster-node.libsonnet @@ -3,6 +3,20 @@ local panel = import './utils/panel.jsonnet'; local filename = 'alloy-cluster-node.json'; { + local templateVariables = + if $._config.enableK8sCluster then + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components{job=~"$job"}, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster"}, namespace)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster", namespace=~"$namespace"}, instance)'), + ] + else + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job"}, instance)'), + ], + [filename]: dashboard.new(name='Alloy / Cluster Node', tag=$._config.dashboardTag) + dashboard.withDocsLink( @@ -11,20 +25,7 @@ local filename = 'alloy-cluster-node.json'; ) + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + - dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('job', - 'label_values(alloy_component_controller_running_components, job)' - ), - dashboard.newTemplateVariable('cluster', - 
'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + '}, cluster)' - ), - dashboard.newTemplateVariable('namespace', - 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + ', cluster="$cluster"}, namespace)' - ), - dashboard.newTemplateVariable('instance', - 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + ', cluster="$cluster", namespace="$namespace"}, instance)' - ), - ]) + + dashboard.withTemplateVariablesMixin(templateVariables) + // TODO(@tpaschalis) Make the annotation optional. dashboard.withAnnotations([ dashboard.newLokiAnnotation('Deployments', '{cluster="$cluster", container="kube-diff-logger"} | json | namespace_extracted="alloy" | name_extracted=~"alloy.*"', 'rgba(0, 211, 255, 1)'), diff --git a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet index 7a63f238de..2cbeefdaf1 100644 --- a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet +++ b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet @@ -4,6 +4,18 @@ local filename = 'alloy-cluster-overview.json'; local cluster_node_filename = 'alloy-cluster-node.json'; { + local templateVariables = + if $._config.enableK8sCluster then + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components{job=~"$job"}, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster"}, namespace)'), + ] + else + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + ], + [filename]: dashboard.new(name='Alloy / Cluster Overview', tag=$._config.dashboardTag) + dashboard.withDocsLink( @@ -12,17 +24,7 @@ local 
cluster_node_filename = 'alloy-cluster-node.json'; ) + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + - dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('job', - 'label_values(alloy_component_controller_running_components, job)' - ), - dashboard.newTemplateVariable('cluster', - 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + '}, cluster)' - ), - dashboard.newTemplateVariable('namespace', - 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + ', cluster="$cluster"}, namespace)' - ), - ]) + + dashboard.withTemplateVariablesMixin(templateVariables) + // TODO(@tpaschalis) Make the annotation optional. dashboard.withAnnotations([ dashboard.newLokiAnnotation('Deployments', '{cluster="$cluster", container="kube-diff-logger"} | json | namespace_extracted="alloy" | name_extracted=~"alloy.*"', 'rgba(0, 211, 255, 1)'), diff --git a/operations/alloy-mixin/dashboards/controller.libsonnet b/operations/alloy-mixin/dashboards/controller.libsonnet index b1711c2281..56412a6cb1 100644 --- a/operations/alloy-mixin/dashboards/controller.libsonnet +++ b/operations/alloy-mixin/dashboards/controller.libsonnet @@ -3,6 +3,19 @@ local panel = import './utils/panel.jsonnet'; local filename = 'alloy-controller.json'; { + + local templateVariables = + if $._config.enableK8sCluster then + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components{job=~"$job"}, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster"}, namespace)'), + ] + else + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + ], + [filename]: 
dashboard.new(name='Alloy / Controller', tag=$._config.dashboardTag) + dashboard.withDocsLink( @@ -11,17 +24,7 @@ local filename = 'alloy-controller.json'; ) + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + - dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('job', - 'label_values(alloy_component_controller_running_components, job)' - ), - dashboard.newTemplateVariable('cluster', - 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + '}, cluster)' - ), - dashboard.newTemplateVariable('namespace', - 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + ', cluster="$cluster"}, namespace)' - ), - ]) + + dashboard.withTemplateVariablesMixin(templateVariables) + // TODO(@tpaschalis) Make the annotation optional. dashboard.withAnnotations([ dashboard.newLokiAnnotation('Deployments', '{cluster="$cluster", container="kube-diff-logger"} | json | namespace_extracted="alloy" | name_extracted=~"alloy.*"', 'rgba(0, 211, 255, 1)'), diff --git a/operations/alloy-mixin/dashboards/opentelemetry.libsonnet b/operations/alloy-mixin/dashboards/opentelemetry.libsonnet index 240558ee8d..05836db590 100644 --- a/operations/alloy-mixin/dashboards/opentelemetry.libsonnet +++ b/operations/alloy-mixin/dashboards/opentelemetry.libsonnet @@ -15,24 +15,25 @@ local stackedPanelMixin = { }; { + local templateVariables = + if $._config.enableK8sCluster then + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components{job=~"$job"}, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster"}, namespace)'), + dashboard.newMultiTemplateVariable('instance', 
'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster", namespace=~"$namespace"}, instance)'), + ] + else + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job"}, instance)'), + ], + [filename]: dashboard.new(name='Alloy / OpenTelemetry', tag=$._config.dashboardTag) + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + - dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('job', - 'label_values(alloy_component_controller_running_components, job)' - ), - dashboard.newTemplateVariable('cluster', - 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + '}, cluster)' - ), - dashboard.newTemplateVariable('namespace', - 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + ', cluster="$cluster"}, namespace)' - ), - dashboard.newTemplateVariable('instance', - 'label_values(alloy_component_controller_running_components{' + $._config.filterSelector + ', cluster="$cluster", namespace="$namespace"}, instance)' - ), - ]) + + dashboard.withTemplateVariablesMixin(templateVariables) + dashboard.withPanelsMixin([ // "Receivers for traces" row ( diff --git a/operations/alloy-mixin/dashboards/prometheus.libsonnet b/operations/alloy-mixin/dashboards/prometheus.libsonnet index 439f861ae7..f8ecdc4ac3 100644 --- a/operations/alloy-mixin/dashboards/prometheus.libsonnet +++ b/operations/alloy-mixin/dashboards/prometheus.libsonnet @@ -2,392 +2,411 @@ local dashboard = import './utils/dashboard.jsonnet'; local panel = import './utils/panel.jsonnet'; local filename = 'alloy-prometheus-remote-write.json'; -local stackedPanelMixin = { - fieldConfig+: { - defaults+: { - custom+: { - fillOpacity: 20, - gradientMode: 'hue', - stacking: { mode: 
'normal' }, +{ + local stackedPanelMixin = { + fieldConfig+: { + defaults+: { + custom+: { + fillOpacity: 20, + gradientMode: 'hue', + stacking: { mode: 'normal' }, + }, }, }, }, -}; -local scrapePanels(y_offset) = [ - panel.newRow(title='prometheus.scrape', y=y_offset), + local scrapePanels(y_offset) = [ + panel.newRow(title='prometheus.scrape', y=y_offset), - // Scrape success rate - ( - panel.new(title='Scrape success rate in $cluster', type='timeseries') + - panel.withUnit('percentunit') + - panel.withDescription(||| - Percentage of targets successfully scraped by prometheus.scrape - components. + // Scrape success rate + ( + panel.new(title='Scrape success rate in $cluster', type='timeseries') + + panel.withUnit('percentunit') + + panel.withDescription(||| + Percentage of targets successfully scraped by prometheus.scrape + components. - This metric is calculated by dividing the number of targets - successfully scraped by the total number of targets scraped, - across all the namespaces in the selected cluster. + This metric is calculated by dividing the number of targets + successfully scraped by the total number of targets scraped, + across all the namespaces in the selected cluster. - Low success rates can indicate a problem with scrape targets, - stale service discovery, or Alloy misconfiguration. - |||) + - panel.withPosition({ x: 0, y: 1 + y_offset, w: 12, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum(up{cluster="$cluster"}) - / - count (up{cluster="$cluster"}) - |||, - legendFormat='% of targets successfully scraped', - ), - ]) - ), + Low success rates can indicate a problem with scrape targets, + stale service discovery, or Alloy misconfiguration. 
+ |||) + + panel.withPosition({ x: 0, y: 1 + y_offset, w: 12, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr=||| + sum(up{job=~"$job", cluster=~"$cluster"}) + / + count (up{job=~"$job", cluster=~"$cluster"}) + |||, + legendFormat='% of targets successfully scraped', + ), + ]) + ), - // Scrape duration - ( - panel.new(title='Scrape duration in $cluster', type='timeseries') + - panel.withUnit('s') + - panel.withDescription(||| - Duration of successful scrapes by prometheus.scrape components, - across all the namespaces in the selected cluster. + // Scrape duration + ( + panel.new(title='Scrape duration in $cluster', type='timeseries') + + panel.withUnit('s') + + panel.withDescription(||| + Duration of successful scrapes by prometheus.scrape components, + across all the namespaces in the selected cluster. - This metric should be below your configured scrape interval. - High durations can indicate a problem with a scrape target or - a performance issue with Alloy. - |||) + - panel.withPosition({ x: 12, y: 1 + y_offset, w: 12, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - quantile(0.99, scrape_duration_seconds{cluster="$cluster"}) - |||, - legendFormat='p99', - ), - panel.newQuery( - expr=||| - quantile(0.95, scrape_duration_seconds{cluster="$cluster"}) - |||, - legendFormat='p95', - ), - panel.newQuery( - expr=||| - quantile(0.50, scrape_duration_seconds{cluster="$cluster"}) - |||, - legendFormat='p50', - ), + This metric should be below your configured scrape interval. + High durations can indicate a problem with a scrape target or + a performance issue with Alloy. 
+ |||) + + panel.withPosition({ x: 12, y: 1 + y_offset, w: 12, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr=||| + quantile(0.99, scrape_duration_seconds{job=~"$job", cluster=~"$cluster"}) + |||, + legendFormat='p99', + ), + panel.newQuery( + expr=||| + quantile(0.95, scrape_duration_seconds{job=~"$job", cluster=~"$cluster"}) + |||, + legendFormat='p95', + ), + panel.newQuery( + expr=||| + quantile(0.50, scrape_duration_seconds{job=~"$job", cluster=~"$cluster"}) + |||, + legendFormat='p50', + ), - ]) - ), -]; + ]) + ), + ], -local remoteWritePanels(y_offset) = [ - panel.newRow(title='prometheus.remote_write', y=y_offset), + local remoteWritePanels(y_offset) = [ + panel.newRow(title='prometheus.remote_write', y=y_offset), - // WAL delay - ( - panel.new(title='WAL delay', type='timeseries') + - panel.withUnit('s') + - panel.withDescription(||| - How far behind prometheus.remote_write from samples recently written - to the WAL. + // WAL delay + ( + panel.new(title='WAL delay', type='timeseries') + + panel.withUnit('s') + + panel.withDescription(||| + How far behind prometheus.remote_write from samples recently written + to the WAL. - Each endpoint prometheus.remote_write is configured to send metrics - has its own delay. The time shown here is the sum across all - endpoints for the given component. + Each endpoint prometheus.remote_write is configured to send metrics + has its own delay. The time shown here is the sum across all + endpoints for the given component. - It is normal for the WAL delay to be within 1-3 scrape intervals. If - the WAL delay continues to increase beyond that amount, try - increasing the number of maximum shards. 
- |||) + - panel.withPosition({ x: 0, y: 1 + y_offset, w: 6, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum by (instance, component_path, component_id) ( - prometheus_remote_storage_highest_timestamp_in_seconds{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component"} - - ignoring(url, remote_name) group_right(instance) - prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"} - ) - |||, - legendFormat='{{instance}} / {{component_path}} {{component_id}}', - ), - ]) - ), + It is normal for the WAL delay to be within 1-3 scrape intervals. If + the WAL delay continues to increase beyond that amount, try + increasing the number of maximum shards. + |||) + + panel.withPosition({ x: 0, y: 1 + y_offset, w: 6, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= + 'sum by (instance, component_path, component_id) ( + prometheus_remote_storage_highest_timestamp_in_seconds{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component"} + - ignoring(url, remote_name) group_right(instance) + prometheus_remote_storage_queue_highest_sent_timestamp_seconds{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"} + )', + legendFormat='{{instance}} / {{component_path}} {{component_id}}', + ), + ]) + ), - // Data write throughput - ( - panel.new(title='Data write throughput', type='timeseries') + - stackedPanelMixin + - panel.withUnit('Bps') + - panel.withDescription(||| - Rate of data containing samples and metadata sent by - prometheus.remote_write. 
- |||) + - panel.withPosition({ x: 6, y: 1 + y_offset, w: 6, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum without (remote_name, url) ( - rate(prometheus_remote_storage_bytes_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + - rate(prometheus_remote_storage_metadata_bytes_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - ) - |||, - legendFormat='{{instance}} / {{component_path}} {{component_id}}', - ), - ]) - ), + // Data write throughput + ( + panel.new(title='Data write throughput', type='timeseries') + + stackedPanelMixin + + panel.withUnit('Bps') + + panel.withDescription(||| + Rate of data containing samples and metadata sent by + prometheus.remote_write. + |||) + + panel.withPosition({ x: 6, y: 1 + y_offset, w: 6, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= + 'sum without (remote_name, url) ( + rate(prometheus_remote_storage_bytes_total{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + + rate(prometheus_remote_storage_metadata_bytes_total{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + )', + legendFormat='{{instance}} / {{component_path}} {{component_id}}', + ), + ]) + ), - // Write latency - ( - panel.new(title='Write latency', type='timeseries') + - panel.withUnit('s') + - panel.withDescription(||| - Latency of writes to the remote system made by - prometheus.remote_write. 
- |||) + - panel.withPosition({ x: 12, y: 1 + y_offset, w: 6, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - histogram_quantile(0.99, sum by (le) ( - rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - )) - |||, - legendFormat='99th percentile', - ), - panel.newQuery( - expr=||| - histogram_quantile(0.50, sum by (le) ( - rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - )) - |||, - legendFormat='50th percentile', - ), - panel.newQuery( - expr=||| - sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) / - sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster="$cluster",namespace="$namespace",instance=~"$instance", component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) - |||, - legendFormat='Average', - ), - ]) - ), + // Write latency + ( + panel.new(title='Write latency', type='timeseries') + + panel.withUnit('s') + + panel.withDescription(||| + Latency of writes to the remote system made by + prometheus.remote_write. 
+ |||) + + panel.withPosition({ x: 12, y: 1 + y_offset, w: 6, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= + 'histogram_quantile(0.99, sum by (le) ( + rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + ))', + legendFormat='99th percentile', + ), + panel.newQuery( + expr= + 'histogram_quantile(0.50, sum by (le) ( + rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + ))', + legendFormat='50th percentile', + ), + panel.newQuery( + expr= + 'sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) / + sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component"}[$__rate_interval]))' + , + legendFormat='Average', + ), + ]) + ), - // Shards - ( - local minMaxOverride = { - properties: [{ - id: 'custom.lineStyle', - value: { - dash: [10, 15], - fill: 'dash', - }, - }, { - id: 'custom.showPoints', - value: 'never', - }, { - id: 'custom.hideFrom', - value: { - legend: true, - tooltip: false, - viz: false, - }, - }], - }; + // Shards + ( + local minMaxOverride = { + properties: [{ + id: 'custom.lineStyle', + value: { + dash: [10, 15], + fill: 'dash', + }, + }, { + id: 'custom.showPoints', + value: 'never', + }, { + id: 'custom.hideFrom', + value: { + legend: true, + tooltip: false, + viz: false, + }, + }], + }; - panel.new(title='Shards', type='timeseries') { - fieldConfig+: { - overrides: [ - minMaxOverride { matcher: { id: 'byName', options: 'Minimum' } }, - minMaxOverride { matcher: { id: 'byName', options: 'Maximum' } }, - ], - }, - } + - 
panel.withUnit('none') + - panel.withDescription(||| - Total number of shards which are concurrently sending samples read - from the Write-Ahead Log. + panel.new(title='Shards', type='timeseries') { + fieldConfig+: { + overrides: [ + minMaxOverride { matcher: { id: 'byName', options: 'Minimum' } }, + minMaxOverride { matcher: { id: 'byName', options: 'Maximum' } }, + ], + }, + } + + panel.withUnit('none') + + panel.withDescription(||| + Total number of shards which are concurrently sending samples read + from the Write-Ahead Log. - Shards are bound to a minimum and maximum, displayed on the graph. - The lowest minimum and the highest maximum across all clients is - shown. + Shards are bound to a minimum and maximum, displayed on the graph. + The lowest minimum and the highest maximum across all clients is + shown. - Each client has its own set of shards, minimum shards, and maximum - shards; filter to a specific URL to display more granular - information. - |||) + - panel.withPosition({ x: 18, y: 1 + y_offset, w: 6, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum without (remote_name, url) ( - prometheus_remote_storage_shards{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"} - ) - |||, - legendFormat='{{instance}} / {{component_path}} {{component_id}}', - ), - panel.newQuery( - expr=||| - min ( - prometheus_remote_storage_shards_min{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"} - ) - |||, - legendFormat='Minimum', - ), - panel.newQuery( - expr=||| - max ( - prometheus_remote_storage_shards_max{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"} - ) - |||, - legendFormat='Maximum', - ), - ]) - ), + Each client has its own set of shards, minimum shards, and maximum + 
shards; filter to a specific URL to display more granular + information. + |||) + + panel.withPosition({ x: 18, y: 1 + y_offset, w: 6, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= + 'sum without (remote_name, url) ( + prometheus_remote_storage_shards{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"} + )', + legendFormat='{{instance}} / {{component_path}} {{component_id}}', + ), + panel.newQuery( + expr= + 'min ( + prometheus_remote_storage_shards_min{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"} + )', + legendFormat='Minimum', + ), + panel.newQuery( + expr= + 'max ( + prometheus_remote_storage_shards_max{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"} + )', + legendFormat='Maximum', + ), + ]) + ), - // Sent samples / second - ( - panel.new(title='Sent samples / second', type='timeseries') + - stackedPanelMixin + - panel.withUnit('cps') + - panel.withDescription(||| - Total outgoing samples sent by prometheus.remote_write. - |||) + - panel.withPosition({ x: 0, y: 11 + y_offset, w: 8, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum without (url, remote_name) ( - rate(prometheus_remote_storage_samples_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - ) - |||, - legendFormat='{{instance}} / {{component_path}} {{component_id}}', - ), - ]) - ), + // Sent samples / second + ( + panel.new(title='Sent samples / second', type='timeseries') + + stackedPanelMixin + + panel.withUnit('cps') + + panel.withDescription(||| + Total outgoing samples sent by prometheus.remote_write. 
+ |||) + + panel.withPosition({ x: 0, y: 11 + y_offset, w: 8, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= + 'sum without (url, remote_name) ( + rate(prometheus_remote_storage_samples_total{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + )', + legendFormat='{{instance}} / {{component_path}} {{component_id}}', + ), + ]) + ), - // Failed samples / second - ( - panel.new(title='Failed samples / second', type='timeseries') + - stackedPanelMixin + - panel.withUnit('cps') + - panel.withDescription(||| - Rate of samples which prometheus.remote_write could not send due to - non-recoverable errors. - |||) + - panel.withPosition({ x: 8, y: 11 + y_offset, w: 8, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum without (url,remote_name) ( - rate(prometheus_remote_storage_samples_failed_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - ) - |||, - legendFormat='{{instance}} / {{component_path}} {{component_id}}', - ), - ]) - ), + // Failed samples / second + ( + panel.new(title='Failed samples / second', type='timeseries') + + stackedPanelMixin + + panel.withUnit('cps') + + panel.withDescription(||| + Rate of samples which prometheus.remote_write could not send due to + non-recoverable errors. 
+ |||) + + panel.withPosition({ x: 8, y: 11 + y_offset, w: 8, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= + 'sum without (url,remote_name) ( + rate(prometheus_remote_storage_samples_failed_total{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + )', + legendFormat='{{instance}} / {{component_path}} {{component_id}}', + ), + ]) + ), - // Retried samples / second - ( - panel.new(title='Retried samples / second', type='timeseries') + - stackedPanelMixin + - panel.withUnit('cps') + - panel.withDescription(||| - Rate of samples which prometheus.remote_write attempted to resend - after receiving a recoverable error. - |||) + - panel.withPosition({ x: 16, y: 11 + y_offset, w: 8, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum without (url,remote_name) ( - rate(prometheus_remote_storage_samples_retried_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - ) - |||, - legendFormat='{{instance}} / {{component_path}} {{component_id}}', - ), - ]) - ), + // Retried samples / second + ( + panel.new(title='Retried samples / second', type='timeseries') + + stackedPanelMixin + + panel.withUnit('cps') + + panel.withDescription(||| + Rate of samples which prometheus.remote_write attempted to resend + after receiving a recoverable error. 
+ |||) + + panel.withPosition({ x: 16, y: 11 + y_offset, w: 8, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= + 'sum without (url,remote_name) ( + rate(prometheus_remote_storage_samples_retried_total{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + )', + legendFormat='{{instance}} / {{component_path}} {{component_id}}', + ), + ]) + ), - // Active series (Total) - ( - panel.new(title='Active series (total)', type='timeseries') { - options+: { - legend+: { - showLegend: false, + // Active series (Total) + ( + panel.new(title='Active series (total)', type='timeseries') { + options+: { + legend+: { + showLegend: false, + }, }, - }, - } + - panel.withUnit('short') + - panel.withDescription(||| - Total number of active series across all components. + } + + panel.withUnit('short') + + panel.withDescription(||| + Total number of active series across all components. - An "active series" is a series that prometheus.remote_write recently - received a sample for. Active series are garbage collected whenever a - truncation of the WAL occurs. - |||) + - panel.withPosition({ x: 0, y: 21 + y_offset, w: 8, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum(prometheus_remote_write_wal_storage_active_series{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_path=~"$component_path", component_id=~"$component", url=~"$url"}) - |||, - legendFormat='Series', - ), - ]) - ), + An "active series" is a series that prometheus.remote_write recently + received a sample for. Active series are garbage collected whenever a + truncation of the WAL occurs. 
+ |||) + + panel.withPosition({ x: 0, y: 21 + y_offset, w: 8, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= + 'sum(prometheus_remote_write_wal_storage_active_series{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"})' + , + legendFormat='Series', + ), + ]) + ), - // Active series (by instance/component) - ( - panel.new(title='Active series (by instance/component)', type='timeseries') + - panel.withUnit('short') + - panel.withDescription(||| - Total number of active series which are currently being tracked by - prometheus.remote_write components, with separate lines for each Alloy instance. + // Active series (by instance/component) + ( + panel.new(title='Active series (by instance/component)', type='timeseries') + + panel.withUnit('short') + + panel.withDescription(||| + Total number of active series which are currently being tracked by + prometheus.remote_write components, with separate lines for each Alloy instance. - An "active series" is a series that prometheus.remote_write recently - received a sample for. Active series are garbage collected whenever a - truncation of the WAL occurs. - |||) + - panel.withPosition({ x: 8, y: 21 + y_offset, w: 8, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - prometheus_remote_write_wal_storage_active_series{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id!="", component_path=~"$component_path", component_id=~"$component", url=~"$url"} - |||, - legendFormat='{{instance}} / {{component_path}} {{component_id}}', - ), - ]) - ), + An "active series" is a series that prometheus.remote_write recently + received a sample for. Active series are garbage collected whenever a + truncation of the WAL occurs. 
+ |||) + + panel.withPosition({ x: 8, y: 21 + y_offset, w: 8, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= + 'prometheus_remote_write_wal_storage_active_series{' + $._config.instanceSelector + ', component_id!="", component_path=~"$component_path", component_id=~"$component", url=~"$url"}' + , + legendFormat='{{instance}} / {{component_path}} {{component_id}}', + ), + ]) + ), - // Active series (by component) - ( - panel.new(title='Active series (by component)', type='timeseries') + - panel.withUnit('short') + - panel.withDescription(||| - Total number of active series which are currently being tracked by - prometheus.remote_write components, aggregated across all instances. + // Active series (by component) + ( + panel.new(title='Active series (by component)', type='timeseries') + + panel.withUnit('short') + + panel.withDescription(||| + Total number of active series which are currently being tracked by + prometheus.remote_write components, aggregated across all instances. - An "active series" is a series that prometheus.remote_write recently - received a sample for. Active series are garbage collected whenever a - truncation of the WAL occurs. - |||) + - panel.withPosition({ x: 16, y: 21 + y_offset, w: 8, h: 10 }) + - panel.withQueries([ - panel.newQuery( - expr=||| - sum by (component_path, component_id) (prometheus_remote_write_wal_storage_active_series{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id!="", component_path=~"$component_path", component_id=~"$component", url=~"$url"}) - |||, - legendFormat='{{component_path}} {{component_id}}', - ), - ]) - ), -]; + An "active series" is a series that prometheus.remote_write recently + received a sample for. Active series are garbage collected whenever a + truncation of the WAL occurs. 
+ |||) + + panel.withPosition({ x: 16, y: 21 + y_offset, w: 8, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr= + 'sum by (component_path, component_id) (prometheus_remote_write_wal_storage_active_series{' + $._config.instanceSelector + ', component_id!="", component_path=~"$component_path", component_id=~"$component", url=~"$url"})' + , + legendFormat='{{component_path}} {{component_id}}', + ), + ]) + ), + ], + + local panels = + if $._config.enableK8sCluster then + // First row, offset is 0 + scrapePanels(y_offset=0) + + // Scrape panels take 11 units, so offset next row by 11. + remoteWritePanels(y_offset=11) + else + remoteWritePanels(y_offset=0), + + local templateVariables = + if $._config.enableK8sCluster then + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components{job=~"$job"}, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster"}, namespace)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster", namespace=~"$namespace"}, instance)'), + dashboard.newMultiTemplateVariable('component_path', 'label_values(prometheus_remote_write_wal_samples_appended_total{job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance", component_id=~"prometheus.remote_write.*", component_path=~".*"}, component_path)'), + dashboard.newMultiTemplateVariable('component', 'label_values(prometheus_remote_write_wal_samples_appended_total{job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance", component_id=~"prometheus.remote_write.*"}, component_id)'), + dashboard.newMultiTemplateVariable('url', 'label_values(prometheus_remote_storage_sent_batch_duration_seconds_sum{job="$job", 
cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance", component_id=~"$component"}, url)'), + ] + else + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job"}, instance)'), + dashboard.newMultiTemplateVariable('component_path', 'label_values(prometheus_remote_write_wal_samples_appended_total{job=~"$job", instance=~"$instance", component_id=~"prometheus.remote_write.*", component_path=~".*"}, component_path)'), + dashboard.newMultiTemplateVariable('component', 'label_values(prometheus_remote_write_wal_samples_appended_total{job=~"$job", instance=~"$instance", component_id=~"prometheus.remote_write.*"}, component_id)'), + dashboard.newMultiTemplateVariable('url', 'label_values(prometheus_remote_storage_sent_batch_duration_seconds_sum{job=~"$job", instance=~"$instance", component_id=~"$component"}, url)'), + ], -{ [filename]: dashboard.new(name='Alloy / Prometheus Components', tag=$._config.dashboardTag) + dashboard.withDocsLink( @@ -396,34 +415,12 @@ local remoteWritePanels(y_offset) = [ ) + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + - dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('cluster', ||| - label_values(alloy_component_controller_running_components, cluster) - |||), - dashboard.newTemplateVariable('namespace', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster"}, namespace) - |||), - dashboard.newMultiTemplateVariable('instance', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace"}, instance) - |||), - dashboard.newMultiTemplateVariable('component_path', ||| - label_values(prometheus_remote_write_wal_samples_appended_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", 
component_id=~"prometheus\\.remote_write\\..*", component_path=~".*"}, component_path) - |||), - dashboard.newMultiTemplateVariable('component', ||| - label_values(prometheus_remote_write_wal_samples_appended_total{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"prometheus\\.remote_write\\..*"}, component_id) - |||), - dashboard.newMultiTemplateVariable('url', ||| - label_values(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster="$cluster", namespace="$namespace", instance=~"$instance", component_id=~"$component"}, url) - |||), - ]) + + dashboard.withTemplateVariablesMixin(templateVariables) + // TODO(@tpaschalis) Make the annotation optional. dashboard.withAnnotations([ - dashboard.newLokiAnnotation('Deployments', '{cluster="$cluster", container="kube-diff-logger"} | json | namespace_extracted="alloy" | name_extracted=~"alloy.*"', 'rgba(0, 211, 255, 1)'), + dashboard.newLokiAnnotation('Deployments', '{cluster=~"$cluster", container="kube-diff-logger"} | json | namespace_extracted="alloy" | name_extracted=~"alloy.*"', 'rgba(0, 211, 255, 1)'), ]) + dashboard.withPanelsMixin( - // First row, offset is 0 - scrapePanels(y_offset=0) + - // Scrape panels take 11 units, so offset next row by 11. 
- remoteWritePanels(y_offset=11) + panels ), } diff --git a/operations/alloy-mixin/dashboards/resources.libsonnet b/operations/alloy-mixin/dashboards/resources.libsonnet index ce30e976eb..ef0118039f 100644 --- a/operations/alloy-mixin/dashboards/resources.libsonnet +++ b/operations/alloy-mixin/dashboards/resources.libsonnet @@ -27,21 +27,25 @@ local stackedPanelMixin = { }; { + local templateVariables = + if $._config.enableK8sCluster then + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components{job=~"$job"}, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster"}, namespace)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster", namespace=~"$namespace"}, instance)'), + ] + else + [ + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job"}, instance)'), + ], + [filename]: dashboard.new(name='Alloy / Resources', tag=$._config.dashboardTag) + dashboard.withDashboardsLink(tag=$._config.dashboardTag) + dashboard.withUID(std.md5(filename)) + - dashboard.withTemplateVariablesMixin([ - dashboard.newTemplateVariable('cluster', ||| - label_values(alloy_component_controller_running_components, cluster) - |||), - dashboard.newTemplateVariable('namespace', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster"}, namespace) - |||), - dashboard.newMultiTemplateVariable('instance', ||| - label_values(alloy_component_controller_running_components{cluster="$cluster", namespace="$namespace"}, instance) - |||), - ]) + + 
dashboard.withTemplateVariablesMixin(templateVariables) + // TODO(@tpaschalis) Make the annotation optional. dashboard.withAnnotations([ dashboard.newLokiAnnotation('Deployments', '{cluster="$cluster", container="kube-diff-logger"} | json | namespace_extracted="alloy" | name_extracted=~"alloy.*"', 'rgba(0, 211, 255, 1)'), @@ -59,7 +63,7 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 0, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='rate(alloy_resources_process_cpu_seconds_total{cluster="$cluster",namespace="$namespace",instance=~"$instance"}[$__rate_interval])', + expr='rate(alloy_resources_process_cpu_seconds_total{' + $._config.instanceSelector + '}[$__rate_interval])', legendFormat='{{instance}}' ), ]) @@ -75,7 +79,7 @@ local stackedPanelMixin = { panel.withPosition({ x: 12, y: 0, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='alloy_resources_process_resident_memory_bytes{cluster="$cluster",namespace="$namespace",instance=~"$instance"}', + expr='alloy_resources_process_resident_memory_bytes{' + $._config.instanceSelector + '}', legendFormat='{{instance}}' ), ]) @@ -95,11 +99,11 @@ local stackedPanelMixin = { // Lots of programs export go_goroutines so we ignore anything that // doesn't also have an Alloy-specific metric (i.e., // alloy_build_info). - expr=||| - rate(go_gc_duration_seconds_count{cluster="$cluster",namespace="$namespace",instance=~"$instance"}[5m]) + expr= + 'rate(go_gc_duration_seconds_count{' + $._config.instanceSelector + '}[5m]) and on(instance) - alloy_build_info{cluster="$cluster",namespace="$namespace",instance=~"$instance"} - |||, + alloy_build_info{' + $._config.instanceSelector + '}' + , legendFormat='{{instance}}' ), ]) @@ -119,11 +123,11 @@ local stackedPanelMixin = { // Lots of programs export go_goroutines so we ignore anything that // doesn't also have an Alloy-specific metric (i.e., // alloy_build_info). 
- expr=||| - go_goroutines{cluster="$cluster",namespace="$namespace",instance=~"$instance"} + expr= + 'go_goroutines{' + $._config.instanceSelector + '} and on(instance) - alloy_build_info{cluster="$cluster",namespace="$namespace",instance=~"$instance"} - |||, + alloy_build_info{' + $._config.instanceSelector + '}' + , legendFormat='{{instance}}' ), ]) @@ -142,11 +146,11 @@ local stackedPanelMixin = { // Lots of programs export go_memstats_heap_inuse_bytes so we ignore // anything that doesn't also have an Alloy-specific metric // (i.e., alloy_build_info). - expr=||| - go_memstats_heap_inuse_bytes{cluster="$cluster",namespace="$namespace",instance=~"$instance"} + expr= + 'go_memstats_heap_inuse_bytes{' + $._config.instanceSelector + '} and on(instance) - alloy_build_info{cluster="$cluster",namespace="$namespace",instance=~"$instance"} - |||, + alloy_build_info{' + $._config.instanceSelector + '}' + , legendFormat='{{instance}}' ), ]) @@ -167,9 +171,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 16, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr=||| - rate(alloy_resources_machine_rx_bytes_total{cluster="$cluster",namespace="$namespace",instance=~"$instance"}[$__rate_interval]) - |||, + expr= + 'rate(alloy_resources_machine_rx_bytes_total{' + $._config.instanceSelector + '}[$__rate_interval])' + , legendFormat='{{instance}}' ), ]) @@ -190,9 +194,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 12, y: 16, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr=||| - rate(alloy_resources_machine_tx_bytes_total{cluster="$cluster",namespace="$namespace",instance=~"$instance"}[$__rate_interval]) - |||, + expr= + 'rate(alloy_resources_machine_tx_bytes_total{' + $._config.instanceSelector + '}[$__rate_interval])' + , legendFormat='{{instance}}' ), ]) diff --git a/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet b/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet index 1c61914d19..99d5db5a3b 100644 --- 
a/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet +++ b/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet @@ -77,6 +77,8 @@ datasource: '${datasource}', refresh: 2, sort: 2, + allValue: '.*', + includeAll: true, }, newLokiAnnotation(name, expression, color):: { @@ -90,8 +92,6 @@ }, newMultiTemplateVariable(name, query):: $.newTemplateVariable(name, query) { - allValue: '.*', - includeAll: true, multi: true, }, From 73c93b77049a6d7aac3268a9101cd2920621c5bb Mon Sep 17 00:00:00 2001 From: Gabriel Antunes Date: Mon, 20 May 2024 18:29:04 -0300 Subject: [PATCH 3/6] including cluster config into alerts --- operations/alloy-mixin/alerts.libsonnet | 24 ++- .../alloy-mixin/alerts/clustering.libsonnet | 150 +++++++++++------- .../alloy-mixin/alerts/controller.libsonnet | 49 +++--- .../alerts/opentelemetry.libsonnet | 55 ++++--- 4 files changed, 169 insertions(+), 109 deletions(-) diff --git a/operations/alloy-mixin/alerts.libsonnet b/operations/alloy-mixin/alerts.libsonnet index 2b28ede772..f249d0d4d9 100644 --- a/operations/alloy-mixin/alerts.libsonnet +++ b/operations/alloy-mixin/alerts.libsonnet @@ -1,12 +1,20 @@ +local clusterAlerts = (import './alerts/clustering.libsonnet'); +local controllerAlerts = (import './alerts/controller.libsonnet'); +local openTelemetryAlerts = (import './alerts/opentelemetry.libsonnet'); + { prometheusAlerts+: { - groups+: [ - if $._config.enableK8sCluster then - (import './alerts/clustering.libsonnet') - else - {} - + (import './alerts/controller.libsonnet') - + (import './alerts/opentelemetry.libsonnet') - ], + groups+: + if $._config.enableAlloyCluster then + [ + clusterAlerts.newAlloyClusterAlertsGroup($._config.enableK8sCluster), + controllerAlerts.newControllerAlertsGroup($._config.enableK8sCluster), + openTelemetryAlerts.newOpenTelemetryAlertsGroup($._config.enableK8sCluster), + ] + else + [ + controllerAlerts.newControllerAlertsGroup($._config.enableK8sCluster), + 
openTelemetryAlerts.newOpenTelemetryAlertsGroup($._config.enableK8sCluster) + ], }, } diff --git a/operations/alloy-mixin/alerts/clustering.libsonnet b/operations/alloy-mixin/alerts/clustering.libsonnet index 2f4f49e13e..005be441b7 100644 --- a/operations/alloy-mixin/alerts/clustering.libsonnet +++ b/operations/alloy-mixin/alerts/clustering.libsonnet @@ -1,67 +1,97 @@ local alert = import './utils/alert.jsonnet'; -alert.newGroup( - 'alloy_clustering', - [ - // Cluster not converging. - alert.newRule( - 'ClusterNotConverging', - 'stddev by (cluster, namespace) (sum without (state) (cluster_node_peers)) != 0', - 'Cluster is not converging: nodes report different number of peers in the cluster.', - '10m', - ), +{ + newAlloyClusterAlertsGroup(enableK8sCluster=true):: + alert.newGroup( + 'alloy_clustering', + [ + // Cluster not converging. + alert.newRule( + 'ClusterNotConverging', + if enableK8sCluster then + 'stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) != 0' + else + 'stddev by (job) (sum without (state) (cluster_node_peers)) != 0', + 'Cluster is not converging: nodes report different number of peers in the cluster.', + '10m', + ), - alert.newRule( - 'ClusterNodeCountMismatch', - // Assert that the number of known peers (regardless of state) reported by each - // Alloy instance matches the number of running Alloy instances in the - // same cluster and namespace as reported by a count of Prometheus - // metrics. - ||| - sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) - |||, - 'Nodes report different number of peers vs. the count of observed Alloy metrics. 
Some Alloy metrics may be missing or the cluster is in a split brain state.', - '15m', - ), + alert.newRule( + 'ClusterNodeCountMismatch', + // Assert that the number of known peers (regardless of state) reported by each + // Alloy instance matches the number of running Alloy instances in the + // same cluster and namespace as reported by a count of Prometheus + // metrics. + if enableK8sCluster then + ||| + sum without (state) (cluster_node_peers) != + on (cluster, namespace, job) group_left + count by (cluster, namespace, job) (cluster_node_info) + ||| + else + ||| + sum without (state) (cluster_node_peers) != + on (job) group_left + count by (job) (cluster_node_info) + ||| + , + 'Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state.', + '15m', + ), - // Nodes health score is not zero. - alert.newRule( - 'ClusterNodeUnhealthy', - ||| - cluster_node_gossip_health_score > 0 - |||, - 'Cluster node is reporting a gossip protocol health score > 0.', - '10m', - ), + // Nodes health score is not zero. + alert.newRule( + 'ClusterNodeUnhealthy', + ||| + cluster_node_gossip_health_score > 0 + |||, + 'Cluster node is reporting a gossip protocol health score > 0.', + '10m', + ), - // Node tried to join the cluster with an already-present node name. - alert.newRule( - 'ClusterNodeNameConflict', - 'sum by (cluster, namespace) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0', - 'A node tried to join the cluster with a name conflicting with an existing peer.', - '10m', - ), + // Node tried to join the cluster with an already-present node name. 
+ alert.newRule( + 'ClusterNodeNameConflict', + if enableK8sCluster then + 'sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0' + else + 'sum by (job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0' + , + 'A node tried to join the cluster with a name conflicting with an existing peer.', + '10m', + ), - // Node stuck in Terminating state. - alert.newRule( - 'ClusterNodeStuckTerminating', - 'sum by (cluster, namespace, instance) (cluster_node_peers{state="terminating"}) > 0', - 'Cluster node stuck in Terminating state.', - '10m', - ), + // Node stuck in Terminating state. + alert.newRule( + 'ClusterNodeStuckTerminating', + if enableK8sCluster then + 'sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) > 0' + else + 'sum by (job, instance) (cluster_node_peers{state="terminating"}) > 0' + , + 'Cluster node stuck in Terminating state.', + '10m', + ), - // Nodes are not using the same configuration file. - alert.newRule( - 'ClusterConfigurationDrift', - ||| - count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace) cluster_node_info) - ) > 1 - |||, - 'Cluster nodes are not using the same configuration file.', - '5m', - ), - ] -) + // Nodes are not using the same configuration file. 
+ alert.newRule( + 'ClusterConfigurationDrift', + if enableK8sCluster then + ||| + count without (sha256) ( + max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + ) > 1 + ||| + else + ||| + count without (sha256) ( + max by (sha256, job) (alloy_config_hash and on(job) cluster_node_info) + ) > 1 + ||| + , + 'Cluster nodes are not using the same configuration file.', + '5m', + ), + ] + ) +} diff --git a/operations/alloy-mixin/alerts/controller.libsonnet b/operations/alloy-mixin/alerts/controller.libsonnet index 2d43680b84..175c229d11 100644 --- a/operations/alloy-mixin/alerts/controller.libsonnet +++ b/operations/alloy-mixin/alerts/controller.libsonnet @@ -1,22 +1,33 @@ local alert = import './utils/alert.jsonnet'; -alert.newGroup( - 'alloy_controller', - [ - // Component evaluations are taking too long, which can lead to e.g. stale targets. - alert.newRule( - 'SlowComponentEvaluations', - 'sum by (cluster, namespace, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0', - 'Component evaluations are taking too long.', - '15m', - ), +{ + newControllerAlertsGroup(enableK8sCluster=true): + alert.newGroup( + 'alloy_controller', + [ + // Component evaluations are taking too long, which can lead to e.g. stale targets. + alert.newRule( + 'SlowComponentEvaluations', + if enableK8sCluster then + 'sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0' + else + 'sum by (job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0' + , + 'Component evaluations are taking too long.', + '15m', + ), - // Unhealthy components detected. - alert.newRule( - 'UnhealthyComponents', - 'sum by (cluster, namespace) (alloy_component_controller_running_components{health_type!="healthy"}) > 0', - 'Unhealthy components detected.', - '15m', - ), - ] -) + // Unhealthy components detected. 
+ alert.newRule( + 'UnhealthyComponents', + if enableK8sCluster then + 'sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0' + else + 'sum by (job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0' + , + 'Unhealthy components detected.', + '15m', + ), + ] + ) +} diff --git a/operations/alloy-mixin/alerts/opentelemetry.libsonnet b/operations/alloy-mixin/alerts/opentelemetry.libsonnet index e611545a18..23d23c3ad3 100644 --- a/operations/alloy-mixin/alerts/opentelemetry.libsonnet +++ b/operations/alloy-mixin/alerts/opentelemetry.libsonnet @@ -1,25 +1,36 @@ local alert = import './utils/alert.jsonnet'; -alert.newGroup( - 'alloy_otelcol', - [ - // An otelcol.exporter component rcould not push some spans to the pipeline. - // This could be due to reaching a limit such as the ones - // imposed by otelcol.processor.memory_limiter. - alert.newRule( - 'OtelcolReceiverRefusedSpans', - 'sum by (cluster, namespace) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0', - 'The receiver could not push some spans to the pipeline.', - '5m', - ), +{ + newOpenTelemetryAlertsGroup(enableK8sCluster=true): + alert.newGroup( + 'alloy_otelcol', + [ + // An otelcol.receiver component could not push some spans to the pipeline. + // This could be due to reaching a limit such as the ones + // imposed by otelcol.processor.memory_limiter. + alert.newRule( + 'OtelcolReceiverRefusedSpans', + if enableK8sCluster then + 'sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0' + else + 'sum by (job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0' + , + 'The receiver could not push some spans to the pipeline.', + '5m', + ), - // The exporter failed to send spans to their destination. - // There could be an issue with the payload or with the destination endpoint.
- alert.newRule( - 'OtelcolExporterFailedSpans', - 'sum by (cluster, namespace) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0', - 'The exporter failed to send spans to their destination.', - '5m', - ), - ] -) + // The exporter failed to send spans to their destination. + // There could be an issue with the payload or with the destination endpoint. + alert.newRule( + 'OtelcolExporterFailedSpans', + if enableK8sCluster then + 'sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0' + else + 'sum by (job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0' + , + 'The exporter failed to send spans to their destination.', + '5m', + ), + ] + ) +} From 25eaee8114210c2dba0534f636d03ed293ef51b0 Mon Sep 17 00:00:00 2001 From: Gabriel Antunes Date: Thu, 23 May 2024 16:38:12 -0300 Subject: [PATCH 4/6] using jsonnet string formatting instead of concatenation --- operations/alloy-mixin/config.libsonnet | 4 +- operations/alloy-mixin/dashboards.libsonnet | 13 +- .../dashboards/alloy-logs.libsonnet | 6 +- .../dashboards/cluster-node.libsonnet | 108 +++++++++----- .../dashboards/cluster-overview.libsonnet | 34 ++--- .../dashboards/controller.libsonnet | 91 +++++++----- .../dashboards/opentelemetry.libsonnet | 40 ++++-- .../dashboards/prometheus.libsonnet | 134 ++++++++++-------- .../dashboards/resources.libsonnet | 52 +++---- 9 files changed, 285 insertions(+), 197 deletions(-) diff --git a/operations/alloy-mixin/config.libsonnet b/operations/alloy-mixin/config.libsonnet index 6e14c9e052..2d2a931b48 100644 --- a/operations/alloy-mixin/config.libsonnet +++ b/operations/alloy-mixin/config.libsonnet @@ -1,10 +1,10 @@ { _config+:: { - enableK8sCluster: true, + enableK8sCluster: false, enableAlloyCluster: true, enableLokiLogs: true, filterSelector: 'job=~"$job"', - groupSelector: if self.enableK8sCluster then self.filterSelector + ', ' + self.k8sClusterSelector else self.filterSelector, + groupSelector: if self.enableK8sCluster then 
self.k8sClusterSelector + ', ' + self.filterSelector else self.filterSelector, instanceSelector: self.groupSelector + ', instance=~"$instance"', k8sClusterSelector: 'cluster=~"$cluster", namespace=~"$namespace"', dashboardTag: 'alloy-mixin' diff --git a/operations/alloy-mixin/dashboards.libsonnet b/operations/alloy-mixin/dashboards.libsonnet index 61e4dda7c9..281c48c765 100644 --- a/operations/alloy-mixin/dashboards.libsonnet +++ b/operations/alloy-mixin/dashboards.libsonnet @@ -1,13 +1,14 @@ -local alloyClusterDashboards = - (import './dashboards/controller.libsonnet') + +local alloyClusterDashboards = (import './dashboards/cluster-node.libsonnet') + (import './dashboards/cluster-overview.libsonnet') + (import './config.libsonnet'); -local otherDashboards = (import './dashboards/resources.libsonnet') + - (import './dashboards/prometheus.libsonnet') + - (import './dashboards/opentelemetry.libsonnet') + - (import './config.libsonnet'); +local otherDashboards = + (import './dashboards/resources.libsonnet') + + (import './dashboards/controller.libsonnet') + + (import './dashboards/prometheus.libsonnet') + + (import './dashboards/opentelemetry.libsonnet') + + (import './config.libsonnet'); (import './dashboards/alloy-logs.libsonnet') + { diff --git a/operations/alloy-mixin/dashboards/alloy-logs.libsonnet b/operations/alloy-mixin/dashboards/alloy-logs.libsonnet index dc1183c6b0..e1d4e894bd 100644 --- a/operations/alloy-mixin/dashboards/alloy-logs.libsonnet +++ b/operations/alloy-mixin/dashboards/alloy-logs.libsonnet @@ -1,6 +1,10 @@ local g = import 'github.com/grafana/grafonnet/gen/grafonnet-v10.0.0/main.libsonnet'; local logsDashboard = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libsonnet'; + { + + local labels = if $._config.enableK8sCluster then ['cluster', 'namespace', 'job', 'instance', 'level'] else ['job', 'instance', 'level'], + grafanaDashboards+: if $._config.enableLokiLogs then { local alloyLogs = @@ -9,7 +13,7 @@ local logsDashboard = import 
'github.com/grafana/jsonnet-libs/logs-lib/logs/main datasourceName='loki_datasource', datasourceRegex='', filterSelector=$._config.filterSelector, - labels=['cluster', 'namespace', 'instance', 'level'], + labels=labels, formatParser=null, showLogsVolume=true ) diff --git a/operations/alloy-mixin/dashboards/cluster-node.libsonnet b/operations/alloy-mixin/dashboards/cluster-node.libsonnet index 5290cd81c2..4d9f417b80 100644 --- a/operations/alloy-mixin/dashboards/cluster-node.libsonnet +++ b/operations/alloy-mixin/dashboards/cluster-node.libsonnet @@ -6,10 +6,10 @@ local filename = 'alloy-cluster-node.json'; local templateVariables = if $._config.enableK8sCluster then [ - dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), - dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components{job=~"$job"}, cluster)'), - dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster"}, namespace)'), - dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster", namespace=~"$namespace"}, instance)'), + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster"}, namespace)'), + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace"}, job)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace", job=~"$job"}, instance)'), ] else [ @@ -53,22 +53,30 @@ local filename = 'alloy-cluster-node.json'; panel.withPosition({ x: 0, y: 1, w: 12, h: 8 }) + panel.withQueries([ 
panel.newNamedInstantQuery( - expr='sum(cluster_node_lamport_time{' + $._config.instanceSelector + '})', + expr= ||| + sum(cluster_node_lamport_time{%(instanceSelector)s}) + ||| % $._config, refId='Lamport clock time', format='table', ), panel.newNamedInstantQuery( - expr='sum(cluster_node_update_observers{' + $._config.instanceSelector + '})', + expr= ||| + sum(cluster_node_update_observers{%(instanceSelector)s}) + ||| % $._config, refId='Internal cluster state observers', format='table', ), panel.newNamedInstantQuery( - expr='sum(cluster_node_gossip_health_score{' + $._config.instanceSelector + '})', + expr= ||| + sum(cluster_node_gossip_health_score{%(instanceSelector)s}) + ||| % $._config, refId='Gossip health score', format='table', ), panel.newNamedInstantQuery( - expr='sum(cluster_node_gossip_proto_version{' + $._config.instanceSelector + '})', + expr= ||| + sum(cluster_node_gossip_proto_version{%(instanceSelector)s}) + ||| % $._config, refId='Gossip protocol version', format='table', ), @@ -104,7 +112,9 @@ local filename = 'alloy-cluster-node.json'; panel.withPosition({ x: 12, y: 1, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='rate(cluster_node_gossip_received_events_total{' + $._config.instanceSelector + '}[$__rate_interval])', + expr= ||| + rate(cluster_node_gossip_received_events_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{event}}' ), ]) @@ -118,7 +128,9 @@ local filename = 'alloy-cluster-node.json'; panel.withPosition({ x: 0, y: 9, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='sum(cluster_node_peers{' + $._config.instanceSelector + '})', + expr= ||| + sum(cluster_node_peers{%(instanceSelector)s}) + ||| % $._config, ), ]) + panel.withUnit('suffix:peers') @@ -132,7 +144,9 @@ local filename = 'alloy-cluster-node.json'; panel.withPosition({ x: 12, y: 9, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='cluster_node_peers{' + $._config.instanceSelector + '}', + expr= ||| + 
cluster_node_peers{%(instanceSelector)s} + ||| % $._config, legendFormat='{{state}}', ), ]) + @@ -154,11 +168,15 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr='rate(cluster_transport_rx_bytes_total{' + $._config.instanceSelector + '}[$__rate_interval])', + expr= ||| + rate(cluster_transport_rx_bytes_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='rx', ), panel.newQuery( - expr='-1 * rate(cluster_transport_tx_bytes_total{' + $._config.instanceSelector + '}[$__rate_interval])', + expr= ||| + -1 * rate(cluster_transport_tx_bytes_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='tx', ), ]) + @@ -176,19 +194,21 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr= - '1 - ( - rate(cluster_transport_tx_packets_failed_total{' + $._config.instanceSelector + '}[$__rate_interval]) / - rate(cluster_transport_tx_packets_total{' + $._config.instanceSelector + '}[$__rate_interval]) - )', + expr= ||| + 1 - ( + rate(cluster_transport_tx_packets_failed_total{%(instanceSelector)s}[$__rate_interval]) / + rate(cluster_transport_tx_packets_total{%(instanceSelector)s}[$__rate_interval]) + ) + ||| % $._config, legendFormat='Tx success %', ), panel.newQuery( - expr= - '1 - ( - rate(cluster_transport_rx_packets_failed_total{' + $._config.instanceSelector + '}[$__rate_interval]) / - rate(cluster_transport_rx_packets_total{' + $._config.instanceSelector + '}[$__rate_interval]) - )', + expr= ||| + 1 - ( + rate(cluster_transport_rx_packets_failed_total{%(instanceSelector)s}[$__rate_interval]) / + rate(cluster_transport_rx_packets_total{%(instanceSelector)s}[$__rate_interval]) + ) + ||| % $._config, legendFormat='Rx success %', ), ]) + @@ -210,11 +230,15 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr='cluster_transport_tx_packet_queue_length{' + $._config.instanceSelector + '}', + expr= ||| + 
cluster_transport_tx_packet_queue_length{%(instanceSelector)s} + ||| % $._config, legendFormat='tx queue', ), panel.newQuery( - expr='cluster_transport_rx_packet_queue_length{' + $._config.instanceSelector + '}', + expr= ||| + cluster_transport_rx_packet_queue_length{%(instanceSelector)s} + ||| % $._config, legendFormat='rx queue', ), ]) + @@ -231,11 +255,15 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr='rate(cluster_transport_stream_rx_bytes_total{' + $._config.instanceSelector + '}[$__rate_interval])', + expr= ||| + rate(cluster_transport_stream_rx_bytes_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='rx', ), panel.newQuery( - expr='-1 * rate(cluster_transport_stream_tx_bytes_total{' + $._config.instanceSelector + '}[$__rate_interval])', + expr= ||| + -1 * rate(cluster_transport_stream_tx_bytes_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='tx', ), ]) + @@ -253,19 +281,21 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr= - '1 - ( - rate(cluster_transport_stream_tx_packets_failed_total{' + $._config.instanceSelector + '}[$__rate_interval]) / - rate(cluster_transport_stream_tx_packets_total{' + $._config.instanceSelector + '}[$__rate_interval]) - )', + expr= ||| + 1 - ( + rate(cluster_transport_stream_tx_packets_failed_total{%(instanceSelector)s}[$__rate_interval]) / + rate(cluster_transport_stream_tx_packets_total{%(instanceSelector)s}[$__rate_interval]) + ) + ||| % $._config, legendFormat='Tx success %' ), panel.newQuery( - expr= - '1 - ( - rate(cluster_transport_stream_rx_packets_failed_total{' + $._config.instanceSelector + '}[$__rate_interval]) / - rate(cluster_transport_stream_rx_packets_total{' + $._config.instanceSelector + '}[$__rate_interval]) - )', + expr= ||| + 1 - ( + rate(cluster_transport_stream_rx_packets_failed_total{%(instanceSelector)s}[$__rate_interval]) / + 
rate(cluster_transport_stream_rx_packets_total{%(instanceSelector)s}[$__rate_interval]) + ) + ||| % $._config, legendFormat='Rx success %' ), ]) + @@ -287,7 +317,9 @@ local filename = 'alloy-cluster-node.json'; }) + panel.withQueries([ panel.newQuery( - expr='cluster_transport_streams{' + $._config.instanceSelector + '}', + expr= ||| + cluster_transport_streams{%(instanceSelector)s} + ||| % $._config, legendFormat='Open streams' ), ]) diff --git a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet index 2cbeefdaf1..361eb6f93e 100644 --- a/operations/alloy-mixin/dashboards/cluster-overview.libsonnet +++ b/operations/alloy-mixin/dashboards/cluster-overview.libsonnet @@ -7,9 +7,9 @@ local cluster_node_filename = 'alloy-cluster-node.json'; local templateVariables = if $._config.enableK8sCluster then [ - dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), - dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components{job=~"$job"}, cluster)'), - dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster"}, namespace)'), + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster"}, namespace)'), + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace"}, job)'), ] else [ @@ -36,7 +36,7 @@ local cluster_node_filename = 'alloy-cluster-node.json'; panel.withPosition({ h: 9, w: 8, x: 0, y: 0 }) + panel.withQueries([ panel.newInstantQuery( - expr='count(cluster_node_info{' + $._config.groupSelector + '})' + expr='count(cluster_node_info{%(groupSelector)s})' ), ]) ), 
@@ -49,7 +49,7 @@ local cluster_node_filename = 'alloy-cluster-node.json'; panel.withPosition({ h: 9, w: 16, x: 8, y: 0 }) + panel.withQueries([ panel.newInstantQuery( - expr='cluster_node_info{' + $._config.groupSelector + '}', + expr='cluster_node_info{%(groupSelector)s}' % $._config, format='table', ), ]) + @@ -127,14 +127,14 @@ local cluster_node_filename = 'alloy-cluster-node.json'; panel.withPosition({ h: 9, w: 8, x: 0, y: 9 }) + panel.withQueries([ panel.newInstantQuery( - expr= - 'clamp(( - sum(stddev by (state) (cluster_node_peers{' + $._config.groupSelector + '}) != 0) or - (sum(abs(sum without (state) (cluster_node_peers{' + $._config.groupSelector + '})) - scalar(count(cluster_node_info{' + $._config.groupSelector + '})) != 0)) + expr= ||| + clamp(( + sum(stddev by (state) (cluster_node_peers{%(groupSelector)s}) != 0) or + (sum(abs(sum without (state) (cluster_node_peers{%(groupSelector)s})) - scalar(count(cluster_node_info{%(groupSelector)s})) != 0)) ), 1, 1 - )' - , + ) + ||| % $._config, format='time_series' ), ]) + @@ -196,14 +196,14 @@ local cluster_node_filename = 'alloy-cluster-node.json'; panel.withPosition({ h: 9, w: 16, x: 8, y: 9 }) + panel.withQueries([ panel.newQuery( - expr= - 'ceil(clamp(( - sum(stddev by (state) (cluster_node_peers{' + $._config.groupSelector + '})) or - (sum(abs(sum without (state) (cluster_node_peers{' + $._config.groupSelector + '})) - scalar(count(cluster_node_info{' + $._config.groupSelector + '})))) + expr= ||| + ceil(clamp(( + sum(stddev by (state) (cluster_node_peers{%(groupSelector)s})) or + (sum(abs(sum without (state) (cluster_node_peers{%(groupSelector)s})) - scalar(count(cluster_node_info{%(groupSelector)s})))) ), 0, 1 - ))' - , + )) + ||| % $._config, legendFormat='Converged' ), ]) + diff --git a/operations/alloy-mixin/dashboards/controller.libsonnet b/operations/alloy-mixin/dashboards/controller.libsonnet index 56412a6cb1..aa5b4ce357 100644 --- a/operations/alloy-mixin/dashboards/controller.libsonnet +++ 
b/operations/alloy-mixin/dashboards/controller.libsonnet @@ -7,9 +7,9 @@ local filename = 'alloy-controller.json'; local templateVariables = if $._config.enableK8sCluster then [ - dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), - dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components{job=~"$job"}, cluster)'), - dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster"}, namespace)'), + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster"}, namespace)'), + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace"}, job)'), ] else [ @@ -40,7 +40,9 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 0, y: 0, w: 10, h: 4 }) + panel.withQueries([ panel.newQuery( - expr='count(alloy_component_controller_evaluating{' + $._config.groupSelector + '})', + expr= ||| + count(alloy_component_controller_evaluating{%(groupSelector)s}) + ||| % $._config, ), ]) ), @@ -55,7 +57,9 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 0, y: 4, w: 10, h: 4 }) + panel.withQueries([ panel.newQuery( - expr='sum(alloy_component_controller_running_components{' + $._config.groupSelector + '})', + expr= ||| + sum(alloy_component_controller_running_components{%(groupSelector)s}) + ||| % $._config, ), ]) ), @@ -78,9 +82,10 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 0, y: 8, w: 10, h: 4 }) + panel.withQueries([ panel.newQuery( - expr= - 'sum(alloy_component_controller_running_components{' + $._config.groupSelector + ',health_type="healthy"}) / - 
sum(alloy_component_controller_running_components{' + $._config.groupSelector + '})', + expr= ||| + sum(alloy_component_controller_running_components{%(groupSelector)s,health_type="healthy"}) / + sum(alloy_component_controller_running_components{%(groupSelector)s}) + ||| % $._config, ), ]) ), @@ -162,19 +167,27 @@ local filename = 'alloy-controller.json'; panel.withQueries([ panel.newInstantQuery( legendFormat='Healthy', - expr='sum(alloy_component_controller_running_components{' + $._config.groupSelector + ', health_type="healthy"}) or vector(0)', + expr= ||| + sum(alloy_component_controller_running_components{%(groupSelector)s, health_type="healthy"}) or vector(0) + ||| % $._config, ), panel.newInstantQuery( legendFormat='Unhealthy', - expr='sum(alloy_component_controller_running_components{' + $._config.groupSelector + ', health_type="unhealthy"}) or vector(0)', + expr= ||| + sum(alloy_component_controller_running_components{%(groupSelector)s, health_type="unhealthy"}) or vector(0) + ||| % $._config, ), panel.newInstantQuery( legendFormat='Unknown', - expr='sum(alloy_component_controller_running_components{' + $._config.groupSelector + ', health_type="unknown"}) or vector(0)', + expr= ||| + sum(alloy_component_controller_running_components{%(groupSelector)s, health_type="unknown"}) or vector(0) + ||| % $._config, ), panel.newInstantQuery( legendFormat='Exited', - expr='sum(alloy_component_controller_running_components{' + $._config.groupSelector + ', health_type="exited"}) or vector(0)', + expr= ||| + sum(alloy_component_controller_running_components{%(groupSelector)s, health_type="exited"}) or vector(0) + ||| % $._config, ), ]) ), @@ -199,7 +212,9 @@ local filename = 'alloy-controller.json'; panel.withMultiTooltip() + panel.withQueries([ panel.newQuery( - expr='sum by (instance) (rate(alloy_component_evaluation_seconds_count{' + $._config.groupSelector + '}[$__rate_interval]))', + expr= ||| + sum by (instance) 
(rate(alloy_component_evaluation_seconds_count{%(groupSelector)s}[$__rate_interval])) + ||| % $._config, ), ]) ), @@ -223,30 +238,33 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 8, y: 12, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr= - 'histogram_quantile(0.99, sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval]))) + expr= ||| + histogram_quantile(0.99, sum(rate(alloy_component_evaluation_seconds{%(groupSelector)s}[$__rate_interval]))) or - histogram_quantile(0.99, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval])))', + histogram_quantile(0.99, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{%(groupSelector)s}[$__rate_interval]))) + ||| % $._config, legendFormat='99th percentile', ), panel.newQuery( - expr= - 'histogram_quantile(0.50, sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval]))) + expr= ||| + histogram_quantile(0.50, sum(rate(alloy_component_evaluation_seconds{%(groupSelector)s}[$__rate_interval]))) or - histogram_quantile(0.50, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval])))', + histogram_quantile(0.50, sum by (le) (rate(alloy_component_evaluation_seconds_bucket{%(groupSelector)s}[$__rate_interval]))) + ||| % $._config, legendFormat='50th percentile', ), panel.newQuery( - expr= - '( - histogram_sum(sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval]))) / - histogram_count(sum(rate(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval]))) + expr= ||| + ( + histogram_sum(sum(rate(alloy_component_evaluation_seconds{%(groupSelector)s}[$__rate_interval]))) / + histogram_count(sum(rate(alloy_component_evaluation_seconds{%(groupSelector)s}[$__rate_interval]))) ) or ( - sum(rate(alloy_component_evaluation_seconds_sum{' + 
$._config.groupSelector + '}[$__rate_interval])) / - sum(rate(alloy_component_evaluation_seconds_count{' + $._config.groupSelector + '}[$__rate_interval])) - )', + sum(rate(alloy_component_evaluation_seconds_sum{%(groupSelector)s}[$__rate_interval])) / + sum(rate(alloy_component_evaluation_seconds_count{%(groupSelector)s}[$__rate_interval])) + ) + ||| % $._config, legendFormat='Average', ), ]) @@ -265,9 +283,10 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 16, y: 12, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr= - 'sum by (component_path, component_id) (rate(alloy_component_evaluation_slow_seconds{' + $._config.groupSelector + '}[$__rate_interval])) - / scalar(sum(rate(alloy_component_evaluation_seconds_sum{' + $._config.groupSelector + '}[$__rate_interval])))', + expr= ||| + sum by (component_path, component_id) (rate(alloy_component_evaluation_slow_seconds{%(groupSelector)s}[$__rate_interval])) + / scalar(sum(rate(alloy_component_evaluation_seconds_sum{%(groupSelector)s}[$__rate_interval]))) + ||| % $._config, legendFormat='{{component path}} {{component_id}}', ), ]) @@ -287,10 +306,11 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 0, y: 22, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr= - 'sum(increase(alloy_component_evaluation_seconds{' + $._config.groupSelector + '}[$__rate_interval])) + expr= ||| + sum(increase(alloy_component_evaluation_seconds{%(groupSelector)s}[$__rate_interval])) or ignoring (le) - sum by (le) (increase(alloy_component_evaluation_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval]))', + sum by (le) (increase(alloy_component_evaluation_seconds_bucket{%(groupSelector)s}[$__rate_interval])) + ||| % $._config, format='heatmap', legendFormat='{{le}}', ), @@ -311,10 +331,11 @@ local filename = 'alloy-controller.json'; panel.withPosition({ x: 8, y: 22, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr= - 
'sum(increase(alloy_component_dependencies_wait_seconds{' + $._config.groupSelector + '}[$__rate_interval])) + expr= ||| + sum(increase(alloy_component_dependencies_wait_seconds{%(groupSelector)s}[$__rate_interval])) or ignoring (le) - sum by (le) (increase(alloy_component_dependencies_wait_seconds_bucket{' + $._config.groupSelector + '}[$__rate_interval]))', + sum by (le) (increase(alloy_component_dependencies_wait_seconds_bucket{%(groupSelector)s}[$__rate_interval])) + ||| % $._config, format='heatmap', legendFormat='{{le}}', ), diff --git a/operations/alloy-mixin/dashboards/opentelemetry.libsonnet b/operations/alloy-mixin/dashboards/opentelemetry.libsonnet index 05836db590..c78d6af468 100644 --- a/operations/alloy-mixin/dashboards/opentelemetry.libsonnet +++ b/operations/alloy-mixin/dashboards/opentelemetry.libsonnet @@ -18,10 +18,10 @@ local stackedPanelMixin = { local templateVariables = if $._config.enableK8sCluster then [ - dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), - dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components{job=~"$job"}, cluster)'), - dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster"}, namespace)'), - dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster", namespace=~"$namespace"}, instance)'), + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster"}, namespace)'), + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace"}, job)'), + dashboard.newMultiTemplateVariable('instance', 
'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace", job=~"$job"}, instance)'), ] else [ @@ -49,7 +49,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 0, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr='rate(receiver_accepted_spans_ratio_total{' + $._config.instanceSelector + '}[$__rate_interval])', + expr= ||| + rate(receiver_accepted_spans_ratio_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, //TODO: How will the dashboard look if there is more than one receiver component? The legend is not unique enough? legendFormat='{{ pod }} / {{ transport }}', ), @@ -65,7 +67,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 8, y: 0, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr='rate(receiver_refused_spans_ratio_total{' + $._config.instanceSelector + '}[$__rate_interval])', + expr= ||| + rate(receiver_refused_spans_ratio_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{ pod }} / {{ transport }}', ), ]) @@ -78,7 +82,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 16, y: 0, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr='sum by (le) (increase(rpc_server_duration_milliseconds_bucket{' + $._config.instanceSelector + ', rpc_service="opentelemetry.proto.collector.trace.v1.TraceService"}[$__rate_interval]))', + expr= ||| + sum by (le) (increase(rpc_server_duration_milliseconds_bucket{%(instanceSelector)s, rpc_service="opentelemetry.proto.collector.trace.v1.TraceService"}[$__rate_interval])) + ||| % $._config, format='heatmap', legendFormat='{{le}}', ), @@ -99,7 +105,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 10, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr='sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{' + $._config.instanceSelector + '}[$__rate_interval]))', + expr= ||| + sum by (le) 
(increase(processor_batch_batch_send_size_ratio_bucket{%(instanceSelector)s}[$__rate_interval])) + ||| % $._config, format='heatmap', legendFormat='{{le}}', ), @@ -116,7 +124,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 8, y: 10, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr='processor_batch_metadata_cardinality_ratio{' + $._config.instanceSelector + '}', + expr= ||| + processor_batch_metadata_cardinality_ratio{%(instanceSelector)s} + ||| % $._config, legendFormat='{{ pod }}', ), ]) @@ -129,7 +139,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 16, y: 10, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr='rate(processor_batch_timeout_trigger_send_ratio_total{' + $._config.instanceSelector + '}[$__rate_interval])', + expr= ||| + rate(processor_batch_timeout_trigger_send_ratio_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{ pod }}', ), ]) @@ -149,7 +161,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 20, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr='rate(exporter_sent_spans_ratio_total{' + $._config.instanceSelector + '}[$__rate_interval])', + expr= ||| + rate(exporter_sent_spans_ratio_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{ pod }}', ), ]) @@ -163,7 +177,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 8, y: 20, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr='rate(exporter_send_failed_spans_ratio_total{' + $._config.instanceSelector + '}[$__rate_interval])', + expr= ||| + rate(exporter_send_failed_spans_ratio_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{ pod }}', ), ]) diff --git a/operations/alloy-mixin/dashboards/prometheus.libsonnet b/operations/alloy-mixin/dashboards/prometheus.libsonnet index f8ecdc4ac3..b023dcc264 100644 --- a/operations/alloy-mixin/dashboards/prometheus.libsonnet +++ b/operations/alloy-mixin/dashboards/prometheus.libsonnet @@ -105,12 
+105,13 @@ local filename = 'alloy-prometheus-remote-write.json'; panel.withPosition({ x: 0, y: 1 + y_offset, w: 6, h: 10 }) + panel.withQueries([ panel.newQuery( - expr= - 'sum by (instance, component_path, component_id) ( - prometheus_remote_storage_highest_timestamp_in_seconds{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component"} + expr= ||| + sum by (instance, component_path, component_id) ( + prometheus_remote_storage_highest_timestamp_in_seconds{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component"} - ignoring(url, remote_name) group_right(instance) - prometheus_remote_storage_queue_highest_sent_timestamp_seconds{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"} - )', + prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"} + ) + ||| % $._config, legendFormat='{{instance}} / {{component_path}} {{component_id}}', ), ]) @@ -128,11 +129,12 @@ local filename = 'alloy-prometheus-remote-write.json'; panel.withPosition({ x: 6, y: 1 + y_offset, w: 6, h: 10 }) + panel.withQueries([ panel.newQuery( - expr= - 'sum without (remote_name, url) ( - rate(prometheus_remote_storage_bytes_total{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + - rate(prometheus_remote_storage_metadata_bytes_total{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - )', + expr= ||| + sum without (remote_name, url) ( + rate(prometheus_remote_storage_bytes_total{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + + rate(prometheus_remote_storage_metadata_bytes_total{%(instanceSelector)s, component_path=~"$component_path", 
component_id=~"$component", url=~"$url"}[$__rate_interval]) + ) + ||| % $._config, legendFormat='{{instance}} / {{component_path}} {{component_id}}', ), ]) @@ -149,24 +151,26 @@ local filename = 'alloy-prometheus-remote-write.json'; panel.withPosition({ x: 12, y: 1 + y_offset, w: 6, h: 10 }) + panel.withQueries([ panel.newQuery( - expr= - 'histogram_quantile(0.99, sum by (le) ( - rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - ))', + expr= ||| + histogram_quantile(0.99, sum by (le) ( + rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + )) + ||| % $._config, legendFormat='99th percentile', ), panel.newQuery( - expr= - 'histogram_quantile(0.50, sum by (le) ( - rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - ))', + expr= ||| + histogram_quantile(0.50, sum by (le) ( + rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + )) + ||| % $._config, legendFormat='50th percentile', ), panel.newQuery( - expr= - 'sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) / - sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component"}[$__rate_interval]))' - , + expr= ||| + sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{%(instanceSelector)s, 
component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) / + sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component"}[$__rate_interval])) + ||| % $._config, legendFormat='Average', ), ]) @@ -218,24 +222,27 @@ local filename = 'alloy-prometheus-remote-write.json'; panel.withPosition({ x: 18, y: 1 + y_offset, w: 6, h: 10 }) + panel.withQueries([ panel.newQuery( - expr= - 'sum without (remote_name, url) ( - prometheus_remote_storage_shards{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"} - )', + expr= ||| + sum without (remote_name, url) ( + prometheus_remote_storage_shards{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"} + ) + ||| % $._config, legendFormat='{{instance}} / {{component_path}} {{component_id}}', ), panel.newQuery( - expr= - 'min ( - prometheus_remote_storage_shards_min{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"} - )', + expr= ||| + min ( + prometheus_remote_storage_shards_min{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"} + ) + ||| % $._config, legendFormat='Minimum', ), panel.newQuery( - expr= - 'max ( - prometheus_remote_storage_shards_max{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"} - )', + expr= ||| + max ( + prometheus_remote_storage_shards_max{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"} + ) + ||| % $._config, legendFormat='Maximum', ), ]) @@ -252,10 +259,11 @@ local filename = 'alloy-prometheus-remote-write.json'; panel.withPosition({ x: 0, y: 11 + y_offset, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr= - 'sum without (url, remote_name) ( - 
rate(prometheus_remote_storage_samples_total{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - )', + expr= ||| + sum without (url, remote_name) ( + rate(prometheus_remote_storage_samples_total{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + ) + ||| % $._config, legendFormat='{{instance}} / {{component_path}} {{component_id}}', ), ]) @@ -273,10 +281,11 @@ local filename = 'alloy-prometheus-remote-write.json'; panel.withPosition({ x: 8, y: 11 + y_offset, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr= - 'sum without (url,remote_name) ( - rate(prometheus_remote_storage_samples_failed_total{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - )', + expr= ||| + sum without (url,remote_name) ( + rate(prometheus_remote_storage_samples_failed_total{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + ) + ||| % $._config, legendFormat='{{instance}} / {{component_path}} {{component_id}}', ), ]) @@ -294,10 +303,11 @@ local filename = 'alloy-prometheus-remote-write.json'; panel.withPosition({ x: 16, y: 11 + y_offset, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr= - 'sum without (url,remote_name) ( - rate(prometheus_remote_storage_samples_retried_total{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) - )', + expr= ||| + sum without (url,remote_name) ( + rate(prometheus_remote_storage_samples_retried_total{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}[$__rate_interval]) + ) + ||| % $._config, legendFormat='{{instance}} / {{component_path}} {{component_id}}', ), ]) @@ -323,9 +333,9 @@ local filename = 
'alloy-prometheus-remote-write.json'; panel.withPosition({ x: 0, y: 21 + y_offset, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr= - 'sum(prometheus_remote_write_wal_storage_active_series{' + $._config.instanceSelector + ', component_path=~"$component_path", component_id=~"$component", url=~"$url"})' - , + expr= ||| + sum(prometheus_remote_write_wal_storage_active_series{%(instanceSelector)s, component_path=~"$component_path", component_id=~"$component", url=~"$url"}) + ||| % $._config, legendFormat='Series', ), ]) @@ -346,9 +356,9 @@ local filename = 'alloy-prometheus-remote-write.json'; panel.withPosition({ x: 8, y: 21 + y_offset, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr= - 'prometheus_remote_write_wal_storage_active_series{' + $._config.instanceSelector + ', component_id!="", component_path=~"$component_path", component_id=~"$component", url=~"$url"}' - , + expr= ||| + prometheus_remote_write_wal_storage_active_series{%(instanceSelector)s, component_id!="", component_path=~"$component_path", component_id=~"$component", url=~"$url"} + ||| % $._config, legendFormat='{{instance}} / {{component_path}} {{component_id}}', ), ]) @@ -369,9 +379,9 @@ local filename = 'alloy-prometheus-remote-write.json'; panel.withPosition({ x: 16, y: 21 + y_offset, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( - expr= - 'sum by (component_path, component_id) (prometheus_remote_write_wal_storage_active_series{' + $._config.instanceSelector + ', component_id!="", component_path=~"$component_path", component_id=~"$component", url=~"$url"})' - , + expr= ||| + sum by (component_path, component_id) (prometheus_remote_write_wal_storage_active_series{%(instanceSelector)s, component_id!="", component_path=~"$component_path", component_id=~"$component", url=~"$url"}) + ||| % $._config, legendFormat='{{component_path}} {{component_id}}', ), ]) @@ -390,13 +400,13 @@ local filename = 'alloy-prometheus-remote-write.json'; local templateVariables = if 
$._config.enableK8sCluster then [ - dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), - dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components{job=~"$job"}, cluster)'), - dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster"}, namespace)'), - dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster", namespace=~"$namespace"}, instance)'), - dashboard.newMultiTemplateVariable('component_path', 'label_values(prometheus_remote_write_wal_samples_appended_total{job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance", component_id=~"prometheus.remote_write.*", component_path=~".*"}, component_path)'), - dashboard.newMultiTemplateVariable('component', 'label_values(prometheus_remote_write_wal_samples_appended_total{job=~"$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance", component_id=~"prometheus.remote_write.*"}, component_id)'), - dashboard.newMultiTemplateVariable('url', 'label_values(prometheus_remote_storage_sent_batch_duration_seconds_sum{job="$job", cluster=~"$cluster", namespace=~"$namespace", instance=~"$instance", component_id=~"$component"}, url)'), + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster"}, namespace)'), + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace"}, job)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace", job=~"$job"}, instance)'), + 
dashboard.newMultiTemplateVariable('component_path', 'label_values(prometheus_remote_write_wal_samples_appended_total{cluster=~"$cluster", namespace=~"$namespace", job=~"$job", instance=~"$instance", component_id=~"prometheus.remote_write.*", component_path=~".*"}, component_path)'), + dashboard.newMultiTemplateVariable('component', 'label_values(prometheus_remote_write_wal_samples_appended_total{cluster=~"$cluster", namespace=~"$namespace", job=~"$job", instance=~"$instance", component_id=~"prometheus.remote_write.*"}, component_id)'), + dashboard.newMultiTemplateVariable('url', 'label_values(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~"$cluster", namespace=~"$namespace", job=~"$job", instance=~"$instance", component_id=~"$component"}, url)'), ] else [ diff --git a/operations/alloy-mixin/dashboards/resources.libsonnet b/operations/alloy-mixin/dashboards/resources.libsonnet index ef0118039f..7aba016d98 100644 --- a/operations/alloy-mixin/dashboards/resources.libsonnet +++ b/operations/alloy-mixin/dashboards/resources.libsonnet @@ -30,10 +30,10 @@ local stackedPanelMixin = { local templateVariables = if $._config.enableK8sCluster then [ - dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components, job)'), - dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components{job=~"$job"}, cluster)'), - dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster"}, namespace)'), - dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{job=~"$job", cluster=~"$cluster", namespace=~"$namespace"}, instance)'), + dashboard.newTemplateVariable('cluster', 'label_values(alloy_component_controller_running_components, cluster)'), + dashboard.newTemplateVariable('namespace', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster"}, 
namespace)'), + dashboard.newMultiTemplateVariable('job', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace"}, job)'), + dashboard.newMultiTemplateVariable('instance', 'label_values(alloy_component_controller_running_components{cluster=~"$cluster", namespace=~"$namespace", job=~"$job"}, instance)'), ] else [ @@ -63,7 +63,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 0, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='rate(alloy_resources_process_cpu_seconds_total{' + $._config.instanceSelector + '}[$__rate_interval])', + expr= ||| + rate(alloy_resources_process_cpu_seconds_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{instance}}' ), ]) @@ -79,7 +81,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 12, y: 0, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr='alloy_resources_process_resident_memory_bytes{' + $._config.instanceSelector + '}', + expr= ||| + alloy_resources_process_resident_memory_bytes{%(instanceSelector)s} + ||| % $._config, legendFormat='{{instance}}' ), ]) @@ -99,11 +103,11 @@ local stackedPanelMixin = { // Lots of programs export go_goroutines so we ignore anything that // doesn't also have an Alloy-specific metric (i.e., // alloy_build_info). - expr= - 'rate(go_gc_duration_seconds_count{' + $._config.instanceSelector + '}[5m]) + expr= ||| + rate(go_gc_duration_seconds_count{%(instanceSelector)s}[5m]) and on(instance) - alloy_build_info{' + $._config.instanceSelector + '}' - , + alloy_build_info{%(instanceSelector)s} + ||| % $._config, legendFormat='{{instance}}' ), ]) @@ -123,11 +127,11 @@ local stackedPanelMixin = { // Lots of programs export go_goroutines so we ignore anything that // doesn't also have an Alloy-specific metric (i.e., // alloy_build_info). 
- expr= - 'go_goroutines{' + $._config.instanceSelector + '} + expr= ||| + go_goroutines{%(instanceSelector)s} and on(instance) - alloy_build_info{' + $._config.instanceSelector + '}' - , + alloy_build_info{%(instanceSelector)s} + ||| % $._config, legendFormat='{{instance}}' ), ]) @@ -146,11 +150,11 @@ local stackedPanelMixin = { // Lots of programs export go_memstats_heap_inuse_bytes so we ignore // anything that doesn't also have an Alloy-specific metric // (i.e., alloy_build_info). - expr= - 'go_memstats_heap_inuse_bytes{' + $._config.instanceSelector + '} + expr= ||| + go_memstats_heap_inuse_bytes{%(instanceSelector)s} and on(instance) - alloy_build_info{' + $._config.instanceSelector + '}' - , + alloy_build_info{%(instanceSelector)s} + ||| % $._config, legendFormat='{{instance}}' ), ]) @@ -171,9 +175,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 0, y: 16, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr= - 'rate(alloy_resources_machine_rx_bytes_total{' + $._config.instanceSelector + '}[$__rate_interval])' - , + expr= ||| + rate(alloy_resources_machine_rx_bytes_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{instance}}' ), ]) @@ -194,9 +198,9 @@ local stackedPanelMixin = { panel.withPosition({ x: 12, y: 16, w: 12, h: 8 }) + panel.withQueries([ panel.newQuery( - expr= - 'rate(alloy_resources_machine_tx_bytes_total{' + $._config.instanceSelector + '}[$__rate_interval])' - , + expr= ||| + rate(alloy_resources_machine_tx_bytes_total{%(instanceSelector)s}[$__rate_interval]) + ||| % $._config, legendFormat='{{instance}}' ), ]) From aa32b9274688c132d3fda312ef98cba7597ed784 Mon Sep 17 00:00:00 2001 From: Gabriel Antunes Date: Thu, 23 May 2024 17:32:29 -0300 Subject: [PATCH 5/6] remove all option from single selection variables --- operations/alloy-mixin/config.libsonnet | 2 +- operations/alloy-mixin/dashboards/utils/dashboard.jsonnet | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/operations/alloy-mixin/config.libsonnet b/operations/alloy-mixin/config.libsonnet index 2d2a931b48..09d7f31e09 100644 --- a/operations/alloy-mixin/config.libsonnet +++ b/operations/alloy-mixin/config.libsonnet @@ -1,6 +1,6 @@ { _config+:: { - enableK8sCluster: false, + enableK8sCluster: true, enableAlloyCluster: true, enableLokiLogs: true, filterSelector: 'job=~"$job"', diff --git a/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet b/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet index 99d5db5a3b..09135d023a 100644 --- a/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet +++ b/operations/alloy-mixin/dashboards/utils/dashboard.jsonnet @@ -76,9 +76,7 @@ }, datasource: '${datasource}', refresh: 2, - sort: 2, - allValue: '.*', - includeAll: true, + sort: 2, }, newLokiAnnotation(name, expression, color):: { @@ -93,6 +91,8 @@ newMultiTemplateVariable(name, query):: $.newTemplateVariable(name, query) { multi: true, + allValue: '.*', + includeAll: true, }, withPanelsMixin(panels):: { panels+: panels }, From 5feb10a6aa0da64242986065e9f672e7740e4075 Mon Sep 17 00:00:00 2001 From: Gabriel Antunes Date: Thu, 23 May 2024 18:03:33 -0300 Subject: [PATCH 6/6] formatting changes --- operations/alloy-mixin/alerts.libsonnet | 22 +++++++-------- .../alloy-mixin/alerts/clustering.libsonnet | 28 ++++++++----------- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/operations/alloy-mixin/alerts.libsonnet b/operations/alloy-mixin/alerts.libsonnet index f249d0d4d9..548e41febd 100644 --- a/operations/alloy-mixin/alerts.libsonnet +++ b/operations/alloy-mixin/alerts.libsonnet @@ -3,18 +3,18 @@ local controllerAlerts = (import './alerts/controller.libsonnet'); local openTelemetryAlerts = (import './alerts/opentelemetry.libsonnet'); { + local alloyClusterAlerts = [clusterAlerts.newAlloyClusterAlertsGroup($._config.enableK8sCluster)], + + local otherAlerts = [ + controllerAlerts.newControllerAlertsGroup($._config.enableK8sCluster), + 
openTelemetryAlerts.newOpenTelemetryAlertsGroup($._config.enableK8sCluster) + ], + prometheusAlerts+: { groups+: - if $._config.enableAlloyCluster then - [ - clusterAlerts.newAlloyClusterAlertsGroup($._config.enableK8sCluster), - controllerAlerts.newControllerAlertsGroup($._config.enableK8sCluster), - openTelemetryAlerts.newOpenTelemetryAlertsGroup($._config.enableK8sCluster), - ] - else - [ - controllerAlerts.newControllerAlertsGroup($._config.enableK8sCluster), - openTelemetryAlerts.newOpenTelemetryAlertsGroup($._config.enableK8sCluster) - ], + if $._config.enableAlloyCluster then + alloyClusterAlerts + otherAlerts + else + otherAlerts }, } diff --git a/operations/alloy-mixin/alerts/clustering.libsonnet b/operations/alloy-mixin/alerts/clustering.libsonnet index 005be441b7..9f2411285b 100644 --- a/operations/alloy-mixin/alerts/clustering.libsonnet +++ b/operations/alloy-mixin/alerts/clustering.libsonnet @@ -22,14 +22,11 @@ local alert = import './utils/alert.jsonnet'; // Alloy instance matches the number of running Alloy instances in the // same cluster and namespace as reported by a count of Prometheus // metrics. - if enableK8sCluster then - ||| + if enableK8sCluster then ||| sum without (state) (cluster_node_peers) != on (cluster, namespace, job) group_left count by (cluster, namespace, job) (cluster_node_info) - ||| - else - ||| + ||| else ||| sum without (state) (cluster_node_peers) != on (job) group_left count by (job) (cluster_node_info) @@ -76,18 +73,15 @@ local alert = import './utils/alert.jsonnet'; // Nodes are not using the same configuration file. 
alert.newRule( 'ClusterConfigurationDrift', - if enableK8sCluster then - ||| - count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) - ) > 1 - ||| - else - ||| - count without (sha256) ( - max by (sha256, job) (alloy_config_hash and on(job) cluster_node_info) - ) > 1 - ||| + if enableK8sCluster then ||| + count without (sha256) ( + max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + ) > 1 + ||| else ||| + count without (sha256) ( + max by (sha256, job) (alloy_config_hash and on(job) cluster_node_info) + ) > 1 + ||| , 'Cluster nodes are not using the same configuration file.', '5m',