From f675e8e9f041a6edf1a037853326ba032cb1fb88 Mon Sep 17 00:00:00 2001 From: Maxime Brunet Date: Thu, 7 Apr 2022 15:55:59 -0700 Subject: [PATCH] Follow Jsonnet monitoring-mixin structure --- .gitignore | 1 + dashboards/demo/README.md | 4 +- .../cassandra-condensed.jsonnet | 522 -------- .../dashboards-jsonnet/overview.jsonnet | 1110 ----------------- .../dashboards-jsonnet/system-metrics.jsonnet | 902 -------------- dashboards/grafana/make-dashboards.sh | 10 - make_package.sh | 5 +- mixin/README.md | 14 + mixin/config.libsonnet | 5 + mixin/dashboards.jsonnet | 6 + .../dashboards/cassandra-condensed.libsonnet | 529 ++++++++ mixin/dashboards/dashboards.libsonnet | 3 + mixin/dashboards/overview.libsonnet | 1108 ++++++++++++++++ mixin/dashboards/system-metrics.libsonnet | 908 ++++++++++++++ mixin/jsonnetfile.json | 24 + mixin/jsonnetfile.lock.json | 26 + mixin/make-dashboards.sh | 11 + mixin/mixin.libsonnet | 2 + 18 files changed, 2640 insertions(+), 2550 deletions(-) delete mode 100644 dashboards/grafana/dashboards-jsonnet/cassandra-condensed.jsonnet delete mode 100644 dashboards/grafana/dashboards-jsonnet/overview.jsonnet delete mode 100644 dashboards/grafana/dashboards-jsonnet/system-metrics.jsonnet delete mode 100755 dashboards/grafana/make-dashboards.sh create mode 100644 mixin/README.md create mode 100644 mixin/config.libsonnet create mode 100644 mixin/dashboards.jsonnet create mode 100644 mixin/dashboards/cassandra-condensed.libsonnet create mode 100644 mixin/dashboards/dashboards.libsonnet create mode 100644 mixin/dashboards/overview.libsonnet create mode 100644 mixin/dashboards/system-metrics.libsonnet create mode 100644 mixin/jsonnetfile.json create mode 100644 mixin/jsonnetfile.lock.json create mode 100755 mixin/make-dashboards.sh create mode 100644 mixin/mixin.libsonnet diff --git a/.gitignore b/.gitignore index 1af9d35..d5304a7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ venv .classpath target .vscode +vendor diff --git 
a/dashboards/demo/README.md b/dashboards/demo/README.md index e881cdb..94b2607 100644 --- a/dashboards/demo/README.md +++ b/dashboards/demo/README.md @@ -20,10 +20,10 @@ To use: 3. Open your web browser to [http://localhost:3000](http://localhost:3000) - If you want to change the jsonnet dashboards, make your changes then run: + If you want to change the jsonnet dashboards, make your changes under `mixin/dashboards/` then run: ```` - ../grafana/make-dashboards.sh + mixin/make-dashboards.sh ```` Refresh the browser to see changes. diff --git a/dashboards/grafana/dashboards-jsonnet/cassandra-condensed.jsonnet b/dashboards/grafana/dashboards-jsonnet/cassandra-condensed.jsonnet deleted file mode 100644 index 9397551..0000000 --- a/dashboards/grafana/dashboards-jsonnet/cassandra-condensed.jsonnet +++ /dev/null @@ -1,522 +0,0 @@ -local grafana = import 'grafonnet/grafana.libsonnet'; - -local dashboard = grafana.dashboard; -local row = grafana.row; -local singlestat = grafana.singlestat; -local graphpanel = grafana.graphPanel; -local text = grafana.text; -local prometheus = grafana.prometheus; -local template = grafana.template; - -local prefix = std.extVar('prefix'); - -local graphHeight = 300; -local singlestatHeight = 100; -local singlestatSpan = 1; -local graphSpan = 4; - -dashboard.new( - 'Cassandra Cluster Condensed', - description='Single pane of glass for most important Cassandra metrics', - schemaVersion=14, - refresh='30s', - time_from='now-30m', - editable=true, - tags=['os'], - style='dark' -) -.addTemplate( - template.datasource( - 'PROMETHEUS_DS', - 'prometheus', - 'Prometheus', - hide='all', - ) -) - -.addTemplate( - template.custom( - 'by', - 'cluster,dc,rack,instance', - 'cluster', - valuelabels={ - "cluster": "Cluster", - "dc" : "Datacenter", - "rack" : "Rack", - "instance" : "Host"}, - label='Group By', - ) -) -.addTemplate( - template.interval( - 'rate', - '1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d', - '5m', - label='Rate', - ) -) -.addTemplate( - 
template.new( - 'cluster', - '$PROMETHEUS_DS', - 'label_values(collectd_collectd_queue_length{}, cluster)', - label='Cluster', - refresh='time', - includeAll=true, - allValues=".*", - ) -) -.addTemplate( - template.new( - 'dc', - '$PROMETHEUS_DS', - 'label_values(collectd_collectd_queue_length{cluster=~"$cluster"}, dc)', - label='DataCenter', - refresh='time', - includeAll=true, - allValues=".*", - ) -) -.addTemplate( - template.new( - 'rack', - '$PROMETHEUS_DS', - 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc"}, rack)', - label='Rack', - refresh='time', - includeAll=true, - allValues=".*", - ) -) -.addTemplate( - template.new( - 'keyspace', - '$PROMETHEUS_DS', - 'label_values(' + prefix + '_table_read_latency_total{cluster=~"$cluster", dc=~"$dc"}, keyspace)', - label='Keyspace', - refresh='time', - includeAll=true, - allValues=".*", - ) -) -.addTemplate( - template.new( - 'table', - '$PROMETHEUS_DS', - 'label_values(' + prefix + '_table_read_latency_total{cluster=~"$cluster", dc=~"$dc", keyspace=~"$keyspace"}, table)', - label='Table', - refresh='time', - includeAll=true, - allValues=".*", - ) -) -.addTemplate( - template.new( - 'host', - '$PROMETHEUS_DS', - 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc", rack=~"$rack"}, instance)', - label='Host', - refresh='time', - includeAll=true, - allValues=".*", - ) -) -.addTemplate( - template.custom( - 'latency', - '0.999,0.99,0.98,0.95,0.90,0.75,0.50', - '0.95', - valuelabels={ - "0.999" : "P999", - "0.99" : "P99", - "0.98" : "P98", - "0.95" : "P95", - "0.90" : "P90", - "0.75" : "P75", - "0.50" : "P50" - }, - label='Percentile' - ) -) -.addRow( - row.new( - title='Cluster Overview', - height=singlestatHeight, - ) - .addPanel( - singlestat.new( - 'Nodes Up', - description="Nodes that are currently running in this time window", - format='none', - decimals=0, - datasource='$PROMETHEUS_DS', - colorValue=true, - colors=["#d44a3a", "#299c46", "#299c46"], - 
thresholds='0.1,1000', - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'count(' + prefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"} >= 0) or vector(0)' - ) - ) - ) - .addPanel( - singlestat.new( - 'Nodes Down', - description="Nodes that are currently not running in this time window", - format='none', - decimals=0, - colorValue=true, - colors=[ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"], - datasource='$PROMETHEUS_DS', - thresholds='1,2', - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'count(absent(sum(rate(' + prefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[5m])))) OR vector(0)' - ) - ) - ) - .addPanel( - singlestat.new( - 'Compactions / $rate', - description="Rate of compactions during this window", - format='none', - decimals=0, - datasource='$PROMETHEUS_DS', - sparklineShow=true, - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'sum(rate(' + prefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))' - ) - ) - ) - .addPanel( - singlestat.new( - 'CQL Requests / $rate', - description="Rate of CQL requests during this window", - format='none', - datasource='$PROMETHEUS_DS', - sparklineShow=true, - decimals=0, - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'sum(irate(dse_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))' - ) - ) - ) - .addPanel( - singlestat.new( - 'Dropped Messages / $rate', - description="Rate of Dropped requests during this window", - format='none', - datasource='$PROMETHEUS_DS', - sparklineShow=true, - thresholds="30,300", - colorValue=true, - decimals=0, - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'sum(irate(' + prefix + '_table_dropped_mutations_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))' - ) - ) - ) - 
.addPanel( - text.new( - transparent=true, - mode="html", - content='', - span=2 - ) - ) - .addPanel( - singlestat.new( - 'CQL Clients', - description="Number of connected clients during this time window", - format='none', - datasource='$PROMETHEUS_DS', - sparklineShow=true, - thresholds="100,1000", - colorValue=true, - decimals=0, - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'sum(' + prefix + '_client_connected_native_clients{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"})' - ) - ) - ) - .addPanel( - singlestat.new( - 'Timeouts / $rate', - description="Client timeouts over the last $rate", - format='none', - datasource='$PROMETHEUS_DS', - thresholds='100,300', - colorValue=true, - sparklineShow=true, - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'sum(irate(' + prefix + '_client_request_timeouts_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))', - ) - ) - ) - .addPanel( - singlestat.new( - 'Hints / $rate', - description="Hints stored over the last $rate", - format='none', - datasource='$PROMETHEUS_DS', - thresholds='1000,30000', - colorValue=true, - sparklineShow=true, - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'sum(irate(' + prefix + '_storage_hints_on_disk_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))' - ) - ) - ) - .addPanel( - singlestat.new( - 'Data Size', - description="Data", - format='bytes', - datasource='$PROMETHEUS_DS', - sparklineShow=true, - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'sum(' + prefix + '_table_live_disk_space_used_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"})' - ) - ) - ) - .addPanel( - singlestat.new( - 'GC Time / $rate', - description="Data", - format='ms', - decimals=1, - datasource='$PROMETHEUS_DS', - sparklineShow=true, - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'sum(rate(' + prefix + 
'_jvm_gc_time{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))' - ) - ) - ) -) -.addRow( - row.new( - title='Condensed Metrics', - height=graphHeight - ) - .addPanel( - graphpanel.new( - title="Requests Served / $by / $rate", - description="(no keyspace/table filters apply)", - datasource='$PROMETHEUS_DS', - span=graphSpan, - labelY2="Clients Connected", - legend_hideZero=true, - legend_hideEmpty=true - ) - .addSeriesOverride({ - "alias": "/.*Connected/", - "yaxis": 2 - }) - .addTarget( - prometheus.target( - expr='sum(irate(' + prefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by, request_type)', - legendFormat="{{$by}}:{{request_type}}" - ) - ) - .addTarget( - prometheus.target( - expr='sum(' + prefix + '_client_connected_native_clients{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}) by ($by)', - legendFormat= "{{$by}}:Clients Connected" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Coordinator $latency Latency / $by", - description="(no keyspace/table filters apply)", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="µs", - min=0, - legend_hideZero=true, - legend_hideEmpty=true - ) - .addTarget( - prometheus.target( - expr='histogram_quantile($latency, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by (le, request_type, $by))', - legendFormat="$by:{{$by}} {{$latency}} {{request_type}}" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Memtable Space $keyspace.$table / $by", - datasource='$PROMETHEUS_DS', - span=graphSpan, - formatY1="bytes", - formatY2="short", - labelY2="Flush", - min=0, - legend_hideZero=true, - legend_hideEmpty=true - ) - .addSeriesOverride({ - "alias": "/.*Flushes/", - "bars": true, - "lines": false, - "zindex": -3, - "yaxis": 2, - }) - .addTarget( - prometheus.target( - expr='sum(' + prefix + 
'_table_memtable_off_heap_size{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}) by ($by)', - legendFormat="{{$by}} : Off Heap" - ) - ) - .addTarget( - prometheus.target( - expr='sum(' + prefix + '_table_memtable_on_heap_size{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}) by ($by)', - legendFormat="{{$by}} : On Heap" - ) - ) - .addTarget( - prometheus.target( - expr='sum(idelta(' + prefix + '_table_memtable_switch_count_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)', - legendFormat="{{$by}} : Flushes" - ) - ) - .addTarget( - prometheus.target( - expr='sum(idelta(' + prefix + '_table_pending_flushes_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)', - legendFormat="{{$by}} : Pending Flushes" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Compactions $keyspace.$table / $by", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="bps", - formatY2="short", - labelY2="Count", - legend_hideZero=true, - legend_hideEmpty=true, - min=0 - ) - .addSeriesOverride({ - "alias": "/.*Compactions/", - "bars": true, - "lines": false, - "zindex": -3, - "yaxis": 2, - }) - .addTarget( - prometheus.target( - expr='sum(irate(' + prefix + '_table_compaction_bytes_written_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)', - legendFormat="{{by}} : Bytes Compacted" - ) - ) - .addTarget( - prometheus.target( - expr='sum(irate(' + prefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)', - legendFormat="{{by}} : Pending Compactions" - ) - ) - .addTarget( - prometheus.target( - expr='sum(irate(' + prefix + 
'_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by)', - legendFormat="{{by}} : Completed Compactions" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Table $latency Latency / $by", - description="", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="µs", - min=0, - legend_hideZero=true, - legend_hideEmpty=true - ) - .addTarget( - prometheus.target( - expr='histogram_quantile($latency, sum(irate(' + prefix + '_table_range_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))', - legendFormat="$by:{{$by}} Local Range Scan" - ) - ) - .addTarget( - prometheus.target( - expr='histogram_quantile($latency, sum(irate(' + prefix + '_table_read_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))', - legendFormat="$by:{{$by}} Local Read" - ) - ) - .addTarget( - prometheus.target( - expr='histogram_quantile($latency, sum(irate(' + prefix + '_table_write_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))', - legendFormat="$by:{{$by}} Local Write" - ) - ) - .addTarget( - prometheus.target( - expr='histogram_quantile($latency, sum(irate(' + prefix + '_table_coordinator_read_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table"}[$rate])) by (le, $by))', - legendFormat="$by:{{$by}} Coordinator Read" - ) - ) - .addTarget( - prometheus.target( - expr='histogram_quantile($latency, sum(irate(' + prefix + '_table_coordinator_scan_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))', - legendFormat="$by:{{$by}} Coordinator Range Scan" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Streaming / $by / 
$rate", - description="", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="Bps", - min=0, - legend_hideZero=true, - legend_hideEmpty=true - ) - .addTarget( - prometheus.target( - expr='sum(irate(' + prefix + '_streaming_total_incoming_bytes_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by)', - legendFormat="{{$by}}: Incoming Stream" - ) - ) - .addTarget( - prometheus.target( - expr='sum(irate(' + prefix + '_streaming_total_outgoing_bytes_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by)', - legendFormat="{{$by}}: Outgoing Stream" - ) - ) - ) -) diff --git a/dashboards/grafana/dashboards-jsonnet/overview.jsonnet b/dashboards/grafana/dashboards-jsonnet/overview.jsonnet deleted file mode 100644 index 6b00a26..0000000 --- a/dashboards/grafana/dashboards-jsonnet/overview.jsonnet +++ /dev/null @@ -1,1110 +0,0 @@ -local grafana = (import 'grafonnet/grafana.libsonnet') - + (import 'grafonnet-polystat-panel/plugin.libsonnet'); -local dashboard = grafana.dashboard; -local prometheus = grafana.prometheus; -local template = grafana.template; -local row = grafana.row; - -local graphPanel = grafana.graphPanel; -local tablePanel = grafana.tablePanel; -local singleStatPanel = grafana.singlestat; -local textPanel = grafana.text; -local polystatPanel = grafana.polystatPanel; - -local prefix = std.extVar('prefix'); - -local fillLatencySeriesOverrides = { - 'alias': 'p999', - 'fillBelowTo': 'p98', - 'lines': false -}; -local removeMinLatencySeriesOverrides = { - 'alias': 'p98', - 'lines': false -}; - -local fillMinMaxSeriesOverrides = { - 'alias': 'max', - 'fillBelowTo': 'min', - 'lines': false -}; -local removeMinlineSeriesOverrides = { - 'alias': 'min', - 'lines': false -}; - - -// used in the single stat panels where higher is better - cache hit rates for example -local reversedColors =[ - '#d44a3a', - 'rgba(237, 129, 40, 0.89)', - '#299c46', -]; - -dashboard.new( - 'Cassandra Overview', - 
schemaVersion=14, - refresh='30s', - time_from='now-30m', - editable=true, - tags=['Cassandra', 'Overview'], - style='dark' -) -.addTemplate( - grafana.template.datasource( - 'PROMETHEUS_DS', - 'prometheus', - 'Prometheus', - hide='all', - ) -) -.addTemplate( - template.new( - 'cluster', - '$PROMETHEUS_DS', - 'label_values(collectd_collectd_queue_length{}, cluster)', - label='Cluster', - refresh='time', - ) -) -.addTemplate( - template.new( - 'dc', - '$PROMETHEUS_DS', - 'label_values(collectd_collectd_queue_length{cluster=~"$cluster"}, dc)', - label='DataCenter', - refresh='time', - includeAll=true, - allValues=".*", - ) -) -.addTemplate( - template.new( - 'rack', - '$PROMETHEUS_DS', - 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc"}, rack)', - label='Rack', - refresh='time', - includeAll=true, - allValues=".*", - ) -) -.addTemplate( - template.new( - 'node', - '$PROMETHEUS_DS', - 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc", rack=~"$rack"}, instance)', - label='Node', - refresh='time', - includeAll=true, - allValues=".*", - ) -) -.addRow( - row.new(title='', height='50px') - .addPanel(textPanel.new(transparent=true)) - .addPanel( - textPanel.new( - transparent=true, - mode="html", - content='', - ) - ) - .addPanel(textPanel.new(transparent=true)) -) -.addRow( - row.new(title='Request Throughputs (Coordinator Perspective)') - .addPanel( - graphPanel.new( - 'Request Throughputs', - description='Total Requests Per Cluster, by Request Type', - format='rps', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - min=0, - ) - .addTarget( - prometheus.target( - expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - 
legendFormat='{{request_type}}', - ) - ) - ) - .addPanel( - graphPanel.new( - 'Error throughputs', - description='Total Timeouts, Failures, Unavailable Rates for each cluster', - format='rps', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - min=0, - ) - .addTarget( - prometheus.target( - expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_failures_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='{{request_type}} failures', - ) - ) - .addTarget( - prometheus.target( - expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_timeouts_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='{{request_type}} timeouts', - ) - ) - .addTarget( - prometheus.target( - expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_unavailables_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='{{request_type}} unavailable errors', - ) - ) - .addTarget( - prometheus.target( - expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_unfinished_commit_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='{{request_type}} unfinished commit errors', - ) - ) - .addTarget( - prometheus.target( - expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_condition_not_met_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='{{request_type}} condition not met errors', - ) - ) - .addTarget( - prometheus.target( - expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_contention_histogram_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - 
legendFormat='{{request_type}} contention histogram errors', - ) - ) - ) - .addPanel( - singleStatPanel.new( - 'Read / Write Distribution', - description='Part of reads in the total of standard requests (Reads+Writes). CAS, Views, ... operations are ignored.', - format='percentunit', - datasource='$PROMETHEUS_DS', - transparent=true, - postfix=' Reads', - postfixFontSize='30%', - valueFontSize='30%', - valueName="current", - decimals=2, - thresholds='0.25,0.5,0.75', - timeFrom='', - colors=[ - "#DEB6F2", - "#CA95E5", - "#8F3BB8" - ], - gaugeShow=true, - gaugeMinValue=0, - gaugeMaxValue=1, - gaugeThresholdLabels=true, - gaugeThresholdMarkers=false, - sparklineFillColor='rgba(31, 118, 189, 0.18)', - sparklineFull=false, - sparklineLineColor='#FFB357', - sparklineShow=false - ) - .addTarget( - prometheus.target( - expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s])) / ignoring (request_type) (sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s])) + ignoring (request_type) sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[1m:30s])))', - ) - ) - ) - .addPanel( - graphPanel.new( - 'Read Latency (98 - 999th percentile)', - description='Read latency for coordinated reads', - format='µs', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - min=0, - ) - .addTarget( - prometheus.target( - expr='histogram_quantile(0.98, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", 
request_type="read"}[5m])) by (le, cluster))', - legendFormat='p98', - ) - ) - .addTarget( - prometheus.target( - expr='histogram_quantile(0.99, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[5m])) by (le, cluster))', - legendFormat='p99', - ) - ) - .addTarget( - prometheus.target( - expr='histogram_quantile(0.999, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[5m])) by (le, cluster))', - legendFormat='p999', - ) - ) - .addSeriesOverride(fillLatencySeriesOverrides) - .addSeriesOverride(removeMinLatencySeriesOverrides) - ) - .addPanel( - graphPanel.new( - 'Write Latency (98th - p999 Percentile)', - description='Write latency for coordinated writes', - format='µs', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - min=0, - ) - .addTarget( - prometheus.target( - expr='histogram_quantile(0.98, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))', - legendFormat='p98', - ) - ) - .addTarget( - prometheus.target( - expr='histogram_quantile(0.99, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))', - legendFormat='p99', - ) - ) - .addTarget( - prometheus.target( - expr='histogram_quantile(0.999, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))', - legendFormat='p999', - ) - ) - .addSeriesOverride(fillLatencySeriesOverrides) - 
.addSeriesOverride(removeMinLatencySeriesOverrides) - ) - .addPanel( - graphPanel.new( - 'Other Latencies', - description='Other p99 latencies for coordinated requests', - format='µs', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - min=0, - ) - .addTarget( - prometheus.target( - # In scope!~"Write|Read|.*-.*", we want to exclude charts above and all the per-consistency_level info like "Read-LOCAL_ONE" - expr='histogram_quantile(0.99, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type!~"write|read|.*-.*"}[1m:30s])) by (le, request_type, cluster))', - legendFormat='p99 {{request_type}}' - ) - ) - ) -) -.addRow( - row.new(title='Nodes Status',) - .addPanel( - polystatPanel.new( - 'Nodes Status', - description='Nodes Status uses Internal/Gossip activity. 
Be mindful that if Native or Thrift protocol are disabled, the nodes won\'t be reachable, and still marked up', - datasource='$PROMETHEUS_DS', - transparent=true, - span=12, - global_unit_format='none', - global_operator_name='current', - global_thresholds=[ - { - "value": 0, - "state": 2, - "color": "#d44a3a" - }, - { - "value": 1, - "state": 0, - "color": "#299c46" - } - ], - range_maps=[ - { - "from": "0", - "to": "0.9999", - "text": "DOWN" - }, - { - "from": "1", - "to": "1", - "text": "UP" - } - ], - mapping_type=2, - value_enabled=true, - ) - .addTarget( - prometheus.target( - 'max by (cluster, dc, rack, instance) (changes(' + prefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="gossip_stage"}[2m:30s])) > bool 0', - legendFormat='{{instance}}', - instant=true, - ) - ) - ) - .addPanel( - singleStatPanel.new( - 'Nodes Count', - description='Nodes up and down in the cluster', - format='short', - datasource='$PROMETHEUS_DS', - transparent=true, - decimals=0, - prefix='Total:', - postfix=' Nodes', - postfixFontSize='80%', - valueFontSize='80%', - span=4 - ) - .addTarget( - prometheus.target( - expr='count by (cluster) (max by (cluster, dc, rack, instance) (collectd_collectd_queue_length{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}))', - legendFormat='Total Number Of Nodes', - ) - ) - ) - .addPanel( - graphPanel.new( - 'Nodes Status History', - description='Nodes up and down in the cluster per protocol/activity', - format='short', - datasource='$PROMETHEUS_DS', - transparent=true, - decimals=0, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=false, - shared_tooltip=false, - min=0, - span=8 - ) - .addTarget( - prometheus.target( - expr='count by (cluster) (max by (cluster, dc, rack, instance) (collectd_collectd_queue_length{cluster="$cluster", dc=~"$dc", rack=~"$rack", 
instance=~"$node"}))', - legendFormat='Total Number Of Nodes', - ) - ) - .addTarget( - prometheus.target( - expr='sum by (cluster) (max by (cluster, datacenter, rack, instance) (changes(' + prefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="native"}[2m:30s])) > bool 0)', - legendFormat='Nodes Coordinating Requests (Native protocol)', - ) - ) - .addTarget( - prometheus.target( - expr='sum by (cluster) (max by (cluster, datacenter, rack, instance) (changes(' + prefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="gossip_stage"}[2m:30s])) > bool 0)', - legendFormat='Nodes With Internal Activity (Gossip protocol)', - ) - ) - ) -) -.addRow( - row.new(title='Data Status') - .addPanel( - tablePanel.new( - 'Disk Space Usage', - description='Disk space used ordered (fullest disks first)', - datasource='$PROMETHEUS_DS', - transform='timeseries_aggregations', - transparent=true, - styles=[ - { - "alias": "Node --> Mounting Point", - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "mappingType": 1, - "pattern": "Metric", - "preserveFormat": true, - "sanitize": true, - "thresholds": [], - "type": "string", - "unit": "short" - }, - { - "alias": "% Disk Space Used", - "colorMode": "row", - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "link": false, - "mappingType": 1, - "pattern": "Current", - "thresholds": [ - "0.5", - "0.75", - ], - "type": "number", - "unit": "percentunit" - } - ], - columns=[ - { - "text": "Current", - "value": "current" - } - ], - sort={ - "col": 1, - "desc": true - } - ) - .addTarget( - prometheus.target( - expr='min by (instance, df) (1-(collectd_df_df_complex{df!~".*lxcfs.*", 
cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="free"} - / ignoring (type) (collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="used"} - + ignoring (type) collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="reserved"} - + ignoring (type) collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="free"})) - )', - legendFormat='{{cluster}}-{{instance}} --> {{df}}', - instant=true - ) - ) - ) - .addPanel( - graphPanel.new( - 'Cassandra cluster Data Size', - description='Total sizes of the data on distinct nodes', - format='bytes', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - ) - .addTarget( - prometheus.target( - expr='sum by (cluster) (' + prefix + '_table_live_disk_space_used_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='Live space - {{cluster}}', - ) - ) - .addTarget( - prometheus.target( - expr='sum by (cluster) (' + prefix + '_table_total_disk_space_used_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='Total space - {{cluster}}', - ) - ) - ) - .addPanel( - graphPanel.new( - 'SSTable Count', - description='SSTable Count Max and Average per table', - format='short', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - min=0, - ) - .addTarget( - prometheus.target( - expr='max by (cluster, keyspace, table) (' + prefix + '_table_live_ss_table_count{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='Table - 
{{keyspace}}.{{table}}', - ) - ) - .addTarget( - prometheus.target( - expr='max by (cluster) (' + prefix + '_table_live_ss_table_count{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='Max in cluster - {{cluster}}', - ) - ) - ) - .addPanel( - graphPanel.new( - 'Pending Compactions', - description='Maximum pending compactions on any node in the cluster', - format='short', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - min=0, - bars=false, - lines=true, - stack=false, - decimals=0, - ) - .addTarget( - prometheus.target( - expr='max by (cluster) (' + prefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='max', - ) - ) - .addTarget( - prometheus.target( - expr='min by (cluster) (' + prefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='min', - ) - ) - .addTarget( - prometheus.target( - expr='avg by (cluster) (' + prefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='avg', - ) - ) - .addSeriesOverride(fillMinMaxSeriesOverrides) - .addSeriesOverride(removeMinlineSeriesOverrides) - ) - .addPanel( - graphPanel.new( - 'Pending Compactions per Table', - description='Maximum pending compactions per table', - format='short', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - min=0, - bars=false, - lines=true, - stack=true, - decimals=0, - ) - .addTarget( - prometheus.target( - expr='max by (cluster, keyspace, table) (' + prefix + '_table_pending_compactions{cluster="$cluster", dc=~"$dc", 
rack=~"$rack", instance=~"$node"})', - legendFormat='max for {{keyspace}}.{{table}}', - ) - ) - ) -) -.addRow( - row.new(title='Cassandra Internals') - .addPanel( - graphPanel.new( - 'Pending Tasks', - description='Cluster wide pending threads, by thread pool name', - format='short', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - min=0, - ) - .addTarget( - prometheus.target( - expr='sum by (cluster, pool_name) (' + prefix + '_thread_pools_pending_tasks{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='{{cluster}} - pending {{pool_name}}', - ) - ) - ) - .addPanel( - graphPanel.new( - 'Blocked Tasks', - description='Cluster wide blocked threads, by thread pool name', - format='short', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - min=0, - ) - .addTarget( - prometheus.target( - expr='sum by (cluster, pool_name) (rate(' + prefix + '_thread_pools_total_blocked_tasks_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='{{cluster}} - blocked {{pool_name}}', - ) - ) - ) - .addPanel( - graphPanel.new( - 'Dropped Messages', - description='Dropped messages rate summed by message type and cluster', - format='short', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - min=0, - ) - .addTarget( - prometheus.target( - expr='sum by (cluster, message_type) (rate(' + prefix + '_dropped_message_dropped_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", 
instance=~"$node"}[1m:30s]))', - legendFormat='{{cluster}} - dropped {{message_type}}', - ) - ) - ) - .addPanel( - graphPanel.new( - 'Active Tasks', - description='active threads summed per cluster', - format='short', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - min=0, - ) - .addTarget( - prometheus.target( - expr='sum by (cluster, pool_name) (' + prefix + '_thread_pools_active_tasks{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='{{cluster}} - active {{pool_name}}', - ) - ) - ) - .addPanel( - graphPanel.new( - 'Hinted Handoff', - description='Sum of hints being handed off per cluster.', - format='short', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - min=0, - ) - .addTarget( - prometheus.target( - expr='sum by (cluster) (' + prefix + '_storage_total_hints_in_progress_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='count', - ) - ) - ) -) -.addRow( - row.new(title='Hardware / Operating System') - .addPanel( - graphPanel.new( - 'CPU Utilization', - description='Maximum CPU utilisation (max 100%)', - format='percentunit', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - percentage=true, - decimals=1, - min=0, - max=1, - ) - .addTarget( - prometheus.target( - expr='max by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{type="idle", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / sum by (cluster, dc, 
rack, instance) (rate(collectd_cpu_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))))', - legendFormat='max', - ) - ) - .addTarget( - prometheus.target( - expr='min by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{type="idle", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))))', - legendFormat='min', - ) - ) - .addTarget( - prometheus.target( - expr='avg by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{type="idle", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))))', - legendFormat='avg', - ) - ) - .addSeriesOverride(fillMinMaxSeriesOverrides) - .addSeriesOverride(removeMinlineSeriesOverrides) - ) - .addPanel( - graphPanel.new( - 'Unix Load (1m rate)', - description='Max Unix load on a node for a cluster', - format='short', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - ) - .addTarget( - prometheus.target( - expr='max by (cluster) (collectd_load_shortterm{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='max', - ) - ) - .addTarget( - prometheus.target( - 'min by (cluster) (collectd_load_shortterm{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='min', - ) - ) - .addTarget( - prometheus.target( - 'avg by (cluster) (collectd_load_shortterm{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='avg', - ) - ) - .addSeriesOverride(fillMinMaxSeriesOverrides) - 
.addSeriesOverride(removeMinlineSeriesOverrides) - ) - .addPanel( - graphPanel.new( - 'Memory Utilisation', - description='Maximum Memory allocated per usage (worst node) - excludes caches, buffers, etc', - format='bytes', - datasource='$PROMETHEUS_DS', - transparent=true, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - fill=1, - linewidth=2, - ) - .addTarget( - prometheus.target( - expr='min by (cluster) (sum by (cluster, dc, rack, instance) (collectd_memory{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}))', - legendFormat='min memory available', - ) - ) - .addTarget( - prometheus.target( - expr='max by (cluster, memory) (sum by (cluster, dc, rack, instance, memory) (collectd_memory{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}))', - legendFormat='max memory {{memory}}', - ) - ) - ) - .addPanel( - graphPanel.new( - 'Disk Read Thoughput', - description='Disk read throughput', - format='bps', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - ) - .addTarget( - prometheus.target( - expr='max by (cluster) (rate(collectd_processes_disk_octets_read_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='max', - ) - ) - .addTarget( - prometheus.target( - 'min by (cluster) (rate(collectd_processes_disk_octets_read_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='min', - ) - ) - .addTarget( - prometheus.target( - 'avg by (cluster) (rate(collectd_processes_disk_octets_read_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='avg', - ) - ) - .addSeriesOverride(fillMinMaxSeriesOverrides) - 
.addSeriesOverride(removeMinlineSeriesOverrides) - ) - .addPanel( - graphPanel.new( - 'Disk Write Thoughput', - description='Disk write throughput', - format='bps', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - ) - .addTarget( - prometheus.target( - expr='max by (cluster) (rate(collectd_processes_disk_octets_write_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='max', - ) - ) - .addTarget( - prometheus.target( - 'min by (cluster) (rate(collectd_processes_disk_octets_write_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='min', - ) - ) - .addTarget( - prometheus.target( - 'avg by (cluster) (rate(collectd_processes_disk_octets_write_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='avg', - ) - ) - .addSeriesOverride(fillMinMaxSeriesOverrides) - .addSeriesOverride(removeMinlineSeriesOverrides) - ) - .addPanel( - graphPanel.new( - 'Network I/O', - description='Network In and Out per cluster', - format='bytes', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=1, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - bars=false, - ) - .addTarget( - prometheus.target( - 'sum by (cluster) (rate(collectd_interface_if_octets_rx_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='outgoing', - ) - ) - .addTarget( - prometheus.target( - 'sum by (cluster) (rate(collectd_interface_if_octets_rx_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='incoming', - ) - ) - .addSeriesOverride({ - "alias": "incoming", - "transform": "negative-Y" - }) - 
- ) -) -.addRow( - row.new(title='JVM / Garbage Collection') - .addPanel( - graphPanel.new( - 'Application Throughput (% time NOT doing GC)', - description='Percentage of the time the node is *not* doing a GC, thus Cassandra is not stopped for GC', - format='percentunit', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - decimals=2, - min=0, - max=1, - ) - .addTarget( - prometheus.target( - 'max by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))', - legendFormat='max', - ) - ) - .addTarget( - prometheus.target( - 'min by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))', - legendFormat='min', - ) - ) - .addTarget( - prometheus.target( - 'avg by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))', - legendFormat='avg', - ) - ) - .addSeriesOverride(fillMinMaxSeriesOverrides) - .addSeriesOverride(removeMinlineSeriesOverrides) - ) - .addPanel( - graphPanel.new( - 'Garbage Collection Time', - description='Garbage collection duration', - format='ms', - datasource='$PROMETHEUS_DS', - transparent=true, - fill=0, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - ) - .addTarget( - prometheus.target( - 'max by (cluster) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='max', - ) - ) - .addTarget( - prometheus.target( - 'min by (cluster) (rate(' + prefix + 
'_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='min', - ) - ) - .addTarget( - prometheus.target( - 'avg by (cluster) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', - legendFormat='avg', - ) - ) - .addSeriesOverride(fillMinMaxSeriesOverrides) - .addSeriesOverride(removeMinlineSeriesOverrides) - ) - .addPanel( - graphPanel.new( - 'JVM Heap Memory Utilisation', - description='Maximum JVM Heap Memory size (worst node) and minimum available heap size', - format='bytes', - datasource='$PROMETHEUS_DS', - transparent=true, - legend_show=true, - legend_values=true, - legend_current=true, - legend_alignAsTable=true, - legend_sort='current', - legend_sortDesc=true, - shared_tooltip=false, - fill=1, - linewidth=2, - ) - .addTarget( - prometheus.target( - 'max by (cluster) - (' + prefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='max', - ) - ) - .addTarget( - prometheus.target( - 'min by (cluster) - (' + prefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='min', - ) - ) - .addTarget( - prometheus.target( - 'avg by (cluster) - (' + prefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='avg', - ) - ) - .addTarget( - prometheus.target( - 'min by ( cluster) - (' + prefix + '_jvm_memory_max{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', - legendFormat='Heap memory available', - ) - ) - .addSeriesOverride(fillMinMaxSeriesOverrides) - .addSeriesOverride(removeMinlineSeriesOverrides) - ) -) diff --git a/dashboards/grafana/dashboards-jsonnet/system-metrics.jsonnet b/dashboards/grafana/dashboards-jsonnet/system-metrics.jsonnet deleted file mode 100644 index e9482ae..0000000 --- 
a/dashboards/grafana/dashboards-jsonnet/system-metrics.jsonnet +++ /dev/null @@ -1,902 +0,0 @@ -local grafana = import 'grafonnet/grafana.libsonnet'; - -local dashboard = grafana.dashboard; -local row = grafana.row; -local singlestat = grafana.singlestat; -local graphpanel = grafana.graphPanel; -local text = grafana.text; -local prometheus = grafana.prometheus; -local template = grafana.template; - -local prefix = std.extVar('prefix'); - -local textstatHeight = 100; -local graphHeight = 250; -local singlestatHeight = 125; -local singlestatSpan = 2; -local graphSpan = 6; - -dashboard.new( - 'System & Node Metrics', - description='Operating System Metrics and Apache Cassandra Node Information', - schemaVersion=14, - time_from='now-30m', - refresh='1m', - tags=['os'], - style='dark' -) -.addTemplate( - template.datasource( - 'PROMETHEUS_DS', - 'prometheus', - 'Prometheus', - hide='all', - ) -) -.addTemplate( - template.interval( - 'rate', - '1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d', - '5m', - label='Rate', - ) -) -.addTemplate( - template.new( - 'host', - '$PROMETHEUS_DS', - 'label_values(collectd_collectd_queue_length{}, instance)', - label='Host', - refresh='time', - ) -) -.addPanel( - text.new( - transparent=true, - mode="html", - content='' - ), - { - "h": 3, - "w": 5, - "x": 9, - "y": -1 - }, -) -.addRow( - row.new( - title='Basic CPU / Mem / Disk Gauge', - height=singlestatHeight, - ) - .addPanel( - singlestat.new( - 'CPU Busy', - description="Busy state of all CPU cores together", - format='percent', - datasource='$PROMETHEUS_DS', - thresholds='85,95', - sparklineShow=true, - gaugeShow=true, - span=singlestatSpan - ) - .addTarget( - prometheus.target( - "(1 - ((sum(irate(collectd_cpu_total{instance='$host', type='idle'}[$rate])) by (instance) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) by (instance)))) * 100", - ) - ) - ) - .addPanel( - singlestat.new( - 'Memory Used', - description="Percentage of memory used (ignoring page cache)", - 
format='percent', - datasource='$PROMETHEUS_DS', - thresholds='85,95', - sparklineShow=true, - gaugeShow=true, - span=singlestatSpan - ) - .addTarget( - prometheus.target( - '100 * ((sum(collectd_memory{instance="$host", memory="free"}) + sum(collectd_memory{instance="$host", memory="cached"}) + sum(collectd_memory{instance="$host", memory="buffered"})) / sum(collectd_memory{instance="$host"}))', - ) - ) - ) - .addPanel( - singlestat.new( - 'Swap Used', - description="Percentage of swap in use", - format='percent', - datasource='$PROMETHEUS_DS', - thresholds='0,1', - sparklineShow=true, - gaugeShow=true, - span=singlestatSpan - ) - .addTarget( - prometheus.target( - "(sum(collectd_swap{instance='$host',swap='used'}) / sum(collectd_swap{instance='$host'})) * 100" - ) - ) - ) - .addPanel( - singlestat.new( - 'Disk Used', - description="Percentage of root disk in use", - format='percent', - datasource='$PROMETHEUS_DS', - thresholds='50,85', - sparklineShow=true, - gaugeShow=true, - span=singlestatSpan - ) - .addTarget( - prometheus.target( - '(sum(collectd_df_df_complex{instance="$host", df="root", type="used"}) / sum(collectd_df_df_complex{instance="$host", df="root"})) * 100' - ) - ) - ) - .addPanel( - singlestat.new( - 'CPU System Load (1m avg)', - description="Busy state of all CPU cores together (1 min average)", - format='percent', - datasource='$PROMETHEUS_DS', - thresholds='85,95', - sparklineShow=true, - gaugeShow=true, - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'avg(collectd_load_shortterm{instance="$host"}) / count(count(collectd_cpu_total{instance="$host"}) by (cpu)) * 100', - ) - ) - ) - .addPanel( - singlestat.new( - 'CPU System Load (5m avg)', - description="Busy state of all CPU cores together (5 min average)", - format='percent', - datasource='$PROMETHEUS_DS', - thresholds='85,95', - sparklineShow=true, - gaugeShow=true, - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'avg(collectd_load_midterm{instance="$host"}) / 
count(count(collectd_cpu_total{instance="$host"}) by (cpu)) * 100', - ) - ) - ) -) -.addRow( - row.new( - title='Basic CPU / Mem / Disk Info', - height=textstatHeight - ) - .addPanel( - singlestat.new( - 'CPU Cores', - description="Total number of CPU cores", - format="short", - datasource='$PROMETHEUS_DS', - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'count(count(collectd_cpu_total{instance="$host"}) by (cpu))' - ) - ) - ) - .addPanel( - singlestat.new( - 'Total RAM', - description="Total amount of system memory", - format="bytes", - datasource='$PROMETHEUS_DS', - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'sum(collectd_memory{instance="$host"})' - ) - ) - ) - .addPanel( - singlestat.new( - 'Total Swap', - description="Total amount of swap space", - format="bytes", - datasource='$PROMETHEUS_DS', - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'sum(collectd_swap{instance="$host"})' - ) - ) - ) - .addPanel( - singlestat.new( - 'Total RootFS', - description="Total amount of disk space", - format='bytes', - datasource='$PROMETHEUS_DS', - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'sum(collectd_df_df_complex{df="root",instance="$host"})' - ) - ) - ) - .addPanel( - singlestat.new( - 'System Load (1m avg)', - description="System Load (1m avg)", - format="short", - datasource='$PROMETHEUS_DS', - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'collectd_load_shortterm{instance="$host"}' - ) - ) - ) - .addPanel( - singlestat.new( - 'System Uptime', - description="Uptime of the host", - format="s", - decimals=1, - datasource='$PROMETHEUS_DS', - span=singlestatSpan - ) - .addTarget( - prometheus.target( - 'collectd_uptime{instance="$host"}' - ) - ) - ) -) -.addRow( - row.new( - title='Basic CPU / Mem Graph', - height=graphHeight - ) - .addPanel( - graphpanel.new( - title="CPU Basic", - datasource='$PROMETHEUS_DS', - span=graphSpan, - percentage=true, - stack=true, - min=0, - max=100 - ) - 
.addTarget( - prometheus.target( - expr="sum(irate(collectd_cpu_total{instance='$host',type='system'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100", - legendFormat="Busy System" - ) - ) - .addTarget( - prometheus.target( - expr="sum(irate(collectd_cpu_total{instance='$host',type='user'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100", - legendFormat="Busy User" - ) - ) - .addTarget( - prometheus.target( - expr="sum(irate(collectd_cpu_total{instance='$host',type='wait'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100", - legendFormat="Busy IOWait" - ) - ) - .addTarget( - prometheus.target( - expr="sum(irate(collectd_cpu_total{instance='$host',type='softirq'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100", - legendFormat="Busy IRQ" - ) - ) - .addTarget( - prometheus.target( - expr="sum(irate(collectd_cpu_total{instance='$host',type!='idle',type!='system',type!='user',type!='wait',type!='softirq'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100", - legendFormat="Busy Other" - ) - ) - .addTarget( - prometheus.target( - expr="sum(irate(collectd_cpu_total{instance='$host',type='idle'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100", - legendFormat="Idle" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Basic memory usage", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="bytes", - min=0 - ) - .addTarget( - prometheus.target( - expr='sum(collectd_memory{instance="$host"})', - legendFormat="RAM Total" - ) - ) - .addTarget( - prometheus.target( - expr='sum(collectd_memory{instance="$host"}) - sum(collectd_memory{instance="$host", memory="free"}) - sum(collectd_memory{instance="$host", memory="cached"}) - sum(collectd_memory{instance="$host", memory="buffered"})', - legendFormat="RAM Used" - ) - ) - .addTarget( - prometheus.target( - expr='sum(collectd_memory{instance="$host", memory="cached"}) + 
sum(collectd_memory{instance="$host", memory="buffered"})', - legendFormat="RAM Cache + Buffer" - ) - ) - .addTarget( - prometheus.target( - expr='sum(collectd_memory{instance="$host", memory="free"})', - legendFormat="RAM Free" - ) - ) - .addTarget( - prometheus.target( - expr='sum(collectd_swap{instance="$host"}) - sum(collectd_swap{instance="$host", swap="free"})', - legendFormat="SWAP Used" - ) - ) - ) -) -.addRow( - row.new( - title='Basic Network / Disk Graph', - height=graphHeight - ) - .addPanel( - graphpanel.new( - title="Network Traffic / Second", - description="Basic network info per interface", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="bps", - labelY1="Receive (-) / Send (+)", - - ) - .addSeriesOverride({ - "alias": "/.*receive.*/", - "transform": "negative-Y" - }) - .addTarget( - prometheus.target( - expr="irate(collectd_interface_if_octets_rx_total{instance='$host'}[$rate]) * 8", - legendFormat="{{interface}} receive" - ) - ) - .addTarget( - prometheus.target( - expr="irate(collectd_interface_if_octets_tx_total{instance='$host'}[$rate]) * 8", - legendFormat="{{interface}} send" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Network Packets / Second", - description="Basic network info per interface", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="pps", - labelY1="Receive (-) / Send (+)", - - ) - .addSeriesOverride({ - "alias": "/.*receive.*/", - "transform": "negative-Y" - }) - .addTarget( - prometheus.target( - expr="irate(collectd_interface_if_packets_rx_total{instance='$host'}[$rate]) * 8", - legendFormat="{{interface}} receive" - ) - ) - .addTarget( - prometheus.target( - expr="irate(collectd_interface_if_packets_tx_total{instance='$host'}[$rate]) * 8", - legendFormat="{{interface}} send" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Disk Activity / Second", - description="Disk Activity / Second", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="Bps", - labelY1="Read (-) / Write (+)", - 
legend_hideZero=true, - legend_hideEmpty=true - ) - .addSeriesOverride({ - "alias": "/.*Read.*/", - "transform": "negative-Y" - }) - .addTarget( - prometheus.target( - expr='irate(collectd_disk_disk_octets_read_total{instance="$host", disk=~".*\\\\d+"}[$rate])', - legendFormat="{{disk}} - Read" - ) - ) - .addTarget( - prometheus.target( - expr='irate(collectd_disk_disk_octets_write_total{instance="$host", disk=~".*\\\\d+"}[$rate])', - legendFormat="{{disk}} - Write" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Disk IOPS", - description="Disk iops per disk", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="iops", - labelY1="Read (-) / Write (+)", - legend_hideZero=true, - legend_hideEmpty=true - ) - .addSeriesOverride({ - "alias": "/.*Read.*/", - "transform": "negative-Y" - }) - .addTarget( - prometheus.target( - expr='irate(collectd_disk_disk_ops_read_total{instance="$host", disk=~".*\\\\d+"}[$rate])', - legendFormat="{{disk}} - Read" - ) - ) - .addTarget( - prometheus.target( - expr='irate(collectd_disk_disk_ops_write_total{instance="$host", disk=~".*\\\\d+"}[$rate])', - legendFormat="{{disk}} - Write" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Disk Used", - description="Disk space used", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="decbytes", - legend_hideZero=true, - legend_hideEmpty=true - ) - .addTarget( - prometheus.target( - expr='sum(collectd_df_df_complex{instance="$host", type="used"}) by (df)', - legendFormat="{{df}}" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Disk Queue Length", - description="The amount of requests pending in the disk queue", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="short", - legend_hideZero=true, - legend_hideEmpty=true - ) - .addTarget( - prometheus.target( - expr='irate(collectd_disk_disk_io_time_weighted_io_time_total{instance="$host",disk=~".*[0-9]+"}[$rate]) / 1000', - legendFormat="{{disk}}" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Disk Latency", - 
description="Disk access times", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="ms", - labelY1="Read (-) / Write (+)", - legend_hideZero=true, - legend_hideEmpty=true - ) - .addSeriesOverride({ - "alias": "/.*Read.*/", - "transform": "negative-Y" - }) - .addTarget( - prometheus.target( - expr='irate(collectd_disk_disk_time_read_total{instance="$host",disk=~".*[0-9]+"}[$rate])', - legendFormat="{{disk}} - Read" - ) - ) - .addTarget( - prometheus.target( - expr='irate(collectd_disk_disk_time_write_total{instance="$host",disk=~".*[0-9]+"}[$rate])', - legendFormat="{{disk}} - Write" - ) - ) - ) -) -.addRow( - row.new( - title='CPU Details', - height=graphHeight, - collapse=true - ) - .addPanel( - graphpanel.new( - title="CPU User", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="percent", - min=0, - max=100 - ) - .addTarget( - prometheus.target( - expr='sum(rate(collectd_cpu_total{instance="$host", type="user"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100', - legendFormat="{{cpu}}" - ) - ) - ) - .addPanel( - graphpanel.new( - title="CPU System", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="percent", - min=0, - max=100 - ) - .addTarget( - prometheus.target( - expr='sum(rate(collectd_cpu_total{instance="$host", type="system"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100', - legendFormat="{{cpu}}" - ) - ) - ) - .addPanel( - graphpanel.new( - title="CPU IOWait", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="percent", - min=0, - max=100 - ) - .addTarget( - prometheus.target( - expr='sum(rate(collectd_cpu_total{instance="$host", type="wait"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100', - legendFormat="{{cpu}}" - ) - ) - ) - .addPanel( - graphpanel.new( - title="CPU SoftIRQ", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="percent", - min=0, - max=100 - ) - .addTarget( - 
prometheus.target( - expr='sum(rate(collectd_cpu_total{instance="$host", type="softirq"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100', - legendFormat="{{cpu}}" - ) - ) - ) - .addPanel( - graphpanel.new( - title="CPU Other", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="percent", - min=0, - max=100 - ) - .addTarget( - prometheus.target( - expr='sum(rate(collectd_cpu_total{instance="$host", type=~"(interrupt|nice|steal)"}[$rate])) by (cpu, type) / ignoring(type) group_left sum(rate(collectd_cpu_total{instance="$host" }[$rate])) by (cpu) * 100', - legendFormat="{{cpu}} - {{type}}" - ) - ) - ) -) -.addRow( - row.new( - title='Advanced Details', - height=graphHeight, - collapse=true - ) - .addPanel( - graphpanel.new( - title="Context Switches / Second", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="short" - ) - .addTarget( - prometheus.target( - expr='irate(collectd_contextswitch_total{instance="$host"}[$rate])', - legendFormat="Context Switches" - ) - ) - ) - .addPanel( - graphpanel.new( - title="IRQ Activity", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="short", - legend_hideEmpty=true, - legend_hideZero=true - ) - .addTarget( - prometheus.target( - expr='rate(collectd_irq_total{instance="$host", irq != "LOC"}[$rate])', - legendFormat="{{irq}}" - ) - ) - ) - .addPanel( - graphpanel.new( - title="NUMA Activity", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="short", - legend_hideEmpty=true, - legend_hideZero=true - ) - .addTarget( - prometheus.target( - expr='irate(collectd_numa_vmpage_action_total{instance="$host"}[$rate])', - legendFormat="{{numa}} - {{type}}" - ) - ) - ) - .addPanel( - graphpanel.new( - title="TCP Connection Activity", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="short", - legend_hideEmpty=true, - legend_hideZero=true - ) - .addTarget( - prometheus.target( - expr='collectd_tcpconns_tcp_connections{instance="$host"}', - 
legendFormat="{{tcpconns}} - {{type}}" - ) - ) - ) - .addPanel( - graphpanel.new( - title="TCP Connection Activity", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="short", - legend_hideEmpty=true, - legend_hideZero=true - ) - .addTarget( - prometheus.target( - expr='rate(collectd_protocols_protocol_counter_total{instance="$host"}[$rate])', - legendFormat="{{protocols}} - {{type}}" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Processor Speeds", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="hertz", - points=true, - lines=false, - pointradius=5, - legend_hideEmpty=true, - legend_hideZero=true - ) - .addTarget( - prometheus.target( - expr='collectd_cpufreq{instance="$host"}', - legendFormat="{{cpufreq}}" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Page Cache Activity", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="short", - legend_hideEmpty=true, - legend_hideZero=true - ) - .addTarget( - prometheus.target( - expr='rate(collectd_vmem_vmpage_faults_majflt_total{instance="$host"}[$rate])', - legendFormat="Major fault" - ) - ) - .addTarget( - prometheus.target( - expr='rate(collectd_vmem_vmpage_faults_minflt_total{instance="$host"}[$rate])', - legendFormat="Minor fault" - ) - ) - .addTarget( - prometheus.target( - expr='rate(collectd_vmem_vmpage_action_total{instance="$host"}[$rate])', - legendFormat="Action - {{vmem}}" - ) - ) - .addTarget( - prometheus.target( - expr='rate(collectd_vmem_vmpage_io_in_total{instance="$host"}[$rate])', - legendFormat="IO read page" - ) - ) - .addTarget( - prometheus.target( - expr='rate(collectd_vmem_vmpage_io_out_total{instance="$host"}[$rate])', - legendFormat="IO write page" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Page Cache Layout", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="short", - percentage=true, - stack=true, - min=0, - max=100, - legend_hideEmpty=true, - legend_hideZero=true - ) - .addTarget( - prometheus.target( - 
expr='collectd_vmem_vmpage_number{instance="$host"}', - legendFormat="{{vmem}}" - ) - ) - ) - .addPanel( - graphpanel.new( - title="Process Activity", - datasource='$PROMETHEUS_DS', - span=graphSpan, - format="short", - legend_hideEmpty=true, - legend_hideZero=true - ) - .addTarget( - prometheus.target( - expr='collectd_processes_ps_count_threads{instance="$host"}', - legendFormat="Thread Count" - ) - ) - .addTarget( - prometheus.target( - expr='collectd_processes_ps_count_processes{instance="$host"}', - legendFormat="Process Count" - ) - ) - .addTarget( - prometheus.target( - expr='collectd_processes_ps_state{instance="$host"}', - legendFormat="Process State - {{processes}}" - ) - ) - ) -) -.addRow( - row.new( - title='Basic Cassandra Overview', - height=singlestatHeight - ) - .addPanel( - singlestat.new( - 'SSTable Count', - description="Number of sstables on the node", - format='short', - datasource='$PROMETHEUS_DS', - sparklineShow=true, - gaugeShow=true, - span=singlestatSpan, - thresholds="100000,500000" - ) - .addTarget( - prometheus.target( - "sum(" + prefix + "_table_live_ss_table_count{instance='$host'})" - ) - ) - ) - .addPanel( - singlestat.new( - 'Pending Compactions', - description="Number of pending compactions on the node", - format='short', - datasource='$PROMETHEUS_DS', - sparklineShow=true, - gaugeShow=true, - span=singlestatSpan, - thresholds="10,50" - ) - .addTarget( - prometheus.target( - "sum(" + prefix + "_compaction_pending_tasks{instance='$host'})" - ) - ) - ) - .addPanel( - singlestat.new( - 'Connected Clients', - description="Number of client connections to the node", - format='percent', - datasource='$PROMETHEUS_DS', - sparklineShow=true, - gaugeShow=true, - span=singlestatSpan, - thresholds="100,1000" - ) - .addTarget( - prometheus.target( - "sum(" + prefix + "_client_connected_native_clients{instance='$host'})" - ) - ) - ) - .addPanel( - graphpanel.new( - title="GC Activity", - datasource='$PROMETHEUS_DS', - span=graphSpan, - 
format="bytes", - legend_hideEmpty=true, - legend_hideZero=true - ) - .addTarget( - prometheus.target( - expr='sum(' + prefix + '_jvm_memory_max{instance="$host", memory_type="total"})', - legendFormat="JVM Heap Total" - ) - ) - .addSeriesOverride({ - "alias": "/.*Total.*/", - "fill": 0 - }) - .addTarget( - prometheus.target( - expr='sum(' + prefix + '_jvm_memory_used{instance="$host", memory_type="non_heap"})', - legendFormat="JVM Non-Heap Used" - ) - ) - .addTarget( - prometheus.target( - expr='sum(' + prefix + '_jvm_memory_used{instance="$host", memory_type="heap"})', - legendFormat="JVM Heap Used" - ) - ) - ) -) \ No newline at end of file diff --git a/dashboards/grafana/make-dashboards.sh b/dashboards/grafana/make-dashboards.sh deleted file mode 100755 index 7e80adf..0000000 --- a/dashboards/grafana/make-dashboards.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -cd $DIR -for file in dashboards-jsonnet/*; do - name=$(basename $file); - echo "Generating ${name%.jsonnet}.json" - docker run -v `pwd`:/here datastax/grafonnet-lib:v0.1.3 jsonnet --ext-str prefix=mcac /here/$file > `pwd`/generated-dashboards/${name%.jsonnet}.json; -done diff --git a/make_package.sh b/make_package.sh index 2394077..3073c31 100755 --- a/make_package.sh +++ b/make_package.sh @@ -41,10 +41,7 @@ cd $PACKAGE_DIR tar zcvf $PROJECT_DIR_NAME.tar.gz $PROJECT_DIR_NAME zip $PROJECT_DIR_NAME.zip $(tar ztf $PROJECT_DIR_NAME.tar.gz) popd -pushd . -cd dashboards/grafana -./make-dashboards.sh -popd +mixin/make-dashboards.sh DASHBOARD_DIR_NAME=datastax-mcac-dashboards-$VERSION mkdir -p $PACKAGE_DIR/$DASHBOARD_DIR_NAME/grafana diff --git a/mixin/README.md b/mixin/README.md new file mode 100644 index 0000000..b2db7fb --- /dev/null +++ b/mixin/README.md @@ -0,0 +1,14 @@ +# MCAC Mixin + +The MCAC Mixin is a set of configurable, reusable, and extensible alerts and dashboards for Apache Cassandra. 
+ +## Requirements + +- [Jsonnet](https://github.com/google/go-jsonnet) +- [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler) (`jb`) + +## Install as library + +```shell +jb install github.com/datastax/metric-collector-for-apache-cassandra/mixin@master +``` diff --git a/mixin/config.libsonnet b/mixin/config.libsonnet new file mode 100644 index 0000000..cd9503e --- /dev/null +++ b/mixin/config.libsonnet @@ -0,0 +1,5 @@ +{ + _config+:: { + metricPrefix: 'mcac', + }, +} diff --git a/mixin/dashboards.jsonnet b/mixin/dashboards.jsonnet new file mode 100644 index 0000000..0d31ce1 --- /dev/null +++ b/mixin/dashboards.jsonnet @@ -0,0 +1,6 @@ +local dashboards = (import './mixin.libsonnet').grafanaDashboards; + +{ + [name]: dashboards[name] + for name in std.objectFields(dashboards) +} diff --git a/mixin/dashboards/cassandra-condensed.libsonnet b/mixin/dashboards/cassandra-condensed.libsonnet new file mode 100644 index 0000000..452d699 --- /dev/null +++ b/mixin/dashboards/cassandra-condensed.libsonnet @@ -0,0 +1,529 @@ +local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; + +local dashboard = grafana.dashboard; +local row = grafana.row; +local singlestat = grafana.singlestat; +local graphpanel = grafana.graphPanel; +local text = grafana.text; +local prometheus = grafana.prometheus; +local template = grafana.template; + +local graphHeight = 300; +local singlestatHeight = 100; +local singlestatSpan = 1; +local graphSpan = 4; + +{ + _config+:: { + metricPrefix: error 'must provide metric prefix', + }, + grafanaDashboards+:: { + 'cassandra-condensed.json': + dashboard.new( + 'Cassandra Cluster Condensed', + description='Single pane of glass for most important Cassandra metrics', + schemaVersion=14, + refresh='30s', + time_from='now-30m', + editable=true, + tags=['os'], + style='dark' + ) + .addTemplate( + template.datasource( + 'PROMETHEUS_DS', + 'prometheus', + 'Prometheus', + hide='all', + ) + ) + + .addTemplate( + 
template.custom( + 'by', + 'cluster,dc,rack,instance', + 'cluster', + valuelabels={ + cluster: 'Cluster', + dc: 'Datacenter', + rack: 'Rack', + instance: 'Host', + }, + label='Group By', + ) + ) + .addTemplate( + template.interval( + 'rate', + '1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d', + '5m', + label='Rate', + ) + ) + .addTemplate( + template.new( + 'cluster', + '$PROMETHEUS_DS', + 'label_values(collectd_collectd_queue_length{}, cluster)', + label='Cluster', + refresh='time', + includeAll=true, + allValues='.*', + ) + ) + .addTemplate( + template.new( + 'dc', + '$PROMETHEUS_DS', + 'label_values(collectd_collectd_queue_length{cluster=~"$cluster"}, dc)', + label='DataCenter', + refresh='time', + includeAll=true, + allValues='.*', + ) + ) + .addTemplate( + template.new( + 'rack', + '$PROMETHEUS_DS', + 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc"}, rack)', + label='Rack', + refresh='time', + includeAll=true, + allValues='.*', + ) + ) + .addTemplate( + template.new( + 'keyspace', + '$PROMETHEUS_DS', + 'label_values(' + $._config.metricPrefix + '_table_read_latency_total{cluster=~"$cluster", dc=~"$dc"}, keyspace)', + label='Keyspace', + refresh='time', + includeAll=true, + allValues='.*', + ) + ) + .addTemplate( + template.new( + 'table', + '$PROMETHEUS_DS', + 'label_values(' + $._config.metricPrefix + '_table_read_latency_total{cluster=~"$cluster", dc=~"$dc", keyspace=~"$keyspace"}, table)', + label='Table', + refresh='time', + includeAll=true, + allValues='.*', + ) + ) + .addTemplate( + template.new( + 'host', + '$PROMETHEUS_DS', + 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc", rack=~"$rack"}, instance)', + label='Host', + refresh='time', + includeAll=true, + allValues='.*', + ) + ) + .addTemplate( + template.custom( + 'latency', + '0.999,0.99,0.98,0.95,0.90,0.75,0.50', + '0.95', + valuelabels={ + '0.999': 'P999', + '0.99': 'P99', + '0.98': 'P98', + '0.95': 'P95', + '0.90': 'P90', + '0.75': 'P75', + '0.50': 
'P50', + }, + label='Percentile' + ) + ) + .addRow( + row.new( + title='Cluster Overview', + height=singlestatHeight, + ) + .addPanel( + singlestat.new( + 'Nodes Up', + description='Nodes that are currently running in this time window', + format='none', + decimals=0, + datasource='$PROMETHEUS_DS', + colorValue=true, + colors=['#d44a3a', '#299c46', '#299c46'], + thresholds='0.1,1000', + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'count(' + $._config.metricPrefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"} >= 0) or vector(0)' + ) + ) + ) + .addPanel( + singlestat.new( + 'Nodes Down', + description='Nodes that are currently not running in this time window', + format='none', + decimals=0, + colorValue=true, + colors=['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'], + datasource='$PROMETHEUS_DS', + thresholds='1,2', + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'count(absent(sum(rate(' + $._config.metricPrefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[5m])))) OR vector(0)' + ) + ) + ) + .addPanel( + singlestat.new( + 'Compactions / $rate', + description='Rate of compactions during this window', + format='none', + decimals=0, + datasource='$PROMETHEUS_DS', + sparklineShow=true, + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'sum(rate(' + $._config.metricPrefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))' + ) + ) + ) + .addPanel( + singlestat.new( + 'CQL Requests / $rate', + description='Rate of CQL requests during this window', + format='none', + datasource='$PROMETHEUS_DS', + sparklineShow=true, + decimals=0, + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'sum(irate(' + $._config.metricPrefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))' + ) + ) + ) + .addPanel( + singlestat.new( + 'Dropped Messages 
/ $rate', + description='Rate of Dropped requests during this window', + format='none', + datasource='$PROMETHEUS_DS', + sparklineShow=true, + thresholds='30,300', + colorValue=true, + decimals=0, + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'sum(irate(' + $._config.metricPrefix + '_table_dropped_mutations_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))' + ) + ) + ) + .addPanel( + text.new( + transparent=true, + mode='html', + content='', + span=2 + ) + ) + .addPanel( + singlestat.new( + 'CQL Clients', + description='Number of connected clients during this time window', + format='none', + datasource='$PROMETHEUS_DS', + sparklineShow=true, + thresholds='100,1000', + colorValue=true, + decimals=0, + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'sum(' + $._config.metricPrefix + '_client_connected_native_clients{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"})' + ) + ) + ) + .addPanel( + singlestat.new( + 'Timeouts / $rate', + description='Client timeouts over the last $rate', + format='none', + datasource='$PROMETHEUS_DS', + thresholds='100,300', + colorValue=true, + sparklineShow=true, + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'sum(irate(' + $._config.metricPrefix + '_client_request_timeouts_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))', + ) + ) + ) + .addPanel( + singlestat.new( + 'Hints / $rate', + description='Hints stored over the last $rate', + format='none', + datasource='$PROMETHEUS_DS', + thresholds='1000,30000', + colorValue=true, + sparklineShow=true, + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'sum(irate(' + $._config.metricPrefix + '_storage_hints_on_disk_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))' + ) + ) + ) + .addPanel( + singlestat.new( + 'Data Size', + description='Data', + format='bytes', + datasource='$PROMETHEUS_DS', + sparklineShow=true, + 
span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'sum(' + $._config.metricPrefix + '_table_live_disk_space_used_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"})' + ) + ) + ) + .addPanel( + singlestat.new( + 'GC Time / $rate', + description='Data', + format='ms', + decimals=1, + datasource='$PROMETHEUS_DS', + sparklineShow=true, + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'sum(rate(' + $._config.metricPrefix + '_jvm_gc_time{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))' + ) + ) + ) + ) + .addRow( + row.new( + title='Condensed Metrics', + height=graphHeight + ) + .addPanel( + graphpanel.new( + title='Requests Served / $by / $rate', + description='(no keyspace/table filters apply)', + datasource='$PROMETHEUS_DS', + span=graphSpan, + labelY2='Clients Connected', + legend_hideZero=true, + legend_hideEmpty=true + ) + .addSeriesOverride({ + alias: '/.*Connected/', + yaxis: 2, + }) + .addTarget( + prometheus.target( + expr='sum(irate(' + $._config.metricPrefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by, request_type)', + legendFormat='{{$by}}:{{request_type}}' + ) + ) + .addTarget( + prometheus.target( + expr='sum(' + $._config.metricPrefix + '_client_connected_native_clients{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}) by ($by)', + legendFormat='{{$by}}:Clients Connected' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Coordinator $latency Latency / $by', + description='(no keyspace/table filters apply)', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='µs', + min=0, + legend_hideZero=true, + legend_hideEmpty=true + ) + .addTarget( + prometheus.target( + expr='histogram_quantile($latency, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by (le, 
request_type, $by))', + legendFormat='$by:{{$by}} {{$latency}} {{request_type}}' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Memtable Space $keyspace.$table / $by', + datasource='$PROMETHEUS_DS', + span=graphSpan, + formatY1='bytes', + formatY2='short', + labelY2='Flush', + min=0, + legend_hideZero=true, + legend_hideEmpty=true + ) + .addSeriesOverride({ + alias: '/.*Flushes/', + bars: true, + lines: false, + zindex: -3, + yaxis: 2, + }) + .addTarget( + prometheus.target( + expr='sum(' + $._config.metricPrefix + '_table_memtable_off_heap_size{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}) by ($by)', + legendFormat='{{$by}} : Off Heap' + ) + ) + .addTarget( + prometheus.target( + expr='sum(' + $._config.metricPrefix + '_table_memtable_on_heap_size{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}) by ($by)', + legendFormat='{{$by}} : On Heap' + ) + ) + .addTarget( + prometheus.target( + expr='sum(idelta(' + $._config.metricPrefix + '_table_memtable_switch_count_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)', + legendFormat='{{$by}} : Flushes' + ) + ) + .addTarget( + prometheus.target( + expr='sum(idelta(' + $._config.metricPrefix + '_table_pending_flushes_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)', + legendFormat='{{$by}} : Pending Flushes' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Compactions $keyspace.$table / $by', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='bps', + formatY2='short', + labelY2='Count', + legend_hideZero=true, + legend_hideEmpty=true, + min=0 + ) + .addSeriesOverride({ + alias: '/.*Compactions/', + bars: true, + lines: false, + zindex: -3, + yaxis: 2, + }) + .addTarget( + prometheus.target( + expr='sum(irate(' + 
$._config.metricPrefix + '_table_compaction_bytes_written_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)', + legendFormat='{{$by}} : Bytes Compacted' + ) + ) + .addTarget( + prometheus.target( + expr='sum(irate(' + $._config.metricPrefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)', + legendFormat='{{$by}} : Pending Compactions' + ) + ) + .addTarget( + prometheus.target( + expr='sum(irate(' + $._config.metricPrefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by)', + legendFormat='{{$by}} : Completed Compactions' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Table $latency Latency / $by', + description='', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='µs', + min=0, + legend_hideZero=true, + legend_hideEmpty=true + ) + .addTarget( + prometheus.target( + expr='histogram_quantile($latency, sum(irate(' + $._config.metricPrefix + '_table_range_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))', + legendFormat='$by:{{$by}} Local Range Scan' + ) + ) + .addTarget( + prometheus.target( + expr='histogram_quantile($latency, sum(irate(' + $._config.metricPrefix + '_table_read_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))', + legendFormat='$by:{{$by}} Local Read' + ) + ) + .addTarget( + prometheus.target( + expr='histogram_quantile($latency, sum(irate(' + $._config.metricPrefix + '_table_write_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))', + legendFormat='$by:{{$by}} Local Write' + ) + ) + .addTarget( + prometheus.target( 
+ expr='histogram_quantile($latency, sum(irate(' + $._config.metricPrefix + '_table_coordinator_read_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))', + legendFormat='$by:{{$by}} Coordinator Read' + ) + ) + .addTarget( + prometheus.target( + expr='histogram_quantile($latency, sum(irate(' + $._config.metricPrefix + '_table_coordinator_scan_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))', + legendFormat='$by:{{$by}} Coordinator Range Scan' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Streaming / $by / $rate', + description='', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='Bps', + min=0, + legend_hideZero=true, + legend_hideEmpty=true + ) + .addTarget( + prometheus.target( + expr='sum(irate(' + $._config.metricPrefix + '_streaming_total_incoming_bytes_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by)', + legendFormat='{{$by}}: Incoming Stream' + ) + ) + .addTarget( + prometheus.target( + expr='sum(irate(' + $._config.metricPrefix + '_streaming_total_outgoing_bytes_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by)', + legendFormat='{{$by}}: Outgoing Stream' + ) + ) + ) + ), + }, +} diff --git a/mixin/dashboards/dashboards.libsonnet b/mixin/dashboards/dashboards.libsonnet new file mode 100644 index 0000000..aa49aca --- /dev/null +++ b/mixin/dashboards/dashboards.libsonnet @@ -0,0 +1,3 @@ +(import './cassandra-condensed.libsonnet') + +(import './overview.libsonnet') + +(import './system-metrics.libsonnet') diff --git a/mixin/dashboards/overview.libsonnet b/mixin/dashboards/overview.libsonnet new file mode 100644 index 0000000..1c1effc --- /dev/null +++ b/mixin/dashboards/overview.libsonnet @@ -0,0 +1,1108 @@ +local grafana = (import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet') + + 
(import 'github.com/thelastpickle/grafonnet-polystat-panel/plugin.libsonnet'); +local dashboard = grafana.dashboard; +local prometheus = grafana.prometheus; +local template = grafana.template; +local row = grafana.row; + +local graphPanel = grafana.graphPanel; +local tablePanel = grafana.tablePanel; +local singleStatPanel = grafana.singlestat; +local textPanel = grafana.text; +local polystatPanel = grafana.polystatPanel; + +local fillLatencySeriesOverrides = { + alias: 'p999', + fillBelowTo: 'p98', + lines: false, +}; +local removeMinLatencySeriesOverrides = { + alias: 'p98', + lines: false, +}; + +local fillMinMaxSeriesOverrides = { + alias: 'max', + fillBelowTo: 'min', + lines: false, +}; +local removeMinlineSeriesOverrides = { + alias: 'min', + lines: false, +}; + + +// used in the single stat panels where higher is better - cache hit rates for example +local reversedColors = [ + '#d44a3a', + 'rgba(237, 129, 40, 0.89)', + '#299c46', +]; + +{ + _config+:: { + metricPrefix: error 'must provide metric prefix', + }, + grafanaDashboards+:: { + 'overview.json': + dashboard.new( + 'Cassandra Overview', + schemaVersion=14, + refresh='30s', + time_from='now-30m', + editable=true, + tags=['Cassandra', 'Overview'], + style='dark' + ) + .addTemplate( + grafana.template.datasource( + 'PROMETHEUS_DS', + 'prometheus', + 'Prometheus', + hide='all', + ) + ) + .addTemplate( + template.new( + 'cluster', + '$PROMETHEUS_DS', + 'label_values(collectd_collectd_queue_length{}, cluster)', + label='Cluster', + refresh='time', + ) + ) + .addTemplate( + template.new( + 'dc', + '$PROMETHEUS_DS', + 'label_values(collectd_collectd_queue_length{cluster=~"$cluster"}, dc)', + label='DataCenter', + refresh='time', + includeAll=true, + allValues='.*', + ) + ) + .addTemplate( + template.new( + 'rack', + '$PROMETHEUS_DS', + 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc"}, rack)', + label='Rack', + refresh='time', + includeAll=true, + allValues='.*', + ) + ) + 
.addTemplate( + template.new( + 'node', + '$PROMETHEUS_DS', + 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc", rack=~"$rack"}, instance)', + label='Node', + refresh='time', + includeAll=true, + allValues='.*', + ) + ) + .addRow( + row.new(title='', height='50px') + .addPanel(textPanel.new(transparent=true)) + .addPanel( + textPanel.new( + transparent=true, + mode='html', + content='', + ) + ) + .addPanel(textPanel.new(transparent=true)) + ) + .addRow( + row.new(title='Request Throughputs (Coordinator Perspective)') + .addPanel( + graphPanel.new( + 'Request Throughputs', + description='Total Requests Per Cluster, by Request Type', + format='rps', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + min=0, + ) + .addTarget( + prometheus.target( + expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='{{request_type}}', + ) + ) + ) + .addPanel( + graphPanel.new( + 'Error throughputs', + description='Total Timeouts, Failures, Unavailable Rates for each cluster', + format='rps', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + min=0, + ) + .addTarget( + prometheus.target( + expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_failures_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='{{request_type}} failures', + ) + ) + .addTarget( + prometheus.target( + expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + 
'_client_request_timeouts_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='{{request_type}} timeouts', + ) + ) + .addTarget( + prometheus.target( + expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_unavailables_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='{{request_type}} unavailable errors', + ) + ) + .addTarget( + prometheus.target( + expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_unfinished_commit_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='{{request_type}} unfinished commit errors', + ) + ) + .addTarget( + prometheus.target( + expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_condition_not_met_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='{{request_type}} condition not met errors', + ) + ) + .addTarget( + prometheus.target( + expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_contention_histogram_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='{{request_type}} contention histogram errors', + ) + ) + ) + .addPanel( + singleStatPanel.new( + 'Read / Write Distribution', + description='Part of reads in the total of standard requests (Reads+Writes). CAS, Views, ... 
operations are ignored.', + format='percentunit', + datasource='$PROMETHEUS_DS', + transparent=true, + postfix=' Reads', + postfixFontSize='30%', + valueFontSize='30%', + valueName='current', + decimals=2, + thresholds='0.25,0.5,0.75', + timeFrom='', + colors=[ + '#DEB6F2', + '#CA95E5', + '#8F3BB8', + ], + gaugeShow=true, + gaugeMinValue=0, + gaugeMaxValue=1, + gaugeThresholdLabels=true, + gaugeThresholdMarkers=false, + sparklineFillColor='rgba(31, 118, 189, 0.18)', + sparklineFull=false, + sparklineLineColor='#FFB357', + sparklineShow=false + ) + .addTarget( + prometheus.target( + expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s])) / ignoring (request_type) (sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s])) + ignoring (request_type) sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[1m:30s])))', + ) + ) + ) + .addPanel( + graphPanel.new( + 'Read Latency (98 - 999th percentile)', + description='Read latency for coordinated reads', + format='µs', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + min=0, + ) + .addTarget( + prometheus.target( + expr='histogram_quantile(0.98, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[5m])) by (le, cluster))', + legendFormat='p98', + ) + ) + .addTarget( + prometheus.target( + expr='histogram_quantile(0.99, sum(rate(' + 
$._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[5m])) by (le, cluster))', + legendFormat='p99', + ) + ) + .addTarget( + prometheus.target( + expr='histogram_quantile(0.999, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[5m])) by (le, cluster))', + legendFormat='p999', + ) + ) + .addSeriesOverride(fillLatencySeriesOverrides) + .addSeriesOverride(removeMinLatencySeriesOverrides) + ) + .addPanel( + graphPanel.new( + 'Write Latency (98th - p999 Percentile)', + description='Write latency for coordinated writes', + format='µs', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + min=0, + ) + .addTarget( + prometheus.target( + expr='histogram_quantile(0.98, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))', + legendFormat='p98', + ) + ) + .addTarget( + prometheus.target( + expr='histogram_quantile(0.99, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))', + legendFormat='p99', + ) + ) + .addTarget( + prometheus.target( + expr='histogram_quantile(0.999, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))', + legendFormat='p999', + ) + ) + .addSeriesOverride(fillLatencySeriesOverrides) + .addSeriesOverride(removeMinLatencySeriesOverrides) + ) + .addPanel( + graphPanel.new( + 'Other Latencies', + 
description='Other p99 latencies for coordinated requests', + format='µs', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + min=0, + ) + .addTarget( + prometheus.target( + // In scope!~"Write|Read|.*-.*", we want to exclude charts above and all the per-consistency_level info like "Read-LOCAL_ONE" + expr='histogram_quantile(0.99, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type!~"write|read|.*-.*"}[1m:30s])) by (le, request_type, cluster))', + legendFormat='p99 {{request_type}}' + ) + ) + ) + ) + .addRow( + row.new(title='Nodes Status',) + .addPanel( + polystatPanel.new( + 'Nodes Status', + description="Nodes Status uses Internal/Gossip activity. Be mindful that if Native or Thrift protocol are disabled, the nodes won't be reachable, and still marked up", + datasource='$PROMETHEUS_DS', + transparent=true, + span=12, + global_unit_format='none', + global_operator_name='current', + global_thresholds=[ + { + value: 0, + state: 2, + color: '#d44a3a', + }, + { + value: 1, + state: 0, + color: '#299c46', + }, + ], + range_maps=[ + { + from: '0', + to: '0.9999', + text: 'DOWN', + }, + { + from: '1', + to: '1', + text: 'UP', + }, + ], + mapping_type=2, + value_enabled=true, + ) + .addTarget( + prometheus.target( + 'max by (cluster, dc, rack, instance) (changes(' + $._config.metricPrefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="gossip_stage"}[2m:30s])) > bool 0', + legendFormat='{{instance}}', + instant=true, + ) + ) + ) + .addPanel( + singleStatPanel.new( + 'Nodes Count', + description='Nodes up and down in the cluster', + format='short', + datasource='$PROMETHEUS_DS', + transparent=true, + decimals=0, + prefix='Total:', + 
postfix=' Nodes', + postfixFontSize='80%', + valueFontSize='80%', + span=4 + ) + .addTarget( + prometheus.target( + expr='count by (cluster) (max by (cluster, dc, rack, instance) (collectd_collectd_queue_length{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}))', + legendFormat='Total Number Of Nodes', + ) + ) + ) + .addPanel( + graphPanel.new( + 'Nodes Status History', + description='Nodes up and down in the cluster per protocol/activity', + format='short', + datasource='$PROMETHEUS_DS', + transparent=true, + decimals=0, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=false, + shared_tooltip=false, + min=0, + span=8 + ) + .addTarget( + prometheus.target( + expr='count by (cluster) (max by (cluster, dc, rack, instance) (collectd_collectd_queue_length{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}))', + legendFormat='Total Number Of Nodes', + ) + ) + .addTarget( + prometheus.target( + expr='sum by (cluster) (max by (cluster, datacenter, rack, instance) (changes(' + $._config.metricPrefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="native"}[2m:30s])) > bool 0)', + legendFormat='Nodes Coordinating Requests (Native protocol)', + ) + ) + .addTarget( + prometheus.target( + expr='sum by (cluster) (max by (cluster, datacenter, rack, instance) (changes(' + $._config.metricPrefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="gossip_stage"}[2m:30s])) > bool 0)', + legendFormat='Nodes With Internal Activity (Gossip protocol)', + ) + ) + ) + ) + .addRow( + row.new(title='Data Status') + .addPanel( + tablePanel.new( + 'Disk Space Usage', + description='Disk space used ordered (fullest disks first)', + datasource='$PROMETHEUS_DS', + transform='timeseries_aggregations', + transparent=true, + styles=[ + { + alias: 'Node --> 
Mounting Point', + colorMode: null, + colors: [ + 'rgba(245, 54, 54, 0.9)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(50, 172, 45, 0.97)', + ], + dateFormat: 'YYYY-MM-DD HH:mm:ss', + decimals: 2, + mappingType: 1, + pattern: 'Metric', + preserveFormat: true, + sanitize: true, + thresholds: [], + type: 'string', + unit: 'short', + }, + { + alias: '% Disk Space Used', + colorMode: 'row', + colors: [ + 'rgba(50, 172, 45, 0.97)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(245, 54, 54, 0.9)', + ], + dateFormat: 'YYYY-MM-DD HH:mm:ss', + decimals: 2, + link: false, + mappingType: 1, + pattern: 'Current', + thresholds: [ + '0.5', + '0.75', + ], + type: 'number', + unit: 'percentunit', + }, + ], + columns=[ + { + text: 'Current', + value: 'current', + }, + ], + sort={ + col: 1, + desc: true, + } + ) + .addTarget( + prometheus.target( + expr='min by (instance, df) (1-(collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="free"}\n / ignoring (type) (collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="used"}\n + ignoring (type) collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="reserved"}\n + ignoring (type) collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="free"}))\n )', + legendFormat='{{cluster}}-{{instance}} --> {{df}}', + instant=true + ) + ) + ) + .addPanel( + graphPanel.new( + 'Cassandra cluster Data Size', + description='Total sizes of the data on distinct nodes', + format='bytes', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + ) + .addTarget( + prometheus.target( + expr='sum by (cluster) (' + $._config.metricPrefix + 
'_table_live_disk_space_used_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='Live space - {{cluster}}', + ) + ) + .addTarget( + prometheus.target( + expr='sum by (cluster) (' + $._config.metricPrefix + '_table_total_disk_space_used_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='Total space - {{cluster}}', + ) + ) + ) + .addPanel( + graphPanel.new( + 'SSTable Count', + description='SSTable Count Max and Average per table', + format='short', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + min=0, + ) + .addTarget( + prometheus.target( + expr='max by (cluster, keyspace, table) (' + $._config.metricPrefix + '_table_live_ss_table_count{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='Table - {{keyspace}}.{{table}}', + ) + ) + .addTarget( + prometheus.target( + expr='max by (cluster) (' + $._config.metricPrefix + '_table_live_ss_table_count{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='Max in cluster - {{cluster}}', + ) + ) + ) + .addPanel( + graphPanel.new( + 'Pending Compactions', + description='Maximum pending compactions on any node in the cluster', + format='short', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + min=0, + bars=false, + lines=true, + stack=false, + decimals=0, + ) + .addTarget( + prometheus.target( + expr='max by (cluster) (' + $._config.metricPrefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='max', + ) + ) + .addTarget( + prometheus.target( + expr='min by (cluster) (' + 
$._config.metricPrefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='min', + ) + ) + .addTarget( + prometheus.target( + expr='avg by (cluster) (' + $._config.metricPrefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='avg', + ) + ) + .addSeriesOverride(fillMinMaxSeriesOverrides) + .addSeriesOverride(removeMinlineSeriesOverrides) + ) + .addPanel( + graphPanel.new( + 'Pending Compactions per Table', + description='Maximum pending compactions per table', + format='short', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + min=0, + bars=false, + lines=true, + stack=true, + decimals=0, + ) + .addTarget( + prometheus.target( + expr='max by (cluster, keyspace, table) (' + $._config.metricPrefix + '_table_pending_compactions{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='max for {{keyspace}}.{{table}}', + ) + ) + ) + ) + .addRow( + row.new(title='Cassandra Internals') + .addPanel( + graphPanel.new( + 'Pending Tasks', + description='Cluster wide pending threads, by thread pool name', + format='short', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + min=0, + ) + .addTarget( + prometheus.target( + expr='sum by (cluster, pool_name) (' + $._config.metricPrefix + '_thread_pools_pending_tasks{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='{{cluster}} - pending {{pool_name}}', + ) + ) + ) + .addPanel( + graphPanel.new( + 'Blocked Tasks', + description='Cluster wide blocked threads, by thread pool name', + format='short', 
+ datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + min=0, + ) + .addTarget( + prometheus.target( + expr='sum by (cluster, pool_name) (rate(' + $._config.metricPrefix + '_thread_pools_total_blocked_tasks_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='{{cluster}} - blocked {{pool_name}}', + ) + ) + ) + .addPanel( + graphPanel.new( + 'Dropped Messages', + description='Dropped messages rate summed by message type and cluster', + format='short', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + min=0, + ) + .addTarget( + prometheus.target( + expr='sum by (cluster, message_type) (rate(' + $._config.metricPrefix + '_dropped_message_dropped_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='{{cluster}} - dropped {{message_type}}', + ) + ) + ) + .addPanel( + graphPanel.new( + 'Active Tasks', + description='active threads summed per cluster', + format='short', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + min=0, + ) + .addTarget( + prometheus.target( + expr='sum by (cluster, pool_name) (' + $._config.metricPrefix + '_thread_pools_active_tasks{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='{{cluster}} - active {{pool_name}}', + ) + ) + ) + .addPanel( + graphPanel.new( + 'Hinted Handoff', + description='Sum of hints being handed off per cluster.', + format='short', + datasource='$PROMETHEUS_DS', + 
transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + min=0, + ) + .addTarget( + prometheus.target( + expr='sum by (cluster) (' + $._config.metricPrefix + '_storage_total_hints_in_progress_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='count', + ) + ) + ) + ) + .addRow( + row.new(title='Hardware / Operating System') + .addPanel( + graphPanel.new( + 'CPU Utilization', + description='Maximum CPU utilisation (max 100%)', + format='percentunit', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + percentage=true, + decimals=1, + min=0, + max=1, + ) + .addTarget( + prometheus.target( + expr='max by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{type="idle", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))))', + legendFormat='max', + ) + ) + .addTarget( + prometheus.target( + expr='min by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{type="idle", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))))', + legendFormat='min', + ) + ) + .addTarget( + prometheus.target( + expr='avg by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{type="idle", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{cluster="$cluster", 
dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))))', + legendFormat='avg', + ) + ) + .addSeriesOverride(fillMinMaxSeriesOverrides) + .addSeriesOverride(removeMinlineSeriesOverrides) + ) + .addPanel( + graphPanel.new( + 'Unix Load (1m rate)', + description='Max Unix load on a node for a cluster', + format='short', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + ) + .addTarget( + prometheus.target( + expr='max by (cluster) (collectd_load_shortterm{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='max', + ) + ) + .addTarget( + prometheus.target( + 'min by (cluster) (collectd_load_shortterm{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='min', + ) + ) + .addTarget( + prometheus.target( + 'avg by (cluster) (collectd_load_shortterm{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='avg', + ) + ) + .addSeriesOverride(fillMinMaxSeriesOverrides) + .addSeriesOverride(removeMinlineSeriesOverrides) + ) + .addPanel( + graphPanel.new( + 'Memory Utilisation', + description='Maximum Memory allocated per usage (worst node) - excludes caches, buffers, etc', + format='bytes', + datasource='$PROMETHEUS_DS', + transparent=true, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + fill=1, + linewidth=2, + ) + .addTarget( + prometheus.target( + expr='min by (cluster) (sum by (cluster, dc, rack, instance) (collectd_memory{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}))', + legendFormat='min memory available', + ) + ) + .addTarget( + prometheus.target( + expr='max by (cluster, memory) (sum by (cluster, dc, rack, instance, memory) (collectd_memory{cluster="$cluster", 
dc=~"$dc", rack=~"$rack", instance=~"$node"}))', + legendFormat='max memory {{memory}}', + ) + ) + ) + .addPanel( + graphPanel.new( + 'Disk Read Throughput', + description='Disk read throughput', + format='Bps', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + ) + .addTarget( + prometheus.target( + expr='max by (cluster) (rate(collectd_processes_disk_octets_read_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='max', + ) + ) + .addTarget( + prometheus.target( + 'min by (cluster) (rate(collectd_processes_disk_octets_read_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='min', + ) + ) + .addTarget( + prometheus.target( + 'avg by (cluster) (rate(collectd_processes_disk_octets_read_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='avg', + ) + ) + .addSeriesOverride(fillMinMaxSeriesOverrides) + .addSeriesOverride(removeMinlineSeriesOverrides) + ) + .addPanel( + graphPanel.new( + 'Disk Write Throughput', + description='Disk write throughput', + format='Bps', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + ) + .addTarget( + prometheus.target( + expr='max by (cluster) (rate(collectd_processes_disk_octets_write_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='max', + ) + ) + .addTarget( + prometheus.target( + 'min by (cluster) (rate(collectd_processes_disk_octets_write_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='min', + ) + ) + .addTarget( + prometheus.target( + 'avg by
(cluster) (rate(collectd_processes_disk_octets_write_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='avg', + ) + ) + .addSeriesOverride(fillMinMaxSeriesOverrides) + .addSeriesOverride(removeMinlineSeriesOverrides) + ) + .addPanel( + graphPanel.new( + 'Network I/O', + description='Network In and Out per cluster', + format='bytes', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=1, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + bars=false, + ) + .addTarget( + prometheus.target( + 'sum by (cluster) (rate(collectd_interface_if_octets_tx_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='outgoing', + ) + ) + .addTarget( + prometheus.target( + 'sum by (cluster) (rate(collectd_interface_if_octets_rx_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='incoming', + ) + ) + .addSeriesOverride({ + alias: 'incoming', + transform: 'negative-Y', + }) + + ) + ) + .addRow( + row.new(title='JVM / Garbage Collection') + .addPanel( + graphPanel.new( + 'Application Throughput (% time NOT doing GC)', + description='Percentage of the time the node is *not* doing a GC, thus Cassandra is not stopped for GC', + format='percentunit', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + decimals=2, + min=0, + max=1, + ) + .addTarget( + prometheus.target( + 'max by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + $._config.metricPrefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))', + legendFormat='max', + ) + ) + .addTarget( + prometheus.target( + 'min by (cluster) (1 - (sum by
(cluster, dc, rack, instance) (rate(' + $._config.metricPrefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))', + legendFormat='min', + ) + ) + .addTarget( + prometheus.target( + 'avg by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + $._config.metricPrefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))', + legendFormat='avg', + ) + ) + .addSeriesOverride(fillMinMaxSeriesOverrides) + .addSeriesOverride(removeMinlineSeriesOverrides) + ) + .addPanel( + graphPanel.new( + 'Garbage Collection Time', + description='Garbage collection duration', + format='ms', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + ) + .addTarget( + prometheus.target( + 'max by (cluster) (rate(' + $._config.metricPrefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='max', + ) + ) + .addTarget( + prometheus.target( + 'min by (cluster) (rate(' + $._config.metricPrefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='min', + ) + ) + .addTarget( + prometheus.target( + 'avg by (cluster) (rate(' + $._config.metricPrefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='avg', + ) + ) + .addSeriesOverride(fillMinMaxSeriesOverrides) + .addSeriesOverride(removeMinlineSeriesOverrides) + ) + .addPanel( + graphPanel.new( + 'JVM Heap Memory Utilisation', + description='Maximum JVM Heap Memory size (worst node) and minimum available heap size', + format='bytes', + datasource='$PROMETHEUS_DS', + transparent=true, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + 
legend_sortDesc=true, + shared_tooltip=false, + fill=1, + linewidth=2, + ) + .addTarget( + prometheus.target( + 'max by (cluster)\n (' + $._config.metricPrefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='max', + ) + ) + .addTarget( + prometheus.target( + 'min by (cluster)\n (' + $._config.metricPrefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='min', + ) + ) + .addTarget( + prometheus.target( + 'avg by (cluster)\n (' + $._config.metricPrefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='avg', + ) + ) + .addTarget( + prometheus.target( + 'min by ( cluster)\n (' + $._config.metricPrefix + '_jvm_memory_max{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='Heap memory available', + ) + ) + .addSeriesOverride(fillMinMaxSeriesOverrides) + .addSeriesOverride(removeMinlineSeriesOverrides) + ) + ), + }, +} diff --git a/mixin/dashboards/system-metrics.libsonnet b/mixin/dashboards/system-metrics.libsonnet new file mode 100644 index 0000000..ed70c34 --- /dev/null +++ b/mixin/dashboards/system-metrics.libsonnet @@ -0,0 +1,908 @@ +local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; + +local dashboard = grafana.dashboard; +local row = grafana.row; +local singlestat = grafana.singlestat; +local graphpanel = grafana.graphPanel; +local text = grafana.text; +local prometheus = grafana.prometheus; +local template = grafana.template; + +local textstatHeight = 100; +local graphHeight = 250; +local singlestatHeight = 125; +local singlestatSpan = 2; +local graphSpan = 6; + +{ + _config+:: { + metricPrefix: error 'must provide metric prefix', + }, + grafanaDashboards+:: { + 'system-metrics.json': + dashboard.new( + 'System & Node Metrics', + description='Operating 
System Metrics and Apache Cassandra Node Information', + schemaVersion=14, + time_from='now-30m', + refresh='1m', + tags=['os'], + style='dark' + ) + .addTemplate( + template.datasource( + 'PROMETHEUS_DS', + 'prometheus', + 'Prometheus', + hide='all', + ) + ) + .addTemplate( + template.interval( + 'rate', + '1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d', + '5m', + label='Rate', + ) + ) + .addTemplate( + template.new( + 'host', + '$PROMETHEUS_DS', + 'label_values(collectd_collectd_queue_length{}, instance)', + label='Host', + refresh='time', + ) + ) + .addPanel( + text.new( + transparent=true, + mode='html', + content='' + ), + { + h: 3, + w: 5, + x: 9, + y: -1, + }, + ) + .addRow( + row.new( + title='Basic CPU / Mem / Disk Gauge', + height=singlestatHeight, + ) + .addPanel( + singlestat.new( + 'CPU Busy', + description='Busy state of all CPU cores together', + format='percent', + datasource='$PROMETHEUS_DS', + thresholds='85,95', + sparklineShow=true, + gaugeShow=true, + span=singlestatSpan + ) + .addTarget( + prometheus.target( + "(1 - ((sum(irate(collectd_cpu_total{instance='$host', type='idle'}[$rate])) by (instance) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) by (instance)))) * 100", + ) + ) + ) + .addPanel( + singlestat.new( + 'Memory Used', + description='Percentage of memory used (ignoring page cache)', + format='percent', + datasource='$PROMETHEUS_DS', + thresholds='85,95', + sparklineShow=true, + gaugeShow=true, + span=singlestatSpan + ) + .addTarget( + prometheus.target( + '100 * (1 - ((sum(collectd_memory{instance="$host", memory="free"}) + sum(collectd_memory{instance="$host", memory="cached"}) + sum(collectd_memory{instance="$host", memory="buffered"})) / sum(collectd_memory{instance="$host"})))', + ) + ) + ) + .addPanel( + singlestat.new( + 'Swap Used', + description='Percentage of swap in use', + format='percent', + datasource='$PROMETHEUS_DS', + thresholds='0,1', + sparklineShow=true, + gaugeShow=true, + span=singlestatSpan + ) + .addTarget( +
prometheus.target( + "(sum(collectd_swap{instance='$host',swap='used'}) / sum(collectd_swap{instance='$host'})) * 100" + ) + ) + ) + .addPanel( + singlestat.new( + 'Disk Used', + description='Percentage of root disk in use', + format='percent', + datasource='$PROMETHEUS_DS', + thresholds='50,85', + sparklineShow=true, + gaugeShow=true, + span=singlestatSpan + ) + .addTarget( + prometheus.target( + '(sum(collectd_df_df_complex{instance="$host", df="root", type="used"}) / sum(collectd_df_df_complex{instance="$host", df="root"})) * 100' + ) + ) + ) + .addPanel( + singlestat.new( + 'CPU System Load (1m avg)', + description='Busy state of all CPU cores together (1 min average)', + format='percent', + datasource='$PROMETHEUS_DS', + thresholds='85,95', + sparklineShow=true, + gaugeShow=true, + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'avg(collectd_load_shortterm{instance="$host"}) / count(count(collectd_cpu_total{instance="$host"}) by (cpu)) * 100', + ) + ) + ) + .addPanel( + singlestat.new( + 'CPU System Load (5m avg)', + description='Busy state of all CPU cores together (5 min average)', + format='percent', + datasource='$PROMETHEUS_DS', + thresholds='85,95', + sparklineShow=true, + gaugeShow=true, + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'avg(collectd_load_midterm{instance="$host"}) / count(count(collectd_cpu_total{instance="$host"}) by (cpu)) * 100', + ) + ) + ) + ) + .addRow( + row.new( + title='Basic CPU / Mem / Disk Info', + height=textstatHeight + ) + .addPanel( + singlestat.new( + 'CPU Cores', + description='Total number of CPU cores', + format='short', + datasource='$PROMETHEUS_DS', + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'count(count(collectd_cpu_total{instance="$host"}) by (cpu))' + ) + ) + ) + .addPanel( + singlestat.new( + 'Total RAM', + description='Total amount of system memory', + format='bytes', + datasource='$PROMETHEUS_DS', + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 
'sum(collectd_memory{instance="$host"})' + ) + ) + ) + .addPanel( + singlestat.new( + 'Total Swap', + description='Total amount of swap space', + format='bytes', + datasource='$PROMETHEUS_DS', + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'sum(collectd_swap{instance="$host"})' + ) + ) + ) + .addPanel( + singlestat.new( + 'Total RootFS', + description='Total amount of disk space', + format='bytes', + datasource='$PROMETHEUS_DS', + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'sum(collectd_df_df_complex{df="root",instance="$host"})' + ) + ) + ) + .addPanel( + singlestat.new( + 'System Load (1m avg)', + description='System Load (1m avg)', + format='short', + datasource='$PROMETHEUS_DS', + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'collectd_load_shortterm{instance="$host"}' + ) + ) + ) + .addPanel( + singlestat.new( + 'System Uptime', + description='Uptime of the host', + format='s', + decimals=1, + datasource='$PROMETHEUS_DS', + span=singlestatSpan + ) + .addTarget( + prometheus.target( + 'collectd_uptime{instance="$host"}' + ) + ) + ) + ) + .addRow( + row.new( + title='Basic CPU / Mem Graph', + height=graphHeight + ) + .addPanel( + graphpanel.new( + title='CPU Basic', + datasource='$PROMETHEUS_DS', + span=graphSpan, + percentage=true, + stack=true, + min=0, + max=100 + ) + .addTarget( + prometheus.target( + expr="sum(irate(collectd_cpu_total{instance='$host',type='system'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100", + legendFormat='Busy System' + ) + ) + .addTarget( + prometheus.target( + expr="sum(irate(collectd_cpu_total{instance='$host',type='user'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100", + legendFormat='Busy User' + ) + ) + .addTarget( + prometheus.target( + expr="sum(irate(collectd_cpu_total{instance='$host',type='wait'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100", + legendFormat='Busy IOWait' + ) + ) + .addTarget( + 
prometheus.target( + expr="sum(irate(collectd_cpu_total{instance='$host',type='softirq'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100", + legendFormat='Busy IRQ' + ) + ) + .addTarget( + prometheus.target( + expr="sum(irate(collectd_cpu_total{instance='$host',type!='idle',type!='system',type!='user',type!='wait',type!='softirq'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100", + legendFormat='Busy Other' + ) + ) + .addTarget( + prometheus.target( + expr="sum(irate(collectd_cpu_total{instance='$host',type='idle'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100", + legendFormat='Idle' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Basic memory usage', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='bytes', + min=0 + ) + .addTarget( + prometheus.target( + expr='sum(collectd_memory{instance="$host"})', + legendFormat='RAM Total' + ) + ) + .addTarget( + prometheus.target( + expr='sum(collectd_memory{instance="$host"}) - sum(collectd_memory{instance="$host", memory="free"}) - sum(collectd_memory{instance="$host", memory="cached"}) - sum(collectd_memory{instance="$host", memory="buffered"})', + legendFormat='RAM Used' + ) + ) + .addTarget( + prometheus.target( + expr='sum(collectd_memory{instance="$host", memory="cached"}) + sum(collectd_memory{instance="$host", memory="buffered"})', + legendFormat='RAM Cache + Buffer' + ) + ) + .addTarget( + prometheus.target( + expr='sum(collectd_memory{instance="$host", memory="free"})', + legendFormat='RAM Free' + ) + ) + .addTarget( + prometheus.target( + expr='sum(collectd_swap{instance="$host"}) - sum(collectd_swap{instance="$host", swap="free"})', + legendFormat='SWAP Used' + ) + ) + ) + ) + .addRow( + row.new( + title='Basic Network / Disk Graph', + height=graphHeight + ) + .addPanel( + graphpanel.new( + title='Network Traffic / Second', + description='Basic network info per interface', + datasource='$PROMETHEUS_DS', + span=graphSpan, + 
format='bps', + labelY1='Receive (-) / Send (+)', + + ) + .addSeriesOverride({ + alias: '/.*receive.*/', + transform: 'negative-Y', + }) + .addTarget( + prometheus.target( + expr="irate(collectd_interface_if_octets_rx_total{instance='$host'}[$rate]) * 8", + legendFormat='{{interface}} receive' + ) + ) + .addTarget( + prometheus.target( + expr="irate(collectd_interface_if_octets_tx_total{instance='$host'}[$rate]) * 8", + legendFormat='{{interface}} send' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Network Packets / Second', + description='Basic network info per interface', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='pps', + labelY1='Receive (-) / Send (+)', + + ) + .addSeriesOverride({ + alias: '/.*receive.*/', + transform: 'negative-Y', + }) + .addTarget( + prometheus.target( + expr="irate(collectd_interface_if_packets_rx_total{instance='$host'}[$rate])", + legendFormat='{{interface}} receive' + ) + ) + .addTarget( + prometheus.target( + expr="irate(collectd_interface_if_packets_tx_total{instance='$host'}[$rate])", + legendFormat='{{interface}} send' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Disk Activity / Second', + description='Disk Activity / Second', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='Bps', + labelY1='Read (-) / Write (+)', + legend_hideZero=true, + legend_hideEmpty=true + ) + .addSeriesOverride({ + alias: '/.*Read.*/', + transform: 'negative-Y', + }) + .addTarget( + prometheus.target( + expr='irate(collectd_disk_disk_octets_read_total{instance="$host", disk=~".*\\\\d+"}[$rate])', + legendFormat='{{disk}} - Read' + ) + ) + .addTarget( + prometheus.target( + expr='irate(collectd_disk_disk_octets_write_total{instance="$host", disk=~".*\\\\d+"}[$rate])', + legendFormat='{{disk}} - Write' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Disk IOPS', + description='Disk iops per disk', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='iops', + labelY1='Read (-) / Write (+)', +
legend_hideZero=true, + legend_hideEmpty=true + ) + .addSeriesOverride({ + alias: '/.*Read.*/', + transform: 'negative-Y', + }) + .addTarget( + prometheus.target( + expr='irate(collectd_disk_disk_ops_read_total{instance="$host", disk=~".*\\\\d+"}[$rate])', + legendFormat='{{disk}} - Read' + ) + ) + .addTarget( + prometheus.target( + expr='irate(collectd_disk_disk_ops_write_total{instance="$host", disk=~".*\\\\d+"}[$rate])', + legendFormat='{{disk}} - Write' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Disk Used', + description='Disk space used', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='decbytes', + legend_hideZero=true, + legend_hideEmpty=true + ) + .addTarget( + prometheus.target( + expr='sum(collectd_df_df_complex{instance="$host", type="used"}) by (df)', + legendFormat='{{df}}' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Disk Queue Length', + description='The amount of requests pending in the disk queue', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='short', + legend_hideZero=true, + legend_hideEmpty=true + ) + .addTarget( + prometheus.target( + expr='irate(collectd_disk_disk_io_time_weighted_io_time_total{instance="$host",disk=~".*[0-9]+"}[$rate]) / 1000', + legendFormat='{{disk}}' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Disk Latency', + description='Disk access times', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='ms', + labelY1='Read (-) / Write (+)', + legend_hideZero=true, + legend_hideEmpty=true + ) + .addSeriesOverride({ + alias: '/.*Read.*/', + transform: 'negative-Y', + }) + .addTarget( + prometheus.target( + expr='irate(collectd_disk_disk_time_read_total{instance="$host",disk=~".*[0-9]+"}[$rate])', + legendFormat='{{disk}} - Read' + ) + ) + .addTarget( + prometheus.target( + expr='irate(collectd_disk_disk_time_write_total{instance="$host",disk=~".*[0-9]+"}[$rate])', + legendFormat='{{disk}} - Write' + ) + ) + ) + ) + .addRow( + row.new( + title='CPU Details', + height=graphHeight, + 
collapse=true + ) + .addPanel( + graphpanel.new( + title='CPU User', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='percent', + min=0, + max=100 + ) + .addTarget( + prometheus.target( + expr='sum(rate(collectd_cpu_total{instance="$host", type="user"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100', + legendFormat='{{cpu}}' + ) + ) + ) + .addPanel( + graphpanel.new( + title='CPU System', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='percent', + min=0, + max=100 + ) + .addTarget( + prometheus.target( + expr='sum(rate(collectd_cpu_total{instance="$host", type="system"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100', + legendFormat='{{cpu}}' + ) + ) + ) + .addPanel( + graphpanel.new( + title='CPU IOWait', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='percent', + min=0, + max=100 + ) + .addTarget( + prometheus.target( + expr='sum(rate(collectd_cpu_total{instance="$host", type="wait"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100', + legendFormat='{{cpu}}' + ) + ) + ) + .addPanel( + graphpanel.new( + title='CPU SoftIRQ', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='percent', + min=0, + max=100 + ) + .addTarget( + prometheus.target( + expr='sum(rate(collectd_cpu_total{instance="$host", type="softirq"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100', + legendFormat='{{cpu}}' + ) + ) + ) + .addPanel( + graphpanel.new( + title='CPU Other', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='percent', + min=0, + max=100 + ) + .addTarget( + prometheus.target( + expr='sum(rate(collectd_cpu_total{instance="$host", type=~"(interrupt|nice|steal)"}[$rate])) by (cpu, type) / ignoring(type) group_left sum(rate(collectd_cpu_total{instance="$host" }[$rate])) by (cpu) * 100', + legendFormat='{{cpu}} - {{type}}' + ) + ) + ) + ) + .addRow( + row.new( + 
title='Advanced Details', + height=graphHeight, + collapse=true + ) + .addPanel( + graphpanel.new( + title='Context Switches / Second', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='short' + ) + .addTarget( + prometheus.target( + expr='irate(collectd_contextswitch_total{instance="$host"}[$rate])', + legendFormat='Context Switches' + ) + ) + ) + .addPanel( + graphpanel.new( + title='IRQ Activity', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='short', + legend_hideEmpty=true, + legend_hideZero=true + ) + .addTarget( + prometheus.target( + expr='rate(collectd_irq_total{instance="$host", irq != "LOC"}[$rate])', + legendFormat='{{irq}}' + ) + ) + ) + .addPanel( + graphpanel.new( + title='NUMA Activity', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='short', + legend_hideEmpty=true, + legend_hideZero=true + ) + .addTarget( + prometheus.target( + expr='irate(collectd_numa_vmpage_action_total{instance="$host"}[$rate])', + legendFormat='{{numa}} - {{type}}' + ) + ) + ) + .addPanel( + graphpanel.new( + title='TCP Connection Activity', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='short', + legend_hideEmpty=true, + legend_hideZero=true + ) + .addTarget( + prometheus.target( + expr='collectd_tcpconns_tcp_connections{instance="$host"}', + legendFormat='{{tcpconns}} - {{type}}' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Network Protocol Activity', /* per-protocol counter rates; titled separately from the tcpconns panel so the two are distinguishable */ + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='short', + legend_hideEmpty=true, + legend_hideZero=true + ) + .addTarget( + prometheus.target( + expr='rate(collectd_protocols_protocol_counter_total{instance="$host"}[$rate])', + legendFormat='{{protocols}} - {{type}}' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Processor Speeds', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='hertz', + points=true, + lines=false, + pointradius=5, + legend_hideEmpty=true, + legend_hideZero=true + ) + .addTarget( + prometheus.target( + 
expr='collectd_cpufreq{instance="$host"}', + legendFormat='{{cpufreq}}' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Page Cache Activity', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='short', + legend_hideEmpty=true, + legend_hideZero=true + ) + .addTarget( + prometheus.target( + expr='rate(collectd_vmem_vmpage_faults_majflt_total{instance="$host"}[$rate])', + legendFormat='Major fault' + ) + ) + .addTarget( + prometheus.target( + expr='rate(collectd_vmem_vmpage_faults_minflt_total{instance="$host"}[$rate])', + legendFormat='Minor fault' + ) + ) + .addTarget( + prometheus.target( + expr='rate(collectd_vmem_vmpage_action_total{instance="$host"}[$rate])', + legendFormat='Action - {{vmem}}' + ) + ) + .addTarget( + prometheus.target( + expr='rate(collectd_vmem_vmpage_io_in_total{instance="$host"}[$rate])', + legendFormat='IO read page' + ) + ) + .addTarget( + prometheus.target( + expr='rate(collectd_vmem_vmpage_io_out_total{instance="$host"}[$rate])', + legendFormat='IO write page' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Page Cache Layout', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='short', + percentage=true, + stack=true, + min=0, + max=100, + legend_hideEmpty=true, + legend_hideZero=true + ) + .addTarget( + prometheus.target( + expr='collectd_vmem_vmpage_number{instance="$host"}', + legendFormat='{{vmem}}' + ) + ) + ) + .addPanel( + graphpanel.new( + title='Process Activity', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='short', + legend_hideEmpty=true, + legend_hideZero=true + ) + .addTarget( + prometheus.target( + expr='collectd_processes_ps_count_threads{instance="$host"}', + legendFormat='Thread Count' + ) + ) + .addTarget( + prometheus.target( + expr='collectd_processes_ps_count_processes{instance="$host"}', + legendFormat='Process Count' + ) + ) + .addTarget( + prometheus.target( + expr='collectd_processes_ps_state{instance="$host"}', + legendFormat='Process State - {{processes}}' + ) + ) + ) + ) + 
.addRow( + row.new( + title='Basic Cassandra Overview', + height=singlestatHeight + ) + .addPanel( + singlestat.new( + 'SSTable Count', + description='Number of sstables on the node', + format='short', + datasource='$PROMETHEUS_DS', + sparklineShow=true, + gaugeShow=true, + span=singlestatSpan, + thresholds='100000,500000' + ) + .addTarget( + prometheus.target( + 'sum(' + $._config.metricPrefix + "_table_live_ss_table_count{instance='$host'})" + ) + ) + ) + .addPanel( + singlestat.new( + 'Pending Compactions', + description='Number of pending compactions on the node', + format='short', + datasource='$PROMETHEUS_DS', + sparklineShow=true, + gaugeShow=true, + span=singlestatSpan, + thresholds='10,50' + ) + .addTarget( + prometheus.target( + 'sum(' + $._config.metricPrefix + "_compaction_pending_tasks{instance='$host'})" + ) + ) + ) + .addPanel( + singlestat.new( + 'Connected Clients', + description='Number of client connections to the node', + format='short', /* the target is a sum of connected clients, i.e. a count, so no percent unit */ + datasource='$PROMETHEUS_DS', + sparklineShow=true, + gaugeShow=true, + span=singlestatSpan, + thresholds='100,1000' + ) + .addTarget( + prometheus.target( + 'sum(' + $._config.metricPrefix + "_client_connected_native_clients{instance='$host'})" + ) + ) + ) + .addPanel( + graphpanel.new( + title='GC Activity', + datasource='$PROMETHEUS_DS', + span=graphSpan, + format='bytes', + legend_hideEmpty=true, + legend_hideZero=true + ) + .addTarget( + prometheus.target( + expr='sum(' + $._config.metricPrefix + '_jvm_memory_max{instance="$host", memory_type="total"})', + legendFormat='JVM Heap Total' + ) + ) + .addSeriesOverride({ + alias: '/.*Total.*/', + fill: 0, + }) + .addTarget( + prometheus.target( + expr='sum(' + $._config.metricPrefix + '_jvm_memory_used{instance="$host", memory_type="non_heap"})', + legendFormat='JVM Non-Heap Used' + ) + ) + .addTarget( + prometheus.target( + expr='sum(' + $._config.metricPrefix + '_jvm_memory_used{instance="$host", memory_type="heap"})', + legendFormat='JVM Heap Used' + ) + ) 
+ ) + ), + }, +} diff --git a/mixin/jsonnetfile.json b/mixin/jsonnetfile.json new file mode 100644 index 0000000..8e6cf3c --- /dev/null +++ b/mixin/jsonnetfile.json @@ -0,0 +1,24 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/thelastpickle/grafonnet-polystat-panel.git", + "subdir": "" + } + }, + "version": "master" + } + ], + "legacyImports": false +} diff --git a/mixin/jsonnetfile.lock.json b/mixin/jsonnetfile.lock.json new file mode 100644 index 0000000..79d7d41 --- /dev/null +++ b/mixin/jsonnetfile.lock.json @@ -0,0 +1,26 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "6db00c292d3a1c71661fc875f90e0ec7caa538c2", + "sum": "gF8foHByYcB25jcUOBqP6jxk0OPifQMjPvKY0HaCk6w=" + }, + { + "source": { + "git": { + "remote": "https://github.com/thelastpickle/grafonnet-polystat-panel.git", + "subdir": "" + } + }, + "version": "275a48de57afdac0d72219d82863d8ab8bd0e682", + "sum": "pXSXxNxi4WvBKYZ83GVYotQyL+toHaizqvjJ+8YYMoU=" + } + ], + "legacyImports": false +} diff --git a/mixin/make-dashboards.sh b/mixin/make-dashboards.sh new file mode 100755 index 0000000..1fef13e --- /dev/null +++ b/mixin/make-dashboards.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +OUTPUT_DIR="${ROOT_DIR}/dashboards/grafana/generated-dashboards" + +rm -rf "${OUTPUT_DIR}" # start from an empty output directory so stale dashboards are not kept +mkdir -p "${OUTPUT_DIR}" + +docker run --rm -v "${ROOT_DIR}:${ROOT_DIR}" datastax/grafonnet-lib:v0.1.3 \ + jsonnet --multi "${OUTPUT_DIR}" "${ROOT_DIR}/mixin/dashboards.jsonnet" diff --git a/mixin/mixin.libsonnet b/mixin/mixin.libsonnet new file mode 100644 index 0000000..b291077 --- /dev/null +++ b/mixin/mixin.libsonnet @@ -0,0 +1,2 @@ +(import './dashboards/dashboards.libsonnet') + +(import './config.libsonnet')