diff --git a/.gitignore b/.gitignore
index 1af9d35..d5304a7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ venv
.classpath
target
.vscode
+vendor
diff --git a/dashboards/demo/README.md b/dashboards/demo/README.md
index e881cdb..94b2607 100644
--- a/dashboards/demo/README.md
+++ b/dashboards/demo/README.md
@@ -20,10 +20,10 @@ To use:
3. Open your web browser to [http://localhost:3000](http://localhost:3000)
- If you want to change the jsonnet dashboards, make your changes then run:
+ If you want to change the jsonnet dashboards, make your changes under `mixin/dashboards/` then run:
````
- ../grafana/make-dashboards.sh
+ mixin/make-dashboards.sh
````
Refresh the browser to see changes.
diff --git a/dashboards/grafana/dashboards-jsonnet/cassandra-condensed.jsonnet b/dashboards/grafana/dashboards-jsonnet/cassandra-condensed.jsonnet
deleted file mode 100644
index 9397551..0000000
--- a/dashboards/grafana/dashboards-jsonnet/cassandra-condensed.jsonnet
+++ /dev/null
@@ -1,522 +0,0 @@
-local grafana = import 'grafonnet/grafana.libsonnet';
-
-local dashboard = grafana.dashboard;
-local row = grafana.row;
-local singlestat = grafana.singlestat;
-local graphpanel = grafana.graphPanel;
-local text = grafana.text;
-local prometheus = grafana.prometheus;
-local template = grafana.template;
-
-local prefix = std.extVar('prefix');
-
-local graphHeight = 300;
-local singlestatHeight = 100;
-local singlestatSpan = 1;
-local graphSpan = 4;
-
-dashboard.new(
- 'Cassandra Cluster Condensed',
- description='Single pane of glass for most important Cassandra metrics',
- schemaVersion=14,
- refresh='30s',
- time_from='now-30m',
- editable=true,
- tags=['os'],
- style='dark'
-)
-.addTemplate(
- template.datasource(
- 'PROMETHEUS_DS',
- 'prometheus',
- 'Prometheus',
- hide='all',
- )
-)
-
-.addTemplate(
- template.custom(
- 'by',
- 'cluster,dc,rack,instance',
- 'cluster',
- valuelabels={
- "cluster": "Cluster",
- "dc" : "Datacenter",
- "rack" : "Rack",
- "instance" : "Host"},
- label='Group By',
- )
-)
-.addTemplate(
- template.interval(
- 'rate',
- '1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d',
- '5m',
- label='Rate',
- )
-)
-.addTemplate(
- template.new(
- 'cluster',
- '$PROMETHEUS_DS',
- 'label_values(collectd_collectd_queue_length{}, cluster)',
- label='Cluster',
- refresh='time',
- includeAll=true,
- allValues=".*",
- )
-)
-.addTemplate(
- template.new(
- 'dc',
- '$PROMETHEUS_DS',
- 'label_values(collectd_collectd_queue_length{cluster=~"$cluster"}, dc)',
- label='DataCenter',
- refresh='time',
- includeAll=true,
- allValues=".*",
- )
-)
-.addTemplate(
- template.new(
- 'rack',
- '$PROMETHEUS_DS',
- 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc"}, rack)',
- label='Rack',
- refresh='time',
- includeAll=true,
- allValues=".*",
- )
-)
-.addTemplate(
- template.new(
- 'keyspace',
- '$PROMETHEUS_DS',
- 'label_values(' + prefix + '_table_read_latency_total{cluster=~"$cluster", dc=~"$dc"}, keyspace)',
- label='Keyspace',
- refresh='time',
- includeAll=true,
- allValues=".*",
- )
-)
-.addTemplate(
- template.new(
- 'table',
- '$PROMETHEUS_DS',
- 'label_values(' + prefix + '_table_read_latency_total{cluster=~"$cluster", dc=~"$dc", keyspace=~"$keyspace"}, table)',
- label='Table',
- refresh='time',
- includeAll=true,
- allValues=".*",
- )
-)
-.addTemplate(
- template.new(
- 'host',
- '$PROMETHEUS_DS',
- 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc", rack=~"$rack"}, instance)',
- label='Host',
- refresh='time',
- includeAll=true,
- allValues=".*",
- )
-)
-.addTemplate(
- template.custom(
- 'latency',
- '0.999,0.99,0.98,0.95,0.90,0.75,0.50',
- '0.95',
- valuelabels={
- "0.999" : "P999",
- "0.99" : "P99",
- "0.98" : "P98",
- "0.95" : "P95",
- "0.90" : "P90",
- "0.75" : "P75",
- "0.50" : "P50"
- },
- label='Percentile'
- )
-)
-.addRow(
- row.new(
- title='Cluster Overview',
- height=singlestatHeight,
- )
- .addPanel(
- singlestat.new(
- 'Nodes Up',
- description="Nodes that are currently running in this time window",
- format='none',
- decimals=0,
- datasource='$PROMETHEUS_DS',
- colorValue=true,
- colors=["#d44a3a", "#299c46", "#299c46"],
- thresholds='0.1,1000',
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'count(' + prefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"} >= 0) or vector(0)'
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'Nodes Down',
- description="Nodes that are currently not running in this time window",
- format='none',
- decimals=0,
- colorValue=true,
- colors=[ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a"],
- datasource='$PROMETHEUS_DS',
- thresholds='1,2',
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'count(absent(sum(rate(' + prefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[5m])))) OR vector(0)'
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'Compactions / $rate',
- description="Rate of compactions during this window",
- format='none',
- decimals=0,
- datasource='$PROMETHEUS_DS',
- sparklineShow=true,
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'sum(rate(' + prefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))'
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'CQL Requests / $rate',
- description="Rate of CQL requests during this window",
- format='none',
- datasource='$PROMETHEUS_DS',
- sparklineShow=true,
- decimals=0,
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'sum(irate(dse_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))'
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'Dropped Messages / $rate',
- description="Rate of Dropped requests during this window",
- format='none',
- datasource='$PROMETHEUS_DS',
- sparklineShow=true,
- thresholds="30,300",
- colorValue=true,
- decimals=0,
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'sum(irate(' + prefix + '_table_dropped_mutations_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))'
- )
- )
- )
- .addPanel(
- text.new(
- transparent=true,
- mode="html",
- content='',
- span=2
- )
- )
- .addPanel(
- singlestat.new(
- 'CQL Clients',
- description="Number of connected clients during this time window",
- format='none',
- datasource='$PROMETHEUS_DS',
- sparklineShow=true,
- thresholds="100,1000",
- colorValue=true,
- decimals=0,
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'sum(' + prefix + '_client_connected_native_clients{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"})'
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'Timeouts / $rate',
- description="Client timeouts over the last $rate",
- format='none',
- datasource='$PROMETHEUS_DS',
- thresholds='100,300',
- colorValue=true,
- sparklineShow=true,
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'sum(irate(' + prefix + '_client_request_timeouts_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))',
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'Hints / $rate',
- description="Hints stored over the last $rate",
- format='none',
- datasource='$PROMETHEUS_DS',
- thresholds='1000,30000',
- colorValue=true,
- sparklineShow=true,
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'sum(irate(' + prefix + '_storage_hints_on_disk_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))'
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'Data Size',
- description="Data",
- format='bytes',
- datasource='$PROMETHEUS_DS',
- sparklineShow=true,
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'sum(' + prefix + '_table_live_disk_space_used_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"})'
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'GC Time / $rate',
- description="Data",
- format='ms',
- decimals=1,
- datasource='$PROMETHEUS_DS',
- sparklineShow=true,
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'sum(rate(' + prefix + '_jvm_gc_time{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))'
- )
- )
- )
-)
-.addRow(
- row.new(
- title='Condensed Metrics',
- height=graphHeight
- )
- .addPanel(
- graphpanel.new(
- title="Requests Served / $by / $rate",
- description="(no keyspace/table filters apply)",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- labelY2="Clients Connected",
- legend_hideZero=true,
- legend_hideEmpty=true
- )
- .addSeriesOverride({
- "alias": "/.*Connected/",
- "yaxis": 2
- })
- .addTarget(
- prometheus.target(
- expr='sum(irate(' + prefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by, request_type)',
- legendFormat="{{$by}}:{{request_type}}"
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum(' + prefix + '_client_connected_native_clients{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}) by ($by)',
- legendFormat= "{{$by}}:Clients Connected"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Coordinator $latency Latency / $by",
- description="(no keyspace/table filters apply)",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="µs",
- min=0,
- legend_hideZero=true,
- legend_hideEmpty=true
- )
- .addTarget(
- prometheus.target(
- expr='histogram_quantile($latency, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by (le, request_type, $by))',
- legendFormat="$by:{{$by}} {{$latency}} {{request_type}}"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Memtable Space $keyspace.$table / $by",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- formatY1="bytes",
- formatY2="short",
- labelY2="Flush",
- min=0,
- legend_hideZero=true,
- legend_hideEmpty=true
- )
- .addSeriesOverride({
- "alias": "/.*Flushes/",
- "bars": true,
- "lines": false,
- "zindex": -3,
- "yaxis": 2,
- })
- .addTarget(
- prometheus.target(
- expr='sum(' + prefix + '_table_memtable_off_heap_size{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}) by ($by)',
- legendFormat="{{$by}} : Off Heap"
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum(' + prefix + '_table_memtable_on_heap_size{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}) by ($by)',
- legendFormat="{{$by}} : On Heap"
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum(idelta(' + prefix + '_table_memtable_switch_count_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)',
- legendFormat="{{$by}} : Flushes"
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum(idelta(' + prefix + '_table_pending_flushes_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)',
- legendFormat="{{$by}} : Pending Flushes"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Compactions $keyspace.$table / $by",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="bps",
- formatY2="short",
- labelY2="Count",
- legend_hideZero=true,
- legend_hideEmpty=true,
- min=0
- )
- .addSeriesOverride({
- "alias": "/.*Compactions/",
- "bars": true,
- "lines": false,
- "zindex": -3,
- "yaxis": 2,
- })
- .addTarget(
- prometheus.target(
- expr='sum(irate(' + prefix + '_table_compaction_bytes_written_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)',
- legendFormat="{{by}} : Bytes Compacted"
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum(irate(' + prefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)',
- legendFormat="{{by}} : Pending Compactions"
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum(irate(' + prefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by)',
- legendFormat="{{by}} : Completed Compactions"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Table $latency Latency / $by",
- description="",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="µs",
- min=0,
- legend_hideZero=true,
- legend_hideEmpty=true
- )
- .addTarget(
- prometheus.target(
- expr='histogram_quantile($latency, sum(irate(' + prefix + '_table_range_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))',
- legendFormat="$by:{{$by}} Local Range Scan"
- )
- )
- .addTarget(
- prometheus.target(
- expr='histogram_quantile($latency, sum(irate(' + prefix + '_table_read_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))',
- legendFormat="$by:{{$by}} Local Read"
- )
- )
- .addTarget(
- prometheus.target(
- expr='histogram_quantile($latency, sum(irate(' + prefix + '_table_write_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))',
- legendFormat="$by:{{$by}} Local Write"
- )
- )
- .addTarget(
- prometheus.target(
- expr='histogram_quantile($latency, sum(irate(' + prefix + '_table_coordinator_read_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table"}[$rate])) by (le, $by))',
- legendFormat="$by:{{$by}} Coordinator Read"
- )
- )
- .addTarget(
- prometheus.target(
- expr='histogram_quantile($latency, sum(irate(' + prefix + '_table_coordinator_scan_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))',
- legendFormat="$by:{{$by}} Coordinator Range Scan"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Streaming / $by / $rate",
- description="",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="Bps",
- min=0,
- legend_hideZero=true,
- legend_hideEmpty=true
- )
- .addTarget(
- prometheus.target(
- expr='sum(irate(' + prefix + '_streaming_total_incoming_bytes_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by)',
- legendFormat="{{$by}}: Incoming Stream"
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum(irate(' + prefix + '_streaming_total_outgoing_bytes_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by)',
- legendFormat="{{$by}}: Outgoing Stream"
- )
- )
- )
-)
diff --git a/dashboards/grafana/dashboards-jsonnet/overview.jsonnet b/dashboards/grafana/dashboards-jsonnet/overview.jsonnet
deleted file mode 100644
index 6b00a26..0000000
--- a/dashboards/grafana/dashboards-jsonnet/overview.jsonnet
+++ /dev/null
@@ -1,1110 +0,0 @@
-local grafana = (import 'grafonnet/grafana.libsonnet')
- + (import 'grafonnet-polystat-panel/plugin.libsonnet');
-local dashboard = grafana.dashboard;
-local prometheus = grafana.prometheus;
-local template = grafana.template;
-local row = grafana.row;
-
-local graphPanel = grafana.graphPanel;
-local tablePanel = grafana.tablePanel;
-local singleStatPanel = grafana.singlestat;
-local textPanel = grafana.text;
-local polystatPanel = grafana.polystatPanel;
-
-local prefix = std.extVar('prefix');
-
-local fillLatencySeriesOverrides = {
- 'alias': 'p999',
- 'fillBelowTo': 'p98',
- 'lines': false
-};
-local removeMinLatencySeriesOverrides = {
- 'alias': 'p98',
- 'lines': false
-};
-
-local fillMinMaxSeriesOverrides = {
- 'alias': 'max',
- 'fillBelowTo': 'min',
- 'lines': false
-};
-local removeMinlineSeriesOverrides = {
- 'alias': 'min',
- 'lines': false
-};
-
-
-// used in the single stat panels where higher is better - cache hit rates for example
-local reversedColors =[
- '#d44a3a',
- 'rgba(237, 129, 40, 0.89)',
- '#299c46',
-];
-
-dashboard.new(
- 'Cassandra Overview',
- schemaVersion=14,
- refresh='30s',
- time_from='now-30m',
- editable=true,
- tags=['Cassandra', 'Overview'],
- style='dark'
-)
-.addTemplate(
- grafana.template.datasource(
- 'PROMETHEUS_DS',
- 'prometheus',
- 'Prometheus',
- hide='all',
- )
-)
-.addTemplate(
- template.new(
- 'cluster',
- '$PROMETHEUS_DS',
- 'label_values(collectd_collectd_queue_length{}, cluster)',
- label='Cluster',
- refresh='time',
- )
-)
-.addTemplate(
- template.new(
- 'dc',
- '$PROMETHEUS_DS',
- 'label_values(collectd_collectd_queue_length{cluster=~"$cluster"}, dc)',
- label='DataCenter',
- refresh='time',
- includeAll=true,
- allValues=".*",
- )
-)
-.addTemplate(
- template.new(
- 'rack',
- '$PROMETHEUS_DS',
- 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc"}, rack)',
- label='Rack',
- refresh='time',
- includeAll=true,
- allValues=".*",
- )
-)
-.addTemplate(
- template.new(
- 'node',
- '$PROMETHEUS_DS',
- 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc", rack=~"$rack"}, instance)',
- label='Node',
- refresh='time',
- includeAll=true,
- allValues=".*",
- )
-)
-.addRow(
- row.new(title='', height='50px')
- .addPanel(textPanel.new(transparent=true))
- .addPanel(
- textPanel.new(
- transparent=true,
- mode="html",
- content='',
- )
- )
- .addPanel(textPanel.new(transparent=true))
-)
-.addRow(
- row.new(title='Request Throughputs (Coordinator Perspective)')
- .addPanel(
- graphPanel.new(
- 'Request Throughputs',
- description='Total Requests Per Cluster, by Request Type',
- format='rps',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- min=0,
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='{{request_type}}',
- )
- )
- )
- .addPanel(
- graphPanel.new(
- 'Error throughputs',
- description='Total Timeouts, Failures, Unavailable Rates for each cluster',
- format='rps',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- min=0,
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_failures_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='{{request_type}} failures',
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_timeouts_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='{{request_type}} timeouts',
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_unavailables_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='{{request_type}} unavailable errors',
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_unfinished_commit_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='{{request_type}} unfinished commit errors',
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_condition_not_met_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='{{request_type}} condition not met errors',
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_contention_histogram_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='{{request_type}} contention histogram errors',
- )
- )
- )
- .addPanel(
- singleStatPanel.new(
- 'Read / Write Distribution',
- description='Part of reads in the total of standard requests (Reads+Writes). CAS, Views, ... operations are ignored.',
- format='percentunit',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- postfix=' Reads',
- postfixFontSize='30%',
- valueFontSize='30%',
- valueName="current",
- decimals=2,
- thresholds='0.25,0.5,0.75',
- timeFrom='',
- colors=[
- "#DEB6F2",
- "#CA95E5",
- "#8F3BB8"
- ],
- gaugeShow=true,
- gaugeMinValue=0,
- gaugeMaxValue=1,
- gaugeThresholdLabels=true,
- gaugeThresholdMarkers=false,
- sparklineFillColor='rgba(31, 118, 189, 0.18)',
- sparklineFull=false,
- sparklineLineColor='#FFB357',
- sparklineShow=false
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s])) / ignoring (request_type) (sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s])) + ignoring (request_type) sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[1m:30s])))',
- )
- )
- )
- .addPanel(
- graphPanel.new(
- 'Read Latency (98 - 999th percentile)',
- description='Read latency for coordinated reads',
- format='µs',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- min=0,
- )
- .addTarget(
- prometheus.target(
- expr='histogram_quantile(0.98, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[5m])) by (le, cluster))',
- legendFormat='p98',
- )
- )
- .addTarget(
- prometheus.target(
- expr='histogram_quantile(0.99, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[5m])) by (le, cluster))',
- legendFormat='p99',
- )
- )
- .addTarget(
- prometheus.target(
- expr='histogram_quantile(0.999, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[5m])) by (le, cluster))',
- legendFormat='p999',
- )
- )
- .addSeriesOverride(fillLatencySeriesOverrides)
- .addSeriesOverride(removeMinLatencySeriesOverrides)
- )
- .addPanel(
- graphPanel.new(
- 'Write Latency (98th - p999 Percentile)',
- description='Write latency for coordinated writes',
- format='µs',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- min=0,
- )
- .addTarget(
- prometheus.target(
- expr='histogram_quantile(0.98, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))',
- legendFormat='p98',
- )
- )
- .addTarget(
- prometheus.target(
- expr='histogram_quantile(0.99, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))',
- legendFormat='p99',
- )
- )
- .addTarget(
- prometheus.target(
- expr='histogram_quantile(0.999, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))',
- legendFormat='p999',
- )
- )
- .addSeriesOverride(fillLatencySeriesOverrides)
- .addSeriesOverride(removeMinLatencySeriesOverrides)
- )
- .addPanel(
- graphPanel.new(
- 'Other Latencies',
- description='Other p99 latencies for coordinated requests',
- format='µs',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- min=0,
- )
- .addTarget(
- prometheus.target(
- # In scope!~"Write|Read|.*-.*", we want to exclude charts above and all the per-consistency_level info like "Read-LOCAL_ONE"
- expr='histogram_quantile(0.99, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type!~"write|read|.*-.*"}[1m:30s])) by (le, request_type, cluster))',
- legendFormat='p99 {{request_type}}'
- )
- )
- )
-)
-.addRow(
- row.new(title='Nodes Status',)
- .addPanel(
- polystatPanel.new(
- 'Nodes Status',
- description='Nodes Status uses Internal/Gossip activity. Be mindful that if Native or Thrift protocol are disabled, the nodes won\'t be reachable, and still marked up',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- span=12,
- global_unit_format='none',
- global_operator_name='current',
- global_thresholds=[
- {
- "value": 0,
- "state": 2,
- "color": "#d44a3a"
- },
- {
- "value": 1,
- "state": 0,
- "color": "#299c46"
- }
- ],
- range_maps=[
- {
- "from": "0",
- "to": "0.9999",
- "text": "DOWN"
- },
- {
- "from": "1",
- "to": "1",
- "text": "UP"
- }
- ],
- mapping_type=2,
- value_enabled=true,
- )
- .addTarget(
- prometheus.target(
- 'max by (cluster, dc, rack, instance) (changes(' + prefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="gossip_stage"}[2m:30s])) > bool 0',
- legendFormat='{{instance}}',
- instant=true,
- )
- )
- )
- .addPanel(
- singleStatPanel.new(
- 'Nodes Count',
- description='Nodes up and down in the cluster',
- format='short',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- decimals=0,
- prefix='Total:',
- postfix=' Nodes',
- postfixFontSize='80%',
- valueFontSize='80%',
- span=4
- )
- .addTarget(
- prometheus.target(
- expr='count by (cluster) (max by (cluster, dc, rack, instance) (collectd_collectd_queue_length{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}))',
- legendFormat='Total Number Of Nodes',
- )
- )
- )
- .addPanel(
- graphPanel.new(
- 'Nodes Status History',
- description='Nodes up and down in the cluster per protocol/activity',
- format='short',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- decimals=0,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=false,
- shared_tooltip=false,
- min=0,
- span=8
- )
- .addTarget(
- prometheus.target(
- expr='count by (cluster) (max by (cluster, dc, rack, instance) (collectd_collectd_queue_length{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}))',
- legendFormat='Total Number Of Nodes',
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster) (max by (cluster, datacenter, rack, instance) (changes(' + prefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="native"}[2m:30s])) > bool 0)',
- legendFormat='Nodes Coordinating Requests (Native protocol)',
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster) (max by (cluster, datacenter, rack, instance) (changes(' + prefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="gossip_stage"}[2m:30s])) > bool 0)',
- legendFormat='Nodes With Internal Activity (Gossip protocol)',
- )
- )
- )
-)
-.addRow(
- row.new(title='Data Status')
- .addPanel(
- tablePanel.new(
- 'Disk Space Usage',
- description='Disk space used ordered (fullest disks first)',
- datasource='$PROMETHEUS_DS',
- transform='timeseries_aggregations',
- transparent=true,
- styles=[
- {
- "alias": "Node --> Mounting Point",
- "colorMode": null,
- "colors": [
- "rgba(245, 54, 54, 0.9)",
- "rgba(237, 129, 40, 0.89)",
- "rgba(50, 172, 45, 0.97)"
- ],
- "dateFormat": "YYYY-MM-DD HH:mm:ss",
- "decimals": 2,
- "mappingType": 1,
- "pattern": "Metric",
- "preserveFormat": true,
- "sanitize": true,
- "thresholds": [],
- "type": "string",
- "unit": "short"
- },
- {
- "alias": "% Disk Space Used",
- "colorMode": "row",
- "colors": [
- "rgba(50, 172, 45, 0.97)",
- "rgba(237, 129, 40, 0.89)",
- "rgba(245, 54, 54, 0.9)"
- ],
- "dateFormat": "YYYY-MM-DD HH:mm:ss",
- "decimals": 2,
- "link": false,
- "mappingType": 1,
- "pattern": "Current",
- "thresholds": [
- "0.5",
- "0.75",
- ],
- "type": "number",
- "unit": "percentunit"
- }
- ],
- columns=[
- {
- "text": "Current",
- "value": "current"
- }
- ],
- sort={
- "col": 1,
- "desc": true
- }
- )
- .addTarget(
- prometheus.target(
- expr='min by (instance, df) (1-(collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="free"}
- / ignoring (type) (collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="used"}
- + ignoring (type) collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="reserved"}
- + ignoring (type) collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="free"}))
- )',
- legendFormat='{{cluster}}-{{instance}} --> {{df}}',
- instant=true
- )
- )
- )
- .addPanel(
- graphPanel.new(
- 'Cassandra cluster Data Size',
- description='Total sizes of the data on distinct nodes',
- format='bytes',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster) (' + prefix + '_table_live_disk_space_used_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='Live space - {{cluster}}',
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster) (' + prefix + '_table_total_disk_space_used_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='Total space - {{cluster}}',
- )
- )
- )
- .addPanel(
- graphPanel.new(
- 'SSTable Count',
- description='SSTable Count Max and Average per table',
- format='short',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- min=0,
- )
- .addTarget(
- prometheus.target(
- expr='max by (cluster, keyspace, table) (' + prefix + '_table_live_ss_table_count{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='Table - {{keyspace}}.{{table}}',
- )
- )
- .addTarget(
- prometheus.target(
- expr='max by (cluster) (' + prefix + '_table_live_ss_table_count{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='Max in cluster - {{cluster}}',
- )
- )
- )
- .addPanel(
- graphPanel.new(
- 'Pending Compactions',
- description='Maximum pending compactions on any node in the cluster',
- format='short',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- min=0,
- bars=false,
- lines=true,
- stack=false,
- decimals=0,
- )
- .addTarget(
- prometheus.target(
- expr='max by (cluster) (' + prefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='max',
- )
- )
- .addTarget(
- prometheus.target(
- expr='min by (cluster) (' + prefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='min',
- )
- )
- .addTarget(
- prometheus.target(
- expr='avg by (cluster) (' + prefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='avg',
- )
- )
- .addSeriesOverride(fillMinMaxSeriesOverrides)
- .addSeriesOverride(removeMinlineSeriesOverrides)
- )
- .addPanel(
- graphPanel.new(
- 'Pending Compactions per Table',
- description='Maximum pending compactions per table',
- format='short',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- min=0,
- bars=false,
- lines=true,
- stack=true,
- decimals=0,
- )
- .addTarget(
- prometheus.target(
- expr='max by (cluster, keyspace, table) (' + prefix + '_table_pending_compactions{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='max for {{keyspace}}.{{table}}',
- )
- )
- )
-)
-.addRow(
- row.new(title='Cassandra Internals')
- .addPanel(
- graphPanel.new(
- 'Pending Tasks',
- description='Cluster wide pending threads, by thread pool name',
- format='short',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- min=0,
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster, pool_name) (' + prefix + '_thread_pools_pending_tasks{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='{{cluster}} - pending {{pool_name}}',
- )
- )
- )
- .addPanel(
- graphPanel.new(
- 'Blocked Tasks',
- description='Cluster wide blocked threads, by thread pool name',
- format='short',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- min=0,
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster, pool_name) (rate(' + prefix + '_thread_pools_total_blocked_tasks_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='{{cluster}} - blocked {{pool_name}}',
- )
- )
- )
- .addPanel(
- graphPanel.new(
- 'Dropped Messages',
- description='Dropped messages rate summed by message type and cluster',
- format='short',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- min=0,
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster, message_type) (rate(' + prefix + '_dropped_message_dropped_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='{{cluster}} - dropped {{message_type}}',
- )
- )
- )
- .addPanel(
- graphPanel.new(
- 'Active Tasks',
- description='active threads summed per cluster',
- format='short',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- min=0,
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster, pool_name) (' + prefix + '_thread_pools_active_tasks{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='{{cluster}} - active {{pool_name}}',
- )
- )
- )
- .addPanel(
- graphPanel.new(
- 'Hinted Handoff',
- description='Sum of hints being handed off per cluster.',
- format='short',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- min=0,
- )
- .addTarget(
- prometheus.target(
- expr='sum by (cluster) (' + prefix + '_storage_total_hints_in_progress_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='count',
- )
- )
- )
-)
-.addRow(
- row.new(title='Hardware / Operating System')
- .addPanel(
- graphPanel.new(
- 'CPU Utilization',
- description='Maximum CPU utilisation (max 100%)',
- format='percentunit',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- percentage=true,
- decimals=1,
- min=0,
- max=1,
- )
- .addTarget(
- prometheus.target(
- expr='max by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{type="idle", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))))',
- legendFormat='max',
- )
- )
- .addTarget(
- prometheus.target(
- expr='min by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{type="idle", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))))',
- legendFormat='min',
- )
- )
- .addTarget(
- prometheus.target(
- expr='avg by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{type="idle", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))))',
- legendFormat='avg',
- )
- )
- .addSeriesOverride(fillMinMaxSeriesOverrides)
- .addSeriesOverride(removeMinlineSeriesOverrides)
- )
- .addPanel(
- graphPanel.new(
- 'Unix Load (1m rate)',
- description='Max Unix load on a node for a cluster',
- format='short',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- )
- .addTarget(
- prometheus.target(
- expr='max by (cluster) (collectd_load_shortterm{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='max',
- )
- )
- .addTarget(
- prometheus.target(
- 'min by (cluster) (collectd_load_shortterm{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='min',
- )
- )
- .addTarget(
- prometheus.target(
- 'avg by (cluster) (collectd_load_shortterm{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='avg',
- )
- )
- .addSeriesOverride(fillMinMaxSeriesOverrides)
- .addSeriesOverride(removeMinlineSeriesOverrides)
- )
- .addPanel(
- graphPanel.new(
- 'Memory Utilisation',
- description='Maximum Memory allocated per usage (worst node) - excludes caches, buffers, etc',
- format='bytes',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- fill=1,
- linewidth=2,
- )
- .addTarget(
- prometheus.target(
- expr='min by (cluster) (sum by (cluster, dc, rack, instance) (collectd_memory{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}))',
- legendFormat='min memory available',
- )
- )
- .addTarget(
- prometheus.target(
- expr='max by (cluster, memory) (sum by (cluster, dc, rack, instance, memory) (collectd_memory{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}))',
- legendFormat='max memory {{memory}}',
- )
- )
- )
- .addPanel(
- graphPanel.new(
- 'Disk Read Thoughput',
- description='Disk read throughput',
- format='bps',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- )
- .addTarget(
- prometheus.target(
- expr='max by (cluster) (rate(collectd_processes_disk_octets_read_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='max',
- )
- )
- .addTarget(
- prometheus.target(
- 'min by (cluster) (rate(collectd_processes_disk_octets_read_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='min',
- )
- )
- .addTarget(
- prometheus.target(
- 'avg by (cluster) (rate(collectd_processes_disk_octets_read_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='avg',
- )
- )
- .addSeriesOverride(fillMinMaxSeriesOverrides)
- .addSeriesOverride(removeMinlineSeriesOverrides)
- )
- .addPanel(
- graphPanel.new(
- 'Disk Write Thoughput',
- description='Disk write throughput',
- format='bps',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- )
- .addTarget(
- prometheus.target(
- expr='max by (cluster) (rate(collectd_processes_disk_octets_write_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='max',
- )
- )
- .addTarget(
- prometheus.target(
- 'min by (cluster) (rate(collectd_processes_disk_octets_write_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='min',
- )
- )
- .addTarget(
- prometheus.target(
- 'avg by (cluster) (rate(collectd_processes_disk_octets_write_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='avg',
- )
- )
- .addSeriesOverride(fillMinMaxSeriesOverrides)
- .addSeriesOverride(removeMinlineSeriesOverrides)
- )
- .addPanel(
- graphPanel.new(
- 'Network I/O',
- description='Network In and Out per cluster',
- format='bytes',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=1,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- bars=false,
- )
- .addTarget(
- prometheus.target(
- 'sum by (cluster) (rate(collectd_interface_if_octets_rx_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='outgoing',
- )
- )
- .addTarget(
- prometheus.target(
- 'sum by (cluster) (rate(collectd_interface_if_octets_rx_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='incoming',
- )
- )
- .addSeriesOverride({
- "alias": "incoming",
- "transform": "negative-Y"
- })
-
- )
-)
-.addRow(
- row.new(title='JVM / Garbage Collection')
- .addPanel(
- graphPanel.new(
- 'Application Throughput (% time NOT doing GC)',
- description='Percentage of the time the node is *not* doing a GC, thus Cassandra is not stopped for GC',
- format='percentunit',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- decimals=2,
- min=0,
- max=1,
- )
- .addTarget(
- prometheus.target(
- 'max by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))',
- legendFormat='max',
- )
- )
- .addTarget(
- prometheus.target(
- 'min by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))',
- legendFormat='min',
- )
- )
- .addTarget(
- prometheus.target(
- 'avg by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))',
- legendFormat='avg',
- )
- )
- .addSeriesOverride(fillMinMaxSeriesOverrides)
- .addSeriesOverride(removeMinlineSeriesOverrides)
- )
- .addPanel(
- graphPanel.new(
- 'Garbage Collection Time',
- description='Garbage collection duration',
- format='ms',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- fill=0,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- )
- .addTarget(
- prometheus.target(
- 'max by (cluster) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='max',
- )
- )
- .addTarget(
- prometheus.target(
- 'min by (cluster) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='min',
- )
- )
- .addTarget(
- prometheus.target(
- 'avg by (cluster) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
- legendFormat='avg',
- )
- )
- .addSeriesOverride(fillMinMaxSeriesOverrides)
- .addSeriesOverride(removeMinlineSeriesOverrides)
- )
- .addPanel(
- graphPanel.new(
- 'JVM Heap Memory Utilisation',
- description='Maximum JVM Heap Memory size (worst node) and minimum available heap size',
- format='bytes',
- datasource='$PROMETHEUS_DS',
- transparent=true,
- legend_show=true,
- legend_values=true,
- legend_current=true,
- legend_alignAsTable=true,
- legend_sort='current',
- legend_sortDesc=true,
- shared_tooltip=false,
- fill=1,
- linewidth=2,
- )
- .addTarget(
- prometheus.target(
- 'max by (cluster)
- (' + prefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='max',
- )
- )
- .addTarget(
- prometheus.target(
- 'min by (cluster)
- (' + prefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='min',
- )
- )
- .addTarget(
- prometheus.target(
- 'avg by (cluster)
- (' + prefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='avg',
- )
- )
- .addTarget(
- prometheus.target(
- 'min by ( cluster)
- (' + prefix + '_jvm_memory_max{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
- legendFormat='Heap memory available',
- )
- )
- .addSeriesOverride(fillMinMaxSeriesOverrides)
- .addSeriesOverride(removeMinlineSeriesOverrides)
- )
-)
diff --git a/dashboards/grafana/dashboards-jsonnet/system-metrics.jsonnet b/dashboards/grafana/dashboards-jsonnet/system-metrics.jsonnet
deleted file mode 100644
index e9482ae..0000000
--- a/dashboards/grafana/dashboards-jsonnet/system-metrics.jsonnet
+++ /dev/null
@@ -1,902 +0,0 @@
-local grafana = import 'grafonnet/grafana.libsonnet';
-
-local dashboard = grafana.dashboard;
-local row = grafana.row;
-local singlestat = grafana.singlestat;
-local graphpanel = grafana.graphPanel;
-local text = grafana.text;
-local prometheus = grafana.prometheus;
-local template = grafana.template;
-
-local prefix = std.extVar('prefix');
-
-local textstatHeight = 100;
-local graphHeight = 250;
-local singlestatHeight = 125;
-local singlestatSpan = 2;
-local graphSpan = 6;
-
-dashboard.new(
- 'System & Node Metrics',
- description='Operating System Metrics and Apache Cassandra Node Information',
- schemaVersion=14,
- time_from='now-30m',
- refresh='1m',
- tags=['os'],
- style='dark'
-)
-.addTemplate(
- template.datasource(
- 'PROMETHEUS_DS',
- 'prometheus',
- 'Prometheus',
- hide='all',
- )
-)
-.addTemplate(
- template.interval(
- 'rate',
- '1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d',
- '5m',
- label='Rate',
- )
-)
-.addTemplate(
- template.new(
- 'host',
- '$PROMETHEUS_DS',
- 'label_values(collectd_collectd_queue_length{}, instance)',
- label='Host',
- refresh='time',
- )
-)
-.addPanel(
- text.new(
- transparent=true,
- mode="html",
- content=''
- ),
- {
- "h": 3,
- "w": 5,
- "x": 9,
- "y": -1
- },
-)
-.addRow(
- row.new(
- title='Basic CPU / Mem / Disk Gauge',
- height=singlestatHeight,
- )
- .addPanel(
- singlestat.new(
- 'CPU Busy',
- description="Busy state of all CPU cores together",
- format='percent',
- datasource='$PROMETHEUS_DS',
- thresholds='85,95',
- sparklineShow=true,
- gaugeShow=true,
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- "(1 - ((sum(irate(collectd_cpu_total{instance='$host', type='idle'}[$rate])) by (instance) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) by (instance)))) * 100",
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'Memory Used',
- description="Percentage of memory used (ignoring page cache)",
- format='percent',
- datasource='$PROMETHEUS_DS',
- thresholds='85,95',
- sparklineShow=true,
- gaugeShow=true,
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- '100 * ((sum(collectd_memory{instance="$host", memory="free"}) + sum(collectd_memory{instance="$host", memory="cached"}) + sum(collectd_memory{instance="$host", memory="buffered"})) / sum(collectd_memory{instance="$host"}))',
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'Swap Used',
- description="Percentage of swap in use",
- format='percent',
- datasource='$PROMETHEUS_DS',
- thresholds='0,1',
- sparklineShow=true,
- gaugeShow=true,
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- "(sum(collectd_swap{instance='$host',swap='used'}) / sum(collectd_swap{instance='$host'})) * 100"
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'Disk Used',
- description="Percentage of root disk in use",
- format='percent',
- datasource='$PROMETHEUS_DS',
- thresholds='50,85',
- sparklineShow=true,
- gaugeShow=true,
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- '(sum(collectd_df_df_complex{instance="$host", df="root", type="used"}) / sum(collectd_df_df_complex{instance="$host", df="root"})) * 100'
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'CPU System Load (1m avg)',
- description="Busy state of all CPU cores together (1 min average)",
- format='percent',
- datasource='$PROMETHEUS_DS',
- thresholds='85,95',
- sparklineShow=true,
- gaugeShow=true,
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'avg(collectd_load_shortterm{instance="$host"}) / count(count(collectd_cpu_total{instance="$host"}) by (cpu)) * 100',
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'CPU System Load (5m avg)',
- description="Busy state of all CPU cores together (5 min average)",
- format='percent',
- datasource='$PROMETHEUS_DS',
- thresholds='85,95',
- sparklineShow=true,
- gaugeShow=true,
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'avg(collectd_load_midterm{instance="$host"}) / count(count(collectd_cpu_total{instance="$host"}) by (cpu)) * 100',
- )
- )
- )
-)
-.addRow(
- row.new(
- title='Basic CPU / Mem / Disk Info',
- height=textstatHeight
- )
- .addPanel(
- singlestat.new(
- 'CPU Cores',
- description="Total number of CPU cores",
- format="short",
- datasource='$PROMETHEUS_DS',
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'count(count(collectd_cpu_total{instance="$host"}) by (cpu))'
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'Total RAM',
- description="Total amount of system memory",
- format="bytes",
- datasource='$PROMETHEUS_DS',
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'sum(collectd_memory{instance="$host"})'
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'Total Swap',
- description="Total amount of swap space",
- format="bytes",
- datasource='$PROMETHEUS_DS',
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'sum(collectd_swap{instance="$host"})'
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'Total RootFS',
- description="Total amount of disk space",
- format='bytes',
- datasource='$PROMETHEUS_DS',
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'sum(collectd_df_df_complex{df="root",instance="$host"})'
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'System Load (1m avg)',
- description="System Load (1m avg)",
- format="short",
- datasource='$PROMETHEUS_DS',
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'collectd_load_shortterm{instance="$host"}'
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'System Uptime',
- description="Uptime of the host",
- format="s",
- decimals=1,
- datasource='$PROMETHEUS_DS',
- span=singlestatSpan
- )
- .addTarget(
- prometheus.target(
- 'collectd_uptime{instance="$host"}'
- )
- )
- )
-)
-.addRow(
- row.new(
- title='Basic CPU / Mem Graph',
- height=graphHeight
- )
- .addPanel(
- graphpanel.new(
- title="CPU Basic",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- percentage=true,
- stack=true,
- min=0,
- max=100
- )
- .addTarget(
- prometheus.target(
- expr="sum(irate(collectd_cpu_total{instance='$host',type='system'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100",
- legendFormat="Busy System"
- )
- )
- .addTarget(
- prometheus.target(
- expr="sum(irate(collectd_cpu_total{instance='$host',type='user'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100",
- legendFormat="Busy User"
- )
- )
- .addTarget(
- prometheus.target(
- expr="sum(irate(collectd_cpu_total{instance='$host',type='wait'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100",
- legendFormat="Busy IOWait"
- )
- )
- .addTarget(
- prometheus.target(
- expr="sum(irate(collectd_cpu_total{instance='$host',type='softirq'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100",
- legendFormat="Busy IRQ"
- )
- )
- .addTarget(
- prometheus.target(
- expr="sum(irate(collectd_cpu_total{instance='$host',type!='idle',type!='system',type!='user',type!='wait',type!='softirq'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100",
- legendFormat="Busy Other"
- )
- )
- .addTarget(
- prometheus.target(
- expr="sum(irate(collectd_cpu_total{instance='$host',type='idle'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100",
- legendFormat="Idle"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Basic memory usage",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="bytes",
- min=0
- )
- .addTarget(
- prometheus.target(
- expr='sum(collectd_memory{instance="$host"})',
- legendFormat="RAM Total"
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum(collectd_memory{instance="$host"}) - sum(collectd_memory{instance="$host", memory="free"}) - sum(collectd_memory{instance="$host", memory="cached"}) - sum(collectd_memory{instance="$host", memory="buffered"})',
- legendFormat="RAM Used"
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum(collectd_memory{instance="$host", memory="cached"}) + sum(collectd_memory{instance="$host", memory="buffered"})',
- legendFormat="RAM Cache + Buffer"
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum(collectd_memory{instance="$host", memory="free"})',
- legendFormat="RAM Free"
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum(collectd_swap{instance="$host"}) - sum(collectd_swap{instance="$host", swap="free"})',
- legendFormat="SWAP Used"
- )
- )
- )
-)
-.addRow(
- row.new(
- title='Basic Network / Disk Graph',
- height=graphHeight
- )
- .addPanel(
- graphpanel.new(
- title="Network Traffic / Second",
- description="Basic network info per interface",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="bps",
- labelY1="Receive (-) / Send (+)",
-
- )
- .addSeriesOverride({
- "alias": "/.*receive.*/",
- "transform": "negative-Y"
- })
- .addTarget(
- prometheus.target(
- expr="irate(collectd_interface_if_octets_rx_total{instance='$host'}[$rate]) * 8",
- legendFormat="{{interface}} receive"
- )
- )
- .addTarget(
- prometheus.target(
- expr="irate(collectd_interface_if_octets_tx_total{instance='$host'}[$rate]) * 8",
- legendFormat="{{interface}} send"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Network Packets / Second",
- description="Basic network info per interface",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="pps",
- labelY1="Receive (-) / Send (+)",
-
- )
- .addSeriesOverride({
- "alias": "/.*receive.*/",
- "transform": "negative-Y"
- })
- .addTarget(
- prometheus.target(
- expr="irate(collectd_interface_if_packets_rx_total{instance='$host'}[$rate]) * 8",
- legendFormat="{{interface}} receive"
- )
- )
- .addTarget(
- prometheus.target(
- expr="irate(collectd_interface_if_packets_tx_total{instance='$host'}[$rate]) * 8",
- legendFormat="{{interface}} send"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Disk Activity / Second",
- description="Disk Activity / Second",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="Bps",
- labelY1="Read (-) / Write (+)",
- legend_hideZero=true,
- legend_hideEmpty=true
- )
- .addSeriesOverride({
- "alias": "/.*Read.*/",
- "transform": "negative-Y"
- })
- .addTarget(
- prometheus.target(
- expr='irate(collectd_disk_disk_octets_read_total{instance="$host", disk=~".*\\\\d+"}[$rate])',
- legendFormat="{{disk}} - Read"
- )
- )
- .addTarget(
- prometheus.target(
- expr='irate(collectd_disk_disk_octets_write_total{instance="$host", disk=~".*\\\\d+"}[$rate])',
- legendFormat="{{disk}} - Write"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Disk IOPS",
- description="Disk iops per disk",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="iops",
- labelY1="Read (-) / Write (+)",
- legend_hideZero=true,
- legend_hideEmpty=true
- )
- .addSeriesOverride({
- "alias": "/.*Read.*/",
- "transform": "negative-Y"
- })
- .addTarget(
- prometheus.target(
- expr='irate(collectd_disk_disk_ops_read_total{instance="$host", disk=~".*\\\\d+"}[$rate])',
- legendFormat="{{disk}} - Read"
- )
- )
- .addTarget(
- prometheus.target(
- expr='irate(collectd_disk_disk_ops_write_total{instance="$host", disk=~".*\\\\d+"}[$rate])',
- legendFormat="{{disk}} - Write"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Disk Used",
- description="Disk space used",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="decbytes",
- legend_hideZero=true,
- legend_hideEmpty=true
- )
- .addTarget(
- prometheus.target(
- expr='sum(collectd_df_df_complex{instance="$host", type="used"}) by (df)',
- legendFormat="{{df}}"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Disk Queue Length",
- description="The amount of requests pending in the disk queue",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="short",
- legend_hideZero=true,
- legend_hideEmpty=true
- )
- .addTarget(
- prometheus.target(
- expr='irate(collectd_disk_disk_io_time_weighted_io_time_total{instance="$host",disk=~".*[0-9]+"}[$rate]) / 1000',
- legendFormat="{{disk}}"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Disk Latency",
- description="Disk access times",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="ms",
- labelY1="Read (-) / Write (+)",
- legend_hideZero=true,
- legend_hideEmpty=true
- )
- .addSeriesOverride({
- "alias": "/.*Read.*/",
- "transform": "negative-Y"
- })
- .addTarget(
- prometheus.target(
- expr='irate(collectd_disk_disk_time_read_total{instance="$host",disk=~".*[0-9]+"}[$rate])',
- legendFormat="{{disk}} - Read"
- )
- )
- .addTarget(
- prometheus.target(
- expr='irate(collectd_disk_disk_time_write_total{instance="$host",disk=~".*[0-9]+"}[$rate])',
- legendFormat="{{disk}} - Write"
- )
- )
- )
-)
-.addRow(
- row.new(
- title='CPU Details',
- height=graphHeight,
- collapse=true
- )
- .addPanel(
- graphpanel.new(
- title="CPU User",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="percent",
- min=0,
- max=100
- )
- .addTarget(
- prometheus.target(
- expr='sum(rate(collectd_cpu_total{instance="$host", type="user"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100',
- legendFormat="{{cpu}}"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="CPU System",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="percent",
- min=0,
- max=100
- )
- .addTarget(
- prometheus.target(
- expr='sum(rate(collectd_cpu_total{instance="$host", type="system"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100',
- legendFormat="{{cpu}}"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="CPU IOWait",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="percent",
- min=0,
- max=100
- )
- .addTarget(
- prometheus.target(
- expr='sum(rate(collectd_cpu_total{instance="$host", type="wait"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100',
- legendFormat="{{cpu}}"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="CPU SoftIRQ",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="percent",
- min=0,
- max=100
- )
- .addTarget(
- prometheus.target(
- expr='sum(rate(collectd_cpu_total{instance="$host", type="softirq"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100',
- legendFormat="{{cpu}}"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="CPU Other",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="percent",
- min=0,
- max=100
- )
- .addTarget(
- prometheus.target(
- expr='sum(rate(collectd_cpu_total{instance="$host", type=~"(interrupt|nice|steal)"}[$rate])) by (cpu, type) / ignoring(type) group_left sum(rate(collectd_cpu_total{instance="$host" }[$rate])) by (cpu) * 100',
- legendFormat="{{cpu}} - {{type}}"
- )
- )
- )
-)
-.addRow(
- row.new(
- title='Advanced Details',
- height=graphHeight,
- collapse=true
- )
- .addPanel(
- graphpanel.new(
- title="Context Switches / Second",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="short"
- )
- .addTarget(
- prometheus.target(
- expr='irate(collectd_contextswitch_total{instance="$host"}[$rate])',
- legendFormat="Context Switches"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="IRQ Activity",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="short",
- legend_hideEmpty=true,
- legend_hideZero=true
- )
- .addTarget(
- prometheus.target(
- expr='rate(collectd_irq_total{instance="$host", irq != "LOC"}[$rate])',
- legendFormat="{{irq}}"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="NUMA Activity",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="short",
- legend_hideEmpty=true,
- legend_hideZero=true
- )
- .addTarget(
- prometheus.target(
- expr='irate(collectd_numa_vmpage_action_total{instance="$host"}[$rate])',
- legendFormat="{{numa}} - {{type}}"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="TCP Connection Activity",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="short",
- legend_hideEmpty=true,
- legend_hideZero=true
- )
- .addTarget(
- prometheus.target(
- expr='collectd_tcpconns_tcp_connections{instance="$host"}',
- legendFormat="{{tcpconns}} - {{type}}"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="TCP Connection Activity",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="short",
- legend_hideEmpty=true,
- legend_hideZero=true
- )
- .addTarget(
- prometheus.target(
- expr='rate(collectd_protocols_protocol_counter_total{instance="$host"}[$rate])',
- legendFormat="{{protocols}} - {{type}}"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Processor Speeds",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="hertz",
- points=true,
- lines=false,
- pointradius=5,
- legend_hideEmpty=true,
- legend_hideZero=true
- )
- .addTarget(
- prometheus.target(
- expr='collectd_cpufreq{instance="$host"}',
- legendFormat="{{cpufreq}}"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Page Cache Activity",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="short",
- legend_hideEmpty=true,
- legend_hideZero=true
- )
- .addTarget(
- prometheus.target(
- expr='rate(collectd_vmem_vmpage_faults_majflt_total{instance="$host"}[$rate])',
- legendFormat="Major fault"
- )
- )
- .addTarget(
- prometheus.target(
- expr='rate(collectd_vmem_vmpage_faults_minflt_total{instance="$host"}[$rate])',
- legendFormat="Minor fault"
- )
- )
- .addTarget(
- prometheus.target(
- expr='rate(collectd_vmem_vmpage_action_total{instance="$host"}[$rate])',
- legendFormat="Action - {{vmem}}"
- )
- )
- .addTarget(
- prometheus.target(
- expr='rate(collectd_vmem_vmpage_io_in_total{instance="$host"}[$rate])',
- legendFormat="IO read page"
- )
- )
- .addTarget(
- prometheus.target(
- expr='rate(collectd_vmem_vmpage_io_out_total{instance="$host"}[$rate])',
- legendFormat="IO write page"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Page Cache Layout",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="short",
- percentage=true,
- stack=true,
- min=0,
- max=100,
- legend_hideEmpty=true,
- legend_hideZero=true
- )
- .addTarget(
- prometheus.target(
- expr='collectd_vmem_vmpage_number{instance="$host"}',
- legendFormat="{{vmem}}"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="Process Activity",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="short",
- legend_hideEmpty=true,
- legend_hideZero=true
- )
- .addTarget(
- prometheus.target(
- expr='collectd_processes_ps_count_threads{instance="$host"}',
- legendFormat="Thread Count"
- )
- )
- .addTarget(
- prometheus.target(
- expr='collectd_processes_ps_count_processes{instance="$host"}',
- legendFormat="Process Count"
- )
- )
- .addTarget(
- prometheus.target(
- expr='collectd_processes_ps_state{instance="$host"}',
- legendFormat="Process State - {{processes}}"
- )
- )
- )
-)
-.addRow(
- row.new(
- title='Basic Cassandra Overview',
- height=singlestatHeight
- )
- .addPanel(
- singlestat.new(
- 'SSTable Count',
- description="Number of sstables on the node",
- format='short',
- datasource='$PROMETHEUS_DS',
- sparklineShow=true,
- gaugeShow=true,
- span=singlestatSpan,
- thresholds="100000,500000"
- )
- .addTarget(
- prometheus.target(
- "sum(" + prefix + "_table_live_ss_table_count{instance='$host'})"
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'Pending Compactions',
- description="Number of pending compactions on the node",
- format='short',
- datasource='$PROMETHEUS_DS',
- sparklineShow=true,
- gaugeShow=true,
- span=singlestatSpan,
- thresholds="10,50"
- )
- .addTarget(
- prometheus.target(
- "sum(" + prefix + "_compaction_pending_tasks{instance='$host'})"
- )
- )
- )
- .addPanel(
- singlestat.new(
- 'Connected Clients',
- description="Number of client connections to the node",
- format='percent',
- datasource='$PROMETHEUS_DS',
- sparklineShow=true,
- gaugeShow=true,
- span=singlestatSpan,
- thresholds="100,1000"
- )
- .addTarget(
- prometheus.target(
- "sum(" + prefix + "_client_connected_native_clients{instance='$host'})"
- )
- )
- )
- .addPanel(
- graphpanel.new(
- title="GC Activity",
- datasource='$PROMETHEUS_DS',
- span=graphSpan,
- format="bytes",
- legend_hideEmpty=true,
- legend_hideZero=true
- )
- .addTarget(
- prometheus.target(
- expr='sum(' + prefix + '_jvm_memory_max{instance="$host", memory_type="total"})',
- legendFormat="JVM Heap Total"
- )
- )
- .addSeriesOverride({
- "alias": "/.*Total.*/",
- "fill": 0
- })
- .addTarget(
- prometheus.target(
- expr='sum(' + prefix + '_jvm_memory_used{instance="$host", memory_type="non_heap"})',
- legendFormat="JVM Non-Heap Used"
- )
- )
- .addTarget(
- prometheus.target(
- expr='sum(' + prefix + '_jvm_memory_used{instance="$host", memory_type="heap"})',
- legendFormat="JVM Heap Used"
- )
- )
- )
-)
\ No newline at end of file
diff --git a/dashboards/grafana/make-dashboards.sh b/dashboards/grafana/make-dashboards.sh
deleted file mode 100755
index 7e80adf..0000000
--- a/dashboards/grafana/make-dashboards.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash
-
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-cd $DIR
-for file in dashboards-jsonnet/*; do
- name=$(basename $file);
- echo "Generating ${name%.jsonnet}.json"
- docker run -v `pwd`:/here datastax/grafonnet-lib:v0.1.3 jsonnet --ext-str prefix=mcac /here/$file > `pwd`/generated-dashboards/${name%.jsonnet}.json;
-done
diff --git a/make_package.sh b/make_package.sh
index 2394077..3073c31 100755
--- a/make_package.sh
+++ b/make_package.sh
@@ -41,10 +41,7 @@ cd $PACKAGE_DIR
tar zcvf $PROJECT_DIR_NAME.tar.gz $PROJECT_DIR_NAME
zip $PROJECT_DIR_NAME.zip $(tar ztf $PROJECT_DIR_NAME.tar.gz)
popd
-pushd .
-cd dashboards/grafana
-./make-dashboards.sh
-popd
+mixin/make-dashboards.sh
DASHBOARD_DIR_NAME=datastax-mcac-dashboards-$VERSION
mkdir -p $PACKAGE_DIR/$DASHBOARD_DIR_NAME/grafana
diff --git a/mixin/README.md b/mixin/README.md
new file mode 100644
index 0000000..b2db7fb
--- /dev/null
+++ b/mixin/README.md
@@ -0,0 +1,14 @@
+# MCAC Mixin
+
+The MCAC Mixin is a set of configurable, reusable, and extensible alerts and dashboards for Apache Cassandra.
+
+## Requirements
+
+- [Jsonnet](https://github.com/google/go-jsonnet)
+- [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler) (`jb`)
+
+## Install as library
+
+```shell
+jb install github.com/datastax/metric-collector-for-apache-cassandra/mixin@master
+```
diff --git a/mixin/config.libsonnet b/mixin/config.libsonnet
new file mode 100644
index 0000000..cd9503e
--- /dev/null
+++ b/mixin/config.libsonnet
@@ -0,0 +1,5 @@
+{
+ _config+:: {  // hidden (::) config object; '+' merges with any _config supplied by other mixin parts
+ metricPrefix: 'mcac',  // prepended to the Cassandra metric names used in the dashboard queries
+ },
+}
diff --git a/mixin/dashboards.jsonnet b/mixin/dashboards.jsonnet
new file mode 100644
index 0000000..9d913ed
--- /dev/null
+++ b/mixin/dashboards.jsonnet
@@ -0,0 +1,6 @@
+local dashboards = (import 'mixin.libsonnet').grafanaDashboards;  // the dashboard files declare grafanaDashboards as hidden (::)
+
+{  // re-export every dashboard as a visible top-level field keyed by its file name
+ [name]: dashboards[name]
+ for name in std.objectFields(dashboards)
+}
diff --git a/mixin/dashboards/cassandra-condensed.libsonnet b/mixin/dashboards/cassandra-condensed.libsonnet
new file mode 100644
index 0000000..452d699
--- /dev/null
+++ b/mixin/dashboards/cassandra-condensed.libsonnet
@@ -0,0 +1,529 @@
+local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
+
+local dashboard = grafana.dashboard;
+local row = grafana.row;
+local singlestat = grafana.singlestat;
+local graphpanel = grafana.graphPanel;
+local text = grafana.text;
+local prometheus = grafana.prometheus;
+local template = grafana.template;
+
+local graphHeight = 300;
+local singlestatHeight = 100;
+local singlestatSpan = 1;
+local graphSpan = 4;
+
+{
+ _config+:: {
+ metricPrefix: error 'must provide metric prefix',
+ },
+ grafanaDashboards+:: {
+ 'cassandra-condensed.json':
+ dashboard.new(
+ 'Cassandra Cluster Condensed',
+ description='Single pane of glass for most important Cassandra metrics',
+ schemaVersion=14,
+ refresh='30s',
+ time_from='now-30m',
+ editable=true,
+ tags=['os'],
+ style='dark'
+ )
+ .addTemplate(
+ template.datasource(
+ 'PROMETHEUS_DS',
+ 'prometheus',
+ 'Prometheus',
+ hide='all',
+ )
+ )
+
+ .addTemplate(
+ template.custom(
+ 'by',
+ 'cluster,dc,rack,instance',
+ 'cluster',
+ valuelabels={
+ cluster: 'Cluster',
+ dc: 'Datacenter',
+ rack: 'Rack',
+ instance: 'Host',
+ },
+ label='Group By',
+ )
+ )
+ .addTemplate(
+ template.interval(
+ 'rate',
+ '1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d',
+ '5m',
+ label='Rate',
+ )
+ )
+ .addTemplate(
+ template.new(
+ 'cluster',
+ '$PROMETHEUS_DS',
+ 'label_values(collectd_collectd_queue_length{}, cluster)',
+ label='Cluster',
+ refresh='time',
+ includeAll=true,
+ allValues='.*',
+ )
+ )
+ .addTemplate(
+ template.new(
+ 'dc',
+ '$PROMETHEUS_DS',
+ 'label_values(collectd_collectd_queue_length{cluster=~"$cluster"}, dc)',
+ label='DataCenter',
+ refresh='time',
+ includeAll=true,
+ allValues='.*',
+ )
+ )
+ .addTemplate(
+ template.new(
+ 'rack',
+ '$PROMETHEUS_DS',
+ 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc"}, rack)',
+ label='Rack',
+ refresh='time',
+ includeAll=true,
+ allValues='.*',
+ )
+ )
+ .addTemplate(
+ template.new(
+ 'keyspace',
+ '$PROMETHEUS_DS',
+ 'label_values(' + $._config.metricPrefix + '_table_read_latency_total{cluster=~"$cluster", dc=~"$dc"}, keyspace)',
+ label='Keyspace',
+ refresh='time',
+ includeAll=true,
+ allValues='.*',
+ )
+ )
+ .addTemplate(
+ template.new(
+ 'table',
+ '$PROMETHEUS_DS',
+ 'label_values(' + $._config.metricPrefix + '_table_read_latency_total{cluster=~"$cluster", dc=~"$dc", keyspace=~"$keyspace"}, table)',
+ label='Table',
+ refresh='time',
+ includeAll=true,
+ allValues='.*',
+ )
+ )
+ .addTemplate(
+ template.new(
+ 'host',
+ '$PROMETHEUS_DS',
+ 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc", rack=~"$rack"}, instance)',
+ label='Host',
+ refresh='time',
+ includeAll=true,
+ allValues='.*',
+ )
+ )
+ .addTemplate(
+ template.custom(
+ 'latency',
+ '0.999,0.99,0.98,0.95,0.90,0.75,0.50',
+ '0.95',
+ valuelabels={
+ '0.999': 'P999',
+ '0.99': 'P99',
+ '0.98': 'P98',
+ '0.95': 'P95',
+ '0.90': 'P90',
+ '0.75': 'P75',
+ '0.50': 'P50',
+ },
+ label='Percentile'
+ )
+ )
+ .addRow(
+ row.new(
+ title='Cluster Overview',
+ height=singlestatHeight,
+ )
+ .addPanel(
+ singlestat.new(
+ 'Nodes Up',
+ description='Nodes that are currently running in this time window',
+ format='none',
+ decimals=0,
+ datasource='$PROMETHEUS_DS',
+ colorValue=true,
+ colors=['#d44a3a', '#299c46', '#299c46'],
+ thresholds='0.1,1000',
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'count(' + $._config.metricPrefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"} >= 0) or vector(0)'
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'Nodes Down',
+ description='Nodes that are currently not running in this time window',
+ format='none',
+ decimals=0,
+ colorValue=true,
+ colors=['#299c46', 'rgba(237, 129, 40, 0.89)', '#d44a3a'],
+ datasource='$PROMETHEUS_DS',
+ thresholds='1,2',
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'count(absent(sum(rate(' + $._config.metricPrefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[5m])))) OR vector(0)'
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'Compactions / $rate',
+ description='Rate of compactions during this window',
+ format='none',
+ decimals=0,
+ datasource='$PROMETHEUS_DS',
+ sparklineShow=true,
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'sum(rate(' + $._config.metricPrefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))'
+ )
+ )
+ )
+ .addPanel(  // single stat: summed rate of client requests across the selected nodes
+ singlestat.new(
+ 'CQL Requests / $rate',
+ description='Rate of CQL requests during this window',
+ format='none',
+ datasource='$PROMETHEUS_DS',
+ sparklineShow=true,
+ decimals=0,
+ span=singlestatSpan
+ )
+ .addTarget(  // metric name is built from the configured prefix, like every other query in this dashboard
+ prometheus.target(
+ 'sum(irate(' + $._config.metricPrefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))'
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'Dropped Messages / $rate',
+ description='Rate of Dropped requests during this window',
+ format='none',
+ datasource='$PROMETHEUS_DS',
+ sparklineShow=true,
+ thresholds='30,300',
+ colorValue=true,
+ decimals=0,
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'sum(irate(' + $._config.metricPrefix + '_table_dropped_mutations_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))'
+ )
+ )
+ )
+ .addPanel(
+ text.new(
+ transparent=true,
+ mode='html',
+ content='',
+ span=2
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'CQL Clients',
+ description='Number of connected clients during this time window',
+ format='none',
+ datasource='$PROMETHEUS_DS',
+ sparklineShow=true,
+ thresholds='100,1000',
+ colorValue=true,
+ decimals=0,
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'sum(' + $._config.metricPrefix + '_client_connected_native_clients{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"})'
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'Timeouts / $rate',
+ description='Client timeouts over the last $rate',
+ format='none',
+ datasource='$PROMETHEUS_DS',
+ thresholds='100,300',
+ colorValue=true,
+ sparklineShow=true,
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'sum(irate(' + $._config.metricPrefix + '_client_request_timeouts_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))',
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'Hints / $rate',
+ description='Hints stored over the last $rate',
+ format='none',
+ datasource='$PROMETHEUS_DS',
+ thresholds='1000,30000',
+ colorValue=true,
+ sparklineShow=true,
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'sum(irate(' + $._config.metricPrefix + '_storage_hints_on_disk_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))'
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'Data Size',
+ description='Data',
+ format='bytes',
+ datasource='$PROMETHEUS_DS',
+ sparklineShow=true,
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'sum(' + $._config.metricPrefix + '_table_live_disk_space_used_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"})'
+ )
+ )
+ )
+ .addPanel(  // single stat: summed rate of the JVM GC time metric across the selected nodes
+ singlestat.new(
+ 'GC Time / $rate',
+ description='Rate of JVM garbage collection time during this window',
+ format='ms',
+ decimals=1,
+ datasource='$PROMETHEUS_DS',
+ sparklineShow=true,
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'sum(rate(' + $._config.metricPrefix + '_jvm_gc_time{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate]))'
+ )
+ )
+ )
+ )
+ .addRow(
+ row.new(
+ title='Condensed Metrics',
+ height=graphHeight
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Requests Served / $by / $rate',
+ description='(no keyspace/table filters apply)',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ labelY2='Clients Connected',
+ legend_hideZero=true,
+ legend_hideEmpty=true
+ )
+ .addSeriesOverride({
+ alias: '/.*Connected/',
+ yaxis: 2,
+ })
+ .addTarget(
+ prometheus.target(
+ expr='sum(irate(' + $._config.metricPrefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by, request_type)',
+ legendFormat='{{$by}}:{{request_type}}'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(' + $._config.metricPrefix + '_client_connected_native_clients{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}) by ($by)',
+ legendFormat='{{$by}}:Clients Connected'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Coordinator $latency Latency / $by',
+ description='(no keyspace/table filters apply)',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='µs',
+ min=0,
+ legend_hideZero=true,
+ legend_hideEmpty=true
+ )
+ .addTarget(
+ prometheus.target(
+ expr='histogram_quantile($latency, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by (le, request_type, $by))',
+ legendFormat='$by:{{$by}} {{$latency}} {{request_type}}'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Memtable Space $keyspace.$table / $by',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ formatY1='bytes',
+ formatY2='short',
+ labelY2='Flush',
+ min=0,
+ legend_hideZero=true,
+ legend_hideEmpty=true
+ )
+ .addSeriesOverride({
+ alias: '/.*Flushes/',
+ bars: true,
+ lines: false,
+ zindex: -3,
+ yaxis: 2,
+ })
+ .addTarget(
+ prometheus.target(
+ expr='sum(' + $._config.metricPrefix + '_table_memtable_off_heap_size{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}) by ($by)',
+ legendFormat='{{$by}} : Off Heap'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(' + $._config.metricPrefix + '_table_memtable_on_heap_size{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}) by ($by)',
+ legendFormat='{{$by}} : On Heap'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(idelta(' + $._config.metricPrefix + '_table_memtable_switch_count_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)',
+ legendFormat='{{$by}} : Flushes'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(idelta(' + $._config.metricPrefix + '_table_pending_flushes_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)',
+ legendFormat='{{$by}} : Pending Flushes'
+ )
+ )
+ )
+ .addPanel(  // compaction write throughput on the left axis; series named "...Compactions" drawn as bars on the right axis
+ graphpanel.new(
+ title='Compactions $keyspace.$table / $by',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='Bps',
+ formatY2='short',
+ labelY2='Count',
+ legend_hideZero=true,
+ legend_hideEmpty=true,
+ min=0
+ )
+ .addSeriesOverride({  // matches the "Pending Compactions" and "Completed Compactions" legends below
+ alias: '/.*Compactions/',
+ bars: true,
+ lines: false,
+ zindex: -3,
+ yaxis: 2,
+ })
+ .addTarget(
+ prometheus.target(
+ expr='sum(irate(' + $._config.metricPrefix + '_table_compaction_bytes_written_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)',
+ legendFormat='{{$by}} : Bytes Compacted'
+ )
+ )
+ .addTarget(  // NOTE(review): pending compactions looks like a gauge, so irate() may not be intended here; confirm
+ prometheus.target(
+ expr='sum(irate(' + $._config.metricPrefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by ($by)',
+ legendFormat='{{$by}} : Pending Compactions'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(irate(' + $._config.metricPrefix + '_compaction_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by)',
+ legendFormat='{{$by}} : Completed Compactions'
+ )
+ )
+ )
+ .addPanel(  // $latency quantile of the per-table latency histograms, one series per operation type and $by group
+ graphpanel.new(
+ title='Table $latency Latency / $by',
+ description='',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='µs',
+ min=0,
+ legend_hideZero=true,
+ legend_hideEmpty=true
+ )
+ .addTarget(
+ prometheus.target(
+ expr='histogram_quantile($latency, sum(irate(' + $._config.metricPrefix + '_table_range_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))',
+ legendFormat='$by:{{$by}} Local Range Scan'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='histogram_quantile($latency, sum(irate(' + $._config.metricPrefix + '_table_read_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))',
+ legendFormat='$by:{{$by}} Local Read'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='histogram_quantile($latency, sum(irate(' + $._config.metricPrefix + '_table_write_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))',
+ legendFormat='$by:{{$by}} Local Write'
+ )
+ )
+ .addTarget(  // filtered by $host like the sibling targets so the Host variable applies to every series
+ prometheus.target(
+ expr='histogram_quantile($latency, sum(irate(' + $._config.metricPrefix + '_table_coordinator_read_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))',
+ legendFormat='$by:{{$by}} Coordinator Read'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='histogram_quantile($latency, sum(irate(' + $._config.metricPrefix + '_table_coordinator_scan_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", keyspace=~"$keyspace", table=~"$table", instance=~"$host"}[$rate])) by (le, $by))',
+ legendFormat='$by:{{$by}} Coordinator Range Scan'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Streaming / $by / $rate',
+ description='',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='Bps',
+ min=0,
+ legend_hideZero=true,
+ legend_hideEmpty=true
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(irate(' + $._config.metricPrefix + '_streaming_total_incoming_bytes_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by)',
+ legendFormat='{{$by}}: Incoming Stream'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(irate(' + $._config.metricPrefix + '_streaming_total_outgoing_bytes_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$host"}[$rate])) by ($by)',
+ legendFormat='{{$by}}: Outgoing Stream'
+ )
+ )
+ )
+ ),
+ },
+}
diff --git a/mixin/dashboards/dashboards.libsonnet b/mixin/dashboards/dashboards.libsonnet
new file mode 100644
index 0000000..aa49aca
--- /dev/null
+++ b/mixin/dashboards/dashboards.libsonnet
@@ -0,0 +1,3 @@
+(import './cassandra-condensed.libsonnet') +  // object '+' merges the dashboard files into a single mixin object
+(import './overview.libsonnet') +
+(import './system-metrics.libsonnet')
diff --git a/mixin/dashboards/overview.libsonnet b/mixin/dashboards/overview.libsonnet
new file mode 100644
index 0000000..1c1effc
--- /dev/null
+++ b/mixin/dashboards/overview.libsonnet
@@ -0,0 +1,1108 @@
+local grafana = (import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet') +
+ (import 'github.com/thelastpickle/grafonnet-polystat-panel/plugin.libsonnet');
+local dashboard = grafana.dashboard;
+local prometheus = grafana.prometheus;
+local template = grafana.template;
+local row = grafana.row;
+
+local graphPanel = grafana.graphPanel;
+local tablePanel = grafana.tablePanel;
+local singleStatPanel = grafana.singlestat;
+local textPanel = grafana.text;
+local polystatPanel = grafana.polystatPanel;
+
+local fillLatencySeriesOverrides = {
+ alias: 'p999',
+ fillBelowTo: 'p98',
+ lines: false,
+};
+local removeMinLatencySeriesOverrides = {
+ alias: 'p98',
+ lines: false,
+};
+
+local fillMinMaxSeriesOverrides = {
+ alias: 'max',
+ fillBelowTo: 'min',
+ lines: false,
+};
+local removeMinlineSeriesOverrides = {
+ alias: 'min',
+ lines: false,
+};
+
+
+// used in the single stat panels where higher is better - cache hit rates for example
+local reversedColors = [
+ '#d44a3a',
+ 'rgba(237, 129, 40, 0.89)',
+ '#299c46',
+];
+
+{
+ _config+:: {
+ metricPrefix: error 'must provide metric prefix',
+ },
+ grafanaDashboards+:: {
+ 'overview.json':
+ dashboard.new(
+ 'Cassandra Overview',
+ schemaVersion=14,
+ refresh='30s',
+ time_from='now-30m',
+ editable=true,
+ tags=['Cassandra', 'Overview'],
+ style='dark'
+ )
+ .addTemplate(
+ grafana.template.datasource(
+ 'PROMETHEUS_DS',
+ 'prometheus',
+ 'Prometheus',
+ hide='all',
+ )
+ )
+ .addTemplate(
+ template.new(
+ 'cluster',
+ '$PROMETHEUS_DS',
+ 'label_values(collectd_collectd_queue_length{}, cluster)',
+ label='Cluster',
+ refresh='time',
+ )
+ )
+ .addTemplate(
+ template.new(
+ 'dc',
+ '$PROMETHEUS_DS',
+ 'label_values(collectd_collectd_queue_length{cluster=~"$cluster"}, dc)',
+ label='DataCenter',
+ refresh='time',
+ includeAll=true,
+ allValues='.*',
+ )
+ )
+ .addTemplate(
+ template.new(
+ 'rack',
+ '$PROMETHEUS_DS',
+ 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc"}, rack)',
+ label='Rack',
+ refresh='time',
+ includeAll=true,
+ allValues='.*',
+ )
+ )
+ .addTemplate(
+ template.new(
+ 'node',
+ '$PROMETHEUS_DS',
+ 'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc", rack=~"$rack"}, instance)',
+ label='Node',
+ refresh='time',
+ includeAll=true,
+ allValues='.*',
+ )
+ )
+ .addRow(
+ row.new(title='', height='50px')
+ .addPanel(textPanel.new(transparent=true))
+ .addPanel(
+ textPanel.new(
+ transparent=true,
+ mode='html',
+ content='',
+ )
+ )
+ .addPanel(textPanel.new(transparent=true))
+ )
+ .addRow(
+ row.new(title='Request Throughputs (Coordinator Perspective)')
+ .addPanel(
+ graphPanel.new(
+ 'Request Throughputs',
+ description='Total Requests Per Cluster, by Request Type',
+ format='rps',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ min=0,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='{{request_type}}',
+ )
+ )
+ )
+ .addPanel(
+ graphPanel.new(
+ 'Error throughputs',
+ description='Total Timeouts, Failures, Unavailable Rates for each cluster',
+ format='rps',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ min=0,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_failures_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='{{request_type}} failures',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_timeouts_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='{{request_type}} timeouts',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_unavailables_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='{{request_type}} unavailable errors',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_unfinished_commit_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='{{request_type}} unfinished commit errors',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_condition_not_met_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='{{request_type}} condition not met errors',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_contention_histogram_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='{{request_type}} contention histogram errors',
+ )
+ )
+ )
+ .addPanel(
+ singleStatPanel.new(
+ 'Read / Write Distribution',
+ description='Part of reads in the total of standard requests (Reads+Writes). CAS, Views, ... operations are ignored.',
+ format='percentunit',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ postfix=' Reads',
+ postfixFontSize='30%',
+ valueFontSize='30%',
+ valueName='current',
+ decimals=2,
+ thresholds='0.25,0.5,0.75',
+ timeFrom='',
+ colors=[
+ '#DEB6F2',
+ '#CA95E5',
+ '#8F3BB8',
+ ],
+ gaugeShow=true,
+ gaugeMinValue=0,
+ gaugeMaxValue=1,
+ gaugeThresholdLabels=true,
+ gaugeThresholdMarkers=false,
+ sparklineFillColor='rgba(31, 118, 189, 0.18)',
+ sparklineFull=false,
+ sparklineLineColor='#FFB357',
+ sparklineShow=false
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s])) / ignoring (request_type) (sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s])) + ignoring (request_type) sum by (cluster, request_type) (rate(' + $._config.metricPrefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[1m:30s])))',
+ )
+ )
+ )
+ .addPanel(
+ graphPanel.new(
+ 'Read Latency (98 - 999th percentile)',
+ description='Read latency for coordinated reads',
+ format='µs',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ min=0,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='histogram_quantile(0.98, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[5m])) by (le, cluster))',
+ legendFormat='p98',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='histogram_quantile(0.99, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[5m])) by (le, cluster))',
+ legendFormat='p99',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='histogram_quantile(0.999, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[5m])) by (le, cluster))',
+ legendFormat='p999',
+ )
+ )
+ .addSeriesOverride(fillLatencySeriesOverrides)
+ .addSeriesOverride(removeMinLatencySeriesOverrides)
+ )
+ .addPanel(
+ graphPanel.new(
+ 'Write Latency (98th - p999 Percentile)',
+ description='Write latency for coordinated writes',
+ format='µs',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ min=0,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='histogram_quantile(0.98, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))',
+ legendFormat='p98',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='histogram_quantile(0.99, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))',
+ legendFormat='p99',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='histogram_quantile(0.999, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))',
+ legendFormat='p999',
+ )
+ )
+ .addSeriesOverride(fillLatencySeriesOverrides)
+ .addSeriesOverride(removeMinLatencySeriesOverrides)
+ )
+ .addPanel(
+ graphPanel.new(
+ 'Other Latencies',
+ description='Other p99 latencies for coordinated requests',
+ format='µs',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ min=0,
+ )
+ .addTarget(
+ prometheus.target(
+ // The request_type!~"write|read|.*-.*" matcher drops the read/write types charted in the panels above and any request_type containing "-", which is meant to exclude per-consistency-level series (e.g. "Read-LOCAL_ONE"; exact casing not verified here)
+ expr='histogram_quantile(0.99, sum(rate(' + $._config.metricPrefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type!~"write|read|.*-.*"}[1m:30s])) by (le, request_type, cluster))',
+ legendFormat='p99 {{request_type}}'
+ )
+ )
+ )
+ )
+ .addRow(
+ row.new(title='Nodes Status',)
+ .addPanel(
+ polystatPanel.new(
+ 'Nodes Status',
+ description="Nodes Status uses Internal/Gossip activity. Be mindful that if Native or Thrift protocol are disabled, the nodes won't be reachable, and still marked up",
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ span=12,
+ global_unit_format='none',
+ global_operator_name='current',
+ global_thresholds=[
+ {
+ value: 0,
+ state: 2,
+ color: '#d44a3a',
+ },
+ {
+ value: 1,
+ state: 0,
+ color: '#299c46',
+ },
+ ],
+ range_maps=[
+ {
+ from: '0',
+ to: '0.9999',
+ text: 'DOWN',
+ },
+ {
+ from: '1',
+ to: '1',
+ text: 'UP',
+ },
+ ],
+ mapping_type=2,
+ value_enabled=true,
+ )
+ .addTarget(
+ prometheus.target(
+ 'max by (cluster, dc, rack, instance) (changes(' + $._config.metricPrefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="gossip_stage"}[2m:30s])) > bool 0',
+ legendFormat='{{instance}}',
+ instant=true,
+ )
+ )
+ )
+ .addPanel(
+ singleStatPanel.new(
+ 'Nodes Count',
+ description='Nodes up and down in the cluster',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ decimals=0,
+ prefix='Total:',
+ postfix=' Nodes',
+ postfixFontSize='80%',
+ valueFontSize='80%',
+ span=4
+ )
+ .addTarget(
+ prometheus.target(
+ expr='count by (cluster) (max by (cluster, dc, rack, instance) (collectd_collectd_queue_length{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}))',
+ legendFormat='Total Number Of Nodes',
+ )
+ )
+ )
+ .addPanel(  // history of total node count versus nodes showing native-protocol or gossip thread-pool activity
+ graphPanel.new(
+ 'Nodes Status History',
+ description='Nodes up and down in the cluster per protocol/activity',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ decimals=0,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=false,
+ shared_tooltip=false,
+ min=0,
+ span=8
+ )
+ .addTarget(
+ prometheus.target(
+ expr='count by (cluster) (max by (cluster, dc, rack, instance) (collectd_collectd_queue_length{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}))',
+ legendFormat='Total Number Of Nodes',
+ )
+ )
+ .addTarget(  // a node counts as active when its pool's completed-task counter changed within the 2m window
+ prometheus.target(
+ expr='sum by (cluster) (max by (cluster, dc, rack, instance) (changes(' + $._config.metricPrefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="native"}[2m:30s])) > bool 0)',
+ legendFormat='Nodes Coordinating Requests (Native protocol)',
+ )
+ )
+ .addTarget(  // grouped by the dc label, matching the selectors and the total-node query above
+ prometheus.target(
+ expr='sum by (cluster) (max by (cluster, dc, rack, instance) (changes(' + $._config.metricPrefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="gossip_stage"}[2m:30s])) > bool 0)',
+ legendFormat='Nodes With Internal Activity (Gossip protocol)',
+ )
+ )
+ )
+ )
+ .addRow(
+ row.new(title='Data Status')
+ .addPanel(
+ tablePanel.new(
+ 'Disk Space Usage',
+ description='Disk space used ordered (fullest disks first)',
+ datasource='$PROMETHEUS_DS',
+ transform='timeseries_aggregations',
+ transparent=true,
+ styles=[
+ {
+ alias: 'Node --> Mounting Point',
+ colorMode: null,
+ colors: [
+ 'rgba(245, 54, 54, 0.9)',
+ 'rgba(237, 129, 40, 0.89)',
+ 'rgba(50, 172, 45, 0.97)',
+ ],
+ dateFormat: 'YYYY-MM-DD HH:mm:ss',
+ decimals: 2,
+ mappingType: 1,
+ pattern: 'Metric',
+ preserveFormat: true,
+ sanitize: true,
+ thresholds: [],
+ type: 'string',
+ unit: 'short',
+ },
+ {
+ alias: '% Disk Space Used',
+ colorMode: 'row',
+ colors: [
+ 'rgba(50, 172, 45, 0.97)',
+ 'rgba(237, 129, 40, 0.89)',
+ 'rgba(245, 54, 54, 0.9)',
+ ],
+ dateFormat: 'YYYY-MM-DD HH:mm:ss',
+ decimals: 2,
+ link: false,
+ mappingType: 1,
+ pattern: 'Current',
+ thresholds: [
+ '0.5',
+ '0.75',
+ ],
+ type: 'number',
+ unit: 'percentunit',
+ },
+ ],
+ columns=[
+ {
+ text: 'Current',
+ value: 'current',
+ },
+ ],
+ sort={
+ col: 1,
+ desc: true,
+ }
+ )
+ .addTarget(
+ prometheus.target(
+ expr='min by (instance, df) (1-(collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="free"}\n / ignoring (type) (collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="used"}\n + ignoring (type) collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="reserved"}\n + ignoring (type) collectd_df_df_complex{df!~".*lxcfs.*", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", type="free"}))\n )',
+ legendFormat='{{cluster}}-{{instance}} --> {{df}}',
+ instant=true
+ )
+ )
+ )
+ .addPanel(
+ graphPanel.new(
+ 'Cassandra cluster Data Size',
+ description='Total sizes of the data on distinct nodes',
+ format='bytes',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster) (' + $._config.metricPrefix + '_table_live_disk_space_used_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='Live space - {{cluster}}',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster) (' + $._config.metricPrefix + '_table_total_disk_space_used_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='Total space - {{cluster}}',
+ )
+ )
+ )
+ .addPanel(
+ graphPanel.new(
+ 'SSTable Count',
+ description='SSTable Count Max and Average per table',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ min=0,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='max by (cluster, keyspace, table) (' + $._config.metricPrefix + '_table_live_ss_table_count{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='Table - {{keyspace}}.{{table}}',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='max by (cluster) (' + $._config.metricPrefix + '_table_live_ss_table_count{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='Max in cluster - {{cluster}}',
+ )
+ )
+ )
+ .addPanel(
+ graphPanel.new(
+ 'Pending Compactions',
+ description='Maximum pending compactions on any node in the cluster',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ min=0,
+ bars=false,
+ lines=true,
+ stack=false,
+ decimals=0,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='max by (cluster) (' + $._config.metricPrefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='max',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='min by (cluster) (' + $._config.metricPrefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='min',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='avg by (cluster) (' + $._config.metricPrefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='avg',
+ )
+ )
+ .addSeriesOverride(fillMinMaxSeriesOverrides)
+ .addSeriesOverride(removeMinlineSeriesOverrides)
+ )
+ .addPanel(
+ graphPanel.new(
+ 'Pending Compactions per Table',
+ description='Maximum pending compactions per table',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ min=0,
+ bars=false,
+ lines=true,
+ stack=true,
+ decimals=0,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='max by (cluster, keyspace, table) (' + $._config.metricPrefix + '_table_pending_compactions{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='max for {{keyspace}}.{{table}}',
+ )
+ )
+ )
+ )
+ .addRow(
+ row.new(title='Cassandra Internals')
+ .addPanel(
+ graphPanel.new(
+ 'Pending Tasks',
+ description='Cluster wide pending threads, by thread pool name',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ min=0,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster, pool_name) (' + $._config.metricPrefix + '_thread_pools_pending_tasks{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='{{cluster}} - pending {{pool_name}}',
+ )
+ )
+ )
+ .addPanel(
+ graphPanel.new(
+ 'Blocked Tasks',
+ description='Cluster wide blocked threads, by thread pool name',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ min=0,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster, pool_name) (rate(' + $._config.metricPrefix + '_thread_pools_total_blocked_tasks_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='{{cluster}} - blocked {{pool_name}}',
+ )
+ )
+ )
+ .addPanel(
+ graphPanel.new(
+ 'Dropped Messages',
+ description='Dropped messages rate summed by message type and cluster',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ min=0,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster, message_type) (rate(' + $._config.metricPrefix + '_dropped_message_dropped_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='{{cluster}} - dropped {{message_type}}',
+ )
+ )
+ )
+ .addPanel(
+ graphPanel.new(
+ 'Active Tasks',
+ description='active threads summed per cluster',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ min=0,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster, pool_name) (' + $._config.metricPrefix + '_thread_pools_active_tasks{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='{{cluster}} - active {{pool_name}}',
+ )
+ )
+ )
+ .addPanel(
+ graphPanel.new(
+ 'Hinted Handoff',
+ description='Sum of hints being handed off per cluster.',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ min=0,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum by (cluster) (' + $._config.metricPrefix + '_storage_total_hints_in_progress_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='count',
+ )
+ )
+ )
+ )
+ .addRow(
+ row.new(title='Hardware / Operating System')
+ .addPanel(
+ graphPanel.new(
+ 'CPU Utilization',
+ description='Maximum CPU utilisation (max 100%)',
+ format='percentunit',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ percentage=true,
+ decimals=1,
+ min=0,
+ max=1,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='max by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{type="idle", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))))',
+ legendFormat='max',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='min by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{type="idle", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))))',
+ legendFormat='min',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='avg by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{type="idle", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / sum by (cluster, dc, rack, instance) (rate(collectd_cpu_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))))',
+ legendFormat='avg',
+ )
+ )
+ .addSeriesOverride(fillMinMaxSeriesOverrides)
+ .addSeriesOverride(removeMinlineSeriesOverrides)
+ )
+ .addPanel(
+ graphPanel.new(
+ 'Unix Load (1m rate)',
+ description='Max Unix load on a node for a cluster',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='max by (cluster) (collectd_load_shortterm{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='max',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ 'min by (cluster) (collectd_load_shortterm{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='min',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ 'avg by (cluster) (collectd_load_shortterm{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='avg',
+ )
+ )
+ .addSeriesOverride(fillMinMaxSeriesOverrides)
+ .addSeriesOverride(removeMinlineSeriesOverrides)
+ )
+ .addPanel(
+ graphPanel.new(
+ 'Memory Utilisation',
+ description='Maximum Memory allocated per usage (worst node) - excludes caches, buffers, etc',
+ format='bytes',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ fill=1,
+ linewidth=2,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='min by (cluster) (sum by (cluster, dc, rack, instance) (collectd_memory{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}))',
+ legendFormat='min memory available',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='max by (cluster, memory) (sum by (cluster, dc, rack, instance, memory) (collectd_memory{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}))',
+ legendFormat='max memory {{memory}}',
+ )
+ )
+ )
+ .addPanel(  // Disk read throughput: max / min / avg by cluster of the read octet rate
+ graphPanel.new(
+ 'Disk Read Throughput',
+ description='Disk read throughput',
+ format='Bps',  // octet counters are bytes; 'bps' would label the rate as bits/sec
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='max by (cluster) (rate(collectd_processes_disk_octets_read_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='max',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ 'min by (cluster) (rate(collectd_processes_disk_octets_read_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='min',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ 'avg by (cluster) (rate(collectd_processes_disk_octets_read_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='avg',
+ )
+ )
+ .addSeriesOverride(fillMinMaxSeriesOverrides)  // overrides are defined outside this panel
+ .addSeriesOverride(removeMinlineSeriesOverrides)
+ )
+ .addPanel(  // Disk write throughput: max / min / avg by cluster of the write octet rate
+ graphPanel.new(
+ 'Disk Write Throughput',
+ description='Disk write throughput',
+ format='Bps',  // octet counters are bytes; 'bps' would label the rate as bits/sec
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ )
+ .addTarget(
+ prometheus.target(
+ expr='max by (cluster) (rate(collectd_processes_disk_octets_write_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='max',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ 'min by (cluster) (rate(collectd_processes_disk_octets_write_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='min',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ 'avg by (cluster) (rate(collectd_processes_disk_octets_write_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='avg',
+ )
+ )
+ .addSeriesOverride(fillMinMaxSeriesOverrides)  // overrides are defined outside this panel
+ .addSeriesOverride(removeMinlineSeriesOverrides)
+ )
+ // Network I/O: cluster-wide octet rates; the 'incoming' series is flipped below the axis by the override.
+ .addPanel(
+ graphPanel.new(
+ 'Network I/O',
+ description='Network In and Out per cluster',
+ format='bytes',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=1,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ bars=false,
+ )
+ .addTarget(
+ prometheus.target(  // outgoing traffic reads the transmit (tx) counter, not the receive (rx) one
+ 'sum by (cluster) (rate(collectd_interface_if_octets_tx_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='outgoing',
+ )
+ )
+ .addTarget(
+ prometheus.target(  // incoming traffic reads the receive (rx) counter
+ 'sum by (cluster) (rate(collectd_interface_if_octets_rx_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='incoming',
+ )
+ )
+ .addSeriesOverride({
+ alias: 'incoming',  // must match the legendFormat of the rx target above
+ transform: 'negative-Y',
+ })
+ )
+ )
+ .addRow(
+ row.new(title='JVM / Garbage Collection')
+ .addPanel(
+ graphPanel.new(
+ 'Application Throughput (% time NOT doing GC)',
+ description='Percentage of the time the node is *not* doing a GC, thus Cassandra is not stopped for GC',
+ format='percentunit',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ decimals=2,
+ min=0,
+ max=1,
+ )
+ .addTarget(
+ prometheus.target(
+ 'max by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + $._config.metricPrefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))',
+ legendFormat='max',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ 'min by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + $._config.metricPrefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))',
+ legendFormat='min',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ 'avg by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + $._config.metricPrefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))',
+ legendFormat='avg',
+ )
+ )
+ .addSeriesOverride(fillMinMaxSeriesOverrides)
+ .addSeriesOverride(removeMinlineSeriesOverrides)
+ )
+ .addPanel(
+ graphPanel.new(
+ 'Garbage Collection Time',
+ description='Garbage collection duration',
+ format='ms',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ fill=0,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ )
+ .addTarget(
+ prometheus.target(
+ 'max by (cluster) (rate(' + $._config.metricPrefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='max',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ 'min by (cluster) (rate(' + $._config.metricPrefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='min',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ 'avg by (cluster) (rate(' + $._config.metricPrefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
+ legendFormat='avg',
+ )
+ )
+ .addSeriesOverride(fillMinMaxSeriesOverrides)
+ .addSeriesOverride(removeMinlineSeriesOverrides)
+ )
+ .addPanel(
+ graphPanel.new(
+ 'JVM Heap Memory Utilisation',
+ description='Maximum JVM Heap Memory size (worst node) and minimum available heap size',
+ format='bytes',
+ datasource='$PROMETHEUS_DS',
+ transparent=true,
+ legend_show=true,
+ legend_values=true,
+ legend_current=true,
+ legend_alignAsTable=true,
+ legend_sort='current',
+ legend_sortDesc=true,
+ shared_tooltip=false,
+ fill=1,
+ linewidth=2,
+ )
+ .addTarget(
+ prometheus.target(
+ 'max by (cluster)\n (' + $._config.metricPrefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='max',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ 'min by (cluster)\n (' + $._config.metricPrefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='min',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ 'avg by (cluster)\n (' + $._config.metricPrefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='avg',
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ 'min by ( cluster)\n (' + $._config.metricPrefix + '_jvm_memory_max{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
+ legendFormat='Heap memory available',
+ )
+ )
+ .addSeriesOverride(fillMinMaxSeriesOverrides)
+ .addSeriesOverride(removeMinlineSeriesOverrides)
+ )
+ ),
+ },
+}
diff --git a/mixin/dashboards/system-metrics.libsonnet b/mixin/dashboards/system-metrics.libsonnet
new file mode 100644
index 0000000..ed70c34
--- /dev/null
+++ b/mixin/dashboards/system-metrics.libsonnet
@@ -0,0 +1,908 @@
+local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
+
+local dashboard = grafana.dashboard;
+local row = grafana.row;
+local singlestat = grafana.singlestat;
+local graphpanel = grafana.graphPanel;
+local text = grafana.text;
+local prometheus = grafana.prometheus;
+local template = grafana.template;
+
+local textstatHeight = 100;
+local graphHeight = 250;
+local singlestatHeight = 125;
+local singlestatSpan = 2;
+local graphSpan = 6;
+
+{
+ _config+:: {
+ metricPrefix: error 'must provide metric prefix',
+ },
+ grafanaDashboards+:: {
+ 'system-metrics.json':
+ dashboard.new(
+ 'System & Node Metrics',
+ description='Operating System Metrics and Apache Cassandra Node Information',
+ schemaVersion=14,
+ time_from='now-30m',
+ refresh='1m',
+ tags=['os'],
+ style='dark'
+ )
+ .addTemplate(
+ template.datasource(
+ 'PROMETHEUS_DS',
+ 'prometheus',
+ 'Prometheus',
+ hide='all',
+ )
+ )
+ .addTemplate(
+ template.interval(
+ 'rate',
+ '1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d',
+ '5m',
+ label='Rate',
+ )
+ )
+ .addTemplate(
+ template.new(
+ 'host',
+ '$PROMETHEUS_DS',
+ 'label_values(collectd_collectd_queue_length{}, instance)',
+ label='Host',
+ refresh='time',
+ )
+ )
+ .addPanel(
+ text.new(
+ transparent=true,
+ mode='html',
+ content=''
+ ),
+ {
+ h: 3,
+ w: 5,
+ x: 9,
+ y: -1,
+ },
+ )
+ .addRow(
+ row.new(
+ title='Basic CPU / Mem / Disk Gauge',
+ height=singlestatHeight,
+ )
+ .addPanel(
+ singlestat.new(
+ 'CPU Busy',
+ description='Busy state of all CPU cores together',
+ format='percent',
+ datasource='$PROMETHEUS_DS',
+ thresholds='85,95',
+ sparklineShow=true,
+ gaugeShow=true,
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ "(1 - ((sum(irate(collectd_cpu_total{instance='$host', type='idle'}[$rate])) by (instance) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) by (instance)))) * 100",
+ )
+ )
+ )
+ .addPanel(  // Gauge: share of RAM in use, treating free, cached and buffered memory as available
+ singlestat.new(
+ 'Memory Used',
+ description='Percentage of memory used (ignoring page cache)',
+ format='percent',
+ datasource='$PROMETHEUS_DS',
+ thresholds='85,95',
+ sparklineShow=true,
+ gaugeShow=true,
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(  // used % = 100 * (1 - (free + cached + buffered) / total); without the "1 -" this is the available share
+ '100 * (1 - (sum(collectd_memory{instance="$host", memory="free"}) + sum(collectd_memory{instance="$host", memory="cached"}) + sum(collectd_memory{instance="$host", memory="buffered"})) / sum(collectd_memory{instance="$host"}))',
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'Swap Used',
+ description='Percentage of swap in use',
+ format='percent',
+ datasource='$PROMETHEUS_DS',
+ thresholds='0,1',
+ sparklineShow=true,
+ gaugeShow=true,
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ "(sum(collectd_swap{instance='$host',swap='used'}) / sum(collectd_swap{instance='$host'})) * 100"
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'Disk Used',
+ description='Percentage of root disk in use',
+ format='percent',
+ datasource='$PROMETHEUS_DS',
+ thresholds='50,85',
+ sparklineShow=true,
+ gaugeShow=true,
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ '(sum(collectd_df_df_complex{instance="$host", df="root", type="used"}) / sum(collectd_df_df_complex{instance="$host", df="root"})) * 100'
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'CPU System Load (1m avg)',
+ description='Busy state of all CPU cores together (1 min average)',
+ format='percent',
+ datasource='$PROMETHEUS_DS',
+ thresholds='85,95',
+ sparklineShow=true,
+ gaugeShow=true,
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'avg(collectd_load_shortterm{instance="$host"}) / count(count(collectd_cpu_total{instance="$host"}) by (cpu)) * 100',
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'CPU System Load (5m avg)',
+ description='Busy state of all CPU cores together (5 min average)',
+ format='percent',
+ datasource='$PROMETHEUS_DS',
+ thresholds='85,95',
+ sparklineShow=true,
+ gaugeShow=true,
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'avg(collectd_load_midterm{instance="$host"}) / count(count(collectd_cpu_total{instance="$host"}) by (cpu)) * 100',
+ )
+ )
+ )
+ )
+ .addRow(
+ row.new(
+ title='Basic CPU / Mem / Disk Info',
+ height=textstatHeight
+ )
+ .addPanel(
+ singlestat.new(
+ 'CPU Cores',
+ description='Total number of CPU cores',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'count(count(collectd_cpu_total{instance="$host"}) by (cpu))'
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'Total RAM',
+ description='Total amount of system memory',
+ format='bytes',
+ datasource='$PROMETHEUS_DS',
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'sum(collectd_memory{instance="$host"})'
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'Total Swap',
+ description='Total amount of swap space',
+ format='bytes',
+ datasource='$PROMETHEUS_DS',
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'sum(collectd_swap{instance="$host"})'
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'Total RootFS',
+ description='Total amount of disk space',
+ format='bytes',
+ datasource='$PROMETHEUS_DS',
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'sum(collectd_df_df_complex{df="root",instance="$host"})'
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'System Load (1m avg)',
+ description='System Load (1m avg)',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'collectd_load_shortterm{instance="$host"}'
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'System Uptime',
+ description='Uptime of the host',
+ format='s',
+ decimals=1,
+ datasource='$PROMETHEUS_DS',
+ span=singlestatSpan
+ )
+ .addTarget(
+ prometheus.target(
+ 'collectd_uptime{instance="$host"}'
+ )
+ )
+ )
+ )
+ .addRow(
+ row.new(
+ title='Basic CPU / Mem Graph',
+ height=graphHeight
+ )
+ .addPanel(
+ graphpanel.new(
+ title='CPU Basic',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ percentage=true,
+ stack=true,
+ min=0,
+ max=100
+ )
+ .addTarget(
+ prometheus.target(
+ expr="sum(irate(collectd_cpu_total{instance='$host',type='system'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100",
+ legendFormat='Busy System'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr="sum(irate(collectd_cpu_total{instance='$host',type='user'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100",
+ legendFormat='Busy User'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr="sum(irate(collectd_cpu_total{instance='$host',type='wait'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100",
+ legendFormat='Busy IOWait'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr="sum(irate(collectd_cpu_total{instance='$host',type='softirq'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100",
+ legendFormat='Busy IRQ'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr="sum(irate(collectd_cpu_total{instance='$host',type!='idle',type!='system',type!='user',type!='wait',type!='softirq'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100",
+ legendFormat='Busy Other'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr="sum(irate(collectd_cpu_total{instance='$host',type='idle'}[$rate])) / sum(irate(collectd_cpu_total{instance='$host'}[$rate])) * 100",
+ legendFormat='Idle'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Basic memory usage',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='bytes',
+ min=0
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(collectd_memory{instance="$host"})',
+ legendFormat='RAM Total'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(collectd_memory{instance="$host"}) - sum(collectd_memory{instance="$host", memory="free"}) - sum(collectd_memory{instance="$host", memory="cached"}) - sum(collectd_memory{instance="$host", memory="buffered"})',
+ legendFormat='RAM Used'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(collectd_memory{instance="$host", memory="cached"}) + sum(collectd_memory{instance="$host", memory="buffered"})',
+ legendFormat='RAM Cache + Buffer'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(collectd_memory{instance="$host", memory="free"})',
+ legendFormat='RAM Free'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(collectd_swap{instance="$host"}) - sum(collectd_swap{instance="$host", swap="free"})',
+ legendFormat='SWAP Used'
+ )
+ )
+ )
+ )
+ .addRow(
+ row.new(
+ title='Basic Network / Disk Graph',
+ height=graphHeight
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Network Traffic / Second',
+ description='Basic network info per interface',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='bps',
+ labelY1='Receive (-) / Send (+)',
+
+ )
+ .addSeriesOverride({
+ alias: '/.*receive.*/',
+ transform: 'negative-Y',
+ })
+ .addTarget(
+ prometheus.target(
+ expr="irate(collectd_interface_if_octets_rx_total{instance='$host'}[$rate]) * 8",
+ legendFormat='{{interface}} receive'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr="irate(collectd_interface_if_octets_tx_total{instance='$host'}[$rate]) * 8",
+ legendFormat='{{interface}} send'
+ )
+ )
+ )
+ // Packets per second per interface; receive series are flipped below the axis by the override.
+ .addPanel(
+ graphpanel.new(
+ title='Network Packets / Second',
+ description='Basic network info per interface',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='pps',
+ labelY1='Receive (-) / Send (+)',
+ )
+ .addSeriesOverride({
+ alias: '/.*receive.*/',  // regex alias: matches every '{{interface}} receive' legend
+ transform: 'negative-Y',
+ })
+ .addTarget(
+ prometheus.target(  // packet counts are plotted as-is: no "* 8" bytes-to-bits scaling applies to packets
+ expr="irate(collectd_interface_if_packets_rx_total{instance='$host'}[$rate])",
+ legendFormat='{{interface}} receive'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr="irate(collectd_interface_if_packets_tx_total{instance='$host'}[$rate])",
+ legendFormat='{{interface}} send'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Disk Activity / Second',
+ description='Disk Activity / Second',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='Bps',
+ labelY1='Read (-) / Write (+)',
+ legend_hideZero=true,
+ legend_hideEmpty=true
+ )
+ .addSeriesOverride({
+ alias: '/.*Read.*/',
+ transform: 'negative-Y',
+ })
+ .addTarget(
+ prometheus.target(
+ expr='irate(collectd_disk_disk_octets_read_total{instance="$host", disk=~".*\\\\d+"}[$rate])',
+ legendFormat='{{disk}} - Read'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='irate(collectd_disk_disk_octets_write_total{instance="$host", disk=~".*\\\\d+"}[$rate])',
+ legendFormat='{{disk}} - Write'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Disk IOPS',
+ description='Disk iops per disk',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='iops',
+ labelY1='Read (-) / Write (+)',
+ legend_hideZero=true,
+ legend_hideEmpty=true
+ )
+ .addSeriesOverride({
+ alias: '/.*Read.*/',
+ transform: 'negative-Y',
+ })
+ .addTarget(
+ prometheus.target(
+ expr='irate(collectd_disk_disk_ops_read_total{instance="$host", disk=~".*\\\\d+"}[$rate])',
+ legendFormat='{{disk}} - Read'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='irate(collectd_disk_disk_ops_write_total{instance="$host", disk=~".*\\\\d+"}[$rate])',
+ legendFormat='{{disk}} - Write'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Disk Used',
+ description='Disk space used',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='decbytes',
+ legend_hideZero=true,
+ legend_hideEmpty=true
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(collectd_df_df_complex{instance="$host", type="used"}) by (df)',
+ legendFormat='{{df}}'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Disk Queue Length',
+ description='The amount of requests pending in the disk queue',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='short',
+ legend_hideZero=true,
+ legend_hideEmpty=true
+ )
+ .addTarget(
+ prometheus.target(
+ expr='irate(collectd_disk_disk_io_time_weighted_io_time_total{instance="$host",disk=~".*[0-9]+"}[$rate]) / 1000',
+ legendFormat='{{disk}}'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Disk Latency',
+ description='Disk access times',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='ms',
+ labelY1='Read (-) / Write (+)',
+ legend_hideZero=true,
+ legend_hideEmpty=true
+ )
+ .addSeriesOverride({
+ alias: '/.*Read.*/',
+ transform: 'negative-Y',
+ })
+ .addTarget(
+ prometheus.target(
+ expr='irate(collectd_disk_disk_time_read_total{instance="$host",disk=~".*[0-9]+"}[$rate])',
+ legendFormat='{{disk}} - Read'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='irate(collectd_disk_disk_time_write_total{instance="$host",disk=~".*[0-9]+"}[$rate])',
+ legendFormat='{{disk}} - Write'
+ )
+ )
+ )
+ )
+ .addRow(
+ row.new(
+ title='CPU Details',
+ height=graphHeight,
+ collapse=true
+ )
+ .addPanel(
+ graphpanel.new(
+ title='CPU User',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='percent',
+ min=0,
+ max=100
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(rate(collectd_cpu_total{instance="$host", type="user"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100',
+ legendFormat='{{cpu}}'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='CPU System',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='percent',
+ min=0,
+ max=100
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(rate(collectd_cpu_total{instance="$host", type="system"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100',
+ legendFormat='{{cpu}}'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='CPU IOWait',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='percent',
+ min=0,
+ max=100
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(rate(collectd_cpu_total{instance="$host", type="wait"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100',
+ legendFormat='{{cpu}}'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='CPU SoftIRQ',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='percent',
+ min=0,
+ max=100
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(rate(collectd_cpu_total{instance="$host", type="softirq"}[$rate])) by (cpu) / sum(rate(collectd_cpu_total{instance="$host"}[$rate])) by (cpu) * 100',
+ legendFormat='{{cpu}}'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='CPU Other',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='percent',
+ min=0,
+ max=100
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(rate(collectd_cpu_total{instance="$host", type=~"(interrupt|nice|steal)"}[$rate])) by (cpu, type) / ignoring(type) group_left sum(rate(collectd_cpu_total{instance="$host" }[$rate])) by (cpu) * 100',
+ legendFormat='{{cpu}} - {{type}}'
+ )
+ )
+ )
+ )
+ .addRow(
+ row.new(
+ title='Advanced Details',
+ height=graphHeight,
+ collapse=true
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Context Switches / Second',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='short'
+ )
+ .addTarget(
+ prometheus.target(
+ expr='irate(collectd_contextswitch_total{instance="$host"}[$rate])',
+ legendFormat='Context Switches'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='IRQ Activity',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='short',
+ legend_hideEmpty=true,
+ legend_hideZero=true
+ )
+ .addTarget(
+ prometheus.target(
+ expr='rate(collectd_irq_total{instance="$host", irq != "LOC"}[$rate])',
+ legendFormat='{{irq}}'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='NUMA Activity',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='short',
+ legend_hideEmpty=true,
+ legend_hideZero=true
+ )
+ .addTarget(
+ prometheus.target(
+ expr='irate(collectd_numa_vmpage_action_total{instance="$host"}[$rate])',
+ legendFormat='{{numa}} - {{type}}'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='TCP Connection Activity',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='short',
+ legend_hideEmpty=true,
+ legend_hideZero=true
+ )
+ .addTarget(
+ prometheus.target(
+ expr='collectd_tcpconns_tcp_connections{instance="$host"}',
+ legendFormat='{{tcpconns}} - {{type}}'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Network Protocol Activity',  // was a second 'TCP Connection Activity'; this panel graphs the collectd protocol counters, not tcpconns
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='short',
+ legend_hideEmpty=true,
+ legend_hideZero=true
+ )
+ .addTarget(
+ prometheus.target(
+ expr='rate(collectd_protocols_protocol_counter_total{instance="$host"}[$rate])',
+ legendFormat='{{protocols}} - {{type}}'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Processor Speeds',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='hertz',
+ points=true,
+ lines=false,
+ pointradius=5,
+ legend_hideEmpty=true,
+ legend_hideZero=true
+ )
+ .addTarget(
+ prometheus.target(
+ expr='collectd_cpufreq{instance="$host"}',
+ legendFormat='{{cpufreq}}'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Page Cache Activity',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='short',
+ legend_hideEmpty=true,
+ legend_hideZero=true
+ )
+ .addTarget(
+ prometheus.target(
+ expr='rate(collectd_vmem_vmpage_faults_majflt_total{instance="$host"}[$rate])',
+ legendFormat='Major fault'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='rate(collectd_vmem_vmpage_faults_minflt_total{instance="$host"}[$rate])',
+ legendFormat='Minor fault'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='rate(collectd_vmem_vmpage_action_total{instance="$host"}[$rate])',
+ legendFormat='Action - {{vmem}}'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='rate(collectd_vmem_vmpage_io_in_total{instance="$host"}[$rate])',
+ legendFormat='IO read page'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='rate(collectd_vmem_vmpage_io_out_total{instance="$host"}[$rate])',
+ legendFormat='IO write page'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Page Cache Layout',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='short',
+ percentage=true,
+ stack=true,
+ min=0,
+ max=100,
+ legend_hideEmpty=true,
+ legend_hideZero=true
+ )
+ .addTarget(
+ prometheus.target(
+ expr='collectd_vmem_vmpage_number{instance="$host"}',
+ legendFormat='{{vmem}}'
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='Process Activity',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='short',
+ legend_hideEmpty=true,
+ legend_hideZero=true
+ )
+ .addTarget(
+ prometheus.target(
+ expr='collectd_processes_ps_count_threads{instance="$host"}',
+ legendFormat='Thread Count'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='collectd_processes_ps_count_processes{instance="$host"}',
+ legendFormat='Process Count'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='collectd_processes_ps_state{instance="$host"}',
+ legendFormat='Process State - {{processes}}'
+ )
+ )
+ )
+ )
+ .addRow(
+ row.new(
+ title='Basic Cassandra Overview',
+ height=singlestatHeight
+ )
+ .addPanel(
+ singlestat.new(
+ 'SSTable Count',
+ description='Number of sstables on the node',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ sparklineShow=true,
+ gaugeShow=true,
+ span=singlestatSpan,
+ thresholds='100000,500000'
+ )
+ .addTarget(
+ prometheus.target(
+ 'sum(' + $._config.metricPrefix + "_table_live_ss_table_count{instance='$host'})"
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'Pending Compactions',
+ description='Number of pending compactions on the node',
+ format='short',
+ datasource='$PROMETHEUS_DS',
+ sparklineShow=true,
+ gaugeShow=true,
+ span=singlestatSpan,
+ thresholds='10,50'
+ )
+ .addTarget(
+ prometheus.target(
+ 'sum(' + $._config.metricPrefix + "_compaction_pending_tasks{instance='$host'})"
+ )
+ )
+ )
+ .addPanel(
+ singlestat.new(
+ 'Connected Clients',
+ description='Number of client connections to the node',
+ format='short',  // an absolute connection count, not a ratio: use the same unit as the sibling count stats in this row
+ datasource='$PROMETHEUS_DS',
+ sparklineShow=true,
+ gaugeShow=true,
+ span=singlestatSpan,
+ thresholds='100,1000'
+ )
+ .addTarget(
+ prometheus.target(
+ 'sum(' + $._config.metricPrefix + "_client_connected_native_clients{instance='$host'})"
+ )
+ )
+ )
+ .addPanel(
+ graphpanel.new(
+ title='GC Activity',
+ datasource='$PROMETHEUS_DS',
+ span=graphSpan,
+ format='bytes',
+ legend_hideEmpty=true,
+ legend_hideZero=true
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(' + $._config.metricPrefix + '_jvm_memory_max{instance="$host", memory_type="total"})',
+ legendFormat='JVM Heap Total'
+ )
+ )
+ .addSeriesOverride({
+ alias: '/.*Total.*/',
+ fill: 0,
+ })
+ .addTarget(
+ prometheus.target(
+ expr='sum(' + $._config.metricPrefix + '_jvm_memory_used{instance="$host", memory_type="non_heap"})',
+ legendFormat='JVM Non-Heap Used'
+ )
+ )
+ .addTarget(
+ prometheus.target(
+ expr='sum(' + $._config.metricPrefix + '_jvm_memory_used{instance="$host", memory_type="heap"})',
+ legendFormat='JVM Heap Used'
+ )
+ )
+ )
+ ),
+ },
+}
diff --git a/mixin/jsonnetfile.json b/mixin/jsonnetfile.json
new file mode 100644
index 0000000..8e6cf3c
--- /dev/null
+++ b/mixin/jsonnetfile.json
@@ -0,0 +1,24 @@
+{
+ "version": 1,
+ "dependencies": [
+ {
+ "source": {
+ "git": {
+ "remote": "https://github.com/grafana/grafonnet-lib.git",
+ "subdir": "grafonnet"
+ }
+ },
+ "version": "master"
+ },
+ {
+ "source": {
+ "git": {
+ "remote": "https://github.com/thelastpickle/grafonnet-polystat-panel.git",
+ "subdir": ""
+ }
+ },
+ "version": "master"
+ }
+ ],
+ "legacyImports": false
+}
diff --git a/mixin/jsonnetfile.lock.json b/mixin/jsonnetfile.lock.json
new file mode 100644
index 0000000..79d7d41
--- /dev/null
+++ b/mixin/jsonnetfile.lock.json
@@ -0,0 +1,26 @@
+{
+ "version": 1,
+ "dependencies": [
+ {
+ "source": {
+ "git": {
+ "remote": "https://github.com/grafana/grafonnet-lib.git",
+ "subdir": "grafonnet"
+ }
+ },
+ "version": "6db00c292d3a1c71661fc875f90e0ec7caa538c2",
+ "sum": "gF8foHByYcB25jcUOBqP6jxk0OPifQMjPvKY0HaCk6w="
+ },
+ {
+ "source": {
+ "git": {
+ "remote": "https://github.com/thelastpickle/grafonnet-polystat-panel.git",
+ "subdir": ""
+ }
+ },
+ "version": "275a48de57afdac0d72219d82863d8ab8bd0e682",
+ "sum": "pXSXxNxi4WvBKYZ83GVYotQyL+toHaizqvjJ+8YYMoU="
+ }
+ ],
+ "legacyImports": false
+}
diff --git a/mixin/make-dashboards.sh b/mixin/make-dashboards.sh
new file mode 100755
index 0000000..1fef13e
--- /dev/null
+++ b/mixin/make-dashboards.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Regenerates the Grafana dashboard JSON from mixin/dashboards.jsonnet using jsonnet inside a container.
+ROOT_DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+OUTPUT_DIR="${ROOT_DIR}/dashboards/grafana/generated-dashboards"
+# Start from an empty output directory so stale dashboards never linger; -p also creates missing parents.
+rm -rf "${OUTPUT_DIR}"
+mkdir -p "${OUTPUT_DIR}"
+# --rm removes the one-shot container on exit instead of leaving a stopped container behind on every run.
+docker run --rm -v "${ROOT_DIR}:${ROOT_DIR}" datastax/grafonnet-lib:v0.1.3 \
+ jsonnet --multi "${OUTPUT_DIR}" "${ROOT_DIR}/mixin/dashboards.jsonnet"
diff --git a/mixin/mixin.libsonnet b/mixin/mixin.libsonnet
new file mode 100644
index 0000000..b291077
--- /dev/null
+++ b/mixin/mixin.libsonnet
@@ -0,0 +1,2 @@
+local dashboards = import './dashboards/dashboards.libsonnet', config = import './config.libsonnet';
+dashboards + config  // mixin entry point: the dashboards object composed with the config object (config on the right-hand side)