From 177fb45fb15db3af5ce053412c47ef38a745eb60 Mon Sep 17 00:00:00 2001 From: Alain Rodriguez Date: Wed, 17 Mar 2021 12:01:13 +0100 Subject: [PATCH] Fixes to read path dashboard --- .../dashboards-jsonnet/read-path.jsonnet | 190 ++++++-- .../generated-dashboards/read-path.json | 409 ++++++++++++++++-- 2 files changed, 543 insertions(+), 56 deletions(-) diff --git a/dashboards/grafana/dashboards-jsonnet/read-path.jsonnet b/dashboards/grafana/dashboards-jsonnet/read-path.jsonnet index 3e79a53..30ce92e 100644 --- a/dashboards/grafana/dashboards-jsonnet/read-path.jsonnet +++ b/dashboards/grafana/dashboards-jsonnet/read-path.jsonnet @@ -5,24 +5,27 @@ local template = grafana.template; local row = grafana.row; local graphPanel = grafana.graphPanel; -local tablePanel = grafana.tablePanel; -local singleStatPanel = grafana.singlestat; local textPanel = grafana.text; -local polystatPanel = grafana.polystatPanel; local prefix = std.extVar('prefix'); +local fillLatencySeriesOverrides = { + 'alias': 'p999', + 'fillBelowTo': 'p98', + 'lines': false +}; + local fillMinMaxSeriesOverrides = { 'alias': 'max', 'fillBelowTo': 'min', 'lines': false }; + local removeMinlineSeriesOverrides = { 'alias': 'min', 'lines': false }; - // used in the single stat panels where higher is better - cache hit rates for example local reversedColors =[ '#d44a3a', @@ -36,7 +39,7 @@ dashboard.new( refresh='30s', time_from='now-30m', editable=true, - tags=['Cassandra', 'Read-Path'], + tags=['Cassandra', 'Read', 'Read-Path', 'Select'], style='dark' ) .addTemplate( @@ -125,19 +128,19 @@ dashboard.new( ) .addTarget( prometheus.target( - expr='sum by (cluster, keyspace, table) (rate(' + prefix + '_table_read_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))', + expr='sum by (cluster, keyspace, table) (rate(' + prefix + '_table_read_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', legendFormat='Reads: {{keyspace}}.{{table}}', ) ) .addTarget( prometheus.target( - expr='sum by (cluster, keyspace, table) (rate(' + prefix + '_table_range_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))', + expr='sum by (cluster, keyspace, table) (rate(' + prefix + '_table_range_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', legendFormat='Ranges: {{keyspace}}.{{table}}', ) ) .addTarget( prometheus.target( - expr='sum by (cluster, keyspace, table) (rate(' + prefix + '_table_cas_prepare_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))', + expr='sum by (cluster, keyspace, table) (rate(' + prefix + '_table_cas_prepare_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', legendFormat='Cas Prepare: {{keyspace}}.{{table}}', ) ) @@ -161,19 +164,19 @@ dashboard.new( ) .addTarget( prometheus.target( - expr='sum by (cluster, dc, rack, instance) (rate(' + prefix + '_table_read_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))', + expr='sum by (cluster, dc, rack, instance) (rate(' + prefix + '_table_read_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', legendFormat='Reads: {{instance}}', ) ) .addTarget( prometheus.target( - expr='sum by (cluster, dc, rack, instance) (rate(' + prefix + '_table_range_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))', + expr='sum by (cluster, dc, rack, instance) (rate(' + prefix + '_table_range_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', legendFormat='Ranges: {{instance}}', ) ) .addTarget( prometheus.target( - expr='sum by (cluster, dc, rack, instance) (rate(' + prefix + '_table_cas_prepare_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))', + expr='sum by (cluster, dc, rack, instance) (rate(' + prefix + '_table_cas_prepare_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', legendFormat='Cas Prepare: {{instance}}', ) ) @@ -197,19 +200,19 @@ dashboard.new( ) .addTarget( prometheus.target( - expr='sum by (cluster, dc, rack, instance, keyspace, table) (rate(' + prefix + '_table_read_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))', + expr='sum by (cluster, dc, rack, instance, keyspace, table) (rate(' + prefix + '_table_read_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', legendFormat='Reads: {{keyspace}}.{{table}} on {{instance}}', ) ) .addTarget( prometheus.target( - expr='sum by (cluster, dc, rack, instance, keyspace, table) (rate(' + prefix + '_table_range_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))', + expr='sum by (cluster, dc, rack, instance, keyspace, table) (rate(' + prefix + '_table_range_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', legendFormat='Ranges: {{keyspace}}.{{table}} on {{instance}}', ) ) .addTarget( prometheus.target( - expr='sum by (cluster, dc, rack, instance, keyspace, table) (rate(' + prefix + '_table_cas_prepare_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))', + expr='sum by (cluster, dc, rack, instance, keyspace, table) (rate(' + prefix + '_table_cas_prepare_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', legendFormat='Cas Prepare: {{keyspace}}.{{table}} on {{instance}}', ) ) @@ -239,19 +242,19 @@ dashboard.new( ) .addTarget( prometheus.target( - expr='histogram_quantile(0.98, sum by (cluster, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))', + expr='histogram_quantile(0.98, sum by (cluster, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))', legendFormat='p98 - {{keyspace}}.{{table}}', ) ) .addTarget( prometheus.target( - expr='histogram_quantile(0.99, sum by (cluster, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))', + expr='histogram_quantile(0.99, sum by (cluster, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))', legendFormat='p99 - {{keyspace}}.{{table}}', ) ) .addTarget( prometheus.target( - expr='histogram_quantile(0.999, sum by (cluster, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))', + expr='histogram_quantile(0.999, sum by (cluster, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))', legendFormat='p999 - {{keyspace}}.{{table}}', ) ) @@ -276,19 +279,19 @@ dashboard.new( ) .addTarget( prometheus.target( - expr='histogram_quantile(0.98, sum by (cluster, dc, rack, instance, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))', + expr='histogram_quantile(0.98, sum by (cluster, dc, rack, instance, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))', legendFormat='p98 - {{instance}}', ) ) .addTarget( prometheus.target( - expr='histogram_quantile(0.99, sum by (cluster, dc, rack, instance, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))', + expr='histogram_quantile(0.99, sum by (cluster, dc, rack, instance, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))', legendFormat='p99 - {{instance}}', ) ) .addTarget( prometheus.target( - expr='histogram_quantile(0.999, sum by (cluster, dc, rack, instance, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))', + expr='histogram_quantile(0.999, sum by (cluster, dc, rack, instance, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))', legendFormat='p999 - {{instance}}', ) ) @@ -313,19 +316,19 @@ dashboard.new( ) .addTarget( prometheus.target( - expr='histogram_quantile(0.98, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))', + expr='histogram_quantile(0.98, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))', legendFormat='p98 - {{keyspace}}.{{table}} - {{instance}}', ) ) .addTarget( prometheus.target( - expr='histogram_quantile(0.99, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))', + expr='histogram_quantile(0.99, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))', legendFormat='p99 - {{keyspace}}.{{table}} - {{instance}}', ) ) .addTarget( prometheus.target( - expr='histogram_quantile(0.999, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))', + expr='histogram_quantile(0.999, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))', legendFormat='p999 - {{keyspace}}.{{table}} - {{instance}}', ) ) @@ -602,7 +605,7 @@ dashboard.new( .addPanel( graphPanel.new( 'Max & Average Partition Size per Table', - description='Max of SStable hit per read, for each table', + description='Max & Average of the partition sizes, for each table', format='bytes', datasource='$PROMETHEUS_DS', transparent=true, @@ -637,7 +640,7 @@ dashboard.new( .addPanel( graphPanel.new( 'Max & Average Partition Size per Node', - description='Max of SSTable hit per read, for each node', + description='Max & Average of the partition sizes, for each node', format='bytes', datasource='$PROMETHEUS_DS', transparent=true, @@ -672,7 +675,7 @@ dashboard.new( .addPanel( graphPanel.new( 'Max & Average Partition Size per Table and Node', - description='Max and mean partition size, for each combination of table and node', + description='Max & Average of the partition sizes, for each combination of table and node', format='bytes', datasource='$PROMETHEUS_DS', transparent=true, @@ -704,6 +707,132 @@ dashboard.new( ) ) +).addRow( + row.new(title='JVM / Garbage Collection') + .addPanel( + graphPanel.new( + 'Application Throughput (% time NOT doing GC)', + description='Percentage of the time the node is *not* doing a GC, thus Cassandra is not stopped for GC', + format='percentunit', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + decimals=2, + max=1, + ) + .addTarget( + prometheus.target( + 'max by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))', + legendFormat='max', + ) + ) + .addTarget( + prometheus.target( + 'min by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))', + legendFormat='min', + ) + ) + .addTarget( + prometheus.target( + 'avg by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))', + legendFormat='avg', + ) + ) + .addSeriesOverride(fillMinMaxSeriesOverrides) + .addSeriesOverride(removeMinlineSeriesOverrides) + ) + .addPanel( + graphPanel.new( + 'Garbage Collection Time', + description='Garbage collection duration', + format='ms', + datasource='$PROMETHEUS_DS', + transparent=true, + fill=0, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + ) + .addTarget( + prometheus.target( + 'max by (cluster) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='max', + ) + ) + .addTarget( + prometheus.target( + 'min by (cluster) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='min', + ) + ) + .addTarget( + prometheus.target( + 'avg by (cluster) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))', + legendFormat='avg', + ) + ) + .addSeriesOverride(fillMinMaxSeriesOverrides) + .addSeriesOverride(removeMinlineSeriesOverrides) + ) + .addPanel( + graphPanel.new( + 'JVM Heap Memory Utilisation', + description='Maximum JVM Heap Memory size (worst node) and minimum available heap size', + format='bytes', + datasource='$PROMETHEUS_DS', + transparent=true, + legend_show=true, + legend_values=true, + legend_current=true, + legend_alignAsTable=true, + legend_sort='current', + legend_sortDesc=true, + shared_tooltip=false, + fill=1, + linewidth=2, + ) + .addTarget( + prometheus.target( + 'max by (cluster) + (' + prefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='max', + ) + ) + .addTarget( + prometheus.target( + 'min by (cluster) + (' + prefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='min', + ) + ) + .addTarget( + prometheus.target( + 'avg by (cluster) + (' + prefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='avg', + ) + ) + .addTarget( + prometheus.target( + 'min by ( cluster) + (' + prefix + '_jvm_memory_max{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})', + legendFormat='Heap memory available', + ) + ) + .addSeriesOverride(fillMinMaxSeriesOverrides) + .addSeriesOverride(removeMinlineSeriesOverrides) + ) ) @@ -975,3 +1104,12 @@ dashboard.new( // Disk Utilization (io.util%) ) + +// TODO/Add + +// section OS/hardware section +// CPU usage +// Unix Load +// Context switching (collectd_contextswitch_total) +// System Memory +// Network diff --git a/dashboards/grafana/generated-dashboards/read-path.json b/dashboards/grafana/generated-dashboards/read-path.json index c325138..8635684 100644 --- a/dashboards/grafana/generated-dashboards/read-path.json +++ b/dashboards/grafana/generated-dashboards/read-path.json @@ -100,21 +100,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (cluster, keyspace, table) (rate(mcac_table_read_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m]))", + "expr": "sum by (cluster, keyspace, table) (rate(mcac_table_read_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Reads: {{keyspace}}.{{table}}", "refId": "A" }, { - "expr": "sum by (cluster, keyspace, table) (rate(mcac_table_range_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m]))", + "expr": "sum by (cluster, keyspace, table) (rate(mcac_table_range_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Ranges: {{keyspace}}.{{table}}", "refId": "B" }, { - "expr": "sum by (cluster, keyspace, table) (rate(mcac_table_cas_prepare_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m]))", + "expr": "sum by (cluster, keyspace, table) (rate(mcac_table_cas_prepare_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Cas Prepare: {{keyspace}}.{{table}}", @@ -198,21 +198,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (cluster, dc, rack, instance) (rate(mcac_table_read_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m]))", + "expr": "sum by (cluster, dc, rack, instance) (rate(mcac_table_read_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Reads: {{instance}}", "refId": "A" }, { - "expr": "sum by (cluster, dc, rack, instance) (rate(mcac_table_range_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m]))", + "expr": "sum by (cluster, dc, rack, instance) (rate(mcac_table_range_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Ranges: {{instance}}", "refId": "B" }, { - "expr": "sum by (cluster, dc, rack, instance) (rate(mcac_table_cas_prepare_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m]))", + "expr": "sum by (cluster, dc, rack, instance) (rate(mcac_table_cas_prepare_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Cas Prepare: {{instance}}", @@ -296,21 +296,21 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (cluster, dc, rack, instance, keyspace, table) (rate(mcac_table_read_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m]))", + "expr": "sum by (cluster, dc, rack, instance, keyspace, table) (rate(mcac_table_read_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Reads: {{keyspace}}.{{table}} on {{instance}}", "refId": "A" }, { - "expr": "sum by (cluster, dc, rack, instance, keyspace, table) (rate(mcac_table_range_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m]))", + "expr": "sum by (cluster, dc, rack, instance, keyspace, table) (rate(mcac_table_range_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Ranges: {{keyspace}}.{{table}} on {{instance}}", "refId": "B" }, { - "expr": "sum by (cluster, dc, rack, instance, keyspace, table) (rate(mcac_table_cas_prepare_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m]))", + "expr": "sum by (cluster, dc, rack, instance, keyspace, table) (rate(mcac_table_cas_prepare_latency_total{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Cas Prepare: {{keyspace}}.{{table}} on {{instance}}", @@ -407,21 +407,21 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.98, sum by (cluster, keyspace, table, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m])))", + "expr": "histogram_quantile(0.98, sum by (cluster, keyspace, table, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "p98 - {{keyspace}}.{{table}}", "refId": "A" }, { - "expr": "histogram_quantile(0.99, sum by (cluster, keyspace, table, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m])))", + "expr": "histogram_quantile(0.99, sum by (cluster, keyspace, table, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "p99 - {{keyspace}}.{{table}}", "refId": "B" }, { - "expr": "histogram_quantile(0.999, sum by (cluster, keyspace, table, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m])))", + "expr": "histogram_quantile(0.999, sum by (cluster, keyspace, table, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "p999 - {{keyspace}}.{{table}}", @@ -505,21 +505,21 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.98, sum by (cluster, dc, rack, instance, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m])))", + "expr": "histogram_quantile(0.98, sum by (cluster, dc, rack, instance, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "p98 - {{instance}}", "refId": "A" }, { - "expr": "histogram_quantile(0.99, sum by (cluster, dc, rack, instance, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m])))", + "expr": "histogram_quantile(0.99, sum by (cluster, dc, rack, instance, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "p99 - {{instance}}", "refId": "B" }, { - "expr": "histogram_quantile(0.999, sum by (cluster, dc, rack, instance, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m])))", + "expr": "histogram_quantile(0.999, sum by (cluster, dc, rack, instance, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "p999 - {{instance}}", @@ -603,21 +603,21 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.98, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m])))", + "expr": "histogram_quantile(0.98, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "p98 - {{keyspace}}.{{table}} - {{instance}}", "refId": "A" }, { - "expr": "histogram_quantile(0.99, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m])))", + "expr": "histogram_quantile(0.99, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "p99 - {{keyspace}}.{{table}} - {{instance}}", "refId": "B" }, { - "expr": "histogram_quantile(0.999, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[5m:1m])))", + "expr": "histogram_quantile(0.999, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(mcac_table_read_latency_bucket{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "p999 - {{keyspace}}.{{table}} - {{instance}}", @@ -1494,7 +1494,7 @@ "dashes": false, "datasource": "$PROMETHEUS_DS", "decimals": 0, - "description": "Max of SStable hit per read, for each table", + "description": "Max & Average of the partition sizes, for each table", "fill": 0, "fillGradient": 0, "gridPos": { }, @@ -1588,7 +1588,7 @@ "dashes": false, "datasource": "$PROMETHEUS_DS", "decimals": 0, - "description": "Max of SSTable hit per read, for each node", + "description": "Max & Average of the partition sizes, for each node", "fill": 0, "fillGradient": 0, "gridPos": { }, @@ -1682,7 +1682,7 @@ "dashes": false, "datasource": "$PROMETHEUS_DS", "decimals": 0, - "description": "Max and mean partition size, for each combination of table and node", + "description": "Max & Average of the partition sizes, for each combination of table and node", "fill": 0, "fillGradient": 0, "gridPos": { }, @@ -1789,7 +1789,7 @@ "dashes": false, "datasource": "$PROMETHEUS_DS", "decimals": 2, - "description": "Min of keyCache hit rates, for each table", + "description": "Percentage of the time the node is *not* doing a GC, thus Cassandra is not stopped for GC", "fill": 0, "fillGradient": 0, "gridPos": { }, @@ -1817,6 +1817,353 @@ "points": false, "renderer": "flot", "repeat": null, + "seriesOverrides": [ + { + "alias": "max", + "fillBelowTo": "min", + "lines": false + }, + { + "alias": "min", + "lines": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(mcac_jvm_gc_time{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s])) / 1000))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "max", + "refId": "A" + }, + { + "expr": "min by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(mcac_jvm_gc_time{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s])) / 1000))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "min", + "refId": "B" + }, + { + "expr": "avg by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(mcac_jvm_gc_time{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s])) / 1000))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "avg", + "refId": "C" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Application Throughput (% time NOT doing GC)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transparent": true, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "decimals": 2, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": null, + "show": true + }, + { + "decimals": 2, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$PROMETHEUS_DS", + "description": "Garbage collection duration", + "fill": 0, + "fillGradient": 0, + "gridPos": { }, + "id": 24, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "max", + "fillBelowTo": "min", + "lines": false + }, + { + "alias": "min", + "lines": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max by (cluster) (rate(mcac_jvm_gc_time{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "max", + "refId": "A" + }, + { + "expr": "min by (cluster) (rate(mcac_jvm_gc_time{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "min", + "refId": "B" + }, + { + "expr": "avg by (cluster) (rate(mcac_jvm_gc_time{cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"}[1m:30s]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "avg", + "refId": "C" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Garbage Collection Time", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transparent": true, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$PROMETHEUS_DS", + "description": "Maximum JVM Heap Memory size (worst node) and minimum available heap size", + "fill": 1, + "fillGradient": 0, + "gridPos": { }, + "id": 25, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "max", + "fillBelowTo": "min", + "lines": false + }, + { + "alias": "min", + "lines": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max by (cluster)\n (mcac_jvm_memory_used{memory_type=\"heap\", cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "max", + "refId": "A" + }, + { + "expr": "min by (cluster)\n (mcac_jvm_memory_used{memory_type=\"heap\", cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "min", + "refId": "B" + }, + { + "expr": "avg by (cluster)\n (mcac_jvm_memory_used{memory_type=\"heap\", cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "avg", + "refId": "C" + }, + { + "expr": "min by ( cluster)\n (mcac_jvm_memory_max{memory_type=\"heap\", cluster=\"$cluster\", dc=~\"$dc\", rack=~\"$rack\", instance=~\"$node\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Heap memory available", + "refId": "D" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "JVM Heap Memory Utilisation", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "transparent": true, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "JVM / Garbage Collection", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$PROMETHEUS_DS", + "decimals": 2, + "description": "Min of keyCache hit rates, for each table", + "fill": 0, + "fillGradient": 0, + "gridPos": { }, + "id": 26, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, "seriesOverrides": [ ], "spaceLength": 10, "stack": false, @@ -1880,7 +2227,7 @@ "fill": 0, "fillGradient": 0, "gridPos": { }, - "id": 24, + "id": 27, "legend": { "alignAsTable": true, "avg": false, @@ -1967,7 +2314,7 @@ "fill": 0, "fillGradient": 0, "gridPos": { }, - "id": 25, + "id": 28, "legend": { "alignAsTable": true, "avg": false, @@ -2091,7 +2438,7 @@ "fill": 0, "fillGradient": 0, "gridPos": { }, - "id": 26, + "id": 29, "legend": { "alignAsTable": true, "avg": false, @@ -2178,7 +2525,7 @@ "fill": 0, "fillGradient": 0, "gridPos": { }, - "id": 27, + "id": 30, "legend": { "alignAsTable": true, "avg": false, @@ -2265,7 +2612,7 @@ "fill": 0, "fillGradient": 0, "gridPos": { }, - "id": 28, + "id": 31, "legend": { "alignAsTable": true, "avg": false, @@ -2365,7 +2712,7 @@ "fill": 0, "fillGradient": 0, "gridPos": { }, - "id": 29, + "id": 32, "legend": { "alignAsTable": true, "avg": false, @@ -2452,7 +2799,7 @@ "fill": 0, "fillGradient": 0, "gridPos": { }, - "id": 30, + "id": 33, "legend": { "alignAsTable": true, "avg": false, @@ -2542,7 +2889,9 @@ "style": "dark", "tags": [ "Cassandra", - "Read-Path" + "Read", + "Read-Path", + "Select" ], "templating": { "list": [