Skip to content

Commit

Permalink
Fixes to read path dashboard
Browse files Browse the repository at this point in the history
  • Loading branch information
arodrime committed Mar 17, 2021
1 parent e05973d commit 177fb45
Show file tree
Hide file tree
Showing 2 changed files with 543 additions and 56 deletions.
190 changes: 164 additions & 26 deletions dashboards/grafana/dashboards-jsonnet/read-path.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,27 @@ local template = grafana.template;
local row = grafana.row;

local graphPanel = grafana.graphPanel;
local tablePanel = grafana.tablePanel;
local singleStatPanel = grafana.singlestat;
local textPanel = grafana.text;
local polystatPanel = grafana.polystatPanel;

local prefix = std.extVar('prefix');

// Grafana graphPanel series override: suppress drawing the p999 line
// itself and instead shade the band between the p98 and p999 series,
// visualising the tail-latency spread.
local fillLatencySeriesOverrides = {
  alias: 'p999',
  fillBelowTo: 'p98',
  lines: false,
};

// Grafana graphPanel series override: hide the "max" line and shade the
// band between the "min" and "max" series, showing the cluster-wide
// spread around the plotted average.
local fillMinMaxSeriesOverrides = {
  alias: 'max',
  fillBelowTo: 'min',
  lines: false,
};

// Grafana graphPanel series override: stop rendering the "min" series as
// its own line (it only serves as the lower bound of the min/max band
// produced by fillMinMaxSeriesOverrides-style shading).
local removeMinlineSeriesOverrides = {
  alias: 'min',
  lines: false,
};


// used in the single stat panels where higher is better - cache hit rates for example
local reversedColors =[
'#d44a3a',
Expand All @@ -36,7 +39,7 @@ dashboard.new(
refresh='30s',
time_from='now-30m',
editable=true,
tags=['Cassandra', 'Read-Path'],
tags=['Cassandra', 'Read', 'Read-Path', 'Select'],
style='dark'
)
.addTemplate(
Expand Down Expand Up @@ -125,19 +128,19 @@ dashboard.new(
)
.addTarget(
prometheus.target(
expr='sum by (cluster, keyspace, table) (rate(' + prefix + '_table_read_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))',
expr='sum by (cluster, keyspace, table) (rate(' + prefix + '_table_read_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='Reads: {{keyspace}}.{{table}}',
)
)
.addTarget(
prometheus.target(
expr='sum by (cluster, keyspace, table) (rate(' + prefix + '_table_range_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))',
expr='sum by (cluster, keyspace, table) (rate(' + prefix + '_table_range_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='Ranges: {{keyspace}}.{{table}}',
)
)
.addTarget(
prometheus.target(
expr='sum by (cluster, keyspace, table) (rate(' + prefix + '_table_cas_prepare_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))',
expr='sum by (cluster, keyspace, table) (rate(' + prefix + '_table_cas_prepare_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='Cas Prepare: {{keyspace}}.{{table}}',
)
)
Expand All @@ -161,19 +164,19 @@ dashboard.new(
)
.addTarget(
prometheus.target(
expr='sum by (cluster, dc, rack, instance) (rate(' + prefix + '_table_read_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))',
expr='sum by (cluster, dc, rack, instance) (rate(' + prefix + '_table_read_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='Reads: {{instance}}',
)
)
.addTarget(
prometheus.target(
expr='sum by (cluster, dc, rack, instance) (rate(' + prefix + '_table_range_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))',
expr='sum by (cluster, dc, rack, instance) (rate(' + prefix + '_table_range_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='Ranges: {{instance}}',
)
)
.addTarget(
prometheus.target(
expr='sum by (cluster, dc, rack, instance) (rate(' + prefix + '_table_cas_prepare_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))',
expr='sum by (cluster, dc, rack, instance) (rate(' + prefix + '_table_cas_prepare_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='Cas Prepare: {{instance}}',
)
)
Expand All @@ -197,19 +200,19 @@ dashboard.new(
)
.addTarget(
prometheus.target(
expr='sum by (cluster, dc, rack, instance, keyspace, table) (rate(' + prefix + '_table_read_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))',
expr='sum by (cluster, dc, rack, instance, keyspace, table) (rate(' + prefix + '_table_read_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='Reads: {{keyspace}}.{{table}} on {{instance}}',
)
)
.addTarget(
prometheus.target(
expr='sum by (cluster, dc, rack, instance, keyspace, table) (rate(' + prefix + '_table_range_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))',
expr='sum by (cluster, dc, rack, instance, keyspace, table) (rate(' + prefix + '_table_range_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='Ranges: {{keyspace}}.{{table}} on {{instance}}',
)
)
.addTarget(
prometheus.target(
expr='sum by (cluster, dc, rack, instance, keyspace, table) (rate(' + prefix + '_table_cas_prepare_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m]))',
expr='sum by (cluster, dc, rack, instance, keyspace, table) (rate(' + prefix + '_table_cas_prepare_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='Cas Prepare: {{keyspace}}.{{table}} on {{instance}}',
)
)
Expand Down Expand Up @@ -239,19 +242,19 @@ dashboard.new(
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.98, sum by (cluster, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))',
expr='histogram_quantile(0.98, sum by (cluster, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))',
legendFormat='p98 - {{keyspace}}.{{table}}',
)
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.99, sum by (cluster, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))',
expr='histogram_quantile(0.99, sum by (cluster, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))',
legendFormat='p99 - {{keyspace}}.{{table}}',
)
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.999, sum by (cluster, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))',
expr='histogram_quantile(0.999, sum by (cluster, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))',
legendFormat='p999 - {{keyspace}}.{{table}}',
)
)
Expand All @@ -276,19 +279,19 @@ dashboard.new(
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.98, sum by (cluster, dc, rack, instance, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))',
expr='histogram_quantile(0.98, sum by (cluster, dc, rack, instance, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))',
legendFormat='p98 - {{instance}}',
)
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.99, sum by (cluster, dc, rack, instance, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))',
expr='histogram_quantile(0.99, sum by (cluster, dc, rack, instance, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))',
legendFormat='p99 - {{instance}}',
)
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.999, sum by (cluster, dc, rack, instance, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))',
expr='histogram_quantile(0.999, sum by (cluster, dc, rack, instance, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))',
legendFormat='p999 - {{instance}}',
)
)
Expand All @@ -313,19 +316,19 @@ dashboard.new(
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.98, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))',
expr='histogram_quantile(0.98, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))',
legendFormat='p98 - {{keyspace}}.{{table}} - {{instance}}',
)
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.99, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))',
expr='histogram_quantile(0.99, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))',
legendFormat='p99 - {{keyspace}}.{{table}} - {{instance}}',
)
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.999, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[5m:1m])))',
expr='histogram_quantile(0.999, sum by (cluster, dc, rack, instance, keyspace, table, le) (rate(' + prefix + '_table_read_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])))',
legendFormat='p999 - {{keyspace}}.{{table}} - {{instance}}',
)
)
Expand Down Expand Up @@ -602,7 +605,7 @@ dashboard.new(
.addPanel(
graphPanel.new(
'Max & Average Partition Size per Table',
description='Max of SStable hit per read, for each table',
description='Max & Average of the partition sizes, for each table',
format='bytes',
datasource='$PROMETHEUS_DS',
transparent=true,
Expand Down Expand Up @@ -637,7 +640,7 @@ dashboard.new(
.addPanel(
graphPanel.new(
'Max & Average Partition Size per Node',
description='Max of SSTable hit per read, for each node',
description='Max & Average of the partition sizes, for each node',
format='bytes',
datasource='$PROMETHEUS_DS',
transparent=true,
Expand Down Expand Up @@ -672,7 +675,7 @@ dashboard.new(
.addPanel(
graphPanel.new(
'Max & Average Partition Size per Table and Node',
description='Max and mean partition size, for each combination of table and node',
description='Max & Average of the partition sizes, for each combination of table and node',
format='bytes',
datasource='$PROMETHEUS_DS',
transparent=true,
Expand Down Expand Up @@ -704,6 +707,132 @@ dashboard.new(
)
)

).addRow(
row.new(title='JVM / Garbage Collection')
.addPanel(
graphPanel.new(
'Application Throughput (% time NOT doing GC)',
description='Percentage of the time the node is *not* doing a GC, thus Cassandra is not stopped for GC',
format='percentunit',
datasource='$PROMETHEUS_DS',
transparent=true,
fill=0,
legend_show=true,
legend_values=true,
legend_current=true,
legend_alignAsTable=true,
legend_sort='current',
legend_sortDesc=true,
shared_tooltip=false,
decimals=2,
max=1,
)
.addTarget(
prometheus.target(
'max by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))',
legendFormat='max',
)
)
.addTarget(
prometheus.target(
'min by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))',
legendFormat='min',
)
)
.addTarget(
prometheus.target(
'avg by (cluster) (1 - (sum by (cluster, dc, rack, instance) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s])) / 1000))',
legendFormat='avg',
)
)
.addSeriesOverride(fillMinMaxSeriesOverrides)
.addSeriesOverride(removeMinlineSeriesOverrides)
)
.addPanel(
graphPanel.new(
'Garbage Collection Time',
description='Garbage collection duration',
format='ms',
datasource='$PROMETHEUS_DS',
transparent=true,
fill=0,
legend_show=true,
legend_values=true,
legend_current=true,
legend_alignAsTable=true,
legend_sort='current',
legend_sortDesc=true,
shared_tooltip=false,
)
.addTarget(
prometheus.target(
'max by (cluster) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='max',
)
)
.addTarget(
prometheus.target(
'min by (cluster) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='min',
)
)
.addTarget(
prometheus.target(
'avg by (cluster) (rate(' + prefix + '_jvm_gc_time{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='avg',
)
)
.addSeriesOverride(fillMinMaxSeriesOverrides)
.addSeriesOverride(removeMinlineSeriesOverrides)
)
.addPanel(
graphPanel.new(
'JVM Heap Memory Utilisation',
description='Maximum JVM Heap Memory size (worst node) and minimum available heap size',
format='bytes',
datasource='$PROMETHEUS_DS',
transparent=true,
legend_show=true,
legend_values=true,
legend_current=true,
legend_alignAsTable=true,
legend_sort='current',
legend_sortDesc=true,
shared_tooltip=false,
fill=1,
linewidth=2,
)
.addTarget(
prometheus.target(
'max by (cluster)
(' + prefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
legendFormat='max',
)
)
.addTarget(
prometheus.target(
'min by (cluster)
(' + prefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
legendFormat='min',
)
)
.addTarget(
prometheus.target(
'avg by (cluster)
(' + prefix + '_jvm_memory_used{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
legendFormat='avg',
)
)
.addTarget(
prometheus.target(
'min by ( cluster)
(' + prefix + '_jvm_memory_max{memory_type="heap", cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
legendFormat='Heap memory available',
)
)
.addSeriesOverride(fillMinMaxSeriesOverrides)
.addSeriesOverride(removeMinlineSeriesOverrides)
)
)


Expand Down Expand Up @@ -975,3 +1104,12 @@ dashboard.new(
// Disk Utilization (io.util%)

)

// TODO/Add

// section OS/hardware section
// CPU usage
// Unix Load
// Context switching (collectd_contextswitch_total)
// System Memory
// Network
Loading

0 comments on commit 177fb45

Please sign in to comment.