Alain/38 add write path dashboard #45

Open — wants to merge 3 commits into base: master
1,279 changes: 1,279 additions & 0 deletions dashboards/demo/demo-cassandra.yaml

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions dashboards/demo/docker-compose.yml
@@ -40,6 +40,7 @@ services:
volumes:
- "mcac_data:/mcac"
- "../../config:/mcac/config:ro"
- "./demo-cassandra.yaml:/etc/cassandra/cassandra.yaml"
environment:
MAX_HEAP_SIZE: "500M"
HEAP_NEWSIZE: "100M"
@@ -64,6 +65,7 @@ services:
volumes:
- "mcac_data:/mcac"
- "../../config:/mcac/config:ro"
- "./demo-cassandra.yaml:/etc/cassandra/cassandra.yaml"
networks:
- demo_net
healthcheck:
@@ -89,7 +91,7 @@ services:
- demo_net
environment:
- "TLP_STRESS_CASSANDRA_HOST=cassandra"
command: "run KeyValue --rate 30 -d 1d -r .8"
command: "run KeyValue --rate 250 -p 100 -d 1d -r .8"
depends_on:
cassandra2:
condition: service_healthy
Expand All @@ -102,7 +104,7 @@ services:
- demo_net
environment:
- "TLP_STRESS_CASSANDRA_HOST=cassandra"
command: "run BasicTimeSeries --rate 20 -d 1d -r .9"
command: "run BasicTimeSeries --rate 200 -p 10 -d 1d -r .1 --deletes .5"
depends_on:
cassandra2:
condition: service_healthy
54 changes: 29 additions & 25 deletions dashboards/grafana/dashboards-jsonnet/overview.jsonnet
@@ -71,9 +71,10 @@ dashboard.new(
template.new(
'dc',
'$PROMETHEUS_DS',
'label_values(collectd_collectd_queue_length{cluster=~"$cluster"}, dc)',
'label_values(collectd_collectd_queue_length{cluster="$cluster"}, dc)',
label='DataCenter',
refresh='time',
multi=true,
includeAll=true,
allValues=".*",
)
@@ -82,9 +83,10 @@ dashboard.new(
template.new(
'rack',
'$PROMETHEUS_DS',
'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc"}, rack)',
'label_values(collectd_collectd_queue_length{cluster="$cluster", dc=~"$dc"}, rack)',
label='Rack',
refresh='time',
multi=true,
includeAll=true,
allValues=".*",
)
@@ -93,9 +95,10 @@ dashboard.new(
template.new(
'node',
'$PROMETHEUS_DS',
'label_values(collectd_collectd_queue_length{cluster=~"$cluster", dc=~"$dc", rack=~"$rack"}, instance)',
'label_values(collectd_collectd_queue_length{cluster="$cluster", dc=~"$dc", rack=~"$rack"}, instance)',
label='Node',
refresh='time',
multi=true,
includeAll=true,
allValues=".*",
)
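
For orientation, here is a minimal sketch of the pattern these template changes rely on, reusing the `template.new` call and the `collectd_collectd_queue_length` metric shown above; the `nodeTemplate` name and the standalone form are illustrative, not part of the PR:

// Illustrative only: a multi-select variable must be matched with =~ in PromQL,
// because Grafana expands $node to a pipe-joined regex such as "host1|host2".
local nodeTemplate = template.new(
  'node',
  '$PROMETHEUS_DS',
  'label_values(collectd_collectd_queue_length{cluster="$cluster", dc=~"$dc", rack=~"$rack"}, instance)',
  label='Node',
  refresh='time',
  multi=true,        // added in this PR: allow selecting several nodes at once
  includeAll=true,
  allValues='.*',    // "All" expands to a match-everything regex
);

// Panel queries therefore keep dc=~"$dc", rack=~"$rack", instance=~"$node" (regex
// matches), while the single-value $cluster variable can use cluster="$cluster".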
@@ -133,7 +136,7 @@ dashboard.new(
)
.addTarget(
prometheus.target(
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='{{request_type}}',
)
)
@@ -157,37 +160,37 @@ dashboard.new(
)
.addTarget(
prometheus.target(
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_failures_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_failures_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='{{request_type}} failures',
)
)
.addTarget(
prometheus.target(
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_timeouts_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_timeouts_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='{{request_type}} timeouts',
)
)
.addTarget(
prometheus.target(
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_unavailables_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_unavailables_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='{{request_type}} unavailable errors',
)
)
.addTarget(
prometheus.target(
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_unfinished_commit_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_unfinished_commit_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='{{request_type}} unfinished commit errors',
)
)
.addTarget(
prometheus.target(
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_condition_not_met_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_condition_not_met_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='{{request_type}} condition not met errors',
)
)
.addTarget(
prometheus.target(
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_contention_histogram_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_contention_histogram_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
legendFormat='{{request_type}} contention histogram errors',
)
)
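
The six targets above all follow the same shape: a per-request-type rate over one of the `_client_request_<kind>_total` counters. A hypothetical jsonnet comprehension (not part of this PR; `errorCounters` and `errorTargets` are made-up names) that captures the pattern, assuming the same `prefix` and `prometheus.target` helpers:

// Hypothetical refactor sketch: one target per error counter suffix.
local errorCounters = [
  ['failures', 'failures'],
  ['timeouts', 'timeouts'],
  ['unavailables', 'unavailable errors'],
  ['unfinished_commit', 'unfinished commit errors'],
  ['condition_not_met', 'condition not met errors'],
  ['contention_histogram', 'contention histogram errors'],
];

local errorTargets = [
  prometheus.target(
    expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_' + c[0]
         + '_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"}[1m:30s]))',
    legendFormat='{{request_type}} ' + c[1],
  )
  for c in errorCounters
];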
@@ -223,7 +226,9 @@ dashboard.new(
)
.addTarget(
prometheus.target(
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s])) / ignoring (request_type) (sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s])) + ignoring (request_type) sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[1m:30s])))',
expr='sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s]))'
+ ' / ignoring (request_type) (sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s]))'
+ ' + ignoring (request_type) sum by (cluster, request_type) (rate(' + prefix + '_client_request_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[1m:30s])))',
)
)
)
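
The expression above computes the read share of coordinator traffic — the read request rate divided by the read plus write request rate — with `ignoring (request_type)` letting the division and addition match vectors whose `request_type` labels differ. A hypothetical helper (the `requestRate` and `readRatioExpr` names are illustrative, not from this PR) that makes the structure explicit:

// Hypothetical sketch: build read / (read + write) from one rate() template.
local requestRate(type) =
  'sum by (cluster, request_type) (rate(' + prefix
  + '_client_request_latency_total{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="' + type + '"}[1m:30s]))';

local readRatioExpr =
  requestRate('read')
  + ' / ignoring (request_type) (' + requestRate('read')
  + ' + ignoring (request_type) ' + requestRate('write') + ')';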
@@ -246,19 +251,19 @@ dashboard.new(
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.98, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[5m])) by (le, cluster))',
expr='histogram_quantile(0.98, sum by (le, cluster) (rate(' + prefix + '_client_request_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s])))',
legendFormat='p98',
)
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.99, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[5m])) by (le, cluster))',
expr='histogram_quantile(0.99, sum by (le, cluster) (rate(' + prefix + '_client_request_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s])))',
legendFormat='p99',
)
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.999, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[5m])) by (le, cluster))',
expr='histogram_quantile(0.999, sum by (le, cluster) (rate(' + prefix + '_client_request_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="read"}[1m:30s])))',
legendFormat='p999',
)
)
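
The three percentile targets above (and the write-latency targets below) differ only in the quantile and the request type; the change in each is switching the raw `[5m]` range to the `[1m:30s]` subquery used elsewhere and writing the aggregation as `sum by (le, cluster) (...)`. A hypothetical helper capturing that repetition (the `latencyQuantileTarget` name is illustrative, not part of this PR):

// Hypothetical sketch: one helper per (quantile, request_type) pair.
local latencyQuantileTarget(q, type, legend) =
  prometheus.target(
    expr='histogram_quantile(' + q + ', sum by (le, cluster) (rate(' + prefix
         + '_client_request_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="' + type + '"}[1m:30s])))',
    legendFormat=legend,
  );

// e.g. latencyQuantileTarget('0.99', 'read', 'p99')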
@@ -284,19 +289,19 @@ dashboard.new(
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.98, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))',
expr='histogram_quantile(0.98, sum by (le, cluster) (rate(' + prefix + '_client_request_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[1m:30s])))',
legendFormat='p98',
)
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.99, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))',
expr='histogram_quantile(0.99, sum by (le, cluster) (rate(' + prefix + '_client_request_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[1m:30s])))',
legendFormat='p99',
)
)
.addTarget(
prometheus.target(
expr='histogram_quantile(0.999, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[5m])) by (le, cluster))',
expr='histogram_quantile(0.999, sum by (le, cluster) (rate(' + prefix + '_client_request_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type="write"}[1m:30s])))',
legendFormat='p999',
)
)
@@ -323,7 +328,7 @@ dashboard.new(
.addTarget(
prometheus.target(
# With request_type!~"write|read|.*-.*" we exclude the read/write charts above and all per-consistency_level series like "Read-LOCAL_ONE"
expr='histogram_quantile(0.99, sum(rate(' + prefix + '_client_request_latency_bucket{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type!~"write|read|.*-.*"}[1m:30s])) by (le, request_type, cluster))',
expr='histogram_quantile(0.99, sum by (le, request_type, cluster) (rate(' + prefix + '_client_request_latency_bucket{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", request_type!~"write|read|.*-.*"}[1m:30s])))',
legendFormat='p99 {{request_type}}'
)
)
@@ -369,7 +374,7 @@ dashboard.new(
)
.addTarget(
prometheus.target(
'max by (cluster, dc, rack, instance) (changes(' + prefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="gossip_stage"}[2m:30s])) > bool 0',
'max by (cluster, dc, rack, instance) (changes(' + prefix + '_thread_pools_completed_tasks{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="gossip_stage"}[2m:30s])) > bool 0',
legendFormat='{{instance}}',
instant=true,
)
@@ -423,13 +428,13 @@ dashboard.new(
)
.addTarget(
prometheus.target(
expr='sum by (cluster) (max by (cluster, datacenter, rack, instance) (changes(' + prefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="native"}[2m:30s])) > bool 0)',
expr='sum by (cluster) (max by (cluster, dc, rack, instance) (changes(' + prefix + '_thread_pools_completed_tasks{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="native"}[2m:30s])) > bool 0)',
legendFormat='Nodes Coordinating Requests (Native protocol)',
)
)
.addTarget(
prometheus.target(
expr='sum by (cluster) (max by (cluster, datacenter, rack, instance) (changes(' + prefix + '_thread_pools_completed_tasks{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="gossip_stage"}[2m:30s])) > bool 0)',
expr='sum by (cluster) (max by (cluster, dc, rack, instance) (changes(' + prefix + '_thread_pools_completed_tasks{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="gossip_stage"}[2m:30s])) > bool 0)',
legendFormat='Nodes With Internal Activity (Gossip protocol)',
)
)
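
Both targets above use the same idiom: `changes(...[2m:30s]) > bool 0` turns per-node thread-pool activity into a 0/1 flag, and the outer sum counts active nodes per cluster; the change here also renames the inner grouping label from `datacenter` to `dc` to match the labels used elsewhere in the dashboard. A hypothetical sketch of the idiom (the `activeNodesExpr` name is illustrative, not from this PR):

// Hypothetical sketch of the "count active nodes" idiom used above:
//   changes(counter[2m:30s]) > bool 0   -> 1 if the pool completed any task, else 0
//   sum by (cluster) (...)              -> number of such nodes per cluster
local activeNodesExpr(pool) =
  'sum by (cluster) (max by (cluster, dc, rack, instance) (changes(' + prefix
  + '_thread_pools_completed_tasks{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node", pool_name="' + pool + '"}[2m:30s])) > bool 0)';

// e.g. activeNodesExpr('native') or activeNodesExpr('gossip_stage')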
@@ -589,19 +594,19 @@ dashboard.new(
)
.addTarget(
prometheus.target(
expr='max by (cluster) (' + prefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
expr='max by (cluster) (' + prefix + '_table_pending_compactions{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
legendFormat='max',
)
)
.addTarget(
prometheus.target(
expr='min by (cluster) (' + prefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
expr='min by (cluster) (' + prefix + '_table_pending_compactions{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
legendFormat='min',
)
)
.addTarget(
prometheus.target(
expr='avg by (cluster) (' + prefix + '_table_pending_compactions{cluster=~"$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
expr='avg by (cluster) (' + prefix + '_table_pending_compactions{cluster="$cluster", dc=~"$dc", rack=~"$rack", instance=~"$node"})',
legendFormat='avg',
)
)
@@ -998,7 +1003,6 @@ dashboard.new(
legend_sortDesc=true,
shared_tooltip=false,
decimals=2,
min=0,
max=1,
)
.addTarget(