From 6738bb970bdb374f3c135cb106f639695d7ca5b0 Mon Sep 17 00:00:00 2001 From: Arun Kumar Mohan Date: Thu, 19 Sep 2024 19:12:47 +0530 Subject: [PATCH] Fix 'ceph_disk_occupation' query expressions Address changes in the 'ceph_disk_occupation' metric labels. What is the change in the 'ceph_disk_occupation' metric? The 'ceph_disk_occupation' result no longer has the 'exported_instance' label; instead, it has the 'instance' label. What is the issue we are facing because of it? We are hitting 'PrometheusRuleFailures' due to this label change in our alerts / rules where this metric is used. The second issue is that we are not seeing any results for some of the query expressions. What is the solution? Update the query expressions: change 'exported_instance' to 'instance'. Any 'label_replace' action which changes the 'exported_instance' label to the 'instance' label is no longer required (as the 'instance' label is directly available now). Signed-off-by: Arun Kumar Mohan --- .../prometheus/localcephrules.yaml | 8 +++---- .../deploy/prometheus-ocs-rules-external.yaml | 24 +++++-------------- metrics/deploy/prometheus-ocs-rules.yaml | 24 +++++-------------- 3 files changed, 16 insertions(+), 40 deletions(-) diff --git a/controllers/storagecluster/prometheus/localcephrules.yaml b/controllers/storagecluster/prometheus/localcephrules.yaml index ff6e4ae1cf..9c40364ae0 100644 --- a/controllers/storagecluster/prometheus/localcephrules.yaml +++ b/controllers/storagecluster/prometheus/localcephrules.yaml @@ -11,10 +11,10 @@ spec: - name: ceph.rules rules: - expr: | - kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max by (node, namespace) (label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) + kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max by (node, namespace) 
(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","instance","(.*)")) record: cluster:ceph_node_down:join_kube - expr: | - avg by (namespace) (topk by (ceph_daemon, namespace) (1, label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon, namespace) topk by (instance, device, namespace) (1,(irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m]))))) + avg by (namespace) (topk by (ceph_daemon, namespace) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon, namespace) topk by (instance, device, namespace) (1,(irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m]))))) record: cluster:ceph_disk_latency:join_ceph_node_disk_irate1m - name: telemeter.rules rules: @@ -171,7 +171,7 @@ spec: storage_type: ceph runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/CephOSDDiskNotResponding.md expr: | - label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon, namespace, managedBy) group_left(host, device) label_replace(ceph_disk_occupation{job=~"rook-ceph-mgr|rook-ceph-mgr-external"},"host","$1","exported_instance","(.*)") + label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon, namespace, managedBy) group_left(host, device) label_replace(ceph_disk_occupation{job=~"rook-ceph-mgr|rook-ceph-mgr-external"},"host","$1","instance","(.*)") for: 15m labels: severity: critical @@ -183,7 
+183,7 @@ spec: storage_type: ceph runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/CephOSDDiskUnavailable.md expr: | - label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon, namespace, managedBy) group_left(host, device) label_replace(ceph_disk_occupation{job=~"rook-ceph-mgr|rook-ceph-mgr-external"},"host","$1","exported_instance","(.*)") + label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon, namespace, managedBy) group_left(host, device) label_replace(ceph_disk_occupation{job=~"rook-ceph-mgr|rook-ceph-mgr-external"},"host","$1","instance","(.*)") for: 1m labels: severity: critical diff --git a/metrics/deploy/prometheus-ocs-rules-external.yaml b/metrics/deploy/prometheus-ocs-rules-external.yaml index 0e34c6ab51..dbce203342 100644 --- a/metrics/deploy/prometheus-ocs-rules-external.yaml +++ b/metrics/deploy/prometheus-ocs-rules-external.yaml @@ -10,17 +10,11 @@ spec: groups: - name: ocs_performance.rules rules: - - expr: "sum by (namespace, managedBy) (\n topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"}, - \"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\", - \"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left topk by - (instance,device) \n (1,\n (\n rate(node_disk_read_time_seconds_total[1m]) - / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))\n )\n )\n)\n" + - expr: | + sum by (namespace, managedBy) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left topk by (instance,device) (1, (rate(node_disk_read_time_seconds_total[1m]) / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))))) record: cluster:ceph_disk_latency_read:join_ceph_node_disk_rate1m - - expr: "sum by (namespace, managedBy) (\n topk 
by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"}, - \"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\", - \"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left topk by - (instance,device) \n (1,\n (\n rate(node_disk_write_time_seconds_total[1m]) - / (clamp_min(rate(node_disk_writes_completed_total[1m]), 1))\n )\n )\n)\n" + - expr: | + sum by (namespace, managedBy) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left topk by (instance,device) (1, (rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]), 1))))) record: cluster:ceph_disk_latency_write:join_ceph_node_disk_rate1m - name: ODF_standardized_metrics.rules rules: @@ -54,14 +48,8 @@ spec: system_type: OCS system_vendor: Red Hat record: odf_system_throughput_total_bytes - - expr: "sum by (namespace, managedBy, job, service)\n(\n topk by (ceph_daemon) - (1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"}, - \"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\", - \"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left() topk by - (instance,device) \n (1,\n (\n ( \n rate(node_disk_read_time_seconds_total[1m]) - / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))\n ) +\n (\n - \ rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]), - 1))\n )\n )\n )\n)\n" + - expr: | + sum by (namespace, managedBy, job, service) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left() topk by (instance,device) (1, ((rate(node_disk_read_time_seconds_total[1m]) / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))) + (rate(node_disk_write_time_seconds_total[1m]) / 
(clamp_min(rate(node_disk_writes_completed_total[1m]), 1)))))) labels: system_type: OCS system_vendor: Red Hat diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml index 4e42e420ce..85d1a27346 100644 --- a/metrics/deploy/prometheus-ocs-rules.yaml +++ b/metrics/deploy/prometheus-ocs-rules.yaml @@ -10,17 +10,11 @@ spec: groups: - name: ocs_performance.rules rules: - - expr: "sum by (namespace, managedBy) (\n topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"}, - \"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\", - \"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left topk by - (instance,device) \n (1,\n (\n rate(node_disk_read_time_seconds_total[1m]) - / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))\n )\n )\n)\n" + - expr: | + sum by (namespace, managedBy) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left topk by (instance,device) (1, (rate(node_disk_read_time_seconds_total[1m]) / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))))) record: cluster:ceph_disk_latency_read:join_ceph_node_disk_rate1m - - expr: "sum by (namespace, managedBy) (\n topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"}, - \"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\", - \"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left topk by - (instance,device) \n (1,\n (\n rate(node_disk_write_time_seconds_total[1m]) - / (clamp_min(rate(node_disk_writes_completed_total[1m]), 1))\n )\n )\n)\n" + - expr: | + sum by (namespace, managedBy) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left topk by (instance,device) (1, (rate(node_disk_write_time_seconds_total[1m]) / 
(clamp_min(rate(node_disk_writes_completed_total[1m]), 1))))) record: cluster:ceph_disk_latency_write:join_ceph_node_disk_rate1m - name: ODF_standardized_metrics.rules rules: @@ -54,14 +48,8 @@ spec: system_type: OCS system_vendor: Red Hat record: odf_system_throughput_total_bytes - - expr: "sum by (namespace, managedBy, job, service)\n(\n topk by (ceph_daemon) - (1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"}, - \"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\", - \"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left() topk by - (instance,device) \n (1,\n (\n ( \n rate(node_disk_read_time_seconds_total[1m]) - / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))\n ) +\n (\n - \ rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]), - 1))\n )\n )\n )\n)\n" + - expr: | + sum by (namespace, managedBy, job, service) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left() topk by (instance,device) (1, ((rate(node_disk_read_time_seconds_total[1m]) / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))) + (rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]), 1)))))) labels: system_type: OCS system_vendor: Red Hat