Skip to content

Commit

Permalink
Merge pull request #2933 from openshift-cherrypick-robot/cherry-pick-2812-to-release-4.18
Browse files: browse the repository at this point in the history

[release-4.18] DFBUGS-1188: Fix 'ceph_disk_occupation' query expressions
  • Loading branch information
openshift-merge-bot[bot] authored Jan 6, 2025
2 parents 8b4416e + 6738bb9 commit da4df69
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 40 deletions.
8 changes: 4 additions & 4 deletions controllers/storagecluster/prometheus/localcephrules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ spec:
- name: ceph.rules
rules:
- expr: |
kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max by (node, namespace) (label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)"))
kube_node_status_condition{condition="Ready",job="kube-state-metrics",status="true"} * on (node) group_right() max by (node, namespace) (label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","instance","(.*)"))
record: cluster:ceph_node_down:join_kube
- expr: |
avg by (namespace) (topk by (ceph_daemon, namespace) (1, label_replace(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "instance", "$1", "exported_instance", "(.*)"), "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon, namespace) topk by (instance, device, namespace) (1,(irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m])))))
avg by (namespace) (topk by (ceph_daemon, namespace) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_right(ceph_daemon, namespace) topk by (instance, device, namespace) (1,(irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (clamp_min(irate(node_disk_reads_completed_total[1m]), 1) + irate(node_disk_writes_completed_total[1m])))))
record: cluster:ceph_disk_latency:join_ceph_node_disk_irate1m
- name: telemeter.rules
rules:
Expand Down Expand Up @@ -171,7 +171,7 @@ spec:
storage_type: ceph
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/CephOSDDiskNotResponding.md
expr: |
label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon, namespace, managedBy) group_left(host, device) label_replace(ceph_disk_occupation{job=~"rook-ceph-mgr|rook-ceph-mgr-external"},"host","$1","exported_instance","(.*)")
label_replace((ceph_osd_in == 1 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon, namespace, managedBy) group_left(host, device) label_replace(ceph_disk_occupation{job=~"rook-ceph-mgr|rook-ceph-mgr-external"},"host","$1","instance","(.*)")
for: 15m
labels:
severity: critical
Expand All @@ -183,7 +183,7 @@ spec:
storage_type: ceph
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/CephOSDDiskUnavailable.md
expr: |
label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon, namespace, managedBy) group_left(host, device) label_replace(ceph_disk_occupation{job=~"rook-ceph-mgr|rook-ceph-mgr-external"},"host","$1","exported_instance","(.*)")
label_replace((ceph_osd_in == 0 and ceph_osd_up == 0),"disk","$1","ceph_daemon","osd.(.*)") + on(ceph_daemon, namespace, managedBy) group_left(host, device) label_replace(ceph_disk_occupation{job=~"rook-ceph-mgr|rook-ceph-mgr-external"},"host","$1","instance","(.*)")
for: 1m
labels:
severity: critical
Expand Down
24 changes: 6 additions & 18 deletions metrics/deploy/prometheus-ocs-rules-external.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,11 @@ spec:
groups:
- name: ocs_performance.rules
rules:
- expr: "sum by (namespace, managedBy) (\n topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"},
\"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\",
\"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left topk by
(instance,device) \n (1,\n (\n rate(node_disk_read_time_seconds_total[1m])
/ (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))\n )\n )\n)\n"
- expr: |
sum by (namespace, managedBy) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left topk by (instance,device) (1, (rate(node_disk_read_time_seconds_total[1m]) / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1)))))
record: cluster:ceph_disk_latency_read:join_ceph_node_disk_rate1m
- expr: "sum by (namespace, managedBy) (\n topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"},
\"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\",
\"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left topk by
(instance,device) \n (1,\n (\n rate(node_disk_write_time_seconds_total[1m])
/ (clamp_min(rate(node_disk_writes_completed_total[1m]), 1))\n )\n )\n)\n"
- expr: |
sum by (namespace, managedBy) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left topk by (instance,device) (1, (rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]), 1)))))
record: cluster:ceph_disk_latency_write:join_ceph_node_disk_rate1m
- name: ODF_standardized_metrics.rules
rules:
Expand Down Expand Up @@ -54,14 +48,8 @@ spec:
system_type: OCS
system_vendor: Red Hat
record: odf_system_throughput_total_bytes
- expr: "sum by (namespace, managedBy, job, service)\n(\n topk by (ceph_daemon)
(1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"},
\"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\",
\"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left() topk by
(instance,device) \n (1,\n (\n ( \n rate(node_disk_read_time_seconds_total[1m])
/ (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))\n ) +\n (\n
\ rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]),
1))\n )\n )\n )\n)\n"
- expr: |
sum by (namespace, managedBy, job, service) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left() topk by (instance,device) (1, ((rate(node_disk_read_time_seconds_total[1m]) / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))) + (rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]), 1))))))
labels:
system_type: OCS
system_vendor: Red Hat
Expand Down
24 changes: 6 additions & 18 deletions metrics/deploy/prometheus-ocs-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,11 @@ spec:
groups:
- name: ocs_performance.rules
rules:
- expr: "sum by (namespace, managedBy) (\n topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"},
\"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\",
\"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left topk by
(instance,device) \n (1,\n (\n rate(node_disk_read_time_seconds_total[1m])
/ (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))\n )\n )\n)\n"
- expr: |
sum by (namespace, managedBy) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left topk by (instance,device) (1, (rate(node_disk_read_time_seconds_total[1m]) / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1)))))
record: cluster:ceph_disk_latency_read:join_ceph_node_disk_rate1m
- expr: "sum by (namespace, managedBy) (\n topk by (ceph_daemon) (1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"},
\"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\",
\"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left topk by
(instance,device) \n (1,\n (\n rate(node_disk_write_time_seconds_total[1m])
/ (clamp_min(rate(node_disk_writes_completed_total[1m]), 1))\n )\n )\n)\n"
- expr: |
sum by (namespace, managedBy) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left topk by (instance,device) (1, (rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]), 1)))))
record: cluster:ceph_disk_latency_write:join_ceph_node_disk_rate1m
- name: ODF_standardized_metrics.rules
rules:
Expand Down Expand Up @@ -54,14 +48,8 @@ spec:
system_type: OCS
system_vendor: Red Hat
record: odf_system_throughput_total_bytes
- expr: "sum by (namespace, managedBy, job, service)\n(\n topk by (ceph_daemon)
(1, label_replace(label_replace(ceph_disk_occupation{job=\"rook-ceph-mgr\"},
\"instance\", \"$1\", \"exported_instance\", \"(.*)\"), \"device\", \"$1\",
\"device\", \"/dev/(.*)\")) \n * on(instance, device) group_left() topk by
(instance,device) \n (1,\n (\n ( \n rate(node_disk_read_time_seconds_total[1m])
/ (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))\n ) +\n (\n
\ rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]),
1))\n )\n )\n )\n)\n"
- expr: |
sum by (namespace, managedBy, job, service) (topk by (ceph_daemon) (1, label_replace(ceph_disk_occupation{job="rook-ceph-mgr"}, "device", "$1", "device", "/dev/(.*)")) * on(instance, device) group_left() topk by (instance,device) (1, ((rate(node_disk_read_time_seconds_total[1m]) / (clamp_min(rate(node_disk_reads_completed_total[1m]), 1))) + (rate(node_disk_write_time_seconds_total[1m]) / (clamp_min(rate(node_disk_writes_completed_total[1m]), 1))))))
labels:
system_type: OCS
system_vendor: Red Hat
Expand Down

0 comments on commit da4df69

Please sign in to comment.