Skip to content

Commit

Permalink
monitoring: add new alerts
Browse files Browse the repository at this point in the history
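This adds Prometheus alerts for hardware health checks (storage, memory, processor, network, power supply, and fans), along with matching unit tests and a new `.13 (hardware)` SNMP OID category.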

Signed-off-by: Guillaume Abrioux <[email protected]>
  • Loading branch information
guits committed Jan 25, 2024
1 parent 05cc6af commit 76d8e0b
Show file tree
Hide file tree
Showing 4 changed files with 272 additions and 0 deletions.
65 changes: 65 additions & 0 deletions monitoring/ceph-mixin/prometheus_alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,71 @@
},
],
},
{
  // Rule group for hardware faults reported through `ceph_health_detail`.
  //
  // Every rule in this group has the same shape, so a single object-local
  // builder produces them:
  //   alertName       - value of the rule's `alert` field
  //   healthCheck     - `name` label matched on ceph_health_detail
  //   oidIndex        - last component of the SNMP OID under ...1.2.1.13
  //   summaryText     - summary annotation; the multi-cluster suffix is appended
  //   descriptionText - description annotation, used verbatim
  local hardwareAlert(alertName, healthCheck, oidIndex, summaryText, descriptionText) = {
    alert: alertName,
    // `for` is a Jsonnet keyword, so the field name has to be quoted.
    'for': '30s',
    expr: 'ceph_health_detail{name="%s"} > 0' % healthCheck,
    labels: {
      severity: 'critical',
      type: 'ceph_default',
      oid: '1.3.6.1.4.1.50495.1.2.1.13.%d' % oidIndex,
    },
    annotations: {
      summary: (summaryText + '%(cluster)s') % $.MultiClusterSummary(),
      description: descriptionText,
    },
  },

  name: 'hardware',
  rules: [
    hardwareAlert(
      'HardwareStorageError',
      'HARDWARE_STORAGE',
      1,
      'Storage devices error(s) detected',
      'Some storage devices are in error. Check `ceph health detail`.'
    ),
    hardwareAlert(
      'HardwareMemoryError',
      'HARDWARE_MEMORY',
      2,
      'DIMM error(s) detected',
      'DIMM error(s) detected. Check `ceph health detail`.'
    ),
    hardwareAlert(
      'HardwareProcessorError',
      'HARDWARE_PROCESSOR',
      3,
      'Processor error(s) detected',
      'Processor error(s) detected. Check `ceph health detail`.'
    ),
    hardwareAlert(
      'HardwareNetworkError',
      'HARDWARE_NETWORK',
      4,
      'Network error(s) detected',
      'Network error(s) detected. Check `ceph health detail`.'
    ),
    hardwareAlert(
      'HardwarePowerError',
      'HARDWARE_POWER',
      5,
      'Power supply error(s) detected',
      'Power supply error(s) detected. Check `ceph health detail`.'
    ),
    hardwareAlert(
      'HardwareFanError',
      'HARDWARE_FANS',
      6,
      'Fan error(s) detected',
      'Fan error(s) detected. Check `ceph health detail`.'
    ),
  ],
},
{
name: 'PrometheusServer',
rules: [
Expand Down
62 changes: 62 additions & 0 deletions monitoring/ceph-mixin/prometheus_alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,68 @@ groups:
labels:
severity: "warning"
type: "ceph_default"
# Hardware alert group: one critical alert per HARDWARE_* health check.
# Each rule fires once its ceph_health_detail series has been > 0 for 30s.
- name: 'hardware'
  rules:
    # Storage devices
    - alert: 'HardwareStorageError'
      expr: 'ceph_health_detail{name="HARDWARE_STORAGE"} > 0'
      for: '30s'
      labels:
        severity: 'critical'
        type: 'ceph_default'
        oid: '1.3.6.1.4.1.50495.1.2.1.13.1'
      annotations:
        summary: 'Storage devices error(s) detected'
        description: 'Some storage devices are in error. Check `ceph health detail`.'
    # Memory (DIMM)
    - alert: 'HardwareMemoryError'
      expr: 'ceph_health_detail{name="HARDWARE_MEMORY"} > 0'
      for: '30s'
      labels:
        severity: 'critical'
        type: 'ceph_default'
        oid: '1.3.6.1.4.1.50495.1.2.1.13.2'
      annotations:
        summary: 'DIMM error(s) detected'
        description: 'DIMM error(s) detected. Check `ceph health detail`.'
    # Processor
    - alert: 'HardwareProcessorError'
      expr: 'ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0'
      for: '30s'
      labels:
        severity: 'critical'
        type: 'ceph_default'
        oid: '1.3.6.1.4.1.50495.1.2.1.13.3'
      annotations:
        summary: 'Processor error(s) detected'
        description: 'Processor error(s) detected. Check `ceph health detail`.'
    # Network
    - alert: 'HardwareNetworkError'
      expr: 'ceph_health_detail{name="HARDWARE_NETWORK"} > 0'
      for: '30s'
      labels:
        severity: 'critical'
        type: 'ceph_default'
        oid: '1.3.6.1.4.1.50495.1.2.1.13.4'
      annotations:
        summary: 'Network error(s) detected'
        description: 'Network error(s) detected. Check `ceph health detail`.'
    # Power supply
    - alert: 'HardwarePowerError'
      expr: 'ceph_health_detail{name="HARDWARE_POWER"} > 0'
      for: '30s'
      labels:
        severity: 'critical'
        type: 'ceph_default'
        oid: '1.3.6.1.4.1.50495.1.2.1.13.5'
      annotations:
        summary: 'Power supply error(s) detected'
        description: 'Power supply error(s) detected. Check `ceph health detail`.'
    # Fans
    - alert: 'HardwareFanError'
      expr: 'ceph_health_detail{name="HARDWARE_FANS"} > 0'
      for: '30s'
      labels:
        severity: 'critical'
        type: 'ceph_default'
        oid: '1.3.6.1.4.1.50495.1.2.1.13.6'
      annotations:
        summary: 'Fan error(s) detected'
        description: 'Fan error(s) detected. Check `ceph health detail`.'
- name: "PrometheusServer"
rules:
- alert: "PrometheusJobMissing"
Expand Down
144 changes: 144 additions & 0 deletions monitoring/ceph-mixin/tests_alerts/test_alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2075,3 +2075,147 @@ tests:
description: "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously"
summary: "The replication network usage has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes"

# HardwareStorageError: HARDWARE_STORAGE health check held at 1.
- interval: 30s
  input_series:
    # Constant series: starts at 1 and repeats with a +0 step 40 times.
    - series: 'ceph_health_detail{name="HARDWARE_STORAGE"}'
      values: '1+0x40'
  promql_expr_test:
    # The alert expression itself must return the raised sample.
    - expr: 'ceph_health_detail{name="HARDWARE_STORAGE"} > 0'
      eval_time: 2m
      exp_samples:
        - labels: '{__name__="ceph_health_detail", name="HARDWARE_STORAGE"}'
          value: 1
  alert_rule_test:
    # No exp_alerts listed: nothing is expected to be firing at 1m.
    - alertname: HardwareStorageError
      eval_time: 1m
    # By 5m the alert must be firing with the full label/annotation set.
    - alertname: HardwareStorageError
      eval_time: 5m
      exp_alerts:
        - exp_labels:
            name: 'HARDWARE_STORAGE'
            severity: 'critical'
            type: 'ceph_default'
            oid: '1.3.6.1.4.1.50495.1.2.1.13.1'
          exp_annotations:
            summary: 'Storage devices error(s) detected'
            description: 'Some storage devices are in error. Check `ceph health detail`.'
# HardwareMemoryError: HARDWARE_MEMORY health check held at 1.
- interval: 30s
  input_series:
    # Constant series: starts at 1 and repeats with a +0 step 40 times.
    - series: 'ceph_health_detail{name="HARDWARE_MEMORY"}'
      values: '1+0x40'
  promql_expr_test:
    # The alert expression itself must return the raised sample.
    - expr: 'ceph_health_detail{name="HARDWARE_MEMORY"} > 0'
      eval_time: 2m
      exp_samples:
        - labels: '{__name__="ceph_health_detail", name="HARDWARE_MEMORY"}'
          value: 1
  alert_rule_test:
    # No exp_alerts listed: nothing is expected to be firing at 1m.
    - alertname: HardwareMemoryError
      eval_time: 1m
    # By 5m the alert must be firing with the full label/annotation set.
    - alertname: HardwareMemoryError
      eval_time: 5m
      exp_alerts:
        - exp_labels:
            name: 'HARDWARE_MEMORY'
            severity: 'critical'
            type: 'ceph_default'
            oid: '1.3.6.1.4.1.50495.1.2.1.13.2'
          exp_annotations:
            summary: 'DIMM error(s) detected'
            description: 'DIMM error(s) detected. Check `ceph health detail`.'
# HardwareProcessorError: HARDWARE_PROCESSOR health check held at 1.
- interval: 30s
  input_series:
    # Constant series: starts at 1 and repeats with a +0 step 40 times.
    - series: 'ceph_health_detail{name="HARDWARE_PROCESSOR"}'
      values: '1+0x40'
  promql_expr_test:
    # The alert expression itself must return the raised sample.
    - expr: 'ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0'
      eval_time: 2m
      exp_samples:
        - labels: '{__name__="ceph_health_detail", name="HARDWARE_PROCESSOR"}'
          value: 1
  alert_rule_test:
    # No exp_alerts listed: nothing is expected to be firing at 1m.
    - alertname: HardwareProcessorError
      eval_time: 1m
    # By 5m the alert must be firing with the full label/annotation set.
    - alertname: HardwareProcessorError
      eval_time: 5m
      exp_alerts:
        - exp_labels:
            name: 'HARDWARE_PROCESSOR'
            severity: 'critical'
            type: 'ceph_default'
            oid: '1.3.6.1.4.1.50495.1.2.1.13.3'
          exp_annotations:
            summary: 'Processor error(s) detected'
            description: 'Processor error(s) detected. Check `ceph health detail`.'
# HardwareNetworkError: HARDWARE_NETWORK health check held at 1.
- interval: 30s
  input_series:
    # Constant series: starts at 1 and repeats with a +0 step 40 times.
    - series: 'ceph_health_detail{name="HARDWARE_NETWORK"}'
      values: '1+0x40'
  promql_expr_test:
    # The alert expression itself must return the raised sample.
    - expr: 'ceph_health_detail{name="HARDWARE_NETWORK"} > 0'
      eval_time: 2m
      exp_samples:
        - labels: '{__name__="ceph_health_detail", name="HARDWARE_NETWORK"}'
          value: 1
  alert_rule_test:
    # No exp_alerts listed: nothing is expected to be firing at 1m.
    - alertname: HardwareNetworkError
      eval_time: 1m
    # By 5m the alert must be firing with the full label/annotation set.
    - alertname: HardwareNetworkError
      eval_time: 5m
      exp_alerts:
        - exp_labels:
            name: 'HARDWARE_NETWORK'
            severity: 'critical'
            type: 'ceph_default'
            oid: '1.3.6.1.4.1.50495.1.2.1.13.4'
          exp_annotations:
            summary: 'Network error(s) detected'
            description: 'Network error(s) detected. Check `ceph health detail`.'
# HardwarePowerError: HARDWARE_POWER health check held at 1.
- interval: 30s
  input_series:
    # Constant series: starts at 1 and repeats with a +0 step 40 times.
    - series: 'ceph_health_detail{name="HARDWARE_POWER"}'
      values: '1+0x40'
  promql_expr_test:
    # The alert expression itself must return the raised sample.
    - expr: 'ceph_health_detail{name="HARDWARE_POWER"} > 0'
      eval_time: 2m
      exp_samples:
        - labels: '{__name__="ceph_health_detail", name="HARDWARE_POWER"}'
          value: 1
  alert_rule_test:
    # No exp_alerts listed: nothing is expected to be firing at 1m.
    - alertname: HardwarePowerError
      eval_time: 1m
    # By 5m the alert must be firing with the full label/annotation set.
    - alertname: HardwarePowerError
      eval_time: 5m
      exp_alerts:
        - exp_labels:
            name: 'HARDWARE_POWER'
            severity: 'critical'
            type: 'ceph_default'
            oid: '1.3.6.1.4.1.50495.1.2.1.13.5'
          exp_annotations:
            summary: 'Power supply error(s) detected'
            description: 'Power supply error(s) detected. Check `ceph health detail`.'
# HardwareFanError: HARDWARE_FANS health check held at 1.
- interval: 30s
  input_series:
    # Constant series: starts at 1 and repeats with a +0 step 40 times.
    - series: 'ceph_health_detail{name="HARDWARE_FANS"}'
      values: '1+0x40'
  promql_expr_test:
    # The alert expression itself must return the raised sample.
    - expr: 'ceph_health_detail{name="HARDWARE_FANS"} > 0'
      eval_time: 2m
      exp_samples:
        - labels: '{__name__="ceph_health_detail", name="HARDWARE_FANS"}'
          value: 1
  alert_rule_test:
    # No exp_alerts listed: nothing is expected to be firing at 1m.
    - alertname: HardwareFanError
      eval_time: 1m
    # By 5m the alert must be firing with the full label/annotation set.
    - alertname: HardwareFanError
      eval_time: 5m
      exp_alerts:
        - exp_labels:
            name: 'HARDWARE_FANS'
            severity: 'critical'
            type: 'ceph_default'
            oid: '1.3.6.1.4.1.50495.1.2.1.13.6'
          exp_annotations:
            summary: 'Fan error(s) detected'
            description: 'Fan error(s) detected. Check `ceph health detail`.'
1 change: 1 addition & 0 deletions monitoring/snmp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ internet private enterprise ceph ceph Notifications Prometheus Notific
.10 (Rados)
.11 (cephadm)
.12 (prometheus)
.13 (hardware)
```
Individual alerts are placed within the appropriate alert category. For example, to add
Expand Down

0 comments on commit 76d8e0b

Please sign in to comment.