forked from infinityworks/hpilo-exporter
-
Notifications
You must be signed in to change notification settings - Fork 3
/
alerts.hpilo.yml
128 lines (121 loc) · 7.67 KB
/
alerts.hpilo.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
groups:
  # Alerting rules for the series scraped under job="hpilo".
  #
  # Conventions shared by the status rules below, as encoded in their templates:
  #   * sample value 0 is rendered as 'ok', 1 as 'warning', 2 as 'critical',
  #     and anything else as 'unknown' (the fan rule only distinguishes
  #     0 / 1 / other);
  #   * the `severity` label is 'warning' for values 1 and 3, otherwise
  #     'critical';
  #   * `vsp` and the tail of `description` are filled from
  #     hpilo_onboard_administrator_info for the same instance/job and stay
  #     empty when that query returns nothing;
  #   * `noAlarmOn` is matched as a comma-separated tag list, so a host can be
  #     excluded from a rule by carrying the corresponding tag.
  - name: 'HPiLO Exporter alerts'
    rules:
      # The scrape target itself has been unreachable for 10 minutes.
      - alert: 'hPiLO exporter'
        expr: 'up{job="hpilo"} == 0'
        for: 10m
        labels:
          severity: 'warning'
        annotations:
          title: 'HPiLO exporter'
          description: 'HPiLO exporter seems down.'

      # Readings that are not above 0 are discarded first; the rule then fires
      # when the lowest remaining reading across sensors stays above 30 for
      # 3 minutes.
      - alert: 'HPiLO_Temperature'
        expr: 'min(hpilo_temperature_value{job="hpilo"} > 0) without (sensor) > 30'
        for: 3m
        labels:
          severity: 'warning'
        annotations:
          title: 'High Temperature'
          description: 'High temperature detected ({{ $value | humanize }}C)'

      # Catch-all for every hpilo_*status series that is non-zero, except the
      # NIC/network series and the battery, power-supply and fan series, which
      # are excluded by name here (the latter three have dedicated rules below).
      - alert: 'HWstatus'
        expr: '{job="hpilo", __name__=~"hpilo_.*status", __name__!~"hpilo_nic_status|hpilo_network_status|hpilo_battery_status|hpilo_power_supply_status|hpilo_fan_status"} > 0'
        for: 0s
        labels:
          severity: '{{ if eq ($value | humanize) "1" }}warning{{ else if eq ($value | humanize) "3" }}warning{{else}}critical{{end}}'
          metric: '{{ .Labels.__name__ }}'
          vsp: >-
            {{ with printf "hpilo_onboard_administrator_info{instance='%s',job='%s'}" .Labels.instance .Labels.job | query }}{{ . | first | label "encl" }}{{end}}
        annotations:
          title: '{{ .Labels.__name__ }} status'
          description: >-
            {{ .Labels.__name__ }} is
            {{ if eq ($value | humanize) "0" }}'ok'
            {{ else if eq ($value | humanize) "1" }}'warning'
            {{ else if eq ($value | humanize) "2" }}'critical'
            {{else}}'unknown'
            {{ end }}({{ $value | printf "%.0f" }}).
            {{ with printf "hpilo_onboard_administrator_info{instance='%s',job='%s'}" .Labels.instance .Labels.job | query }}
            Blade Enclosure {{ . | first | label "encl" }}, OA_IP {{ . | first | label "oa_ip" }}, Bay {{ . | first | label "location_bay" }}
            {{end}}

      # Separate battery alert so that hosts carrying the 'battery' tag in
      # their noAlarmOn label can be excluded.
      - alert: 'Battery Status'
        expr: 'hpilo_battery_status{job="hpilo",noAlarmOn!~"(.*,|^)battery(,.*|$)"} > 0'
        for: 0s
        labels:
          severity: '{{ if eq ($value | humanize) "1" }}warning{{ else if eq ($value | humanize) "3" }}warning{{else}}critical{{end}}'
          metric: '{{ .Labels.__name__ }}'
          vsp: >-
            {{ with printf "hpilo_onboard_administrator_info{instance='%s',job='%s'}" .Labels.instance .Labels.job | query }}{{ . | first | label "encl" }}{{end}}
        annotations:
          title: '{{ .Labels.__name__ }} status'
          description: >-
            {{ .Labels.__name__ }} is
            {{ if eq ($value | humanize) "0" }}'ok'
            {{ else if eq ($value | humanize) "1" }}'warning'
            {{ else if eq ($value | humanize) "2" }}'critical'
            {{else}}'unknown'
            {{ end }}({{ $value | printf "%.0f" }}).
            {{ with printf "hpilo_onboard_administrator_info{instance='%s',job='%s'}" .Labels.instance .Labels.job | query }}
            Blade Enclosure {{ . | first | label "encl" }}, OA_IP {{ . | first | label "oa_ip" }}, Bay {{ . | first | label "location_bay" }}
            {{end}}

      # Power-supply series whose `ps` label starts with "Battery" are alerted
      # here and honour the 'battery' noAlarmOn tag, which prevents a fake
      # power-supply alert from being raised alongside a battery alert.
      - alert: 'Battery Supply Status'
        expr: 'hpilo_power_supply_status{job="hpilo",noAlarmOn!~"(.*,|^)battery(,.*|$)",ps=~"Battery.*"} > 0'
        for: 0s
        labels:
          severity: '{{ if eq ($value | humanize) "1" }}warning{{ else if eq ($value | humanize) "3" }}warning{{else}}critical{{end}}'
          metric: '{{ .Labels.__name__ }}'
          vsp: >-
            {{ with printf "hpilo_onboard_administrator_info{instance='%s',job='%s'}" .Labels.instance .Labels.job | query }}{{ . | first | label "encl" }}{{end}}
        annotations:
          title: 'hpilo Battery Supply status'
          description: >-
            {{ .Labels.__name__ }} is
            {{ if eq ($value | humanize) "0" }}'ok'
            {{ else if eq ($value | humanize) "1" }}'warning'
            {{ else if eq ($value | humanize) "2" }}'critical'
            {{else}}'unknown'
            {{ end }}({{ $value | printf "%.0f" }}).
            {{ with printf "hpilo_onboard_administrator_info{instance='%s',job='%s'}" .Labels.instance .Labels.job | query }}
            Blade Enclosure {{ . | first | label "encl" }}, OA_IP {{ . | first | label "oa_ip" }}, Bay {{ . | first | label "location_bay" }}
            {{end}}

      # All remaining power-supply series (`ps` not starting with "Battery");
      # hosts can be excluded with the 'PowerSupply' noAlarmOn tag.
      - alert: 'Power Supply Status'
        expr: 'hpilo_power_supply_status{job="hpilo",noAlarmOn!~"(.*,|^)PowerSupply(,.*|$)",ps!~"Battery.*"} > 0'
        for: 0s
        labels:
          severity: '{{ if eq ($value | humanize) "1" }}warning{{ else if eq ($value | humanize) "3" }}warning{{else}}critical{{end}}'
          metric: '{{ .Labels.__name__ }}'
          vsp: >-
            {{ with printf "hpilo_onboard_administrator_info{instance='%s',job='%s'}" .Labels.instance .Labels.job | query }}{{ . | first | label "encl" }}{{end}}
        annotations:
          title: '{{ .Labels.__name__ }} status'
          description: >-
            {{ .Labels.__name__ }} is
            {{ if eq ($value | humanize) "0" }}'ok'
            {{ else if eq ($value | humanize) "1" }}'warning'
            {{ else if eq ($value | humanize) "2" }}'critical'
            {{else}}'unknown'
            {{ end }}({{ $value | printf "%.0f" }}).
            {{ with printf "hpilo_onboard_administrator_info{instance='%s',job='%s'}" .Labels.instance .Labels.job | query }}
            Blade Enclosure {{ . | first | label "encl" }}, OA_IP {{ . | first | label "oa_ip" }}, Bay {{ . | first | label "location_bay" }}
            {{end}}

      # Prevent fake alerts for two-socket servers with only one socket
      # installed (two fans will be absent): each per-fan status is multiplied
      # by hpilo_fans_status, so the product is 0 whenever that series is 0.
      #
      # The multiplication drops the metric name from the result, so
      # `.Labels.__name__` would render empty here; the metric name is
      # therefore written out literally in the label and annotations.
      - alert: 'Fan Status'
        expr: 'hpilo_fan_status{job="hpilo",noAlarmOn!~"(.*,|^)Fan(,.*|$)"} * ignoring(fan) group_left hpilo_fans_status > 0'
        for: 0s
        labels:
          severity: '{{ if eq ($value | humanize) "1" }}warning{{ else if eq ($value | humanize) "3" }}warning{{else}}critical{{end}}'
          metric: 'hpilo_fan_status'
          vsp: >-
            {{ with printf "hpilo_onboard_administrator_info{instance='%s',job='%s'}" .Labels.instance .Labels.job | query }}{{ . | first | label "encl" }}{{end}}
        annotations:
          title: 'hpilo_fan_status status'
          description: >-
            hpilo_fan_status is
            {{ if eq ($value | humanize) "0" }}'ok'
            {{ else if eq ($value | humanize) "1" }}'warning'
            {{else}}'critical'
            {{ end }}({{ $value | printf "%.0f" }}).
            {{ with printf "hpilo_onboard_administrator_info{instance='%s',job='%s'}" .Labels.instance .Labels.job | query }}
            Blade Enclosure {{ . | first | label "encl" }}, OA_IP {{ . | first | label "oa_ip" }}, Bay {{ . | first | label "location_bay" }}
            {{end}}

      # Disabled rule, kept for reference.
      # - alert: 'RunningStatus'
      #   expr: 'hpilo_running_status{job="hpilo"} != 0'
      #   for: 1m
      #   labels:
      #     severity: "critical"
      #   annotations:
      #     title: 'Running status'
      #     description: "{{ .Labels.server_name }} is not in RUNNING(ON) status"