---
groups:
  - name: 'Exporter alerts'
    rules:
      - alert: 'Solaris_exporter_service'
        # up{job="solaris_exporter"} == 0
        expr: avg_over_time(up{job="solaris_exporter",noAlarmOn!~"(.*,|^)node(,.*|$)"}[2m]) < 0.9
        for: 1m
        labels:
          severity: 'critical'
        annotations:
          title: "Exporter seems down"
          description: 'Exporter does not return any metrics!'
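      # A note on the pattern above, used throughout this file: avg_over_time(up[2m]) < 0.9
      # fires when more than ~10% of the scrapes in the last 2 minutes failed, which is less
      # flap-prone than a plain `up == 0`. The `noAlarmOn` label acts as a per-target mute
      # list: the negated regex (.*,|^)node(,.*|$) excludes targets whose comma-separated
      # noAlarmOn value contains the item "node"; targets without the label are unaffected.
      # A minimal sketch of attaching such a label in prometheus.yml (the target address
      # and port are hypothetical):
      #
      #   scrape_configs:
      #     - job_name: 'solaris_exporter'
      #       static_configs:
      #         - targets: ['sol-host1:9100']
      #           labels:
      #             noAlarmOn: 'node,cpu'   # mute the "node" and "cpu" alarms for this host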
  - name: 'solaris OS alerts'
    rules:
      - alert: 'CPU Load solaris'
        expr: solaris_exporter_cpu_load{job="solaris_exporter", statistic="load15m", noAlarmOn!~"(.*,|^)cpu(,.*|$)"} / ignoring (statistic) group_left(vcpu) (solaris_exporter_cpu_load{job="solaris_exporter", statistic="vcpu"}) > 1.5
        for: 30m
        labels:
          severity: 'critical'
        annotations:
          title: "Avg load15m > 1.5*cpu"
          description: 'Avg load15m is {{ $value | printf "%2.3f" }}x the vCPU count on {{ $labels.ip }} ({{ $labels.host }})'
      - alert: 'CPU Load solaris'
        expr: solaris_exporter_cpu_load{job="solaris_exporter", statistic="load15m", noAlarmOn!~"(.*,|^)cpu(,.*|$)"} / ignoring (statistic) group_left(vcpu) (solaris_exporter_cpu_load{job="solaris_exporter", statistic="vcpu"}) > 1.2
        for: 30m
        labels:
          severity: 'warning'
        annotations:
          title: "Avg load15m > 1.2*cpu"
          description: 'Avg load15m is {{ $value | printf "%2.3f" }}x the vCPU count on {{ $labels.ip }} ({{ $labels.host }})'
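      # How the two CPU rules above match series: `ignoring (statistic)` drops the only
      # label that differs between the load15m and vcpu samples of solaris_exporter_cpu_load,
      # so each host's load is divided by that same host's vCPU count; `group_left` keeps the
      # load15m series as the result. Worked example (numbers assumed for illustration):
      # load15m = 6.4 on a host with 4 vCPUs gives 6.4 / 4 = 1.6, which exceeds both the
      # 1.2 warning and the 1.5 critical threshold once it has held for 30 minutes.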
      - alert: 'DiskSpaceUsage'
        expr: solaris_exporter_diskspace_usage_bytes{job="solaris_exporter", statistic="free", fstype="zfs", noAlarmOn!~"(.*,|^)space(,.*|$)"} /
          ignoring (statistic) solaris_exporter_diskspace_usage_bytes{job="solaris_exporter", statistic="total", fstype="zfs"} * 100
          < 10
        for: 1m
        labels:
          severity: 'critical'
        annotations:
          title: 'Disk Space Usage'
          description: 'On {{ $labels.ip }} the filesystem mounted at {{ $labels.mountpoint }} has only {{ $value | humanize }}% free space'
      - alert: 'DiskSpaceUsage'
        expr: solaris_exporter_diskspace_usage_bytes{job="solaris_exporter", statistic="free", fstype="zfs", noAlarmOn!~"(.*,|^)space(,.*|$)"} /
          ignoring (statistic) solaris_exporter_diskspace_usage_bytes{job="solaris_exporter", statistic="total", fstype="zfs"} * 100
          < 15
        for: 1m
        labels:
          severity: 'warning'
        annotations:
          title: 'Disk Space Usage'
          description: 'On {{ $labels.ip }} the filesystem mounted at {{ $labels.mountpoint }} has only {{ $value | humanize }}% free space'
      - alert: 'DiskSpaceUsage'
        expr: predict_linear(solaris_exporter_diskspace_usage_bytes{statistic="free", fstype="zfs", noAlarmOn!~"(.*,|^)space(,.*|$)"}[2h], 4*3600) < 0
        for: 5m
        labels:
          severity: 'predict'
        annotations:
          title: 'Disk Space Usage predicted to fill in 4 hours'
          description: "On {{ $labels.ip }} the filesystem mounted at {{ $labels.mountpoint }} is predicted to run out of space within 4 hours. Check the 'Low free space' alarm."
      - alert: 'svcs-x'
        expr: solaris_exporter_svcs_x_failed_services > 0
        for: 3m
        labels:
          severity: 'warning'
        annotations:
          title: 'Found failed system services'
          description: "Found {{ $value | humanize }} services with problems. Run 'svcs -x' on {{ $labels.ip }} to check"
      - alert: 'fmadm-faulty'
        expr: solaris_exporter_fmadm_faults > 0
        for: 3m
        labels:
          severity: 'warning'
        annotations:
          title: 'Found system failures'
          description: "Found {{ $value | humanize }} faults in fmadm. Run 'fmadm faulty' on {{ $labels.ip }} to check"
      - alert: 'zpool-status'
        expr: solaris_exporter_zpool_faults > 0
        for: 3m
        labels:
          severity: 'warning'
        annotations:
          title: 'Found zpool failures'
          description: "Found {{ $value | humanize }} faults in zpools. Run 'zpool status' on {{ $labels.ip }} to check"
      - alert: 'metastat-status'
        expr: solaris_exporter_metastat_faults > 0
        for: 3m
        labels:
          severity: 'warning'
        annotations:
          title: 'Found metastat failures'
          description: "Found {{ $value | humanize }} faults in metastat. Run 'metastat' on {{ $labels.ip }} to check"
      - alert: 'metadb-status'
        expr: solaris_exporter_metadb_faults > 0
        for: 3m
        labels:
          severity: 'warning'
        annotations:
          title: 'Found metadb failures'
          description: "Found {{ $value | humanize }} faults in metadb. Run 'metadb' on {{ $labels.ip }} to check"
      - alert: 'prtdiag-status'
        expr: solaris_exporter_prtdiag_rc > 0
        for: 3m
        labels:
          severity: 'warning'
        annotations:
          title: 'Found non-zero prtdiag status'
          description: "Found non-zero ({{ $value | humanize }}) return code of prtdiag. Run 'prtdiag -v' on {{ $labels.ip }} to check"
      - alert: 'iostat-errors'
        expr: rate(solaris_exporter_disk_errors_total[3m]) > 0
        for: 30s
        labels:
          severity: 'warning'
        annotations:
          title: 'Found new IO failures in iostat'
          description: "Found {{ $value | humanize }} new faults/second. Run 'iostat -En' on {{ $labels.ip }} to check. Problem device is {{ $labels.admin_name }} ({{ $labels.admin_desc }})"
      - alert: 'iostat-%b'
        expr: rate(solaris_exporter_diskio_usage_total{statistic="rtime"}[10m]) * 100 > 90
        for: 60m
        labels:
          severity: 'warning'
        annotations:
          title: 'Found heavy IO on disk for continuous time'
          description: "Found disk {{ $value | humanize }}% busy for more than 1 hour. Problem device is {{ $labels.admin_name }} ({{ $labels.admin_desc }})"
      - alert: 'iostat-servicetime'
        expr: 1000 * sum by (name, admin_name, ip, instance, admin_desc, driver) (rate(solaris_exporter_diskio_usage_total{statistic="rlentime"}[10m]))
          / ignoring (statistic,stat_desc)
          sum by (name, admin_name, ip, instance, admin_desc, driver) (rate(solaris_exporter_diskio_usage_total{statistic=~"reads|writes"}[10m]) > 1) > 400
        for: 30m
        labels:
          severity: 'warning'
        annotations:
          title: 'Found IO with heavy ServiceTime for continuous time (>400ms)'
          description: "Found service time of {{ $value | humanize }}ms for more than 30 minutes. Problem device is {{ $labels.name }}: {{ $labels.admin_name }} ({{ $labels.admin_desc }})"
      - alert: 'fc_paths'
        expr: solaris_exporter_fc_paths{stat='active'} != ignoring(stat) solaris_exporter_fc_paths{stat='total'}
        for: 1m
        labels:
          severity: 'warning'
        annotations:
          title: 'Found multipath disk with degraded paths'
          description: "Disk {{ $labels.device }} has only {{ $value | humanize }} active path(s)"
      - alert: 'Localtime'
        expr: abs(solaris_exporter_current_time_seconds_total{job="solaris_exporter"}
          - timestamp(solaris_exporter_current_time_seconds_total{job="solaris_exporter"})) > 10
        # - scrape_duration_seconds{job="solaris_exporter"}) > 3
        for: 10m
        labels:
          severity: warning
        annotations:
          title: "Time is wrong"
          description: "Localtime of {{ $labels.instance }} differs from Prometheus time by more than 10 seconds (value {{ $value | humanize }}s). Check time sync."
      - alert: Uptime
        expr: solaris_exporter_uptime_seconds_total{job="solaris_exporter"} < 3600
        for: 1m
        labels:
          severity: warning
        annotations:
          title: "System rebooted"
          description: "Host {{ $labels.instance }} uptime is less than 1 hour. \n VALUE = {{ $value | humanizeDuration }}"