Skip to content

Commit 4db2eaf

Browse files
authored
Merge branch 'master' into add_loadbalancer_standard_healthprobe_status
2 parents dd23328 + 2376314 commit 4db2eaf

8 files changed

+164
-3
lines changed

docs/severity.md

+1
Original file line number | Diff line number | Diff line change
@@ -176,6 +176,7 @@
176176
|---|---|---|---|---|---|
177177
|Azure VPN heartbeat|X|-|-|-|-|
178178
|Azure VPN total flow count|X|-|-|-|-|
179+
|Azure VPN ipsec tunnel status|X|X|-|-|-|
179180

180181

181182
## integration_aws-alb

modules/fame_azure-vpn/README.md

+2
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,7 @@ This module creates the following SignalFx detectors which could contain one or
7777
|---|---|---|---|---|---|
7878
|Azure VPN heartbeat|X|-|-|-|-|
7979
|Azure VPN total flow count|X|-|-|-|-|
80+
|Azure VPN ipsec tunnel status|X|X|-|-|-|
8081

8182
## How to collect required metrics?
8283

@@ -97,6 +98,7 @@ Check the [Related documentation](#related-documentation) section for more detai
9798
Here is the list of required metrics for detectors in this module.
9899

99100
* `fame.azure.virtual_network_gateway.total_flow_count`
101+
* `fame.azure.virtual_network_gateway.tunnel_status`
100102

101103

102104

modules/fame_azure-vpn/conf/00-heartbeat.yaml

+1-1
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ aggregation: true
66

77
signals:
88
signal:
9-
metric: fame.azure.virtual_network_gateway.total_flow_count
9+
metric: fame.azure.virtual_network_gateway.tunnel_status
1010

1111
rules:
1212
critical:

modules/fame_azure-vpn/conf/01-total-flow-count.yaml

+2
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,8 @@ signals:
99
signal:
1010
metric: fame.azure.virtual_network_gateway.total_flow_count
1111

12+
disabled: true
13+
1214
rules:
1315
critical:
1416
threshold: 0
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,21 @@
1+
module: Azure VPN
2+
name: IPSEC Tunnel Status
3+
id: tunnel_status
4+
5+
transformation: true
6+
aggregation: ".mean(by=['azure_resource_group', 'azure_resource_name', 'remote_ip'])"
7+
8+
signals:
9+
signal:
10+
metric: fame.azure.virtual_network_gateway.tunnel_status
11+
12+
rules:
13+
critical:
14+
threshold: 0
15+
comparator: "=="
16+
lasting_duration: '20m'
17+
major:
18+
threshold: 0
19+
comparator: "=="
20+
lasting_duration: '10m'
21+
dependency: critical

modules/fame_azure-vpn/detectors-gen.tf

+41-1
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ resource "signalfx_detector" "heartbeat" {
77

88
program_text = <<-EOF
99
from signalfx.detectors.not_reporting import not_reporting
10-
signal = data('fame.azure.virtual_network_gateway.total_flow_count', filter=${module.filtering.signalflow})${var.heartbeat_aggregation_function}${var.heartbeat_transformation_function}.publish('signal')
10+
signal = data('fame.azure.virtual_network_gateway.tunnel_status', filter=${module.filtering.signalflow})${var.heartbeat_aggregation_function}${var.heartbeat_transformation_function}.publish('signal')
1111
not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT')
1212
EOF
1313

@@ -53,3 +53,43 @@ EOF
5353
max_delay = var.totalflowcount_max_delay
5454
}
5555

56+
resource "signalfx_detector" "tunnel_status" {
57+
name = format("%s %s", local.detector_name_prefix, "Azure VPN ipsec tunnel status")
58+
59+
authorized_writer_teams = var.authorized_writer_teams
60+
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
61+
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))
62+
63+
program_text = <<-EOF
64+
signal = data('fame.azure.virtual_network_gateway.tunnel_status', filter=${module.filtering.signalflow})${var.tunnel_status_aggregation_function}${var.tunnel_status_transformation_function}.publish('signal')
65+
detect(when(signal == ${var.tunnel_status_threshold_critical}%{if var.tunnel_status_lasting_duration_critical != null}, lasting='${var.tunnel_status_lasting_duration_critical}', at_least=${var.tunnel_status_at_least_percentage_critical}%{endif})).publish('CRIT')
66+
detect(when(signal == ${var.tunnel_status_threshold_major}%{if var.tunnel_status_lasting_duration_major != null}, lasting='${var.tunnel_status_lasting_duration_major}', at_least=${var.tunnel_status_at_least_percentage_major}%{endif}) and (not when(signal == ${var.tunnel_status_threshold_critical}%{if var.tunnel_status_lasting_duration_critical != null}, lasting='${var.tunnel_status_lasting_duration_critical}', at_least=${var.tunnel_status_at_least_percentage_critical}%{endif}))).publish('MAJOR')
67+
EOF
68+
69+
rule {
70+
description = "is == ${var.tunnel_status_threshold_critical}"
71+
severity = "Critical"
72+
detect_label = "CRIT"
73+
disabled = coalesce(var.tunnel_status_disabled_critical, var.tunnel_status_disabled, var.detectors_disabled)
74+
notifications = try(coalescelist(lookup(var.tunnel_status_notifications, "critical", []), var.notifications.critical), null)
75+
runbook_url = try(coalesce(var.tunnel_status_runbook_url, var.runbook_url), "")
76+
tip = var.tunnel_status_tip
77+
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
78+
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
79+
}
80+
81+
rule {
82+
description = "is == ${var.tunnel_status_threshold_major}"
83+
severity = "Major"
84+
detect_label = "MAJOR"
85+
disabled = coalesce(var.tunnel_status_disabled_major, var.tunnel_status_disabled, var.detectors_disabled)
86+
notifications = try(coalescelist(lookup(var.tunnel_status_notifications, "major", []), var.notifications.major), null)
87+
runbook_url = try(coalesce(var.tunnel_status_runbook_url, var.runbook_url), "")
88+
tip = var.tunnel_status_tip
89+
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
90+
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
91+
}
92+
93+
max_delay = var.tunnel_status_max_delay
94+
}
95+

modules/fame_azure-vpn/outputs.tf

+5
Original file line number | Diff line number | Diff line change
@@ -8,3 +8,8 @@ output "totalflowcount" {
88
value = signalfx_detector.totalflowcount
99
}
1010

11+
output "tunnel_status" {
12+
description = "Detector resource for tunnel_status"
13+
value = signalfx_detector.tunnel_status
14+
}
15+

modules/fame_azure-vpn/variables-gen.tf

+91-1
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,7 @@ variable "totalflowcount_runbook_url" {
8989
variable "totalflowcount_disabled" {
9090
description = "Disable all alerting rules for totalflowcount detector"
9191
type = bool
92-
default = null
92+
default = true
9393
}
9494

9595
variable "totalflowcount_threshold_critical" {
@@ -109,3 +109,93 @@ variable "totalflowcount_at_least_percentage_critical" {
109109
type = number
110110
default = 1
111111
}
112+
# tunnel_status detector
113+
114+
variable "tunnel_status_notifications" {
115+
description = "Notification recipients list per severity overridden for tunnel_status detector"
116+
type = map(list(string))
117+
default = {}
118+
}
119+
120+
variable "tunnel_status_aggregation_function" {
121+
description = "Aggregation function and group by for tunnel_status detector (i.e. \".mean(by=['host'])\")"
122+
type = string
123+
default = ".mean(by=['azure_resource_group', 'azure_resource_name', 'remote_ip'])"
124+
}
125+
126+
variable "tunnel_status_transformation_function" {
127+
description = "Transformation function for tunnel_status detector (i.e. \".mean(over='5m')\")"
128+
type = string
129+
default = ""
130+
}
131+
132+
variable "tunnel_status_max_delay" {
133+
description = "Enforce max delay for tunnel_status detector (use \"0\" or \"null\" for \"Auto\")"
134+
type = number
135+
default = null
136+
}
137+
138+
variable "tunnel_status_tip" {
139+
description = "Suggested first course of action or any note useful for incident handling"
140+
type = string
141+
default = ""
142+
}
143+
144+
variable "tunnel_status_runbook_url" {
145+
description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
146+
type = string
147+
default = ""
148+
}
149+
150+
variable "tunnel_status_disabled" {
151+
description = "Disable all alerting rules for tunnel_status detector"
152+
type = bool
153+
default = null
154+
}
155+
156+
variable "tunnel_status_disabled_critical" {
157+
description = "Disable critical alerting rule for tunnel_status detector"
158+
type = bool
159+
default = null
160+
}
161+
162+
variable "tunnel_status_disabled_major" {
163+
description = "Disable major alerting rule for tunnel_status detector"
164+
type = bool
165+
default = null
166+
}
167+
168+
variable "tunnel_status_threshold_critical" {
169+
description = "Critical threshold for tunnel_status detector"
170+
type = number
171+
default = 0
172+
}
173+
174+
variable "tunnel_status_lasting_duration_critical" {
175+
description = "Minimum duration that conditions must be true before raising alert"
176+
type = string
177+
default = "20m"
178+
}
179+
180+
variable "tunnel_status_at_least_percentage_critical" {
181+
description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
182+
type = number
183+
default = 1
184+
}
185+
variable "tunnel_status_threshold_major" {
186+
description = "Major threshold for tunnel_status detector"
187+
type = number
188+
default = 0
189+
}
190+
191+
variable "tunnel_status_lasting_duration_major" {
192+
description = "Minimum duration that conditions must be true before raising alert"
193+
type = string
194+
default = "10m"
195+
}
196+
197+
variable "tunnel_status_at_least_percentage_major" {
198+
description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
199+
type = number
200+
default = 1
201+
}

0 commit comments

Comments
 (0)