diff --git a/docs/severity.md b/docs/severity.md index 6948564d1..47f548389 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -177,6 +177,7 @@ |---|---|---|---|---|---| |Azure VPN heartbeat|X|-|-|-|-| |Azure VPN total flow count|X|-|-|-|-| +|Azure VPN ipsec tunnel status|X|X|-|-|-| ## integration_aws-alb @@ -592,6 +593,7 @@ |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| |Azure Load Balancer heartbeat|X|-|-|-|-| +|Azure Load Balancer backend unhealthy host ratio|X|X|-|-|-| ## integration_azure-mariadb diff --git a/modules/fame_azure-vpn/README.md b/modules/fame_azure-vpn/README.md index dfca0d58f..21f797371 100644 --- a/modules/fame_azure-vpn/README.md +++ b/modules/fame_azure-vpn/README.md @@ -77,6 +77,7 @@ This module creates the following SignalFx detectors which could contain one or |---|---|---|---|---|---| |Azure VPN heartbeat|X|-|-|-|-| |Azure VPN total flow count|X|-|-|-|-| +|Azure VPN ipsec tunnel status|X|X|-|-|-| ## How to collect required metrics? @@ -97,6 +98,7 @@ Check the [Related documentation](#related-documentation) section for more detai Here is the list of required metrics for detectors in this module. 
* `fame.azure.virtual_network_gateway.total_flow_count` +* `fame.azure.virtual_network_gateway.tunnel_status` diff --git a/modules/fame_azure-vpn/conf/00-heartbeat.yaml b/modules/fame_azure-vpn/conf/00-heartbeat.yaml index ce6bca47b..d91f61987 100644 --- a/modules/fame_azure-vpn/conf/00-heartbeat.yaml +++ b/modules/fame_azure-vpn/conf/00-heartbeat.yaml @@ -6,7 +6,7 @@ aggregation: true signals: signal: - metric: fame.azure.virtual_network_gateway.total_flow_count + metric: fame.azure.virtual_network_gateway.tunnel_status rules: critical: diff --git a/modules/fame_azure-vpn/conf/01-total-flow-count.yaml b/modules/fame_azure-vpn/conf/01-total-flow-count.yaml index bc24613e0..d898883e8 100644 --- a/modules/fame_azure-vpn/conf/01-total-flow-count.yaml +++ b/modules/fame_azure-vpn/conf/01-total-flow-count.yaml @@ -9,6 +9,8 @@ signals: signal: metric: fame.azure.virtual_network_gateway.total_flow_count +disabled: true + rules: critical: threshold: 0 diff --git a/modules/fame_azure-vpn/conf/02-tunnel-status.yaml b/modules/fame_azure-vpn/conf/02-tunnel-status.yaml new file mode 100644 index 000000000..a5b2a84ba --- /dev/null +++ b/modules/fame_azure-vpn/conf/02-tunnel-status.yaml @@ -0,0 +1,21 @@ +module: Azure VPN +name: IPSEC Tunnel Status +id: tunnel_status + +transformation: true +aggregation: ".mean(by=['azure_resource_group', 'azure_resource_name', 'remote_ip'])" + +signals: + signal: + metric: fame.azure.virtual_network_gateway.tunnel_status + +rules: + critical: + threshold: 0 + comparator: "==" + lasting_duration: '20m' + major: + threshold: 0 + comparator: "==" + lasting_duration: '10m' + dependency: critical diff --git a/modules/fame_azure-vpn/detectors-gen.tf b/modules/fame_azure-vpn/detectors-gen.tf index f91cb55c7..584c3f7a2 100644 --- a/modules/fame_azure-vpn/detectors-gen.tf +++ b/modules/fame_azure-vpn/detectors-gen.tf @@ -7,7 +7,7 @@ resource "signalfx_detector" "heartbeat" { program_text = <<-EOF from signalfx.detectors.not_reporting import not_reporting 
- signal = data('fame.azure.virtual_network_gateway.total_flow_count', filter=${module.filtering.signalflow})${var.heartbeat_aggregation_function}${var.heartbeat_transformation_function}.publish('signal') + signal = data('fame.azure.virtual_network_gateway.tunnel_status', filter=${module.filtering.signalflow})${var.heartbeat_aggregation_function}${var.heartbeat_transformation_function}.publish('signal') not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') EOF @@ -53,3 +53,43 @@ EOF max_delay = var.totalflowcount_max_delay } +resource "signalfx_detector" "tunnel_status" { + name = format("%s %s", local.detector_name_prefix, "Azure VPN ipsec tunnel status") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = data('fame.azure.virtual_network_gateway.tunnel_status', filter=${module.filtering.signalflow})${var.tunnel_status_aggregation_function}${var.tunnel_status_transformation_function}.publish('signal') + detect(when(signal == ${var.tunnel_status_threshold_critical}%{if var.tunnel_status_lasting_duration_critical != null}, lasting='${var.tunnel_status_lasting_duration_critical}', at_least=${var.tunnel_status_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal == ${var.tunnel_status_threshold_major}%{if var.tunnel_status_lasting_duration_major != null}, lasting='${var.tunnel_status_lasting_duration_major}', at_least=${var.tunnel_status_at_least_percentage_major}%{endif}) and (not when(signal == ${var.tunnel_status_threshold_critical}%{if var.tunnel_status_lasting_duration_critical != null}, lasting='${var.tunnel_status_lasting_duration_critical}', at_least=${var.tunnel_status_at_least_percentage_critical}%{endif}))).publish('MAJOR') 
+EOF + + rule { + description = "is == ${var.tunnel_status_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.tunnel_status_disabled_critical, var.tunnel_status_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.tunnel_status_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.tunnel_status_runbook_url, var.runbook_url), "") + tip = var.tunnel_status_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is == ${var.tunnel_status_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.tunnel_status_disabled_major, var.tunnel_status_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.tunnel_status_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.tunnel_status_runbook_url, var.runbook_url), "") + tip = var.tunnel_status_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.tunnel_status_max_delay +} + diff --git a/modules/fame_azure-vpn/outputs.tf b/modules/fame_azure-vpn/outputs.tf index d32aff46c..1e4cbbfaf 100644 --- a/modules/fame_azure-vpn/outputs.tf +++ b/modules/fame_azure-vpn/outputs.tf @@ -8,3 +8,8 @@ output "totalflowcount" { value = signalfx_detector.totalflowcount } +output "tunnel_status" { + description = "Detector resource for tunnel_status" + value = signalfx_detector.tunnel_status +} + diff --git a/modules/fame_azure-vpn/variables-gen.tf b/modules/fame_azure-vpn/variables-gen.tf index 3207d78e5..3f4a0b9b0 100644 --- a/modules/fame_azure-vpn/variables-gen.tf +++ b/modules/fame_azure-vpn/variables-gen.tf @@ -89,7 +89,7 @@ variable "totalflowcount_runbook_url" { variable "totalflowcount_disabled" { description = "Disable all alerting rules for totalflowcount detector" type = bool - default = null + default = true } variable "totalflowcount_threshold_critical" { @@ -109,3 +109,93 @@ variable "totalflowcount_at_least_percentage_critical" { type = number default = 1 } +# tunnel_status detector + +variable "tunnel_status_notifications" { + description = "Notification recipients list per severity overridden for tunnel_status detector" + type = map(list(string)) + default = {} +} + +variable "tunnel_status_aggregation_function" { + description = "Aggregation function and group by for tunnel_status detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".mean(by=['azure_resource_group', 'azure_resource_name', 'remote_ip'])" +} + +variable "tunnel_status_transformation_function" { + description = "Transformation function for tunnel_status detector (i.e. 
\".mean(over='5m')\")" + type = string + default = "" +} + +variable "tunnel_status_max_delay" { + description = "Enforce max delay for tunnel_status detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "tunnel_status_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "tunnel_status_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "tunnel_status_disabled" { + description = "Disable all alerting rules for tunnel_status detector" + type = bool + default = null +} + +variable "tunnel_status_disabled_critical" { + description = "Disable critical alerting rule for tunnel_status detector" + type = bool + default = null +} + +variable "tunnel_status_disabled_major" { + description = "Disable major alerting rule for tunnel_status detector" + type = bool + default = null +} + +variable "tunnel_status_threshold_critical" { + description = "Critical threshold for tunnel_status detector" + type = number + default = 0 +} + +variable "tunnel_status_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "20m" +} + +variable "tunnel_status_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "tunnel_status_threshold_major" { + description = "Major threshold for tunnel_status detector" + type = number + default = 0 +} + +variable "tunnel_status_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "10m" +} + +variable "tunnel_status_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true 
before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} diff --git a/modules/integration_azure-key-vault/conf/02-api-latency.yaml b/modules/integration_azure-key-vault/conf/02-api-latency.yaml index 1e77c3be7..770c16f2f 100644 --- a/modules/integration_azure-key-vault/conf/02-api-latency.yaml +++ b/modules/integration_azure-key-vault/conf/02-api-latency.yaml @@ -15,9 +15,11 @@ rules: threshold: 500 comparator: ">" lasting_duration: '1h' + disabled: true minor: threshold: 500 comparator: ">" lasting_duration: '30m' dependency: major + disabled: true ... diff --git a/modules/integration_azure-key-vault/variables-gen.tf b/modules/integration_azure-key-vault/variables-gen.tf index 085c7c21b..1ff032cfd 100644 --- a/modules/integration_azure-key-vault/variables-gen.tf +++ b/modules/integration_azure-key-vault/variables-gen.tf @@ -135,13 +135,13 @@ variable "api_latency_disabled" { variable "api_latency_disabled_major" { description = "Disable major alerting rule for api_latency detector" type = bool - default = null + default = true } variable "api_latency_disabled_minor" { description = "Disable minor alerting rule for api_latency detector" type = bool - default = null + default = true } variable "api_latency_threshold_major" { diff --git a/modules/integration_azure-load-balancer/README.md b/modules/integration_azure-load-balancer/README.md index c4a658d72..39a98b658 100644 --- a/modules/integration_azure-load-balancer/README.md +++ b/modules/integration_azure-load-balancer/README.md @@ -8,6 +8,8 @@ - [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module) - [How to collect required metrics?](#how-to-collect-required-metrics) - [Metrics](#metrics) +- [Notes](#notes) + - [About Healthprobe detector](#about-healthprobe-detector) - [Related documentation](#related-documentation) @@ -76,6 +78,7 @@ This module creates the following SignalFx detectors which could contain one or 
|Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| |Azure Load Balancer heartbeat|X|-|-|-|-| +|Azure Load Balancer backend unhealthy host ratio|X|X|-|-|-| ## How to collect required metrics? @@ -94,9 +97,14 @@ Check the [Related documentation](#related-documentation) section for more detai Here is the list of required metrics for detectors in this module. * `ByteCount` +* `DipAvailability` +## Notes +### About Healthprobe detector + +The health probe detector is only available for load balancers with the Standard SKU. See [documentation](https://learn.microsoft.com/en-us/azure/load-balancer/skus). ## Related documentation diff --git a/modules/integration_azure-load-balancer/conf/01-healthprobe.yaml b/modules/integration_azure-load-balancer/conf/01-healthprobe.yaml new file mode 100644 index 000000000..713d3e01a --- /dev/null +++ b/modules/integration_azure-load-balancer/conf/01-healthprobe.yaml @@ -0,0 +1,21 @@ +--- +module: "Azure Load Balancer" +name: backend unhealthy host ratio +filtering: "filter('resource_type', 'Microsoft.Network/loadBalancers') and filter('primary_aggregation_type', 'true')" +aggregation: ".max(by=['BackendIPAddress', 'azure_resource_name', 'azure_resource_group_name', 'azure_region'])" +value_unit: "%" +transformation: true +signals: + signal: + metric: "DipAvailability" +rules: + critical: + threshold: 50 + comparator: "<" + lasting_duration: '10m' + major: + threshold: 100 + comparator: "<" + lasting_duration: '10m' + dependency: critical +... 
\ No newline at end of file diff --git a/modules/integration_azure-load-balancer/conf/readme.yaml b/modules/integration_azure-load-balancer/conf/readme.yaml index a2bdda75f..6efaaba8c 100644 --- a/modules/integration_azure-load-balancer/conf/readme.yaml +++ b/modules/integration_azure-load-balancer/conf/readme.yaml @@ -1,3 +1,8 @@ documentations: - name: Azure Monitor metrics url: 'https://learn.microsoft.com/en-us/azure/azure-monitor/reference/supported-metrics/microsoft-network-loadbalancers-metrics' + +notes: | + ### About Healthprobe detector + + The health probe detector is only available for load balancers with the Standard SKU. See [documentation](https://learn.microsoft.com/en-us/azure/load-balancer/skus). \ No newline at end of file diff --git a/modules/integration_azure-load-balancer/detectors-gen.tf b/modules/integration_azure-load-balancer/detectors-gen.tf index 417daab88..2630d892d 100644 --- a/modules/integration_azure-load-balancer/detectors-gen.tf +++ b/modules/integration_azure-load-balancer/detectors-gen.tf @@ -27,3 +27,49 @@ EOF max_delay = var.heartbeat_max_delay } +resource "signalfx_detector" "backend_unhealthy_host_ratio" { + name = format("%s %s", local.detector_name_prefix, "Azure Load Balancer backend unhealthy host ratio") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + viz_options { + label = "signal" + value_suffix = "%" + } + + program_text = <<-EOF + base_filtering = filter('resource_type', 'Microsoft.Network/loadBalancers') and filter('primary_aggregation_type', 'true') + signal = data('DipAvailability', filter=base_filtering and ${module.filtering.signalflow})${var.backend_unhealthy_host_ratio_aggregation_function}${var.backend_unhealthy_host_ratio_transformation_function}.publish('signal') + detect(when(signal < ${var.backend_unhealthy_host_ratio_threshold_critical}%{if 
var.backend_unhealthy_host_ratio_lasting_duration_critical != null}, lasting='${var.backend_unhealthy_host_ratio_lasting_duration_critical}', at_least=${var.backend_unhealthy_host_ratio_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal < ${var.backend_unhealthy_host_ratio_threshold_major}%{if var.backend_unhealthy_host_ratio_lasting_duration_major != null}, lasting='${var.backend_unhealthy_host_ratio_lasting_duration_major}', at_least=${var.backend_unhealthy_host_ratio_at_least_percentage_major}%{endif}) and (not when(signal < ${var.backend_unhealthy_host_ratio_threshold_critical}%{if var.backend_unhealthy_host_ratio_lasting_duration_critical != null}, lasting='${var.backend_unhealthy_host_ratio_lasting_duration_critical}', at_least=${var.backend_unhealthy_host_ratio_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too low < ${var.backend_unhealthy_host_ratio_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.backend_unhealthy_host_ratio_disabled_critical, var.backend_unhealthy_host_ratio_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.backend_unhealthy_host_ratio_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.backend_unhealthy_host_ratio_runbook_url, var.runbook_url), "") + tip = var.backend_unhealthy_host_ratio_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + rule { + description = "is too low < ${var.backend_unhealthy_host_ratio_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.backend_unhealthy_host_ratio_disabled_major, var.backend_unhealthy_host_ratio_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.backend_unhealthy_host_ratio_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.backend_unhealthy_host_ratio_runbook_url, var.runbook_url), "") + tip = var.backend_unhealthy_host_ratio_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.backend_unhealthy_host_ratio_max_delay +} + diff --git a/modules/integration_azure-load-balancer/outputs.tf b/modules/integration_azure-load-balancer/outputs.tf index 606361b57..4a4e4d4ed 100644 --- a/modules/integration_azure-load-balancer/outputs.tf +++ b/modules/integration_azure-load-balancer/outputs.tf @@ -1,3 +1,8 @@ +output "backend_unhealthy_host_ratio" { + description = "Detector resource for backend_unhealthy_host_ratio" + value = signalfx_detector.backend_unhealthy_host_ratio +} + output "heartbeat" { description = "Detector resource for heartbeat" value = signalfx_detector.heartbeat diff --git a/modules/integration_azure-load-balancer/variables-gen.tf b/modules/integration_azure-load-balancer/variables-gen.tf index 864f6ea77..caa6df828 100644 --- a/modules/integration_azure-load-balancer/variables-gen.tf +++ b/modules/integration_azure-load-balancer/variables-gen.tf @@ -48,3 +48,93 @@ variable "heartbeat_timeframe" { default = "25m" } +# backend_unhealthy_host_ratio detector + +variable "backend_unhealthy_host_ratio_notifications" { + description = "Notification recipients list per severity overridden for backend_unhealthy_host_ratio detector" + type = map(list(string)) 
+ default = {} +} + +variable "backend_unhealthy_host_ratio_aggregation_function" { + description = "Aggregation function and group by for backend_unhealthy_host_ratio detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".max(by=['BackendIPAddress', 'azure_resource_name', 'azure_resource_group_name', 'azure_region'])" +} + +variable "backend_unhealthy_host_ratio_transformation_function" { + description = "Transformation function for backend_unhealthy_host_ratio detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} + +variable "backend_unhealthy_host_ratio_max_delay" { + description = "Enforce max delay for backend_unhealthy_host_ratio detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "backend_unhealthy_host_ratio_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "backend_unhealthy_host_ratio_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "backend_unhealthy_host_ratio_disabled" { + description = "Disable all alerting rules for backend_unhealthy_host_ratio detector" + type = bool + default = null +} + +variable "backend_unhealthy_host_ratio_disabled_critical" { + description = "Disable critical alerting rule for backend_unhealthy_host_ratio detector" + type = bool + default = null +} + +variable "backend_unhealthy_host_ratio_disabled_major" { + description = "Disable major alerting rule for backend_unhealthy_host_ratio detector" + type = bool + default = null +} + +variable "backend_unhealthy_host_ratio_threshold_critical" { + description = "Critical threshold for backend_unhealthy_host_ratio detector in %" + type = number + default = 50 +} + +variable "backend_unhealthy_host_ratio_lasting_duration_critical" { + description = "Minimum duration that conditions must be true 
before raising alert" + type = string + default = "10m" +} + +variable "backend_unhealthy_host_ratio_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "backend_unhealthy_host_ratio_threshold_major" { + description = "Major threshold for backend_unhealthy_host_ratio detector in %" + type = number + default = 100 +} + +variable "backend_unhealthy_host_ratio_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "10m" +} + +variable "backend_unhealthy_host_ratio_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +}