Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Azure Add loadbalancer healthprobe status detector #527

Merged
merged 14 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/severity.md
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,7 @@
|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Azure Load Balancer heartbeat|X|-|-|-|-|
|Azure Load Balancer backend unhealthy host ratio|X|X|-|-|-|


## integration_azure-mariadb
Expand Down
8 changes: 8 additions & 0 deletions modules/integration_azure-load-balancer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
- [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module)
- [How to collect required metrics?](#how-to-collect-required-metrics)
- [Metrics](#metrics)
- [Notes](#notes)
- [About Healthprobe detector](#about-healthprobe-detector)
- [Related documentation](#related-documentation)

<!-- END doctoc generated TOC please keep comment here to allow auto update -->
Expand Down Expand Up @@ -76,6 +78,7 @@ This module creates the following SignalFx detectors which could contain one or
|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Azure Load Balancer heartbeat|X|-|-|-|-|
|Azure Load Balancer backend unhealthy host ratio|X|X|-|-|-|

## How to collect required metrics?

Expand All @@ -94,9 +97,14 @@ Check the [Related documentation](#related-documentation) section for more detai
Here is the list of required metrics for detectors in this module.

* `ByteCount`
* `DipAvailability`


## Notes

### About Healthprobe detector

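The health probe detector (`backend unhealthy host ratio`, based on the `DipAvailability` metric) is only available for load balancers using the Standard SKU. See the [Azure Load Balancer SKUs documentation](https://learn.microsoft.com/en-us/azure/load-balancer/skus).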

## Related documentation

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
# Detector definition for the Azure Load Balancer "backend unhealthy host ratio" detector.
# NOTE(review): despite the name, the signal is the `DipAvailability` metric and both
# rules use the "<" comparator, i.e. they fire when the value is too LOW.
module: "Azure Load Balancer"
name: "backend unhealthy host ratio"
# Restrict to Azure Load Balancer series tagged primary_aggregation_type=true.
filtering: "filter('resource_type', 'Microsoft.Network/loadBalancers') and filter('primary_aggregation_type', 'true')"
# Group by backend IP address and by load balancer resource / resource group / region.
aggregation: ".max(by=['BackendIPAddress', 'azure_resource_name', 'azure_resource_group_name', 'azure_region'])"
value_unit: "%"
# presumably exposes a transformation function variable for this detector — TODO confirm against the generator
transformation: true
signals:
  signal:
    metric: "DipAvailability"
rules:
  critical:
    threshold: 50
    comparator: "<"
    lasting_duration: "10m"
  major:
    threshold: 100
    comparator: "<"
    lasting_duration: "10m"
    # presumably what makes the major rule exclude the critical condition — TODO confirm against the generator
    dependency: "critical"
...
5 changes: 5 additions & 0 deletions modules/integration_azure-load-balancer/conf/readme.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
documentations:
- name: Azure Monitor metrics
url: 'https://learn.microsoft.com/en-us/azure/azure-monitor/reference/supported-metrics/microsoft-network-loadbalancers-metrics'

notes: |
### About Healthprobe detector

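The health probe detector (`backend unhealthy host ratio`, based on the `DipAvailability` metric) is only available for load balancers using the Standard SKU. See the [Azure Load Balancer SKUs documentation](https://learn.microsoft.com/en-us/azure/load-balancer/skus).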
46 changes: 46 additions & 0 deletions modules/integration_azure-load-balancer/detectors-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,49 @@ EOF
max_delay = var.heartbeat_max_delay
}

# Detector built on the Azure Load Balancer `DipAvailability` metric.
# The SignalFlow program publishes two detect streams, 'CRIT' and 'MAJOR', each
# bound to one rule below. Both fire when the signal is strictly below their
# threshold; 'MAJOR' is additionally suppressed while the critical condition holds.
# NOTE(review): the detector is named "unhealthy host ratio" but alerts on a value
# that is too LOW — confirm the naming is intentional.
resource "signalfx_detector" "backend_unhealthy_host_ratio" {
  name = format("%s %s", local.detector_name_prefix, "Azure Load Balancer backend unhealthy host ratio")

  # `teams` takes the first non-empty list among var.teams and
  # var.authorized_writer_teams; try() yields null if coalescelist fails
  # (e.g. both lists are empty).
  authorized_writer_teams = var.authorized_writer_teams
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))

  viz_options {
    label        = "signal"
    value_suffix = "%"
  }

  # The aggregation and transformation variables are appended verbatim to data(...).
  # The `lasting=` / `at_least=` arguments are only rendered when the matching
  # lasting_duration variable is not null.
  program_text = <<-EOF
    base_filtering = filter('resource_type', 'Microsoft.Network/loadBalancers') and filter('primary_aggregation_type', 'true')
    signal = data('DipAvailability', filter=base_filtering and ${module.filtering.signalflow})${var.backend_unhealthy_host_ratio_aggregation_function}${var.backend_unhealthy_host_ratio_transformation_function}.publish('signal')
    detect(when(signal < ${var.backend_unhealthy_host_ratio_threshold_critical}%{if var.backend_unhealthy_host_ratio_lasting_duration_critical != null}, lasting='${var.backend_unhealthy_host_ratio_lasting_duration_critical}', at_least=${var.backend_unhealthy_host_ratio_at_least_percentage_critical}%{endif})).publish('CRIT')
    detect(when(signal < ${var.backend_unhealthy_host_ratio_threshold_major}%{if var.backend_unhealthy_host_ratio_lasting_duration_major != null}, lasting='${var.backend_unhealthy_host_ratio_lasting_duration_major}', at_least=${var.backend_unhealthy_host_ratio_at_least_percentage_major}%{endif}) and (not when(signal < ${var.backend_unhealthy_host_ratio_threshold_critical}%{if var.backend_unhealthy_host_ratio_lasting_duration_critical != null}, lasting='${var.backend_unhealthy_host_ratio_lasting_duration_critical}', at_least=${var.backend_unhealthy_host_ratio_at_least_percentage_critical}%{endif}))).publish('MAJOR')
EOF

  # Critical rule, bound to the 'CRIT' detect stream.
  # disabled: first non-null of the per-severity flag, the per-detector flag and
  #   the module-wide var.detectors_disabled.
  # notifications: per-detector "critical" recipients when non-empty, otherwise
  #   var.notifications.critical; null if coalescelist fails.
  # runbook_url: coalesce skips null and empty strings, so an empty per-detector
  #   value falls back to var.runbook_url, then to "".
  rule {
    description           = "is too low < ${var.backend_unhealthy_host_ratio_threshold_critical}%"
    severity              = "Critical"
    detect_label          = "CRIT"
    disabled              = coalesce(var.backend_unhealthy_host_ratio_disabled_critical, var.backend_unhealthy_host_ratio_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.backend_unhealthy_host_ratio_notifications, "critical", []), var.notifications.critical), null)
    runbook_url           = try(coalesce(var.backend_unhealthy_host_ratio_runbook_url, var.runbook_url), "")
    tip                   = var.backend_unhealthy_host_ratio_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  # Major rule, bound to the 'MAJOR' detect stream; same fallback logic as the
  # critical rule, using the "major" keys and flags.
  rule {
    description           = "is too low < ${var.backend_unhealthy_host_ratio_threshold_major}%"
    severity              = "Major"
    detect_label          = "MAJOR"
    disabled              = coalesce(var.backend_unhealthy_host_ratio_disabled_major, var.backend_unhealthy_host_ratio_disabled, var.detectors_disabled)
    notifications         = try(coalescelist(lookup(var.backend_unhealthy_host_ratio_notifications, "major", []), var.notifications.major), null)
    runbook_url           = try(coalesce(var.backend_unhealthy_host_ratio_runbook_url, var.runbook_url), "")
    tip                   = var.backend_unhealthy_host_ratio_tip
    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
  }

  max_delay = var.backend_unhealthy_host_ratio_max_delay
}

5 changes: 5 additions & 0 deletions modules/integration_azure-load-balancer/outputs.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Exposes the whole signalfx_detector.backend_unhealthy_host_ratio resource object.
output "backend_unhealthy_host_ratio" {
  description = "Detector resource for backend_unhealthy_host_ratio"
  value       = signalfx_detector.backend_unhealthy_host_ratio
}

output "heartbeat" {
description = "Detector resource for heartbeat"
value = signalfx_detector.heartbeat
Expand Down
90 changes: 90 additions & 0 deletions modules/integration_azure-load-balancer/variables-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,93 @@ variable "heartbeat_timeframe" {
default = "25m"
}

# backend_unhealthy_host_ratio detector
#
# General settings: notification overrides, SignalFlow aggregation and
# transformation suffixes, max delay, tip and runbook URL.

# Map keyed by severity; the detector looks up the "critical" and "major" keys
# and falls back to the module-wide notifications when a key is missing or empty.
variable "backend_unhealthy_host_ratio_notifications" {
  description = "Notification recipients list per severity overridden for backend_unhealthy_host_ratio detector"
  type        = map(list(string))
  default     = {}
}

# Appended verbatim right after data(...) in the detector's SignalFlow program.
variable "backend_unhealthy_host_ratio_aggregation_function" {
  description = "Aggregation function and group by for backend_unhealthy_host_ratio detector (i.e. \".mean(by=['host'])\")"
  type        = string
  default     = ".max(by=['BackendIPAddress', 'azure_resource_name', 'azure_resource_group_name', 'azure_region'])"
}

# Appended verbatim after the aggregation function; the empty default adds nothing.
variable "backend_unhealthy_host_ratio_transformation_function" {
  description = "Transformation function for backend_unhealthy_host_ratio detector (i.e. \".mean(over='5m')\")"
  type        = string
  default     = ""
}

# Passed as-is to the detector's max_delay argument.
variable "backend_unhealthy_host_ratio_max_delay" {
  description = "Enforce max delay for backend_unhealthy_host_ratio detector (use \"0\" or \"null\" for \"Auto\")"
  type        = number
  default     = null
}

# Used as the `tip` of both the critical and the major rule.
variable "backend_unhealthy_host_ratio_tip" {
  description = "Suggested first course of action or any note useful for incident handling"
  type        = string
  default     = ""
}

# When empty, both rules fall back to the module-wide var.runbook_url.
variable "backend_unhealthy_host_ratio_runbook_url" {
  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
  type        = string
  default     = ""
}

# Disable flags. Each rule resolves its `disabled` value with
# coalesce(<per-severity flag>, <per-detector flag>, var.detectors_disabled),
# so the null defaults defer to the next, broader setting.

# Per-detector flag: applies to both rules unless a per-severity flag is set.
variable "backend_unhealthy_host_ratio_disabled" {
  description = "Disable all alerting rules for backend_unhealthy_host_ratio detector"
  type        = bool
  default     = null
}

# Per-severity flag for the critical rule; takes precedence over the per-detector flag.
variable "backend_unhealthy_host_ratio_disabled_critical" {
  description = "Disable critical alerting rule for backend_unhealthy_host_ratio detector"
  type        = bool
  default     = null
}

# Per-severity flag for the major rule; takes precedence over the per-detector flag.
variable "backend_unhealthy_host_ratio_disabled_major" {
  description = "Disable major alerting rule for backend_unhealthy_host_ratio detector"
  type        = bool
  default     = null
}

# Alerting conditions. Both rules fire when the signal is strictly below
# ("<") their threshold.

# Critical rule threshold.
variable "backend_unhealthy_host_ratio_threshold_critical" {
  description = "Critical threshold for backend_unhealthy_host_ratio detector in %"
  type        = number
  default     = 50
}

# When null, the `lasting=` and `at_least=` arguments are omitted from the
# critical when() clause.
variable "backend_unhealthy_host_ratio_lasting_duration_critical" {
  description = "Minimum duration that conditions must be true before raising alert"
  type        = string
  default     = "10m"
}

# Only rendered when the critical lasting duration is not null.
variable "backend_unhealthy_host_ratio_at_least_percentage_critical" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1
}

# Major rule threshold. With the default of 100, any value below 100 matches,
# but the major rule does not fire while the critical condition holds.
variable "backend_unhealthy_host_ratio_threshold_major" {
  description = "Major threshold for backend_unhealthy_host_ratio detector in %"
  type        = number
  default     = 100
}

# When null, the `lasting=` and `at_least=` arguments are omitted from the
# major when() clause.
variable "backend_unhealthy_host_ratio_lasting_duration_major" {
  description = "Minimum duration that conditions must be true before raising alert"
  type        = string
  default     = "10m"
}

# Only rendered when the major lasting duration is not null.
variable "backend_unhealthy_host_ratio_at_least_percentage_major" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1
}
Loading