diff --git a/docs/severity.md b/docs/severity.md index fbdfec111..780718ac9 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -212,6 +212,7 @@ |AWS ELB backend 4xx error rate|X|X|-|-|-| |AWS ELB backend 5xx error rate|X|X|-|-|-| |AWS ELB backend latency|X|X|-|-|-| +|AWS ELB unhealthy instances|X|-|-|-|-| ## aws-kinesis-firehose diff --git a/modules/integration_aws-elb/README.md b/modules/integration_aws-elb/README.md index 615dff52b..84c5bd014 100644 --- a/modules/integration_aws-elb/README.md +++ b/modules/integration_aws-elb/README.md @@ -57,7 +57,7 @@ Note the following parameters: These 3 parameters alongs with all variables defined in [common-variables.tf](common-variables.tf) are common to all [modules](../) in this repository. Other variables, specific to this module, are available in -[variables.tf](variables.tf). +[variables.tf](variables.tf) and [variables-gen.tf](variables-gen.tf). In general, the default configuration "works" but all of these Terraform [variables](https://www.terraform.io/docs/configuration/variables.html) make it possible to customize the detectors behavior to better fit your needs. @@ -82,6 +82,7 @@ This module creates the following SignalFx detectors which could contain one or |AWS ELB backend 4xx error rate|X|X|-|-|-| |AWS ELB backend 5xx error rate|X|X|-|-|-| |AWS ELB backend latency|X|X|-|-|-| +|AWS ELB unhealthy instances|X|-|-|-|-| ## How to collect required metrics? 
diff --git a/modules/integration_aws-elb/conf/01-unhealthy-instances.yaml b/modules/integration_aws-elb/conf/01-unhealthy-instances.yaml
new file mode 100644
index 000000000..d45058cd3
--- /dev/null
+++ b/modules/integration_aws-elb/conf/01-unhealthy-instances.yaml
@@ -0,0 +1,12 @@
+module: "AWS ELB" # "module" + "name" read "AWS ELB unhealthy instances", the detector title used in detectors-gen.tf
+name: "unhealthy instances"
+id: unhealthy_instances_absolute # same identifier as the Terraform resource, output and variable prefix
+transformation: ".min(over='10m')" # same string as the default of unhealthy_instances_absolute_transformation_function
+signals:
+  signal:
+    metric: UnHealthyHostCount
+    filter: "filter('namespace', 'AWS/ELB') and filter('stat', 'upper') and (not filter('AvailabilityZone', '*'))" # NOTE(review): the AvailabilityZone exclusion presumably keeps only load-balancer-level series - confirm
+rules:
+  critical:
+    threshold: 1 # same value as the default of unhealthy_instances_absolute_threshold_critical
+    comparator: ">=" # the rule fires when the signal is >= threshold
diff --git a/modules/integration_aws-elb/detectors-gen.tf b/modules/integration_aws-elb/detectors-gen.tf
new file mode 100644
index 000000000..9159c8959
--- /dev/null
+++ b/modules/integration_aws-elb/detectors-gen.tf
@@ -0,0 +1,25 @@
+resource "signalfx_detector" "unhealthy_instances_absolute" { # detector with one Critical rule on the AWS/ELB UnHealthyHostCount metric
+  name = format("%s %s", local.detector_name_prefix, "AWS ELB unhealthy instances")
+
+  authorized_writer_teams = var.authorized_writer_teams
+  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null) # first non-empty list; null when coalescelist fails because both are empty
+  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags)) # merge the three tag lists and drop empty strings
+
+  program_text = <<-EOF
+    signal = data('UnHealthyHostCount', filter=filter('namespace', 'AWS/ELB') and filter('stat', 'upper') and (not filter('AvailabilityZone', '*')) and ${module.filtering.signalflow})${var.unhealthy_instances_absolute_aggregation_function}${var.unhealthy_instances_absolute_transformation_function}.publish('signal')
+    detect(when(signal >= ${var.unhealthy_instances_absolute_threshold_critical})).publish('CRIT')
+EOF
+
+  rule {
+    description           = "is too high >= ${var.unhealthy_instances_absolute_threshold_critical}"
+    severity              = "Critical"
+    detect_label          = "CRIT" # must match the label published by detect() in program_text
+    disabled              = coalesce(var.unhealthy_instances_absolute_disabled, var.detectors_disabled) # per-detector flag wins; when it is null, var.detectors_disabled applies
+    notifications         = coalescelist(lookup(var.unhealthy_instances_absolute_notifications, "critical", []), var.notifications.critical) # per-detector "critical" recipients if any, otherwise var.notifications.critical
+    runbook_url           = try(coalesce(var.unhealthy_instances_absolute_runbook_url, var.runbook_url), "") # first non-empty URL; "" when coalesce fails because both are empty
+    tip                   = var.unhealthy_instances_absolute_tip
+    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject # a non-empty var.message_subject overrides local.rule_subject
+    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body # a non-empty var.message_body overrides local.rule_body
+  }
+}
+
diff --git a/modules/integration_aws-elb/outputs.tf b/modules/integration_aws-elb/outputs.tf
index c6dcb8983..1d153d2e1 100644
--- a/modules/integration_aws-elb/outputs.tf
+++ b/modules/integration_aws-elb/outputs.tf
@@ -33,3 +33,8 @@ output "no_healthy_instances" {
   value = signalfx_detector.no_healthy_instances
 }
 
+output "unhealthy_instances_absolute" { # exposes the whole signalfx_detector.unhealthy_instances_absolute resource
+  description = "Detector resource for unhealthy_instances_absolute"
+  value       = signalfx_detector.unhealthy_instances_absolute
+}
+
diff --git a/modules/integration_aws-elb/variables-gen.tf b/modules/integration_aws-elb/variables-gen.tf
new file mode 100644
index 000000000..18dfad25b
--- /dev/null
+++ b/modules/integration_aws-elb/variables-gen.tf
@@ -0,0 +1,44 @@
+# unhealthy_instances_absolute detector
+
+variable "unhealthy_instances_absolute_notifications" {
+  description = "Notification recipients list per severity overridden for unhealthy_instances_absolute detector"
+  type        = map(list(string))
+  default     = {} # no "critical" key: the rule falls back to var.notifications.critical
+}
+
+variable "unhealthy_instances_absolute_aggregation_function" {
+  description = "Aggregation function and group by for unhealthy_instances_absolute detector (i.e. \".mean(by=['host'])\")"
+  type        = string
+  default     = "" # empty: nothing is inserted between data(...) and the transformation in program_text
+}
+
+variable "unhealthy_instances_absolute_transformation_function" {
+  description = "Transformation function for unhealthy_instances_absolute detector (i.e. \".mean(over='5m')\")"
+  type        = string
+  default     = ".min(over='10m')" # appended to the signal just before .publish('signal')
+}
+
+variable "unhealthy_instances_absolute_tip" {
+  description = "Suggested first course of action or any note useful for incident handling"
+  type        = string
+  default     = "" # passed unchanged to the rule's "tip"
+}
+
+variable "unhealthy_instances_absolute_runbook_url" {
+  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
+  type        = string
+  default     = "" # empty: the rule uses var.runbook_url instead, or "" when that is empty too
+}
+
+variable "unhealthy_instances_absolute_disabled" {
+  description = "Disable all alerting rules for unhealthy_instances_absolute detector"
+  type        = bool
+  default     = null # null: the rule follows var.detectors_disabled
+}
+
+variable "unhealthy_instances_absolute_threshold_critical" {
+  description = "Critical threshold for unhealthy_instances_absolute detector"
+  type        = number
+  default     = 1 # compared with ">=" in the detect() condition
+}
+