From 93dd4171224375844614f28c73cc55e7be0feba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?WAGET=20K=C3=A9vin?= Date: Mon, 27 May 2024 11:05:24 +0200 Subject: [PATCH] Integration aws elasticache redis database capacity usage percentage (#558) * add database capacity unit detector * add database capacity unit detector - fix * add database capacity unit detector - fix * add database capacity unit detector - fix * add database capacity usage detector - fix * add database capacity usage detector - fix * add database capacity usage detector - fix --------- Co-authored-by: Kevin Waget --- docs/severity.md | 1 + .../README.md | 2 + .../conf/06-database-capacity-usage.yaml | 21 +++++ .../detectors-gen.tf | 41 +++++++++ .../outputs.tf | 5 ++ .../variables-gen.tf | 90 +++++++++++++++++++ 6 files changed, 160 insertions(+) create mode 100644 modules/integration_aws-elasticache-redis/conf/06-database-capacity-usage.yaml diff --git a/docs/severity.md b/docs/severity.md index 2fd723d0b..22752ae50 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -290,6 +290,7 @@ |AWS ElastiCache redis replication lag|X|X|-|-|-| |AWS ElastiCache redis commands|-|X|-|-|-| |AWS ElastiCache redis network conntrack allowance exceeded|X|-|-|-|-| +|AWS ElastiCache redis database capacity usage|X|X|-|-|-| ## integration_aws-elasticsearch diff --git a/modules/integration_aws-elasticache-redis/README.md b/modules/integration_aws-elasticache-redis/README.md index 8cf275d5d..351168242 100644 --- a/modules/integration_aws-elasticache-redis/README.md +++ b/modules/integration_aws-elasticache-redis/README.md @@ -80,6 +80,7 @@ This module creates the following SignalFx detectors which could contain one or |AWS ElastiCache redis replication lag|X|X|-|-|-| |AWS ElastiCache redis commands|-|X|-|-|-| |AWS ElastiCache redis network conntrack allowance exceeded|X|-|-|-|-| +|AWS ElastiCache redis database capacity usage|X|X|-|-|-| ## How to collect required metrics? @@ -99,6 +100,7 @@ Here is the list of required metrics for detectors in this module. * `CacheHits` * `CacheMisses` +* `DatabaseCapacityUsagePercentage` * `EngineCPUUtilization` * `GetTypeCmds` * `NetworkConntrackAllowanceExceeded` diff --git a/modules/integration_aws-elasticache-redis/conf/06-database-capacity-usage.yaml b/modules/integration_aws-elasticache-redis/conf/06-database-capacity-usage.yaml new file mode 100644 index 000000000..5ab2308a5 --- /dev/null +++ b/modules/integration_aws-elasticache-redis/conf/06-database-capacity-usage.yaml @@ -0,0 +1,21 @@ +module: AWS ElastiCache redis +name: database capacity usage + +transformation: true +aggregation: true +filtering: "filter('namespace', 'AWS/ElastiCache') and filter('stat', 'upper') and filter('CacheNodeId', '*')" + +signals: + signal: + metric: DatabaseCapacityUsagePercentage + +rules: + critical: + threshold: 90 + comparator: ">" + lasting_duration: 10m + major: + threshold: 80 + comparator: ">" + dependency: critical + lasting_duration: 10m diff --git a/modules/integration_aws-elasticache-redis/detectors-gen.tf b/modules/integration_aws-elasticache-redis/detectors-gen.tf index 8a18e0606..798cb0455 100644 --- a/modules/integration_aws-elasticache-redis/detectors-gen.tf +++ b/modules/integration_aws-elasticache-redis/detectors-gen.tf @@ -196,3 +196,44 @@ EOF max_delay = var.network_conntrack_allowance_exceeded_max_delay } +resource "signalfx_detector" "database_capacity_usage" { + name = format("%s %s", local.detector_name_prefix, "AWS ElastiCache redis database capacity usage") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('namespace', 'AWS/ElastiCache') and filter('stat', 'upper') and filter('CacheNodeId', '*') + signal = data('DatabaseCapacityUsagePercentage', filter=base_filtering and ${module.filtering.signalflow})${var.database_capacity_usage_aggregation_function}${var.database_capacity_usage_transformation_function}.publish('signal') + detect(when(signal > ${var.database_capacity_usage_threshold_critical}%{if var.database_capacity_usage_lasting_duration_critical != null}, lasting='${var.database_capacity_usage_lasting_duration_critical}', at_least=${var.database_capacity_usage_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.database_capacity_usage_threshold_major}%{if var.database_capacity_usage_lasting_duration_major != null}, lasting='${var.database_capacity_usage_lasting_duration_major}', at_least=${var.database_capacity_usage_at_least_percentage_major}%{endif}) and (not when(signal > ${var.database_capacity_usage_threshold_critical}%{if var.database_capacity_usage_lasting_duration_critical != null}, lasting='${var.database_capacity_usage_lasting_duration_critical}', at_least=${var.database_capacity_usage_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.database_capacity_usage_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.database_capacity_usage_disabled_critical, var.database_capacity_usage_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.database_capacity_usage_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.database_capacity_usage_runbook_url, var.runbook_url), "") + tip = var.database_capacity_usage_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.database_capacity_usage_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.database_capacity_usage_disabled_major, var.database_capacity_usage_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.database_capacity_usage_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.database_capacity_usage_runbook_url, var.runbook_url), "") + tip = var.database_capacity_usage_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.database_capacity_usage_max_delay +} + diff --git a/modules/integration_aws-elasticache-redis/outputs.tf b/modules/integration_aws-elasticache-redis/outputs.tf index ed887cc33..81441506c 100644 --- a/modules/integration_aws-elasticache-redis/outputs.tf +++ b/modules/integration_aws-elasticache-redis/outputs.tf @@ -13,6 +13,11 @@ output "cpu_high" { value = signalfx_detector.cpu_high } +output "database_capacity_usage" { + description = "Detector resource for database_capacity_usage" + value = signalfx_detector.database_capacity_usage +} + output "network_conntrack_allowance_exceeded" { description = "Detector resource for network_conntrack_allowance_exceeded" value = signalfx_detector.network_conntrack_allowance_exceeded diff --git a/modules/integration_aws-elasticache-redis/variables-gen.tf b/modules/integration_aws-elasticache-redis/variables-gen.tf index c830cabe3..29eee3542 100644 --- a/modules/integration_aws-elasticache-redis/variables-gen.tf +++ b/modules/integration_aws-elasticache-redis/variables-gen.tf @@ -390,3 +390,93 @@ variable "network_conntrack_allowance_exceeded_at_least_percentage_critical" { type = number default = 1 } +# database_capacity_usage detector + +variable "database_capacity_usage_notifications" { + description = "Notification recipients list per severity overridden for database_capacity_usage detector" + type = map(list(string)) + default = {} +} + +variable "database_capacity_usage_aggregation_function" { + description = "Aggregation function and group by for database_capacity_usage detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "database_capacity_usage_transformation_function" { + description = "Transformation function for database_capacity_usage detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} + +variable "database_capacity_usage_max_delay" { + description = "Enforce max delay for database_capacity_usage detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "database_capacity_usage_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "database_capacity_usage_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "database_capacity_usage_disabled" { + description = "Disable all alerting rules for database_capacity_usage detector" + type = bool + default = null +} + +variable "database_capacity_usage_disabled_critical" { + description = "Disable critical alerting rule for database_capacity_usage detector" + type = bool + default = null +} + +variable "database_capacity_usage_disabled_major" { + description = "Disable major alerting rule for database_capacity_usage detector" + type = bool + default = null +} + +variable "database_capacity_usage_threshold_critical" { + description = "Critical threshold for database_capacity_usage detector" + type = number + default = 90 +} + +variable "database_capacity_usage_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "10m" +} + +variable "database_capacity_usage_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "database_capacity_usage_threshold_major" { + description = "Major threshold for database_capacity_usage detector" + type = number + default = 80 +} + +variable "database_capacity_usage_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "10m" +} + +variable "database_capacity_usage_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +}