From f5380368707fd68a9034c491914b5701a082737c Mon Sep 17 00:00:00 2001 From: Soufiane Date: Wed, 3 Jan 2024 15:46:05 +0100 Subject: [PATCH 01/16] Add zookeeper-health --- docs/severity.md | 1 + modules/smart-agent_zookeeper/README.md | 3 ++- modules/smart-agent_zookeeper/outputs.tf | 5 +++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/severity.md b/docs/severity.md index 30be3a0cd..d12a3801c 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -1311,6 +1311,7 @@ |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| +|Zookeeper zookeeper-health|X|-|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| |Zookeeper service health|X|-|-|-|-| |Zookeeper latency|X|X|-|-|-| diff --git a/modules/smart-agent_zookeeper/README.md b/modules/smart-agent_zookeeper/README.md index b4e0cbe78..c334a86a8 100644 --- a/modules/smart-agent_zookeeper/README.md +++ b/modules/smart-agent_zookeeper/README.md @@ -59,7 +59,7 @@ Note the following parameters: These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all [modules](../) in this repository. Other variables, specific to this module, are available in -[variables.tf](variables.tf). +[variables.tf](variables.tf) and [variables-gen.tf](variables-gen.tf). In general, the default configuration "works" but all of these Terraform [variables](https://www.terraform.io/language/values/variables) make it possible to customize the detectors behavior to better fit your needs. @@ -77,6 +77,7 @@ This module creates the following SignalFx detectors which could contain one or |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| +|Zookeeper zookeeper-health|X|-|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| |Zookeeper service health|X|-|-|-|-| |Zookeeper latency|X|X|-|-|-| diff --git a/modules/smart-agent_zookeeper/outputs.tf b/modules/smart-agent_zookeeper/outputs.tf index e6913132e..07839831d 100644 --- a/modules/smart-agent_zookeeper/outputs.tf +++ b/modules/smart-agent_zookeeper/outputs.tf @@ -8,6 +8,11 @@ output "heartbeat" { value = signalfx_detector.heartbeat } +output "zookeeper-health" { + description = "Detector resource for zookeeper-health" + value = signalfx_detector.zookeeper-health +} + output "zookeeper_health" { description = "Detector resource for zookeeper_health" value = signalfx_detector.zookeeper_health From b9fa05e080412cbce39266d7c8e74883f2963c65 Mon Sep 17 00:00:00 2001 From: Soufiane Date: Wed, 3 Jan 2024 15:50:41 +0100 Subject: [PATCH 02/16] Remove old zookeeper health --- modules/smart-agent_zookeeper/detectors-zookeeper.tf | 4 ++-- modules/smart-agent_zookeeper/outputs.tf | 5 ----- modules/smart-agent_zookeeper/variables.tf | 4 ++-- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/modules/smart-agent_zookeeper/detectors-zookeeper.tf b/modules/smart-agent_zookeeper/detectors-zookeeper.tf index 125012060..27d52b5ed 100644 --- a/modules/smart-agent_zookeeper/detectors-zookeeper.tf +++ b/modules/smart-agent_zookeeper/detectors-zookeeper.tf @@ -26,7 +26,7 @@ EOF max_delay = var.heartbeat_max_delay } -resource "signalfx_detector" "zookeeper_health" { +/*resource "signalfx_detector" "zookeeper_health" { name = format("%s %s", local.detector_name_prefix, "Zookeeper service health") authorized_writer_teams = var.authorized_writer_teams @@ -51,7 +51,7 @@ EOF } max_delay = var.zookeeper_health_max_delay -} +}*/ resource "signalfx_detector" "zookeeper_latency" { name = format("%s %s", local.detector_name_prefix, "Zookeeper latency") diff --git a/modules/smart-agent_zookeeper/outputs.tf b/modules/smart-agent_zookeeper/outputs.tf index 07839831d..b445a34f7 100644 --- a/modules/smart-agent_zookeeper/outputs.tf +++ b/modules/smart-agent_zookeeper/outputs.tf @@ -13,11 +13,6 @@ output "zookeeper-health" { value = signalfx_detector.zookeeper-health } -output "zookeeper_health" { - description = "Detector resource for zookeeper_health" - value = signalfx_detector.zookeeper_health -} - output "zookeeper_latency" { description = "Detector resource for zookeeper_latency" value = signalfx_detector.zookeeper_latency diff --git a/modules/smart-agent_zookeeper/variables.tf b/modules/smart-agent_zookeeper/variables.tf index 9b8032632..b1aeaed8c 100644 --- a/modules/smart-agent_zookeeper/variables.tf +++ b/modules/smart-agent_zookeeper/variables.tf @@ -44,7 +44,7 @@ variable "heartbeat_aggregation_function" { default = "" } -# zookeeper_health detector +/*# zookeeper_health detector variable "zookeeper_health_max_delay" { description = "Enforce max delay for zookeeper_health detector (use \"0\" or \"null\" for \"Auto\")" @@ -92,7 +92,7 @@ variable "zookeeper_health_transformation_function" { description = "Transformation function for zookeeper_health detector (i.e. \".mean(over='5m')\")" type = string default = ".mean(over='5m')" -} +}*/ # zookeeper_latency detector From 2f0b068bbfe8fe84d726d2cf6c5bf527da809a28 Mon Sep 17 00:00:00 2001 From: Soufiane Date: Wed, 3 Jan 2024 15:52:18 +0100 Subject: [PATCH 03/16] Remove old zookeeper health --- .../conf/00-zookeeper-health.yaml | 15 +++++ .../smart-agent_zookeeper/detectors-gen.tf | 27 +++++++++ .../smart-agent_zookeeper/variables-gen.tf | 55 +++++++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml create mode 100644 modules/smart-agent_zookeeper/detectors-gen.tf create mode 100644 modules/smart-agent_zookeeper/variables-gen.tf diff --git a/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml b/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml new file mode 100644 index 000000000..fd6a64e40 --- /dev/null +++ b/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml @@ -0,0 +1,15 @@ +module: zookeeper +name: zookeeper-health +transformation: false +aggregation: true +exclude_not_running_vm: true +disabled: true +signals: + signal: + metric: "gauge.zk_service_health" +rules: + critical: + threshold: 1 + comparator: "!=" + description: "is not running" + lasting_duration: "15m" \ No newline at end of file diff --git a/modules/smart-agent_zookeeper/detectors-gen.tf b/modules/smart-agent_zookeeper/detectors-gen.tf new file mode 100644 index 000000000..6cabb1a23 --- /dev/null +++ b/modules/smart-agent_zookeeper/detectors-gen.tf @@ -0,0 +1,27 @@ +resource "signalfx_detector" "zookeeper-health" { + name = format("%s %s", local.detector_name_prefix, "Zookeeper zookeeper-health") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = data('gauge.zk_service_health', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.zookeeper-health_aggregation_function}.publish('signal') + detect(when(signal != ${var.zookeeper-health_threshold_critical}, lasting=%{if var.zookeeper-health_lasting_duration_critical == null}None%{else}'${var.zookeeper-health_lasting_duration_critical}'%{endif}, at_least=${var.zookeeper-health_at_least_percentage_critical})).publish('CRIT') +EOF + + rule { + description = "is not running != ${var.zookeeper-health_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.zookeeper-health_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.zookeeper-health_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.zookeeper-health_runbook_url, var.runbook_url), "") + tip = var.zookeeper-health_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.zookeeper-health_max_delay +} + diff --git a/modules/smart-agent_zookeeper/variables-gen.tf b/modules/smart-agent_zookeeper/variables-gen.tf new file mode 100644 index 000000000..a1bda9717 --- /dev/null +++ b/modules/smart-agent_zookeeper/variables-gen.tf @@ -0,0 +1,55 @@ +# zookeeper-health detector + +variable "zookeeper-health_notifications" { + description = "Notification recipients list per severity overridden for zookeeper-health detector" + type = map(list(string)) + default = {} +} + +variable "zookeeper-health_aggregation_function" { + description = "Aggregation function and group by for zookeeper-health detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "zookeeper-health_max_delay" { + description = "Enforce max delay for zookeeper-health detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "zookeeper-health_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "zookeeper-health_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "zookeeper-health_disabled" { + description = "Disable all alerting rules for zookeeper-health detector" + type = bool + default = true +} + +variable "zookeeper-health_threshold_critical" { + description = "Critical threshold for zookeeper-health detector" + type = number + default = 1 +} + +variable "zookeeper-health_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "15m" +} + +variable "zookeeper-health_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} From 7bb2c6b300fcec0d01a13902bb8582e94115cb71 Mon Sep 17 00:00:00 2001 From: Soufiane Date: Wed, 3 Jan 2024 15:59:56 +0100 Subject: [PATCH 04/16] Remove old health ZK --- docs/severity.md | 1 - modules/smart-agent_zookeeper/README.md | 1 - 2 files changed, 2 deletions(-) diff --git a/docs/severity.md b/docs/severity.md index d12a3801c..e6c61e95d 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -1313,7 +1313,6 @@ |---|---|---|---|---|---| |Zookeeper zookeeper-health|X|-|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| -|Zookeeper service health|X|-|-|-|-| |Zookeeper latency|X|X|-|-|-| |Zookeeper file descriptors usage|X|X|-|-|-| diff --git a/modules/smart-agent_zookeeper/README.md b/modules/smart-agent_zookeeper/README.md index c334a86a8..f456c61ba 100644 --- a/modules/smart-agent_zookeeper/README.md +++ b/modules/smart-agent_zookeeper/README.md @@ -79,7 +79,6 @@ This module creates the following SignalFx detectors which could contain one or |---|---|---|---|---|---| |Zookeeper zookeeper-health|X|-|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| -|Zookeeper service health|X|-|-|-|-| |Zookeeper latency|X|X|-|-|-| |Zookeeper file descriptors usage|X|X|-|-|-| From f90dfaec44d8b7dd63a4f44d5e3d4640bb80a279 Mon Sep 17 00:00:00 2001 From: Soufiane Date: Wed, 3 Jan 2024 16:40:06 +0100 Subject: [PATCH 05/16] Enabled detector --- docs/severity.md | 1 + modules/smart-agent_zookeeper/README.md | 1 + modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml | 5 +++-- modules/smart-agent_zookeeper/variables-gen.tf | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/severity.md b/docs/severity.md index e6c61e95d..d12a3801c 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -1313,6 +1313,7 @@ |---|---|---|---|---|---| |Zookeeper zookeeper-health|X|-|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| +|Zookeeper service health|X|-|-|-|-| |Zookeeper latency|X|X|-|-|-| |Zookeeper file descriptors usage|X|X|-|-|-| diff --git a/modules/smart-agent_zookeeper/README.md b/modules/smart-agent_zookeeper/README.md index f456c61ba..c334a86a8 100644 --- a/modules/smart-agent_zookeeper/README.md +++ b/modules/smart-agent_zookeeper/README.md @@ -79,6 +79,7 @@ This module creates the following SignalFx detectors which could contain one or |---|---|---|---|---|---| |Zookeeper zookeeper-health|X|-|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| +|Zookeeper service health|X|-|-|-|-| |Zookeeper latency|X|X|-|-|-| |Zookeeper file descriptors usage|X|X|-|-|-| diff --git a/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml b/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml index fd6a64e40..f19376eb6 100644 --- a/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml +++ b/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml @@ -3,7 +3,7 @@ name: zookeeper-health transformation: false aggregation: true exclude_not_running_vm: true -disabled: true +disabled: false signals: signal: metric: "gauge.zk_service_health" @@ -12,4 +12,5 @@ rules: threshold: 1 comparator: "!=" description: "is not running" - lasting_duration: "15m" \ No newline at end of file + lasting_duration: "15m" + health_disabled: "false" \ No newline at end of file diff --git a/modules/smart-agent_zookeeper/variables-gen.tf b/modules/smart-agent_zookeeper/variables-gen.tf index a1bda9717..6ba71176e 100644 --- a/modules/smart-agent_zookeeper/variables-gen.tf +++ b/modules/smart-agent_zookeeper/variables-gen.tf @@ -33,7 +33,7 @@ variable "zookeeper-health_runbook_url" { variable "zookeeper-health_disabled" { description = "Disable all alerting rules for zookeeper-health detector" type = bool - default = true + default = null } variable "zookeeper-health_threshold_critical" { From a98dd62a784e693f6c21095642efbefc4eec2162 Mon Sep 17 00:00:00 2001 From: Soufiane Date: Wed, 3 Jan 2024 16:45:48 +0100 Subject: [PATCH 06/16] update duration --- modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml | 2 +- modules/smart-agent_zookeeper/variables-gen.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml b/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml index f19376eb6..320b98dfa 100644 --- a/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml +++ b/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml @@ -12,5 +12,5 @@ rules: threshold: 1 comparator: "!=" description: "is not running" - lasting_duration: "15m" + lasting_duration: "5m" health_disabled: "false" \ No newline at end of file diff --git a/modules/smart-agent_zookeeper/variables-gen.tf b/modules/smart-agent_zookeeper/variables-gen.tf index 6ba71176e..87e36774b 100644 --- a/modules/smart-agent_zookeeper/variables-gen.tf +++ b/modules/smart-agent_zookeeper/variables-gen.tf @@ -45,7 +45,7 @@ variable "zookeeper-health_threshold_critical" { variable "zookeeper-health_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = "15m" + default = "5m" } variable "zookeeper-health_at_least_percentage_critical" { From 2209c167458be90ac83312640c188b6db49115e6 Mon Sep 17 00:00:00 2001 From: Soufiane Date: Thu, 4 Jan 2024 11:21:42 +0100 Subject: [PATCH 07/16] Add zookeeper-latency module --- docs/severity.md | 1 + modules/smart-agent_zookeeper/README.md | 1 + .../conf/01-zookeeper-latency.yaml | 22 +++++ .../smart-agent_zookeeper/detectors-gen.tf | 40 +++++++++ .../detectors-zookeeper.tf | 4 +- modules/smart-agent_zookeeper/outputs.tf | 6 +- .../smart-agent_zookeeper/variables-gen.tf | 84 +++++++++++++++++++ modules/smart-agent_zookeeper/variables.tf | 4 +- 8 files changed, 155 insertions(+), 7 deletions(-) create mode 100644 modules/smart-agent_zookeeper/conf/01-zookeeper-latency.yaml diff --git a/docs/severity.md b/docs/severity.md index d12a3801c..9aa7e7daf 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -1312,6 +1312,7 @@ |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| |Zookeeper zookeeper-health|X|-|-|-|-| +|Zookeeper zookeeper-latency|X|X|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| |Zookeeper service health|X|-|-|-|-| |Zookeeper latency|X|X|-|-|-| diff --git a/modules/smart-agent_zookeeper/README.md b/modules/smart-agent_zookeeper/README.md index c334a86a8..e5f4a2dec 100644 --- a/modules/smart-agent_zookeeper/README.md +++ b/modules/smart-agent_zookeeper/README.md @@ -78,6 +78,7 @@ This module creates the following SignalFx detectors which could contain one or |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| |Zookeeper zookeeper-health|X|-|-|-|-| +|Zookeeper zookeeper-latency|X|X|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| |Zookeeper service health|X|-|-|-|-| |Zookeeper latency|X|X|-|-|-| diff --git a/modules/smart-agent_zookeeper/conf/01-zookeeper-latency.yaml b/modules/smart-agent_zookeeper/conf/01-zookeeper-latency.yaml new file mode 100644 index 000000000..58ca88276 --- /dev/null +++ b/modules/smart-agent_zookeeper/conf/01-zookeeper-latency.yaml @@ -0,0 +1,22 @@ +module: zookeeper +name: zookeeper-latency +transformation: false +aggregation: true +exclude_not_running_vm: true +disabled: false +signals: + signal: + metric: "gauge.zk_avg_latency" +rules: + critical: + threshold: 300000 + comparator: ">" + description: "is too high" + lasting_duration: "5m" + latency_disabled: "false" + major: + threshold: 250000 + comparator: ">" + description: "is too high" + lasting_duration: "5m" + latency_disabled: "false" \ No newline at end of file diff --git a/modules/smart-agent_zookeeper/detectors-gen.tf b/modules/smart-agent_zookeeper/detectors-gen.tf index 6cabb1a23..810ded82f 100644 --- a/modules/smart-agent_zookeeper/detectors-gen.tf +++ b/modules/smart-agent_zookeeper/detectors-gen.tf @@ -25,3 +25,43 @@ EOF max_delay = var.zookeeper-health_max_delay } +resource "signalfx_detector" "zookeeper-latency" { + name = format("%s %s", local.detector_name_prefix, "Zookeeper zookeeper-latency") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = data('gauge.zk_avg_latency', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.zookeeper-latency_aggregation_function}.publish('signal') + detect(when(signal > ${var.zookeeper-latency_threshold_critical}, lasting=%{if var.zookeeper-latency_lasting_duration_critical == null}None%{else}'${var.zookeeper-latency_lasting_duration_critical}'%{endif}, at_least=${var.zookeeper-latency_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.zookeeper-latency_threshold_major}, lasting=%{if var.zookeeper-latency_lasting_duration_major == null}None%{else}'${var.zookeeper-latency_lasting_duration_major}'%{endif}, at_least=${var.zookeeper-latency_at_least_percentage_major})).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.zookeeper-latency_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.zookeeper-latency_disabled_critical, var.zookeeper-latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.zookeeper-latency_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.zookeeper-latency_runbook_url, var.runbook_url), "") + tip = var.zookeeper-latency_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.zookeeper-latency_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.zookeeper-latency_disabled_major, var.zookeeper-latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.zookeeper-latency_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.zookeeper-latency_runbook_url, var.runbook_url), "") + tip = var.zookeeper-latency_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.zookeeper-latency_max_delay +} + diff --git a/modules/smart-agent_zookeeper/detectors-zookeeper.tf b/modules/smart-agent_zookeeper/detectors-zookeeper.tf index 27d52b5ed..6606a4d03 100644 --- a/modules/smart-agent_zookeeper/detectors-zookeeper.tf +++ b/modules/smart-agent_zookeeper/detectors-zookeeper.tf @@ -53,7 +53,7 @@ EOF max_delay = var.zookeeper_health_max_delay }*/ -resource "signalfx_detector" "zookeeper_latency" { +/*resource "signalfx_detector" "zookeeper_latency" { name = format("%s %s", local.detector_name_prefix, "Zookeeper latency") authorized_writer_teams = var.authorized_writer_teams @@ -91,7 +91,7 @@ EOF } max_delay = var.zookeeper_latency_max_delay -} +}*/ resource "signalfx_detector" "file_descriptors" { name = format("%s %s", local.detector_name_prefix, "Zookeeper file descriptors usage") diff --git a/modules/smart-agent_zookeeper/outputs.tf b/modules/smart-agent_zookeeper/outputs.tf index b445a34f7..c60896def 100644 --- a/modules/smart-agent_zookeeper/outputs.tf +++ b/modules/smart-agent_zookeeper/outputs.tf @@ -13,8 +13,8 @@ output "zookeeper-health" { value = signalfx_detector.zookeeper-health } -output "zookeeper_latency" { - description = "Detector resource for zookeeper_latency" - value = signalfx_detector.zookeeper_latency +output "zookeeper-latency" { + description = "Detector resource for zookeeper-latency" + value = signalfx_detector.zookeeper-latency } diff --git a/modules/smart-agent_zookeeper/variables-gen.tf b/modules/smart-agent_zookeeper/variables-gen.tf index 87e36774b..c479ab84c 100644 --- a/modules/smart-agent_zookeeper/variables-gen.tf +++ b/modules/smart-agent_zookeeper/variables-gen.tf @@ -53,3 +53,87 @@ variable "zookeeper-health_at_least_percentage_critical" { type = number default = 1 } +# zookeeper-latency detector + +variable "zookeeper-latency_notifications" { + description = "Notification recipients list per severity overridden for zookeeper-latency detector" + type = map(list(string)) + default = {} +} + +variable "zookeeper-latency_aggregation_function" { + description = "Aggregation function and group by for zookeeper-latency detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "zookeeper-latency_max_delay" { + description = "Enforce max delay for zookeeper-latency detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "zookeeper-latency_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "zookeeper-latency_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "zookeeper-latency_disabled" { + description = "Disable all alerting rules for zookeeper-latency detector" + type = bool + default = null +} + +variable "zookeeper-latency_disabled_critical" { + description = "Disable critical alerting rule for zookeeper-latency detector" + type = bool + default = null +} + +variable "zookeeper-latency_disabled_major" { + description = "Disable major alerting rule for zookeeper-latency detector" + type = bool + default = null +} + +variable "zookeeper-latency_threshold_critical" { + description = "Critical threshold for zookeeper-latency detector" + type = number + default = 300000 +} + +variable "zookeeper-latency_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "zookeeper-latency_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "zookeeper-latency_threshold_major" { + description = "Major threshold for zookeeper-latency detector" + type = number + default = 250000 +} + +variable "zookeeper-latency_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "zookeeper-latency_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} diff --git a/modules/smart-agent_zookeeper/variables.tf b/modules/smart-agent_zookeeper/variables.tf index b1aeaed8c..57bfdf3ba 100644 --- a/modules/smart-agent_zookeeper/variables.tf +++ b/modules/smart-agent_zookeeper/variables.tf @@ -92,7 +92,7 @@ variable "zookeeper_health_transformation_function" { description = "Transformation function for zookeeper_health detector (i.e. \".mean(over='5m')\")" type = string default = ".mean(over='5m')" -}*/ +} # zookeeper_latency detector @@ -160,7 +160,7 @@ variable "zookeeper_latency_threshold_major" { description = "Major threshold for zookeeper_latency detector" type = number default = 250000 -} +}*/ # file_descriptors detector From df371536e11892a9a24756ee87017b36ab360ea9 Mon Sep 17 00:00:00 2001 From: Quentin GIBERT Date: Thu, 11 Jan 2024 16:56:34 +0100 Subject: [PATCH 08/16] zookeeper: divide single and global latency --- docs/severity.md | 3 +- modules/smart-agent_zookeeper/README.md | 3 +- .../conf/01-zookeeper-latency.yaml | 22 ----- .../smart-agent_zookeeper/detectors-gen.tf | 46 ++++++---- modules/smart-agent_zookeeper/outputs.tf | 15 ++-- .../smart-agent_zookeeper/variables-gen.tf | 84 ++++++++++++------- 6 files changed, 96 insertions(+), 77 deletions(-) delete mode 100644 modules/smart-agent_zookeeper/conf/01-zookeeper-latency.yaml diff --git a/docs/severity.md b/docs/severity.md index 9aa7e7daf..161546479 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -1312,7 +1312,8 @@ |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| |Zookeeper zookeeper-health|X|-|-|-|-| -|Zookeeper zookeeper-latency|X|X|-|-|-| +|Zookeeper cluster-latency|X|-|-|-|-| +|Zookeeper server-latency|-|X|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| |Zookeeper service health|X|-|-|-|-| |Zookeeper latency|X|X|-|-|-| diff --git a/modules/smart-agent_zookeeper/README.md b/modules/smart-agent_zookeeper/README.md index e5f4a2dec..f8933536a 100644 --- a/modules/smart-agent_zookeeper/README.md +++ b/modules/smart-agent_zookeeper/README.md @@ -78,7 +78,8 @@ This module creates the following SignalFx detectors which could contain one or |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| |Zookeeper zookeeper-health|X|-|-|-|-| -|Zookeeper zookeeper-latency|X|X|-|-|-| +|Zookeeper cluster-latency|X|-|-|-|-| +|Zookeeper server-latency|-|X|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| |Zookeeper service health|X|-|-|-|-| |Zookeeper latency|X|X|-|-|-| diff --git a/modules/smart-agent_zookeeper/conf/01-zookeeper-latency.yaml b/modules/smart-agent_zookeeper/conf/01-zookeeper-latency.yaml deleted file mode 100644 index 58ca88276..000000000 --- a/modules/smart-agent_zookeeper/conf/01-zookeeper-latency.yaml +++ /dev/null @@ -1,22 +0,0 @@ -module: zookeeper -name: zookeeper-latency -transformation: false -aggregation: true -exclude_not_running_vm: true -disabled: false -signals: - signal: - metric: "gauge.zk_avg_latency" -rules: - critical: - threshold: 300000 - comparator: ">" - description: "is too high" - lasting_duration: "5m" - latency_disabled: "false" - major: - threshold: 250000 - comparator: ">" - description: "is too high" - lasting_duration: "5m" - latency_disabled: "false" \ No newline at end of file diff --git a/modules/smart-agent_zookeeper/detectors-gen.tf b/modules/smart-agent_zookeeper/detectors-gen.tf index 810ded82f..3d07f5217 100644 --- a/modules/smart-agent_zookeeper/detectors-gen.tf +++ b/modules/smart-agent_zookeeper/detectors-gen.tf @@ -25,43 +25,57 @@ EOF max_delay = var.zookeeper-health_max_delay } -resource "signalfx_detector" "zookeeper-latency" { - name = format("%s %s", local.detector_name_prefix, "Zookeeper zookeeper-latency") +resource "signalfx_detector" "cluster-latency" { + name = format("%s %s", local.detector_name_prefix, "Zookeeper cluster-latency") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('gauge.zk_avg_latency', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.zookeeper-latency_aggregation_function}.publish('signal') - detect(when(signal > ${var.zookeeper-latency_threshold_critical}, lasting=%{if var.zookeeper-latency_lasting_duration_critical == null}None%{else}'${var.zookeeper-latency_lasting_duration_critical}'%{endif}, at_least=${var.zookeeper-latency_at_least_percentage_critical})).publish('CRIT') - detect(when(signal > ${var.zookeeper-latency_threshold_major}, lasting=%{if var.zookeeper-latency_lasting_duration_major == null}None%{else}'${var.zookeeper-latency_lasting_duration_major}'%{endif}, at_least=${var.zookeeper-latency_at_least_percentage_major})).publish('MAJOR') + signal = data('gauge.zk_avg_latency', filter=${module.filtering.signalflow})${var.cluster-latency_aggregation_function}.publish('signal') + detect(when(signal > ${var.cluster-latency_threshold_critical}, lasting=%{if var.cluster-latency_lasting_duration_critical == null}None%{else}'${var.cluster-latency_lasting_duration_critical}'%{endif}, at_least=${var.cluster-latency_at_least_percentage_critical})).publish('CRIT') EOF rule { - description = "is too high > ${var.zookeeper-latency_threshold_critical}" + description = "Zookeeper global latency is too high > ${var.cluster-latency_threshold_critical}" severity = "Critical" detect_label = "CRIT" - disabled = coalesce(var.zookeeper-latency_disabled_critical, var.zookeeper-latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.zookeeper-latency_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.zookeeper-latency_runbook_url, var.runbook_url), "") - tip = var.zookeeper-latency_tip + disabled = coalesce(var.cluster-latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cluster-latency_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.cluster-latency_runbook_url, var.runbook_url), "") + tip = var.cluster-latency_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } + max_delay = var.cluster-latency_max_delay +} + +resource "signalfx_detector" "server-latency" { + name = format("%s %s", local.detector_name_prefix, "Zookeeper server-latency") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = data('gauge.zk_avg_latency', filter=${module.filtering.signalflow}).publish('signal') + detect(when(signal > ${var.server-latency_threshold_major}, lasting=%{if var.server-latency_lasting_duration_major == null}None%{else}'${var.server-latency_lasting_duration_major}'%{endif}, at_least=${var.server-latency_at_least_percentage_major})).publish('MAJOR') +EOF + rule { - description = "is too high > ${var.zookeeper-latency_threshold_major}" + description = "Zookeeper latency is too high > ${var.server-latency_threshold_major}" severity = "Major" detect_label = "MAJOR" - disabled = coalesce(var.zookeeper-latency_disabled_major, var.zookeeper-latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.zookeeper-latency_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.zookeeper-latency_runbook_url, var.runbook_url), "") - tip = var.zookeeper-latency_tip + disabled = coalesce(var.server-latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.server-latency_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.server-latency_runbook_url, var.runbook_url), "") + tip = var.server-latency_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } - max_delay = var.zookeeper-latency_max_delay + max_delay = var.server-latency_max_delay } diff --git a/modules/smart-agent_zookeeper/outputs.tf b/modules/smart-agent_zookeeper/outputs.tf index c60896def..d3cbd16e7 100644 --- a/modules/smart-agent_zookeeper/outputs.tf +++ b/modules/smart-agent_zookeeper/outputs.tf @@ -1,3 +1,8 @@ +output "cluster-latency" { + description = "Detector resource for cluster-latency" + value = signalfx_detector.cluster-latency +} + output "file_descriptors" { description = "Detector resource for file_descriptors" value = signalfx_detector.file_descriptors @@ -8,13 +13,13 @@ output "heartbeat" { value = signalfx_detector.heartbeat } +output "server-latency" { + description = "Detector resource for server-latency" + value = signalfx_detector.server-latency +} + output "zookeeper-health" { description = "Detector resource for zookeeper-health" value = signalfx_detector.zookeeper-health } -output "zookeeper-latency" { - description = "Detector resource for zookeeper-latency" - value = signalfx_detector.zookeeper-latency -} - diff --git a/modules/smart-agent_zookeeper/variables-gen.tf b/modules/smart-agent_zookeeper/variables-gen.tf index c479ab84c..37ad6dca6 100644 --- a/modules/smart-agent_zookeeper/variables-gen.tf +++ b/modules/smart-agent_zookeeper/variables-gen.tf @@ -53,86 +53,106 @@ variable "zookeeper-health_at_least_percentage_critical" { type = number default = 1 } -# zookeeper-latency detector +# cluster-latency detector -variable "zookeeper-latency_notifications" { - description = "Notification recipients list per severity overridden for zookeeper-latency detector" +variable "cluster-latency_notifications" { + description = "Notification recipients list per severity overridden for cluster-latency detector" type = map(list(string)) default = {} } -variable "zookeeper-latency_aggregation_function" { - description = "Aggregation function and group by for zookeeper-latency detector (i.e. \".mean(by=['host'])\")" +variable "cluster-latency_aggregation_function" { + description = "Aggregation function and group by for cluster-latency detector (i.e. \".mean(by=['host'])\")" type = string - default = "" + default = ".mean(by='env')" } -variable "zookeeper-latency_max_delay" { - description = "Enforce max delay for zookeeper-latency detector (use \"0\" or \"null\" for \"Auto\")" +variable "cluster-latency_max_delay" { + description = "Enforce max delay for cluster-latency detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "zookeeper-latency_tip" { +variable "cluster-latency_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "zookeeper-latency_runbook_url" { +variable "cluster-latency_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "zookeeper-latency_disabled" { - description = "Disable all alerting rules for zookeeper-latency detector" - type = bool - default = null -} - -variable "zookeeper-latency_disabled_critical" { - description = "Disable critical alerting rule for zookeeper-latency detector" - type = bool - default = null -} - -variable "zookeeper-latency_disabled_major" { - description = "Disable major alerting rule for zookeeper-latency detector" +variable "cluster-latency_disabled" { + description = "Disable all alerting rules for cluster-latency detector" type = bool default = null } -variable "zookeeper-latency_threshold_critical" { - description = "Critical threshold for zookeeper-latency detector" +variable "cluster-latency_threshold_critical" { + description = "Critical threshold for cluster-latency detector" type = number default = 300000 } -variable "zookeeper-latency_lasting_duration_critical" { +variable "cluster-latency_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "5m" } -variable "zookeeper-latency_at_least_percentage_critical" { +variable "cluster-latency_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -variable "zookeeper-latency_threshold_major" { - description = "Major threshold for zookeeper-latency detector" +# server-latency detector + +variable "server-latency_notifications" { + description = "Notification recipients list per severity overridden for server-latency detector" + type = map(list(string)) + default = {} +} + +variable "server-latency_max_delay" { + description = "Enforce max delay for server-latency detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "server-latency_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "server-latency_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "server-latency_disabled" { + description = "Disable all alerting rules for server-latency detector" + type = bool + default = null +} + +variable "server-latency_threshold_major" { + description = "Major threshold for server-latency detector" type = number default = 250000 } -variable "zookeeper-latency_lasting_duration_major" { +variable "server-latency_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "5m" } -variable "zookeeper-latency_at_least_percentage_major" { +variable "server-latency_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 From ae5b8ee6a428a6be493f6711315548b827ccf21d Mon Sep 17 00:00:00 2001 From: Quentin GIBERT Date: Thu, 11 Jan 2024 16:57:50 +0100 Subject: [PATCH 09/16] zookeeper: divide single and global latency --- .../conf/01-zookeeper-cluster-latency.yaml | 17 +++++++++++++++++ .../conf/02-zookeeper-server-latency.yaml | 18 ++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml create mode 100644 modules/smart-agent_zookeeper/conf/02-zookeeper-server-latency.yaml diff --git a/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml b/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml new file mode 100644 index 000000000..14350a0e4 --- /dev/null +++ b/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml @@ -0,0 +1,17 @@ +module: zookeeper +name: cluster-latency +transformation: false +aggregation: ".mean(by='env')" +disabled: false + +signals: + signal: + metric: "gauge.zk_avg_latency" + +rules: + critical: + threshold: 300000 + comparator: ">" + description: "Zookeeper global latency is too high" + lasting_duration: "5m" + latency_disabled: "false" \ No newline at end of file diff --git a/modules/smart-agent_zookeeper/conf/02-zookeeper-server-latency.yaml b/modules/smart-agent_zookeeper/conf/02-zookeeper-server-latency.yaml new file mode 100644 index 000000000..9f3619a23 --- /dev/null +++ b/modules/smart-agent_zookeeper/conf/02-zookeeper-server-latency.yaml @@ -0,0 +1,18 @@ + +module: zookeeper +name: server-latency +transformation: false +aggregation: false +disabled: false + +signals: + signal: + metric: "gauge.zk_avg_latency" + +rules: + major: + threshold: 250000 + comparator: ">" + description: "Zookeeper latency is too high" + lasting_duration: "5m" + latency_disabled: "false" \ No newline at end of file From 75e835ac5cec6ae1f479a237fd8923ac58817f27 Mon Sep 17 00:00:00 2001 From: Quentin GIBERT Date: Thu, 11 Jan 2024 16:58:32 +0100 Subject: [PATCH 10/16] cleanup --- .../detectors-zookeeper.tf | 67 ------------------- 1 file changed, 67 deletions(-) diff --git a/modules/smart-agent_zookeeper/detectors-zookeeper.tf b/modules/smart-agent_zookeeper/detectors-zookeeper.tf index 6606a4d03..ceb386b3c 100644 --- a/modules/smart-agent_zookeeper/detectors-zookeeper.tf +++ b/modules/smart-agent_zookeeper/detectors-zookeeper.tf @@ -26,73 +26,6 @@ EOF max_delay = var.heartbeat_max_delay } -/*resource "signalfx_detector" "zookeeper_health" { - name = format("%s %s", local.detector_name_prefix, "Zookeeper service health") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - signal = data('gauge.zk_service_health', filter=filter('plugin', 'zookeeper') and ${module.filtering.signalflow})${var.zookeeper_health_aggregation_function}${var.zookeeper_health_transformation_function}.publish('signal') - detect(when(signal != 1)).publish('CRIT') -EOF - - rule { - description = "is not running" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.zookeeper_health_disabled_critical, var.zookeeper_health_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.zookeeper_health_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.zookeeper_health_runbook_url, var.runbook_url), "") - tip = var.zookeeper_health_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.zookeeper_health_max_delay -}*/ - -/*resource "signalfx_detector" "zookeeper_latency" { - name = format("%s %s", local.detector_name_prefix, "Zookeeper latency") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - signal = data('gauge.zk_avg_latency', filter=filter('plugin', 'zookeeper') and ${module.filtering.signalflow})${var.zookeeper_latency_aggregation_function}${var.zookeeper_latency_transformation_function}.publish('signal') - detect(when(signal > ${var.zookeeper_latency_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.zookeeper_latency_threshold_major}) and (not when(signal > ${var.zookeeper_latency_threshold_critical}))).publish('MAJOR') -EOF - - rule { - description = "is too high > ${var.zookeeper_latency_threshold_critical}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.zookeeper_latency_disabled_critical, var.zookeeper_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.zookeeper_latency_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.zookeeper_latency_runbook_url, var.runbook_url), "") - tip = var.zookeeper_latency_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.zookeeper_latency_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.zookeeper_latency_disabled_major, var.zookeeper_latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.zookeeper_latency_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.zookeeper_latency_runbook_url, var.runbook_url), "") - tip = var.zookeeper_latency_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.zookeeper_latency_max_delay -}*/ - resource "signalfx_detector" "file_descriptors" { name = format("%s %s", local.detector_name_prefix, "Zookeeper file descriptors usage") From f1b41ec0379454b9dcc2c5dc20daecc6b7f92359 Mon Sep 17 00:00:00 2001 From: Quentin GIBERT Date: Thu, 11 Jan 2024 17:27:58 +0100 Subject: [PATCH 11/16] zookeeper latency: fix aggregation --- docs/severity.md | 2 -- modules/smart-agent_zookeeper/README.md | 2 -- .../conf/01-zookeeper-cluster-latency.yaml | 2 +- modules/smart-agent_zookeeper/variables-gen.tf | 2 +- 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/docs/severity.md b/docs/severity.md index 161546479..43c4f4efb 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -1315,8 +1315,6 @@ |Zookeeper cluster-latency|X|-|-|-|-| |Zookeeper server-latency|-|X|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| -|Zookeeper service health|X|-|-|-|-| -|Zookeeper latency|X|X|-|-|-| |Zookeeper file descriptors usage|X|X|-|-|-| diff --git a/modules/smart-agent_zookeeper/README.md b/modules/smart-agent_zookeeper/README.md index f8933536a..487fcbf15 100644 --- a/modules/smart-agent_zookeeper/README.md +++ b/modules/smart-agent_zookeeper/README.md @@ -81,8 +81,6 @@ This module creates the following SignalFx detectors which could contain one or |Zookeeper cluster-latency|X|-|-|-|-| |Zookeeper server-latency|-|X|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| -|Zookeeper service health|X|-|-|-|-| -|Zookeeper latency|X|X|-|-|-| |Zookeeper file descriptors usage|X|X|-|-|-| ## How to collect required metrics? diff --git a/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml b/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml index 14350a0e4..148c0e638 100644 --- a/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml +++ b/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml @@ -1,7 +1,7 @@ module: zookeeper name: cluster-latency transformation: false -aggregation: ".mean(by='env')" +aggregation: ".mean(by=['env', 'kubernetes_cluster'])" disabled: false signals: diff --git a/modules/smart-agent_zookeeper/variables-gen.tf b/modules/smart-agent_zookeeper/variables-gen.tf index 37ad6dca6..b3f574d5d 100644 --- a/modules/smart-agent_zookeeper/variables-gen.tf +++ b/modules/smart-agent_zookeeper/variables-gen.tf @@ -64,7 +64,7 @@ variable "cluster-latency_notifications" { variable "cluster-latency_aggregation_function" { description = "Aggregation function and group by for cluster-latency detector (i.e. \".mean(by=['host'])\")" type = string - default = ".mean(by='env')" + default = ".mean(by=['env', 'kubernetes_cluster'])" } variable "cluster-latency_max_delay" { From 66b04fd907ef0aeb7e0c10a1ffe92a932dfe0631 Mon Sep 17 00:00:00 2001 From: Soufiane Date: Fri, 12 Jan 2024 10:18:14 +0100 Subject: [PATCH 12/16] cleanup zookeeper config --- docs/severity.md | 2 +- modules/smart-agent_zookeeper/README.md | 2 +- .../conf/00-zookeeper-health.yaml | 5 +- .../conf/01-zookeeper-cluster-latency.yaml | 3 - .../conf/02-zookeeper-server-latency.yaml | 16 +-- .../smart-agent_zookeeper/detectors-gen.tf | 26 ++-- modules/smart-agent_zookeeper/outputs.tf | 10 +- .../smart-agent_zookeeper/variables-gen.tf | 48 ++++--- modules/smart-agent_zookeeper/variables.tf | 118 ------------------ 9 files changed, 60 insertions(+), 170 deletions(-) diff --git a/docs/severity.md b/docs/severity.md index 43c4f4efb..d6245f94f 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -1311,7 +1311,7 @@ |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| -|Zookeeper zookeeper-health|X|-|-|-|-| +|Zookeeper health|X|-|-|-|-| |Zookeeper cluster-latency|X|-|-|-|-| |Zookeeper server-latency|-|X|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| diff --git a/modules/smart-agent_zookeeper/README.md b/modules/smart-agent_zookeeper/README.md index 487fcbf15..bc66741e2 100644 --- a/modules/smart-agent_zookeeper/README.md +++ b/modules/smart-agent_zookeeper/README.md @@ -77,7 +77,7 @@ This module creates the following SignalFx detectors which could contain one or |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| -|Zookeeper zookeeper-health|X|-|-|-|-| +|Zookeeper health|X|-|-|-|-| |Zookeeper cluster-latency|X|-|-|-|-| |Zookeeper server-latency|-|X|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| diff --git a/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml b/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml index 320b98dfa..8eb5ee4a2 100644 --- a/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml +++ b/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml @@ -1,8 +1,5 @@ module: zookeeper -name: zookeeper-health -transformation: false -aggregation: true -exclude_not_running_vm: true +name: health disabled: false signals: signal: diff --git a/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml b/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml index 148c0e638..94d1ef1fb 100644 --- a/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml +++ b/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml @@ -1,13 +1,10 @@ module: zookeeper name: cluster-latency -transformation: false aggregation: ".mean(by=['env', 'kubernetes_cluster'])" disabled: false - signals: signal: metric: "gauge.zk_avg_latency" - rules: critical: threshold: 300000 diff --git a/modules/smart-agent_zookeeper/conf/02-zookeeper-server-latency.yaml b/modules/smart-agent_zookeeper/conf/02-zookeeper-server-latency.yaml index 9f3619a23..532babeb9 100644 --- a/modules/smart-agent_zookeeper/conf/02-zookeeper-server-latency.yaml +++ b/modules/smart-agent_zookeeper/conf/02-zookeeper-server-latency.yaml @@ -1,18 +1,14 @@ - module: zookeeper name: server-latency -transformation: false aggregation: false disabled: false - signals: signal: metric: "gauge.zk_avg_latency" - rules: - major: - threshold: 250000 - comparator: ">" - description: "Zookeeper latency is too high" - lasting_duration: "5m" - latency_disabled: "false" \ No newline at end of file + major: + threshold: 250000 + comparator: ">" + description: "Zookeeper server latency is too high" + lasting_duration: "5m" + latency_disabled: "false" \ No newline at end of file diff --git a/modules/smart-agent_zookeeper/detectors-gen.tf b/modules/smart-agent_zookeeper/detectors-gen.tf index 3d07f5217..a13f7f366 100644 --- a/modules/smart-agent_zookeeper/detectors-gen.tf +++ b/modules/smart-agent_zookeeper/detectors-gen.tf @@ -1,28 +1,28 @@ -resource "signalfx_detector" "zookeeper-health" { - name = format("%s %s", local.detector_name_prefix, "Zookeeper zookeeper-health") +resource "signalfx_detector" "health" { + name = format("%s %s", local.detector_name_prefix, "Zookeeper health") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('gauge.zk_service_health', filter=${local.not_running_vm_filters} and ${module.filtering.signalflow})${var.zookeeper-health_aggregation_function}.publish('signal') - detect(when(signal != ${var.zookeeper-health_threshold_critical}, lasting=%{if var.zookeeper-health_lasting_duration_critical == null}None%{else}'${var.zookeeper-health_lasting_duration_critical}'%{endif}, at_least=${var.zookeeper-health_at_least_percentage_critical})).publish('CRIT') + signal = data('gauge.zk_service_health', filter=${module.filtering.signalflow})${var.health_aggregation_function}${var.health_transformation_function}.publish('signal') + detect(when(signal != ${var.health_threshold_critical}, lasting=%{if var.health_lasting_duration_critical == null}None%{else}'${var.health_lasting_duration_critical}'%{endif}, at_least=${var.health_at_least_percentage_critical})).publish('CRIT') EOF rule { - description = "is not running != ${var.zookeeper-health_threshold_critical}" + description = "is not running != ${var.health_threshold_critical}" severity = "Critical" detect_label = "CRIT" - disabled = coalesce(var.zookeeper-health_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.zookeeper-health_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.zookeeper-health_runbook_url, var.runbook_url), "") - tip = var.zookeeper-health_tip + disabled = coalesce(var.health_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.health_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.health_runbook_url, var.runbook_url), "") + tip = var.health_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } - max_delay = var.zookeeper-health_max_delay + max_delay = var.health_max_delay } resource "signalfx_detector" "cluster-latency" { @@ -33,7 +33,7 @@ resource "signalfx_detector" "cluster-latency" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('gauge.zk_avg_latency', filter=${module.filtering.signalflow})${var.cluster-latency_aggregation_function}.publish('signal') + signal = data('gauge.zk_avg_latency', filter=${module.filtering.signalflow})${var.cluster-latency_aggregation_function}${var.cluster-latency_transformation_function}.publish('signal') detect(when(signal > ${var.cluster-latency_threshold_critical}, lasting=%{if var.cluster-latency_lasting_duration_critical == null}None%{else}'${var.cluster-latency_lasting_duration_critical}'%{endif}, at_least=${var.cluster-latency_at_least_percentage_critical})).publish('CRIT') EOF @@ -60,12 +60,12 @@ resource "signalfx_detector" "server-latency" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('gauge.zk_avg_latency', filter=${module.filtering.signalflow}).publish('signal') + signal = data('gauge.zk_avg_latency', filter=${module.filtering.signalflow})${var.server-latency_transformation_function}.publish('signal') detect(when(signal > ${var.server-latency_threshold_major}, lasting=%{if var.server-latency_lasting_duration_major == null}None%{else}'${var.server-latency_lasting_duration_major}'%{endif}, at_least=${var.server-latency_at_least_percentage_major})).publish('MAJOR') EOF rule { - description = "Zookeeper latency is too high > ${var.server-latency_threshold_major}" + description = "Zookeeper server latency is too high > ${var.server-latency_threshold_major}" severity = "Major" detect_label = "MAJOR" disabled = coalesce(var.server-latency_disabled, var.detectors_disabled) diff --git a/modules/smart-agent_zookeeper/outputs.tf b/modules/smart-agent_zookeeper/outputs.tf index d3cbd16e7..f65f069d4 100644 --- a/modules/smart-agent_zookeeper/outputs.tf +++ b/modules/smart-agent_zookeeper/outputs.tf @@ -8,6 +8,11 @@ output "file_descriptors" { value = signalfx_detector.file_descriptors } +output "health" { + description = "Detector resource for health" + value = signalfx_detector.health +} + output "heartbeat" { description = "Detector resource for heartbeat" value = signalfx_detector.heartbeat @@ -18,8 +23,3 @@ output "server-latency" { value = signalfx_detector.server-latency } -output "zookeeper-health" { - description = "Detector resource for zookeeper-health" - value = signalfx_detector.zookeeper-health -} - diff --git a/modules/smart-agent_zookeeper/variables-gen.tf b/modules/smart-agent_zookeeper/variables-gen.tf index b3f574d5d..a8b9fcc19 100644 --- a/modules/smart-agent_zookeeper/variables-gen.tf +++ b/modules/smart-agent_zookeeper/variables-gen.tf @@ -1,54 +1,60 @@ -# zookeeper-health detector +# health detector -variable "zookeeper-health_notifications" { - description = "Notification recipients list per severity overridden for zookeeper-health detector" +variable "health_notifications" { + description = "Notification recipients list per severity overridden for health detector" type = map(list(string)) default = {} } -variable "zookeeper-health_aggregation_function" { - description = "Aggregation function and group by for zookeeper-health detector (i.e. \".mean(by=['host'])\")" +variable "health_aggregation_function" { + description = "Aggregation function and group by for health detector (i.e. \".mean(by=['host'])\")" type = string default = "" } -variable "zookeeper-health_max_delay" { - description = "Enforce max delay for zookeeper-health detector (use \"0\" or \"null\" for \"Auto\")" +variable "health_transformation_function" { + description = "Transformation function for health detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} + +variable "health_max_delay" { + description = "Enforce max delay for health detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "zookeeper-health_tip" { +variable "health_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "zookeeper-health_runbook_url" { +variable "health_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "zookeeper-health_disabled" { - description = "Disable all alerting rules for zookeeper-health detector" +variable "health_disabled" { + description = "Disable all alerting rules for health detector" type = bool default = null } -variable "zookeeper-health_threshold_critical" { - description = "Critical threshold for zookeeper-health detector" +variable "health_threshold_critical" { + description = "Critical threshold for health detector" type = number default = 1 } -variable "zookeeper-health_lasting_duration_critical" { +variable "health_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "5m" } -variable "zookeeper-health_at_least_percentage_critical" { +variable "health_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 @@ -67,6 +73,12 @@ variable "cluster-latency_aggregation_function" { default = ".mean(by=['env', 'kubernetes_cluster'])" } +variable "cluster-latency_transformation_function" { + description = "Transformation function for cluster-latency detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} + variable "cluster-latency_max_delay" { description = "Enforce max delay for cluster-latency detector (use \"0\" or \"null\" for \"Auto\")" type = number @@ -116,6 +128,12 @@ variable "server-latency_notifications" { default = {} } +variable "server-latency_transformation_function" { + description = "Transformation function for server-latency detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} + variable "server-latency_max_delay" { description = "Enforce max delay for server-latency detector (use \"0\" or \"null\" for \"Auto\")" type = number diff --git a/modules/smart-agent_zookeeper/variables.tf b/modules/smart-agent_zookeeper/variables.tf index 57bfdf3ba..b2ee2670e 100644 --- a/modules/smart-agent_zookeeper/variables.tf +++ b/modules/smart-agent_zookeeper/variables.tf @@ -44,124 +44,6 @@ variable "heartbeat_aggregation_function" { default = "" } -/*# zookeeper_health detector - -variable "zookeeper_health_max_delay" { - description = "Enforce max delay for zookeeper_health detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "zookeeper_health_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "zookeeper_health_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "zookeeper_health_disabled" { - description = "Disable all alerting rules for zookeeper_health detector" - type = bool - default = null -} - -variable "zookeeper_health_disabled_critical" { - description = "Disable critical alerting rule for zookeeper_health detector" - type = bool - default = null -} - -variable "zookeeper_health_notifications" { - description = "Notification recipients list per severity overridden for zookeeper_health detector" - type = map(list(string)) - default = {} -} - -variable "zookeeper_health_aggregation_function" { - description = "Aggregation function and group by for zookeeper_health detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "zookeeper_health_transformation_function" { - description = "Transformation function for zookeeper_health detector (i.e. \".mean(over='5m')\")" - type = string - default = ".mean(over='5m')" -} - -# zookeeper_latency detector - -variable "zookeeper_latency_max_delay" { - description = "Enforce max delay for zookeeper_latency detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "zookeeper_latency_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "zookeeper_latency_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "zookeeper_latency_disabled" { - description = "Disable all alerting rules for zookeeper_latency detector" - type = bool - default = null -} - -variable "zookeeper_latency_disabled_critical" { - description = "Disable critical alerting rule for zookeeper_latency detector" - type = bool - default = null -} - -variable "zookeeper_latency_disabled_major" { - description = "Disable major alerting rule for zookeeper_latency detector" - type = bool - default = null -} - -variable "zookeeper_latency_notifications" { - description = "Notification recipients list per severity overridden for zookeeper_latency detector" - type = map(list(string)) - default = {} -} - -variable "zookeeper_latency_aggregation_function" { - description = "Aggregation function and group by for zookeeper_latency detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "zookeeper_latency_transformation_function" { - description = "Transformation function for zookeeper_latency detector (i.e. \".mean(over='5m')\")" - type = string - default = ".mean(over='5m')" -} - -variable "zookeeper_latency_threshold_critical" { - description = "Critical threshold for zookeeper_latency detector" - type = number - default = 300000 -} - -variable "zookeeper_latency_threshold_major" { - description = "Major threshold for zookeeper_latency detector" - type = number - default = 250000 -}*/ - # file_descriptors detector variable "file_descriptors_max_delay" { From 797bfb966a940fbe3cd64b9399b373e7b1f960c7 Mon Sep 17 00:00:00 2001 From: Soufiane Date: Fri, 12 Jan 2024 15:59:58 +0100 Subject: [PATCH 13/16] split health detector --- docs/severity.md | 5 +- modules/smart-agent_zookeeper/README.md | 5 +- ...h.yaml => 00-zookeeper-server-health.yaml} | 6 +- .../conf/01-zookeeper-cluster-health.yaml | 14 ++ ...yaml => 03-zookeeper-cluster-latency.yaml} | 4 +- .../smart-agent_zookeeper/detectors-gen.tf | 71 +++++++--- modules/smart-agent_zookeeper/outputs.tf | 15 +- .../smart-agent_zookeeper/variables-gen.tf | 133 +++++++++++++----- 8 files changed, 181 insertions(+), 72 deletions(-) rename modules/smart-agent_zookeeper/conf/{00-zookeeper-health.yaml => 00-zookeeper-server-health.yaml} (70%) create mode 100644 modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-health.yaml rename modules/smart-agent_zookeeper/conf/{01-zookeeper-cluster-latency.yaml => 03-zookeeper-cluster-latency.yaml} (67%) diff --git a/docs/severity.md b/docs/severity.md index d6245f94f..4f3433460 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -1311,9 +1311,10 @@ |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| -|Zookeeper health|X|-|-|-|-| -|Zookeeper cluster-latency|X|-|-|-|-| +|Zookeeper server-health|-|X|-|-|-| +|Zookeeper cluster-health|X|-|-|-|-| |Zookeeper server-latency|-|X|-|-|-| +|Zookeeper cluster-latency|X|-|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| |Zookeeper file descriptors usage|X|X|-|-|-| diff --git a/modules/smart-agent_zookeeper/README.md b/modules/smart-agent_zookeeper/README.md index bc66741e2..3157a15cf 100644 --- a/modules/smart-agent_zookeeper/README.md +++ b/modules/smart-agent_zookeeper/README.md @@ -77,9 +77,10 @@ This module creates the following SignalFx detectors which could contain one or |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| -|Zookeeper health|X|-|-|-|-| -|Zookeeper cluster-latency|X|-|-|-|-| +|Zookeeper server-health|-|X|-|-|-| +|Zookeeper cluster-health|X|-|-|-|-| |Zookeeper server-latency|-|X|-|-|-| +|Zookeeper cluster-latency|X|-|-|-|-| |Zookeeper heartbeat|X|-|-|-|-| |Zookeeper file descriptors usage|X|X|-|-|-| diff --git a/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml b/modules/smart-agent_zookeeper/conf/00-zookeeper-server-health.yaml similarity index 70% rename from modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml rename to modules/smart-agent_zookeeper/conf/00-zookeeper-server-health.yaml index 8eb5ee4a2..4b15a50d9 100644 --- a/modules/smart-agent_zookeeper/conf/00-zookeeper-health.yaml +++ b/modules/smart-agent_zookeeper/conf/00-zookeeper-server-health.yaml @@ -1,13 +1,13 @@ module: zookeeper -name: health +name: server-health disabled: false signals: signal: metric: "gauge.zk_service_health" rules: - critical: + major: threshold: 1 comparator: "!=" - description: "is not running" + description: "Zookeeper server is not running" lasting_duration: "5m" health_disabled: "false" \ No newline at end of file diff --git a/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-health.yaml b/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-health.yaml new file mode 100644 index 000000000..49c4d6bb7 --- /dev/null +++ b/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-health.yaml @@ -0,0 +1,14 @@ +module: zookeeper +name: cluster-health +disabled: false +aggregation: ".mean(by=['kubernetes_cluster'])" +signals: + signal: + metric: "gauge.zk_service_health" +rules: + critical: + threshold: 1 + comparator: "!=" + description: "Zookeeper cluster is not running" + lasting_duration: "5m" + health_disabled: "false" \ No newline at end of file diff --git a/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml b/modules/smart-agent_zookeeper/conf/03-zookeeper-cluster-latency.yaml similarity index 67% rename from modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml rename to modules/smart-agent_zookeeper/conf/03-zookeeper-cluster-latency.yaml index 94d1ef1fb..98730cff4 100644 --- a/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-latency.yaml +++ b/modules/smart-agent_zookeeper/conf/03-zookeeper-cluster-latency.yaml @@ -1,6 +1,6 @@ module: zookeeper name: cluster-latency -aggregation: ".mean(by=['env', 'kubernetes_cluster'])" +aggregation: ".mean(by=['kubernetes_cluster'])" disabled: false signals: signal: @@ -9,6 +9,6 @@ rules: critical: threshold: 300000 comparator: ">" - description: "Zookeeper global latency is too high" + description: "Zookeeper cluster latency is too high" lasting_duration: "5m" latency_disabled: "false" \ No newline at end of file diff --git a/modules/smart-agent_zookeeper/detectors-gen.tf b/modules/smart-agent_zookeeper/detectors-gen.tf index a13f7f366..e15a54cb8 100644 --- a/modules/smart-agent_zookeeper/detectors-gen.tf +++ b/modules/smart-agent_zookeeper/detectors-gen.tf @@ -1,55 +1,55 @@ -resource "signalfx_detector" "health" { - name = format("%s %s", local.detector_name_prefix, "Zookeeper health") +resource "signalfx_detector" "server-health" { + name = format("%s %s", local.detector_name_prefix, "Zookeeper server-health") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('gauge.zk_service_health', filter=${module.filtering.signalflow})${var.health_aggregation_function}${var.health_transformation_function}.publish('signal') - detect(when(signal != ${var.health_threshold_critical}, lasting=%{if var.health_lasting_duration_critical == null}None%{else}'${var.health_lasting_duration_critical}'%{endif}, at_least=${var.health_at_least_percentage_critical})).publish('CRIT') + signal = data('gauge.zk_service_health', filter=${module.filtering.signalflow})${var.server-health_aggregation_function}${var.server-health_transformation_function}.publish('signal') + detect(when(signal != ${var.server-health_threshold_major}, lasting=%{if var.server-health_lasting_duration_major == null}None%{else}'${var.server-health_lasting_duration_major}'%{endif}, at_least=${var.server-health_at_least_percentage_major})).publish('MAJOR') EOF rule { - description = "is not running != ${var.health_threshold_critical}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.health_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.health_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.health_runbook_url, var.runbook_url), "") - tip = var.health_tip + description = "Zookeeper server is not running != ${var.server-health_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.server-health_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.server-health_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.server-health_runbook_url, var.runbook_url), "") + tip = var.server-health_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } - max_delay = var.health_max_delay + max_delay = var.server-health_max_delay } -resource "signalfx_detector" "cluster-latency" { - name = format("%s %s", local.detector_name_prefix, "Zookeeper cluster-latency") +resource "signalfx_detector" "cluster-health" { + name = format("%s %s", local.detector_name_prefix, "Zookeeper cluster-health") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('gauge.zk_avg_latency', filter=${module.filtering.signalflow})${var.cluster-latency_aggregation_function}${var.cluster-latency_transformation_function}.publish('signal') - detect(when(signal > ${var.cluster-latency_threshold_critical}, lasting=%{if var.cluster-latency_lasting_duration_critical == null}None%{else}'${var.cluster-latency_lasting_duration_critical}'%{endif}, at_least=${var.cluster-latency_at_least_percentage_critical})).publish('CRIT') + signal = data('gauge.zk_service_health', filter=${module.filtering.signalflow})${var.cluster-health_aggregation_function}${var.cluster-health_transformation_function}.publish('signal') + detect(when(signal != ${var.cluster-health_threshold_critical}, lasting=%{if var.cluster-health_lasting_duration_critical == null}None%{else}'${var.cluster-health_lasting_duration_critical}'%{endif}, at_least=${var.cluster-health_at_least_percentage_critical})).publish('CRIT') EOF rule { - description = "Zookeeper global latency is too high > ${var.cluster-latency_threshold_critical}" + description = "Zookeeper cluster is not running != ${var.cluster-health_threshold_critical}" severity = "Critical" detect_label = "CRIT" - disabled = coalesce(var.cluster-latency_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster-latency_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.cluster-latency_runbook_url, var.runbook_url), "") - tip = var.cluster-latency_tip + disabled = coalesce(var.cluster-health_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cluster-health_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.cluster-health_runbook_url, var.runbook_url), "") + tip = var.cluster-health_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject parameterized_body = var.message_body == "" ? local.rule_body : var.message_body } - max_delay = var.cluster-latency_max_delay + max_delay = var.cluster-health_max_delay } resource "signalfx_detector" "server-latency" { @@ -79,3 +79,30 @@ EOF max_delay = var.server-latency_max_delay } +resource "signalfx_detector" "cluster-latency" { + name = format("%s %s", local.detector_name_prefix, "Zookeeper cluster-latency") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = data('gauge.zk_avg_latency', filter=${module.filtering.signalflow})${var.cluster-latency_aggregation_function}${var.cluster-latency_transformation_function}.publish('signal') + detect(when(signal > ${var.cluster-latency_threshold_critical}, lasting=%{if var.cluster-latency_lasting_duration_critical == null}None%{else}'${var.cluster-latency_lasting_duration_critical}'%{endif}, at_least=${var.cluster-latency_at_least_percentage_critical})).publish('CRIT') +EOF + + rule { + description = "Zookeeper cluster latency is too high > ${var.cluster-latency_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.cluster-latency_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cluster-latency_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.cluster-latency_runbook_url, var.runbook_url), "") + tip = var.cluster-latency_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.cluster-latency_max_delay +} + diff --git a/modules/smart-agent_zookeeper/outputs.tf b/modules/smart-agent_zookeeper/outputs.tf index f65f069d4..a4b5fadd7 100644 --- a/modules/smart-agent_zookeeper/outputs.tf +++ b/modules/smart-agent_zookeeper/outputs.tf @@ -1,3 +1,8 @@ +output "cluster-health" { + description = "Detector resource for cluster-health" + value = signalfx_detector.cluster-health +} + output "cluster-latency" { description = "Detector resource for cluster-latency" value = signalfx_detector.cluster-latency @@ -8,16 +13,16 @@ output "file_descriptors" { value = signalfx_detector.file_descriptors } -output "health" { - description = "Detector resource for health" - value = signalfx_detector.health -} - output "heartbeat" { description = "Detector resource for heartbeat" value = signalfx_detector.heartbeat } +output "server-health" { + description = "Detector resource for server-health" + value = signalfx_detector.server-health +} + output "server-latency" { description = "Detector resource for server-latency" value = signalfx_detector.server-latency diff --git a/modules/smart-agent_zookeeper/variables-gen.tf b/modules/smart-agent_zookeeper/variables-gen.tf index a8b9fcc19..178e2706d 100644 --- a/modules/smart-agent_zookeeper/variables-gen.tf +++ b/modules/smart-agent_zookeeper/variables-gen.tf @@ -1,121 +1,121 @@ -# health detector +# server-health detector -variable "health_notifications" { - description = "Notification recipients list per severity overridden for health detector" +variable "server-health_notifications" { + description = "Notification recipients list per severity overridden for server-health detector" type = map(list(string)) default = {} } -variable "health_aggregation_function" { - description = "Aggregation function and group by for health detector (i.e. \".mean(by=['host'])\")" +variable "server-health_aggregation_function" { + description = "Aggregation function and group by for server-health detector (i.e. \".mean(by=['host'])\")" type = string default = "" } -variable "health_transformation_function" { - description = "Transformation function for health detector (i.e. \".mean(over='5m')\")" +variable "server-health_transformation_function" { + description = "Transformation function for server-health detector (i.e. \".mean(over='5m')\")" type = string default = "" } -variable "health_max_delay" { - description = "Enforce max delay for health detector (use \"0\" or \"null\" for \"Auto\")" +variable "server-health_max_delay" { + description = "Enforce max delay for server-health detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "health_tip" { +variable "server-health_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "health_runbook_url" { +variable "server-health_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "health_disabled" { - description = "Disable all alerting rules for health detector" +variable "server-health_disabled" { + description = "Disable all alerting rules for server-health detector" type = bool default = null } -variable "health_threshold_critical" { - description = "Critical threshold for health detector" +variable "server-health_threshold_major" { + description = "Major threshold for server-health detector" type = number default = 1 } -variable "health_lasting_duration_critical" { +variable "server-health_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "5m" } -variable "health_at_least_percentage_critical" { +variable "server-health_at_least_percentage_major" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 } -# cluster-latency detector +# cluster-health detector -variable "cluster-latency_notifications" { - description = "Notification recipients list per severity overridden for cluster-latency detector" +variable "cluster-health_notifications" { + description = "Notification recipients list per severity overridden for cluster-health detector" type = map(list(string)) default = {} } -variable "cluster-latency_aggregation_function" { - description = "Aggregation function and group by for cluster-latency detector (i.e. \".mean(by=['host'])\")" +variable "cluster-health_aggregation_function" { + description = "Aggregation function and group by for cluster-health detector (i.e. \".mean(by=['host'])\")" type = string - default = ".mean(by=['env', 'kubernetes_cluster'])" + default = ".mean(by=['kubernetes_cluster'])" } -variable "cluster-latency_transformation_function" { - description = "Transformation function for cluster-latency detector (i.e. \".mean(over='5m')\")" +variable "cluster-health_transformation_function" { + description = "Transformation function for cluster-health detector (i.e. \".mean(over='5m')\")" type = string default = "" } -variable "cluster-latency_max_delay" { - description = "Enforce max delay for cluster-latency detector (use \"0\" or \"null\" for \"Auto\")" +variable "cluster-health_max_delay" { + description = "Enforce max delay for cluster-health detector (use \"0\" or \"null\" for \"Auto\")" type = number default = null } -variable "cluster-latency_tip" { +variable "cluster-health_tip" { description = "Suggested first course of action or any note useful for incident handling" type = string default = "" } -variable "cluster-latency_runbook_url" { +variable "cluster-health_runbook_url" { description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" type = string default = "" } -variable "cluster-latency_disabled" { - description = "Disable all alerting rules for cluster-latency detector" +variable "cluster-health_disabled" { + description = "Disable all alerting rules for cluster-health detector" type = bool default = null } -variable "cluster-latency_threshold_critical" { - description = "Critical threshold for cluster-latency detector" +variable "cluster-health_threshold_critical" { + description = "Critical threshold for cluster-health detector" type = number - default = 300000 + default = 1 } -variable "cluster-latency_lasting_duration_critical" { +variable "cluster-health_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "5m" } -variable "cluster-latency_at_least_percentage_critical" { +variable "cluster-health_at_least_percentage_critical" { description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 1 @@ -175,3 +175,64 @@ variable "server-latency_at_least_percentage_major" { type = number default = 1 } +# cluster-latency detector + +variable "cluster-latency_notifications" { + description = "Notification recipients list per severity overridden for cluster-latency detector" + type = map(list(string)) + default = {} +} + +variable "cluster-latency_aggregation_function" { + description = "Aggregation function and group by for cluster-latency detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".mean(by=['kubernetes_cluster'])" +} + +variable "cluster-latency_transformation_function" { + description = "Transformation function for cluster-latency detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} + +variable "cluster-latency_max_delay" { + description = "Enforce max delay for cluster-latency detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "cluster-latency_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "cluster-latency_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "cluster-latency_disabled" { + description = "Disable all alerting rules for cluster-latency detector" + type = bool + default = null +} + +variable "cluster-latency_threshold_critical" { + description = "Critical threshold for cluster-latency detector" + type = number + default = 300000 +} + +variable "cluster-latency_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "cluster-latency_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} From 995a6f801d1e173b1bf07751f4da7c961bf87fd6 Mon Sep 17 00:00:00 2001 From: Soufiane Date: Mon, 15 Jan 2024 11:20:29 +0100 Subject: [PATCH 14/16] no aggregation for server-health --- .../conf/00-zookeeper-server-health.yaml | 1 + modules/smart-agent_zookeeper/detectors-gen.tf | 2 +- modules/smart-agent_zookeeper/variables-gen.tf | 6 ------ 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/modules/smart-agent_zookeeper/conf/00-zookeeper-server-health.yaml b/modules/smart-agent_zookeeper/conf/00-zookeeper-server-health.yaml index 4b15a50d9..9ac1fc5e2 100644 --- a/modules/smart-agent_zookeeper/conf/00-zookeeper-server-health.yaml +++ b/modules/smart-agent_zookeeper/conf/00-zookeeper-server-health.yaml @@ -1,6 +1,7 @@ module: zookeeper name: server-health disabled: false +aggregation: false signals: signal: metric: "gauge.zk_service_health" diff --git a/modules/smart-agent_zookeeper/detectors-gen.tf b/modules/smart-agent_zookeeper/detectors-gen.tf index e15a54cb8..a0f9eab38 100644 --- a/modules/smart-agent_zookeeper/detectors-gen.tf +++ b/modules/smart-agent_zookeeper/detectors-gen.tf @@ -6,7 +6,7 @@ resource "signalfx_detector" "server-health" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('gauge.zk_service_health', filter=${module.filtering.signalflow})${var.server-health_aggregation_function}${var.server-health_transformation_function}.publish('signal') + signal = data('gauge.zk_service_health', filter=${module.filtering.signalflow})${var.server-health_transformation_function}.publish('signal') detect(when(signal != ${var.server-health_threshold_major}, lasting=%{if var.server-health_lasting_duration_major == null}None%{else}'${var.server-health_lasting_duration_major}'%{endif}, at_least=${var.server-health_at_least_percentage_major})).publish('MAJOR') EOF diff --git a/modules/smart-agent_zookeeper/variables-gen.tf b/modules/smart-agent_zookeeper/variables-gen.tf index 178e2706d..26a84eb68 100644 --- a/modules/smart-agent_zookeeper/variables-gen.tf +++ b/modules/smart-agent_zookeeper/variables-gen.tf @@ -6,12 +6,6 @@ variable "server-health_notifications" { default = {} } -variable "server-health_aggregation_function" { - description = "Aggregation function and group by for server-health detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - variable "server-health_transformation_function" { description = "Transformation function for server-health detector (i.e. \".mean(over='5m')\")" type = string From 8808b17a52f8920c32dbfad58d63fabdffe150c6 Mon Sep 17 00:00:00 2001 From: Soufiane Date: Mon, 15 Jan 2024 12:01:14 +0100 Subject: [PATCH 15/16] update cluster-health comparator --- .../conf/01-zookeeper-cluster-health.yaml | 4 ++-- modules/smart-agent_zookeeper/detectors-gen.tf | 4 ++-- modules/smart-agent_zookeeper/variables-gen.tf | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-health.yaml b/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-health.yaml index 49c4d6bb7..1781ab4a1 100644 --- a/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-health.yaml +++ b/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-health.yaml @@ -7,8 +7,8 @@ signals: metric: "gauge.zk_service_health" rules: critical: - threshold: 1 - comparator: "!=" + threshold: 0 + comparator: "==" description: "Zookeeper cluster is not running" lasting_duration: "5m" health_disabled: "false" \ No newline at end of file diff --git a/modules/smart-agent_zookeeper/detectors-gen.tf b/modules/smart-agent_zookeeper/detectors-gen.tf index a0f9eab38..634ac7ad1 100644 --- a/modules/smart-agent_zookeeper/detectors-gen.tf +++ b/modules/smart-agent_zookeeper/detectors-gen.tf @@ -34,11 +34,11 @@ resource "signalfx_detector" "cluster-health" { program_text = <<-EOF signal = data('gauge.zk_service_health', filter=${module.filtering.signalflow})${var.cluster-health_aggregation_function}${var.cluster-health_transformation_function}.publish('signal') - detect(when(signal != ${var.cluster-health_threshold_critical}, lasting=%{if var.cluster-health_lasting_duration_critical == null}None%{else}'${var.cluster-health_lasting_duration_critical}'%{endif}, at_least=${var.cluster-health_at_least_percentage_critical})).publish('CRIT') + detect(when(signal == ${var.cluster-health_threshold_critical}, lasting=%{if var.cluster-health_lasting_duration_critical == null}None%{else}'${var.cluster-health_lasting_duration_critical}'%{endif}, at_least=${var.cluster-health_at_least_percentage_critical})).publish('CRIT') EOF rule { - description = "Zookeeper cluster is not running != ${var.cluster-health_threshold_critical}" + description = "Zookeeper cluster is not running == ${var.cluster-health_threshold_critical}" severity = "Critical" detect_label = "CRIT" disabled = coalesce(var.cluster-health_disabled, var.detectors_disabled) diff --git a/modules/smart-agent_zookeeper/variables-gen.tf b/modules/smart-agent_zookeeper/variables-gen.tf index 26a84eb68..a629d304d 100644 --- a/modules/smart-agent_zookeeper/variables-gen.tf +++ b/modules/smart-agent_zookeeper/variables-gen.tf @@ -100,7 +100,7 @@ variable "cluster-health_disabled" { variable "cluster-health_threshold_critical" { description = "Critical threshold for cluster-health detector" type = number - default = 1 + default = 0 } variable "cluster-health_lasting_duration_critical" { From 7d2b1c147bf8eb12d8fcb637f1d9b506860b1a76 Mon Sep 17 00:00:00 2001 From: Soufiane Date: Wed, 13 Mar 2024 12:16:12 +0100 Subject: [PATCH 16/16] Delete default variables --- .../conf/00-zookeeper-server-health.yaml | 4 +--- .../conf/01-zookeeper-cluster-health.yaml | 4 +--- .../conf/02-zookeeper-server-latency.yaml | 4 +--- .../conf/03-zookeeper-cluster-latency.yaml | 4 +--- 4 files changed, 4 insertions(+), 12 deletions(-) diff --git a/modules/smart-agent_zookeeper/conf/00-zookeeper-server-health.yaml b/modules/smart-agent_zookeeper/conf/00-zookeeper-server-health.yaml index 9ac1fc5e2..d2daca7f5 100644 --- a/modules/smart-agent_zookeeper/conf/00-zookeeper-server-health.yaml +++ b/modules/smart-agent_zookeeper/conf/00-zookeeper-server-health.yaml @@ -1,6 +1,5 @@ module: zookeeper name: server-health -disabled: false aggregation: false signals: signal: @@ -10,5 +9,4 @@ rules: threshold: 1 comparator: "!=" description: "Zookeeper server is not running" - lasting_duration: "5m" - health_disabled: "false" \ No newline at end of file + lasting_duration: "5m" \ No newline at end of file diff --git a/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-health.yaml b/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-health.yaml index 1781ab4a1..9eb576b9e 100644 --- a/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-health.yaml +++ b/modules/smart-agent_zookeeper/conf/01-zookeeper-cluster-health.yaml @@ -1,6 +1,5 @@ module: zookeeper name: cluster-health -disabled: false aggregation: ".mean(by=['kubernetes_cluster'])" signals: signal: @@ -10,5 +9,4 @@ rules: threshold: 0 comparator: "==" description: "Zookeeper cluster is not running" - lasting_duration: "5m" - health_disabled: "false" \ No newline at end of file + lasting_duration: "5m" \ No newline at end of file diff --git a/modules/smart-agent_zookeeper/conf/02-zookeeper-server-latency.yaml b/modules/smart-agent_zookeeper/conf/02-zookeeper-server-latency.yaml index 532babeb9..b729bdc6c 100644 --- a/modules/smart-agent_zookeeper/conf/02-zookeeper-server-latency.yaml +++ b/modules/smart-agent_zookeeper/conf/02-zookeeper-server-latency.yaml @@ -1,7 +1,6 @@ module: zookeeper name: server-latency aggregation: false -disabled: false signals: signal: metric: "gauge.zk_avg_latency" @@ -10,5 +9,4 @@ rules: threshold: 250000 comparator: ">" description: "Zookeeper server latency is too high" - lasting_duration: "5m" - latency_disabled: "false" \ No newline at end of file + lasting_duration: "5m" \ No newline at end of file diff --git a/modules/smart-agent_zookeeper/conf/03-zookeeper-cluster-latency.yaml b/modules/smart-agent_zookeeper/conf/03-zookeeper-cluster-latency.yaml index 98730cff4..e379e62aa 100644 --- a/modules/smart-agent_zookeeper/conf/03-zookeeper-cluster-latency.yaml +++ b/modules/smart-agent_zookeeper/conf/03-zookeeper-cluster-latency.yaml @@ -1,7 +1,6 @@ module: zookeeper name: cluster-latency aggregation: ".mean(by=['kubernetes_cluster'])" -disabled: false signals: signal: metric: "gauge.zk_avg_latency" @@ -10,5 +9,4 @@ rules: threshold: 300000 comparator: ">" description: "Zookeeper cluster latency is too high" - lasting_duration: "5m" - latency_disabled: "false" \ No newline at end of file + lasting_duration: "5m" \ No newline at end of file