Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Zookeeper duration #541

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
6 changes: 4 additions & 2 deletions docs/severity.md
Original file line number Diff line number Diff line change
Expand Up @@ -1311,9 +1311,11 @@

|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Zookeeper server-health|-|X|-|-|-|
|Zookeeper cluster-health|X|-|-|-|-|
|Zookeeper server-latency|-|X|-|-|-|
|Zookeeper cluster-latency|X|-|-|-|-|
|Zookeeper heartbeat|X|-|-|-|-|
|Zookeeper service health|X|-|-|-|-|
|Zookeeper latency|X|X|-|-|-|
|Zookeeper file descriptors usage|X|X|-|-|-|


8 changes: 5 additions & 3 deletions modules/smart-agent_zookeeper/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ Note the following parameters:

These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all
[modules](../) in this repository. Other variables, specific to this module, are available in
[variables.tf](variables.tf).
[variables.tf](variables.tf) and [variables-gen.tf](variables-gen.tf).
In general, the default configuration "works" but all of these Terraform
[variables](https://www.terraform.io/language/values/variables) make it possible to
customize the detectors' behavior to better fit your needs.
Expand All @@ -77,9 +77,11 @@ This module creates the following SignalFx detectors which could contain one or

|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Zookeeper server-health|-|X|-|-|-|
|Zookeeper cluster-health|X|-|-|-|-|
|Zookeeper server-latency|-|X|-|-|-|
|Zookeeper cluster-latency|X|-|-|-|-|
|Zookeeper heartbeat|X|-|-|-|-|
|Zookeeper service health|X|-|-|-|-|
|Zookeeper latency|X|X|-|-|-|
|Zookeeper file descriptors usage|X|X|-|-|-|

## How to collect required metrics?
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Detector definition for the Zookeeper "server-health" detector.
# One MAJOR rule: gauge.zk_service_health != 1, lasting 5m.
module: zookeeper
name: server-health
# No aggregation function is applied to the signal.
aggregation: false
signals:
  signal:
    metric: "gauge.zk_service_health"
rules:
  major:
    threshold: 1
    comparator: "!="
    description: "Zookeeper server is not running"
    lasting_duration: "5m"
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Detector definition for the Zookeeper "cluster-health" detector.
# One CRITICAL rule: mean of gauge.zk_service_health == 0, lasting 5m.
module: zookeeper
name: cluster-health
# NOTE(review): grouping by 'kubernetes_cluster' assumes that dimension is
# present on the metric - confirm for non-Kubernetes deployments.
aggregation: ".mean(by=['kubernetes_cluster'])"
signals:
  signal:
    metric: "gauge.zk_service_health"
rules:
  critical:
    threshold: 0
    comparator: "=="
    description: "Zookeeper cluster is not running"
    lasting_duration: "5m"
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Detector definition for the Zookeeper "server-latency" detector.
# One MAJOR rule: gauge.zk_avg_latency > 250000, lasting 5m.
module: zookeeper
name: server-latency
# No aggregation function is applied to the signal.
aggregation: false
signals:
  signal:
    metric: "gauge.zk_avg_latency"
rules:
  major:
    # NOTE(review): the unit of this threshold is not stated here - confirm
    # against the unit gauge.zk_avg_latency is reported in.
    threshold: 250000
    comparator: ">"
    description: "Zookeeper server latency is too high"
    lasting_duration: "5m"
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Detector definition for the Zookeeper "cluster-latency" detector.
# One CRITICAL rule: mean of gauge.zk_avg_latency > 300000, lasting 5m.
module: zookeeper
name: cluster-latency
# NOTE(review): grouping by 'kubernetes_cluster' assumes that dimension is
# present on the metric - confirm for non-Kubernetes deployments.
aggregation: ".mean(by=['kubernetes_cluster'])"
signals:
  signal:
    metric: "gauge.zk_avg_latency"
rules:
  critical:
    # NOTE(review): the unit of this threshold is not stated here - confirm
    # against the unit gauge.zk_avg_latency is reported in.
    threshold: 300000
    comparator: ">"
    description: "Zookeeper cluster latency is too high"
    lasting_duration: "5m"
108 changes: 108 additions & 0 deletions modules/smart-agent_zookeeper/detectors-gen.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Detector on gauge.zk_service_health without an aggregation function.
# Publishes MAJOR when the signal differs from the configured threshold.
resource "signalfx_detector" "server-health" {
  name = format("%s %s", local.detector_name_prefix, "Zookeeper server-health")

  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  authorized_writer_teams = var.authorized_writer_teams
  max_delay               = var.server-health_max_delay

  # `lasting` is rendered as None when the duration variable is null,
  # otherwise as the quoted duration string.
  program_text = <<-EOF
    signal = data('gauge.zk_service_health', filter=${module.filtering.signalflow})${var.server-health_transformation_function}.publish('signal')
    detect(when(signal != ${var.server-health_threshold_major}, lasting=%{if var.server-health_lasting_duration_major == null}None%{else}'${var.server-health_lasting_duration_major}'%{endif}, at_least=${var.server-health_at_least_percentage_major})).publish('MAJOR')
EOF

  rule {
    severity     = "Major"
    detect_label = "MAJOR"
    description  = "Zookeeper server is not running != ${var.server-health_threshold_major}"
    tip          = var.server-health_tip
    runbook_url  = try(coalesce(var.server-health_runbook_url, var.runbook_url), "")

    # The detector-level switch wins; otherwise fall back to the module-wide one.
    disabled = coalesce(var.server-health_disabled, var.detectors_disabled)

    # Detector-specific "major" recipients take precedence over the global list.
    notifications = try(coalescelist(lookup(var.server-health_notifications, "major", []), var.notifications.major), null)

    # A custom subject/body is used only when one is provided.
    parameterized_subject = var.message_subject != "" ? var.message_subject : local.rule_subject
    parameterized_body    = var.message_body != "" ? var.message_body : local.rule_body
  }
}

# Detector on gauge.zk_service_health with a configurable aggregation function.
# Publishes CRIT when the aggregated signal equals the configured threshold.
resource "signalfx_detector" "cluster-health" {
  name = format("%s %s", local.detector_name_prefix, "Zookeeper cluster-health")

  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  authorized_writer_teams = var.authorized_writer_teams
  max_delay               = var.cluster-health_max_delay

  # `lasting` is rendered as None when the duration variable is null,
  # otherwise as the quoted duration string.
  program_text = <<-EOF
    signal = data('gauge.zk_service_health', filter=${module.filtering.signalflow})${var.cluster-health_aggregation_function}${var.cluster-health_transformation_function}.publish('signal')
    detect(when(signal == ${var.cluster-health_threshold_critical}, lasting=%{if var.cluster-health_lasting_duration_critical == null}None%{else}'${var.cluster-health_lasting_duration_critical}'%{endif}, at_least=${var.cluster-health_at_least_percentage_critical})).publish('CRIT')
EOF

  rule {
    severity     = "Critical"
    detect_label = "CRIT"
    description  = "Zookeeper cluster is not running == ${var.cluster-health_threshold_critical}"
    tip          = var.cluster-health_tip
    runbook_url  = try(coalesce(var.cluster-health_runbook_url, var.runbook_url), "")

    # The detector-level switch wins; otherwise fall back to the module-wide one.
    disabled = coalesce(var.cluster-health_disabled, var.detectors_disabled)

    # Detector-specific "critical" recipients take precedence over the global list.
    notifications = try(coalescelist(lookup(var.cluster-health_notifications, "critical", []), var.notifications.critical), null)

    # A custom subject/body is used only when one is provided.
    parameterized_subject = var.message_subject != "" ? var.message_subject : local.rule_subject
    parameterized_body    = var.message_body != "" ? var.message_body : local.rule_body
  }
}

# Detector on gauge.zk_avg_latency without an aggregation function.
# Publishes MAJOR when the signal exceeds the configured threshold.
resource "signalfx_detector" "server-latency" {
  name = format("%s %s", local.detector_name_prefix, "Zookeeper server-latency")

  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  authorized_writer_teams = var.authorized_writer_teams
  max_delay               = var.server-latency_max_delay

  # `lasting` is rendered as None when the duration variable is null,
  # otherwise as the quoted duration string.
  program_text = <<-EOF
    signal = data('gauge.zk_avg_latency', filter=${module.filtering.signalflow})${var.server-latency_transformation_function}.publish('signal')
    detect(when(signal > ${var.server-latency_threshold_major}, lasting=%{if var.server-latency_lasting_duration_major == null}None%{else}'${var.server-latency_lasting_duration_major}'%{endif}, at_least=${var.server-latency_at_least_percentage_major})).publish('MAJOR')
EOF

  rule {
    severity     = "Major"
    detect_label = "MAJOR"
    description  = "Zookeeper server latency is too high > ${var.server-latency_threshold_major}"
    tip          = var.server-latency_tip
    runbook_url  = try(coalesce(var.server-latency_runbook_url, var.runbook_url), "")

    # The detector-level switch wins; otherwise fall back to the module-wide one.
    disabled = coalesce(var.server-latency_disabled, var.detectors_disabled)

    # Detector-specific "major" recipients take precedence over the global list.
    notifications = try(coalescelist(lookup(var.server-latency_notifications, "major", []), var.notifications.major), null)

    # A custom subject/body is used only when one is provided.
    parameterized_subject = var.message_subject != "" ? var.message_subject : local.rule_subject
    parameterized_body    = var.message_body != "" ? var.message_body : local.rule_body
  }
}

# Detector on gauge.zk_avg_latency with a configurable aggregation function.
# Publishes CRIT when the aggregated signal exceeds the configured threshold.
resource "signalfx_detector" "cluster-latency" {
  name = format("%s %s", local.detector_name_prefix, "Zookeeper cluster-latency")

  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  authorized_writer_teams = var.authorized_writer_teams
  max_delay               = var.cluster-latency_max_delay

  # `lasting` is rendered as None when the duration variable is null,
  # otherwise as the quoted duration string.
  program_text = <<-EOF
    signal = data('gauge.zk_avg_latency', filter=${module.filtering.signalflow})${var.cluster-latency_aggregation_function}${var.cluster-latency_transformation_function}.publish('signal')
    detect(when(signal > ${var.cluster-latency_threshold_critical}, lasting=%{if var.cluster-latency_lasting_duration_critical == null}None%{else}'${var.cluster-latency_lasting_duration_critical}'%{endif}, at_least=${var.cluster-latency_at_least_percentage_critical})).publish('CRIT')
EOF

  rule {
    severity     = "Critical"
    detect_label = "CRIT"
    description  = "Zookeeper cluster latency is too high > ${var.cluster-latency_threshold_critical}"
    tip          = var.cluster-latency_tip
    runbook_url  = try(coalesce(var.cluster-latency_runbook_url, var.runbook_url), "")

    # The detector-level switch wins; otherwise fall back to the module-wide one.
    disabled = coalesce(var.cluster-latency_disabled, var.detectors_disabled)

    # Detector-specific "critical" recipients take precedence over the global list.
    notifications = try(coalescelist(lookup(var.cluster-latency_notifications, "critical", []), var.notifications.critical), null)

    # A custom subject/body is used only when one is provided.
    parameterized_subject = var.message_subject != "" ? var.message_subject : local.rule_subject
    parameterized_body    = var.message_body != "" ? var.message_body : local.rule_body
  }
}

67 changes: 0 additions & 67 deletions modules/smart-agent_zookeeper/detectors-zookeeper.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,73 +26,6 @@ EOF
max_delay = var.heartbeat_max_delay
}

# Detector on gauge.zk_service_health, restricted to the 'zookeeper' plugin.
# Publishes CRIT whenever the signal differs from 1.
resource "signalfx_detector" "zookeeper_health" {
  name = format("%s %s", local.detector_name_prefix, "Zookeeper service health")

  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  authorized_writer_teams = var.authorized_writer_teams
  max_delay               = var.zookeeper_health_max_delay

  program_text = <<-EOF
    signal = data('gauge.zk_service_health', filter=filter('plugin', 'zookeeper') and ${module.filtering.signalflow})${var.zookeeper_health_aggregation_function}${var.zookeeper_health_transformation_function}.publish('signal')
    detect(when(signal != 1)).publish('CRIT')
EOF

  rule {
    severity     = "Critical"
    detect_label = "CRIT"
    description  = "is not running"
    tip          = var.zookeeper_health_tip
    runbook_url  = try(coalesce(var.zookeeper_health_runbook_url, var.runbook_url), "")

    # Precedence: per-severity switch, then per-detector switch, then module-wide.
    disabled = coalesce(var.zookeeper_health_disabled_critical, var.zookeeper_health_disabled, var.detectors_disabled)

    # Detector-specific "critical" recipients take precedence over the global list.
    notifications = try(coalescelist(lookup(var.zookeeper_health_notifications, "critical", []), var.notifications.critical), null)

    # A custom subject/body is used only when one is provided.
    parameterized_subject = var.message_subject != "" ? var.message_subject : local.rule_subject
    parameterized_body    = var.message_body != "" ? var.message_body : local.rule_body
  }
}

# Detector on gauge.zk_avg_latency, restricted to the 'zookeeper' plugin.
# Publishes CRIT above the critical threshold, and MAJOR above the major
# threshold only while the critical condition is not met.
resource "signalfx_detector" "zookeeper_latency" {
  name = format("%s %s", local.detector_name_prefix, "Zookeeper latency")

  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
  authorized_writer_teams = var.authorized_writer_teams
  max_delay               = var.zookeeper_latency_max_delay

  program_text = <<-EOF
    signal = data('gauge.zk_avg_latency', filter=filter('plugin', 'zookeeper') and ${module.filtering.signalflow})${var.zookeeper_latency_aggregation_function}${var.zookeeper_latency_transformation_function}.publish('signal')
    detect(when(signal > ${var.zookeeper_latency_threshold_critical})).publish('CRIT')
    detect(when(signal > ${var.zookeeper_latency_threshold_major}) and (not when(signal > ${var.zookeeper_latency_threshold_critical}))).publish('MAJOR')
EOF

  # Critical rule, bound to the 'CRIT' publish label.
  rule {
    severity     = "Critical"
    detect_label = "CRIT"
    description  = "is too high > ${var.zookeeper_latency_threshold_critical}"
    tip          = var.zookeeper_latency_tip
    runbook_url  = try(coalesce(var.zookeeper_latency_runbook_url, var.runbook_url), "")

    # Precedence: per-severity switch, then per-detector switch, then module-wide.
    disabled      = coalesce(var.zookeeper_latency_disabled_critical, var.zookeeper_latency_disabled, var.detectors_disabled)
    notifications = try(coalescelist(lookup(var.zookeeper_latency_notifications, "critical", []), var.notifications.critical), null)

    # A custom subject/body is used only when one is provided.
    parameterized_subject = var.message_subject != "" ? var.message_subject : local.rule_subject
    parameterized_body    = var.message_body != "" ? var.message_body : local.rule_body
  }

  # Major rule, bound to the 'MAJOR' publish label.
  rule {
    severity     = "Major"
    detect_label = "MAJOR"
    description  = "is too high > ${var.zookeeper_latency_threshold_major}"
    tip          = var.zookeeper_latency_tip
    runbook_url  = try(coalesce(var.zookeeper_latency_runbook_url, var.runbook_url), "")

    # Precedence: per-severity switch, then per-detector switch, then module-wide.
    disabled      = coalesce(var.zookeeper_latency_disabled_major, var.zookeeper_latency_disabled, var.detectors_disabled)
    notifications = try(coalescelist(lookup(var.zookeeper_latency_notifications, "major", []), var.notifications.major), null)

    # A custom subject/body is used only when one is provided.
    parameterized_subject = var.message_subject != "" ? var.message_subject : local.rule_subject
    parameterized_body    = var.message_body != "" ? var.message_body : local.rule_body
  }
}

resource "signalfx_detector" "file_descriptors" {
name = format("%s %s", local.detector_name_prefix, "Zookeeper file descriptors usage")

Expand Down
22 changes: 16 additions & 6 deletions modules/smart-agent_zookeeper/outputs.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
# Exposes the whole cluster-health detector resource to callers of this module.
output "cluster-health" {
  value       = signalfx_detector.cluster-health
  description = "Detector resource for cluster-health"
}

# Exposes the whole cluster-latency detector resource to callers of this module.
output "cluster-latency" {
  value       = signalfx_detector.cluster-latency
  description = "Detector resource for cluster-latency"
}

output "file_descriptors" {
description = "Detector resource for file_descriptors"
value = signalfx_detector.file_descriptors
Expand All @@ -8,13 +18,13 @@ output "heartbeat" {
value = signalfx_detector.heartbeat
}

output "zookeeper_health" {
description = "Detector resource for zookeeper_health"
value = signalfx_detector.zookeeper_health
output "server-health" {
description = "Detector resource for server-health"
value = signalfx_detector.server-health
}

output "zookeeper_latency" {
description = "Detector resource for zookeeper_latency"
value = signalfx_detector.zookeeper_latency
output "server-latency" {
description = "Detector resource for server-latency"
value = signalfx_detector.server-latency
}

Loading
Loading