From b54ca6fdc4f44b1e74ecf08f55fb35eed7025d47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20MARMOL?= Date: Thu, 6 Apr 2023 17:41:07 +0200 Subject: [PATCH 1/3] Add Swap utilization detector --- docs/severity.md | 1 + modules/smart-agent_system-common/README.md | 28 ++++++ .../conf/06-swap.yaml | 17 ++++ .../conf/readme.yaml | 26 ++++++ .../detectors-gen.tf | 45 ++++++++++ modules/smart-agent_system-common/outputs.tf | 5 ++ .../variables-gen.tf | 90 +++++++++++++++++++ 7 files changed, 212 insertions(+) create mode 100644 modules/smart-agent_system-common/conf/06-swap.yaml diff --git a/docs/severity.md b/docs/severity.md index e617b86d6..042acd43d 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -1184,6 +1184,7 @@ |System disk space utilization|X|X|-|-|-| |System disk inodes utilization|X|X|-|-|-| |System memory utilization|X|X|-|-|-| +|System swap utilization|X|X|-|-|-| |System disk space running out|-|X|-|-|-| diff --git a/modules/smart-agent_system-common/README.md b/modules/smart-agent_system-common/README.md index cdfc24eae..78b2d90ae 100644 --- a/modules/smart-agent_system-common/README.md +++ b/modules/smart-agent_system-common/README.md @@ -8,6 +8,7 @@ - [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module) - [How to collect required metrics?](#how-to-collect-required-metrics) - [Monitors](#monitors) + - [Swap](#swap) - [Metrics](#metrics) - [Related documentation](#related-documentation) @@ -82,6 +83,7 @@ This module creates the following SignalFx detectors which could contain one or |System disk space utilization|X|X|-|-|-| |System disk inodes utilization|X|X|-|-|-| |System memory utilization|X|X|-|-|-| +|System swap utilization|X|X|-|-|-| |System disk space running out|-|X|-|-|-| ## How to collect required metrics? 
@@ -132,6 +134,30 @@ You have two choices to use load based detectors: In both cases, the goal is to get alerts based on the __ratio__ of load by dividing the original load per the number of CPU/cores which is the only way to get generic and relevant alerts for load. It mainly depends if you want to collect 2 metrics instead of 1 and if you want the load one to be raw or already averaged. +### Swap + +To activate the swap monitor, you need to add these parameters to the otel-agent configuration + +* Receivers configuration + +``` +receivers: + hostmetrics: + scrapers: + paging: + metrics: + system.paging.utilization: + enabled: true +``` + +* Exporters configuration + +``` +exporters: + signalfx: + include_metrics: + - metric_name: system.paging.utilization +``` ### Metrics @@ -150,6 +176,7 @@ parameter to the corresponding monitor configuration: - '!load.midterm' - '!memory.utilization' - '!percent_inodes.used' + - '!system.paging.utilization' ``` @@ -167,3 +194,4 @@ parameter to the corresponding monitor configuration: * [Smart Agent monitor memory](https://github.com/signalfx/signalfx-agent/blob/main/docs/monitors/memory.md) * [Splunk Observability integration cpu](https://docs.splunk.com/Observability/gdi/cpu/cpu.html) * [Splunk Observability integration load](https://docs.splunk.com/Observability/gdi/load/load.html) +* [Splunk Observability hostmetrics](https://docs.splunk.com/Observability/gdi/opentelemetry/components/host-metrics-receiver.html) diff --git a/modules/smart-agent_system-common/conf/06-swap.yaml b/modules/smart-agent_system-common/conf/06-swap.yaml new file mode 100644 index 000000000..f856b6a37 --- /dev/null +++ b/modules/smart-agent_system-common/conf/06-swap.yaml @@ -0,0 +1,17 @@ +module: system +name: "swap utilization" +id: swap +transformation: ".min(over='5m')" +value_unit: "%" +signals: + signal: + metric: system.paging.utilization + filter: "filter('state': 'used')" +rules: + critical: + threshold: 95 + comparator: ">" + major: + threshold: 
90 + comparator: ">" + dependency: critical diff --git a/modules/smart-agent_system-common/conf/readme.yaml b/modules/smart-agent_system-common/conf/readme.yaml index 5fc59fa67..48c27d935 100644 --- a/modules/smart-agent_system-common/conf/readme.yaml +++ b/modules/smart-agent_system-common/conf/readme.yaml @@ -13,6 +13,8 @@ documentations: url: 'https://docs.splunk.com/Observability/gdi/cpu/cpu.html' - name: Splunk Observability integration load url: 'https://docs.splunk.com/Observability/gdi/load/load.html' + - name: Splunk Observability hostmetrics + url: 'https://docs.splunk.com/Observability/gdi/opentelemetry/components/host-metrics-receiver.html' source_doc: | ### Monitors @@ -37,3 +39,27 @@ source_doc: | In both cases, the goal is to get alerts based on the __ratio__ of load by dividing the original load per the number of CPU/cores which is the only way to get generic and relevant alerts for load. It mainly depends if you want to collect 2 metrics instead of 1 and if you want the load one to be raw or already averaged. 
+ ### Swap + + To activate the swap monitor, you need to add these parameters to the otel-agent configuration + + * Receivers configuration + + ``` + receivers: + hostmetrics: + scrapers: + paging: + metrics: + system.paging.utilization: + enabled: true + ``` + + * Exporters configuration + + ``` + exporters: + signalfx: + include_metrics: + - metric_name: system.paging.utilization + ``` \ No newline at end of file diff --git a/modules/smart-agent_system-common/detectors-gen.tf b/modules/smart-agent_system-common/detectors-gen.tf index 176ea0f48..97c0828ff 100644 --- a/modules/smart-agent_system-common/detectors-gen.tf +++ b/modules/smart-agent_system-common/detectors-gen.tf @@ -248,3 +248,48 @@ EOF max_delay = var.memory_max_delay } +resource "signalfx_detector" "swap" { + name = format("%s %s", local.detector_name_prefix, "System swap utilization") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + viz_options { + label = "signal" + value_suffix = "%" + } + + program_text = <<-EOF + signal = data('system.paging.utilization', filter=filter('state': 'used') and ${module.filtering.signalflow})${var.swap_aggregation_function}${var.swap_transformation_function}.publish('signal') + detect(when(signal > ${var.swap_threshold_critical}, lasting=%{if var.swap_lasting_duration_critical == null}None%{else}'${var.swap_lasting_duration_critical}'%{endif}, at_least=${var.swap_at_least_percentage_critical})).publish('CRIT') + detect(when(signal > ${var.swap_threshold_major}, lasting=%{if var.swap_lasting_duration_major == null}None%{else}'${var.swap_lasting_duration_major}'%{endif}, at_least=${var.swap_at_least_percentage_major}) and (not when(signal > ${var.swap_threshold_critical}, lasting=%{if var.swap_lasting_duration_critical == null}None%{else}'${var.swap_lasting_duration_critical}'%{endif}, 
at_least=${var.swap_at_least_percentage_critical}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.swap_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.swap_disabled_critical, var.swap_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.swap_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.swap_runbook_url, var.runbook_url), "") + tip = var.swap_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.swap_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.swap_disabled_major, var.swap_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.swap_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.swap_runbook_url, var.runbook_url), "") + tip = var.swap_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.swap_max_delay +} + diff --git a/modules/smart-agent_system-common/outputs.tf b/modules/smart-agent_system-common/outputs.tf index b6d2f3221..fe5a620e5 100644 --- a/modules/smart-agent_system-common/outputs.tf +++ b/modules/smart-agent_system-common/outputs.tf @@ -33,3 +33,8 @@ output "memory" { value = signalfx_detector.memory } +output "swap" { + description = "Detector resource for swap" + value = signalfx_detector.swap +} + diff --git a/modules/smart-agent_system-common/variables-gen.tf b/modules/smart-agent_system-common/variables-gen.tf index e3d0a59f6..1dbf82637 100644 --- a/modules/smart-agent_system-common/variables-gen.tf +++ b/modules/smart-agent_system-common/variables-gen.tf @@ -492,3 +492,93 @@ variable "memory_at_least_percentage_major" { type = number default = 1 } +# swap detector + +variable "swap_notifications" { + description = "Notification recipients list per severity overridden for swap detector" + type = map(list(string)) + default = {} +} + +variable "swap_aggregation_function" { + description = "Aggregation function and group by for swap detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "swap_transformation_function" { + description = "Transformation function for swap detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='5m')" +} + +variable "swap_max_delay" { + description = "Enforce max delay for swap detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "swap_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "swap_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "swap_disabled" { + description = "Disable all alerting rules for swap detector" + type = bool + default = null +} + +variable "swap_disabled_critical" { + description = "Disable critical alerting rule for swap detector" + type = bool + default = null +} + +variable "swap_disabled_major" { + description = "Disable major alerting rule for swap detector" + type = bool + default = null +} + +variable "swap_threshold_critical" { + description = "Critical threshold for swap detector in %" + type = number + default = 95 +} + +variable "swap_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "swap_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "swap_threshold_major" { + description = "Major threshold for swap detector in %" + type = number + default = 90 +} + +variable "swap_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "swap_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} From 1826c1eec91b30ba9b778117ebe8679ce316fe13 Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20MARMOL?= Date: Thu, 6 Apr 2023 17:53:55 +0200 Subject: [PATCH 2/3] Fix filter --- modules/smart-agent_system-common/conf/06-swap.yaml | 2 +- modules/smart-agent_system-common/detectors-gen.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/smart-agent_system-common/conf/06-swap.yaml b/modules/smart-agent_system-common/conf/06-swap.yaml index f856b6a37..0ff53f840 100644 --- a/modules/smart-agent_system-common/conf/06-swap.yaml +++ b/modules/smart-agent_system-common/conf/06-swap.yaml @@ -6,7 +6,7 @@ value_unit: "%" signals: signal: metric: system.paging.utilization - filter: "filter('state': 'used')" + filter: "filter('state', 'used')" rules: critical: threshold: 95 diff --git a/modules/smart-agent_system-common/detectors-gen.tf b/modules/smart-agent_system-common/detectors-gen.tf index 97c0828ff..a902bd009 100644 --- a/modules/smart-agent_system-common/detectors-gen.tf +++ b/modules/smart-agent_system-common/detectors-gen.tf @@ -261,7 +261,7 @@ resource "signalfx_detector" "swap" { } program_text = <<-EOF - signal = data('system.paging.utilization', filter=filter('state': 'used') and ${module.filtering.signalflow})${var.swap_aggregation_function}${var.swap_transformation_function}.publish('signal') + signal = data('system.paging.utilization', filter=filter('state', 'used') and ${module.filtering.signalflow})${var.swap_aggregation_function}${var.swap_transformation_function}.publish('signal') detect(when(signal > ${var.swap_threshold_critical}, lasting=%{if var.swap_lasting_duration_critical == null}None%{else}'${var.swap_lasting_duration_critical}'%{endif}, at_least=${var.swap_at_least_percentage_critical})).publish('CRIT') detect(when(signal > ${var.swap_threshold_major}, lasting=%{if var.swap_lasting_duration_major == null}None%{else}'${var.swap_lasting_duration_major}'%{endif}, at_least=${var.swap_at_least_percentage_major}) and (not when(signal > ${var.swap_threshold_critical}, lasting=%{if 
var.swap_lasting_duration_critical == null}None%{else}'${var.swap_lasting_duration_critical}'%{endif}, at_least=${var.swap_at_least_percentage_critical}))).publish('MAJOR') EOF From 65c248e0015dbc0d979e6dc802441079051eb312 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20MARMOL?= Date: Thu, 6 Apr 2023 18:03:37 +0200 Subject: [PATCH 3/3] Swap - Adjust threshold --- modules/smart-agent_system-common/conf/06-swap.yaml | 4 ++-- modules/smart-agent_system-common/variables-gen.tf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/smart-agent_system-common/conf/06-swap.yaml b/modules/smart-agent_system-common/conf/06-swap.yaml index 0ff53f840..74b49dd6b 100644 --- a/modules/smart-agent_system-common/conf/06-swap.yaml +++ b/modules/smart-agent_system-common/conf/06-swap.yaml @@ -9,9 +9,9 @@ signals: filter: "filter('state', 'used')" rules: critical: - threshold: 95 + threshold: 90 comparator: ">" major: - threshold: 90 + threshold: 80 comparator: ">" dependency: critical diff --git a/modules/smart-agent_system-common/variables-gen.tf b/modules/smart-agent_system-common/variables-gen.tf index 1dbf82637..7fbd61a9c 100644 --- a/modules/smart-agent_system-common/variables-gen.tf +++ b/modules/smart-agent_system-common/variables-gen.tf @@ -551,7 +551,7 @@ variable "swap_disabled_major" { variable "swap_threshold_critical" { description = "Critical threshold for swap detector in %" type = number - default = 95 + default = 90 } variable "swap_lasting_duration_critical" { @@ -568,7 +568,7 @@ variable "swap_at_least_percentage_critical" { variable "swap_threshold_major" { description = "Major threshold for swap detector in %" type = number - default = 90 + default = 80 } variable "swap_lasting_duration_major" {