Add a "System filesystem inodes utilization" detector to the
smart-agent_system-common module.

Files touched:
  - docs/severity.md and the module README: new row in the severity tables.
  - module README: the metric is added to the monitor configuration example.
  - conf/04-filesystem-inodes.yaml: new detector definition.
  - detectors-gen.tf, outputs.tf, variables-gen.tf: the matching resource,
    output and per-detector variables.

The detector publishes used / (used + free) * 100 computed from
system.filesystem.inodes.usage. With the default variables it raises CRIT
above 95 and MAJOR above 90; the MAJOR condition is explicitly excluded
while the CRIT condition holds.

NOTE(review): `used` and `free` are selected with different values of the
`state` dimension and the default aggregation function is empty. Confirm
that SignalFlow correlates the two streams as intended without an explicit
`by=[...]` aggregation.

NOTE(review): this patch was recovered from a copy whose newlines and leading
whitespace had been collapsed. Line counts match every hunk header, but the
indentation of context lines (notably the README list items) is a best-effort
reconstruction. Verify against the target tree, or apply with
`git apply --ignore-whitespace`.

diff --git a/docs/severity.md b/docs/severity.md
index 4c7f18656..b35e2c92e 100644
--- a/docs/severity.md
+++ b/docs/severity.md
@@ -1253,6 +1253,7 @@
 |System cpu utilization|X|X|-|-|-|
 |System load 5m ratio|X|X|-|-|-|
 |System disk space utilization|X|X|-|-|-|
+|System filesystem inodes utilization|X|X|-|-|-|
 |System disk inodes utilization|X|X|-|-|-|
 |System memory utilization|X|X|-|-|-|
 |System swap in/out|X|X|-|-|-|
diff --git a/modules/smart-agent_system-common/README.md b/modules/smart-agent_system-common/README.md
index 2d12190ad..5d7e79f7d 100644
--- a/modules/smart-agent_system-common/README.md
+++ b/modules/smart-agent_system-common/README.md
@@ -80,6 +80,7 @@ This module creates the following SignalFx detectors which could contain one or
 |System cpu utilization|X|X|-|-|-|
 |System load 5m ratio|X|X|-|-|-|
 |System disk space utilization|X|X|-|-|-|
+|System filesystem inodes utilization|X|X|-|-|-|
 |System disk inodes utilization|X|X|-|-|-|
 |System memory utilization|X|X|-|-|-|
 |System swap in/out|X|X|-|-|-|
@@ -153,6 +154,7 @@ parameter to the corresponding monitor configuration:
         - '!percent_inodes.used'
         - '!vmpage_io.swap.in'
         - '!vmpage_io.swap.out'
+        - '!system.filesystem.inodes.usage'
 ```


diff --git a/modules/smart-agent_system-common/conf/04-filesystem-inodes.yaml b/modules/smart-agent_system-common/conf/04-filesystem-inodes.yaml
new file mode 100644
index 000000000..13fbfd498
--- /dev/null
+++ b/modules/smart-agent_system-common/conf/04-filesystem-inodes.yaml
@@ -0,0 +1,22 @@
+module: system
+name: "filesystem inodes utilization"
+id: filesystem_inodes
+transformation: ".max(over='5m')"
+value_unit: "%"
+signals:
+  used:
+    metric: system.filesystem.inodes.usage
+    filter: filter('state', 'used')
+  free:
+    metric: system.filesystem.inodes.usage
+    filter: filter('state', 'free')
+  signal:
+    formula: (used / (used + free) * 100)
+rules:
+  critical:
+    threshold: 95
+    comparator: ">"
+  major:
+    threshold: 90
+    comparator: ">"
+    dependency: critical
diff --git a/modules/smart-agent_system-common/detectors-gen.tf b/modules/smart-agent_system-common/detectors-gen.tf
index 7feb897c1..a65d5137b 100644
--- a/modules/smart-agent_system-common/detectors-gen.tf
+++ b/modules/smart-agent_system-common/detectors-gen.tf
@@ -158,6 +158,53 @@ EOF
   max_delay = var.disk_space_max_delay
 }

+resource "signalfx_detector" "filesystem_inodes" {
+  name = format("%s %s", local.detector_name_prefix, "System filesystem inodes utilization")
+
+  authorized_writer_teams = var.authorized_writer_teams
+  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null)
+  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
+
+  viz_options {
+    label        = "signal"
+    value_suffix = "%"
+  }
+
+  program_text = <<-EOF
+    used = data('system.filesystem.inodes.usage', filter=filter('state', 'used') and ${module.filtering.signalflow})${var.filesystem_inodes_aggregation_function}${var.filesystem_inodes_transformation_function}
+    free = data('system.filesystem.inodes.usage', filter=filter('state', 'free') and ${module.filtering.signalflow})${var.filesystem_inodes_aggregation_function}${var.filesystem_inodes_transformation_function}
+    signal = (used / (used + free) * 100).publish('signal')
+    detect(when(signal > ${var.filesystem_inodes_threshold_critical}, lasting=%{if var.filesystem_inodes_lasting_duration_critical == null}None%{else}'${var.filesystem_inodes_lasting_duration_critical}'%{endif}, at_least=${var.filesystem_inodes_at_least_percentage_critical})).publish('CRIT')
+    detect(when(signal > ${var.filesystem_inodes_threshold_major}, lasting=%{if var.filesystem_inodes_lasting_duration_major == null}None%{else}'${var.filesystem_inodes_lasting_duration_major}'%{endif}, at_least=${var.filesystem_inodes_at_least_percentage_major}) and (not when(signal > ${var.filesystem_inodes_threshold_critical}, lasting=%{if var.filesystem_inodes_lasting_duration_critical == null}None%{else}'${var.filesystem_inodes_lasting_duration_critical}'%{endif}, at_least=${var.filesystem_inodes_at_least_percentage_critical}))).publish('MAJOR')
+EOF
+
+  rule {
+    description           = "is too high > ${var.filesystem_inodes_threshold_critical}%"
+    severity              = "Critical"
+    detect_label          = "CRIT"
+    disabled              = coalesce(var.filesystem_inodes_disabled_critical, var.filesystem_inodes_disabled, var.detectors_disabled)
+    notifications         = try(coalescelist(lookup(var.filesystem_inodes_notifications, "critical", []), var.notifications.critical), null)
+    runbook_url           = try(coalesce(var.filesystem_inodes_runbook_url, var.runbook_url), "")
+    tip                   = var.filesystem_inodes_tip
+    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
+    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
+  }
+
+  rule {
+    description           = "is too high > ${var.filesystem_inodes_threshold_major}%"
+    severity              = "Major"
+    detect_label          = "MAJOR"
+    disabled              = coalesce(var.filesystem_inodes_disabled_major, var.filesystem_inodes_disabled, var.detectors_disabled)
+    notifications         = try(coalescelist(lookup(var.filesystem_inodes_notifications, "major", []), var.notifications.major), null)
+    runbook_url           = try(coalesce(var.filesystem_inodes_runbook_url, var.runbook_url), "")
+    tip                   = var.filesystem_inodes_tip
+    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
+    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
+  }
+
+  max_delay = var.filesystem_inodes_max_delay
+}
+
 resource "signalfx_detector" "disk_inodes" {
   name = format("%s %s", local.detector_name_prefix, "System disk inodes utilization")

diff --git a/modules/smart-agent_system-common/outputs.tf b/modules/smart-agent_system-common/outputs.tf
index 07016f98c..471df98f3 100644
--- a/modules/smart-agent_system-common/outputs.tf
+++ b/modules/smart-agent_system-common/outputs.tf
@@ -18,6 +18,11 @@ output "disk_space" {
   value       = signalfx_detector.disk_space
 }

+output "filesystem_inodes" {
+  description = "Detector resource for filesystem_inodes"
+  value       = signalfx_detector.filesystem_inodes
+}
+
 output "heartbeat" {
   description = "Detector resource for heartbeat"
   value       = signalfx_detector.heartbeat
diff --git a/modules/smart-agent_system-common/variables-gen.tf b/modules/smart-agent_system-common/variables-gen.tf
index 8fa66139c..af6fa8034 100644
--- a/modules/smart-agent_system-common/variables-gen.tf
+++ b/modules/smart-agent_system-common/variables-gen.tf
@@ -312,6 +312,96 @@ variable "disk_space_at_least_percentage_major" {
   type        = number
   default     = 1
 }
+# filesystem_inodes detector
+
+variable "filesystem_inodes_notifications" {
+  description = "Notification recipients list per severity overridden for filesystem_inodes detector"
+  type        = map(list(string))
+  default     = {}
+}
+
+variable "filesystem_inodes_aggregation_function" {
+  description = "Aggregation function and group by for filesystem_inodes detector (i.e. \".mean(by=['host'])\")"
+  type        = string
+  default     = ""
+}
+
+variable "filesystem_inodes_transformation_function" {
+  description = "Transformation function for filesystem_inodes detector (i.e. \".mean(over='5m')\")"
+  type        = string
+  default     = ".max(over='5m')"
+}
+
+variable "filesystem_inodes_max_delay" {
+  description = "Enforce max delay for filesystem_inodes detector (use \"0\" or \"null\" for \"Auto\")"
+  type        = number
+  default     = null
+}
+
+variable "filesystem_inodes_tip" {
+  description = "Suggested first course of action or any note useful for incident handling"
+  type        = string
+  default     = ""
+}
+
+variable "filesystem_inodes_runbook_url" {
+  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
+  type        = string
+  default     = ""
+}
+
+variable "filesystem_inodes_disabled" {
+  description = "Disable all alerting rules for filesystem_inodes detector"
+  type        = bool
+  default     = null
+}
+
+variable "filesystem_inodes_disabled_critical" {
+  description = "Disable critical alerting rule for filesystem_inodes detector"
+  type        = bool
+  default     = null
+}
+
+variable "filesystem_inodes_disabled_major" {
+  description = "Disable major alerting rule for filesystem_inodes detector"
+  type        = bool
+  default     = null
+}
+
+variable "filesystem_inodes_threshold_critical" {
+  description = "Critical threshold for filesystem_inodes detector in %"
+  type        = number
+  default     = 95
+}
+
+variable "filesystem_inodes_lasting_duration_critical" {
+  description = "Minimum duration that conditions must be true before raising alert"
+  type        = string
+  default     = null
+}
+
+variable "filesystem_inodes_at_least_percentage_critical" {
+  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
+  type        = number
+  default     = 1
+}
+variable "filesystem_inodes_threshold_major" {
+  description = "Major threshold for filesystem_inodes detector in %"
+  type        = number
+  default     = 90
+}
+
+variable "filesystem_inodes_lasting_duration_major" {
+  description = "Minimum duration that conditions must be true before raising alert"
+  type        = string
+  default     = null
+}
+
+variable "filesystem_inodes_at_least_percentage_major" {
+  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
+  type        = number
+  default     = 1
+}
 # disk_inodes detector

 variable "disk_inodes_notifications" {