Skip to content

Commit

Permalink
Add detector based on Otel filesystem inodes (`system.filesystem.inodes.usage`) (#484)
Browse files Browse the repository at this point in the history

Co-authored-by: Benjamin DUPUIS <[email protected]>
  • Loading branch information
Poil and bd-clara authored Sep 11, 2023
1 parent 521743f commit 226ebd0
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/severity.md
Original file line number Diff line number Diff line change
Expand Up @@ -1253,6 +1253,7 @@
|System cpu utilization|X|X|-|-|-|
|System load 5m ratio|X|X|-|-|-|
|System disk space utilization|X|X|-|-|-|
|System filesystem inodes utilization|X|X|-|-|-|
|System disk inodes utilization|X|X|-|-|-|
|System memory utilization|X|X|-|-|-|
|System swap in/out|X|X|-|-|-|
Expand Down
2 changes: 2 additions & 0 deletions modules/smart-agent_system-common/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ This module creates the following SignalFx detectors which could contain one or
|System cpu utilization|X|X|-|-|-|
|System load 5m ratio|X|X|-|-|-|
|System disk space utilization|X|X|-|-|-|
|System filesystem inodes utilization|X|X|-|-|-|
|System disk inodes utilization|X|X|-|-|-|
|System memory utilization|X|X|-|-|-|
|System swap in/out|X|X|-|-|-|
Expand Down Expand Up @@ -153,6 +154,7 @@ parameter to the corresponding monitor configuration:
- '!percent_inodes.used'
- '!vmpage_io.swap.in'
- '!vmpage_io.swap.out'
- '!system.filesystem.inodes.usage'

```

Expand Down
22 changes: 22 additions & 0 deletions modules/smart-agent_system-common/conf/04-filesystem-inodes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Definition of the "filesystem_inodes" detector. The signalfx_detector
# resource with the same id in detectors-gen.tf mirrors these values.
module: system
name: "filesystem inodes utilization"
id: filesystem_inodes
# Default rolling transformation appended to each data() stream.
transformation: ".max(over='5m')"
value_unit: "%"
signals:
  # Both streams read the same metric, split on the 'state' dimension.
  used:
    metric: system.filesystem.inodes.usage
    filter: filter('state', 'used')
  free:
    metric: system.filesystem.inodes.usage
    filter: filter('state', 'free')
  # Percentage of inodes in use; this is the stream the rules evaluate.
  signal:
    formula: (used / (used + free) * 100)
rules:
  critical:
    threshold: 95
    comparator: ">"
  major:
    threshold: 90
    comparator: ">"
    # Major does not fire while the critical condition holds.
    dependency: critical
47 changes: 47 additions & 0 deletions modules/smart-agent_system-common/detectors-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,53 @@ EOF
max_delay = var.disk_space_max_delay
}

# Detector on the percentage of used inodes, computed from the 'used' and
# 'free' states of the system.filesystem.inodes.usage metric.
resource "signalfx_detector" "filesystem_inodes" {
  name      = "${local.detector_name_prefix} System filesystem inodes utilization"
  max_delay = var.filesystem_inodes_max_delay

  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))
  authorized_writer_teams = var.authorized_writer_teams
  # First non-empty list of var.teams / var.authorized_writer_teams, or null
  # when both are empty (coalescelist fails and try() falls back).
  teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)

  viz_options {
    label        = "signal"
    value_suffix = "%"
  }

  # SignalFlow program: two filtered streams, their ratio published as
  # 'signal', then one detect() per severity. The MAJOR detect is suppressed
  # while the critical condition is true.
  program_text = <<-EOF
used = data('system.filesystem.inodes.usage', filter=filter('state', 'used') and ${module.filtering.signalflow})${var.filesystem_inodes_aggregation_function}${var.filesystem_inodes_transformation_function}
free = data('system.filesystem.inodes.usage', filter=filter('state', 'free') and ${module.filtering.signalflow})${var.filesystem_inodes_aggregation_function}${var.filesystem_inodes_transformation_function}
signal = (used / (used + free) * 100).publish('signal')
detect(when(signal > ${var.filesystem_inodes_threshold_critical}, lasting=%{if var.filesystem_inodes_lasting_duration_critical == null}None%{else}'${var.filesystem_inodes_lasting_duration_critical}'%{endif}, at_least=${var.filesystem_inodes_at_least_percentage_critical})).publish('CRIT')
detect(when(signal > ${var.filesystem_inodes_threshold_major}, lasting=%{if var.filesystem_inodes_lasting_duration_major == null}None%{else}'${var.filesystem_inodes_lasting_duration_major}'%{endif}, at_least=${var.filesystem_inodes_at_least_percentage_major}) and (not when(signal > ${var.filesystem_inodes_threshold_critical}, lasting=%{if var.filesystem_inodes_lasting_duration_critical == null}None%{else}'${var.filesystem_inodes_lasting_duration_critical}'%{endif}, at_least=${var.filesystem_inodes_at_least_percentage_critical}))).publish('MAJOR')
EOF

  # Critical rule, bound to the 'CRIT' detect label published above.
  rule {
    severity     = "Critical"
    detect_label = "CRIT"
    description  = "is too high > ${var.filesystem_inodes_threshold_critical}%"
    # Rule-level flag wins, then the detector-level flag, then the global one.
    disabled = coalesce(var.filesystem_inodes_disabled_critical, var.filesystem_inodes_disabled, var.detectors_disabled)
    # Detector-specific recipients for this severity when provided, otherwise
    # the shared var.notifications.critical list; null if neither is usable.
    notifications         = try(coalescelist(lookup(var.filesystem_inodes_notifications, "critical", []), var.notifications.critical), null)
    runbook_url           = try(coalesce(var.filesystem_inodes_runbook_url, var.runbook_url), "")
    tip                   = var.filesystem_inodes_tip
    parameterized_subject = var.message_subject != "" ? var.message_subject : local.rule_subject
    parameterized_body    = var.message_body != "" ? var.message_body : local.rule_body
  }

  # Major rule, bound to the 'MAJOR' detect label published above.
  rule {
    severity     = "Major"
    detect_label = "MAJOR"
    description  = "is too high > ${var.filesystem_inodes_threshold_major}%"
    # Rule-level flag wins, then the detector-level flag, then the global one.
    disabled = coalesce(var.filesystem_inodes_disabled_major, var.filesystem_inodes_disabled, var.detectors_disabled)
    # Detector-specific recipients for this severity when provided, otherwise
    # the shared var.notifications.major list; null if neither is usable.
    notifications         = try(coalescelist(lookup(var.filesystem_inodes_notifications, "major", []), var.notifications.major), null)
    runbook_url           = try(coalesce(var.filesystem_inodes_runbook_url, var.runbook_url), "")
    tip                   = var.filesystem_inodes_tip
    parameterized_subject = var.message_subject != "" ? var.message_subject : local.rule_subject
    parameterized_body    = var.message_body != "" ? var.message_body : local.rule_body
  }
}

resource "signalfx_detector" "disk_inodes" {
name = format("%s %s", local.detector_name_prefix, "System disk inodes utilization")

Expand Down
5 changes: 5 additions & 0 deletions modules/smart-agent_system-common/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ output "disk_space" {
value = signalfx_detector.disk_space
}

# Exposes the complete filesystem_inodes detector resource as a module output.
output "filesystem_inodes" {
  value       = signalfx_detector.filesystem_inodes
  description = "Detector resource for filesystem_inodes"
}

output "heartbeat" {
description = "Detector resource for heartbeat"
value = signalfx_detector.heartbeat
Expand Down
90 changes: 90 additions & 0 deletions modules/smart-agent_system-common/variables-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,96 @@ variable "disk_space_at_least_percentage_major" {
type = number
default = 1
}
# filesystem_inodes detector

# Per-severity recipient overrides; the detector rules look up the
# "critical" and "major" keys and fall back to var.notifications otherwise.
variable "filesystem_inodes_notifications" {
  type        = map(list(string))
  default     = {}
  description = "Notification recipients list per severity overridden for filesystem_inodes detector"
}

# Appended verbatim to both data() streams, before the transformation.
variable "filesystem_inodes_aggregation_function" {
  type        = string
  default     = ""
  description = "Aggregation function and group by for filesystem_inodes detector (i.e. \".mean(by=['host'])\")"
}

# Appended verbatim to both data() streams, after the aggregation.
variable "filesystem_inodes_transformation_function" {
  type        = string
  default     = ".max(over='5m')"
  description = "Transformation function for filesystem_inodes detector (i.e. \".mean(over='5m')\")"
}

variable "filesystem_inodes_max_delay" {
  type        = number
  default     = null
  description = "Enforce max delay for filesystem_inodes detector (use \"0\" or \"null\" for \"Auto\")"
}

variable "filesystem_inodes_tip" {
  type        = string
  default     = ""
  description = "Suggested first course of action or any note useful for incident handling"
}

variable "filesystem_inodes_runbook_url" {
  type        = string
  default     = ""
  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
}

# The three flags below default to null so that the coalesce() chain in the
# detector rules can fall through to the next, broader flag.
variable "filesystem_inodes_disabled" {
  type        = bool
  default     = null
  description = "Disable all alerting rules for filesystem_inodes detector"
}

variable "filesystem_inodes_disabled_critical" {
  type        = bool
  default     = null
  description = "Disable critical alerting rule for filesystem_inodes detector"
}

variable "filesystem_inodes_disabled_major" {
  type        = bool
  default     = null
  description = "Disable major alerting rule for filesystem_inodes detector"
}

# Critical rule settings: the detector fires 'CRIT' when signal > threshold.
variable "filesystem_inodes_threshold_critical" {
  type        = number
  default     = 95
  description = "Critical threshold for filesystem_inodes detector in %"
}

# null is rendered as lasting=None in the detector program.
variable "filesystem_inodes_lasting_duration_critical" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

variable "filesystem_inodes_at_least_percentage_critical" {
  type        = number
  default     = 1
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
}

# Major rule settings: the detector fires 'MAJOR' when signal > threshold
# and the critical condition is not met.
variable "filesystem_inodes_threshold_major" {
  type        = number
  default     = 90
  description = "Major threshold for filesystem_inodes detector in %"
}

# null is rendered as lasting=None in the detector program.
variable "filesystem_inodes_lasting_duration_major" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

variable "filesystem_inodes_at_least_percentage_major" {
  type        = number
  default     = 1
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
}
# disk_inodes detector

variable "disk_inodes_notifications" {
Expand Down

0 comments on commit 226ebd0

Please sign in to comment.