Skip to content

Commit

Permalink
convert processes detector using generator (#354)
Browse files Browse the repository at this point in the history
* add disabled at rule level for generator

* convert processes detector using generator

* rebase on generator_disabled_rule
  • Loading branch information
xp-1000 authored Nov 19, 2021
1 parent 8c01943 commit 6dbd2b8
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 23 deletions.
2 changes: 1 addition & 1 deletion docs/severity.md
Original file line number Diff line number Diff line change
Expand Up @@ -965,7 +965,7 @@

|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Processes aliveness|X|X|-|-|-|
|Processes aliveness count|X|X|-|-|-|


## rabbitmq-node
Expand Down
4 changes: 2 additions & 2 deletions modules/smart-agent_processes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ Note the following parameters:

These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all
[modules](../) in this repository. Other variables, specific to this module, are available in
[variables.tf](variables.tf).
[variables-gen.tf](variables-gen.tf).
In general, the default configuration "works" but all of these Terraform
[variables](https://www.terraform.io/docs/configuration/variables.html) make it possible to
customize the detectors' behavior to better fit your needs.
Expand All @@ -76,7 +76,7 @@ This module creates the following SignalFx detectors which could contain one or

|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|Processes aliveness|X|X|-|-|-|
|Processes aliveness count|X|X|-|-|-|

## How to collect required metrics?

Expand Down
16 changes: 16 additions & 0 deletions modules/smart-agent_processes/conf/01-count.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Generator input describing the processes "aliveness count" detector.
# NOTE(review): nesting is not visible in this view (indentation was lost);
# the comments below assume the usual signals -> signal -> metric and
# rules -> <severity> -> {threshold, comparator, ...} hierarchy - confirm.
module: processes
name: "aliveness count"
id: processes
# Same value as the default of var.processes_transformation_function.
transformation: ".max(over='15m')"
signals:
signal:
# Metric read by data('ps_count.processes', ...) in the detector program text.
metric: ps_count.processes
rules:
critical:
# Matches the default of var.processes_threshold_critical (signal < 1).
threshold: 1
comparator: "<"
major:
# Matches the default of var.processes_threshold_major (signal < 2).
threshold: 2
comparator: "<"
# In the detector program text the MAJOR detect also requires that the
# critical condition does not hold.
dependency: critical
# Presumably applies to the major rule only (var.processes_disabled_major
# defaults to true) - confirm against the generator.
disabled: true
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
resource "signalfx_detector" "processes" {
name = format("%s %s", local.detector_name_prefix, "Processes aliveness")
name = format("%s %s", local.detector_name_prefix, "Processes aliveness count")

authorized_writer_teams = var.authorized_writer_teams
teams = try(coalescelist(var.teams, var.authorized_writer_teams), null)
tags = compact(concat(local.common_tags, local.tags, var.extra_tags))

program_text = <<-EOF
signal = data('ps_count.processes', filter=${module.filtering.signalflow})${var.processes_aggregation_function}${var.processes_transformation_function}.publish('signal')
detect(when(signal < 1)).publish('CRIT')
detect(when(signal < ${var.processes_threshold_major}) and when (signal >= 1)).publish('MAJOR')
signal = data('ps_count.processes', filter=${module.filtering.signalflow})${var.processes_aggregation_function}${var.processes_transformation_function}.publish('signal')
detect(when(signal < ${var.processes_threshold_critical}, lasting=%{if var.processes_lasting_duration_critical == null}None%{else}'${var.processes_lasting_duration_critical}'%{endif}, at_least=${var.processes_at_least_percentage_critical})).publish('CRIT')
detect(when(signal < ${var.processes_threshold_major}, lasting=%{if var.processes_lasting_duration_major == null}None%{else}'${var.processes_lasting_duration_major}'%{endif}, at_least=${var.processes_at_least_percentage_major}) and (not when(signal < ${var.processes_threshold_critical}, lasting=%{if var.processes_lasting_duration_critical == null}None%{else}'${var.processes_lasting_duration_critical}'%{endif}, at_least=${var.processes_at_least_percentage_critical}))).publish('MAJOR')
EOF

rule {
description = "count is too low < 1"
description = "is too low < ${var.processes_threshold_critical}"
severity = "Critical"
detect_label = "CRIT"
disabled = coalesce(var.processes_disabled_critical, var.processes_disabled, var.detectors_disabled)
Expand All @@ -24,7 +24,7 @@ EOF
}

rule {
description = "count is too low < ${var.processes_threshold_major}"
description = "is too low < ${var.processes_threshold_major}"
severity = "Major"
detect_label = "MAJOR"
disabled = coalesce(var.processes_disabled_major, var.processes_disabled, var.detectors_disabled)
Expand All @@ -35,3 +35,4 @@ EOF
parameterized_body = var.message_body == "" ? local.rule_body : var.message_body
}
}

Original file line number Diff line number Diff line change
@@ -1,7 +1,23 @@
# Module specific

# processes detector

# Per-detector override of notification recipients: a map whose values are
# lists of recipient strings, keyed by severity per the description.
# The empty-map default supplies no override; the consumer of this variable is
# not visible in this hunk - presumably the rule `notifications` attribute.
variable "processes_notifications" {
description = "Notification recipients list per severity overridden for processes detector"
type = map(list(string))
default = {}
}

# SignalFlow snippet interpolated verbatim right after data('ps_count.processes', ...)
# in the detector program text, before the transformation function.
# The empty-string default adds nothing to the stream expression.
variable "processes_aggregation_function" {
description = "Aggregation function and group by for processes detector (i.e. \".mean(by=['host'])\")"
type = string
default = ""
}

# SignalFlow snippet interpolated verbatim after the aggregation function and
# before .publish('signal') in the detector program text.
# The default mirrors the `transformation` value in conf/01-count.yaml.
variable "processes_transformation_function" {
description = "Transformation function for processes detector (i.e. \".mean(over='5m')\")"
type = string
default = ".max(over='15m')"
}

variable "processes_tip" {
description = "Suggested first course of action or any note useful for incident handling"
type = string
Expand Down Expand Up @@ -32,27 +48,37 @@ variable "processes_disabled_major" {
default = true
}

variable "processes_notifications" {
description = "Notification recipients list per severity overridden for processes detector"
type = map(list(string))
default = {}
# Value compared with `signal <` in the CRIT detect() condition; it is also
# interpolated into the Critical rule description and reused in the negated
# critical condition of the MAJOR detect().
# The default mirrors rules.critical.threshold in conf/01-count.yaml.
variable "processes_threshold_critical" {
description = "Critical threshold for processes detector"
type = number
default = 1
}

variable "processes_aggregation_function" {
description = "Aggregation function and group by for processes detector (i.e. \".mean(by=['host'])\")"
variable "processes_lasting_duration_critical" {
description = "Minimum duration that conditions must be true before raising alert"
type = string
default = ""
default = null
}

variable "processes_transformation_function" {
description = "Transformation function for processes detector (i.e. \".mean(over='5m')\")"
type = string
default = ".max(over='15m')"
# Passed as the `at_least=` argument of when() in the CRIT detect() condition,
# and in the negated critical condition of the MAJOR detect().
# The description gives the expected range as 0.0 to 1.0; no validation block
# enforces it here.
variable "processes_at_least_percentage_critical" {
description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
type = number
default = 1
}

# Value compared with `signal <` in the MAJOR detect() condition; it is also
# interpolated into the Major rule description.
# The default mirrors rules.major.threshold in conf/01-count.yaml.
variable "processes_threshold_major" {
description = "Major threshold for processes detector"
type = number
default = 2
}

# Rendered as the `lasting=` argument of when() in the MAJOR detect() condition:
# null (the default) is emitted as the literal None, any other value is emitted
# as a single-quoted string.
variable "processes_lasting_duration_major" {
description = "Minimum duration that conditions must be true before raising alert"
type = string
default = null
}

# Passed as the `at_least=` argument of when() in the MAJOR detect() condition.
# The description gives the expected range as 0.0 to 1.0; no validation block
# enforces it here.
variable "processes_at_least_percentage_major" {
description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
type = number
default = 1
}

0 comments on commit 6dbd2b8

Please sign in to comment.