Skip to content

Commit

Permalink
Add a duration for which conditions must be true before triggering an…
Browse files Browse the repository at this point in the history
… alert on Integration_aws-rds-common (#550)

* Add a duration for which conditions must be true before raising an alert on integration_aws-rds-common

* Keep the duration at None by default, as before, on integration_aws-rds-common, and override it with variables when necessary

* Keep the duration at None by default, as before, on integration_aws-rds-common, and override it with variables when necessary

---------

Co-authored-by: Pierre-Islande MICHEL <[email protected]>
  • Loading branch information
pierreislande and Pierre-Islande MICHEL authored Apr 5, 2024
1 parent cdd61f7 commit da81945
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 6 deletions.
12 changes: 6 additions & 6 deletions modules/integration_aws-rds-common/detectors-rds-common.tf
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ resource "signalfx_detector" "cpu_90_15min" {

program_text = <<-EOF
signal = data('CPUUtilization', filter=filter('namespace', 'AWS/RDS') and filter('stat', 'mean') and filter('DBInstanceIdentifier', '*') and ${module.filtering.signalflow})${var.cpu_90_15min_aggregation_function}${var.cpu_90_15min_transformation_function}.publish('signal')
detect(when(signal > ${var.cpu_90_15min_threshold_critical})).publish('CRIT')
detect(when(signal > ${var.cpu_90_15min_threshold_major}) and (not when(signal > ${var.cpu_90_15min_threshold_critical}))).publish('MAJOR')
detect(when(signal > ${var.cpu_90_15min_threshold_critical}, lasting=%{if var.cpu_90_15min_lasting_duration_critical == null}None%{else}'${var.cpu_90_15min_lasting_duration_critical}'%{endif}, at_least=${var.cpu_90_15min_at_least_percentage_critical})).publish('CRIT')
detect(when(signal > ${var.cpu_90_15min_threshold_major}, lasting=%{if var.cpu_90_15min_lasting_duration_major == null}None%{else}'${var.cpu_90_15min_lasting_duration_major}'%{endif}, at_least=${var.cpu_90_15min_at_least_percentage_major}) and (not when(signal > ${var.cpu_90_15min_threshold_critical}, lasting=%{if var.cpu_90_15min_lasting_duration_critical == null}None%{else}'${var.cpu_90_15min_lasting_duration_critical}'%{endif}, at_least=${var.cpu_90_15min_at_least_percentage_critical}))).publish('MAJOR')
EOF

rule {
Expand Down Expand Up @@ -81,8 +81,8 @@ resource "signalfx_detector" "free_space_low" {
program_text = <<-EOF
free = data('FreeStorageSpace', filter=filter('namespace', 'AWS/RDS') and filter('stat', 'mean') and filter('DBInstanceIdentifier', '*') and ${module.filtering.signalflow})${var.free_space_low_aggregation_function}${var.free_space_low_transformation_function}
signal = free.scale(1/1024**3).publish('signal') # Bytes to Gibibytes
detect(when(signal < ${var.free_space_low_threshold_critical})).publish('CRIT')
detect(when(signal < ${var.free_space_low_threshold_major}) and (not when(signal < ${var.free_space_low_threshold_critical}))).publish('MAJOR')
detect(when(signal < ${var.free_space_low_threshold_critical}, lasting=%{if var.free_space_low_lasting_duration_critical == null}None%{else}'${var.free_space_low_lasting_duration_critical}'%{endif}, at_least=${var.free_space_low_at_least_percentage_critical})).publish('CRIT')
detect(when(signal < ${var.free_space_low_threshold_major}, lasting=%{if var.free_space_low_lasting_duration_major == null}None%{else}'${var.free_space_low_lasting_duration_major}'%{endif}, at_least=${var.free_space_low_at_least_percentage_major}) and (not when(signal < ${var.free_space_low_threshold_critical}, lasting=%{if var.free_space_low_lasting_duration_critical == null}None%{else}'${var.free_space_low_lasting_duration_critical}'%{endif}, at_least=${var.free_space_low_at_least_percentage_critical}))).publish('MAJOR')
EOF

rule {
Expand Down Expand Up @@ -121,8 +121,8 @@ resource "signalfx_detector" "replica_lag" {

program_text = <<-EOF
signal = data('ReplicaLag', filter=filter('namespace', 'AWS/RDS') and filter('stat', 'mean') and filter('DBInstanceIdentifier', '*') and ${module.filtering.signalflow})${var.replica_lag_aggregation_function}${var.replica_lag_transformation_function}.publish('signal')
detect(when(signal > ${var.replica_lag_threshold_critical})).publish('CRIT')
detect(when(signal > ${var.replica_lag_threshold_major}) and (not when(signal > ${var.replica_lag_threshold_critical}))).publish('MAJOR')
detect(when(signal > ${var.replica_lag_threshold_critical}, lasting=%{if var.replica_lag_lasting_duration_critical == null}None%{else}'${var.replica_lag_lasting_duration_critical}'%{endif}, at_least=${var.replica_lag_at_least_percentage_critical})).publish('CRIT')
detect(when(signal > ${var.replica_lag_threshold_major}, lasting=%{if var.replica_lag_lasting_duration_major == null}None%{else}'${var.replica_lag_lasting_duration_major}'%{endif}, at_least=${var.replica_lag_at_least_percentage_major}) and (not when(signal > ${var.replica_lag_threshold_critical}, lasting=%{if var.replica_lag_lasting_duration_critical == null}None%{else}'${var.replica_lag_lasting_duration_critical}'%{endif}, at_least=${var.replica_lag_at_least_percentage_critical}))).publish('MAJOR')
EOF

rule {
Expand Down
71 changes: 71 additions & 0 deletions modules/integration_aws-rds-common/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,30 @@ variable "cpu_90_15min_threshold_major" {
default = 80
}

# `lasting=` argument for the CRIT condition of the cpu_90_15min detector.
# When left at null the detector program emits the literal `None`; any other
# value is emitted as a single-quoted string.
variable "cpu_90_15min_lasting_duration_critical" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

# `at_least=` argument for the CRIT condition of the cpu_90_15min detector.
# The value is interpolated as-is into the detector program, so it is checked
# here against the range documented in the description.
variable "cpu_90_15min_at_least_percentage_critical" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1

  validation {
    # try() turns a null operand error into a plain validation failure.
    condition     = try(var.cpu_90_15min_at_least_percentage_critical >= 0 && var.cpu_90_15min_at_least_percentage_critical <= 1, false)
    error_message = "The cpu_90_15min_at_least_percentage_critical value must be between 0.0 and 1.0 inclusive."
  }
}

# `lasting=` argument for the MAJOR condition of the cpu_90_15min detector.
# When left at null the detector program emits the literal `None`; any other
# value is emitted as a single-quoted string.
variable "cpu_90_15min_lasting_duration_major" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

# `at_least=` argument for the MAJOR condition of the cpu_90_15min detector.
# The value is interpolated as-is into the detector program, so it is checked
# here against the range documented in the description.
variable "cpu_90_15min_at_least_percentage_major" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1

  validation {
    # try() turns a null operand error into a plain validation failure.
    condition     = try(var.cpu_90_15min_at_least_percentage_major >= 0 && var.cpu_90_15min_at_least_percentage_major <= 1, false)
    error_message = "The cpu_90_15min_at_least_percentage_major value must be between 0.0 and 1.0 inclusive."
  }
}

# Free_space_low detector

variable "free_space_low_max_delay" {
Expand Down Expand Up @@ -180,6 +204,30 @@ variable "free_space_low_threshold_major" {
default = 40
}

# `lasting=` argument for the CRIT condition of the free_space_low detector.
# When left at null the detector program emits the literal `None`; any other
# value is emitted as a single-quoted string.
variable "free_space_low_lasting_duration_critical" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

# `at_least=` argument for the CRIT condition of the free_space_low detector.
# The value is interpolated as-is into the detector program, so it is checked
# here against the range documented in the description.
variable "free_space_low_at_least_percentage_critical" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1

  validation {
    # try() turns a null operand error into a plain validation failure.
    condition     = try(var.free_space_low_at_least_percentage_critical >= 0 && var.free_space_low_at_least_percentage_critical <= 1, false)
    error_message = "The free_space_low_at_least_percentage_critical value must be between 0.0 and 1.0 inclusive."
  }
}

# `lasting=` argument for the MAJOR condition of the free_space_low detector.
# When left at null the detector program emits the literal `None`; any other
# value is emitted as a single-quoted string.
variable "free_space_low_lasting_duration_major" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

# `at_least=` argument for the MAJOR condition of the free_space_low detector.
# The value is interpolated as-is into the detector program, so it is checked
# here against the range documented in the description.
variable "free_space_low_at_least_percentage_major" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1

  validation {
    # try() turns a null operand error into a plain validation failure.
    condition     = try(var.free_space_low_at_least_percentage_major >= 0 && var.free_space_low_at_least_percentage_major <= 1, false)
    error_message = "The free_space_low_at_least_percentage_major value must be between 0.0 and 1.0 inclusive."
  }
}

# Replica_lag detector

variable "replica_lag_max_delay" {
Expand Down Expand Up @@ -248,3 +296,26 @@ variable "replica_lag_threshold_major" {
default = 200
}

# `lasting=` argument for the CRIT condition of the replica_lag detector.
# When left at null the detector program emits the literal `None`; any other
# value is emitted as a single-quoted string.
variable "replica_lag_lasting_duration_critical" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

# `at_least=` argument for the CRIT condition of the replica_lag detector.
# The value is interpolated as-is into the detector program, so it is checked
# here against the range documented in the description.
variable "replica_lag_at_least_percentage_critical" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1

  validation {
    # try() turns a null operand error into a plain validation failure.
    condition     = try(var.replica_lag_at_least_percentage_critical >= 0 && var.replica_lag_at_least_percentage_critical <= 1, false)
    error_message = "The replica_lag_at_least_percentage_critical value must be between 0.0 and 1.0 inclusive."
  }
}

# `lasting=` argument for the MAJOR condition of the replica_lag detector.
# When left at null the detector program emits the literal `None`; any other
# value is emitted as a single-quoted string.
variable "replica_lag_lasting_duration_major" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

# `at_least=` argument for the MAJOR condition of the replica_lag detector.
# The value is interpolated as-is into the detector program, so it is checked
# here against the range documented in the description.
variable "replica_lag_at_least_percentage_major" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1

  validation {
    # try() turns a null operand error into a plain validation failure.
    condition     = try(var.replica_lag_at_least_percentage_major >= 0 && var.replica_lag_at_least_percentage_major <= 1, false)
    error_message = "The replica_lag_at_least_percentage_major value must be between 0.0 and 1.0 inclusive."
  }
}

0 comments on commit da81945

Please sign in to comment.