Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Improve various AWS detectors #579

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
10 changes: 5 additions & 5 deletions docs/severity.md
Original file line number Diff line number Diff line change
Expand Up @@ -269,11 +269,11 @@
|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|AWS EFS used space|X|X|-|-|-|
|AWS EFS percent of io limit|-|X|X|-|-|
|AWS EFS percent of read throughput|-|-|X|X|-|
|AWS EFS percent of write throughput|-|-|X|X|-|
|AWS EFS percent of permitted throughput|-|X|X|-|-|
|AWS EFS burst credit balance|-|X|-|-|-|
|AWS EFS percent of io limit|X|X|-|-|-|
|AWS EFS percent of read throughput|X|X|-|-|-|
|AWS EFS percent of write throughput|X|X|-|-|-|
|AWS EFS percent of permitted throughput|X|X|-|-|-|
|AWS EFS burst credit balance|X|-|-|-|-|


## integration_aws-elasticache-common
Expand Down
1 change: 1 addition & 0 deletions modules/integration_aws-alb/conf/00-heartbeat.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ name: heartbeat
transformation: false
aggregation: ".mean(by=['LoadBalancer'])"
filtering: "filter('namespace', 'AWS/ApplicationELB')"
condition: "var.heartbeat_detector_enabled"

signals:
signal:
Expand Down
1 change: 1 addition & 0 deletions modules/integration_aws-alb/conf/01-latency.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ transformation: true
aggregation: true
filtering: "filter('namespace', 'AWS/ApplicationELB')"
value_unit: "Second"
condition: "var.latency_detector_enabled"

signals:
signal:
Expand Down
1 change: 1 addition & 0 deletions modules/integration_aws-alb/conf/02-lb-5xx.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ transformation: true
aggregation: true
filtering: "filter('namespace', 'AWS/ApplicationELB') and filter('stat', 'sum') and (not filter('AvailabilityZone', '*'))"
value_unit: "%"
condition: "var.lb_5xx_detector_enabled"

signals:
errors:
Expand Down
2 changes: 1 addition & 1 deletion modules/integration_aws-alb/conf/03-lb-4xx.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ transformation: true
aggregation: true
filtering: "filter('namespace', 'AWS/ApplicationELB') and filter('stat', 'sum') and (not filter('AvailabilityZone', '*'))"
value_unit: "%"
condition: "var.lb_4xx_detector_enabled"

signals:
errors:
Expand All @@ -25,7 +26,6 @@ rules:
lasting_duration: 15m
lasting_at_least: 0.9
append_condition: and when(requests > ${var.minimum_traffic})
disabled: true
major:
threshold: 95
comparator: ">"
Expand Down
1 change: 1 addition & 0 deletions modules/integration_aws-alb/conf/04-target-5xx.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ transformation: true
aggregation: true
filtering: "filter('namespace', 'AWS/ApplicationELB') and filter('stat', 'sum') and filter('TargetGroup', '*') and (not filter('AvailabilityZone', '*'))"
value_unit: "%"
condition: "var.target_5xx_detector_enabled"

signals:
errors:
Expand Down
2 changes: 1 addition & 1 deletion modules/integration_aws-alb/conf/05-target-4xx.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ transformation: true
aggregation: true
filtering: "filter('namespace', 'AWS/ApplicationELB') and filter('stat', 'sum') and filter('TargetGroup', '*') and (not filter('AvailabilityZone', '*'))"
value_unit: "%"
condition: "var.target_4xx_detector_enabled"

signals:
errors:
Expand All @@ -25,7 +26,6 @@ rules:
lasting_duration: 15m
lasting_at_least: 0.9
append_condition: and when(requests > ${var.minimum_traffic})
disabled: true
major:
threshold: 95
comparator: ">"
Expand Down
1 change: 1 addition & 0 deletions modules/integration_aws-alb/conf/06-healthy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ transformation: true
aggregation: true
filtering: "filter('namespace', 'AWS/ApplicationELB') and (not filter('AvailabilityZone', '*'))"
value_unit: "%"
condition: "var.healthy_detector_enabled"

signals:
healthy:
Expand Down
14 changes: 14 additions & 0 deletions modules/integration_aws-alb/detectors-gen.tf
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
resource "signalfx_detector" "heartbeat" {
count = (var.heartbeat_detector_enabled) ? 1 : 0

name = format("%s %s", local.detector_name_prefix, "AWS ALB heartbeat")

authorized_writer_teams = var.authorized_writer_teams
Expand Down Expand Up @@ -28,6 +30,8 @@ EOF
}

resource "signalfx_detector" "latency" {
count = (var.latency_detector_enabled) ? 1 : 0

name = format("%s %s", local.detector_name_prefix, "AWS ALB target response time")

authorized_writer_teams = var.authorized_writer_teams
Expand Down Expand Up @@ -74,6 +78,8 @@ EOF
}

resource "signalfx_detector" "alb_5xx" {
count = (var.lb_5xx_detector_enabled) ? 1 : 0

name = format("%s %s", local.detector_name_prefix, "AWS ALB 5xx error rate")

authorized_writer_teams = var.authorized_writer_teams
Expand Down Expand Up @@ -122,6 +128,8 @@ EOF
}

resource "signalfx_detector" "alb_4xx" {
count = (var.lb_4xx_detector_enabled) ? 1 : 0

name = format("%s %s", local.detector_name_prefix, "AWS ALB 4xx error rate")

authorized_writer_teams = var.authorized_writer_teams
Expand Down Expand Up @@ -183,6 +191,8 @@ EOF
}

resource "signalfx_detector" "target_5xx" {
count = (var.target_5xx_detector_enabled) ? 1 : 0

name = format("%s %s", local.detector_name_prefix, "AWS ALB target 5xx error rate")

authorized_writer_teams = var.authorized_writer_teams
Expand Down Expand Up @@ -231,6 +241,8 @@ EOF
}

resource "signalfx_detector" "target_4xx" {
count = (var.target_4xx_detector_enabled) ? 1 : 0

name = format("%s %s", local.detector_name_prefix, "AWS ALB target 4xx error rate")

authorized_writer_teams = var.authorized_writer_teams
Expand Down Expand Up @@ -292,6 +304,8 @@ EOF
}

resource "signalfx_detector" "healthy" {
count = (var.healthy_detector_enabled) ? 1 : 0

name = format("%s %s", local.detector_name_prefix, "AWS ALB healthy instances percentage")

authorized_writer_teams = var.authorized_writer_teams
Expand Down
4 changes: 2 additions & 2 deletions modules/integration_aws-alb/variables-gen.tf
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ variable "alb_4xx_disabled" {
variable "alb_4xx_disabled_critical" {
description = "Disable critical alerting rule for alb_4xx detector"
type = bool
default = true
default = null
}

variable "alb_4xx_disabled_major" {
Expand Down Expand Up @@ -472,7 +472,7 @@ variable "target_4xx_disabled" {
variable "target_4xx_disabled_critical" {
description = "Disable critical alerting rule for target_4xx detector"
type = bool
default = true
default = null
}

variable "target_4xx_disabled_major" {
Expand Down
42 changes: 42 additions & 0 deletions modules/integration_aws-alb/variables.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,45 @@
# Toggle for the "AWS ALB heartbeat" detector: signalfx_detector.heartbeat
# uses this flag for its count (1 when true, 0 when false).
variable "heartbeat_detector_enabled" {
  type        = bool
  default     = true
  description = "Enable heartbeat detector"
}

# Toggle for the "AWS ALB target response time" detector:
# signalfx_detector.latency uses this flag for its count (1 when true, 0 when false).
variable "latency_detector_enabled" {
  type        = bool
  default     = true
  description = "Enable latency detector"
}

# Toggle for the "AWS ALB 5xx error rate" detector. Note the resource it
# gates is named signalfx_detector.alb_5xx (count is 1 when true, 0 when false).
variable "lb_5xx_detector_enabled" {
  type        = bool
  default     = true
  description = "Enable lb 5xx detector"
}

# Toggle for the "AWS ALB 4xx error rate" detector. Note the resource it
# gates is named signalfx_detector.alb_4xx (count is 1 when true, 0 when false).
variable "lb_4xx_detector_enabled" {
  type        = bool
  default     = true
  description = "Enable lb 4xx detector"
}

# Toggle for the "AWS ALB target 5xx error rate" detector:
# signalfx_detector.target_5xx uses this flag for its count (1 when true, 0 when false).
variable "target_5xx_detector_enabled" {
  type        = bool
  default     = true
  description = "Enable target 5xx detector"
}

# Toggle for the "AWS ALB target 4xx error rate" detector:
# signalfx_detector.target_4xx uses this flag for its count (1 when true, 0 when false).
variable "target_4xx_detector_enabled" {
  type        = bool
  default     = true
  description = "Enable target 4xx detector"
}

# Toggle for the "AWS ALB healthy instances percentage" detector:
# signalfx_detector.healthy uses this flag for its count (1 when true, 0 when false).
variable "healthy_detector_enabled" {
  type        = bool
  default     = true
  description = "Enable healthy detector"
}

# Module specific

variable "minimum_traffic" {
Expand Down
22 changes: 9 additions & 13 deletions modules/integration_aws-efs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,10 @@ existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/G
module "signalfx-detectors-integration-aws-efs" {
source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_aws-efs?ref={revision}"

environment = var.environment
notifications = local.notifications
used_space_threshold_major = 42
write_throughput_threshold_minor = 42
read_throughput_threshold_minor = 42
read_throughput_threshold_warning = 42
write_throughput_threshold_warning = 42
used_space_threshold_critical = 42
environment = var.environment
notifications = local.notifications
used_space_threshold_major = 42
used_space_threshold_critical = 42
}
```

Expand Down Expand Up @@ -87,11 +83,11 @@ This module creates the following SignalFx detectors which could contain one or
|Detector|Critical|Major|Minor|Warning|Info|
|---|---|---|---|---|---|
|AWS EFS used space|X|X|-|-|-|
|AWS EFS percent of io limit|-|X|X|-|-|
|AWS EFS percent of read throughput|-|-|X|X|-|
|AWS EFS percent of write throughput|-|-|X|X|-|
|AWS EFS percent of permitted throughput|-|X|X|-|-|
|AWS EFS burst credit balance|-|X|-|-|-|
|AWS EFS percent of io limit|X|X|-|-|-|
|AWS EFS percent of read throughput|X|X|-|-|-|
|AWS EFS percent of write throughput|X|X|-|-|-|
|AWS EFS percent of permitted throughput|X|X|-|-|-|
|AWS EFS burst credit balance|X|-|-|-|-|

## How to collect required metrics?

Expand Down
4 changes: 3 additions & 1 deletion modules/integration_aws-efs/conf/01-used-space.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module: "AWS EFS"
name: "Used Space"
filtering: "filter('namespace', 'AWS/EFS')"
value_unit: "Gibibyte"
transformation: ".max(over='15m')"
condition: "var.used_space_detector_enabled"
signals:
used_space:
metric: "StorageBytes"
Expand All @@ -13,6 +13,8 @@ signals:
rules:
critical:
comparator: ">"
lasting_duration: "15m"
major:
comparator: ">"
lasting_duration: "15m"
dependency: critical
10 changes: 6 additions & 4 deletions modules/integration_aws-efs/conf/02-io-limit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@ name: "Percent of IO Limit"
id: "io_limit"
filtering: "filter('namespace', 'AWS/EFS')"
value_unit: "%"
transformation: ".mean(over='30m')"
condition: "var.io_limit_detector_enabled"
tip: "If you reach too often the limit with current General Purpose mode, consider moving your application to a file system using the Max I/O performance mode."
signals:
signal:
metric: "PercentIOLimit"
filter: "filter('stat', 'mean')"
rules:
major:
critical:
comparator: ">"
threshold: 90
minor:
lasting_duration: "30m"
major:
comparator: ">"
dependency: major
dependency: critical
threshold: 80
lasting_duration: "30m"
13 changes: 8 additions & 5 deletions modules/integration_aws-efs/conf/03-throughput-read.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@ name: "Percent of read throughput"
id: "read_throughput"
filtering: "filter('namespace', 'AWS/EFS')"
value_unit: "%"
transformation: ".max(over='15m')"
disabled: true
condition: "var.read_throughput_detector_enabled"
signals:
read:
metric: "DataReadIOBytes"
Expand All @@ -16,8 +15,12 @@ signals:
formula:
(read/total).scale(100)
rules:
minor:
critical:
comparator: ">"
warning:
threshold: 90
lasting_duration: "15m"
major:
lasting_duration: "15m"
comparator: ">"
dependency: minor
threshold: 80
dependency: critical
13 changes: 8 additions & 5 deletions modules/integration_aws-efs/conf/04-throughput-write.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@ name: "Percent of write throughput"
id: "write_throughput"
filtering: "filter('namespace', 'AWS/EFS')"
value_unit: "%"
transformation: ".max(over='15m')"
disabled: true
condition: "var.write_throughput_detector_enabled"
signals:
write:
metric: "DataWriteIOBytes"
Expand All @@ -16,8 +15,12 @@ signals:
formula:
(write/total).scale(100)
rules:
minor:
critical:
comparator: ">"
warning:
threshold: 90
lasting_duration: "15m"
major:
comparator: ">"
dependency: minor
threshold: 80
dependency: critical
lasting_duration: "15m"
10 changes: 6 additions & 4 deletions modules/integration_aws-efs/conf/05-permitted-throughput.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module: "AWS EFS"
name: "Percent of permitted throughput"
filtering: "filter('namespace', 'AWS/EFS')"
value_unit: "%"
transformation: ".mean(over='30m')"
condition: "var.permitted_throughput_detector_enabled"
tip: "You are consuming the entire amount of throughput allocated to your file system, In this situation, you might consider changing the file system's throughput mode to Provisioned Throughput to get higher throughput."
signals:
metered:
Expand All @@ -14,10 +14,12 @@ signals:
signal:
formula: (metered/permitted.scale(60)).scale(100)
rules:
major:
critical:
comparator: ">"
threshold: 90
minor:
lasting_duration: "30m"
major:
comparator: ">"
dependency: major
dependency: critical
threshold: 80
lasting_duration: "30m"
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@ module: "AWS EFS"
name: "Burst Credit Balance"
filtering: "filter('namespace', 'AWS/EFS')"
value_unit: "credits"
transformation: ".mean(over='5m')"
condition: "var.burst_credit_balance_detector_enabled"
tip: "See https://docs.aws.amazon.com/efs/latest/ug/performance.html#bursting"
signals:
signal:
metric: "BurstCreditBalance"
filter: "filter('stat', 'lower')"
rules:
major:
critical:
comparator: "<"
threshold: 1
lasting_duration: "5m"
Loading
Loading