From 1d85e70362512492e129004857f299d726b37bc6 Mon Sep 17 00:00:00 2001 From: ndo77 Date: Thu, 12 Sep 2024 09:37:50 +0200 Subject: [PATCH 1/6] Integration gcp cloud run (#565) * add detector cloud run * fix conf * fix conf module cloud run * fix probe error 5xx * update probe 5xx * update read me * fix typo * fix name of probes * update conf detectors cloud run * remove error 5xx on cloud run * remove probe connection sql and update config probe cpu and memory * fix probe * fix probe memory and add heartbeat probe * add probe count container --- docs/severity.md | 10 + modules/integration_gcp-cloud-run/README.md | 229 +++++++++++++++++ .../common-locals.tf | 1 + .../common-modules.tf | 1 + .../common-variables.tf | 1 + .../common-versions.tf | 1 + .../conf/00-containers.yaml | 15 ++ .../conf/01-cpu_utilizations.yaml | 19 ++ .../conf/02-memory_utilizations.yaml | 19 ++ .../conf/readme.yaml | 118 +++++++++ .../detectors-gen.tf | 117 +++++++++ modules/integration_gcp-cloud-run/filters.tf | 3 + modules/integration_gcp-cloud-run/outputs.tf | 15 ++ modules/integration_gcp-cloud-run/tags.tf | 4 + .../variables-gen.tf | 241 ++++++++++++++++++ .../integration_gcp-cloud-run/variables.tf | 4 + 16 files changed, 798 insertions(+) create mode 100644 modules/integration_gcp-cloud-run/README.md create mode 120000 modules/integration_gcp-cloud-run/common-locals.tf create mode 120000 modules/integration_gcp-cloud-run/common-modules.tf create mode 120000 modules/integration_gcp-cloud-run/common-variables.tf create mode 120000 modules/integration_gcp-cloud-run/common-versions.tf create mode 100644 modules/integration_gcp-cloud-run/conf/00-containers.yaml create mode 100644 modules/integration_gcp-cloud-run/conf/01-cpu_utilizations.yaml create mode 100644 modules/integration_gcp-cloud-run/conf/02-memory_utilizations.yaml create mode 100644 modules/integration_gcp-cloud-run/conf/readme.yaml create mode 100644 modules/integration_gcp-cloud-run/detectors-gen.tf create mode 100644 
modules/integration_gcp-cloud-run/filters.tf create mode 100644 modules/integration_gcp-cloud-run/outputs.tf create mode 100644 modules/integration_gcp-cloud-run/tags.tf create mode 100644 modules/integration_gcp-cloud-run/variables-gen.tf create mode 100644 modules/integration_gcp-cloud-run/variables.tf diff --git a/docs/severity.md b/docs/severity.md index 22752ae50..657b18cec 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -66,6 +66,7 @@ - [integration_azure-virtual-machine-scaleset](#integration_azure-virtual-machine-scaleset) - [integration_azure-virtual-machine](#integration_azure-virtual-machine) - [integration_gcp-bigquery](#integration_gcp-bigquery) +- [integration_gcp-cloud-run](#integration_gcp-cloud-run) - [integration_gcp-cloud-sql-common](#integration_gcp-cloud-sql-common) - [integration_gcp-cloud-sql-failover](#integration_gcp-cloud-sql-failover) - [integration_gcp-cloud-sql-mysql](#integration_gcp-cloud-sql-mysql) @@ -737,6 +738,15 @@ |GCP BigQuery uploaded bytes billed|X|X|-|-|-| +## integration_gcp-cloud-run + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|GCP Cloud Run container count|X|-|-|-|-| +|GCP Cloud Run cpu utilizations|X|X|-|-|-| +|GCP Cloud Run memory utilizations|X|X|-|-|-| + + ## integration_gcp-cloud-sql-common |Detector|Critical|Major|Minor|Warning|Info| diff --git a/modules/integration_gcp-cloud-run/README.md b/modules/integration_gcp-cloud-run/README.md new file mode 100644 index 000000000..0077314a6 --- /dev/null +++ b/modules/integration_gcp-cloud-run/README.md @@ -0,0 +1,229 @@ +# GCP-CLOUD-RUN SignalFx detectors + + + +:link: **Contents** + +- [How to use this module?](#how-to-use-this-module) +- [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module) +- [How to collect required metrics?](#how-to-collect-required-metrics) + - [Metrics](#metrics) +- [Notes](#notes) + - [Metadata configuration for default 
filtering](#metadata-configuration-for-default-filtering) + - [CPU utilizations](#cpu-utilizations) + - [Memory utilizations](#memory-utilizations) +- [Related documentation](#related-documentation) + + + +## How to use this module? + +This directory defines a [Terraform](https://www.terraform.io/) +[module](https://www.terraform.io/language/modules/syntax) you can use in your +existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#stack) by adding a +`module` configuration and setting its `source` parameter to URL of this folder: + +```hcl +module "signalfx-detectors-integration-gcp-cloud-run" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_gcp-cloud-run?ref={revision}" + + environment = var.environment + notifications = local.notifications + gcp_project_id = "fillme" +} +``` + +Note the following parameters: + +* `source`: Use this parameter to specify the URL of the module. The double slash (`//`) is intentional and required. + Terraform uses it to specify subfolders within a Git repo (see [module + sources](https://www.terraform.io/language/modules/sources)). The `ref` parameter specifies a specific Git tag in + this repository. It is recommended to use the latest "pinned" version in place of `{revision}`. Avoid using a branch + like `master` except for testing purpose. Note that every modules in this repository are available on the Terraform + [registry](https://registry.terraform.io/modules/claranet/detectors/signalfx) and we recommend using it as source + instead of `git` which is more flexible but less future-proof. + +* `environment`: Use this parameter to specify the + [environment](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#environment) used by this + instance of the module. + Its value will be added to the `prefixes` list at the start of the [detector + name](https://github.com/claranet/terraform-signalfx-detectors/wiki/Templating#example). 
+ In general, it will also be used in the `filtering` internal sub-module to [apply + filters](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#filtering) based on our default + [tagging convention](https://github.com/claranet/terraform-signalfx-detectors/wiki/Tagging-convention) by default. + +* `notifications`: Use this parameter to define where alerts should be sent depending on their severity. It consists + of a Terraform [object](https://www.terraform.io/language/expressions/type-constraints#object) where each key represents an available + [detector rule severity](https://docs.splunk.com/observability/alerts-detectors-notifications/create-detectors-for-alerts.html#severity) + and its value is a list of recipients. Every recipients must respect the [detector notification + format](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector#notification-format). + Check the [notification binding](https://github.com/claranet/terraform-signalfx-detectors/wiki/Notifications-binding) + documentation to understand the recommended role of each severity. + +These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all +[modules](../) in this repository. Other variables, specific to this module, are available in +[variables.tf](variables.tf) and [variables-gen.tf](variables-gen.tf). +In general, the default configuration "works" but all of these Terraform +[variables](https://www.terraform.io/language/values/variables) make it possible to +customize the detectors behavior to better fit your needs. + +Most of them represent usual tips and rules detailed in the +[guidance](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance) documentation and listed in the +common [variables](https://github.com/claranet/terraform-signalfx-detectors/wiki/Variables) dedicated documentation. 
+ +Feel free to explore the [wiki](https://github.com/claranet/terraform-signalfx-detectors/wiki) for more information about +general usage of this repository. + +## What are the available detectors in this module? + +This module creates the following SignalFx detectors which could contain one or multiple alerting rules: + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|GCP Cloud Run container count|X|-|-|-|-| +|GCP Cloud Run cpu utilizations|X|X|-|-|-| +|GCP Cloud Run memory utilizations|X|X|-|-|-| + +## How to collect required metrics? + +This module deploys detectors using metrics reported by the +[GCP integration](https://docs.splunk.com/observability/en/gdi/get-data-in/connect/gcp/gcp-metrics.html) configurable +with [this Terraform module](https://github.com/claranet/terraform-signalfx-integrations/tree/master/cloud/gcp). + + +Check the [Related documentation](#related-documentation) section for more detailed and specific information about this module dependencies. + + + +### Metrics + + +Here is the list of required metrics for detectors in this module. 
+ +* `container/containers` +* `container/cpu/utilizations` +* `container/memory/utilizations` + + +## Notes + + +### Metadata configuration for default filtering + +label to use : + +sfx_env = true +sfx_monitored = true + +For example: + +via gcloud, at the Cloud Run level: +``` +gcloud run deploy hello \ +--image=us-docker.pkg.dev/cloudrun/container/hello \ +--allow-unauthenticated \ +--port=8080 \ +--service-account=123456789-compute@developer.gserviceaccount.com \ +--region=europe-west9 \ +--project=claranet-425413 \ +--labels=sfx_env=true,sfx_monitored=true +``` +via terraform, [at the Cloud Run level](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloud_run_service#nested_metadata) +```hcl +resource "google_cloud_run_service" "hello" { + name = "hello" + location = "europe-west9" + + template { + spec { + containers { + image = "us-docker.pkg.dev/cloudrun/container/hello" + resources { + limits = { + cpu = "1000m" // adjust based on your needs + memory = "512Mi" // adjust based on your needs + } + } + ports { + name = "http1" // This name is a standard identifier (http1 or h2c) for the protocol + container_port = 8080 + } + } + service_account_name = "123456789-compute@developer.gserviceaccount.com" + } + + metadata { + annotations = { + "run.googleapis.com/launch-stage" = "BETA" // adjust this according to the launch stage of your application + } + labels = { + sfx_env = "true" + sfx_monitored = "true" + } + } + } + autogenerate_revision_name = true + + traffic { + percent = 100 + latest_revision = true + } + + project = "claranet-425413" +} +``` +You also **need** to check if those metadata are in the metadata `includeList` in your [SignalFx GCP +integration](https://dev.splunk.com/observability/docs/integrations/gcp_integration_overview/#Optional-fields). + +### CPU utilizations + +Monitoring the CPU utilization helps in understanding the system's capability and efficiency. 
+ +```hcl +module "signalfx-detectors-integration_gcp-cloud-run" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_gcp-cloud-run" + + environment = var.environment + gcp_project_id = var.project_id + notifications = local.notifications + + # We keep default filtering policy here, we just want to append additional filter to it + filtering_append = true + # We define the additional filter + filtering_custom = "filter('service_name', '*service-name*')" + # We can configure the thresholds of the probes + cpu_usage_threshold_critical = 85 + cpu_usage_threshold_major = 80 +} +``` + +### Memory utilizations + +Accurate tracking of memory usage aids in optimizing and improving performance. + +```hcl +module "signalfx-detectors-integration_gcp-cloud-run" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_gcp-cloud-run" + + environment = var.environment + gcp_project_id = var.project_id + notifications = local.notifications + + # We keep default filtering policy here, we just want to append additional filter to it + filtering_append = true + # We define the additional filter + filtering_custom = "filter('service_name', '*service-name*')" + # We can configure the thresholds of the probes + memory_usage_threshold_critical = 85 + memory_usage_threshold_major = 80 +} +``` + + +## Related documentation + +* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) +* [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector) +* [Splunk Observability integrations](https://docs.splunk.com/Observability/gdi/get-data-in/integrations.html) +* [Stackdriver metrics for Cloud Run](https://cloud.google.com/monitoring/api/metrics_gcp#gcp-run) +* [Splunk Observability metrics](https://docs.splunk.com/observability/en/gdi/get-data-in/connect/gcp/gcp.html) diff --git 
a/modules/integration_gcp-cloud-run/common-locals.tf b/modules/integration_gcp-cloud-run/common-locals.tf new file mode 120000 index 000000000..5672d21ab --- /dev/null +++ b/modules/integration_gcp-cloud-run/common-locals.tf @@ -0,0 +1 @@ +../../common/module/locals.tf \ No newline at end of file diff --git a/modules/integration_gcp-cloud-run/common-modules.tf b/modules/integration_gcp-cloud-run/common-modules.tf new file mode 120000 index 000000000..8c81ef377 --- /dev/null +++ b/modules/integration_gcp-cloud-run/common-modules.tf @@ -0,0 +1 @@ +../../common/module/modules.tf \ No newline at end of file diff --git a/modules/integration_gcp-cloud-run/common-variables.tf b/modules/integration_gcp-cloud-run/common-variables.tf new file mode 120000 index 000000000..f3037a584 --- /dev/null +++ b/modules/integration_gcp-cloud-run/common-variables.tf @@ -0,0 +1 @@ +../../common/module/variables.tf \ No newline at end of file diff --git a/modules/integration_gcp-cloud-run/common-versions.tf b/modules/integration_gcp-cloud-run/common-versions.tf new file mode 120000 index 000000000..fa7f5509f --- /dev/null +++ b/modules/integration_gcp-cloud-run/common-versions.tf @@ -0,0 +1 @@ +../../common/module/versions.tf \ No newline at end of file diff --git a/modules/integration_gcp-cloud-run/conf/00-containers.yaml b/modules/integration_gcp-cloud-run/conf/00-containers.yaml new file mode 100644 index 000000000..358765d98 --- /dev/null +++ b/modules/integration_gcp-cloud-run/conf/00-containers.yaml @@ -0,0 +1,15 @@ +module: "GCP Cloud Run" +name: "Container count" + +transformation: true +aggregation: true + +signals: + signal: + metric: "container/containers" + +rules: + critical: + threshold: 0 + comparator: "==" + diff --git a/modules/integration_gcp-cloud-run/conf/01-cpu_utilizations.yaml b/modules/integration_gcp-cloud-run/conf/01-cpu_utilizations.yaml new file mode 100644 index 000000000..45d339d66 --- /dev/null +++ 
b/modules/integration_gcp-cloud-run/conf/01-cpu_utilizations.yaml @@ -0,0 +1,19 @@ +module: "GCP Cloud Run" +name: "CPU utilizations" + +value_unit: "%" +transformation: ".min(over='30m')" + +signals: + signal: + metric: "container/cpu/utilizations" + +rules: + critical: + threshold: 90 + comparator: ">" + + major: + threshold: 85 + comparator: ">" + dependency: "critical" diff --git a/modules/integration_gcp-cloud-run/conf/02-memory_utilizations.yaml b/modules/integration_gcp-cloud-run/conf/02-memory_utilizations.yaml new file mode 100644 index 000000000..702a16086 --- /dev/null +++ b/modules/integration_gcp-cloud-run/conf/02-memory_utilizations.yaml @@ -0,0 +1,19 @@ +module: "GCP Cloud Run" +name: "Memory utilizations" + +value_unit: "%" +transformation: ".min(over='30m')" + +signals: + signal: + metric: "container/memory/utilizations" + +rules: + critical: + threshold: 95 + comparator: ">" + + major: + threshold: 90 + comparator: ">" + dependency: "critical" diff --git a/modules/integration_gcp-cloud-run/conf/readme.yaml b/modules/integration_gcp-cloud-run/conf/readme.yaml new file mode 100644 index 000000000..eba5fabdb --- /dev/null +++ b/modules/integration_gcp-cloud-run/conf/readme.yaml @@ -0,0 +1,118 @@ +documentations: + - name: Stackdriver metrics for Cloud Run + url: https://cloud.google.com/monitoring/api/metrics_gcp#gcp-run + - name: Splunk Observability metrics + url: https://docs.splunk.com/observability/en/gdi/get-data-in/connect/gcp/gcp.html +notes: | + + ### Metadata configuration for default filtering + + label to use : + + sfx_env = true + sfx_monitored = true + + For example: + + via gcloud, at the Cloud Run level: + ``` + gcloud run deploy hello \ + --image=us-docker.pkg.dev/cloudrun/container/hello \ + --allow-unauthenticated \ + --port=8080 \ + --service-account=123456789-compute@developer.gserviceaccount.com \ + --region=europe-west9 \ + --project=claranet-425413 \ + --labels=sfx_env=true,sfx_monitored=true + ``` + via terraform, 
[at the Cloud Run level](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/cloud_run_service#nested_metadata) + ```hcl + resource "google_cloud_run_service" "hello" { + name = "hello" + location = "europe-west9" + + template { + spec { + containers { + image = "us-docker.pkg.dev/cloudrun/container/hello" + resources { + limits = { + cpu = "1000m" // adjust based on your needs + memory = "512Mi" // adjust based on your needs + } + } + ports { + name = "http1" // This name is a standard identifier (http1 or h2c) for the protocol + container_port = 8080 + } + } + service_account_name = "123456789-compute@developer.gserviceaccount.com" + } + + metadata { + annotations = { + "run.googleapis.com/launch-stage" = "BETA" // adjust this according to the launch stage of your application + } + labels = { + sfx_env = "true" + sfx_monitored = "true" + } + } + } + autogenerate_revision_name = true + + traffic { + percent = 100 + latest_revision = true + } + + project = "claranet-425413" + } + ``` + You also **need** to check if those metadata are in the metadata `includeList` in your [SignalFx GCP + integration](https://dev.splunk.com/observability/docs/integrations/gcp_integration_overview/#Optional-fields). + + ### CPU utilizations + + Monitoring the CPU utilization helps in understanding the system's capability and efficiency. 
+ + ```hcl + module "signalfx-detectors-integration_gcp-cloud-run" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_gcp-cloud-run" + + environment = var.environment + gcp_project_id = var.project_id + notifications = local.notifications + + # We keep default filtering policy here, we just want to append additional filter to it + filtering_append = true + # We define the additional filter + filtering_custom = "filter('service_name', '*service-name*')" + # We can configure the thresholds of the probes + cpu_usage_threshold_critical = 85 + cpu_usage_threshold_major = 80 + } + ``` + + ### Memory utilizations + + Accurate tracking of memory usage aids in optimizing and improving performance. + + ```hcl + module "signalfx-detectors-integration_gcp-cloud-run" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_gcp-cloud-run" + + environment = var.environment + gcp_project_id = var.project_id + notifications = local.notifications + + # We keep default filtering policy here, we just want to append additional filter to it + filtering_append = true + # We define the additional filter + filtering_custom = "filter('service_name', '*service-name*')" + # We can configure the thresholds of the probes + memory_usage_threshold_critical = 85 + memory_usage_threshold_major = 80 + } + ``` + diff --git a/modules/integration_gcp-cloud-run/detectors-gen.tf b/modules/integration_gcp-cloud-run/detectors-gen.tf new file mode 100644 index 000000000..fcb996be9 --- /dev/null +++ b/modules/integration_gcp-cloud-run/detectors-gen.tf @@ -0,0 +1,117 @@ +resource "signalfx_detector" "container_count" { + name = format("%s %s", local.detector_name_prefix, "GCP Cloud Run container count") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = 
data('container/containers', filter=${module.filtering.signalflow})${var.container_count_aggregation_function}${var.container_count_transformation_function}.publish('signal') + detect(when(signal == ${var.container_count_threshold_critical}%{if var.container_count_lasting_duration_critical != null}, lasting='${var.container_count_lasting_duration_critical}', at_least=${var.container_count_at_least_percentage_critical}%{endif})).publish('CRIT') +EOF + + rule { + description = "is == ${var.container_count_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.container_count_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.container_count_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.container_count_runbook_url, var.runbook_url), "") + tip = var.container_count_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.container_count_max_delay +} + +resource "signalfx_detector" "cpu_utilizations" { + name = format("%s %s", local.detector_name_prefix, "GCP Cloud Run cpu utilizations") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + viz_options { + label = "signal" + value_suffix = "%" + } + + program_text = <<-EOF + signal = data('container/cpu/utilizations', filter=${module.filtering.signalflow})${var.cpu_utilizations_aggregation_function}${var.cpu_utilizations_transformation_function}.publish('signal') + detect(when(signal > ${var.cpu_utilizations_threshold_critical}%{if var.cpu_utilizations_lasting_duration_critical != null}, lasting='${var.cpu_utilizations_lasting_duration_critical}', at_least=${var.cpu_utilizations_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.cpu_utilizations_threshold_major}%{if var.cpu_utilizations_lasting_duration_major != null}, lasting='${var.cpu_utilizations_lasting_duration_major}', at_least=${var.cpu_utilizations_at_least_percentage_major}%{endif}) and (not when(signal > ${var.cpu_utilizations_threshold_critical}%{if var.cpu_utilizations_lasting_duration_critical != null}, lasting='${var.cpu_utilizations_lasting_duration_critical}', at_least=${var.cpu_utilizations_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.cpu_utilizations_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.cpu_utilizations_disabled_critical, var.cpu_utilizations_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cpu_utilizations_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.cpu_utilizations_runbook_url, var.runbook_url), "") + tip = 
var.cpu_utilizations_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.cpu_utilizations_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.cpu_utilizations_disabled_major, var.cpu_utilizations_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cpu_utilizations_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.cpu_utilizations_runbook_url, var.runbook_url), "") + tip = var.cpu_utilizations_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.cpu_utilizations_max_delay +} + +resource "signalfx_detector" "memory_utilizations" { + name = format("%s %s", local.detector_name_prefix, "GCP Cloud Run memory utilizations") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + viz_options { + label = "signal" + value_suffix = "%" + } + + program_text = <<-EOF + signal = data('container/memory/utilizations', filter=${module.filtering.signalflow})${var.memory_utilizations_aggregation_function}${var.memory_utilizations_transformation_function}.publish('signal') + detect(when(signal > ${var.memory_utilizations_threshold_critical}%{if var.memory_utilizations_lasting_duration_critical != null}, lasting='${var.memory_utilizations_lasting_duration_critical}', at_least=${var.memory_utilizations_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.memory_utilizations_threshold_major}%{if var.memory_utilizations_lasting_duration_major != null}, 
lasting='${var.memory_utilizations_lasting_duration_major}', at_least=${var.memory_utilizations_at_least_percentage_major}%{endif}) and (not when(signal > ${var.memory_utilizations_threshold_critical}%{if var.memory_utilizations_lasting_duration_critical != null}, lasting='${var.memory_utilizations_lasting_duration_critical}', at_least=${var.memory_utilizations_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.memory_utilizations_threshold_critical}%" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.memory_utilizations_disabled_critical, var.memory_utilizations_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.memory_utilizations_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.memory_utilizations_runbook_url, var.runbook_url), "") + tip = var.memory_utilizations_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.memory_utilizations_threshold_major}%" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.memory_utilizations_disabled_major, var.memory_utilizations_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.memory_utilizations_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.memory_utilizations_runbook_url, var.runbook_url), "") + tip = var.memory_utilizations_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.memory_utilizations_max_delay +} + diff --git a/modules/integration_gcp-cloud-run/filters.tf b/modules/integration_gcp-cloud-run/filters.tf new file mode 100644 index 000000000..f396ead7e --- /dev/null +++ b/modules/integration_gcp-cloud-run/filters.tf @@ -0,0 +1,3 @@ +locals { + filters = "filter('project_id', '${var.gcp_project_id}')" +} diff --git a/modules/integration_gcp-cloud-run/outputs.tf b/modules/integration_gcp-cloud-run/outputs.tf new file mode 100644 index 000000000..d749c95b5 --- /dev/null +++ b/modules/integration_gcp-cloud-run/outputs.tf @@ -0,0 +1,15 @@ +output "container_count" { + description = "Detector resource for container_count" + value = signalfx_detector.container_count +} + +output "cpu_utilizations" { + description = "Detector resource for cpu_utilizations" + value = signalfx_detector.cpu_utilizations +} + +output "memory_utilizations" { + description = "Detector resource for memory_utilizations" + value = signalfx_detector.memory_utilizations +} + diff --git a/modules/integration_gcp-cloud-run/tags.tf b/modules/integration_gcp-cloud-run/tags.tf new file mode 100644 index 000000000..e5af37772 --- /dev/null +++ b/modules/integration_gcp-cloud-run/tags.tf @@ -0,0 +1,4 @@ +locals { + tags = ["integration", "gcp-cloud-run"] +} + diff --git a/modules/integration_gcp-cloud-run/variables-gen.tf b/modules/integration_gcp-cloud-run/variables-gen.tf new file mode 100644 index 000000000..ced3c21a7 --- /dev/null +++ b/modules/integration_gcp-cloud-run/variables-gen.tf @@ -0,0 +1,241 @@ +# container_count detector + +variable "container_count_notifications" { + description = "Notification recipients list per severity overridden for container_count detector" + type = map(list(string)) + default = {} +} + +variable "container_count_aggregation_function" { + description = "Aggregation function and group by for container_count detector (i.e. 
\".mean(by=['host'])\")" + type = string + default = "" +} + +variable "container_count_transformation_function" { + description = "Transformation function for container_count detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} + +variable "container_count_max_delay" { + description = "Enforce max delay for container_count detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "container_count_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "container_count_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "container_count_disabled" { + description = "Disable all alerting rules for container_count detector" + type = bool + default = null +} + +variable "container_count_threshold_critical" { + description = "Critical threshold for container_count detector" + type = number + default = 0 +} + +variable "container_count_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "container_count_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# cpu_utilizations detector + +variable "cpu_utilizations_notifications" { + description = "Notification recipients list per severity overridden for cpu_utilizations detector" + type = map(list(string)) + default = {} +} + +variable "cpu_utilizations_aggregation_function" { + description = "Aggregation function and group by for cpu_utilizations detector (i.e. 
\".mean(by=['host'])\")" + type = string + default = "" +} + +variable "cpu_utilizations_transformation_function" { + description = "Transformation function for cpu_utilizations detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='30m')" +} + +variable "cpu_utilizations_max_delay" { + description = "Enforce max delay for cpu_utilizations detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "cpu_utilizations_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "cpu_utilizations_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "cpu_utilizations_disabled" { + description = "Disable all alerting rules for cpu_utilizations detector" + type = bool + default = null +} + +variable "cpu_utilizations_disabled_critical" { + description = "Disable critical alerting rule for cpu_utilizations detector" + type = bool + default = null +} + +variable "cpu_utilizations_disabled_major" { + description = "Disable major alerting rule for cpu_utilizations detector" + type = bool + default = null +} + +variable "cpu_utilizations_threshold_critical" { + description = "Critical threshold for cpu_utilizations detector in %" + type = number + default = 90 +} + +variable "cpu_utilizations_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cpu_utilizations_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "cpu_utilizations_threshold_major" { + description = "Major threshold for cpu_utilizations detector in %" + type = number + default = 85 +} + +variable 
"cpu_utilizations_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cpu_utilizations_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# memory_utilizations detector + +variable "memory_utilizations_notifications" { + description = "Notification recipients list per severity overridden for memory_utilizations detector" + type = map(list(string)) + default = {} +} + +variable "memory_utilizations_aggregation_function" { + description = "Aggregation function and group by for memory_utilizations detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "memory_utilizations_transformation_function" { + description = "Transformation function for memory_utilizations detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='30m')" +} + +variable "memory_utilizations_max_delay" { + description = "Enforce max delay for memory_utilizations detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "memory_utilizations_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "memory_utilizations_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "memory_utilizations_disabled" { + description = "Disable all alerting rules for memory_utilizations detector" + type = bool + default = null +} + +variable "memory_utilizations_disabled_critical" { + description = "Disable critical alerting rule for memory_utilizations detector" + type = bool + default = null +} + +variable "memory_utilizations_disabled_major" { + description = "Disable major alerting rule for 
memory_utilizations detector" + type = bool + default = null } + +variable "memory_utilizations_threshold_critical" { + description = "Critical threshold for memory_utilizations detector in %" + type = number + default = 95 +} + +variable "memory_utilizations_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "memory_utilizations_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "memory_utilizations_threshold_major" { + description = "Major threshold for memory_utilizations detector in %" + type = number + default = 90 +} + +variable "memory_utilizations_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "memory_utilizations_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} diff --git a/modules/integration_gcp-cloud-run/variables.tf b/modules/integration_gcp-cloud-run/variables.tf new file mode 100644 index 000000000..901d3ad46 --- /dev/null +++ b/modules/integration_gcp-cloud-run/variables.tf @@ -0,0 +1,4 @@ +variable "gcp_project_id" { + description = "GCP project id used for default filtering while labels are not synced" + type = string +} From 283b65576195cfbfcfb8caa399a015243e808fab Mon Sep 17 00:00:00 2001 From: Florent DELAHAYE Date: Thu, 12 Sep 2024 09:39:18 +0200 Subject: [PATCH 2/6] fix: remove useless transformations (#567) Co-authored-by: Florent DELAHAYE Co-authored-by: Jean-Baptiste Simillon --- .../conf/01-jvm-memory-pressure.yaml | 3 ++- .../conf/05-cluster-status.yaml | 3 ++- .../conf/08-cluster-cpu.yaml | 3 ++- .../conf/09-master-cpu.yaml | 3 ++- .../variables-gen.tf | 24 
+++++++++---------- 5 files changed, 20 insertions(+), 16 deletions(-) diff --git a/modules/integration_aws-elasticsearch/conf/01-jvm-memory-pressure.yaml b/modules/integration_aws-elasticsearch/conf/01-jvm-memory-pressure.yaml index a409edd83..b09233081 100644 --- a/modules/integration_aws-elasticsearch/conf/01-jvm-memory-pressure.yaml +++ b/modules/integration_aws-elasticsearch/conf/01-jvm-memory-pressure.yaml @@ -1,7 +1,6 @@ module: AWS Elasticsearch name: JVM Memory Pressure -transformation: ".min(over='15m')" aggregation: true filtering: "filter('namespace', 'AWS/ES') and filter('stat', 'upper') and filter('NodeId', '*')" value_unit: "%" @@ -15,7 +14,9 @@ rules: critical: threshold: 90 comparator: ">" + lasting_duration: "15m" major: threshold: 80 comparator: ">" dependency: critical + lasting_duration: "15m" diff --git a/modules/integration_aws-elasticsearch/conf/05-cluster-status.yaml b/modules/integration_aws-elasticsearch/conf/05-cluster-status.yaml index 4e6f7329c..8d3b3960f 100644 --- a/modules/integration_aws-elasticsearch/conf/05-cluster-status.yaml +++ b/modules/integration_aws-elasticsearch/conf/05-cluster-status.yaml @@ -1,7 +1,6 @@ module: AWS Elasticsearch name: Cluster status -aggregation: ".min(over='15m')" filtering: "filter('namespace', 'AWS/ES') and filter('stat', 'upper')" signals: @@ -18,8 +17,10 @@ rules: comparator: ">=" description: "is red" signal: red + lasting_duration: "15m" major: threshold: 1 comparator: ">=" description: "is yellow" signal: yellow + lasting_duration: "15m" diff --git a/modules/integration_aws-elasticsearch/conf/08-cluster-cpu.yaml b/modules/integration_aws-elasticsearch/conf/08-cluster-cpu.yaml index 9249b2cb5..b1800b639 100644 --- a/modules/integration_aws-elasticsearch/conf/08-cluster-cpu.yaml +++ b/modules/integration_aws-elasticsearch/conf/08-cluster-cpu.yaml @@ -2,7 +2,6 @@ module: AWS Elasticsearch id: "cluster_cpu" name: "CPU utilization" -transformation: ".min(over='45m')" aggregation: "" filtering: 
"filter('namespace', 'AWS/ES') and filter('stat', 'upper') and filter('NodeId', '*')" @@ -20,6 +19,8 @@ rules: threshold: 80 comparator: ">" dependency: critical + lasting_duration: "45m" critical: threshold: 90 comparator: ">" + lasting_duration: "45m" diff --git a/modules/integration_aws-elasticsearch/conf/09-master-cpu.yaml b/modules/integration_aws-elasticsearch/conf/09-master-cpu.yaml index 04385c8d1..ac054bc27 100644 --- a/modules/integration_aws-elasticsearch/conf/09-master-cpu.yaml +++ b/modules/integration_aws-elasticsearch/conf/09-master-cpu.yaml @@ -2,7 +2,6 @@ module: AWS Elasticsearch id: "master_cpu" name: "Master CPU utilization" -transformation: ".min(over='20m')" aggregation: "" filtering: "filter('namespace', 'AWS/ES') and filter('stat', 'upper') and filter('NodeId', '*')" @@ -15,6 +14,8 @@ rules: threshold: 60 comparator: ">" dependency: critical + lasting_duration: "20m" critical: threshold: 70 comparator: ">" + lasting_duration: "20m" diff --git a/modules/integration_aws-elasticsearch/variables-gen.tf b/modules/integration_aws-elasticsearch/variables-gen.tf index b9b5939f0..e5f406bd3 100644 --- a/modules/integration_aws-elasticsearch/variables-gen.tf +++ b/modules/integration_aws-elasticsearch/variables-gen.tf @@ -59,7 +59,7 @@ variable "jvm_memory_pressure_aggregation_function" { variable "jvm_memory_pressure_transformation_function" { description = "Transformation function for jvm_memory_pressure detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".min(over='15m')" + default = "" } variable "jvm_memory_pressure_max_delay" { @@ -109,7 +109,7 @@ variable "jvm_memory_pressure_threshold_critical" { variable "jvm_memory_pressure_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "jvm_memory_pressure_at_least_percentage_critical" { @@ -126,7 +126,7 @@ variable "jvm_memory_pressure_threshold_major" { variable "jvm_memory_pressure_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "jvm_memory_pressure_at_least_percentage_major" { @@ -467,7 +467,7 @@ variable "cluster_status_notifications" { variable "cluster_status_aggregation_function" { description = "Aggregation function and group by for cluster_status detector (i.e. \".mean(by=['host'])\")" type = string - default = ".min(over='15m')" + default = "" } variable "cluster_status_transformation_function" { @@ -521,7 +521,7 @@ variable "cluster_status_threshold_critical" { variable "cluster_status_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "cluster_status_at_least_percentage_critical" { @@ -538,7 +538,7 @@ variable "cluster_status_threshold_major" { variable "cluster_status_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "15m" } variable "cluster_status_at_least_percentage_major" { @@ -725,7 +725,7 @@ variable "cluster_cpu_notifications" { variable "cluster_cpu_transformation_function" { description = "Transformation function for cluster_cpu detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".min(over='45m')" + default = "" } variable "cluster_cpu_max_delay" { @@ -773,7 +773,7 @@ variable "cluster_cpu_threshold_major" { variable "cluster_cpu_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "45m" } variable "cluster_cpu_at_least_percentage_major" { @@ -790,7 +790,7 @@ variable "cluster_cpu_threshold_critical" { variable "cluster_cpu_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "45m" } variable "cluster_cpu_at_least_percentage_critical" { @@ -809,7 +809,7 @@ variable "master_cpu_notifications" { variable "master_cpu_transformation_function" { description = "Transformation function for master_cpu detector (i.e. \".mean(over='5m')\")" type = string - default = ".min(over='20m')" + default = "" } variable "master_cpu_max_delay" { @@ -857,7 +857,7 @@ variable "master_cpu_threshold_major" { variable "master_cpu_lasting_duration_major" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "20m" } variable "master_cpu_at_least_percentage_major" { @@ -874,7 +874,7 @@ variable "master_cpu_threshold_critical" { variable "master_cpu_lasting_duration_critical" { description = "Minimum duration that conditions must be true before raising alert" type = string - default = null + default = "20m" } variable "master_cpu_at_least_percentage_critical" { From cefdd293ae8d6ea489c69256e5e75a3299aa6d7d Mon Sep 17 00:00:00 2001 From: tchernomax Date: Thu, 12 Sep 2024 09:41:07 +0200 Subject: [PATCH 3/6] =?UTF-8?q?redis.client.blocked=20=E2=86=92=20redis.cl?= =?UTF-8?q?ients.blocked=20(#564)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/redisreceiver/metadata.yaml#L153 https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/redisreceiver/metadata.yaml#L130 Co-authored-by: Jean-Baptiste Simillon --- modules/smart-agent_redis/README.md | 4 ++-- modules/smart-agent_redis/conf/03-blocked-clients.yaml | 4 ++-- modules/smart-agent_redis/detectors-gen.tf | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/smart-agent_redis/README.md b/modules/smart-agent_redis/README.md index b71f9ec63..01baa38a2 100644 --- a/modules/smart-agent_redis/README.md +++ b/modules/smart-agent_redis/README.md @@ -140,8 +140,8 @@ parameter to the corresponding monitor configuration: - '!bytes.maxmemory' - '!bytes.total_system_memory' - '!bytes.used_memory' - - '!${var.use_otel_receiver ? "redis.client.blocked" : "gauge.blocked_clients"}' - - '!${var.use_otel_receiver ? "redis.client.connected" : "gauge.connected_clients"}' + - '!${var.use_otel_receiver ? "redis.clients.blocked" : "gauge.blocked_clients"}' + - '!${var.use_otel_receiver ? "redis.clients.connected" : "gauge.connected_clients"}' - '!${var.use_otel_receiver ? "redis.connections.rejected" : "counter.rejected_connections"}' - '!${var.use_otel_receiver ? "redis.db.keys" : "gauge.db0_keys"}' - '!${var.use_otel_receiver ? "redis.keys.evicted" : "counter.evicted_keys"}' diff --git a/modules/smart-agent_redis/conf/03-blocked-clients.yaml b/modules/smart-agent_redis/conf/03-blocked-clients.yaml index ed4b711c8..e7da685d3 100644 --- a/modules/smart-agent_redis/conf/03-blocked-clients.yaml +++ b/modules/smart-agent_redis/conf/03-blocked-clients.yaml @@ -5,9 +5,9 @@ value_unit: "%" signals: A: - metric: '${var.use_otel_receiver ? "redis.client.blocked" : "gauge.blocked_clients"}' + metric: '${var.use_otel_receiver ? "redis.clients.blocked" : "gauge.blocked_clients"}' B: - metric: '${var.use_otel_receiver ? 
"redis.client.connected" : "gauge.connected_clients"}' + metric: '${var.use_otel_receiver ? "redis.clients.connected" : "gauge.connected_clients"}' signal: formula: (A/B).scale(100) diff --git a/modules/smart-agent_redis/detectors-gen.tf b/modules/smart-agent_redis/detectors-gen.tf index 80c2012bb..51c2d80c4 100644 --- a/modules/smart-agent_redis/detectors-gen.tf +++ b/modules/smart-agent_redis/detectors-gen.tf @@ -121,8 +121,8 @@ resource "signalfx_detector" "blocked_over_connected_clients_ratio" { } program_text = <<-EOF - A = data('${var.use_otel_receiver ? "redis.client.blocked" : "gauge.blocked_clients"}', filter=${module.filtering.signalflow})${var.blocked_over_connected_clients_ratio_aggregation_function}${var.blocked_over_connected_clients_ratio_transformation_function} - B = data('${var.use_otel_receiver ? "redis.client.connected" : "gauge.connected_clients"}', filter=${module.filtering.signalflow})${var.blocked_over_connected_clients_ratio_aggregation_function}${var.blocked_over_connected_clients_ratio_transformation_function} + A = data('${var.use_otel_receiver ? "redis.clients.blocked" : "gauge.blocked_clients"}', filter=${module.filtering.signalflow})${var.blocked_over_connected_clients_ratio_aggregation_function}${var.blocked_over_connected_clients_ratio_transformation_function} + B = data('${var.use_otel_receiver ? 
"redis.clients.connected" : "gauge.connected_clients"}', filter=${module.filtering.signalflow})${var.blocked_over_connected_clients_ratio_aggregation_function}${var.blocked_over_connected_clients_ratio_transformation_function} signal = (A/B).scale(100).publish('signal') detect(when(signal > ${var.blocked_over_connected_clients_ratio_threshold_critical}%{if var.blocked_over_connected_clients_ratio_lasting_duration_critical != null}, lasting='${var.blocked_over_connected_clients_ratio_lasting_duration_critical}', at_least=${var.blocked_over_connected_clients_ratio_at_least_percentage_critical}%{endif})).publish('CRIT') detect(when(signal > ${var.blocked_over_connected_clients_ratio_threshold_major}%{if var.blocked_over_connected_clients_ratio_lasting_duration_major != null}, lasting='${var.blocked_over_connected_clients_ratio_lasting_duration_major}', at_least=${var.blocked_over_connected_clients_ratio_at_least_percentage_major}%{endif}) and (not when(signal > ${var.blocked_over_connected_clients_ratio_threshold_critical}%{if var.blocked_over_connected_clients_ratio_lasting_duration_critical != null}, lasting='${var.blocked_over_connected_clients_ratio_lasting_duration_critical}', at_least=${var.blocked_over_connected_clients_ratio_at_least_percentage_critical}%{endif}))).publish('MAJOR') From 199412e28a3d62cdc07d9cb2d5e98fdcbf976285 Mon Sep 17 00:00:00 2001 From: HugLee <70705221+hugueslepesant@users.noreply.github.com> Date: Thu, 12 Sep 2024 09:46:46 +0200 Subject: [PATCH 4/6] feat: prometheus-exporter_postfix (#563) Co-authored-by: Jean-Baptiste Simillon --- docs/severity.md | 12 + modules/prometheus-exporter_postfix/README.md | 119 +++++ .../common-filters.tf | 1 + .../common-locals.tf | 1 + .../common-modules.tf | 1 + .../common-variables.tf | 1 + .../common-versions.tf | 1 + .../conf/00-heartbeat.yaml | 13 + .../conf/01-queue_deferred.yaml | 21 + .../conf/02-queue_hold.yaml | 21 + .../conf/03-queue_maildrop.yaml | 21 + .../conf/04-mail_delivery_delay.yaml | 21 
+ .../conf/readme.yaml | 3 + .../detectors-gen.tf | 192 ++++++++ .../prometheus-exporter_postfix/outputs.tf | 25 ++ modules/prometheus-exporter_postfix/tags.tf | 4 + .../variables-gen.tf | 410 ++++++++++++++++++ 17 files changed, 867 insertions(+) create mode 100644 modules/prometheus-exporter_postfix/README.md create mode 120000 modules/prometheus-exporter_postfix/common-filters.tf create mode 120000 modules/prometheus-exporter_postfix/common-locals.tf create mode 120000 modules/prometheus-exporter_postfix/common-modules.tf create mode 120000 modules/prometheus-exporter_postfix/common-variables.tf create mode 120000 modules/prometheus-exporter_postfix/common-versions.tf create mode 100644 modules/prometheus-exporter_postfix/conf/00-heartbeat.yaml create mode 100644 modules/prometheus-exporter_postfix/conf/01-queue_deferred.yaml create mode 100644 modules/prometheus-exporter_postfix/conf/02-queue_hold.yaml create mode 100644 modules/prometheus-exporter_postfix/conf/03-queue_maildrop.yaml create mode 100644 modules/prometheus-exporter_postfix/conf/04-mail_delivery_delay.yaml create mode 100644 modules/prometheus-exporter_postfix/conf/readme.yaml create mode 100644 modules/prometheus-exporter_postfix/detectors-gen.tf create mode 100644 modules/prometheus-exporter_postfix/outputs.tf create mode 100644 modules/prometheus-exporter_postfix/tags.tf create mode 100644 modules/prometheus-exporter_postfix/variables-gen.tf diff --git a/docs/severity.md b/docs/severity.md index 657b18cec..df8b7dab7 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -82,6 +82,7 @@ - [prometheus-exporter_docker-state](#prometheus-exporter_docker-state) - [prometheus-exporter_kong](#prometheus-exporter_kong) - [prometheus-exporter_oracledb](#prometheus-exporter_oracledb) +- [prometheus-exporter_postfix](#prometheus-exporter_postfix) - [prometheus-exporter_squid](#prometheus-exporter_squid) - [prometheus-exporter_varnish](#prometheus-exporter_varnish) - 
[prometheus-exporter_wallix-bastion](#prometheus-exporter_wallix-bastion) @@ -899,6 +900,17 @@ |Oracle database status|X|-|-|-|-| +## prometheus-exporter_postfix + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Postfix heartbeat|X|-|-|-|-| +|Postfix size postfix queue deferred|X|X|-|-|-| +|Postfix size postfix queue hold|X|X|-|-|-| +|Postfix size postfix queue maildrop|X|X|-|-|-| +|Postfix size postfix delivery delay|X|X|-|-|-| + + ## prometheus-exporter_squid |Detector|Critical|Major|Minor|Warning|Info| diff --git a/modules/prometheus-exporter_postfix/README.md b/modules/prometheus-exporter_postfix/README.md new file mode 100644 index 000000000..bf4ed14d4 --- /dev/null +++ b/modules/prometheus-exporter_postfix/README.md @@ -0,0 +1,119 @@ +# POSTFIX SignalFx detectors + + + +:link: **Contents** + +- [How to use this module?](#how-to-use-this-module) +- [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module) +- [How to collect required metrics?](#how-to-collect-required-metrics) + - [Metrics](#metrics) +- [Related documentation](#related-documentation) + + + +## How to use this module? + +This directory defines a [Terraform](https://www.terraform.io/) +[module](https://www.terraform.io/language/modules/syntax) you can use in your +existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#stack) by adding a +`module` configuration and setting its `source` parameter to URL of this folder: + +```hcl +module "signalfx-detectors-prometheus-exporter-postfix" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/prometheus-exporter_postfix?ref={revision}" + + environment = var.environment + notifications = local.notifications +} +``` + +Note the following parameters: + +* `source`: Use this parameter to specify the URL of the module. The double slash (`//`) is intentional and required. 
+ Terraform uses it to specify subfolders within a Git repo (see [module + sources](https://www.terraform.io/language/modules/sources)). The `ref` parameter specifies a specific Git tag in + this repository. It is recommended to use the latest "pinned" version in place of `{revision}`. Avoid using a branch + like `master` except for testing purpose. Note that every modules in this repository are available on the Terraform + [registry](https://registry.terraform.io/modules/claranet/detectors/signalfx) and we recommend using it as source + instead of `git` which is more flexible but less future-proof. + +* `environment`: Use this parameter to specify the + [environment](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#environment) used by this + instance of the module. + Its value will be added to the `prefixes` list at the start of the [detector + name](https://github.com/claranet/terraform-signalfx-detectors/wiki/Templating#example). + In general, it will also be used in the `filtering` internal sub-module to [apply + filters](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#filtering) based on our default + [tagging convention](https://github.com/claranet/terraform-signalfx-detectors/wiki/Tagging-convention) by default. + +* `notifications`: Use this parameter to define where alerts should be sent depending on their severity. It consists + of a Terraform [object](https://www.terraform.io/language/expressions/type-constraints#object) where each key represents an available + [detector rule severity](https://docs.splunk.com/observability/alerts-detectors-notifications/create-detectors-for-alerts.html#severity) + and its value is a list of recipients. Every recipients must respect the [detector notification + format](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector#notification-format). 
+ Check the [notification binding](https://github.com/claranet/terraform-signalfx-detectors/wiki/Notifications-binding) + documentation to understand the recommended role of each severity. + +These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all +[modules](../) in this repository. Other variables, specific to this module, are available in +[variables-gen.tf](variables-gen.tf). +In general, the default configuration "works" but all of these Terraform +[variables](https://www.terraform.io/language/values/variables) make it possible to +customize the detectors behavior to better fit your needs. + +Most of them represent usual tips and rules detailed in the +[guidance](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance) documentation and listed in the +common [variables](https://github.com/claranet/terraform-signalfx-detectors/wiki/Variables) dedicated documentation. + +Feel free to explore the [wiki](https://github.com/claranet/terraform-signalfx-detectors/wiki) for more information about +general usage of this repository. + +## What are the available detectors in this module? + +This module creates the following SignalFx detectors which could contain one or multiple alerting rules: + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Postfix heartbeat|X|-|-|-|-| +|Postfix size postfix queue deferred|X|X|-|-|-| +|Postfix size postfix queue hold|X|X|-|-|-| +|Postfix size postfix queue maildrop|X|X|-|-|-| +|Postfix size postfix delivery delay|X|X|-|-|-| + +## How to collect required metrics? + +This module deploys detectors using metrics reported by the +scraping of a server following the [OpenMetrics convention](https://openmetrics.io/) based on and compatible with [the Prometheus +exposition format](https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#openmetrics-text-format). 
+ +They are generally called `Prometheus Exporters` which can be fetched by both the [SignalFx Smart Agent](https://github.com/signalfx/signalfx-agent) +thanks to its [prometheus exporter monitor](https://github.com/signalfx/signalfx-agent/blob/main/docs/monitors/prometheus-exporter.md) and the +[OpenTelemetry Collector](https://github.com/signalfx/splunk-otel-collector) using its [prometheus +receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver) or its derivatives. + +These exporters could be embedded directly in the tool you want to monitor (e.g. nginx ingress) or must be installed next to it as +a separate program configured to connect, create metrics and expose them as server. + + +Check the [Related documentation](#related-documentation) section for more detailed and specific information about this module dependencies. + + + +### Metrics + + +Here is the list of required metrics for detectors in this module. + +* `postfix_showq_message_size_bytes_count` +* `postfix_smtp_delivery_delay_seconds_count` +* `postfix_up` + + + + +## Related documentation + +* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) +* [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector) +* [Splunk Observability integrations](https://docs.splunk.com/Observability/gdi/get-data-in/integrations.html) diff --git a/modules/prometheus-exporter_postfix/common-filters.tf b/modules/prometheus-exporter_postfix/common-filters.tf new file mode 120000 index 000000000..51ac61525 --- /dev/null +++ b/modules/prometheus-exporter_postfix/common-filters.tf @@ -0,0 +1 @@ +../../common/module/filters-prometheus-exporter.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/common-locals.tf b/modules/prometheus-exporter_postfix/common-locals.tf new file mode 120000 index 000000000..5672d21ab --- 
/dev/null +++ b/modules/prometheus-exporter_postfix/common-locals.tf @@ -0,0 +1 @@ +../../common/module/locals.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/common-modules.tf b/modules/prometheus-exporter_postfix/common-modules.tf new file mode 120000 index 000000000..8c81ef377 --- /dev/null +++ b/modules/prometheus-exporter_postfix/common-modules.tf @@ -0,0 +1 @@ +../../common/module/modules.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/common-variables.tf b/modules/prometheus-exporter_postfix/common-variables.tf new file mode 120000 index 000000000..f3037a584 --- /dev/null +++ b/modules/prometheus-exporter_postfix/common-variables.tf @@ -0,0 +1 @@ +../../common/module/variables.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/common-versions.tf b/modules/prometheus-exporter_postfix/common-versions.tf new file mode 120000 index 000000000..fa7f5509f --- /dev/null +++ b/modules/prometheus-exporter_postfix/common-versions.tf @@ -0,0 +1 @@ +../../common/module/versions.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_postfix/conf/00-heartbeat.yaml b/modules/prometheus-exporter_postfix/conf/00-heartbeat.yaml new file mode 100644 index 000000000..79a6c49e5 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/00-heartbeat.yaml @@ -0,0 +1,13 @@ +## Example +module: postfix +name: heartbeat + +transformation: false +aggregation: true +exclude_not_running_vm: true + +signals: + signal: + metric: "postfix_up" +rules: + critical: diff --git a/modules/prometheus-exporter_postfix/conf/01-queue_deferred.yaml b/modules/prometheus-exporter_postfix/conf/01-queue_deferred.yaml new file mode 100644 index 000000000..6627879f8 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/01-queue_deferred.yaml @@ -0,0 +1,21 @@ +module: postfix +name: "Size Postfix Queue Deferred" +id: "postfix_showq_message_size_bytes_count_deferred" + +transformation: 
".min(over='30m')" +aggregation: true +filtering: "filter('queue', 'deferred')" + +signals: + signal: + metric: "postfix_showq_message_size_bytes_count" + +rules: + critical: + threshold: 600 + comparator: ">" + + major: + threshold: 300 + comparator: ">" + dependency: "critical" diff --git a/modules/prometheus-exporter_postfix/conf/02-queue_hold.yaml b/modules/prometheus-exporter_postfix/conf/02-queue_hold.yaml new file mode 100644 index 000000000..99052ccf1 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/02-queue_hold.yaml @@ -0,0 +1,21 @@ +module: postfix +name: "Size Postfix Queue Hold" +id: "postfix_showq_message_size_bytes_count_hold" + +transformation: ".min(over='30m')" +aggregation: true +filtering: "filter('queue', 'hold')" + +signals: + signal: + metric: "postfix_showq_message_size_bytes_count" + +rules: + critical: + threshold: 600 + comparator: ">" + + major: + threshold: 300 + comparator: ">" + dependency: "critical" diff --git a/modules/prometheus-exporter_postfix/conf/03-queue_maildrop.yaml b/modules/prometheus-exporter_postfix/conf/03-queue_maildrop.yaml new file mode 100644 index 000000000..1fe7c10a4 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/03-queue_maildrop.yaml @@ -0,0 +1,21 @@ +module: postfix +name: "Size Postfix Queue Maildrop" +id: "postfix_showq_message_size_bytes_count_maildrop" + +transformation: ".min(over='30m')" +aggregation: true +filtering: "filter('queue', 'maildrop')" + +signals: + signal: + metric: "postfix_showq_message_size_bytes_count" + +rules: + critical: + threshold: 600 + comparator: ">" + + major: + threshold: 300 + comparator: ">" + dependency: "critical" diff --git a/modules/prometheus-exporter_postfix/conf/04-mail_delivery_delay.yaml b/modules/prometheus-exporter_postfix/conf/04-mail_delivery_delay.yaml new file mode 100644 index 000000000..0a1297572 --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/04-mail_delivery_delay.yaml @@ -0,0 +1,21 @@ +module: postfix +name: "Size 
Postfix Delivery Delay" +id: "postfix_smtp_delivery_delay_seconds_count" + +transformation: ".min(over='30m')" +aggregation: true +filtering: "filter('queue', 'maildrop')" + +signals: + signal: + metric: "postfix_smtp_delivery_delay_seconds_count" + +rules: + critical: + threshold: 60 + comparator: ">" + + major: + threshold: 45 + comparator: ">" + dependency: "critical" diff --git a/modules/prometheus-exporter_postfix/conf/readme.yaml b/modules/prometheus-exporter_postfix/conf/readme.yaml new file mode 100644 index 000000000..9015fc41a --- /dev/null +++ b/modules/prometheus-exporter_postfix/conf/readme.yaml @@ -0,0 +1,3 @@ +documentations: + +source_doc: diff --git a/modules/prometheus-exporter_postfix/detectors-gen.tf b/modules/prometheus-exporter_postfix/detectors-gen.tf new file mode 100644 index 000000000..d6df80533 --- /dev/null +++ b/modules/prometheus-exporter_postfix/detectors-gen.tf @@ -0,0 +1,192 @@ +resource "signalfx_detector" "heartbeat" { + name = format("%s %s", local.detector_name_prefix, "Postfix heartbeat") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + from signalfx.detectors.not_reporting import not_reporting + signal = data('postfix_up', filter=%{if var.heartbeat_exclude_not_running_vm}${local.not_running_vm_filters} and %{endif}${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') + not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') +EOF + + rule { + description = "has not reported in ${var.heartbeat_timeframe}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.heartbeat_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.heartbeat_notifications, 
"critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.heartbeat_runbook_url, var.runbook_url), "") + tip = var.heartbeat_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject_novalue : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.heartbeat_max_delay +} + +resource "signalfx_detector" "postfix_showq_message_size_bytes_count_deferred" { + name = format("%s %s", local.detector_name_prefix, "Postfix size postfix queue deferred") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('queue', 'deferred') + signal = data('postfix_showq_message_size_bytes_count', filter=base_filtering and ${module.filtering.signalflow})${var.postfix_showq_message_size_bytes_count_deferred_aggregation_function}${var.postfix_showq_message_size_bytes_count_deferred_transformation_function}.publish('signal') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_deferred_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical}', at_least=${var.postfix_showq_message_size_bytes_count_deferred_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_deferred_threshold_major}%{if var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_major != null}, lasting='${var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_major}', at_least=${var.postfix_showq_message_size_bytes_count_deferred_at_least_percentage_major}%{endif}) and (not when(signal > 
${var.postfix_showq_message_size_bytes_count_deferred_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical}', at_least=${var.postfix_showq_message_size_bytes_count_deferred_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_deferred_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_deferred_disabled_critical, var.postfix_showq_message_size_bytes_count_deferred_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_deferred_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_deferred_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_deferred_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_deferred_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_deferred_disabled_major, var.postfix_showq_message_size_bytes_count_deferred_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_deferred_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_deferred_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_deferred_tip + parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.postfix_showq_message_size_bytes_count_deferred_max_delay +} + +resource "signalfx_detector" "postfix_showq_message_size_bytes_count_hold" { + name = format("%s %s", local.detector_name_prefix, "Postfix size postfix queue hold") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('queue', 'hold') + signal = data('postfix_showq_message_size_bytes_count', filter=base_filtering and ${module.filtering.signalflow})${var.postfix_showq_message_size_bytes_count_hold_aggregation_function}${var.postfix_showq_message_size_bytes_count_hold_transformation_function}.publish('signal') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_hold_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_hold_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_hold_lasting_duration_critical}', at_least=${var.postfix_showq_message_size_bytes_count_hold_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_hold_threshold_major}%{if var.postfix_showq_message_size_bytes_count_hold_lasting_duration_major != null}, lasting='${var.postfix_showq_message_size_bytes_count_hold_lasting_duration_major}', at_least=${var.postfix_showq_message_size_bytes_count_hold_at_least_percentage_major}%{endif}) and (not when(signal > ${var.postfix_showq_message_size_bytes_count_hold_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_hold_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_hold_lasting_duration_critical}', 
at_least=${var.postfix_showq_message_size_bytes_count_hold_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_hold_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_hold_disabled_critical, var.postfix_showq_message_size_bytes_count_hold_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_hold_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_hold_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_hold_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_hold_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_hold_disabled_major, var.postfix_showq_message_size_bytes_count_hold_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_hold_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_hold_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_hold_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.postfix_showq_message_size_bytes_count_hold_max_delay +} + +resource "signalfx_detector" "postfix_showq_message_size_bytes_count_maildrop" { + name = format("%s %s", local.detector_name_prefix, "Postfix size postfix queue maildrop") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('queue', 'maildrop') + signal = data('postfix_showq_message_size_bytes_count', filter=base_filtering and ${module.filtering.signalflow})${var.postfix_showq_message_size_bytes_count_maildrop_aggregation_function}${var.postfix_showq_message_size_bytes_count_maildrop_transformation_function}.publish('signal') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical}', at_least=${var.postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_major}%{if var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_major != null}, lasting='${var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_major}', at_least=${var.postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_major}%{endif}) and (not when(signal > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_critical}%{if var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical != null}, lasting='${var.postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical}', 
at_least=${var.postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_maildrop_disabled_critical, var.postfix_showq_message_size_bytes_count_maildrop_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_maildrop_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_maildrop_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_maildrop_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.postfix_showq_message_size_bytes_count_maildrop_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.postfix_showq_message_size_bytes_count_maildrop_disabled_major, var.postfix_showq_message_size_bytes_count_maildrop_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_showq_message_size_bytes_count_maildrop_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.postfix_showq_message_size_bytes_count_maildrop_runbook_url, var.runbook_url), "") + tip = var.postfix_showq_message_size_bytes_count_maildrop_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.postfix_showq_message_size_bytes_count_maildrop_max_delay +} + +resource "signalfx_detector" "postfix_smtp_delivery_delay_seconds_count" { + name = format("%s %s", local.detector_name_prefix, "Postfix size postfix delivery delay") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('queue', 'maildrop') + signal = data('postfix_smtp_delivery_delay_seconds_count', filter=base_filtering and ${module.filtering.signalflow})${var.postfix_smtp_delivery_delay_seconds_count_aggregation_function}${var.postfix_smtp_delivery_delay_seconds_count_transformation_function}.publish('signal') + detect(when(signal > ${var.postfix_smtp_delivery_delay_seconds_count_threshold_critical}%{if var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical != null}, lasting='${var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical}', at_least=${var.postfix_smtp_delivery_delay_seconds_count_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.postfix_smtp_delivery_delay_seconds_count_threshold_major}%{if var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_major != null}, lasting='${var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_major}', at_least=${var.postfix_smtp_delivery_delay_seconds_count_at_least_percentage_major}%{endif}) and (not when(signal > ${var.postfix_smtp_delivery_delay_seconds_count_threshold_critical}%{if var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical != null}, lasting='${var.postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical}', at_least=${var.postfix_smtp_delivery_delay_seconds_count_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > 
${var.postfix_smtp_delivery_delay_seconds_count_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.postfix_smtp_delivery_delay_seconds_count_disabled_critical, var.postfix_smtp_delivery_delay_seconds_count_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_smtp_delivery_delay_seconds_count_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.postfix_smtp_delivery_delay_seconds_count_runbook_url, var.runbook_url), "") + tip = var.postfix_smtp_delivery_delay_seconds_count_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.postfix_smtp_delivery_delay_seconds_count_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.postfix_smtp_delivery_delay_seconds_count_disabled_major, var.postfix_smtp_delivery_delay_seconds_count_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.postfix_smtp_delivery_delay_seconds_count_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.postfix_smtp_delivery_delay_seconds_count_runbook_url, var.runbook_url), "") + tip = var.postfix_smtp_delivery_delay_seconds_count_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.postfix_smtp_delivery_delay_seconds_count_max_delay +} + diff --git a/modules/prometheus-exporter_postfix/outputs.tf b/modules/prometheus-exporter_postfix/outputs.tf new file mode 100644 index 000000000..c816c0800 --- /dev/null +++ b/modules/prometheus-exporter_postfix/outputs.tf @@ -0,0 +1,25 @@ +output "heartbeat" { + description = "Detector resource for heartbeat" + value = signalfx_detector.heartbeat +} + +output "postfix_showq_message_size_bytes_count_deferred" { + description = "Detector resource for postfix_showq_message_size_bytes_count_deferred" + value = signalfx_detector.postfix_showq_message_size_bytes_count_deferred +} + +output "postfix_showq_message_size_bytes_count_hold" { + description = "Detector resource for postfix_showq_message_size_bytes_count_hold" + value = signalfx_detector.postfix_showq_message_size_bytes_count_hold +} + +output "postfix_showq_message_size_bytes_count_maildrop" { + description = "Detector resource for postfix_showq_message_size_bytes_count_maildrop" + value = signalfx_detector.postfix_showq_message_size_bytes_count_maildrop +} + +output "postfix_smtp_delivery_delay_seconds_count" { + description = "Detector resource for postfix_smtp_delivery_delay_seconds_count" + value = signalfx_detector.postfix_smtp_delivery_delay_seconds_count +} + diff --git a/modules/prometheus-exporter_postfix/tags.tf b/modules/prometheus-exporter_postfix/tags.tf new file mode 100644 index 000000000..d8c3398d2 --- /dev/null +++ b/modules/prometheus-exporter_postfix/tags.tf @@ -0,0 +1,4 @@ +locals { + tags = ["prometheus-exporter", "postfix"] +} + diff --git a/modules/prometheus-exporter_postfix/variables-gen.tf b/modules/prometheus-exporter_postfix/variables-gen.tf new file mode 100644 index 000000000..3089c2425 --- /dev/null +++ b/modules/prometheus-exporter_postfix/variables-gen.tf @@ -0,0 +1,410 @@ +# heartbeat detector + +variable "heartbeat_notifications" { + description = "Notification 
recipients list per severity overridden for heartbeat detector" + type = map(list(string)) + default = {} +} + +variable "heartbeat_aggregation_function" { + description = "Aggregation function and group by for heartbeat detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "heartbeat_max_delay" { + description = "Enforce max delay for heartbeat detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "heartbeat_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "heartbeat_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "heartbeat_disabled" { + description = "Disable all alerting rules for heartbeat detector" + type = bool + default = null +} + +variable "heartbeat_exclude_not_running_vm" { + description = "Don’t send alerts if associated VM is stopped or stopping (metadata provided by cloud provider integration). Can be useful for ephemeral infrastructure (such as auto scaling groups) as VM will be stopped and started regularly. Note that timeframe must be at least 25 minutes for the metadata to be available to the detector." + type = bool + default = true +} + +variable "heartbeat_timeframe" { + description = "Timeframe for heartbeat detector (i.e. \"25m\"). 
Must be at least \"25m\" if \"heartbeat_exclude_not_running_vm\" is true" + type = string + default = "25m" +} + +# postfix_showq_message_size_bytes_count_deferred detector + +variable "postfix_showq_message_size_bytes_count_deferred_notifications" { + description = "Notification recipients list per severity overridden for postfix_showq_message_size_bytes_count_deferred detector" + type = map(list(string)) + default = {} +} + +variable "postfix_showq_message_size_bytes_count_deferred_aggregation_function" { + description = "Aggregation function and group by for postfix_showq_message_size_bytes_count_deferred detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_deferred_transformation_function" { + description = "Transformation function for postfix_showq_message_size_bytes_count_deferred detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='30m')" +} + +variable "postfix_showq_message_size_bytes_count_deferred_max_delay" { + description = "Enforce max delay for postfix_showq_message_size_bytes_count_deferred detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "postfix_showq_message_size_bytes_count_deferred_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_deferred_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_deferred_disabled" { + description = "Disable all alerting rules for postfix_showq_message_size_bytes_count_deferred detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_deferred_disabled_critical" { + description = "Disable critical alerting rule for 
postfix_showq_message_size_bytes_count_deferred detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_deferred_disabled_major" { + description = "Disable major alerting rule for postfix_showq_message_size_bytes_count_deferred detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_deferred_threshold_critical" { + description = "Critical threshold for postfix_showq_message_size_bytes_count_deferred detector" + type = number + default = 600 +} + +variable "postfix_showq_message_size_bytes_count_deferred_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_showq_message_size_bytes_count_deferred_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "postfix_showq_message_size_bytes_count_deferred_threshold_major" { + description = "Major threshold for postfix_showq_message_size_bytes_count_deferred detector" + type = number + default = 300 +} + +variable "postfix_showq_message_size_bytes_count_deferred_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_showq_message_size_bytes_count_deferred_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# postfix_showq_message_size_bytes_count_hold detector + +variable "postfix_showq_message_size_bytes_count_hold_notifications" { + description = "Notification recipients list per severity overridden for postfix_showq_message_size_bytes_count_hold detector" + type = map(list(string)) + default = {} +} + +variable 
"postfix_showq_message_size_bytes_count_hold_aggregation_function" { + description = "Aggregation function and group by for postfix_showq_message_size_bytes_count_hold detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_hold_transformation_function" { + description = "Transformation function for postfix_showq_message_size_bytes_count_hold detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='30m')" +} + +variable "postfix_showq_message_size_bytes_count_hold_max_delay" { + description = "Enforce max delay for postfix_showq_message_size_bytes_count_hold detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "postfix_showq_message_size_bytes_count_hold_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_hold_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_hold_disabled" { + description = "Disable all alerting rules for postfix_showq_message_size_bytes_count_hold detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_hold_disabled_critical" { + description = "Disable critical alerting rule for postfix_showq_message_size_bytes_count_hold detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_hold_disabled_major" { + description = "Disable major alerting rule for postfix_showq_message_size_bytes_count_hold detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_hold_threshold_critical" { + description = "Critical threshold for postfix_showq_message_size_bytes_count_hold detector" + type = number + default = 600 +} + +variable 
"postfix_showq_message_size_bytes_count_hold_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_showq_message_size_bytes_count_hold_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "postfix_showq_message_size_bytes_count_hold_threshold_major" { + description = "Major threshold for postfix_showq_message_size_bytes_count_hold detector" + type = number + default = 300 +} + +variable "postfix_showq_message_size_bytes_count_hold_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_showq_message_size_bytes_count_hold_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# postfix_showq_message_size_bytes_count_maildrop detector + +variable "postfix_showq_message_size_bytes_count_maildrop_notifications" { + description = "Notification recipients list per severity overridden for postfix_showq_message_size_bytes_count_maildrop detector" + type = map(list(string)) + default = {} +} + +variable "postfix_showq_message_size_bytes_count_maildrop_aggregation_function" { + description = "Aggregation function and group by for postfix_showq_message_size_bytes_count_maildrop detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_maildrop_transformation_function" { + description = "Transformation function for postfix_showq_message_size_bytes_count_maildrop detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='30m')" +} + +variable "postfix_showq_message_size_bytes_count_maildrop_max_delay" { + description = "Enforce max delay for postfix_showq_message_size_bytes_count_maildrop detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "postfix_showq_message_size_bytes_count_maildrop_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_maildrop_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "postfix_showq_message_size_bytes_count_maildrop_disabled" { + description = "Disable all alerting rules for postfix_showq_message_size_bytes_count_maildrop detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_maildrop_disabled_critical" { + description = "Disable critical alerting rule for postfix_showq_message_size_bytes_count_maildrop detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_maildrop_disabled_major" { + description = "Disable major alerting rule for postfix_showq_message_size_bytes_count_maildrop detector" + type = bool + default = null +} + +variable "postfix_showq_message_size_bytes_count_maildrop_threshold_critical" { + description = "Critical threshold for postfix_showq_message_size_bytes_count_maildrop detector" + type = number + default = 600 +} + +variable "postfix_showq_message_size_bytes_count_maildrop_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert 
(>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "postfix_showq_message_size_bytes_count_maildrop_threshold_major" { + description = "Major threshold for postfix_showq_message_size_bytes_count_maildrop detector" + type = number + default = 300 +} + +variable "postfix_showq_message_size_bytes_count_maildrop_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_showq_message_size_bytes_count_maildrop_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# postfix_smtp_delivery_delay_seconds_count detector + +variable "postfix_smtp_delivery_delay_seconds_count_notifications" { + description = "Notification recipients list per severity overridden for postfix_smtp_delivery_delay_seconds_count detector" + type = map(list(string)) + default = {} +} + +variable "postfix_smtp_delivery_delay_seconds_count_aggregation_function" { + description = "Aggregation function and group by for postfix_smtp_delivery_delay_seconds_count detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "postfix_smtp_delivery_delay_seconds_count_transformation_function" { + description = "Transformation function for postfix_smtp_delivery_delay_seconds_count detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='30m')" +} + +variable "postfix_smtp_delivery_delay_seconds_count_max_delay" { + description = "Enforce max delay for postfix_smtp_delivery_delay_seconds_count detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "postfix_smtp_delivery_delay_seconds_count_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "postfix_smtp_delivery_delay_seconds_count_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "postfix_smtp_delivery_delay_seconds_count_disabled" { + description = "Disable all alerting rules for postfix_smtp_delivery_delay_seconds_count detector" + type = bool + default = null +} + +variable "postfix_smtp_delivery_delay_seconds_count_disabled_critical" { + description = "Disable critical alerting rule for postfix_smtp_delivery_delay_seconds_count detector" + type = bool + default = null +} + +variable "postfix_smtp_delivery_delay_seconds_count_disabled_major" { + description = "Disable major alerting rule for postfix_smtp_delivery_delay_seconds_count detector" + type = bool + default = null +} + +variable "postfix_smtp_delivery_delay_seconds_count_threshold_critical" { + description = "Critical threshold for postfix_smtp_delivery_delay_seconds_count detector" + type = number + default = 60 +} + +variable "postfix_smtp_delivery_delay_seconds_count_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_smtp_delivery_delay_seconds_count_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable 
"postfix_smtp_delivery_delay_seconds_count_threshold_major" { + description = "Major threshold for postfix_smtp_delivery_delay_seconds_count detector" + type = number + default = 45 +} + +variable "postfix_smtp_delivery_delay_seconds_count_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "postfix_smtp_delivery_delay_seconds_count_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} From 7b884f4f4bc7d6652860f680a865102353a7dd5f Mon Sep 17 00:00:00 2001 From: HugLee <70705221+hugueslepesant@users.noreply.github.com> Date: Thu, 12 Sep 2024 09:52:27 +0200 Subject: [PATCH 5/6] feat: prometheus-exporter_dnsmasq (#561) * feat: prometheus-exporter_dnsmasq * fix: prometheus-exporter_dnsmasq/README.md --------- Co-authored-by: Jean-Baptiste Simillon --- docs/severity.md | 10 + modules/prometheus-exporter_dnsmasq/README.md | 117 ++++++++++ .../common-filters.tf | 1 + .../common-locals.tf | 1 + .../common-modules.tf | 1 + .../common-variables.tf | 1 + .../common-versions.tf | 1 + .../conf/00-heartbeat.yaml | 13 ++ .../conf/01-cachesize_limit.yaml | 16 ++ .../conf/02-hit-rate.yaml | 24 +++ .../conf/readme.yaml | 3 + .../detectors-gen.tf | 97 +++++++++ .../prometheus-exporter_dnsmasq/outputs.tf | 15 ++ modules/prometheus-exporter_dnsmasq/tags.tf | 4 + .../variables-gen.tf | 201 ++++++++++++++++++ 15 files changed, 505 insertions(+) create mode 100644 modules/prometheus-exporter_dnsmasq/README.md create mode 120000 modules/prometheus-exporter_dnsmasq/common-filters.tf create mode 120000 modules/prometheus-exporter_dnsmasq/common-locals.tf create mode 120000 modules/prometheus-exporter_dnsmasq/common-modules.tf create mode 120000 modules/prometheus-exporter_dnsmasq/common-variables.tf create mode 120000 
modules/prometheus-exporter_dnsmasq/common-versions.tf create mode 100644 modules/prometheus-exporter_dnsmasq/conf/00-heartbeat.yaml create mode 100644 modules/prometheus-exporter_dnsmasq/conf/01-cachesize_limit.yaml create mode 100644 modules/prometheus-exporter_dnsmasq/conf/02-hit-rate.yaml create mode 100644 modules/prometheus-exporter_dnsmasq/conf/readme.yaml create mode 100644 modules/prometheus-exporter_dnsmasq/detectors-gen.tf create mode 100644 modules/prometheus-exporter_dnsmasq/outputs.tf create mode 100644 modules/prometheus-exporter_dnsmasq/tags.tf create mode 100644 modules/prometheus-exporter_dnsmasq/variables-gen.tf diff --git a/docs/severity.md b/docs/severity.md index df8b7dab7..a9477c7e3 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -79,6 +79,7 @@ - [organization_usage](#organization_usage) - [otel-collector_kubernetes-common](#otel-collector_kubernetes-common) - [prometheus-exporter_active-directory](#prometheus-exporter_active-directory) +- [prometheus-exporter_dnsmasq](#prometheus-exporter_dnsmasq) - [prometheus-exporter_docker-state](#prometheus-exporter_docker-state) - [prometheus-exporter_kong](#prometheus-exporter_kong) - [prometheus-exporter_oracledb](#prometheus-exporter_oracledb) @@ -874,6 +875,15 @@ |Active-directory active directory services|X|-|-|-|-| +## prometheus-exporter_dnsmasq + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Dnsmasq heartbeat|X|-|-|-|-| +|Dnsmasq hits|X|-|-|-|-| +|Dnsmasq hit rate|-|X|X|-|-| + + ## prometheus-exporter_docker-state |Detector|Critical|Major|Minor|Warning|Info| diff --git a/modules/prometheus-exporter_dnsmasq/README.md b/modules/prometheus-exporter_dnsmasq/README.md new file mode 100644 index 000000000..e5e5e43d8 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/README.md @@ -0,0 +1,117 @@ +# DNSMASQ SignalFx detectors + + + +:link: **Contents** + +- [How to use this module?](#how-to-use-this-module) +- [What are the available detectors in this 
module?](#what-are-the-available-detectors-in-this-module) +- [How to collect required metrics?](#how-to-collect-required-metrics) + - [Metrics](#metrics) +- [Related documentation](#related-documentation) + + + +## How to use this module? + +This directory defines a [Terraform](https://www.terraform.io/) +[module](https://www.terraform.io/language/modules/syntax) you can use in your +existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#stack) by adding a +`module` configuration and setting its `source` parameter to URL of this folder: + +```hcl +module "signalfx-detectors-prometheus-exporter-dnsmasq" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/prometheus-exporter_dnsmasq?ref={revision}" + + environment = var.environment + notifications = local.notifications +} +``` + +Note the following parameters: + +* `source`: Use this parameter to specify the URL of the module. The double slash (`//`) is intentional and required. + Terraform uses it to specify subfolders within a Git repo (see [module + sources](https://www.terraform.io/language/modules/sources)). The `ref` parameter specifies a specific Git tag in + this repository. It is recommended to use the latest "pinned" version in place of `{revision}`. Avoid using a branch + like `master` except for testing purpose. Note that every modules in this repository are available on the Terraform + [registry](https://registry.terraform.io/modules/claranet/detectors/signalfx) and we recommend using it as source + instead of `git` which is more flexible but less future-proof. + +* `environment`: Use this parameter to specify the + [environment](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#environment) used by this + instance of the module. + Its value will be added to the `prefixes` list at the start of the [detector + name](https://github.com/claranet/terraform-signalfx-detectors/wiki/Templating#example). 
+ In general, it will also be used in the `filtering` internal sub-module to [apply + filters](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#filtering) based on our default + [tagging convention](https://github.com/claranet/terraform-signalfx-detectors/wiki/Tagging-convention) by default. + +* `notifications`: Use this parameter to define where alerts should be sent depending on their severity. It consists + of a Terraform [object](https://www.terraform.io/language/expressions/type-constraints#object) where each key represents an available + [detector rule severity](https://docs.splunk.com/observability/alerts-detectors-notifications/create-detectors-for-alerts.html#severity) + and its value is a list of recipients. Every recipients must respect the [detector notification + format](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector#notification-format). + Check the [notification binding](https://github.com/claranet/terraform-signalfx-detectors/wiki/Notifications-binding) + documentation to understand the recommended role of each severity. + +These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all +[modules](../) in this repository. Other variables, specific to this module, are available in +[variables-gen.tf](variables-gen.tf). +In general, the default configuration "works" but all of these Terraform +[variables](https://www.terraform.io/language/values/variables) make it possible to +customize the detectors behavior to better fit your needs. + +Most of them represent usual tips and rules detailed in the +[guidance](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance) documentation and listed in the +common [variables](https://github.com/claranet/terraform-signalfx-detectors/wiki/Variables) dedicated documentation. 
+ +Feel free to explore the [wiki](https://github.com/claranet/terraform-signalfx-detectors/wiki) for more information about +general usage of this repository. + +## What are the available detectors in this module? + +This module creates the following SignalFx detectors which could contain one or multiple alerting rules: + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|Dnsmasq heartbeat|X|-|-|-|-| +|Dnsmasq hits|X|-|-|-|-| +|Dnsmasq hit rate|-|X|X|-|-| + +## How to collect required metrics? + +This module deploys detectors using metrics reported by the +scraping of a server following the [OpenMetrics convention](https://openmetrics.io/) based on and compatible with [the Prometheus +exposition format](https://github.com/prometheus/docs/blob/main/content/docs/instrumenting/exposition_formats.md#openmetrics-text-format). + +They are generally called `Prometheus Exporters` which can be fetched by both the [SignalFx Smart Agent](https://github.com/signalfx/signalfx-agent) +thanks to its [prometheus exporter monitor](https://github.com/signalfx/signalfx-agent/blob/main/docs/monitors/prometheus-exporter.md) and the +[OpenTelemetry Collector](https://github.com/signalfx/splunk-otel-collector) using its [prometheus +receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/prometheusreceiver) or its derivatives. + +These exporters could be embedded directly in the tool you want to monitor (e.g. nginx ingress) or must be installed next to it as +a separate program configured to connect, create metrics and expose them as server. + + +Check the [Related documentation](#related-documentation) section for more detailed and specific information about this module dependencies. + + + +### Metrics + + +Here is the list of required metrics for detectors in this module. 
+ +* `dnsmasq_cachesize` +* `dnsmasq_hits` +* `dnsmasq_misses` + + + + +## Related documentation + +* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) +* [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector) +* [Splunk Observability integrations](https://docs.splunk.com/Observability/gdi/get-data-in/integrations.html) diff --git a/modules/prometheus-exporter_dnsmasq/common-filters.tf b/modules/prometheus-exporter_dnsmasq/common-filters.tf new file mode 120000 index 000000000..51ac61525 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/common-filters.tf @@ -0,0 +1 @@ +../../common/module/filters-prometheus-exporter.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_dnsmasq/common-locals.tf b/modules/prometheus-exporter_dnsmasq/common-locals.tf new file mode 120000 index 000000000..5672d21ab --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/common-locals.tf @@ -0,0 +1 @@ +../../common/module/locals.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_dnsmasq/common-modules.tf b/modules/prometheus-exporter_dnsmasq/common-modules.tf new file mode 120000 index 000000000..8c81ef377 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/common-modules.tf @@ -0,0 +1 @@ +../../common/module/modules.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_dnsmasq/common-variables.tf b/modules/prometheus-exporter_dnsmasq/common-variables.tf new file mode 120000 index 000000000..f3037a584 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/common-variables.tf @@ -0,0 +1 @@ +../../common/module/variables.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_dnsmasq/common-versions.tf b/modules/prometheus-exporter_dnsmasq/common-versions.tf new file mode 120000 index 000000000..fa7f5509f --- /dev/null +++ 
b/modules/prometheus-exporter_dnsmasq/common-versions.tf @@ -0,0 +1 @@ +../../common/module/versions.tf \ No newline at end of file diff --git a/modules/prometheus-exporter_dnsmasq/conf/00-heartbeat.yaml b/modules/prometheus-exporter_dnsmasq/conf/00-heartbeat.yaml new file mode 100644 index 000000000..c303d719a --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/conf/00-heartbeat.yaml @@ -0,0 +1,13 @@ +module: dnsmasq +name: heartbeat + +transformation: false +aggregation: true + +exclude_not_running_vm: true + +signals: + signal: + metric: "dnsmasq_cachesize" +rules: + critical: diff --git a/modules/prometheus-exporter_dnsmasq/conf/01-cachesize_limit.yaml b/modules/prometheus-exporter_dnsmasq/conf/01-cachesize_limit.yaml new file mode 100644 index 000000000..b4bc3b575 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/conf/01-cachesize_limit.yaml @@ -0,0 +1,16 @@ +module: dnsmasq +name: hits +id: dnsmasq_hits + +transformation: true +aggregation: true + +signals: + signal: + metric: dnsmasq_hits + +rules: + critical: + threshold: 1 + comparator: "<=" + lasting_duration: '5m' diff --git a/modules/prometheus-exporter_dnsmasq/conf/02-hit-rate.yaml b/modules/prometheus-exporter_dnsmasq/conf/02-hit-rate.yaml new file mode 100644 index 000000000..fb05bcf33 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/conf/02-hit-rate.yaml @@ -0,0 +1,24 @@ +module: dnsmasq +name: Hit Rate +id: dnsmasq_hit_rate + +transformation: ".min(over='5m')" +aggregation: true + +signals: + A: + metric: dnsmasq_hits + B: + metric: dnsmasq_misses + signal: + formula: (A/(A+B)).fill(0).scale(100) +rules: + minor: + threshold: 90 + comparator: "<" + lasting_duration: "5m" + dependency: major + major: + threshold: 80 + comparator: "<=" + lasting_duration: "5m" diff --git a/modules/prometheus-exporter_dnsmasq/conf/readme.yaml b/modules/prometheus-exporter_dnsmasq/conf/readme.yaml new file mode 100644 index 000000000..9015fc41a --- /dev/null +++ 
b/modules/prometheus-exporter_dnsmasq/conf/readme.yaml @@ -0,0 +1,3 @@ +documentations: + +source_doc: diff --git a/modules/prometheus-exporter_dnsmasq/detectors-gen.tf b/modules/prometheus-exporter_dnsmasq/detectors-gen.tf new file mode 100644 index 000000000..4f4212ab9 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/detectors-gen.tf @@ -0,0 +1,97 @@ +resource "signalfx_detector" "heartbeat" { + name = format("%s %s", local.detector_name_prefix, "Dnsmasq heartbeat") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + from signalfx.detectors.not_reporting import not_reporting + signal = data('dnsmasq_cachesize', filter=%{if var.heartbeat_exclude_not_running_vm}${local.not_running_vm_filters} and %{endif}${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') + not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') +EOF + + rule { + description = "has not reported in ${var.heartbeat_timeframe}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.heartbeat_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.heartbeat_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.heartbeat_runbook_url, var.runbook_url), "") + tip = var.heartbeat_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject_novalue : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.heartbeat_max_delay +} + +resource "signalfx_detector" "dnsmasq_hits" { + name = format("%s %s", local.detector_name_prefix, "Dnsmasq hits") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = data('dnsmasq_hits', filter=${module.filtering.signalflow})${var.dnsmasq_hits_aggregation_function}${var.dnsmasq_hits_transformation_function}.publish('signal') + detect(when(signal <= ${var.dnsmasq_hits_threshold_critical}%{if var.dnsmasq_hits_lasting_duration_critical != null}, lasting='${var.dnsmasq_hits_lasting_duration_critical}', at_least=${var.dnsmasq_hits_at_least_percentage_critical}%{endif})).publish('CRIT') +EOF + + rule { + description = "is too low <= ${var.dnsmasq_hits_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.dnsmasq_hits_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.dnsmasq_hits_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.dnsmasq_hits_runbook_url, var.runbook_url), "") + tip = var.dnsmasq_hits_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.dnsmasq_hits_max_delay +} + +resource "signalfx_detector" "dnsmasq_hit_rate" { + name = format("%s %s", local.detector_name_prefix, "Dnsmasq hit rate") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + A = data('dnsmasq_hits', filter=${module.filtering.signalflow})${var.dnsmasq_hit_rate_aggregation_function}${var.dnsmasq_hit_rate_transformation_function} + B = data('dnsmasq_misses', filter=${module.filtering.signalflow})${var.dnsmasq_hit_rate_aggregation_function}${var.dnsmasq_hit_rate_transformation_function} + signal = (A/(A+B)).fill(0).scale(100).publish('signal') + detect(when(signal < ${var.dnsmasq_hit_rate_threshold_minor}%{if var.dnsmasq_hit_rate_lasting_duration_minor != null}, lasting='${var.dnsmasq_hit_rate_lasting_duration_minor}', at_least=${var.dnsmasq_hit_rate_at_least_percentage_minor}%{endif}) and (not when(signal <= ${var.dnsmasq_hit_rate_threshold_major}%{if var.dnsmasq_hit_rate_lasting_duration_major != null}, lasting='${var.dnsmasq_hit_rate_lasting_duration_major}', at_least=${var.dnsmasq_hit_rate_at_least_percentage_major}%{endif}))).publish('MINOR') + detect(when(signal <= ${var.dnsmasq_hit_rate_threshold_major}%{if var.dnsmasq_hit_rate_lasting_duration_major != null}, lasting='${var.dnsmasq_hit_rate_lasting_duration_major}', at_least=${var.dnsmasq_hit_rate_at_least_percentage_major}%{endif})).publish('MAJOR') +EOF + + rule { + description = "is too low < ${var.dnsmasq_hit_rate_threshold_minor}" + severity = "Minor" + detect_label = "MINOR" + disabled = coalesce(var.dnsmasq_hit_rate_disabled_minor, var.dnsmasq_hit_rate_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.dnsmasq_hit_rate_notifications, "minor", []), var.notifications.minor), null) + runbook_url = 
try(coalesce(var.dnsmasq_hit_rate_runbook_url, var.runbook_url), "") + tip = var.dnsmasq_hit_rate_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too low <= ${var.dnsmasq_hit_rate_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.dnsmasq_hit_rate_disabled_major, var.dnsmasq_hit_rate_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.dnsmasq_hit_rate_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.dnsmasq_hit_rate_runbook_url, var.runbook_url), "") + tip = var.dnsmasq_hit_rate_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.dnsmasq_hit_rate_max_delay +} + diff --git a/modules/prometheus-exporter_dnsmasq/outputs.tf b/modules/prometheus-exporter_dnsmasq/outputs.tf new file mode 100644 index 000000000..4c3112430 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/outputs.tf @@ -0,0 +1,15 @@ +output "dnsmasq_hit_rate" { + description = "Detector resource for dnsmasq_hit_rate" + value = signalfx_detector.dnsmasq_hit_rate +} + +output "dnsmasq_hits" { + description = "Detector resource for dnsmasq_hits" + value = signalfx_detector.dnsmasq_hits +} + +output "heartbeat" { + description = "Detector resource for heartbeat" + value = signalfx_detector.heartbeat +} + diff --git a/modules/prometheus-exporter_dnsmasq/tags.tf b/modules/prometheus-exporter_dnsmasq/tags.tf new file mode 100644 index 000000000..9c6615c89 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/tags.tf @@ -0,0 +1,4 @@ +locals { + tags = ["prometheus-exporter", "dnsmasq"] +} + diff --git a/modules/prometheus-exporter_dnsmasq/variables-gen.tf 
b/modules/prometheus-exporter_dnsmasq/variables-gen.tf new file mode 100644 index 000000000..9027271c0 --- /dev/null +++ b/modules/prometheus-exporter_dnsmasq/variables-gen.tf @@ -0,0 +1,201 @@ +# heartbeat detector + +variable "heartbeat_notifications" { + description = "Notification recipients list per severity overridden for heartbeat detector" + type = map(list(string)) + default = {} +} + +variable "heartbeat_aggregation_function" { + description = "Aggregation function and group by for heartbeat detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "heartbeat_max_delay" { + description = "Enforce max delay for heartbeat detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "heartbeat_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "heartbeat_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "heartbeat_disabled" { + description = "Disable all alerting rules for heartbeat detector" + type = bool + default = null +} + +variable "heartbeat_exclude_not_running_vm" { + description = "Don’t send alerts if associated VM is stopped or stopping (metadata provided by cloud provider integration). Can be useful for ephemeral infrastructure (such as auto scaling groups) as VM will be stopped and started regularly. Note that timeframe must be at least 25 minutes for the metadata to be available to the detector." + type = bool + default = true +} + +variable "heartbeat_timeframe" { + description = "Timeframe for heartbeat detector (i.e. \"25m\"). 
Must be at least \"25m\" if \"heartbeat_exclude_not_running_vm\" is true" + type = string + default = "25m" +} + +# dnsmasq_hits detector + +variable "dnsmasq_hits_notifications" { + description = "Notification recipients list per severity overridden for dnsmasq_hits detector" + type = map(list(string)) + default = {} +} + +variable "dnsmasq_hits_aggregation_function" { + description = "Aggregation function and group by for dnsmasq_hits detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "dnsmasq_hits_transformation_function" { + description = "Transformation function for dnsmasq_hits detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} + +variable "dnsmasq_hits_max_delay" { + description = "Enforce max delay for dnsmasq_hits detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "dnsmasq_hits_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "dnsmasq_hits_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "dnsmasq_hits_disabled" { + description = "Disable all alerting rules for dnsmasq_hits detector" + type = bool + default = null +} + +variable "dnsmasq_hits_threshold_critical" { + description = "Critical threshold for dnsmasq_hits detector" + type = number + default = 1 +} + +variable "dnsmasq_hits_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "dnsmasq_hits_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# dnsmasq_hit_rate detector + +variable "dnsmasq_hit_rate_notifications" { + description = "Notification recipients 
list per severity overridden for dnsmasq_hit_rate detector" + type = map(list(string)) + default = {} +} + +variable "dnsmasq_hit_rate_aggregation_function" { + description = "Aggregation function and group by for dnsmasq_hit_rate detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "dnsmasq_hit_rate_transformation_function" { + description = "Transformation function for dnsmasq_hit_rate detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='5m')" +} + +variable "dnsmasq_hit_rate_max_delay" { + description = "Enforce max delay for dnsmasq_hit_rate detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "dnsmasq_hit_rate_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "dnsmasq_hit_rate_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "dnsmasq_hit_rate_disabled" { + description = "Disable all alerting rules for dnsmasq_hit_rate detector" + type = bool + default = null +} + +variable "dnsmasq_hit_rate_disabled_minor" { + description = "Disable minor alerting rule for dnsmasq_hit_rate detector" + type = bool + default = null +} + +variable "dnsmasq_hit_rate_disabled_major" { + description = "Disable major alerting rule for dnsmasq_hit_rate detector" + type = bool + default = null +} + +variable "dnsmasq_hit_rate_threshold_minor" { + description = "Minor threshold for dnsmasq_hit_rate detector" + type = number + default = 90 +} + +variable "dnsmasq_hit_rate_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "dnsmasq_hit_rate_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 
1.0)" + type = number + default = 1 +} +variable "dnsmasq_hit_rate_threshold_major" { + description = "Major threshold for dnsmasq_hit_rate detector" + type = number + default = 80 +} + +variable "dnsmasq_hit_rate_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "dnsmasq_hit_rate_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} From 53a0cc19d4aeb7500d173e54020396da16362d0c Mon Sep 17 00:00:00 2001 From: Emmanuel Clisson <132999284+eclisson-clt@users.noreply.github.com> Date: Thu, 12 Sep 2024 15:07:24 +0200 Subject: [PATCH 6/6] Integration gcp cloud sql postgresql (#568) * feat(gcp): Add PostgreSQL detectors * Feat(gpc): Add PostgreSQL detectors * Feat(gpc): Add PostgreSQL detectors * Feat(gpc): Add PostgreSQL detectors * Update README;md * Update README;md * Clean Up * test * Revert "test" This reverts commit 84d497b50bd80017d14ce37b20a8b2172325c31e. 
* Fix readme * fix readme --- docs/severity.md | 8 ++ .../README.md | 107 ++++++++++++++++++ .../common-locals.tf | 1 + .../common-modules.tf | 1 + .../common-variables.tf | 1 + .../common-versions.tf | 1 + .../conf/01-replication_lag.yaml | 21 ++++ .../conf/readme.yaml | 3 + .../detectors-gen.tf | 40 +++++++ .../filters.tf | 4 + .../outputs.tf | 5 + .../tags.tf | 3 + .../variables-gen.tf | 90 +++++++++++++++ .../variables.tf | 4 + 14 files changed, 289 insertions(+) create mode 100644 modules/integration_gcp-cloud-sql-postgresql/README.md create mode 120000 modules/integration_gcp-cloud-sql-postgresql/common-locals.tf create mode 120000 modules/integration_gcp-cloud-sql-postgresql/common-modules.tf create mode 120000 modules/integration_gcp-cloud-sql-postgresql/common-variables.tf create mode 120000 modules/integration_gcp-cloud-sql-postgresql/common-versions.tf create mode 100644 modules/integration_gcp-cloud-sql-postgresql/conf/01-replication_lag.yaml create mode 100644 modules/integration_gcp-cloud-sql-postgresql/conf/readme.yaml create mode 100644 modules/integration_gcp-cloud-sql-postgresql/detectors-gen.tf create mode 100644 modules/integration_gcp-cloud-sql-postgresql/filters.tf create mode 100644 modules/integration_gcp-cloud-sql-postgresql/outputs.tf create mode 100644 modules/integration_gcp-cloud-sql-postgresql/tags.tf create mode 100644 modules/integration_gcp-cloud-sql-postgresql/variables-gen.tf create mode 100644 modules/integration_gcp-cloud-sql-postgresql/variables.tf diff --git a/docs/severity.md b/docs/severity.md index a9477c7e3..149003026 100644 --- a/docs/severity.md +++ b/docs/severity.md @@ -70,6 +70,7 @@ - [integration_gcp-cloud-sql-common](#integration_gcp-cloud-sql-common) - [integration_gcp-cloud-sql-failover](#integration_gcp-cloud-sql-failover) - [integration_gcp-cloud-sql-mysql](#integration_gcp-cloud-sql-mysql) +- [integration_gcp-cloud-sql-postgresql](#integration_gcp-cloud-sql-postgresql) - 
[integration_gcp-compute-engine](#integration_gcp-compute-engine) - [integration_gcp-load-balancing](#integration_gcp-load-balancing) - [integration_gcp-memorystore-redis](#integration_gcp-memorystore-redis) @@ -775,6 +776,13 @@ |GCP Cloud SQL MySQL replication lag|X|X|-|-|-| +## integration_gcp-cloud-sql-postgresql + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|GCP Cloud SQL PostgreSQL replication lag|X|X|-|-|-| + + ## integration_gcp-compute-engine |Detector|Critical|Major|Minor|Warning|Info| diff --git a/modules/integration_gcp-cloud-sql-postgresql/README.md b/modules/integration_gcp-cloud-sql-postgresql/README.md new file mode 100644 index 000000000..759671357 --- /dev/null +++ b/modules/integration_gcp-cloud-sql-postgresql/README.md @@ -0,0 +1,107 @@ +# GCP-CLOUD-SQL-POSTGRESQL SignalFx detectors + + + +:link: **Contents** + +- [How to use this module?](#how-to-use-this-module) +- [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module) +- [How to collect required metrics?](#how-to-collect-required-metrics) + - [Metrics](#metrics) +- [Related documentation](#related-documentation) + + + +## How to use this module? + +This directory defines a [Terraform](https://www.terraform.io/) +[module](https://www.terraform.io/language/modules/syntax) you can use in your +existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#stack) by adding a +`module` configuration and setting its `source` parameter to URL of this folder: + +```hcl +module "signalfx-detectors-integration-gcp-cloud-sql-postgresql" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/integration_gcp-cloud-sql-postgresql?ref={revision}" + + environment = var.environment + notifications = local.notifications + gcp_project_id = "fillme" +} +``` + +Note the following parameters: + +* `source`: Use this parameter to specify the URL of the module. 
The double slash (`//`) is intentional and required. + Terraform uses it to specify subfolders within a Git repo (see [module + sources](https://www.terraform.io/language/modules/sources)). The `ref` parameter specifies a specific Git tag in + this repository. It is recommended to use the latest "pinned" version in place of `{revision}`. Avoid using a branch + like `master` except for testing purpose. Note that every modules in this repository are available on the Terraform + [registry](https://registry.terraform.io/modules/claranet/detectors/signalfx) and we recommend using it as source + instead of `git` which is more flexible but less future-proof. + +* `environment`: Use this parameter to specify the + [environment](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#environment) used by this + instance of the module. + Its value will be added to the `prefixes` list at the start of the [detector + name](https://github.com/claranet/terraform-signalfx-detectors/wiki/Templating#example). + In general, it will also be used in the `filtering` internal sub-module to [apply + filters](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#filtering) based on our default + [tagging convention](https://github.com/claranet/terraform-signalfx-detectors/wiki/Tagging-convention) by default. + +* `notifications`: Use this parameter to define where alerts should be sent depending on their severity. It consists + of a Terraform [object](https://www.terraform.io/language/expressions/type-constraints#object) where each key represents an available + [detector rule severity](https://docs.splunk.com/observability/alerts-detectors-notifications/create-detectors-for-alerts.html#severity) + and its value is a list of recipients. Every recipients must respect the [detector notification + format](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector#notification-format). 
+ Check the [notification binding](https://github.com/claranet/terraform-signalfx-detectors/wiki/Notifications-binding) + documentation to understand the recommended role of each severity. + +These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all +[modules](../) in this repository. Other variables, specific to this module, are available in +[variables.tf](variables.tf) and [variables-gen.tf](variables-gen.tf). +In general, the default configuration "works" but all of these Terraform +[variables](https://www.terraform.io/language/values/variables) make it possible to +customize the detectors behavior to better fit your needs. + +Most of them represent usual tips and rules detailed in the +[guidance](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance) documentation and listed in the +common [variables](https://github.com/claranet/terraform-signalfx-detectors/wiki/Variables) dedicated documentation. + +Feel free to explore the [wiki](https://github.com/claranet/terraform-signalfx-detectors/wiki) for more information about +general usage of this repository. + +## What are the available detectors in this module? + +This module creates the following SignalFx detectors which could contain one or multiple alerting rules: + +|Detector|Critical|Major|Minor|Warning|Info| +|---|---|---|---|---|---| +|GCP Cloud SQL PostgreSQL replication lag|X|X|-|-|-| + +## How to collect required metrics? + +This module deploys detectors using metrics reported by the +[GCP integration](https://docs.splunk.com/observability/en/gdi/get-data-in/connect/gcp/gcp-metrics.html) configurable +with [this Terraform module](https://github.com/claranet/terraform-signalfx-integrations/tree/master/cloud/gcp). + + +Check the [Related documentation](#related-documentation) section for more detailed and specific information about this module dependencies. 
+ + + +### Metrics + + +Here is the list of required metrics for detectors in this module. + +* `database/postgresql/replication/replica_byte_lag` + + + + +## Related documentation + +* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) +* [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector) +* [Splunk Observability integrations](https://docs.splunk.com/Observability/gdi/get-data-in/integrations.html) +* [Stackdriver metrics](https://cloud.google.com/monitoring/api/metrics_gcp#gcp-cloudsql) diff --git a/modules/integration_gcp-cloud-sql-postgresql/common-locals.tf b/modules/integration_gcp-cloud-sql-postgresql/common-locals.tf new file mode 120000 index 000000000..5672d21ab --- /dev/null +++ b/modules/integration_gcp-cloud-sql-postgresql/common-locals.tf @@ -0,0 +1 @@ +../../common/module/locals.tf \ No newline at end of file diff --git a/modules/integration_gcp-cloud-sql-postgresql/common-modules.tf b/modules/integration_gcp-cloud-sql-postgresql/common-modules.tf new file mode 120000 index 000000000..8c81ef377 --- /dev/null +++ b/modules/integration_gcp-cloud-sql-postgresql/common-modules.tf @@ -0,0 +1 @@ +../../common/module/modules.tf \ No newline at end of file diff --git a/modules/integration_gcp-cloud-sql-postgresql/common-variables.tf b/modules/integration_gcp-cloud-sql-postgresql/common-variables.tf new file mode 120000 index 000000000..f3037a584 --- /dev/null +++ b/modules/integration_gcp-cloud-sql-postgresql/common-variables.tf @@ -0,0 +1 @@ +../../common/module/variables.tf \ No newline at end of file diff --git a/modules/integration_gcp-cloud-sql-postgresql/common-versions.tf b/modules/integration_gcp-cloud-sql-postgresql/common-versions.tf new file mode 120000 index 000000000..fa7f5509f --- /dev/null +++ b/modules/integration_gcp-cloud-sql-postgresql/common-versions.tf @@ -0,0 +1 @@ +../../common/module/versions.tf \ No 
newline at end of file diff --git a/modules/integration_gcp-cloud-sql-postgresql/conf/01-replication_lag.yaml b/modules/integration_gcp-cloud-sql-postgresql/conf/01-replication_lag.yaml new file mode 100644 index 000000000..e101d6112 --- /dev/null +++ b/modules/integration_gcp-cloud-sql-postgresql/conf/01-replication_lag.yaml @@ -0,0 +1,21 @@ +module: "GCP Cloud SQL PostgreSQL" +name: "Replication lag" +id: "replication_lag" + +transformation: ".min(over='10m')" +aggregation: true + + +signals: + signal: + metric: "database/postgresql/replication/replica_byte_lag" + +rules: + critical: + threshold: 180 + comparator: ">" + + major: + threshold: 90 + comparator: ">" + dependency: "critical" \ No newline at end of file diff --git a/modules/integration_gcp-cloud-sql-postgresql/conf/readme.yaml b/modules/integration_gcp-cloud-sql-postgresql/conf/readme.yaml new file mode 100644 index 000000000..fd58ccf0f --- /dev/null +++ b/modules/integration_gcp-cloud-sql-postgresql/conf/readme.yaml @@ -0,0 +1,3 @@ +documentations: + - name: Stackdriver metrics + url: 'https://cloud.google.com/monitoring/api/metrics_gcp#gcp-cloudsql' diff --git a/modules/integration_gcp-cloud-sql-postgresql/detectors-gen.tf b/modules/integration_gcp-cloud-sql-postgresql/detectors-gen.tf new file mode 100644 index 000000000..71cfeb58e --- /dev/null +++ b/modules/integration_gcp-cloud-sql-postgresql/detectors-gen.tf @@ -0,0 +1,40 @@ +resource "signalfx_detector" "replication_lag" { + name = format("%s %s", local.detector_name_prefix, "GCP Cloud SQL PostgreSQL replication lag") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + signal = data('database/postgresql/replication/replica_byte_lag', 
filter=${module.filtering.signalflow})${var.replication_lag_aggregation_function}${var.replication_lag_transformation_function}.publish('signal') + detect(when(signal > ${var.replication_lag_threshold_critical}%{if var.replication_lag_lasting_duration_critical != null}, lasting='${var.replication_lag_lasting_duration_critical}', at_least=${var.replication_lag_at_least_percentage_critical}%{endif})).publish('CRIT') + detect(when(signal > ${var.replication_lag_threshold_major}%{if var.replication_lag_lasting_duration_major != null}, lasting='${var.replication_lag_lasting_duration_major}', at_least=${var.replication_lag_at_least_percentage_major}%{endif}) and (not when(signal > ${var.replication_lag_threshold_critical}%{if var.replication_lag_lasting_duration_critical != null}, lasting='${var.replication_lag_lasting_duration_critical}', at_least=${var.replication_lag_at_least_percentage_critical}%{endif}))).publish('MAJOR') +EOF + + rule { + description = "is too high > ${var.replication_lag_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.replication_lag_disabled_critical, var.replication_lag_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.replication_lag_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.replication_lag_runbook_url, var.runbook_url), "") + tip = var.replication_lag_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.replication_lag_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.replication_lag_disabled_major, var.replication_lag_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.replication_lag_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.replication_lag_runbook_url, var.runbook_url), "") + tip = var.replication_lag_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.replication_lag_max_delay +} + diff --git a/modules/integration_gcp-cloud-sql-postgresql/filters.tf b/modules/integration_gcp-cloud-sql-postgresql/filters.tf new file mode 100644 index 000000000..5445c6f26 --- /dev/null +++ b/modules/integration_gcp-cloud-sql-postgresql/filters.tf @@ -0,0 +1,4 @@ +locals { + filters = "filter('project_id', '${var.gcp_project_id}')" +} + diff --git a/modules/integration_gcp-cloud-sql-postgresql/outputs.tf b/modules/integration_gcp-cloud-sql-postgresql/outputs.tf new file mode 100644 index 000000000..84e4af8e1 --- /dev/null +++ b/modules/integration_gcp-cloud-sql-postgresql/outputs.tf @@ -0,0 +1,5 @@ +output "replication_lag" { + description = "Detector resource for replication_lag" + value = signalfx_detector.replication_lag +} + diff --git a/modules/integration_gcp-cloud-sql-postgresql/tags.tf b/modules/integration_gcp-cloud-sql-postgresql/tags.tf new file mode 100644 index 000000000..23bf95f8a --- /dev/null +++ b/modules/integration_gcp-cloud-sql-postgresql/tags.tf @@ -0,0 +1,3 @@ +locals { + tags = ["integration", "gcp-cloud-sql-postgresql"] +} diff --git a/modules/integration_gcp-cloud-sql-postgresql/variables-gen.tf b/modules/integration_gcp-cloud-sql-postgresql/variables-gen.tf new file mode 100644 index 
000000000..d48d30b24 --- /dev/null +++ b/modules/integration_gcp-cloud-sql-postgresql/variables-gen.tf @@ -0,0 +1,90 @@ +# replication_lag detector + +variable "replication_lag_notifications" { + description = "Notification recipients list per severity overridden for replication_lag detector" + type = map(list(string)) + default = {} +} + +variable "replication_lag_aggregation_function" { + description = "Aggregation function and group by for replication_lag detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "replication_lag_transformation_function" { + description = "Transformation function for replication_lag detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='10m')" +} + +variable "replication_lag_max_delay" { + description = "Enforce max delay for replication_lag detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "replication_lag_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "replication_lag_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "replication_lag_disabled" { + description = "Disable all alerting rules for replication_lag detector" + type = bool + default = null +} + +variable "replication_lag_disabled_critical" { + description = "Disable critical alerting rule for replication_lag detector" + type = bool + default = null +} + +variable "replication_lag_disabled_major" { + description = "Disable major alerting rule for replication_lag detector" + type = bool + default = null +} + +variable "replication_lag_threshold_critical" { + description = "Critical threshold for replication_lag detector" + type = number + default = 180 +} + +variable "replication_lag_lasting_duration_critical" { + description = "Minimum duration that conditions 
must be true before raising alert" + type = string + default = null +} + +variable "replication_lag_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "replication_lag_threshold_major" { + description = "Major threshold for replication_lag detector" + type = number + default = 90 +} + +variable "replication_lag_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "replication_lag_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} diff --git a/modules/integration_gcp-cloud-sql-postgresql/variables.tf b/modules/integration_gcp-cloud-sql-postgresql/variables.tf new file mode 100644 index 000000000..901d3ad46 --- /dev/null +++ b/modules/integration_gcp-cloud-sql-postgresql/variables.tf @@ -0,0 +1,4 @@ +variable "gcp_project_id" { + description = "GCP project id used for default filtering while labels are not synced" + type = string +}