From 39185ab2af1d5d288b0b15c85c7e614818024d5d Mon Sep 17 00:00:00 2001 From: Ignacio Rivas Mendez Date: Thu, 18 Jul 2024 15:06:34 +0200 Subject: [PATCH] feat : Add custom detector for MongoDB status --- modules/smart-agent_custom_mongodb/README.md | 172 ++++++++++++++++++ .../common-filters.tf | 4 + .../common-locals.tf | 44 +++++ .../common-modules.tf | 8 + .../common-variables.tf | 78 ++++++++ .../common-versions.tf | 9 + .../conf/00-service-up.yaml | 12 ++ .../conf/readme.yaml | 46 +++++ .../detectors-gen.tf | 26 +++ modules/smart-agent_custom_mongodb/outputs.tf | 5 + modules/smart-agent_custom_mongodb/tags.tf | 3 + .../variables-gen.tf | 62 +++++++ 12 files changed, 469 insertions(+) create mode 100644 modules/smart-agent_custom_mongodb/README.md create mode 100644 modules/smart-agent_custom_mongodb/common-filters.tf create mode 100644 modules/smart-agent_custom_mongodb/common-locals.tf create mode 100644 modules/smart-agent_custom_mongodb/common-modules.tf create mode 100644 modules/smart-agent_custom_mongodb/common-variables.tf create mode 100644 modules/smart-agent_custom_mongodb/common-versions.tf create mode 100644 modules/smart-agent_custom_mongodb/conf/00-service-up.yaml create mode 100644 modules/smart-agent_custom_mongodb/conf/readme.yaml create mode 100644 modules/smart-agent_custom_mongodb/detectors-gen.tf create mode 100644 modules/smart-agent_custom_mongodb/outputs.tf create mode 100644 modules/smart-agent_custom_mongodb/tags.tf create mode 100644 modules/smart-agent_custom_mongodb/variables-gen.tf diff --git a/modules/smart-agent_custom_mongodb/README.md b/modules/smart-agent_custom_mongodb/README.md new file mode 100644 index 000000000..84ecfd7e2 --- /dev/null +++ b/modules/smart-agent_custom_mongodb/README.md @@ -0,0 +1,172 @@ +# MongoDB SignalFx detectors + + + +:link: **Contents** + +- [How to use this module?](#how-to-use-this-module) +- [What are the available detectors in this module?](#what-are-the-available-detectors-in-this-module) +- 
[How to collect required metrics?](#how-to-collect-required-metrics) + - [Monitors](#monitors) + - [Metrics](#metrics) +- [Notes](#notes) +- [Related documentation](#related-documentation) + + + +## How to use this module? + +This directory defines a [Terraform](https://www.terraform.io/) +[module](https://www.terraform.io/language/modules/syntax) you can use in your +existing [stack](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#stack) by adding a +`module` configuration and setting its `source` parameter to URL of this folder: + +```hcl +module "signalfx-detectors-smart-agent-mongodb" { + source = "github.com/claranet/terraform-signalfx-detectors.git//modules/smart-agent_custom_mongodb?ref={revision}" + + environment = var.environment + notifications = local.notifications +} +``` + +Note the following parameters: + +* `source`: Use this parameter to specify the URL of the module. The double slash (`//`) is intentional and required. + Terraform uses it to specify subfolders within a Git repo (see [module + sources](https://www.terraform.io/language/modules/sources)). The `ref` parameter specifies a specific Git tag in + this repository. It is recommended to use the latest "pinned" version in place of `{revision}`. Avoid using a branch + like `master` except for testing purpose. Note that every modules in this repository are available on the Terraform + [registry](https://registry.terraform.io/modules/claranet/detectors/signalfx) and we recommend using it as source + instead of `git` which is more flexible but less future-proof. + +* `environment`: Use this parameter to specify the + [environment](https://github.com/claranet/terraform-signalfx-detectors/wiki/Getting-started#environment) used by this + instance of the module. + Its value will be added to the `prefixes` list at the start of the [detector + name](https://github.com/claranet/terraform-signalfx-detectors/wiki/Templating#example). 
+ In general, it will also be used in the `filtering` internal sub-module to [apply + filters](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#filtering) based on our default + [tagging convention](https://github.com/claranet/terraform-signalfx-detectors/wiki/Tagging-convention) by default. + +* `notifications`: Use this parameter to define where alerts should be sent depending on their severity. It consists + of a Terraform [object](https://www.terraform.io/language/expressions/type-constraints#object) where each key represents an available + [detector rule severity](https://docs.splunk.com/observability/alerts-detectors-notifications/create-detectors-for-alerts.html#severity) + and its value is a list of recipients. Every recipients must respect the [detector notification + format](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector#notification-format). + Check the [notification binding](https://github.com/claranet/terraform-signalfx-detectors/wiki/Notifications-binding) + documentation to understand the recommended role of each severity. + +These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all +[modules](../) in this repository. Other variables, specific to this module, are available in +[variables-gen.tf](variables-gen.tf). +In general, the default configuration "works" but all of these Terraform +[variables](https://www.terraform.io/language/values/variables) make it possible to +customize the detectors behavior to better fit your needs. + +Most of them represent usual tips and rules detailed in the +[guidance](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance) documentation and listed in the +common [variables](https://github.com/claranet/terraform-signalfx-detectors/wiki/Variables) dedicated documentation. 
+ +Feel free to explore the [wiki](https://github.com/claranet/terraform-signalfx-detectors/wiki) for more information about +general usage of this repository. + +## What are the available detectors in this module? + +This module creates the following SignalFx detectors which could contain one or multiple alerting rules: + +| Detector |Critical|Major|Minor|Warning|Info| +|-----------------------------|---|---|---|---|---| +| MongoDB DOWN |X|-|-|-|-| +## How to collect required metrics? + +This module deploys detectors using metrics reported by the +[SignalFx Smart Agent Monitors](https://github.com/signalfx/signalfx-agent#monitors). + +Even if the [Smart Agent is deprecated](https://github.com/signalfx/signalfx-agent/blob/main/docs/smartagent-deprecation-notice.md) +it remains an efficient, lightweight and simple monitoring agent which still works fine. +See the [official documentation](https://docs.splunk.com/Observability/gdi/smart-agent/smart-agent-resources.html) for more information +about this agent. +You might find the related following documentations useful: +- the global level [agent configuration](https://github.com/signalfx/signalfx-agent/blob/main/docs/config-schema.md) +- the [monitor level configuration](https://github.com/signalfx/signalfx-agent/blob/main/docs/monitor-config.md) +- the internal [agent configuration tips](https://github.com/claranet/terraform-signalfx-detectors/wiki/Guidance#agent-configuration). +- the full list of [monitors available](https://github.com/signalfx/signalfx-agent/tree/main/docs/monitors) with their own specific documentation. 
+ +In addition, all of these monitors are still available in the [Splunk Otel Collector](https://github.com/signalfx/splunk-otel-collector), +the Splunk [distro of OpenTelemetry Collector](https://opentelemetry.io/docs/concepts/distributions/) which replaces SignalFx Smart Agent, +thanks to the internal [Smart Agent Receiver](https://github.com/signalfx/splunk-otel-collector/tree/main/pkg/receiver/smartagentreceiver). + +As a result: +- any SignalFx Smart Agent monitor are compatible with the new agent OpenTelemetry Collector and related modules in this repository keep `smart-agent` as source name. +- any OpenTelemetry receiver not based on an existing Smart Agent monitor is not available from old agent so related modules in this repository use `otel-collector` as source name. + + +Check the [Related documentation](#related-documentation) section for more detailed and specific information about this module dependencies. + +### Monitors + +This monitor is only available from agent version `>= 5.2.0` but it has evolved since and we +recommend to use at least version `v5.5.6`. + +Check the examples in the official monitor documentation and the Notes section below. + + +### Metrics + + +To filter only required metrics for the detectors of this module, add the +[datapointsToExclude](https://docs.splunk.com/observability/gdi/smart-agent/smart-agent-resources.html#filtering-data-using-the-smart-agent) +parameter to the corresponding monitor configuration: + +```yaml + datapointsToExclude: + - metricNames: + - '*' + - '!mongodb.status_code' + +``` + +## Notes + +This module creates a detector to check the MongoDB service status reported by the `mongodb.status_code` metric. + +* By default, `signalfx-agent` collection interval is `10s`. Depending on the services +checked it could be dangerous or useless to request them as often so you can change +`intervalSeconds` monitor(s) parameter as you prefer. 
+ +* The transformation allows to adapt sensitivity applying its function on a timeframe +which will change the evaluated value. The alert will be raised as soon as the conditions are +met but compared to a transformed value not true to reality and obviously more favorable. +This also affects the chart which could be not desired especially for troubleshooting +(webchecks often require accuracy). I.e. `max(over='15m')` on `mongodb_code_matched` will +always be OK (`1`) on alert (and so chart also) even if more than `50%` of checks done +on the timeframe are failed. + +* The `lasting` function does not change the value. It could apply on an evaluated value +different from the original (i.e. if you set `transformation_function` explicitly). +The chart will show the exact real value and even alert condition itself will be met +strictly immediately but alert will be raised only at the end of lasting timeframe +if the conditions have always remained. + +* By default, this module will raise alerts for these detectors with moderate sensitivity in +combination with `10s` collection interval and `lasting('60s')`: `6` datapoints for 1m +so the webcheck could fail 5 consecutive times before raising alert. + +* Feel free to use variables to adapt this sensitivity depending on your needs to make +detectors more tolerant (increasing lasting timeframe or even adding transformation) or +more strict (decreasing lasting timeframe or changing transformation function from `max` +to `min`). + +* If you have multiple webchecks which require different sensitivity levels, you can add +a common dimension using `addExtraDimensions` to a set of similar monitors on agent. Then, +you can import as many times this module with different value for `filtering_custom` variable +to match these different dimension(s) value(s). 
+ + +## Related documentation + +* [Terraform SignalFx provider](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs) +* [Terraform SignalFx detector](https://registry.terraform.io/providers/splunk-terraform/signalfx/latest/docs/resources/detector) +* [Splunk Observability integrations](https://docs.splunk.com/Observability/gdi/get-data-in/integrations.html) +* [Smart Agent monitor](https://github.com/signalfx/signalfx-agent/blob/main/docs/monitors/http.md) diff --git a/modules/smart-agent_custom_mongodb/common-filters.tf b/modules/smart-agent_custom_mongodb/common-filters.tf new file mode 100644 index 000000000..ba770a5bd --- /dev/null +++ b/modules/smart-agent_custom_mongodb/common-filters.tf @@ -0,0 +1,4 @@ +locals { + filters = "filter('env', '${var.environment}') and filter('sfx_monitored', 'true')" +} + diff --git a/modules/smart-agent_custom_mongodb/common-locals.tf b/modules/smart-agent_custom_mongodb/common-locals.tf new file mode 100644 index 000000000..51a7650c1 --- /dev/null +++ b/modules/smart-agent_custom_mongodb/common-locals.tf @@ -0,0 +1,44 @@ +locals { + heartbeat_auto_resolve_after = "1s" + not_running_vm_filters_gcp = "(not filter('gcp_status', '{Code=3, Name=STOPPING}', '{Code=4, Name=TERMINATED}'))" + not_running_vm_filters_aws = "(not filter('aws_state', '{Code: 32,Name: shutting-down}', '{Code: 48,Name: terminated}', '{Code: 64,Name: stopping}', '{Code: 80,Name: stopped}'))" + not_running_vm_filters_azure = "(not filter('azure_power_state', 'PowerState/stopping', 'PowerState/stopped', 'PowerState/deallocating', 'PowerState/deallocated'))" + not_running_vm_filters = format( + "%s and %s and %s", + local.not_running_vm_filters_aws, + local.not_running_vm_filters_gcp, + local.not_running_vm_filters_azure + ) + detector_name_prefix = "${join("", formatlist("[%s]", var.prefixes))}[${var.environment}]" + common_tags = concat(["terraform", var.environment], var.teams) + rule_subject_prefix = 
"[{{ruleSeverity}}]{{{detectorName}}} {{{readableRule}}}" + rule_subject_suffix = "on {{{dimensions}}}" + rule_subject = format("%s ({{inputs.signal.value}}) %s", local.rule_subject_prefix, local.rule_subject_suffix) + rule_subject_novalue = format("%s %s", local.rule_subject_prefix, local.rule_subject_suffix) + rule_body = <<-EOF + **Alert**: + *[{{ruleSeverity}}]{{{detectorName}}} {{{readableRule}}} ({{inputs.signal.value}})* + {{#if anomalous}} + **Triggered at**: + *{{timestamp}}* + {{else}} + **Cleared at**: + *{{timestamp}}* + {{/if}} + + {{#notEmpty dimensions}} + **Dimensions**: + *{{{dimensions}}}* + {{/notEmpty}} + + {{#if anomalous}} + {{#if runbookUrl}}**Runbook**: + Go to [this page]({{{runbookUrl}}}) for help and analysis. + {{/if}} + + {{#if tip}}**Tip**: + {{{tip}}} + {{/if}} + {{/if}} +EOF +} diff --git a/modules/smart-agent_custom_mongodb/common-modules.tf b/modules/smart-agent_custom_mongodb/common-modules.tf new file mode 100644 index 000000000..79d068bdd --- /dev/null +++ b/modules/smart-agent_custom_mongodb/common-modules.tf @@ -0,0 +1,8 @@ +module "filtering" { + source = "../internal_filtering" + + filtering_default = local.filters + filtering_custom = var.filtering_custom + append_mode = var.filtering_append +} + diff --git a/modules/smart-agent_custom_mongodb/common-variables.tf b/modules/smart-agent_custom_mongodb/common-variables.tf new file mode 100644 index 000000000..80cc77eee --- /dev/null +++ b/modules/smart-agent_custom_mongodb/common-variables.tf @@ -0,0 +1,78 @@ +# Global + +variable "environment" { + description = "Infrastructure environment" + type = string +} + +variable "notifications" { + description = "Default notification recipients list per severity" + type = object({ + critical = list(string) + major = list(string) + minor = list(string) + warning = list(string) + info = list(string) + }) +} + +variable "prefixes" { + description = "Prefixes list to prepend between brackets on every monitors names before environment" + 
type = list(string) + default = [] +} + +variable "filtering_custom" { + description = "Filters as SignalFlow string to either replace or append to default filtering convention which is the only one used if not defined" + type = string + default = null +} + +variable "filtering_append" { + description = "If true, the `filtering_custom` string will be appended to the default filtering convention instead of fully replace it" + type = bool + default = false +} + +variable "detectors_disabled" { + description = "Disable all detectors in this module" + type = bool + default = false +} + +variable "runbook_url" { + description = "Default runbook URL to apply to all detectors (if not overridden at detector level)" + type = string + default = "" +} + +variable "authorized_writer_teams" { + description = "List of teams IDs authorized (with admins) to edit the detector. If defined, it requires an user token to work" + type = list(string) + default = null +} + +variable "teams" { + description = "List of teams IDs to associate the detector to" + type = list(string) + default = [] +} + +variable "message_subject" { + description = "The subject to use in alerting rules messages which overrides the default template" + type = string + default = "" +} + +variable "message_body" { + description = "The body to use in alerting rules messages which overrides the default template" + type = string + default = "" +} + +variable "extra_tags" { + description = "List of tags to add to the detectors resources, useful to find detectors " + type = list(string) + default = [] +} + diff --git a/modules/smart-agent_custom_mongodb/common-versions.tf b/modules/smart-agent_custom_mongodb/common-versions.tf new file mode 100644 index 000000000..d77818c04 --- /dev/null +++ b/modules/smart-agent_custom_mongodb/common-versions.tf @@ -0,0 +1,9 @@ +terraform { + required_providers { + signalfx = { + source = "splunk-terraform/signalfx" + version = ">= 7.0.0" + } + } + required_version = ">= 0.12.26" +} 
diff --git a/modules/smart-agent_custom_mongodb/conf/00-service-up.yaml b/modules/smart-agent_custom_mongodb/conf/00-service-up.yaml new file mode 100644 index 000000000..d693e7da0 --- /dev/null +++ b/modules/smart-agent_custom_mongodb/conf/00-service-up.yaml @@ -0,0 +1,12 @@ +module: MongoDB +name: status + +transformation: false +aggregation: true +exclude_not_running_vm: true + +signals: + signal: + metric: mongodb.status_code +rules: + critical: diff --git a/modules/smart-agent_custom_mongodb/conf/readme.yaml b/modules/smart-agent_custom_mongodb/conf/readme.yaml new file mode 100644 index 000000000..7697b3058 --- /dev/null +++ b/modules/smart-agent_custom_mongodb/conf/readme.yaml @@ -0,0 +1,46 @@ +documentations: + - name: Smart Agent monitor + url: 'https://github.com/signalfx/signalfx-agent/blob/main/docs/monitors/http.md' + +source_doc: | + ### Monitors + + This monitor is only available from agent version `>= 5.2.0` but it has evolved since and we + recommend to use at least version `v5.5.6`. + + Check the examples in the official monitor documentation and the Notes section below. + +notes: | + This module creates a detector to check the MongoDB service status reported by the `mongodb.status_code` metric. + + * By default, `signalfx-agent` collection interval is `10s`. Depending on the services + checked it could be dangerous or useless to request them as often so you can change + `intervalSeconds` monitor(s) parameter as you prefer. + + * The transformation allows to adapt sensitivity applying its function on a timeframe + which will change the evaluated value. The alert will be raised as soon as the conditions are + met but compared to a transformed value not true to reality and obviously more favorable. + This also affects the chart which could be not desired especially for troubleshooting + (webchecks often require accuracy). I.e. 
`max(over='15m')` on `mongodb_code_matched` will + always be OK (`1`) on alert (and so chart also) even if more than `50%` of checks done + on the timeframe are failed. + + * The `lasting` function does not change the value. It could apply on an evaluated value + different from the original (i.e. if you set `transformation_function` explicitly). + The chart will show the exact real value and even alert condition itself will be met + strictly immediately but alert will be raised only at the end of lasting timeframe + if the conditions have always remained. + + * By default, this module will raise alerts for these detectors with moderate sensitivity in + combination with `10s` collection interval and `lasting('60s')`: `6` datapoints for 1m + so the webcheck could fail 5 consecutive times before raising alert. + + * Feel free to use variables to adapt this sensitivity depending on your needs to make + detectors more tolerant (increasing lasting timeframe or even adding transformation) or + more strict (decreasing lasting timeframe or changing transformation function from `max` + to `min`). + + * If you have multiple webchecks which require different sensitivity levels, you can add + a common dimension using `addExtraDimensions` to a set of similar monitors on agent. Then, + you can import as many times this module with different value for `filtering_custom` variable + to match these different dimension(s) value(s). 
diff --git a/modules/smart-agent_custom_mongodb/detectors-gen.tf b/modules/smart-agent_custom_mongodb/detectors-gen.tf
new file mode 100644
index 000000000..6bf6403c6
--- /dev/null
+++ b/modules/smart-agent_custom_mongodb/detectors-gen.tf
@@ -0,0 +1,26 @@
+resource "signalfx_detector" "mongodb_code_matched" {
+  name = format("%s %s", local.detector_name_prefix, "MongoDB service DOWN")
+
+  authorized_writer_teams = var.authorized_writer_teams
+  teams                   = try(coalescelist(var.teams, var.authorized_writer_teams), null) # first non-empty list wins; null when neither is usable
+  tags                    = compact(concat(local.common_tags, local.tags, var.extra_tags))  # common + module (tags.tf) + user tags, empty strings dropped
+
+  program_text = <<-EOF
+    signal = data('mongodb.status_code', filter=${module.filtering.signalflow}, rollup='min')${var.mongodb_code_matched_aggregation_function}${var.mongodb_code_matched_transformation_function}.publish('signal')
+    detect(when(signal > ${var.mongodb_code_matched_threshold_critical}%{if var.mongodb_code_matched_lasting_duration_critical != null}, lasting='${var.mongodb_code_matched_lasting_duration_critical}', at_least=${var.mongodb_code_matched_at_least_percentage_critical}%{endif})).publish('CRIT')
+EOF
+
+  rule {
+    description           = "does not match expected result > ${var.mongodb_code_matched_threshold_critical}"
+    severity              = "Critical"
+    detect_label          = "CRIT" # must match the label published by detect() in program_text
+    disabled              = coalesce(var.mongodb_code_matched_disabled, var.detectors_disabled) # detector-level flag, else module-wide one
+    notifications         = try(coalescelist(lookup(var.mongodb_code_matched_notifications, "critical", []), var.notifications.critical), null) # detector-level recipients, else module-wide ones
+    runbook_url           = try(coalesce(var.mongodb_code_matched_runbook_url, var.runbook_url), "")
+    tip                   = var.mongodb_code_matched_tip
+    parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject
+    parameterized_body    = var.message_body == "" ? local.rule_body : var.message_body
+  }
+
+  max_delay = var.mongodb_code_matched_max_delay
+}
diff --git a/modules/smart-agent_custom_mongodb/outputs.tf b/modules/smart-agent_custom_mongodb/outputs.tf
new file mode 100644
index 000000000..e9224274f
--- /dev/null
+++ b/modules/smart-agent_custom_mongodb/outputs.tf
@@ -0,0 +1,5 @@
+
+output "mongodb_code_matched" {
+  description = "Detector resource for mongodb_code_matched"
+  value       = signalfx_detector.mongodb_code_matched
+}
diff --git a/modules/smart-agent_custom_mongodb/tags.tf b/modules/smart-agent_custom_mongodb/tags.tf
new file mode 100644
index 000000000..2734ff153
--- /dev/null
+++ b/modules/smart-agent_custom_mongodb/tags.tf
@@ -0,0 +1,3 @@
+locals {
+  tags = ["smart-agent", "mongodb"] # module-specific tags, merged into the detector tags in detectors-gen.tf
+}
diff --git a/modules/smart-agent_custom_mongodb/variables-gen.tf b/modules/smart-agent_custom_mongodb/variables-gen.tf
new file mode 100644
index 000000000..4524ba9d4
--- /dev/null
+++ b/modules/smart-agent_custom_mongodb/variables-gen.tf
@@ -0,0 +1,62 @@
+
+# mongodb_code_matched detector
+
+variable "mongodb_code_matched_notifications" {
+  description = "Notification recipients list per severity overridden for mongodb_code_matched detector"
+  type        = map(list(string))
+  default     = {}
+}
+
+variable "mongodb_code_matched_aggregation_function" {
+  description = "Aggregation function and group by for mongodb_code_matched detector (i.e. \".mean(by=['host'])\")"
+  type        = string
+  default     = ""
+}
+
+variable "mongodb_code_matched_transformation_function" {
+  description = "Transformation function for mongodb_code_matched detector (i.e. \".mean(over='5m')\")"
+  type        = string
+  default     = ""
+}
+
+variable "mongodb_code_matched_max_delay" {
+  description = "Enforce max delay for mongodb_code_matched detector (use \"0\" or \"null\" for \"Auto\")"
+  type        = number
+  default     = null
+}
+
+variable "mongodb_code_matched_tip" {
+  description = "Suggested first course of action or any note useful for incident handling"
+  type        = string
+  default     = ""
+}
+
+variable "mongodb_code_matched_runbook_url" {
+  description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause"
+  type        = string
+  default     = ""
+}
+
+variable "mongodb_code_matched_disabled" {
+  description = "Disable all alerting rules for mongodb_code_matched detector"
+  type        = bool
+  default     = null
+}
+
+variable "mongodb_code_matched_threshold_critical" {
+  description = "Critical threshold for mongodb_code_matched detector"
+  type        = number
+  default     = 0
+}
+
+variable "mongodb_code_matched_lasting_duration_critical" {
+  description = "Minimum duration that conditions must be true before raising alert"
+  type        = string
+  default     = "1m"
+}
+
+variable "mongodb_code_matched_at_least_percentage_critical" {
+  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
+  type        = number
+  default     = 1
+}