diff --git a/base-helm-configs/prometheus/alerting_rules.yaml b/base-helm-configs/prometheus/alerting_rules.yaml index f63aad63..6117bd8a 100644 --- a/base-helm-configs/prometheus/alerting_rules.yaml +++ b/base-helm-configs/prometheus/alerting_rules.yaml @@ -123,3 +123,21 @@ additionalPrometheusRulesMap: annotations: summary: OVN backup volume >= 90% disk usage description: "OVN backup volume >= 90% disk usage" + - name: MariaDB backup alerts + rules: + - alert: mariadbBackupWarning + expr: time() - kube_cronjob_status_last_successful_time{cronjob="mariadb-backup"} > 21600 + for: 1h + labels: + severity: warning + annotations: + summary: Last MariaDB backup not successful within 1 hour of scheduled run + description: "Last MariaDB backup not successful within 1 hour of scheduled run" + - alert: mariadbBackupCritical + expr: time() - kube_cronjob_status_last_successful_time{cronjob="mariadb-backup"} > 43200 + for: 1h + labels: + severity: critical + annotations: + summary: Second successive MariaDB backup not successful within 1 hour of scheduled run + description: "Second successive MariaDB backup not successful within 1 hour of scheduled run" diff --git a/docs/alerting-info.md b/docs/alerting-info.md index d783b52c..f2a78c13 100644 --- a/docs/alerting-info.md +++ b/docs/alerting-info.md @@ -80,3 +80,14 @@ The following list contains a few examples of these receivers as part of the [al * [Microsoft Teams Receiver](alertmanager-msteams.md) We can now take all this information and build out an alerting workflow that suits our needs! + +## Genestack alerts + +This section contains some information on individual Genestack alerts. + +### MariaDB backup alert + +Based on a schedule of 6 hours by default, it allows 1 hour to upload and +alerts when MariaDB doesn't successfully complete a backup. + +It alerts at warning level the first time this happens, and at critical level the second time this happens.