Increases PodDown alert threshold from 1h to 4h (#905)

nkinkade · web-flow · commit 64c276f40475 · 2024-09-23T14:23:34.000-06:00
1h is too susceptible to catching transient errors that clear themselves after a
while.
diff --git a/config/prometheus/alerts.yml b/config/prometheus/alerts.yml
@@ -386,16 +386,14 @@ groups:
           gmx_machine_maintenance == 1 or
           up{job="kubernetes-nodes"} == 0
         )
-    for: 1h
+    for: 4h
     labels:
       repo: ops-tracker
       severity: ticket
       cluster: platform
     annotations:
       summary: A {{ $labels.deployment }} pod is down or broken.
-      description: A {{ $labels.deployment }} pod is down or broken. Verify that the
-        DaemonSet or Deployment is healthy. Check the status of the node that the
-        pod is scheduled on. Check the status of the pod itself, if it exists.
+      description: https://github.com/m-lab/ops-tracker/wiki/Alerts-&-Troubleshooting#platformcluster_poddown
       dashboard: https://grafana.mlab-staging.measurementlab.net/d/rJ7z2Suik/k8s-site-overview
 
 # Etcd alerts.