From 5da6d75094e2ed6f4b67c65b07c563a2a36a8cc9 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <viktorbarzin@meta.com>
Date: Sun, 12 Apr 2026 12:41:07 +0100
Subject: [PATCH] fix(monitoring): PodCrashLooping alert now fires only for
 active CrashLoopBackOff

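Switch PodCrashLooping from restart-count-based detection
(increase(kube_pod_container_status_restarts_total[1h]) > 5) to
waiting-reason-based detection
(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0).
The alert now auto-resolves once the pod recovers, making it clear whether
the issue is still active. The `for` duration drops from 15m to 5m, and the
summary annotation no longer reports a restart count.

Also point the prometheus-backup volume at the
monitoring-prometheus-backup-host PVC.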
---
 .../modules/monitoring/prometheus_chart_values.tpl        | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index b979e875..e72e7085 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -192,7 +192,7 @@ server:
           sizeLimit: 2Gi
       - name: prometheus-backup
         persistentVolumeClaim:
-          claimName: monitoring-prometheus-backup
+          claimName: monitoring-prometheus-backup-host
   extraVolumeMounts:
     - name: prometheus-wal-tmpfs
       mountPath: /data/wal
@@ -1017,12 +1017,12 @@ serverFiles:
       - name: K8s Health
         rules:
           - alert: PodCrashLooping
-            expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
-            for: 15m
+            expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
+            for: 5m
             labels:
               severity: warning
             annotations:
-              summary: "{{ $labels.namespace }}/{{ $labels.pod }}: {{ $value | printf \"%.0f\" }} restarts in 1h"
+              summary: "{{ $labels.namespace }}/{{ $labels.pod }}: stuck in CrashLoopBackOff"
           - alert: ContainerOOMKilled
             expr: increase(container_oom_events_total{container!=""}[15m]) > 0 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
             for: 5m