From 5da6d75094e2ed6f4b67c65b07c563a2a36a8cc9 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 12 Apr 2026 12:41:07 +0100 Subject: [PATCH] fix(monitoring): PodCrashLooping alert now fires only for active CrashLoopBackOff Switch from restart-count based detection (increase(kube_pod_container_status_restarts_total[1h]) > 5) to waiting-reason based detection (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0). The alert auto-resolves when the pod recovers, making it clear whether the issue is still active. The pending window ("for") is shortened from 15m to 5m, and the summary annotation no longer reports a restart count. Also point the prometheus-backup volume at the monitoring-prometheus-backup-host PVC instead of monitoring-prometheus-backup. --- .../modules/monitoring/prometheus_chart_values.tpl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index b979e875..e72e7085 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -192,7 +192,7 @@ server: sizeLimit: 2Gi - name: prometheus-backup persistentVolumeClaim: - claimName: monitoring-prometheus-backup + claimName: monitoring-prometheus-backup-host extraVolumeMounts: - name: prometheus-wal-tmpfs mountPath: /data/wal @@ -1017,12 +1017,12 @@ serverFiles: - name: K8s Health rules: - alert: PodCrashLooping - expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 - for: 15m + expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 + for: 5m labels: severity: warning annotations: - summary: "{{ $labels.namespace }}/{{ $labels.pod }}: {{ $value | printf \"%.0f\" }} restarts in 1h" + summary: "{{ $labels.namespace }}/{{ $labels.pod }}: stuck in CrashLoopBackOff" - alert: ContainerOOMKilled expr: increase(container_oom_events_total{container!=""}[15m]) > 0 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 for: 5m