fix(monitoring): PodCrashLooping alert now fires only for active CrashLoopBackOff
Switch from restart-count-based detection (increase(kube_pod_container_status_restarts_total[1h]) > 5) to
waiting-reason-based detection (kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0).
The alert now auto-resolves when the pod recovers, making it clear whether the issue is still active.
This commit is contained in:
parent
cc670d949c
commit
5da6d75094
1 changed file with 4 additions and 4 deletions
|
|
@@ -192,7 +192,7 @@ server:
|
|||
sizeLimit: 2Gi
|
||||
- name: prometheus-backup
|
||||
persistentVolumeClaim:
|
||||
claimName: monitoring-prometheus-backup
|
||||
claimName: monitoring-prometheus-backup-host
|
||||
extraVolumeMounts:
|
||||
- name: prometheus-wal-tmpfs
|
||||
mountPath: /data/wal
|
||||
|
|
@@ -1017,12 +1017,12 @@ serverFiles:
|
|||
- name: K8s Health
|
||||
rules:
|
||||
- alert: PodCrashLooping
|
||||
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
|
||||
for: 15m
|
||||
expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} > 0 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.namespace }}/{{ $labels.pod }}: {{ $value | printf \"%.0f\" }} restarts in 1h"
|
||||
summary: "{{ $labels.namespace }}/{{ $labels.pod }}: stuck in CrashLoopBackOff"
|
||||
- alert: ContainerOOMKilled
|
||||
expr: increase(container_oom_events_total{container!=""}[15m]) > 0 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
|
||||
for: 5m
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue