From cbd0f71a3bbc54ef75fc39136407885b15cbeaac Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Tue, 12 May 2026 09:31:46 +0000
Subject: [PATCH] monitoring: PodImagePullBackOff alert + 2 inhibitors + JobFailed for:2h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three improvements identified in the 7d alert-noise review:

A. New PodImagePullBackOff alert. `KubeletImagePullErrors` measures the
   node-level pull error rate, which doesn't catch a single pod stuck in
   ImagePullBackOff — council-complaints sat broken for ~10h on 2026-05-12
   without paging. The new rule fires per-pod after 30m.

B. Two new inhibit_rules:
   - PVFillingUp (95% used, critical) suppresses PVPredictedFull (linear
     projection, warning) on the same PVC. The pair was producing ~24h of
     redundant firing per 7d.
   - EmailRoundtripFailing (active probe failure) suppresses
     EmailRoundtripStale (derived signal: no successful roundtrip for
     >60min). Same outage windows, ~14.5h of duplicate firing per 7d.

C. JobFailed for: 30m → 2h. Most cronjobs run every 5–15min; the old
   30-minute window paged on the first failed iteration before the next
   run could recover. 2h means "still failing across at least two cron
   iterations" — much more actionable.

Verified live: rules loaded, inhibitors in alertmanager config,
PodImagePullBackOff is currently inactive (council-complaints
ImagePullBackOff actively detected — see separate fix).
---
 .../monitoring/prometheus_chart_values.tpl    | 41 +++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index fb39a942..ea0fe791 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -102,6 +102,21 @@ alertmanager:
           - alertname = HomeAssistantDown
         target_matchers:
           - alertname =~ "HomeAssistantCriticalSensorUnavailable|HomeAssistantMetricsMissing"
+      # PVFillingUp (95% used) is the immediate critical; PVPredictedFull
+      # (linear projection over 6h) is the leading indicator. When the disk
+      # is actually full, the prediction is redundant.
+      - source_matchers:
+          - alertname = PVFillingUp
+        target_matchers:
+          - alertname = PVPredictedFull
+        equal: [namespace, persistentvolumeclaim]
+      # EmailRoundtripFailing = active outage right now (probe failed).
+      # EmailRoundtripStale = derivative ("haven't seen success in 60min").
+      # The Failing alert subsumes the Stale alert.
+      - source_matchers:
+          - alertname = EmailRoundtripFailing
+        target_matchers:
+          - alertname = EmailRoundtripStale
       # Power outage makes on-battery alert redundant
       - source_matchers:
           - alertname = PowerOutage
@@ -1216,16 +1231,38 @@ serverFiles:
               severity: warning
             annotations:
               summary: "{{ $labels.node }}: {{ $labels.condition }} active"
+          # Fires when a Job that started within the last 3h has reported
+          # failed pods continuously for 2h. The start-time window in `expr`
+          # must stay longer than `for`: the expression only returns a
+          # series while the Job is inside that window, so a 1h window
+          # combined with `for: 2h` could never fire.
+          # NOTE(review): each CronJob run is a separate Job (own job_name),
+          # so this tracks one failed Job object, not successive iterations.
           - alert: JobFailed
             expr: |
               kube_job_status_failed > 0
               and on(namespace, job_name)
-              (time() - kube_job_status_start_time) < 3600
-            for: 30m
+              (time() - kube_job_status_start_time) < 10800
+            for: 2h
             labels:
               severity: warning
             annotations:
               summary: "Job {{ $labels.namespace }}/{{ $labels.job_name }}: {{ $value | printf \"%.0f\" }} failure(s)"
+          # `KubeletImagePullErrors` measures node-level pull-error rate,
+          # which is too coarse to catch one pod stuck in ImagePullBackOff.
+          # Council-complaints sat in ImagePullBackOff for ~10h on 2026-05-12
+          # without paging because the rate stayed below threshold.
+          - alert: PodImagePullBackOff
+            expr: |
+              sum by (namespace, pod, container) (
+                kube_pod_container_status_waiting_reason{reason=~"ImagePullBackOff|ErrImagePull|InvalidImageName"}
+              ) > 0
+            for: 30m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) cannot pull image"
+              description: "Check the deployment's image reference — often a stale tag, a removed registry, or a credentials mismatch. `kubectl -n {{ $labels.namespace }} describe pod {{ $labels.pod }}` shows the pull error."
       - name: Infrastructure Health
         rules:
           - alert: HomeAssistantDown