From 165bb7258e3b96b3c29fb3456a791e97a822bcf7 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 12 May 2026 08:23:18 +0000 Subject: [PATCH] monitoring: detect stale Traefik replicas + reduce alert-storm cascading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two new alertmanager inhibit rules and one new Prometheus alert, informed by the 2026-05-12 incident where Traefik pod traefik-db7696fbf-k42wp came back after a SIGTERM with only 6 routers vs 119 on healthy peers (stale K8s informer cache) and served 404 for ~1/3 of viktorbarzin.me traffic. * New alert TraefikReplicaConfigStale: fires when max/min reload-rate ratio across Traefik pods exceeds 5x for 2h. The 2h window + 2h for-clause tolerates legitimate post-restart ramp-up; the bug pattern persists indefinitely. * New inhibit: TraefikReplicaConfigStale suppresses the symptom alerts (HighService{Error,4xx,Latency}, IngressTTFB{High,Critical}, IngressErrorRate5xxHigh, TraefikHighOpenConnections, ForwardAuthFallbackActive, AnubisChallengeStoreErrors, ExternalAccessDivergence) so only the actionable root cause pages. * New inhibit: HomeAssistantDown suppresses HomeAssistantCriticalSensorUnavailable and HomeAssistantMetricsMissing — when HA itself is down, every sensor going unavailable is noise (10x firings observed in the last 12h). * Extend NodeDown and NFSServerUnresponsive target lists to also suppress HomeAssistantCriticalSensorUnavailable. --- .../monitoring/prometheus_chart_values.tpl | 43 ++++++++++++++++++- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index beb7c7f4..ef4ac851 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -73,12 +73,12 @@ alertmanager: - source_matchers: - alertname = NodeDown target_matchers: - - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|RedisDown|HeadscaleDown|HeadscaleReplicasMismatch|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing" + - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|RedisDown|HeadscaleDown|HeadscaleReplicasMismatch|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|HomeAssistantCriticalSensorUnavailable|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing" # NFS down causes mass pod failures and NFS-dependent service outages - source_matchers: - alertname = NFSServerUnresponsive target_matchers: - - alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|HomeAssistantDown" + - alertname =~ "PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|PostgreSQLDown|RedisDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|HomeAssistantDown|HomeAssistantCriticalSensorUnavailable" # Traefik down makes service-level alerts noise - source_matchers: - alertname = TraefikDown @@ -89,6 +89,19 @@ alertmanager: - alertname = TraefikDown target_matchers: - alertname =~ "PoisonFountainDown|ForwardAuthFallbackActive" + # A stale Traefik replica returns 404 for a fraction of requests; the same + # bug surfaces as TTFB / 4xx / 5xx / external-divergence symptoms downstream. + # When TraefikReplicaConfigStale fires, the root cause is identified — + # suppress the symptom alerts so only the actionable one pages. + - source_matchers: + - alertname = TraefikReplicaConfigStale + target_matchers: + - alertname =~ "HighServiceErrorRate|HighService4xxRate|HighServiceLatency|TraefikHighOpenConnections|IngressTTFBHigh|IngressTTFBCritical|IngressErrorRate5xxHigh|ForwardAuthFallbackActive|AnubisChallengeStoreErrors|ExternalAccessDivergence" + # HA down → every sensor goes unavailable. One root-cause alert is enough. + - source_matchers: + - alertname = HomeAssistantDown + target_matchers: + - alertname =~ "HomeAssistantCriticalSensorUnavailable|HomeAssistantMetricsMissing" # Power outage makes on-battery alert redundant - source_matchers: - alertname = PowerOutage @@ -1941,6 +1954,32 @@ serverFiles: severity: critical annotations: summary: "Traefik pod {{ $labels.instance }} is down" + # Detects a Traefik replica whose K8s Ingress informer cache has gone + # stale — pod reloads at a fraction of the rate of peers, returns 404 + # for ingresses it never re-listed. Pattern observed 2026-05-12 when + # pod traefik-db7696fbf-k42wp came back after a SIGTERM with only 6 + # routers vs 119 on healthy peers (11 vs 145 config reloads). Symptom: + # ~33% of requests to viktorbarzin.me hosts returned a Go-style + # "404 page not found" depending on which replica kube-proxy picked. + # Remediation: `kubectl -n traefik delete pod ` — the + # Deployment recreates it with a fresh informer cache. PDB + # minAvailable=2 keeps the other two replicas serving. + # Window 2h + for 2h tolerates pod restarts (rate normalizes within + # an hour); the bug pattern persists indefinitely until restart. + - alert: TraefikReplicaConfigStale + expr: | + ( + max(rate(traefik_config_reloads_total[2h])) + / + clamp_min(min(rate(traefik_config_reloads_total[2h])), 0.0001) + ) > 5 + and max(rate(traefik_config_reloads_total[2h])) > 0.001 + for: 2h + labels: + severity: warning + annotations: + summary: "Traefik replica config divergence: max/min reload rate = {{ $value | printf \"%.1f\" }}x" + description: "One Traefik replica is reloading config much less than its peers — likely a stale K8s informer cache returning 404 for ingresses. Identify the stale pod by comparing `traefik_config_reloads_total` across pods and delete it: `kubectl -n traefik delete pod `." - alert: HighServiceErrorRate expr: | (