From 503ac4c192be1e206c6e54da8d5cf8be4ebc032d Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 23 May 2026 09:28:53 +0000 Subject: [PATCH] monitoring: tune 4 alerts for transient drain/upgrade blips Today's worker-phase rolling upgrade tripped MysqlStandaloneDown, MetalLBSpeakerDown, KubeletRunningContainersDrop, and IngressErrorRate5xxHigh even though every affected workload recovered within 30-60s. Retune `for:` (and one threshold) on each so they only fire on persistent faults, not on routine drain + kubelet-restart cycles. - MysqlStandaloneDown: for 2m -> 3m (single-replica StatefulSet, drain re-scheduling routinely takes 1-3m). - MetalLBSpeakerDown: for 5m -> 2m (kubelet restart drops the speaker pod for 30-45s; 2m still suppresses that blip, but note it is shorter than the previous 5m, so this one alert becomes more sensitive rather than looser). - KubeletRunningContainersDrop: absolute `< -10` threshold replaced with relative `< -0.5` (>50% drop vs. 10m ago); node drains routinely shed 10-30 containers and tripped the old rule. - IngressErrorRate5xxHigh: for 5m -> 10m (rolling pod migrations cause brief 5xx spikes that clear in 1-2m). Severity, labels, and annotation structure preserved; only `for:` durations, the one expression, and the two summary strings that quote them changed. Tactical retuning of four specific alerts -- broader observability audit tracked separately in beads. 
Co-Authored-By: Claude Opus 4.7 --- .../monitoring/prometheus_chart_values.tpl | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 4bb3c92b..59a6d1a9 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -1870,12 +1870,21 @@ serverFiles: annotations: summary: "Kubelet {{ $labels.operation_type }} p99: {{ $value | printf \"%.0f\" }}s on {{ $labels.instance }} (threshold: 30s)" - alert: KubeletRunningContainersDrop - expr: (kubelet_running_containers{container_state="running"} - kubelet_running_containers{container_state="running"} offset 10m) < -10 + # Relative >50% drop vs. 10m ago, sustained for 5m. + # Absolute-count threshold removed 2026-05-18: node drains + # routinely drop 10-30 containers and tripped the old `< -10` + # rule; only a >50% drop that persists 5m+ indicates a real + # node-level fault (kubelet hang, runtime crash, mass eviction). NOTE(review): a full drain may itself shed >50% of one node's containers -- confirm this rule no longer fires on drains. 
+ expr: | + ( + (kubelet_running_containers{container_state="running"} - kubelet_running_containers{container_state="running"} offset 10m) + / kubelet_running_containers{container_state="running"} offset 10m + ) < -0.5 for: 5m labels: severity: critical annotations: - summary: "Running containers on {{ $labels.instance }} dropped by {{ $value | printf \"%.0f\" }} in 10m" + summary: "Running containers on {{ $labels.instance }} dropped >50% in 10m ({{ $value | printf \"%.2f\" }} ratio)" - alert: CalicoNodeNotReady expr: kube_daemonset_status_number_ready{namespace="calico-system", daemonset="calico-node"} < kube_daemonset_status_desired_number_scheduled{namespace="calico-system", daemonset="calico-node"} for: 5m @@ -1934,8 +1943,11 @@ serverFiles: annotations: summary: "Node {{ $labels.node }} kubelet started {{ $value | humanizeDuration }} ago — 1h settle window halts further reboots" - alert: MysqlStandaloneDown + # Single-replica StatefulSet: drain re-scheduling routinely + # takes 1-3 min during k8s upgrades. 3m suppresses those blips; + # real outages persist longer. `for:` raised from 2m to 3m on 2026-05-18. NOTE(review): 3m sits at the top of the quoted 1-3 min range -- confirm the margin is enough. expr: kube_statefulset_status_replicas_ready{statefulset="mysql-standalone"} < 1 - for: 2m + for: 3m labels: severity: critical annotations: @@ -2178,6 +2190,9 @@ serverFiles: annotations: summary: "Critically slow ingress on {{ $labels.service }}: avg latency {{ $value | printf \"%.2f\" }}s (threshold: 3s for 5m)" - alert: IngressErrorRate5xxHigh + # Rolling upgrades / pod migrations cause brief 5xx spikes that + # clear within 1-2 min. Only persistent 5xx indicates a real + # problem. `for:` raised from 5m to 10m on 2026-05-18. 
expr: | ( sum(rate(traefik_service_requests_total{code=~"5..", service!~".*nextcloud.*"}[5m])) by (service) @@ -2186,11 +2201,11 @@ serverFiles: ) > 5 and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service) > 0.1 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 - for: 5m + for: 10m labels: severity: critical annotations: - summary: "5xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 5% for 5m)" + summary: "5xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 5% for 10m)" - alert: AnubisChallengeStoreErrors # Anubis exposes only Go-runtime metrics on :9090 (no anubis_* / # challenge_* counters), so we proxy via Traefik 5xx on services @@ -2227,12 +2242,16 @@ serverFiles: annotations: summary: "Cloudflared: {{ $value | printf \"%.0f\" }} replica(s) unavailable" - alert: MetalLBSpeakerDown + # kubelet restart during k8s upgrade briefly takes the speaker + # pod down; typical recovery is 30-45s. 2m suppresses those + # transient blips while still catching genuine failures. + # `for:` lowered from 5m to 2m on 2026-05-18. NOTE(review): a shorter `for:` makes this alert fire sooner than before -- confirm the tightening is intended. expr: | ( kube_daemonset_status_desired_number_scheduled{namespace="metallb-system", daemonset="metallb-speaker"} - on(namespace, daemonset) kube_daemonset_status_number_ready{namespace="metallb-system", daemonset="metallb-speaker"} ) > 0 - for: 5m + for: 2m labels: severity: critical annotations: