monitoring: tune 4 alerts for transient drain/upgrade blips
Today's worker-phase rolling upgrade tripped MysqlStandaloneDown, MetalLBSpeakerDown, KubeletRunningContainersDrop, and IngressErrorRate5xxHigh even though every affected workload recovered within 30-60s. Loosen `for:` (and one threshold) on each so they only fire on persistent faults, not on routine drain+kubelet- restart cycles. - MysqlStandaloneDown: for 2m -> 3m (single-replica StatefulSet, drain re-scheduling routinely takes 1-3m). - MetalLBSpeakerDown: for 5m -> 2m (kubelet restart drops the speaker pod for 30-45s; 2m suppresses that blip). - KubeletRunningContainersDrop: absolute `< -10` threshold replaced with relative `< -0.5` (>50% drop vs. 10m ago); routine drains routinely shed 10-30 containers and tripped the old rule. - IngressErrorRate5xxHigh: for 5m -> 10m (rolling pod migrations cause brief 5xx spikes that clear in 1-2m). Severity, labels, and annotation structure preserved; only `for:` durations and the one expression changed. Tactical loosening of four specific alerts -- broader observability audit tracked separately in beads. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
ad9f6c8f41
commit
503ac4c192
1 changed files with 25 additions and 6 deletions
|
|
@ -1870,12 +1870,21 @@ serverFiles:
|
|||
annotations:
|
||||
summary: "Kubelet {{ $labels.operation_type }} p99: {{ $value | printf \"%.0f\" }}s on {{ $labels.instance }} (threshold: 30s)"
|
||||
- alert: KubeletRunningContainersDrop
|
||||
expr: (kubelet_running_containers{container_state="running"} - kubelet_running_containers{container_state="running"} offset 10m) < -10
|
||||
# Relative >50% drop vs. 10m ago, sustained for 5m.
|
||||
# Absolute-count threshold removed 2026-05-18: routine drains
|
||||
# routinely drop 10-30 containers and tripped the old `< -10`
|
||||
# rule; only a >50% drop that persists 5m+ indicates a real
|
||||
# node-level fault (kubelet hang, runtime crash, mass eviction).
|
||||
expr: |
|
||||
(
|
||||
(kubelet_running_containers{container_state="running"} - kubelet_running_containers{container_state="running"} offset 10m)
|
||||
/ kubelet_running_containers{container_state="running"} offset 10m
|
||||
) < -0.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Running containers on {{ $labels.instance }} dropped by {{ $value | printf \"%.0f\" }} in 10m"
|
||||
summary: "Running containers on {{ $labels.instance }} dropped >50% in 10m ({{ $value | printf \"%.2f\" }} ratio)"
|
||||
- alert: CalicoNodeNotReady
|
||||
expr: kube_daemonset_status_number_ready{namespace="calico-system", daemonset="calico-node"} < kube_daemonset_status_desired_number_scheduled{namespace="calico-system", daemonset="calico-node"}
|
||||
for: 5m
|
||||
|
|
@ -1934,8 +1943,11 @@ serverFiles:
|
|||
annotations:
|
||||
summary: "Node {{ $labels.node }} kubelet started {{ $value | humanizeDuration }} ago — 1h settle window halts further reboots"
|
||||
- alert: MysqlStandaloneDown
|
||||
# Single-replica StatefulSet: brief drain re-scheduling routinely
|
||||
# takes 1-3 min during k8s upgrades. 3m suppresses those blips;
|
||||
# real outages persist longer. Raised from 2m on 2026-05-18.
|
||||
expr: kube_statefulset_status_replicas_ready{statefulset="mysql-standalone"} < 1
|
||||
for: 2m
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -2178,6 +2190,9 @@ serverFiles:
|
|||
annotations:
|
||||
summary: "Critically slow ingress on {{ $labels.service }}: avg latency {{ $value | printf \"%.2f\" }}s (threshold: 3s for 5m)"
|
||||
- alert: IngressErrorRate5xxHigh
|
||||
# Rolling upgrades / pod migrations cause brief 5xx spikes that
|
||||
# clear within 1-2 min. Only persistent 5xx indicates a real
|
||||
# problem. Raised from 5m to 10m on 2026-05-18.
|
||||
expr: |
|
||||
(
|
||||
sum(rate(traefik_service_requests_total{code=~"5..", service!~".*nextcloud.*"}[5m])) by (service)
|
||||
|
|
@ -2186,11 +2201,11 @@ serverFiles:
|
|||
) > 5
|
||||
and sum(rate(traefik_service_requests_total{service!~".*nextcloud.*"}[5m])) by (service) > 0.1
|
||||
and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
|
||||
for: 5m
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "5xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 5% for 5m)"
|
||||
summary: "5xx rate on {{ $labels.service }}: {{ $value | printf \"%.1f\" }}% (threshold: 5% for 10m)"
|
||||
- alert: AnubisChallengeStoreErrors
|
||||
# Anubis exposes only Go-runtime metrics on :9090 (no anubis_* /
|
||||
# challenge_* counters), so we proxy via Traefik 5xx on services
|
||||
|
|
@ -2227,12 +2242,16 @@ serverFiles:
|
|||
annotations:
|
||||
summary: "Cloudflared: {{ $value | printf \"%.0f\" }} replica(s) unavailable"
|
||||
- alert: MetalLBSpeakerDown
|
||||
# kubelet restart during k8s upgrade briefly takes the speaker
|
||||
# pod down; typical recovery is 30-45s. 2m suppresses those
|
||||
# transient blips while still catching genuine failures.
|
||||
# Adjusted from 5m on 2026-05-18.
|
||||
expr: |
|
||||
(
|
||||
kube_daemonset_status_desired_number_scheduled{namespace="metallb-system", daemonset="metallb-speaker"}
|
||||
- on(namespace, daemonset) kube_daemonset_status_number_ready{namespace="metallb-system", daemonset="metallb-speaker"}
|
||||
) > 0
|
||||
for: 5m
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue