monitoring: alert hygiene — disambiguate, rename, tune, fix inhibits
- HighPowerUsage: add subsystem:gpu (line 724) and subsystem:r730 (line 775) labels so the two same-named alerts are distinguishable in routing.
- HeadscaleDown (deployment-replicas flavor, line 1414): renamed to HeadscaleReplicasMismatch. The rule at line 2039 keeps the name HeadscaleDown as the real up-metric critical check. The NodeDown inhibit rule is updated to suppress the renamed alert too.
- EmailRoundtripStale (line 1816): `for` raised from 10m to 20m. The alert now survives one missed 20-min probe cycle before firing, which cuts flapping (12 short-burst fires over the last 24h).

ATSOverload tuning skipped: its 24h fire-count is 0 — the alert is continuously firing, not flapping (already-known sustained 83% ATS load), so tuning would not change behavior.

8 backup *NeverSucceeded rules audited: the 7 that use kube_cronjob_status_last_successful_time all target real K8s CronJobs with active metrics (not Pushgateway-sourced). PrometheusBackupNeverRun already uses absent() correctly. No fixes needed.
This commit is contained in:
parent
ac695dea38
commit
9b4970da61
1 changed file with 5 additions and 3 deletions
|
|
@ -73,7 +73,7 @@ alertmanager:
|
||||||
- source_matchers:
|
- source_matchers:
|
||||||
- alertname = NodeDown
|
- alertname = NodeDown
|
||||||
target_matchers:
|
target_matchers:
|
||||||
- alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing"
|
- alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|RedisDown|HeadscaleDown|HeadscaleReplicasMismatch|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing"
|
||||||
# NFS down causes mass pod failures and NFS-dependent service outages
|
# NFS down causes mass pod failures and NFS-dependent service outages
|
||||||
- source_matchers:
|
- source_matchers:
|
||||||
- alertname = NFSServerUnresponsive
|
- alertname = NFSServerUnresponsive
|
||||||
|
|
@ -726,6 +726,7 @@ serverFiles:
|
||||||
for: 30m
|
for: 30m
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
|
subsystem: gpu
|
||||||
annotations:
|
annotations:
|
||||||
summary: "GPU power: {{ $value | printf \"%.0f\" }}W (threshold: 50W)"
|
summary: "GPU power: {{ $value | printf \"%.0f\" }}W (threshold: 50W)"
|
||||||
- alert: HighUtilization
|
- alert: HighUtilization
|
||||||
|
|
@ -777,6 +778,7 @@ serverFiles:
|
||||||
for: 60m
|
for: 60m
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
|
subsystem: r730
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Server power: {{ $value | printf \"%.0f\" }}W (threshold: 300W)"
|
summary: "Server power: {{ $value | printf \"%.0f\" }}W (threshold: 300W)"
|
||||||
- alert: UsingInverterEnergyForTooLong
|
- alert: UsingInverterEnergyForTooLong
|
||||||
|
|
@ -1411,7 +1413,7 @@ serverFiles:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Redis master {{ $labels.pod }} has only {{ $value }} connected replicas (expected 2)"
|
summary: "Redis master {{ $labels.pod }} has only {{ $value }} connected replicas (expected 2)"
|
||||||
- alert: HeadscaleDown
|
- alert: HeadscaleReplicasMismatch
|
||||||
expr: (kube_deployment_status_replicas_available{namespace="headscale"} or on() vector(0)) < 1
|
expr: (kube_deployment_status_replicas_available{namespace="headscale"} or on() vector(0)) < 1
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
|
|
@ -1815,7 +1817,7 @@ serverFiles:
|
||||||
summary: "Email round-trip probe failing. Check MX DNS, Postfix, Mailgun API, and IMAP."
|
summary: "Email round-trip probe failing. Check MX DNS, Postfix, Mailgun API, and IMAP."
|
||||||
- alert: EmailRoundtripStale
|
- alert: EmailRoundtripStale
|
||||||
expr: (time() - email_roundtrip_last_success_timestamp{job="email-roundtrip-monitor"}) > 3600
|
expr: (time() - email_roundtrip_last_success_timestamp{job="email-roundtrip-monitor"}) > 3600
|
||||||
for: 10m
|
for: 20m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue