From 304f0de43aba9d714f7aad3de122d0de808ce083 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Mon, 23 Mar 2026 22:24:17 +0200
Subject: [PATCH] add Metric Staleness alerts for UPS, iDRAC, ATS, and HA metrics

Replace fragile NoiDRACData alert with proper absent() checks. Add
UPSMetricsMissing (critical), iDRACRedfishMetricsMissing,
iDRACSNMPMetricsMissing, ATSMetricsMissing, and
HomeAssistantMetricsMissing alerts. Update PowerOutage and NodeDown
inhibit rules to suppress staleness alerts during outages.
---
Review notes (this section is not part of the commit message):
* HomeAssistantMetricsMissing uses absent(up{job="haos"} == 1) instead of
  absent(up{job="haos"}). Prometheus records up=0 for a configured target
  whose scrape fails, so the bare selector only goes absent when the target
  leaves the scrape configuration; filtering on == 1 also fires when the
  target exists but is not being scraped successfully.
* Line breaks and YAML indentation were reconstructed from a
  whitespace-collapsed copy of this patch. Verify the indentation of the
  context lines against the target file before applying.

 .../monitoring/prometheus_chart_values.tpl    | 48 +++++++++++++++----
 1 file changed, 39 insertions(+), 9 deletions(-)

diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index d114f75d..7a7e9edb 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -72,7 +72,7 @@ alertmanager:
       - source_matchers:
           - alertname = NodeDown
         target_matchers:
-          - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|MySQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown"
+          - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|MySQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing"
       # NFS down causes mass pod failures and NFS-dependent service outages
       - source_matchers:
           - alertname = NFSServerUnresponsive
@@ -97,7 +97,7 @@ alertmanager:
       - source_matchers:
           - alertname = PowerOutage
         target_matchers:
-          - alertname =~ "NodeDown|NFSServerUnresponsive|NodeExporterDown|CloudflaredDown|MetalLBSpeakerDown|MetalLBControllerDown"
+          - alertname =~ "NodeDown|NFSServerUnresponsive|NodeExporterDown|CloudflaredDown|MetalLBSpeakerDown|MetalLBControllerDown|UPSMetricsMissing|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|ATSMetricsMissing|HomeAssistantMetricsMissing"
       # Containerd broken suppresses downstream pod alerts
       - source_matchers:
           - alertname = KubeletImagePullErrors
@@ -337,13 +337,6 @@ serverFiles:
               severity: info
             annotations:
               summary: "HDD write rate: {{ $value | printf \"%.1f\" }} MB/s (threshold: 10 MB/s)"
-          - alert: NoiDRACData
-            expr: (max(r730_idrac_idrac_system_health + 1) or on() vector(0)) == 0
-            for: 30m
-            labels:
-              severity: info
-            annotations:
-              summary: "No iDRAC data for 30m - check Prometheus scraping"
           - alert: HighSystemLoad
             expr: scalar(node_load1{instance="pve-node-r730"}) * 100 / count(count(node_cpu_seconds_total{instance="pve-node-r730"}) by (cpu)) > 50
             for: 30m
@@ -432,6 +425,43 @@ serverFiles:
               severity: info
             annotations:
               summary: "On inverter for >24h - check grid switchover"
+      - name: Metric Staleness
+        rules:
+          - alert: UPSMetricsMissing
+            expr: absent(ups_upsInputVoltage)
+            for: 10m
+            labels:
+              severity: critical
+            annotations:
+              summary: "UPS metrics missing for 10m - check SNMP exporter and ups.viktorbarzin.lan"
+          - alert: iDRACRedfishMetricsMissing
+            expr: absent(r730_idrac_idrac_power_supply_input_voltage)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "iDRAC Redfish metrics missing for 10m - check idrac-redfish-exporter pod"
+          - alert: iDRACSNMPMetricsMissing
+            expr: absent(r730_idrac_idrac_system_health)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "iDRAC SNMP metrics missing for 10m - check SNMP exporter and idrac.viktorbarzin.lan"
+          - alert: ATSMetricsMissing
+            expr: absent(automatic_transfer_switch_power_mode)
+            for: 15m
+            labels:
+              severity: warning
+            annotations:
+              summary: "ATS metrics missing for 15m - check tuya-bridge pod"
+          - alert: HomeAssistantMetricsMissing
+            expr: absent(up{job="haos"} == 1)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Home Assistant (ha-sofia) metrics missing for 10m - check HA Prometheus integration"
       - name: Storage
         rules:
           - alert: NodeFilesystemFull