From 8bea55266417d0215d1ca2f228ae0a46078ba48d Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Wed, 11 Feb 2026 23:24:46 +0000 Subject: [PATCH] [ci skip] Add HomeAssistantDown alert for ha-sofia Fires after 5m if the haos Prometheus scrape target is unreachable. Covers the HTTP API endpoint which shares the same process as the WebSocket API used by the mobile app. --- modules/kubernetes/monitoring/prometheus_chart_values.tpl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/modules/kubernetes/monitoring/prometheus_chart_values.tpl b/modules/kubernetes/monitoring/prometheus_chart_values.tpl index 8b401bd4..dd64a5dd 100755 --- a/modules/kubernetes/monitoring/prometheus_chart_values.tpl +++ b/modules/kubernetes/monitoring/prometheus_chart_values.tpl @@ -316,6 +316,13 @@ serverFiles: summary: "Job {{ $labels.namespace }}/{{ $labels.job_name }}: {{ $value | printf \"%.0f\" }} failure(s)" - name: Infrastructure Health rules: + - alert: HomeAssistantDown + expr: up{job="haos"} == 0 + for: 5m + labels: + severity: page + annotations: + summary: "Home Assistant down: {{ $labels.instance }}" - alert: CoreDNSErrors expr: rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]) > 1 for: 10m