From cd5261161bdfd9e6a6cb17aeed5e2639f0afd03b Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Wed, 11 Feb 2026 23:24:46 +0000
Subject: [PATCH] [ci skip] Add HomeAssistantDown alert for ha-sofia

Fires after 5m if the haos Prometheus scrape target is unreachable.
Covers the HTTP API endpoint which shares the same process as the
WebSocket API used by the mobile app.
---
Notes (ignored by `git am`; not part of the commit):
  * NOTE(review): line breaks and YAML indentation in this patch were
    reconstructed from a whitespace-collapsed copy. The nesting below
    assumes the usual serverFiles -> rules file -> groups -> rules
    layout; confirm against the target file before applying.
  * NOTE(review): `up{job="haos"} == 0` yields no samples when the `up`
    series for the job is absent altogether, so the alert cannot fire in
    that case. Consider whether an `absent(up{job="haos"})` companion
    rule is wanted.

 modules/kubernetes/monitoring/prometheus_chart_values.tpl | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/modules/kubernetes/monitoring/prometheus_chart_values.tpl b/modules/kubernetes/monitoring/prometheus_chart_values.tpl
index 8b401bd4..dd64a5dd 100755
--- a/modules/kubernetes/monitoring/prometheus_chart_values.tpl
+++ b/modules/kubernetes/monitoring/prometheus_chart_values.tpl
@@ -316,6 +316,13 @@ serverFiles:
               summary: "Job {{ $labels.namespace }}/{{ $labels.job_name }}: {{ $value | printf \"%.0f\" }} failure(s)"
       - name: Infrastructure Health
         rules:
+          - alert: HomeAssistantDown
+            expr: up{job="haos"} == 0
+            for: 5m
+            labels:
+              severity: page
+            annotations:
+              summary: "Home Assistant down: {{ $labels.instance }}"
           - alert: CoreDNSErrors
             expr: rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]) > 1
             for: 10m