healthcheck: tune noise filters + nvidia-exporter auth=none

Six tuning changes to cluster_healthcheck.sh so PASS sections actually reflect "nothing to act on": 1. prometheus_alerts: only count severity=warning|critical. Info-level alerts (RecentNodeReboot soak, PVAutoExpanding) are by design — the alert rule itself sets severity; the script should respect it. 2. tls_certs: lower WARN threshold 30d → 14d. cnpg-webhook-cert auto-rotates at 7d before expiry, kyverno tls pairs at 15d, the Let's Encrypt wildcard renews weekly; <14d is the only window where human attention is genuinely useful. 3. ha_entities: skip mobile_app/device_tracker/notify/button/scene/event/image/update domains (transient by design), skip friendly names containing iphone/ipad/macbook/tv/bravia/laptop/etc., and only count entities whose last_changed is more than 24h old. Was 431/1470, most of which were "phone in standby" noise. 4. ha_automations: only flag DISABLED automations as abandoned if they've also been untouched (last_changed) for >180 days; raise stale threshold 30d → 180d. Was flagging seasonal/holiday-only automations as broken. 5. problematic_pods + evicted_pods: exclude pods owned by Jobs. CronJob retry leftovers (Error/Failed phase pods that K8s keeps around for log inspection) aren't problematic at the cluster level. 6. uptime_kuma: retry the WebSocket login 3x with backoff. Single-shot failures were a recurring false-positive even though the service was healthy. Also: nvidia-exporter ingress auth=required → auth=none. HA Sofia's nvidia REST sensors (Tesla_T4_GPU_Temperature, Power_Usage, etc.) poll /metrics and got 302'd to Authentik like the idrac/snmp ones did. Same fix. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-10 22:26:22 +00:00 · 2026-05-10 22:26:22 +00:00 · 2f0e8c88a9
commit 2f0e8c88a9
parent 7d7c7e4b7f
2 changed files with 158 additions and 24 deletions
--- a/scripts/cluster_healthcheck.sh
+++ b/scripts/cluster_healthcheck.sh
@ -196,6 +196,19 @@ check_pods() {
    section 4 "Problematic Pods"
    local bad count detail="" status="PASS"

+    # Skip pods owned by Jobs (which are owned by CronJobs): a failed CronJob
+    # retry is replaced by the next fire. Emits one "namespace name" line per pod.
+    # The Python is inside shell single quotes, so it must not use \" escapes.
+    local job_owned_pods
+    job_owned_pods=$($KUBECTL get pods -A -o json 2>/dev/null | python3 -c '
+import json, sys
+d = json.load(sys.stdin)
+for p in d["items"]:
+    meta = p["metadata"]
+    if any(o.get("kind") == "Job" for o in meta.get("ownerReferences", [])):
+        print(meta["namespace"], meta["name"])  # "ns name", the key format the awk filter expects
+' 2>/dev/null || true)
+
    bad=$( {
        $KUBECTL get pods -A --no-headers --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null \
            | grep -E 'CrashLoopBackOff|Error|Pending|Init:|ImagePullBackOff|ErrImagePull' || true
@ -203,6 +216,14 @@ check_pods() {
            | grep -E 'CrashLoopBackOff|ImagePullBackOff|ErrImagePull' || true
    } | awk '!seen[$1,$2]++' | sed '/^$/d') || true

+    # Filter out Job-owned pods (list passed via ENVIRON: BSD awk rejects newlines in -v values)
+    if [[ -n "$job_owned_pods" && -n "$bad" ]]; then
+        bad=$(printf '%s\n' "$bad" | JOB_OWNED_PODS="$job_owned_pods" awk '
+            BEGIN { n = split(ENVIRON["JOB_OWNED_PODS"], lines, "\n"); for (i = 1; i <= n; i++) skip[lines[i]] = 1 }
+            { key = $1 " " $2; if (!(key in skip)) print }
+        ')
+    fi
+
    count=$(count_lines "$bad")

    if [[ "$count" -eq 0 ]]; then
@ -229,7 +250,21 @@ check_evicted() {
    section 5 "Evicted/Failed Pods"
    local evicted count detail="" status="PASS"

-    evicted=$($KUBECTL get pods -A --no-headers --field-selector=status.phase=Failed 2>/dev/null || true)
+    # Exclude pods owned by Jobs — CronJob retries that K8s keeps for log
+    # inspection are not "evicted" in the cluster-health sense. Emits
+    # ns<TAB>name<TAB>reason. Python is in shell single quotes: no \" escapes.
+    evicted=$($KUBECTL get pods -A -o json --field-selector=status.phase=Failed 2>/dev/null | python3 -c '
+import json, sys
+try:
+    d = json.load(sys.stdin)
+except Exception:
+    sys.exit(0)
+for p in d.get("items", []):
+    meta = p["metadata"]
+    if any(o.get("kind") == "Job" for o in meta.get("ownerReferences", [])):
+        continue
+    print(meta["namespace"], meta["name"], p.get("status", {}).get("reason", ""), sep="\t")
+' 2>/dev/null || true)
    count=$(count_lines "$evicted")

    if [[ "$count" -eq 0 ]]; then
@ -540,18 +575,25 @@ check_alerts() {
        return 0
    fi

+    # Only count warning + critical alerts. Info-level alerts (RecentNodeReboot,
+    # PVAutoExpanding, etc.) are informational by design and shouldn't be
+    # treated as a script-level WARN — the alert rules themselves already
+    # encode the severity.
    firing_count=$(echo "$alerts" | python3 -c '
 import json, sys
+ACTIONABLE = {"warning", "critical"}
+def actionable(labels):
+    return labels.get("severity", "info").lower() in ACTIONABLE
 try:
    data = json.load(sys.stdin)
    if isinstance(data, list):
-        active = [a for a in data if a.get("status", {}).get("state") == "active"]
+        active = [a for a in data if a.get("status", {}).get("state") == "active" and actionable(a.get("labels", {}))]
        count = len(active)
        names = [a.get("labels", {}).get("alertname", "?") for a in active]
        print(f"{count}:" + ",".join(names) if count > 0 else "0:")
    elif isinstance(data, dict) and "data" in data:
        alerts_list = data["data"].get("alerts", [])
-        firing = [a for a in alerts_list if a.get("state") == "firing"]
+        firing = [a for a in alerts_list if a.get("state") == "firing" and actionable(a.get("labels", {}))]
        count = len(firing)
        names = [a.get("labels", {}).get("alertname", "?") for a in firing]
        print(f"{count}:" + ",".join(names) if count > 0 else "0:")
@ -600,16 +642,35 @@ check_uptime_kuma() {
    fi

    result=$(UPTIME_KUMA_PASSWORD="$uk_pass" ~/.venvs/claude/bin/python3 -c '
-import sys, os
+import sys, os, time
 try:
    from uptime_kuma_api import UptimeKumaApi
 except ImportError:
    print("ERROR:uptime-kuma-api not installed")
    sys.exit(0)

+# The uptime-kuma WebSocket login is intermittently flaky — single-shot
+# failures showed up repeatedly in healthchecks even though the service
+# was healthy. Retry up to MAX_ATTEMPTS times, backing off 2s then 4s
+# between attempts, before declaring connect failure.
+MAX_ATTEMPTS = 3
+api, last_exc = None, None
+for attempt in range(1, MAX_ATTEMPTS + 1):
+    try:
+        api = UptimeKumaApi("https://uptime.viktorbarzin.me", timeout=120, wait_events=0.2)
+        api.login("admin", os.environ["UPTIME_KUMA_PASSWORD"])
+        break
+    except Exception as e:
+        last_exc = e
+        try: api.disconnect()  # api is None if the constructor raised; that error is swallowed below
+        except Exception: pass
+        api = None
+        if attempt < MAX_ATTEMPTS: time.sleep(2 * attempt)  # no pointless wait after the final attempt
+if api is None:
+    print(f"CONN_ERROR:{last_exc}")
+    sys.exit(0)
+
 try:
-    api = UptimeKumaApi("https://uptime.viktorbarzin.me", timeout=120, wait_events=0.2)
-    api.login("admin", os.environ["UPTIME_KUMA_PASSWORD"])

    monitors = api.get_monitors()
    heartbeats = api.get_heartbeats()
@ -1075,9 +1136,14 @@ for item in data.get("items", []):
                    expiry = datetime.strptime(date_str.strip(), "%b %d %H:%M:%S %Y %Z")
                    expiry = expiry.replace(tzinfo=timezone.utc)
                    days_left = (expiry - datetime.now(timezone.utc)).days
+                    # Threshold rationale (lowered from 30d):
+                    # - cnpg-webhook-cert: CNPG operator auto-rotates at 7d before expiry
+                    # - kyverno-*-tls-pair: Kyverno auto-rotates at 15d before expiry
+                    # - viktorbarzin.me Lets Encrypt wildcard: renewed weekly via Woodpecker
+                    # Anything at <=14d at check time is genuinely worth surfacing.
                    if days_left <= 7:
                        print(f"FAIL:{ns}/{name}:{days_left}d")
-                    elif days_left <= 30:
+                    elif days_left <= 14:
                        print(f"WARN:{ns}/{name}:{days_left}d")
                except ValueError:
                    pass
@ -1086,8 +1152,8 @@ for item in data.get("items", []):
 ' 2>/dev/null) || true

    if [[ -z "$cert_issues" ]]; then
-        pass "All TLS certificates valid for >30 days"
-        json_add "tls_certs" "PASS" "All valid >30d"
+        pass "All TLS certificates valid for >14 days"
+        json_add "tls_certs" "PASS" "All valid >14d"
    else
        [[ "$QUIET" == true ]] && section_always 22 "TLS Certificate Expiry"
        while IFS= read -r line; do
@ -1333,12 +1399,59 @@ check_ha_entities() {
    local result
    result=$(export HA_CACHE_DIR; python3 << 'PYEOF'
 import os, json
+from datetime import datetime, timezone, timedelta
+
+# Noise filter rationale:
+# * The HA "unavailable" state covers everything from "the iDRAC scrape failed
+#   30 seconds ago" to "this iPhone hasn't checked in in 6 hours" to
+#   "this YAML rest sensor has been broken for a week". Counting all of them
+#   produces 400+ alerts that are mostly expected (phones in standby, lights
+#   off, TVs idle).
+# * Three filters dramatically cut noise without hiding real outages:
+#     1. SKIP_DOMAINS — domains that go unavailable transiently by design
+#        (mobile_app on backgrounded apps, notify per-device, button/scene/
+#        event are momentary).
+#     2. STALE_HOURS — only count entities that have been unavailable for
+#        this long. A flapping integration that recovers in <24h is noise;
+#        one stuck for >24h is real.
+#     3. SKIP_DEVICE_HINTS — friendly-name substrings for things that come
+#        and go (laptops, phones, TVs, vacuums, washers).
+SKIP_DOMAINS = {"mobile_app", "device_tracker", "notify", "button", "scene",
+                "event", "image", "update"}
+SKIP_DEVICE_HINTS = ("iphone", "ipad", "macbook", "mac mini", "tv", "bravia",
+                     "playstation", "switch", "roomba", "vacuum", "rumi",
+                     "ipad", "laptop", "phone", "перална", "сушилня",
+                     "миялна", "laptop2")
+STALE_HOURS = 24

 cache = os.environ["HA_CACHE_DIR"]
 with open(f"{cache}/states.json") as f:
    states = json.load(f)

-unavail = [s for s in states if s.get("state") in ("unavailable", "unknown")]
+now = datetime.now(timezone.utc)
+threshold = now - timedelta(hours=STALE_HOURS)
+
+def is_stale(s):
+    if s.get("state") not in ("unavailable", "unknown"):
+        return False
+    domain = s["entity_id"].split(".")[0]
+    if domain in SKIP_DOMAINS:
+        return False
+    name = (s.get("attributes", {}).get("friendly_name") or "").lower()
+    if any(h in name for h in SKIP_DEVICE_HINTS):
+        return False
+    # last_changed = when the state last flipped. If it flipped to unavailable
+    # >24h ago and stayed there, the integration is genuinely broken.
+    lc = s.get("last_changed") or s.get("last_updated")
+    if not lc:
+        return True  # no timestamp = treat as old
+    try:
+        dt = datetime.fromisoformat(lc.replace("Z", "+00:00"))
+    except ValueError:
+        return True
+    return dt < threshold
+
+unavail = [s for s in states if is_stale(s)]
 domains = {}
 for s in unavail:
    d = s["entity_id"].split(".")[0]
@ -1497,24 +1610,42 @@ with open(f"{cache}/states.json") as f:

 autos = [s for s in states if s["entity_id"].startswith("automation.")]
 total = len(autos)
-disabled = [a["entity_id"] for a in autos if a["state"] == "off"]
-disabled_count = len(disabled)
+
+# Noise filter rationale (was: any disabled OR not-triggered-in-30d):
+# * "Disabled" alone is fine — Viktor disables automations intentionally
+#   (seasonal, holiday-only, paused). Only flag when ABANDONED, i.e. disabled
+#   with no state change (last_changed) for >180 days, or no usable last_changed.
+# * "Stale" alone is fine for low-frequency automations (annual reminders,
+#   manual triggers). Raise the bar to 180d (was 30d).
+DISABLED_STALE_DAYS = 180
+STALE_DAYS = 180

 now = datetime.now(timezone.utc)
+
+def days_since(ts):
+    if not ts:
+        return None
+    try:
+        return (now - datetime.fromisoformat(ts.replace("Z", "+00:00"))).days
+    except Exception:
+        return None
+
+disabled = []
 stale = []
 for a in autos:
+    lt_days = days_since(a.get("attributes", {}).get("last_triggered"))
+    changed_days = days_since(a.get("last_changed"))
    if a["state"] == "off":
-        continue
-    lt = a.get("attributes", {}).get("last_triggered")
-    if lt:
-        try:
-            t = datetime.fromisoformat(lt.replace("Z", "+00:00"))
-            days = (now - t).days
-            if days > 30:
-                stale.append(a["entity_id"] + "=" + str(days) + "d")
-        except:
-            pass
+        # Only flag a disabled automation if it has ALSO been untouched for
+        # the threshold — i.e. genuinely abandoned, not "paused for now".
+        # Use last_changed as a proxy for "user-touched recently".
+        if changed_days is None or changed_days > DISABLED_STALE_DAYS:
+            disabled.append(a["entity_id"])
+    else:
+        if lt_days is not None and lt_days > STALE_DAYS:
+            stale.append(f"{a['entity_id']}={lt_days}d")

+disabled_count = len(disabled)
 stale_count = len(stale)
 disabled_names = "; ".join(disabled)
 stale_names = "; ".join(stale[:10])
--- a/stacks/nvidia/modules/nvidia/main.tf
+++ b/stacks/nvidia/modules/nvidia/main.tf
@ -217,8 +217,11 @@ resource "kubernetes_service" "nvidia-exporter" {


 module "ingress" {
-  source                  = "../../../../modules/kubernetes/ingress_factory"
-  auth                    = "required"
+  source = "../../../../modules/kubernetes/ingress_factory"
+  # Auth disabled — HA Sofia REST sensors poll /metrics; the OIDC flow
+  # would 302 every request. Same pattern as idrac-redfish-exporter +
+  # snmp-exporter (commit 5c594291).
+  auth                    = "none"
  namespace               = kubernetes_namespace.nvidia.metadata[0].name
  name                    = "nvidia-exporter"
  root_domain             = "viktorbarzin.lan"