add HA Sofia checks (26-29) to cluster healthcheck and backup-dr docs
- Healthcheck: add entity availability, integration health, automation status, and system resources checks for Home Assistant Sofia
- Docs: add backup-dr architecture documentation
This commit is contained in:
parent
b0178cf6d2
commit
72d832fee7
2 changed files with 447 additions and 1 deletions
|
|
@ -137,6 +137,7 @@ graph TB
|
||||||
| Redis Backup | Weekly Sunday 03:00, 30d | CronJob in `redis` | BGSAVE + copy |
|
| Redis Backup | Weekly Sunday 03:00, 30d | CronJob in `redis` | BGSAVE + copy |
|
||||||
| Prometheus Backup | Monthly 1st Sunday, 2 copies | CronJob in `monitoring` | TSDB snapshot → tar.gz |
|
| Prometheus Backup | Monthly 1st Sunday, 2 copies | CronJob in `monitoring` | TSDB snapshot → tar.gz |
|
||||||
| plotting-book Backup | Weekly Sunday 03:00, 30d | CronJob in `plotting-book` | sqlite3 .backup |
|
| plotting-book Backup | Weekly Sunday 03:00, 30d | CronJob in `plotting-book` | sqlite3 .backup |
|
||||||
|
| LVM Thin Snapshots | Twice daily (00:00, 12:00), 7d | PVE host: `lvm-pvc-snapshot` | CoW snapshots of 13 proxmox-lvm PVCs |
|
||||||
| Incremental Sync | Every 6h (cron) | TrueNAS: `/root/cloudsync-copy.sh` | ZFS diff → rclone copy |
|
| Incremental Sync | Every 6h (cron) | TrueNAS: `/root/cloudsync-copy.sh` | ZFS diff → rclone copy |
|
||||||
| Full Sync | Weekly Sunday 09:00 | TrueNAS Cloud Sync Task 1 | rclone sync with deletions |
|
| Full Sync | Weekly Sunday 09:00 | TrueNAS Cloud Sync Task 1 | rclone sync with deletions |
|
||||||
| CloudSync Monitor | Every 6h (cron) | CronJob in `monitoring` | Query TrueNAS API → Pushgateway |
|
| CloudSync Monitor | Every 6h (cron) | CronJob in `monitoring` | Query TrueNAS API → Pushgateway |
|
||||||
|
|
@ -170,6 +171,27 @@ zfs rollback main/<service>@auto-2026-03-23_00-00
|
||||||
zfs clone main/<service>@auto-2026-03-23_00-00 main/<service>-recovered
|
zfs clone main/<service>@auto-2026-03-23_00-00 main/<service>-recovered
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Layer 1b: LVM Thin Snapshots (Proxmox CSI PVCs)
|
||||||
|
|
||||||
|
Native LVM thin snapshots provide crash-consistent point-in-time recovery for all 13 Proxmox CSI PVCs (~340Gi). These are CoW snapshots — instant creation, minimal overhead, sharing the thin pool's free space.
|
||||||
|
|
||||||
|
**Script**: `/usr/local/bin/lvm-pvc-snapshot` on PVE host (source: `infra/scripts/lvm-pvc-snapshot`)
|
||||||
|
**Schedule**: Twice daily (00:00, 12:00) via systemd timer, 7-day retention (max 14 snapshots per LV)
|
||||||
|
**Discovery**: Auto-discovers PVC LVs matching `vm-*-pvc-*` pattern in VG `pve` thin pool `data`
|
||||||
|
|
||||||
|
**Coverage**: All proxmox-lvm PVCs **except** `dbaas` and `monitoring` namespaces. These are excluded because:
|
||||||
|
- MySQL InnoDB, PostgreSQL, and Prometheus are high-churn (50%+ CoW divergence/hour)
|
||||||
|
- They already have app-level dumps (Layer 2)
|
||||||
|
- Including them causes ~36% write amplification; excluding them reduces overhead to ~0%
|
||||||
|
|
||||||
|
Snapshotted PVCs include: Redis, Vaultwarden, Calibre, Nextcloud, Forgejo, FreshRSS, ActualBudget, NovelApp, Headscale, Uptime Kuma, etc. (~20 low-churn LVs)
|
||||||
|
|
||||||
|
**Exclusion config**: `EXCLUDE_NAMESPACES` variable in script (default: `dbaas,monitoring`). Uses kubectl to resolve LV names dynamically.
|
||||||
|
|
||||||
|
**Monitoring**: Pushes metrics to Pushgateway via NodePort (30091). Alerts: `LVMSnapshotStale` (>24h), `LVMSnapshotFailing`, `LVMThinPoolLow` (<15% free).
|
||||||
|
|
||||||
|
**Restore**: `lvm-pvc-snapshot restore <pvc-lv> <snapshot-lv>` — auto-discovers K8s workload, scales down, swaps LVs, scales back up. See `docs/runbooks/restore-lvm-snapshot.md`.
|
||||||
|
|
||||||
### Layer 2: Application-Level Backups
|
### Layer 2: Application-Level Backups
|
||||||
|
|
||||||
K8s CronJobs run inside the cluster, dumping database/state to NFS-exported backup directories. Each service writes to `/mnt/main/<service>-backup/`.
|
K8s CronJobs run inside the cluster, dumping database/state to NFS-exported backup directories. Each service writes to `/mnt/main/<service>-backup/`.
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@ JSON=false
|
||||||
KUBECONFIG_PATH="$(pwd)/config"
|
KUBECONFIG_PATH="$(pwd)/config"
|
||||||
KUBECTL=""
|
KUBECTL=""
|
||||||
JSON_RESULTS=()
|
JSON_RESULTS=()
|
||||||
TOTAL_CHECKS=25
|
TOTAL_CHECKS=29
|
||||||
|
|
||||||
# --- Helpers ---
|
# --- Helpers ---
|
||||||
info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
|
info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
|
||||||
|
|
@ -1206,6 +1206,426 @@ check_overcommit() {
|
||||||
json_add "overcommit" "$status" "$detail"
|
json_add "overcommit" "$status" "$detail"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# --- HA helpers ---
|
||||||
|
HA_CACHE_DIR=""
|
||||||
|
|
||||||
|
# Report whether the Home Assistant Sofia connection settings are present.
# Globals:  HOME_ASSISTANT_SOFIA_URL, HOME_ASSISTANT_SOFIA_TOKEN (read)
# Returns:  0 when both are set and non-empty, 1 otherwise
ha_sofia_available() {
  [[ -n "${HOME_ASSISTANT_SOFIA_URL:-}" && -n "${HOME_ASSISTANT_SOFIA_TOKEN:-}" ]]
}
|
||||||
|
|
||||||
|
# Fetch all HA data once and cache in temp files
|
||||||
|
# Fetch all HA Sofia API data once and cache it as JSON files.
#
# Globals:
#   HA_CACHE_DIR (read, written, exported) - cache directory; a non-empty
#     value means the fetch already ran and this call is a no-op
#   HOME_ASSISTANT_SOFIA_URL, HOME_ASSISTANT_SOFIA_TOKEN (read) - passed to the
#     Python helper through its environment
# Outputs (inside $HA_CACHE_DIR):
#   states.json   - GET /api/states
#   entries.json  - GET /api/config/config_entries/entry
#   config.json   - GET /api/config
#   errors.txt    - one "<name>:<error>" line per fetch that failed
# Side effects: installs an EXIT trap that removes the cache directory.
# Returns: 0 once the cache directory exists (individual fetch failures are
#   recorded in errors.txt, not signalled), 1 if mktemp fails.
ha_sofia_fetch_cache() {
  if [[ -n "${HA_CACHE_DIR:-}" ]]; then
    return 0
  fi

  HA_CACHE_DIR=$(mktemp -d) || { HA_CACHE_DIR=""; return 1; }
  export HA_CACHE_DIR
  # Single quotes: expand the path when the trap fires, and keep it quoted.
  trap '[[ -n "${HA_CACHE_DIR:-}" ]] && rm -rf -- "$HA_CACHE_DIR"' EXIT

  local fetch_rc=0
  # The env prefix hands the settings to python3 even if they are not exported.
  HOME_ASSISTANT_SOFIA_URL="${HOME_ASSISTANT_SOFIA_URL:-}" \
  HOME_ASSISTANT_SOFIA_TOKEN="${HOME_ASSISTANT_SOFIA_TOKEN:-}" \
    python3 <<'HA_FETCH_EOF' || fetch_rc=$?
import json
import os

url = os.environ["HOME_ASSISTANT_SOFIA_URL"].rstrip("/")
token = os.environ["HOME_ASSISTANT_SOFIA_TOKEN"]
cache = os.environ["HA_CACHE_DIR"]
headers = {"Authorization": f"Bearer {token}"}

# (cache file stem, API path, timeout in seconds)
endpoints = [
    ("states", "/api/states", 30),
    ("entries", "/api/config/config_entries/entry", 30),
    ("config", "/api/config", 10),
]

errors = []


def one_line(exc):
    # errors.txt is line-oriented; keep each message on a single line.
    return " ".join(str(exc).split())


try:
    import requests
except ImportError as e:
    requests = None
    errors = [f"{name}:{one_line(e)}" for name, _, _ in endpoints]

if requests is not None:
    for name, path, timeout in endpoints:
        try:
            resp = requests.get(f"{url}{path}", headers=headers, timeout=timeout)
            resp.raise_for_status()
            # Parse before opening the file so a non-JSON body never leaves an
            # empty cache file behind.
            data = resp.json()
            with open(os.path.join(cache, f"{name}.json"), "w") as f:
                json.dump(data, f)
        except Exception as e:
            errors.append(f"{name}:{one_line(e)}")

if errors:
    with open(os.path.join(cache, "errors.txt"), "w") as f:
        f.write("\n".join(errors))
HA_FETCH_EOF

  # The helper itself could not run (e.g. python3 missing): still leave a
  # reason behind for every endpoint.
  if (( fetch_rc != 0 )) && [[ ! -s "$HA_CACHE_DIR/errors.txt" ]]; then
    printf '%s:fetch helper exited with status %d\n' \
      states "$fetch_rc" entries "$fetch_rc" config "$fetch_rc" \
      > "$HA_CACHE_DIR/errors.txt"
  fi
  return 0
}
|
||||||
|
|
||||||
|
# --- 26. HA Entity Availability ---
|
||||||
|
# Check 26: count Home Assistant entities whose state is "unavailable" or
# "unknown" in the cached states.json.
# Globals:  HA_CACHE_DIR, QUIET, JSON (read)
# Outputs:  PASS when none are affected, WARN for 1-10, FAIL above 10. The
#           affected entity IDs are listed (first 20 on FAIL) unless JSON or
#           QUIET mode is on. Missing settings or API data is reported as WARN.
# Returns:  0 always; the outcome is reported via pass/warn/fail and json_add.
check_ha_entities() {
  local title="HA Sofia — Entity Availability"
  section 26 "$title"

  if ! ha_sofia_available; then
    warn "HA Sofia token not configured — skipping"
    json_add "ha_entities" "WARN" "Token not configured"
    return 0
  fi

  # A failed fetch is detected below through the missing cache file.
  ha_sofia_fetch_cache || true

  if [[ ! -f "$HA_CACHE_DIR/states.json" ]]; then
    local err=""
    if [[ -f "$HA_CACHE_DIR/errors.txt" ]]; then
      err=$(grep -m1 "^states:" "$HA_CACHE_DIR/errors.txt" || true)
    fi
    [[ "$QUIET" == true ]] && section_always 26 "$title"
    warn "HA Sofia API unreachable: ${err:-unknown error}"
    json_add "ha_entities" "WARN" "API unreachable"
    return 0
  fi

  # Python prints "<count>:<total>:<domain:n, ...>" on the first line, then one
  # "ENTITY:<entity_id>" line per affected entity.
  local result
  result=$(HA_CACHE_DIR="$HA_CACHE_DIR" python3 <<'PYEOF'
import json
import os

with open(os.path.join(os.environ["HA_CACHE_DIR"], "states.json")) as f:
    states = json.load(f)

unavail = [s for s in states if s.get("state") in ("unavailable", "unknown")]
domains = {}
for s in unavail:
    d = s.get("entity_id", "?").split(".")[0]
    domains[d] = domains.get(d, 0) + 1

summary = ", ".join(
    f"{d}:{n}" for d, n in sorted(domains.items(), key=lambda x: -x[1])
)
print(f"{len(unavail)}:{len(states)}:{summary}")
for s in unavail:
    print("ENTITY:" + s.get("entity_id", "?"))
PYEOF
  ) || result="ERROR:python execution failed"

  if [[ "$result" == "ERROR:"* ]]; then
    [[ "$QUIET" == true ]] && section_always 26 "$title"
    warn "HA Sofia: ${result#ERROR:}"
    json_add "ha_entities" "WARN" "${result#ERROR:}"
    return 0
  fi

  # The summary itself contains ':'; read leaves the remainder in the last var.
  local count total summary
  IFS=: read -r count total summary <<< "${result%%$'\n'*}" || true

  # Empty or non-numeric fields evaluate as 0 in [[ -eq ]] and would read as
  # a clean PASS, so reject them explicitly.
  if [[ ! "$count" =~ ^[0-9]+$ || ! "$total" =~ ^[0-9]+$ ]]; then
    [[ "$QUIET" == true ]] && section_always 26 "$title"
    warn "HA Sofia: unexpected parser output"
    json_add "ha_entities" "WARN" "unexpected parser output"
    return 0
  fi

  if [[ "$count" -eq 0 ]]; then
    pass "All $total HA entities available"
    json_add "ha_entities" "PASS" "0/$total unavailable"
  elif [[ "$count" -le 10 ]]; then
    [[ "$QUIET" == true ]] && section_always 26 "$title"
    warn "$count/$total entities unavailable ($summary)"
    if [[ "$JSON" != true && "$QUIET" != true ]]; then
      sed -n 's/^ENTITY:/ /p' <<< "$result"
    fi
    json_add "ha_entities" "WARN" "$count/$total: $summary"
  else
    [[ "$QUIET" == true ]] && section_always 26 "$title"
    fail "$count/$total entities unavailable ($summary)"
    if [[ "$JSON" != true && "$QUIET" != true ]]; then
      # The second sed reads all of its input, so no stage gets SIGPIPE.
      sed -n 's/^ENTITY:/ /p' <<< "$result" | sed -n '1,20p'
      local entity_count
      entity_count=$(grep -c "^ENTITY:" <<< "$result" || true)
      if [[ "$entity_count" -gt 20 ]]; then
        echo " ... and $((entity_count - 20)) more"
      fi
    fi
    json_add "ha_entities" "FAIL" "$count/$total: $summary"
  fi
}
|
||||||
|
|
||||||
|
# --- 27. HA Integration Health ---
|
||||||
|
# Check 27: inspect the state of every Home Assistant config entry in the
# cached entries.json.
# Globals:  HA_CACHE_DIR, QUIET (read)
# Outputs:  FAIL when any entry is in setup_error, setup_retry,
#           migration_error or failed_unload; WARN when any entry is
#           not_loaded; PASS otherwise. Missing settings or API data is
#           reported as WARN.
# Returns:  0 always; the outcome is reported via pass/warn/fail and json_add.
check_ha_integrations() {
  local title="HA Sofia — Integration Health"
  section 27 "$title"

  if ! ha_sofia_available; then
    warn "HA Sofia token not configured — skipping"
    json_add "ha_integrations" "WARN" "Token not configured"
    return 0
  fi

  # A failed fetch is detected below through the missing cache file.
  ha_sofia_fetch_cache || true

  if [[ ! -f "$HA_CACHE_DIR/entries.json" ]]; then
    [[ "$QUIET" == true ]] && section_always 27 "$title"
    warn "HA Sofia config entries API unavailable"
    json_add "ha_integrations" "WARN" "API unavailable"
    return 0
  fi

  # Python prints five lines: total, error count, not-loaded count, error
  # names, not-loaded names. One field per line keeps titles that contain ':'
  # (for example "host:port") intact.
  local result
  result=$(HA_CACHE_DIR="$HA_CACHE_DIR" python3 <<'PYEOF'
import json
import os

with open(os.path.join(os.environ["HA_CACHE_DIR"], "entries.json")) as f:
    entries = json.load(f)

ERROR_STATES = {"setup_error", "setup_retry", "migration_error", "failed_unload"}


def label(entry):
    text = f'{entry.get("domain", "?")} ({entry.get("title", "?")})'
    # The output is line-oriented; keep every label on one line.
    return text.replace("\r", " ").replace("\n", " ")


setup_error = []
not_loaded = []
for e in entries:
    state = e.get("state", "loaded")
    if state in ERROR_STATES:
        setup_error.append(label(e))
    elif state == "not_loaded":
        not_loaded.append(label(e))

print(len(entries))
print(len(setup_error))
print(len(not_loaded))
print("; ".join(setup_error))
print("; ".join(not_loaded))
PYEOF
  ) || result="ERROR:python execution failed"

  if [[ "$result" == "ERROR:"* ]]; then
    [[ "$QUIET" == true ]] && section_always 27 "$title"
    warn "HA Sofia: ${result#ERROR:}"
    json_add "ha_integrations" "WARN" "${result#ERROR:}"
    return 0
  fi

  local total="" error_count="" unloaded_count="" error_names="" unloaded_names=""
  # Command substitution strips trailing empty lines, so the last reads may
  # hit EOF; that simply leaves the name fields empty.
  {
    IFS= read -r total
    IFS= read -r error_count
    IFS= read -r unloaded_count
    IFS= read -r error_names
    IFS= read -r unloaded_names
  } <<< "$result" || true

  if [[ ! "$total" =~ ^[0-9]+$ || ! "$error_count" =~ ^[0-9]+$ \
        || ! "$unloaded_count" =~ ^[0-9]+$ ]]; then
    [[ "$QUIET" == true ]] && section_always 27 "$title"
    warn "HA Sofia: unexpected parser output"
    json_add "ha_integrations" "WARN" "unexpected parser output"
    return 0
  fi

  if [[ "$error_count" -gt 0 ]]; then
    [[ "$QUIET" == true ]] && section_always 27 "$title"
    fail "$error_count integration(s) in error state: $error_names"
    json_add "ha_integrations" "FAIL" "$error_count errors: $error_names"
  elif [[ "$unloaded_count" -gt 0 ]]; then
    [[ "$QUIET" == true ]] && section_always 27 "$title"
    warn "$unloaded_count integration(s) not loaded: $unloaded_names"
    json_add "ha_integrations" "WARN" "$unloaded_count not loaded: $unloaded_names"
  else
    pass "All $total integrations loaded"
    json_add "ha_integrations" "PASS" "All $total loaded"
  fi
}
|
||||||
|
|
||||||
|
# --- 28. HA Automation Status ---
|
||||||
|
# Check 28: report disabled and stale Home Assistant automations from the
# cached states.json.
# Globals:  HA_CACHE_DIR, QUIET, JSON (read)
# Outputs:  WARN when any automation.* entity has state "off", and WARN when
#           any enabled automation has a last_triggered attribute older than
#           30 days (at most 10 stale names are listed); PASS otherwise.
#           Automations without a parsable last_triggered are not counted as
#           stale. Missing settings or API data is reported as WARN.
# Returns:  0 always; the outcome is reported via pass/warn and json_add.
check_ha_automations() {
  local title="HA Sofia — Automation Status"
  section 28 "$title"

  if ! ha_sofia_available; then
    warn "HA Sofia token not configured — skipping"
    json_add "ha_automations" "WARN" "Token not configured"
    return 0
  fi

  # A failed fetch is detected below through the missing cache file.
  ha_sofia_fetch_cache || true

  if [[ ! -f "$HA_CACHE_DIR/states.json" ]]; then
    [[ "$QUIET" == true ]] && section_always 28 "$title"
    warn "HA Sofia states API unavailable"
    json_add "ha_automations" "WARN" "API unavailable"
    return 0
  fi

  # Python prints five lines: total, disabled count, stale count, disabled
  # names, stale names (names are "; "-separated).
  local result
  result=$(HA_CACHE_DIR="$HA_CACHE_DIR" python3 <<'PYEOF'
import json
import os
from datetime import datetime, timezone

with open(os.path.join(os.environ["HA_CACHE_DIR"], "states.json")) as f:
    states = json.load(f)

autos = [s for s in states if s.get("entity_id", "").startswith("automation.")]
disabled = [a["entity_id"] for a in autos if a.get("state") == "off"]

now = datetime.now(timezone.utc)
stale = []
for a in autos:
    if a.get("state") == "off":
        continue
    lt = (a.get("attributes") or {}).get("last_triggered")
    if not lt:
        continue
    try:
        t = datetime.fromisoformat(lt.replace("Z", "+00:00"))
    except (ValueError, TypeError, AttributeError):
        # Unparsable timestamp: best effort, treat as "not stale".
        continue
    if t.tzinfo is None:
        # Naive timestamps cannot be subtracted from an aware "now".
        t = t.replace(tzinfo=timezone.utc)
    days = (now - t).days
    if days > 30:
        stale.append(a["entity_id"] + "=" + str(days) + "d")

print(len(autos))
print(len(disabled))
print(len(stale))
print("; ".join(disabled))
print("; ".join(stale[:10]))
PYEOF
  ) || result="ERROR:python execution failed"

  if [[ "$result" == "ERROR:"* ]]; then
    [[ "$QUIET" == true ]] && section_always 28 "$title"
    warn "HA Sofia: ${result#ERROR:}"
    json_add "ha_automations" "WARN" "${result#ERROR:}"
    return 0
  fi

  local total="" disabled_count="" stale_count="" disabled_names="" stale_names=""
  # Command substitution strips trailing empty lines, so the last reads may
  # hit EOF; that simply leaves the name fields empty.
  {
    IFS= read -r total
    IFS= read -r disabled_count
    IFS= read -r stale_count
    IFS= read -r disabled_names
    IFS= read -r stale_names
  } <<< "$result" || true

  if [[ ! "$total" =~ ^[0-9]+$ || ! "$disabled_count" =~ ^[0-9]+$ \
        || ! "$stale_count" =~ ^[0-9]+$ ]]; then
    [[ "$QUIET" == true ]] && section_always 28 "$title"
    warn "HA Sofia: unexpected parser output"
    json_add "ha_automations" "WARN" "unexpected parser output"
    return 0
  fi

  local status="PASS" detail=""
  if [[ "$disabled_count" -gt 0 ]]; then
    [[ "$QUIET" == true ]] && section_always 28 "$title"
    warn "$disabled_count/$total automation(s) disabled"
    if [[ "$JSON" != true && "$QUIET" != true && -n "$disabled_names" ]]; then
      tr ';' '\n' <<< "$disabled_names" | sed 's/^ */ /'
    fi
    status="WARN"
    detail+="$disabled_count disabled; "
  fi

  if [[ "$stale_count" -gt 0 ]]; then
    # Print the section header only if the disabled branch has not already.
    [[ "$status" == "PASS" && "$QUIET" == true ]] && section_always 28 "$title"
    warn "$stale_count automation(s) not triggered in 30+ days"
    if [[ "$JSON" != true && "$QUIET" != true && -n "$stale_names" ]]; then
      tr ';' '\n' <<< "$stale_names" | sed 's/^ */ /'
    fi
    status="WARN"
    detail+="$stale_count stale; "
  fi

  if [[ "$status" == "PASS" ]]; then
    pass "All $total automations enabled and recently active"
    json_add "ha_automations" "PASS" "All $total active"
  else
    json_add "ha_automations" "$status" "$detail"
  fi
}
|
||||||
|
|
||||||
|
# --- 29. HA System Resources ---
|
||||||
|
# Check 29: report the Home Assistant version plus CPU, memory and disk usage
# taken from sensor entities in the cached states.json / config.json.
# Globals:  HA_CACHE_DIR, QUIET (read)
# Outputs:  FAIL when any found metric is above 90, WARN when above 80, PASS
#           otherwise (including when no matching sensor is found). The detail
#           string is "version=<v>[:cpu=<n>][:mem=<n>][:disk=<n>]". Missing
#           settings or API data is reported as WARN.
# Returns:  0 always; the outcome is reported via pass/warn/fail and json_add.
check_ha_system() {
  local title="HA Sofia — System Resources"
  section 29 "$title"

  if ! ha_sofia_available; then
    warn "HA Sofia token not configured — skipping"
    json_add "ha_system" "WARN" "Token not configured"
    return 0
  fi

  # A failed fetch is detected below through the missing cache files.
  ha_sofia_fetch_cache || true

  if [[ ! -f "$HA_CACHE_DIR/states.json" || ! -f "$HA_CACHE_DIR/config.json" ]]; then
    [[ "$QUIET" == true ]] && section_always 29 "$title"
    warn "HA Sofia API unavailable for system check"
    json_add "ha_system" "WARN" "API unavailable"
    return 0
  fi

  # Python prints "<LEVEL>:<detail>" where LEVEL is PASS, WARN or FAIL.
  local result
  result=$(HA_CACHE_DIR="$HA_CACHE_DIR" python3 <<'PYEOF'
import json
import math
import os

cache = os.environ["HA_CACHE_DIR"]
with open(os.path.join(cache, "states.json")) as f:
    states = json.load(f)
with open(os.path.join(cache, "config.json")) as f:
    config = json.load(f)

version = config.get("version", "unknown")
entity_map = {s["entity_id"]: s for s in states}

cpu_patterns = ["sensor.processor_use", "sensor.system_monitor_processor_use"]
mem_patterns = ["sensor.memory_use_percent", "sensor.system_monitor_memory_use_percent"]
disk_patterns = ["sensor.disk_use_percent", "sensor.disk_use_percent_", "sensor.system_monitor_disk_use_percent"]


def to_number(raw):
    """Return raw as a finite float, or None when it is not one."""
    try:
        value = float(raw)
    except (ValueError, TypeError):
        return None
    # "nan"/"inf" parse as floats but cannot be compared or truncated sensibly.
    return value if math.isfinite(value) else None


def find_entity(patterns):
    # First pass: exact entity IDs, in pattern order.
    for p in patterns:
        if p in entity_map:
            value = to_number(entity_map[p].get("state"))
            if value is not None:
                return value
    # Second pass: any "percent" entity whose ID contains a pattern.
    for eid, s in entity_map.items():
        if "percent" not in eid:
            continue
        for p in patterns:
            if p.rstrip("_") in eid:
                value = to_number(s.get("state"))
                if value is not None:
                    return value
    return None


metrics = [
    ("cpu", find_entity(cpu_patterns)),
    ("mem", find_entity(mem_patterns)),
    ("disk", find_entity(disk_patterns)),
]

parts = ["version=" + version]
level = "PASS"
for name, val in metrics:
    if val is None:
        continue
    parts.append(name + "=" + str(int(val)))
    if val > 90:
        level = "FAIL"
    elif val > 80 and level == "PASS":
        level = "WARN"

print(level + ":" + ":".join(parts))
PYEOF
  ) || result="ERROR:python execution failed"

  if [[ "$result" == "ERROR:"* ]]; then
    [[ "$QUIET" == true ]] && section_always 29 "$title"
    warn "HA Sofia: ${result#ERROR:}"
    json_add "ha_system" "WARN" "${result#ERROR:}"
    return 0
  fi

  # Split at the first ':' only; the detail keeps its own ':' separators.
  local level="${result%%:*}"
  local detail="${result#*:}"

  if [[ "$level" == "FAIL" ]]; then
    [[ "$QUIET" == true ]] && section_always 29 "$title"
    fail "HA Sofia resources critical: $detail"
    json_add "ha_system" "FAIL" "$detail"
  elif [[ "$level" == "WARN" ]]; then
    [[ "$QUIET" == true ]] && section_always 29 "$title"
    warn "HA Sofia resources elevated: $detail"
    json_add "ha_system" "WARN" "$detail"
  else
    pass "HA Sofia healthy ($detail)"
    json_add "ha_system" "PASS" "$detail"
  fi
}
|
||||||
|
|
||||||
# --- Summary ---
|
# --- Summary ---
|
||||||
print_summary() {
|
print_summary() {
|
||||||
if [[ "$JSON" == true ]]; then
|
if [[ "$JSON" == true ]]; then
|
||||||
|
|
@ -1283,6 +1703,10 @@ main() {
|
||||||
check_gpu
|
check_gpu
|
||||||
check_cloudflare_tunnel
|
check_cloudflare_tunnel
|
||||||
check_overcommit
|
check_overcommit
|
||||||
|
check_ha_entities
|
||||||
|
check_ha_integrations
|
||||||
|
check_ha_automations
|
||||||
|
check_ha_system
|
||||||
print_summary
|
print_summary
|
||||||
|
|
||||||
# Exit code: 2 for failures, 1 for warnings, 0 for clean
|
# Exit code: 2 for failures, 1 for warnings, 0 for clean
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue