diff --git a/docs/architecture/backup-dr.md b/docs/architecture/backup-dr.md
index 5f1aa53e..e2e171f3 100644
--- a/docs/architecture/backup-dr.md
+++ b/docs/architecture/backup-dr.md
@@ -137,6 +137,7 @@ graph TB
 | Redis Backup | Weekly Sunday 03:00, 30d | CronJob in `redis` | BGSAVE + copy |
 | Prometheus Backup | Monthly 1st Sunday, 2 copies | CronJob in `monitoring` | TSDB snapshot → tar.gz |
 | plotting-book Backup | Weekly Sunday 03:00, 30d | CronJob in `plotting-book` | sqlite3 .backup |
+| LVM Thin Snapshots | Twice daily (00:00, 12:00), 7d | PVE host: `lvm-pvc-snapshot` | CoW snapshots of 13 proxmox-lvm PVCs |
 | Incremental Sync | Every 6h (cron) | TrueNAS: `/root/cloudsync-copy.sh` | ZFS diff → rclone copy |
 | Full Sync | Weekly Sunday 09:00 | TrueNAS Cloud Sync Task 1 | rclone sync with deletions |
 | CloudSync Monitor | Every 6h (cron) | CronJob in `monitoring` | Query TrueNAS API → Pushgateway |
@@ -170,6 +171,27 @@
 zfs rollback main/<dataset>@auto-2026-03-23_00-00
 zfs clone main/<dataset>@auto-2026-03-23_00-00 main/<dataset>-recovered
 ```
+
+### Layer 1b: LVM Thin Snapshots (Proxmox CSI PVCs)
+
+Native LVM thin snapshots provide crash-consistent point-in-time recovery for 13 of the Proxmox CSI PVCs (~340Gi). These are CoW snapshots — instant to create, with minimal overhead, sharing the thin pool's free space.
+
+**Script**: `/usr/local/bin/lvm-pvc-snapshot` on the PVE host (source: `infra/scripts/lvm-pvc-snapshot`)
+**Schedule**: Twice daily (00:00, 12:00) via systemd timer, 7-day retention (max 14 snapshots per LV)
+**Discovery**: Auto-discovers PVC LVs matching the `vm-*-pvc-*` pattern in VG `pve`, thin pool `data`
+
+**Coverage**: All proxmox-lvm PVCs **except** those in the `dbaas` and `monitoring` namespaces. These are excluded because:
+- MySQL InnoDB, PostgreSQL, and Prometheus are high-churn (50%+ CoW divergence per hour)
+- They already have app-level dumps (Layer 2)
+- Including them causes ~36% write amplification; excluding them reduces the overhead to ~0%
+
+Snapshotted PVCs include: Redis, Vaultwarden, Calibre, Nextcloud, Forgejo, FreshRSS, ActualBudget, NovelApp, Headscale, Uptime Kuma, etc. (13 low-churn LVs)
+
+**Exclusion config**: the `EXCLUDE_NAMESPACES` variable in the script (default: `dbaas,monitoring`). The script uses kubectl to resolve LV names dynamically.
+
+**Monitoring**: Pushes metrics to Pushgateway via NodePort (30091). Alerts: `LVMSnapshotStale` (>24h), `LVMSnapshotFailing`, `LVMThinPoolLow` (<15% free).
+
+**Restore**: `lvm-pvc-snapshot restore <pvc-name>` — auto-discovers the K8s workload, scales it down, swaps the LVs, and scales it back up. See `docs/runbooks/restore-lvm-snapshot.md`.
+
 ### Layer 2: Application-Level Backups
 
 K8s CronJobs run inside the cluster, dumping database/state to NFS-exported backup directories. Each service writes to `/mnt/main/<service>-backup/`.
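For orientation, here is a minimal sketch of the create/prune cycle that Layer 1b describes, assuming GNU coreutils on the PVE host. The real logic lives in `infra/scripts/lvm-pvc-snapshot`; the kubectl-based namespace exclusion and all error handling are omitted here, and the Pushgateway host and metric name are placeholders, not taken from the actual script:

```bash
#!/usr/bin/env bash
# Hypothetical sketch only. VG, thin pool, LV pattern, schedule, and the
# 14-snapshot retention come from the doc above; everything else is illustrative.
set -euo pipefail

VG=pve
MAX_SNAPS=14                       # 7-day retention at 2 snapshots/day
STAMP=$(date +%Y-%m-%d_%H-%M)
PUSHGW="http://<pve-host>:30091"   # Pushgateway NodePort; host is a placeholder

# Discover PVC LVs in the thin pool; skip LVs that are themselves snapshots
lvs --noheadings -o lv_name "$VG" | tr -d ' ' |
  grep -E '^vm-[0-9]+-pvc-' | grep -v -- '-snap-' |
  while read -r lv; do
    # Thin snapshots need no --size: they share the pool's free space
    lvcreate -s -n "${lv}-snap-${STAMP}" "${VG}/${lv}"

    # Prune: keep only the newest MAX_SNAPS snapshots of this LV
    lvs --noheadings -o lv_name "$VG" | tr -d ' ' | grep -- "^${lv}-snap-" |
      sort | head -n "-${MAX_SNAPS}" |
      while read -r old; do lvremove -f "${VG}/${old}"; done
  done

# Report success to Pushgateway (metric name is made up for this sketch)
echo "lvm_pvc_snapshot_last_success_timestamp $(date +%s)" |
  curl -s --data-binary @- "${PUSHGW}/metrics/job/lvm-pvc-snapshot"
```

A plain lexical `sort` is enough for pruning because the zero-padded timestamp suffix makes lexical and chronological order coincide.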
diff --git a/scripts/cluster_healthcheck.sh b/scripts/cluster_healthcheck.sh
index 1bc56f17..caa8e39c 100755
--- a/scripts/cluster_healthcheck.sh
+++ b/scripts/cluster_healthcheck.sh
@@ -26,7 +26,7 @@ JSON=false
 KUBECONFIG_PATH="$(pwd)/config"
 KUBECTL=""
 JSON_RESULTS=()
-TOTAL_CHECKS=25
+TOTAL_CHECKS=29
 
 # --- Helpers ---
 info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
@@ -1206,6 +1206,426 @@ check_overcommit() {
   json_add "overcommit" "$status" "$detail"
 }
 
+# --- HA helpers ---
+HA_CACHE_DIR=""
+
+ha_sofia_available() {
+  if [[ -z "${HOME_ASSISTANT_SOFIA_URL:-}" ]] || [[ -z "${HOME_ASSISTANT_SOFIA_TOKEN:-}" ]]; then
+    return 1
+  fi
+  return 0
+}
+
+# Fetch all HA data once and cache in temp files
+ha_sofia_fetch_cache() {
+  if [[ -n "$HA_CACHE_DIR" ]]; then
+    return 0
+  fi
+  HA_CACHE_DIR=$(mktemp -d)
+  export HA_CACHE_DIR
+  # Double quotes expand now, pinning the trap to this exact directory
+  trap "rm -rf $HA_CACHE_DIR" EXIT
+
+  # HOME_ASSISTANT_SOFIA_URL/_TOKEN must be exported so the Python child sees them
+  python3 << 'HA_FETCH_EOF'
+import os, json, requests
+
+url = os.environ["HOME_ASSISTANT_SOFIA_URL"]
+token = os.environ["HOME_ASSISTANT_SOFIA_TOKEN"]
+cache = os.environ["HA_CACHE_DIR"]
+headers = {"Authorization": f"Bearer {token}"}
+
+errors = []
+
+# Fetch states (used by checks 26, 28)
+try:
+    resp = requests.get(f"{url}/api/states", headers=headers, timeout=30)
+    resp.raise_for_status()
+    with open(f"{cache}/states.json", "w") as f:
+        json.dump(resp.json(), f)
+except Exception as e:
+    errors.append(f"states:{e}")
+
+# Fetch config entries (used by check 27)
+try:
+    resp = requests.get(f"{url}/api/config/config_entries/entry", headers=headers, timeout=30)
+    resp.raise_for_status()
+    with open(f"{cache}/entries.json", "w") as f:
+        json.dump(resp.json(), f)
+except Exception as e:
+    errors.append(f"entries:{e}")
+
+# Fetch config (used by check 29)
+try:
+    resp = requests.get(f"{url}/api/config", headers=headers, timeout=10)
+    resp.raise_for_status()
+    with open(f"{cache}/config.json", "w") as f:
+        json.dump(resp.json(), f)
+except Exception as e:
+    errors.append(f"config:{e}")
+
+if errors:
+    with open(f"{cache}/errors.txt", "w") as f:
+        f.write("\n".join(errors))
+HA_FETCH_EOF
+}
+
+# --- 26. HA Entity Availability ---
+check_ha_entities() {
+  section 26 "HA Sofia — Entity Availability"
+
+  if ! ha_sofia_available; then
+    warn "HA Sofia token not configured — skipping"
+    json_add "ha_entities" "WARN" "Token not configured"
+    return 0
+  fi
+
+  ha_sofia_fetch_cache
+
+  if [[ ! -f "$HA_CACHE_DIR/states.json" ]]; then
+    local err=""
+    [[ -f "$HA_CACHE_DIR/errors.txt" ]] && err=$(grep "^states:" "$HA_CACHE_DIR/errors.txt" | head -1)
+    [[ "$QUIET" == true ]] && section_always 26 "HA Sofia — Entity Availability"
+    warn "HA Sofia API unreachable: ${err:-unknown error}"
+    json_add "ha_entities" "WARN" "API unreachable"
+    return 0
+  fi
+
+  local result
+  result=$(export HA_CACHE_DIR; python3 << 'PYEOF'
+import os, json
+
+cache = os.environ["HA_CACHE_DIR"]
+with open(f"{cache}/states.json") as f:
+    states = json.load(f)
+
+unavail = [s for s in states if s.get("state") in ("unavailable", "unknown")]
+domains = {}
+for s in unavail:
+    d = s["entity_id"].split(".")[0]
+    domains[d] = domains.get(d, 0) + 1
+
+total = len(states)
+count = len(unavail)
+summary = ", ".join(f"{d}:{n}" for d, n in sorted(domains.items(), key=lambda x: -x[1]))
+entity_list = "\n".join("ENTITY:" + s["entity_id"] for s in unavail)
+print(f"{count}:{total}:{summary}")
+if entity_list:
+    print(entity_list)
+PYEOF
+) || result="ERROR:python execution failed"
+
+  if [[ "$result" == "ERROR:"* ]]; then
+    [[ "$QUIET" == true ]] && section_always 26 "HA Sofia — Entity Availability"
+    warn "HA Sofia: ${result#ERROR:}"
+    json_add "ha_entities" "WARN" "${result#ERROR:}"
+    return 0
+  fi
+
+  local first_line count total summary
+  first_line=$(echo "$result" | head -1)
+  count=$(echo "$first_line" | cut -d: -f1)
+  total=$(echo "$first_line" | cut -d: -f2)
+  summary=$(echo "$first_line" | cut -d: -f3-)
+
+  if [[ "$count" -eq 0 ]]; then
+    pass "All $total HA entities available"
+    json_add "ha_entities" "PASS" "0/$total unavailable"
+  elif [[ "$count" -le 10 ]]; then
+    [[ "$QUIET" == true ]] && section_always 26 "HA Sofia — Entity Availability"
+    warn "$count/$total entities unavailable ($summary)"
+    if [[ "$JSON" != true && "$QUIET" != true ]]; then
+      echo "$result" | grep "^ENTITY:" | sed 's/^ENTITY:/  /'
+    fi
+    json_add "ha_entities" "WARN" "$count/$total: $summary"
+  else
+    [[ "$QUIET" == true ]] && section_always 26 "HA Sofia — Entity Availability"
+    fail "$count/$total entities unavailable ($summary)"
+    if [[ "$JSON" != true && "$QUIET" != true ]]; then
+      echo "$result" | grep "^ENTITY:" | head -20 | sed 's/^ENTITY:/  /'
+      local entity_count
+      entity_count=$(echo "$result" | grep -c "^ENTITY:" || true)
+      if [[ "$entity_count" -gt 20 ]]; then
+        echo "  ... and $((entity_count - 20)) more"
+      fi
+    fi
+    json_add "ha_entities" "FAIL" "$count/$total: $summary"
+  fi
+}
+
+# --- 27. HA Integration Health ---
+check_ha_integrations() {
+  section 27 "HA Sofia — Integration Health"
+
+  if ! ha_sofia_available; then
+    warn "HA Sofia token not configured — skipping"
+    json_add "ha_integrations" "WARN" "Token not configured"
+    return 0
+  fi
+
+  ha_sofia_fetch_cache
+
+  if [[ ! -f "$HA_CACHE_DIR/entries.json" ]]; then
+    [[ "$QUIET" == true ]] && section_always 27 "HA Sofia — Integration Health"
+    warn "HA Sofia config entries API unavailable"
+    json_add "ha_integrations" "WARN" "API unavailable"
+    return 0
+  fi
+
+  local result
+  result=$(export HA_CACHE_DIR; python3 << 'PYEOF'
+import os, json
+
+cache = os.environ["HA_CACHE_DIR"]
+with open(f"{cache}/entries.json") as f:
+    entries = json.load(f)
+
+total = len(entries)
+not_loaded = []
+setup_error = []
+for e in entries:
+    state = e.get("state", "loaded")
+    domain = e.get("domain", "?")
+    title = e.get("title", "?")
+    if state in ("setup_error", "setup_retry"):
+        setup_error.append(f"{domain} ({title})")
+    elif state == "not_loaded":
+        not_loaded.append(f"{domain} ({title})")
+
+error_count = len(setup_error)
+unloaded_count = len(not_loaded)
+error_names = "; ".join(setup_error) if setup_error else ""
+unloaded_names = "; ".join(not_loaded) if not_loaded else ""
+print(f"{total}:{error_count}:{unloaded_count}:{error_names}:{unloaded_names}")
+PYEOF
+) || result="ERROR:python execution failed"
+
+  if [[ "$result" == "ERROR:"* ]]; then
+    [[ "$QUIET" == true ]] && section_always 27 "HA Sofia — Integration Health"
+    warn "HA Sofia: ${result#ERROR:}"
+    json_add "ha_integrations" "WARN" "${result#ERROR:}"
+    return 0
+  fi
+
+  local total error_count unloaded_count error_names unloaded_names
+  total=$(echo "$result" | cut -d: -f1)
+  error_count=$(echo "$result" | cut -d: -f2)
+  unloaded_count=$(echo "$result" | cut -d: -f3)
+  error_names=$(echo "$result" | cut -d: -f4)
+  unloaded_names=$(echo "$result" | cut -d: -f5-)
+
+  if [[ "$error_count" -gt 0 ]]; then
+    [[ "$QUIET" == true ]] && section_always 27 "HA Sofia — Integration Health"
+    fail "$error_count integration(s) in error state: $error_names"
+    json_add "ha_integrations" "FAIL" "$error_count errors: $error_names"
+  elif [[ "$unloaded_count" -gt 0 ]]; then
+    [[ "$QUIET" == true ]] && section_always 27 "HA Sofia — Integration Health"
+    warn "$unloaded_count integration(s) not loaded: $unloaded_names"
+    json_add "ha_integrations" "WARN" "$unloaded_count not loaded: $unloaded_names"
+  else
+    pass "All $total integrations loaded"
+    json_add "ha_integrations" "PASS" "All $total loaded"
+  fi
+}
+
+# --- 28. HA Automation Status ---
+check_ha_automations() {
+  section 28 "HA Sofia — Automation Status"
+
+  if ! ha_sofia_available; then
+    warn "HA Sofia token not configured — skipping"
+    json_add "ha_automations" "WARN" "Token not configured"
+    return 0
+  fi
+
+  ha_sofia_fetch_cache
+
+  if [[ ! -f "$HA_CACHE_DIR/states.json" ]]; then
+    [[ "$QUIET" == true ]] && section_always 28 "HA Sofia — Automation Status"
+    warn "HA Sofia states API unavailable"
+    json_add "ha_automations" "WARN" "API unavailable"
+    return 0
+  fi
+
+  local result
+  result=$(export HA_CACHE_DIR; python3 << 'PYEOF'
+import os, json
+from datetime import datetime, timezone
+
+cache = os.environ["HA_CACHE_DIR"]
+with open(f"{cache}/states.json") as f:
+    states = json.load(f)
+
+autos = [s for s in states if s["entity_id"].startswith("automation.")]
+total = len(autos)
+disabled = [a["entity_id"] for a in autos if a["state"] == "off"]
+disabled_count = len(disabled)
+
+now = datetime.now(timezone.utc)
+stale = []
+for a in autos:
+    if a["state"] == "off":
+        continue
+    lt = a.get("attributes", {}).get("last_triggered")
+    if lt:
+        try:
+            t = datetime.fromisoformat(lt.replace("Z", "+00:00"))
+            days = (now - t).days
+            if days > 30:
+                stale.append(a["entity_id"] + "=" + str(days) + "d")
+        except (ValueError, TypeError):
+            pass
+
+stale_count = len(stale)
+disabled_names = "; ".join(disabled)
+stale_names = "; ".join(stale[:10])
+print(f"{total}:{disabled_count}:{stale_count}:{disabled_names}:{stale_names}")
+PYEOF
+) || result="ERROR:python execution failed"
+
+  if [[ "$result" == "ERROR:"* ]]; then
+    [[ "$QUIET" == true ]] && section_always 28 "HA Sofia — Automation Status"
+    warn "HA Sofia: ${result#ERROR:}"
+    json_add "ha_automations" "WARN" "${result#ERROR:}"
+    return 0
+  fi
+
+  local total disabled_count stale_count disabled_names stale_names
+  total=$(echo "$result" | cut -d: -f1)
+  disabled_count=$(echo "$result" | cut -d: -f2)
+  stale_count=$(echo "$result" | cut -d: -f3)
+  disabled_names=$(echo "$result" | cut -d: -f4)
+  stale_names=$(echo "$result" | cut -d: -f5-)
+
+  local status="PASS" detail=""
+  if [[ "$disabled_count" -gt 0 ]]; then
+    [[ "$QUIET" == true ]] && section_always 28 "HA Sofia — Automation Status"
+    warn "$disabled_count/$total automation(s) disabled"
+    if [[ "$JSON" != true && "$QUIET" != true && -n "$disabled_names" ]]; then
+      echo "$disabled_names" | tr ';' '\n' | sed 's/^ */  /'
+    fi
+    status="WARN"
+    detail+="$disabled_count disabled; "
+  fi
+
+  if [[ "$stale_count" -gt 0 ]]; then
+    [[ "$status" == "PASS" && "$QUIET" == true ]] && section_always 28 "HA Sofia — Automation Status"
+    warn "$stale_count automation(s) not triggered in 30+ days"
+    if [[ "$JSON" != true && "$QUIET" != true && -n "$stale_names" ]]; then
+      echo "$stale_names" | tr ';' '\n' | sed 's/^ */  /'
+    fi
+    [[ "$status" == "PASS" ]] && status="WARN"
+    detail+="$stale_count stale; "
+  fi
+
+  if [[ "$status" == "PASS" ]]; then
+    pass "All $total automations enabled and recently active"
+    json_add "ha_automations" "PASS" "All $total active"
+  else
+    json_add "ha_automations" "$status" "$detail"
+  fi
+}
+
+# --- 29. HA System Resources ---
+check_ha_system() {
+  section 29 "HA Sofia — System Resources"
+
+  if ! ha_sofia_available; then
+    warn "HA Sofia token not configured — skipping"
+    json_add "ha_system" "WARN" "Token not configured"
+    return 0
+  fi
+
+  ha_sofia_fetch_cache
+
+  if [[ ! -f "$HA_CACHE_DIR/states.json" ]] || [[ ! -f "$HA_CACHE_DIR/config.json" ]]; then
+    [[ "$QUIET" == true ]] && section_always 29 "HA Sofia — System Resources"
+    warn "HA Sofia API unavailable for system check"
+    json_add "ha_system" "WARN" "API unavailable"
+    return 0
+  fi
+
+  local result
+  result=$(export HA_CACHE_DIR; python3 << 'PYEOF'
+import os, json
+
+cache = os.environ["HA_CACHE_DIR"]
+with open(f"{cache}/states.json") as f:
+    states = json.load(f)
+with open(f"{cache}/config.json") as f:
+    config = json.load(f)
+
+version = config.get("version", "unknown")
+entity_map = {s["entity_id"]: s for s in states}
+
+cpu_patterns = ["sensor.processor_use", "sensor.system_monitor_processor_use"]
+mem_patterns = ["sensor.memory_use_percent", "sensor.system_monitor_memory_use_percent"]
+disk_patterns = ["sensor.disk_use_percent", "sensor.disk_use_percent_", "sensor.system_monitor_disk_use_percent"]
+
+def find_entity(patterns):
+    # Exact entity-id match first
+    for p in patterns:
+        if p in entity_map:
+            try:
+                return float(entity_map[p]["state"])
+            except (ValueError, TypeError):
+                pass
+    # Fuzzy fallback; only catches percent-style sensor ids
+    for eid, s in entity_map.items():
+        for p in patterns:
+            if p.rstrip("_") in eid and "percent" in eid:
+                try:
+                    return float(s["state"])
+                except (ValueError, TypeError):
+                    pass
+    return None
+
+cpu = find_entity(cpu_patterns)
+mem = find_entity(mem_patterns)
+disk = find_entity(disk_patterns)
+
+parts = ["version=" + version]
+if cpu is not None:
+    parts.append("cpu=" + str(int(cpu)))
+if mem is not None:
+    parts.append("mem=" + str(int(mem)))
+if disk is not None:
+    parts.append("disk=" + str(int(disk)))
+
+level = "PASS"
+for val in [cpu, mem, disk]:
+    if val is not None:
+        if val > 90:
+            level = "FAIL"
+            break
+        elif val > 80:
+            level = "WARN"
+
+print(level + ":" + ":".join(parts))
+PYEOF
+) || result="ERROR:python execution failed"
+
+  if [[ "$result" == "ERROR:"* ]]; then
+    [[ "$QUIET" == true ]] && section_always 29 "HA Sofia — System Resources"
+    warn "HA Sofia: ${result#ERROR:}"
+    json_add "ha_system" "WARN" "${result#ERROR:}"
+    return 0
+  fi
+
+  local level detail
+  level=$(echo "$result" | cut -d: -f1)
+  detail=$(echo "$result" | cut -d: -f2-)
+
+  if [[ "$level" == "FAIL" ]]; then
+    [[ "$QUIET" == true ]] && section_always 29 "HA Sofia — System Resources"
+    fail "HA Sofia resources critical: $detail"
+    json_add "ha_system" "FAIL" "$detail"
+  elif [[ "$level" == "WARN" ]]; then
+    [[ "$QUIET" == true ]] && section_always 29 "HA Sofia — System Resources"
+    warn "HA Sofia resources elevated: $detail"
+    json_add "ha_system" "WARN" "$detail"
+  else
+    pass "HA Sofia healthy ($detail)"
+    json_add "ha_system" "PASS" "$detail"
+  fi
+}
+
 # --- Summary ---
 print_summary() {
   if [[ "$JSON" == true ]]; then
@@ -1283,6 +1703,10 @@ main() {
   check_gpu
   check_cloudflare_tunnel
   check_overcommit
+  check_ha_entities
+  check_ha_integrations
+  check_ha_automations
+  check_ha_system
   print_summary
 
   # Exit code: 2 for failures, 1 for warnings, 0 for clean