stem95su: scheduled Drive->site sync CronJob (every 10m)

CronJob stem95su-gdrive-sync (*/10) mounts the content PVC RW and rclone-syncs the read-only Drive folder "claude" (stem claude/files) onto it (rclone/rclone:1.74.3, scope=drive.readonly, empty-source guard + --max-delete 25). ESO ExternalSecret stem95su-rclone <- Vault secret/stem95su. Requires the GCP OAuth app published to Production or the refresh token expires ~weekly. Lands the gdrive-sync stack on master (it had landed on a feature branch by accident on the shared devvm checkout). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 08:42:26 +00:00 · 2026-06-09 08:42:26 +00:00 · 6d224861c4
commit 6d224861c4
parent 05b50d2b96
1168 changed files with 120 additions and 358547 deletions
--- a/scripts/apply-mbps-caps.service
+++ b/scripts/apply-mbps-caps.service
@ -1,12 +0,0 @@
-[Unit]
-Description=Apply per-VM I/O caps via qm set (idempotent)
-Documentation=https://github.com/ViktorBarzin/infra/blob/master/scripts/apply-mbps-caps.sh
-After=pve-cluster.service
-Wants=pve-cluster.service
-
-[Service]
-Type=oneshot
-ExecStart=/usr/local/bin/apply-mbps-caps.sh
-StandardOutput=journal
-StandardError=journal
-SyslogIdentifier=apply-mbps-caps
--- a/scripts/apply-mbps-caps.sh
+++ b/scripts/apply-mbps-caps.sh
@ -1,74 +0,0 @@
-#!/usr/bin/env bash
-# Apply per-VM I/O caps via `qm set` on the PVE host.
-#
-# - Reads each target VM's current boot-disk options.
-# - Appends/normalises `mbps_rd=<N>,mbps_wr=<N>`.
-# - Re-applies via `qm set` (live, no reboot needed).
-# - Idempotent: re-running with no drift is a no-op at the storage
-#   level (proxmox config rewrite is cheap).
-# - Continues on per-VM failures so one missing/stopped VM doesn't
-#   skip the rest — designed to be safe under the systemd timer.
-#
-# Backed by `apply-mbps-caps.{service,timer}` (hourly + 5min-after-boot).
-# Why these values: see beads code-9v2j + memory id=2726 (alloy IO storm)
-# + memory id=1575 (VMs intentionally out of TF).
-
-set -uo pipefail  # NOT -e — keep going if a single VM step fails.
-
-# vmid:disk_slot:mbps_rd:mbps_wr  (Linux VMs only — skipping 101 pfsense BSD, 300 Windows)
-TARGETS=(
-  "102:scsi0:60:60"      # devvm
-  "103:sata0:40:40"      # home-assistant
-  "200:scsi0:100:60"     # k8s-master (alloy storm origin — firmest clip)
-  "201:scsi1:150:120"    # k8s-node1 (GPU + many CSI disks; boots from scsi1)
-  "202:scsi0:150:120"    # k8s-node2
-  "203:scsi0:150:120"    # k8s-node3
-  "204:scsi0:150:120"    # k8s-node4
-  "220:scsi0:40:40"      # docker-registry
-)
-
-# Apply the I/O caps described by one TARGETS entry to a single VM disk.
-#
-# Arguments: $1 - spec string "vmid:disk_slot:mbps_rd:mbps_wr"
-# Outputs:   progress and before/after lines on stdout
-# Returns:   0 when the VM or slot is absent, the caps already match, or
-#            `qm set` succeeded; 1 when `qm set` returned non-zero
-apply_one() {
-  local spec="$1"
-  local vmid slot rd wr
-  IFS=: read -r vmid slot rd wr <<<"$spec"
-
-  # Skip non-existent VMs cleanly (e.g. node decommissioned, never rebuilt).
-  if ! qm status "$vmid" >/dev/null 2>&1; then
-    echo "vmid $vmid: not present on this host — skipping"
-    return 0
-  fi
-
-  local current cleaned newvalue
-  current=$(qm config "$vmid" | awk -v s="$slot:" '$1==s {sub(/^[^ ]+ /, ""); print; exit}')
-  if [[ -z "$current" ]]; then
-    echo "vmid $vmid: no $slot line in config — skipping"
-    return 0
-  fi
-
-  # Strip any existing caps. The value may be fractional (mbps_rd=12.5), so
-  # the decimal part must be consumed too — an integer-only pattern would
-  # leave ".5" behind and corrupt the option string.
-  cleaned=$(echo "$current" | sed -E 's/,mbps_rd=[0-9]+(\.[0-9]+)?//g; s/,mbps_wr=[0-9]+(\.[0-9]+)?//g')
-  newvalue="${cleaned},mbps_rd=${rd},mbps_wr=${wr}"
-
-  # Read the caps currently in force, wherever they sit in the option list.
-  # Wrapping in commas lets the patterns match whole key=value tokens only.
-  local cur_rd="" cur_wr=""
-  local re_rd=',mbps_rd=([0-9.]+),' re_wr=',mbps_wr=([0-9.]+),'
-  if [[ ",${current}," =~ $re_rd ]]; then cur_rd="${BASH_REMATCH[1]}"; fi
-  if [[ ",${current}," =~ $re_wr ]]; then cur_wr="${BASH_REMATCH[1]}"; fi
-
-  # Skip the qm-set call entirely when state already matches — keeps
-  # journal noise low under the hourly timer. Comparing the values (not
-  # just the whole string) keeps this a no-op even when the caps are not
-  # the last two options on the line.
-  if [[ "$current" == "$newvalue" ]] || [[ "$cur_rd" == "$rd" && "$cur_wr" == "$wr" ]]; then
-    echo "vmid $vmid: $slot already at mbps_rd=${rd},mbps_wr=${wr} — no-op"
-    return 0
-  fi
-
-  echo "vmid $vmid: updating $slot"
-  echo "  before: $current"
-  echo "  after:  $newvalue"
-  if qm set "$vmid" "--$slot" "$newvalue"; then
-    echo "  ok"
-  else
-    echo "  FAILED: qm set returned non-zero"
-    return 1
-  fi
-}
-
-# Apply every entry in TARGETS. A failing VM only flips rc to 1; the loop
-# keeps going so the remaining VMs are still processed.
-rc=0
-for spec in "${TARGETS[@]}"; do
-  apply_one "$spec" || rc=1
-done
-
-# Exit status is 1 if at least one apply_one call failed, otherwise 0.
-exit "$rc"
--- a/scripts/apply-mbps-caps.timer
+++ b/scripts/apply-mbps-caps.timer
@ -1,18 +0,0 @@
-[Unit]
-Description=Re-apply per-VM I/O caps periodically + after PVE boot
-
-[Timer]
-# After every PVE host reboot — caps survive in /etc/pve/qemu-server/<vmid>.conf
-# normally, but a config restore from backup can drop them (see 2026-05-26
-# incident where we restored 202.conf + 203.conf from /mnt/backup/pve-config/).
-OnBootSec=5min
-
-# Hourly during normal operation — catches manual `qm set` drift or fresh
-# VM clones that haven't had caps applied yet.
-OnCalendar=hourly
-
-Persistent=true
-RandomizedDelaySec=2min
-
-[Install]
-WantedBy=timers.target
--- a/scripts/check-ingress-auth-comments.py
+++ b/scripts/check-ingress-auth-comments.py
@ -1,124 +0,0 @@
-#!/usr/bin/env python3
-"""Enforce the inline-comment convention for ingress_factory auth tiers.
-
-Every `auth = "app"` or `auth = "none"` line under a stack must have an
-immediately-preceding comment block containing `# auth = "<tier>":`
-that documents what gates the app (for "app") or why the endpoint is
-intentionally public (for "none").
-
-This is the static guard for the anti-exposure rule documented in
-`infra/.claude/CLAUDE.md` "Auth" section. It's invoked by `scripts/tg`
-before every plan/apply/destroy/refresh, so it fires regardless of who
-or what is running terragrunt — local laptop, CI, headless agent.
-
-Stack-scoped by design: only checks the .tf files under the stack
-being acted on. Other stacks' historical violations don't block work
-on the current stack; each stack documents itself the next time it's
-edited.
-
-Usage:
-  check-ingress-auth-comments.py <stack-path>     # scan one stack
-  check-ingress-auth-comments.py --all            # scan every stack
-"""
-
-import argparse
-import os
-import re
-import sys
-
-AUTH_LINE = re.compile(r'^\s*auth\s*=\s*"(app|none)"\s*$')
-COMMENT_LINE = re.compile(r'^\s*#')
-COMMENT_TIER = re.compile(r'auth\s*=\s*"(app|none)"')
-
-
-def scan_dir(path):
-    violations = []
-    for root, _, files in os.walk(path):
-        for f in files:
-            if not f.endswith('.tf'):
-                continue
-            full = os.path.join(root, f)
-            try:
-                with open(full) as fh:
-                    lines = fh.readlines()
-            except OSError:
-                continue
-            for i, line in enumerate(lines):
-                m = AUTH_LINE.match(line)
-                if not m:
-                    continue
-                tier = m.group(1)
-                # Walk backwards through contiguous comment lines.
-                # Pass if ANY of them documents the matching tier.
-                ok = False
-                j = i - 1
-                while j >= 0 and COMMENT_LINE.match(lines[j]):
-                    cm = COMMENT_TIER.search(lines[j])
-                    if cm and cm.group(1) == tier:
-                        ok = True
-                        break
-                    j -= 1
-                if not ok:
-                    violations.append((full, i + 1, tier))
-    return violations
-
-
-def main():
-    """Parse arguments, scan the selected stack(s) and report violations.
-
-    Exactly one of ``path`` or ``--all`` must be given. Exits with status 2
-    when ``path`` is not a directory, returns normally when no violations are
-    found, and otherwise prints an explanation plus the violation list to
-    stderr and exits with status 1.
-    """
-    ap = argparse.ArgumentParser(description=__doc__.splitlines()[0])
-    g = ap.add_mutually_exclusive_group(required=True)
-    g.add_argument('path', nargs='?', help='Stack directory to scan')
-    g.add_argument('--all', action='store_true', help='Scan every stack under stacks/')
-    args = ap.parse_args()
-
-    if args.all:
-        # Relative path: resolved against the current working directory.
-        scan_paths = ['stacks']
-    else:
-        if not os.path.isdir(args.path):
-            print(f"ERROR: {args.path} is not a directory", file=sys.stderr)
-            sys.exit(2)
-        scan_paths = [args.path]
-
-    violations = []
-    for p in scan_paths:
-        violations.extend(scan_dir(p))
-
-    if not violations:
-        return
-
-    print(
-        "\n"
-        "==============================================================\n"
-        "ingress_factory auth-comment convention violated\n"
-        "==============================================================\n"
-        "\n"
-        "Every `auth = \"app\"` or `auth = \"none\"` line must have a\n"
-        "preceding comment line documenting what gates the app (for\n"
-        "\"app\") or why the endpoint is intentionally public (for\n"
-        "\"none\"). This guard prevents accidentally exposing private\n"
-        "services. See infra/.claude/CLAUDE.md Auth section.\n"
-        "\n"
-        "Add a comment line directly above the auth line:\n"
-        "\n"
-        "  # auth = \"app\":  <what gates the app, e.g. NextAuth + OAuth>\n"
-        "  auth = \"app\"\n"
-        "\n"
-        "or:\n"
-        "\n"
-        "  # auth = \"none\": <why public, e.g. webhook receiver, CalDAV>\n"
-        "  auth = \"none\"\n"
-        "\n"
-        "Violations:",
-        file=sys.stderr,
-    )
-    for path, line_no, tier in violations:
-        print(
-            f"  {path}:{line_no}: auth = \"{tier}\" missing preceding "
-            f"`# auth = \"{tier}\":` comment",
-            file=sys.stderr,
-        )
-    print(file=sys.stderr)
-    sys.exit(1)
-
-
-if __name__ == '__main__':
-    main()
--- a/scripts/cluster_healthcheck.sh
+++ b/scripts/cluster_healthcheck.sh
--- a/scripts/cluster_manager.py
+++ b/scripts/cluster_manager.py
@ -1,277 +0,0 @@
-import asyncio
-import click
-import logging
-import time
-from typing import List, Union, Optional
-from kubernetes_asyncio import client, config
-from kubernetes_asyncio.client.api_client import ApiClient
-
-logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
-logger = logging.getLogger(__name__)
-
-
-async def wait_for_healthy(
-    api_instance: client.AppsV1Api,
-    resource_type: str,
-    namespace: str,
-    name: str,
-    target_replicas: int,
-    timeout: int = 300,
-) -> None:
-    """Poll a Deployment or StatefulSet until it reports the target replica count.
-
-    Checks the resource status every 5 seconds. A Deployment (matched
-    case-insensitively on ``resource_type``) is healthy when both
-    ``ready_replicas`` and ``updated_replicas`` equal ``target_replicas``; any
-    other ``resource_type`` is read as a StatefulSet and only
-    ``ready_replicas`` is compared.
-
-    Returns ``None`` in both outcomes: on success an info line is logged, on
-    timeout (``timeout`` seconds) an error line is logged instead. Callers
-    cannot distinguish the two from the return value.
-    """
-    start_time = time.time()
-    logger.info(
-        f"Waiting for {resource_type} {name} to reach {target_replicas} replicas..."
-    )
-
-    while True:
-        if time.time() - start_time > timeout:
-            logger.error(f"❌ Timeout reached for {resource_type} {name}")
-            return
-
-        try:
-            if resource_type.lower() == "deployment":
-                res = await api_instance.read_namespaced_deployment_status(
-                    name, namespace
-                )
-                # The status fields may be None; treat that as 0.
-                ready = res.status.ready_replicas or 0
-                updated = res.status.updated_replicas or 0
-                if ready == target_replicas and updated == target_replicas:
-                    break
-            else:  # StatefulSet
-                res = await api_instance.read_namespaced_stateful_set_status(
-                    name, namespace
-                )
-                ready = res.status.ready_replicas or 0
-                if ready == target_replicas:
-                    break
-
-        except Exception as e:
-            # Any failure of the status read is logged at debug level and
-            # retried on the next poll until the timeout expires.
-            logger.debug(f"Retrying status check for {name}: {e}")
-
-        await asyncio.sleep(5)
-
-    logger.info(f"✅ {resource_type} {name} is now healthy.")
-
-
-async def wait_for_zero(
-    api: client.AppsV1Api, kind: str, ns: str, name: str, timeout: int
-) -> tuple[str, str]:
-    """Poll every 3 seconds until the resource reports zero ready replicas.
-
-    ``kind`` equal to "deployment" (case-insensitive) reads a Deployment
-    status; anything else reads a StatefulSet status.
-
-    Always returns ``(ns, name)``: when ``ready_replicas`` is 0 or None, when
-    the status read raises any exception, or after ``timeout`` seconds (in
-    which case an error is logged first).
-    """
-    start_time = asyncio.get_event_loop().time()
-    while (asyncio.get_event_loop().time() - start_time) < timeout:
-        try:
-            res = await (
-                api.read_namespaced_deployment_status(name, ns)
-                if kind.lower() == "deployment"
-                else api.read_namespaced_stateful_set_status(name, ns)
-            )
-            if (res.status.ready_replicas or 0) == 0:
-                return ns, name
-        except Exception:
-            # NOTE(review): every exception, not only "not found", is treated
-            # as "resource is gone" — confirm this is intended.
-            return ns, name  # Assume gone if error
-        await asyncio.sleep(3)
-    logger.error(f"Timeout: {kind} {ns}/{name} still has running pods.")
-    return ns, name
-
-
-async def scale_resource(
-    api_instance: client.AppsV1Api,
-    resource_type: str,
-    namespace: str,
-    name: str,
-    replicas: int,
-) -> None:
-    """Patch the scale subresource of a Deployment or StatefulSet.
-
-    ``resource_type`` equal to "deployment" (case-insensitive) patches a
-    Deployment; anything else patches a StatefulSet. Failures are logged as
-    errors and not re-raised, so the caller gets ``None`` either way.
-    """
-    body = {"spec": {"replicas": replicas}}
-    try:
-        if resource_type.lower() == "deployment":
-            await api_instance.patch_namespaced_deployment_scale(name, namespace, body)
-        else:
-            await api_instance.patch_namespaced_stateful_set_scale(
-                name, namespace, body
-            )
-    except Exception as e:
-        logger.error(f"Failed to scale {resource_type} {name}: {e}")
-
-
-async def run_stop_tier(
-    api_v1: client.AppsV1Api, label: str, output_file: str, timeout: int
-) -> None:
-    """Processes a single label tier: saves, scales to 0, and waits."""
-    # Resources in these namespaces are never recorded or scaled down.
-    excluded_ns = ["kube-system", "kube-public", "kube-node-lease"]
-
-    # 1. Discover
-    targets = [
-        ("Deployment", api_v1.list_deployment_for_all_namespaces),
-        ("StatefulSet", api_v1.list_stateful_set_for_all_namespaces),
-    ]
-
-    tier_resources = []
-    for kind, list_func in targets:
-        resp = await list_func(label_selector=label)
-        tier_resources.extend(
-            [
-                (kind, item)
-                for item in resp.items
-                if item.metadata.namespace not in excluded_ns
-            ]
-        )
-
-    if not tier_resources:
-        logger.warning(f"No resources found for label: {label}")
-        return
-
-    # 2. Save & Scale
-    # active_jobs is keyed by (namespace, name) only, so a Deployment and a
-    # StatefulSet sharing both would collapse into one entry.
-    active_jobs: set[tuple[str, str]] = set()
-    wait_tasks = []
-
-    # Append to file so we don't overwrite previous tiers
-    with open(output_file, "a") as f:
-        for kind, item in tier_resources:
-            ns, name = item.metadata.namespace, item.metadata.name
-            reps = item.spec.replicas or 0
-            # One line per resource: "Kind Namespace Name Replicas".
-            f.write(f"{kind} {ns} {name} {reps}\n")
-            active_jobs.add((ns, name))
-
-            # Scale requests are issued one at a time; the wait coroutines
-            # are only collected here and awaited in step 3.
-            await scale_resource(api_v1, kind, ns, name, 0)
-            wait_tasks.append(wait_for_zero(api_v1, kind, ns, name, timeout))
-
-    # 3. Wait for this tier to finish before moving to next
-    logger.info(f"Tier [{label}]: Waiting for {len(active_jobs)} resources to stop...")
-    for coro in asyncio.as_completed(wait_tasks):
-        finished_ns, finished_name = await coro
-        active_jobs.discard((finished_ns, finished_name))
-        if active_jobs:
-            remaining_ns = sorted({ns for ns, name in active_jobs})
-            logger.info(
-                f"[{label}] Pending: {len(active_jobs)} | Namespaces: {', '.join(remaining_ns)}"
-            )
-
-    # wait_for_zero also returns on timeout or error, so this line is logged
-    # even when some resources did not actually reach zero.
-    logger.info(f"✅ Tier [{label}] successfully shut down.")
-
-
-@click.group()
-def cli():
-    pass
-
-
-# `stop` subcommand: takes one or more label selectors and shuts the matching
-# tiers down one after another, recording replica counts in the state file.
-@cli.command()
-@click.argument("labels", nargs=-1, required=True)
-@click.option("--output", "-o", default="resources.txt", help="Output state file")
-@click.option("--timeout", "-t", default=3600)
-def stop(labels: List[str], output: str, timeout: int):
-    """Stop tiers sequentially. Usage: stop 'app=web' 'app=db'"""
-
-    async def main():
-        await config.load_kube_config()
-        # Clear/Create file at start
-        # (run_stop_tier then appends to it, one tier after another)
-        open(output, "w").close()
-
-        async with ApiClient() as api_client:
-            api_v1 = client.AppsV1Api(api_client)
-            # Tiers are processed strictly in the order given on the command line.
-            for label in labels:
-                logger.info(f"🚀 Processing Shutdown Tier: {label}")
-                await run_stop_tier(api_v1, label, output, timeout)
-        logger.info("🏁 Sequence complete. Cluster is gracefully stopped.")
-
-    asyncio.run(main())
-
-
-# `start` subcommand: restores the tiers named by the given label selectors,
-# in order, using the replica counts saved in the state file.
-@cli.command()
-@click.argument("labels", nargs=-1, required=True)
-@click.option("--file", "-f", default="resources.txt")
-@click.option("--timeout", "-t", default=3600, help="Seconds to wait per resource")
-def start(labels: List[str], file: str, timeout: int):
-    # Thin synchronous wrapper: all the work happens in run_start_sequence.
-    asyncio.run(run_start_sequence(labels, file, timeout))
-
-
-async def run_start_sequence(labels: List[str], file_path: str, timeout: int) -> None:
-    """Restore tiers, one label at a time, to the replica counts in the snapshot.
-
-    Reads ``file_path`` (lines of ``Kind Namespace Name Replicas``), then for
-    each label in ``labels`` scales up the resources that both carry the label
-    in the cluster and appear in the snapshot, waiting for the tier before
-    moving on. Logs an error and returns if the snapshot file is missing.
-    """
-    await config.load_kube_config()
-
-    async with ApiClient() as api_client:
-        apps_v1 = client.AppsV1Api(api_client)
-
-        # 1. Load the entire snapshot into memory for filtering
-        try:
-            with open(file_path, "r") as f:
-                # Format: Kind Namespace Name Replicas
-                snapshot_lines = [line.strip().split() for line in f if line.strip()]
-        except FileNotFoundError:
-            logger.error(f"Snapshot file {file_path} not found.")
-            return
-
-        # 2. Iterate through labels in the order provided
-        for label in labels:
-            logger.info(f"🚀 Starting Tier: {label}")
-
-            # Find resources in this tier by querying K8s for the label
-            # then matching against our snapshot file data
-            tier_resources = await get_resources_by_label(apps_v1, label)
-
-            # Cross-reference: Only start things that are in BOTH the K8s label query AND our file
-            # This ensures we restore them to the CORRECT previous replica count
-            to_restore = []
-            tier_keys = {(r["ns"], r["name"]) for r in tier_resources}
-
-            # The unpacking below expects exactly four whitespace-separated
-            # fields per snapshot line; a line with any other field count
-            # raises ValueError.
-            for kind, ns, name, reps in snapshot_lines:
-                if (ns, name) in tier_keys:
-                    to_restore.append((kind, ns, name, int(reps)))
-
-            if not to_restore:
-                logger.warning(f"No resources found in snapshot for tier: {label}")
-                continue
-
-            # 3. Scale and Wait for this specific tier
-            await process_start_tier(apps_v1, to_restore, timeout, label)
-
-        logger.info("🏁 All tiers started successfully.")
-
-
-async def get_resources_by_label(api: client.AppsV1Api, label: str) -> List[dict]:
-    """Helper to find what currently exists in the cluster with this label.
-
-    Lists Deployments and then StatefulSets across all namespaces using
-    ``label`` as the label selector, and returns one ``{"ns": ..., "name": ...}``
-    dict per item. The resource kind is not included in the result.
-    """
-    targets = [
-        api.list_deployment_for_all_namespaces,
-        api.list_stateful_set_for_all_namespaces,
-    ]
-    found = []
-    for list_func in targets:
-        resp = await list_func(label_selector=label)
-        for item in resp.items:
-            found.append({"ns": item.metadata.namespace, "name": item.metadata.name})
-    return found
-
-
-async def process_start_tier(
-    api: client.AppsV1Api, resources: list, timeout: int, label: str
-):
-    """Scale every resource of one tier up, then wait for each to become healthy.
-
-    ``resources`` is a list of ``(kind, namespace, name, replicas)`` tuples.
-    All scale requests are issued together; progress is logged as each
-    health wait finishes. ``label`` is used only in log messages.
-    """
-    active_jobs = set()
-    scale_tasks = []
-    wait_tasks = []
-
-    # Wrapper to track which job finishes
-    async def tracked_wait(kind, ns, name, target, t_out):
-        await wait_for_healthy(api, kind, ns, name, target, t_out)
-        return (ns, name)
-
-    for kind, ns, name, reps in resources:
-        active_jobs.add((ns, name))
-        scale_tasks.append(scale_resource(api, kind, ns, name, reps))
-        wait_tasks.append(tracked_wait(kind, ns, name, reps, timeout))
-
-    # Trigger all scales for this tier
-    await asyncio.gather(*scale_tasks)
-
-    # Monitor health
-    for coro in asyncio.as_completed(wait_tasks):
-        finished_ns, finished_name = await coro
-        active_jobs.discard((finished_ns, finished_name))
-
-        if active_jobs:
-            remaining_ns = sorted({ns for ns, name in active_jobs})
-            logger.info(
-                f"[{label}] Pending Health: {len(active_jobs)} | Namespaces: {', '.join(remaining_ns)}"
-            )
-
-    # wait_for_healthy returns on timeout as well as on success, so this
-    # line is logged even if some resources never became healthy.
-    logger.info(f"✅ Tier [{label}] is healthy.")
-
-
-if __name__ == "__main__":
-    cli()
--- a/scripts/daily-backup.service
+++ b/scripts/daily-backup.service
@ -1,14 +0,0 @@
-[Unit]
-Description=Daily backup: PVC snapshots + SQLite + pfsense to sda
-After=network-online.target
-
-[Service]
-Type=oneshot
-ExecStart=/usr/local/bin/daily-backup
-StandardOutput=journal
-StandardError=journal
-SyslogIdentifier=daily-backup
-# 4h budget — the snapshot mount + LUKS decrypt + rsync + sqlite scan loop
-# scales with the number of PVCs (118 today). Hit the 1h ceiling around week
-# 18 of 2026 and silently SIGTERM'd for 10 days. Bumped to 4h with margin.
-TimeoutStartSec=14400
--- a/scripts/daily-backup.sh
+++ b/scripts/daily-backup.sh
@ -1,424 +0,0 @@
-#!/usr/bin/env bash
-# daily-backup — 3-2-1 backup: PVC file copy + SQLite + pfsense + PVE config to sda
-# Deploy to PVE host at /usr/local/bin/daily-backup
-# Schedule: Daily 05:00 via systemd timer
-set -euo pipefail
-
-# --- Configuration ---
-BACKUP_ROOT="/mnt/backup"
-PVC_MOUNT="/tmp/pvc-mount"
-PUSHGATEWAY="${DAILY_BACKUP_PUSHGATEWAY:-http://10.0.20.100:30091}"
-PUSHGATEWAY_JOB="daily-backup"
-LOCKFILE="/run/daily-backup.lock"
-MANIFEST="${BACKUP_ROOT}/.changed-files"
-MAPPING_CACHE="${BACKUP_ROOT}/.lv-pvc-mapping.json"
-KUBECONFIG="${KUBECONFIG:-/root/.kube/config}"
-export KUBECONFIG
-
-# --- Logging ---
-log()  { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
-warn() { log "WARN: $*" >&2; }
-die()  { log "FATAL: $*" >&2; push_metrics 1 0; exit 1; }
-
-# --- Manifest append helper ---
-# Both daily-backup and nfs-mirror append to /mnt/backup/.changed-files.
-# If their runs overlap (e.g. nfs-mirror Mon 04:11 still running when
-# daily-backup starts Mon 05:00) the appends can interleave mid-line.
-# `flock -x` on a sibling lock file makes appends atomic across processes.
-MANIFEST_LOCK="${MANIFEST}.lock"
-# Append stdin to ${MANIFEST} while holding an exclusive lock.
-# Arguments: none (data is read from stdin)
-manifest_append() {
-    (
-        # fd 200 is opened on ${MANIFEST_LOCK} by the redirection below;
-        # flock blocks until the exclusive lock on it is granted. The lock
-        # is released when the subshell exits and the fd is closed.
-        flock -x 200
-        cat >> "${MANIFEST}"
-    ) 200>"${MANIFEST_LOCK}"
-}
-
-# Cap manifest size to prevent unbounded growth (e.g. Synology unreachable
-# for many days, every daily-backup keeps appending). At >500k lines,
-# `--files-from=` rsync becomes pathological — fall back to a full Step 1
-# sync by signalling offsite-sync to ignore the manifest this round.
-MANIFEST_MAX_LINES=500000
-# Flag the next offsite sync as "full" when the manifest has grown beyond
-# MANIFEST_MAX_LINES. Does nothing if the manifest file does not exist.
-check_manifest_size() {
-    [ -f "${MANIFEST}" ] || return 0
-    local lines
-    # Fall back to 0 if wc fails so the numeric comparison below stays valid.
-    lines=$(wc -l < "${MANIFEST}" 2>/dev/null || echo 0)
-    if [ "${lines:-0}" -gt "${MANIFEST_MAX_LINES}" ]; then
-        warn "manifest at ${lines} lines (>${MANIFEST_MAX_LINES}) — flagging next offsite-sync as full"
-        # Marker file only; nothing in this function truncates the manifest.
-        touch "${BACKUP_ROOT}/.force-full-sync"
-    fi
-}
-
-# --- Locking ---
-# Track whether we got SIGTERM/SIGINT so cleanup can push a non-success metric.
-# Without this, a systemd timeout-kill leaves WeeklyBackupFailing alerts blind:
-# the script never reaches the success push at the end and the metric goes stale
-# silently. (Root cause of 2026-04-30 → 2026-05-09 silent-failure run.)
-KILLED=""
-
-cleanup() {
-    # Recursively unmount /tmp/pvc-mount: previous SIGTERM'd runs left snapshot
-    # mounts stacked here, which made every subsequent run start with an
-    # already-occupied mountpoint and time out before reaching its own umount.
-    while mountpoint -q "${PVC_MOUNT}" 2>/dev/null; do
-        umount "${PVC_MOUNT}" 2>/dev/null || umount -l "${PVC_MOUNT}" 2>/dev/null || break
-    done
-    # Close any LUKS mappers we opened (or that were left over from a prior crash).
-    for m in /dev/mapper/pvc-snap-*; do
-        [ -e "$m" ] || continue
-        cryptsetup close "$(basename "$m")" 2>/dev/null || true
-    done
-    rm -f "${LOCKFILE}"
-    if [ -n "${KILLED}" ]; then
-        # status=2 = aborted (matches lvm-pvc-snapshot's convention)
-        push_metrics 2 "${TOTAL_BYTES:-0}"
-    fi
-}
-trap cleanup EXIT
-trap 'KILLED=1; exit 143' TERM INT
-
-if ! ( set -o noclobber; echo $$ > "${LOCKFILE}" ) 2>/dev/null; then
-    die "Another instance is running (PID $(cat "${LOCKFILE}" 2>/dev/null || echo unknown))"
-fi
-
-# Belt-and-braces: if a previous run was SIGTERM'd before its trap completed,
-# /tmp/pvc-mount may have stacked mounts and stale LUKS mappers. The lock above
-# guarantees we're alone, so it's safe to clean these up now.
-while mountpoint -q "${PVC_MOUNT}" 2>/dev/null; do
-    umount "${PVC_MOUNT}" 2>/dev/null || umount -l "${PVC_MOUNT}" 2>/dev/null || break
-done
-for m in /dev/mapper/pvc-snap-*; do
-    [ -e "$m" ] || continue
-    cryptsetup close "$(basename "$m")" 2>/dev/null || true
-done
-
-# --- Metrics ---
-# Send the run's metrics to the Pushgateway job ${PUSHGATEWAY_JOB}.
-# Arguments: $1 - status value (default 0)
-#            $2 - bytes synced (default 0)
-# Best-effort: curl output errors are discarded and a failed push never
-# fails the caller (|| true). The timestamp is the time of the push.
-push_metrics() {
-    local status="${1:-0}" bytes="${2:-0}"
-    cat <<EOF | curl -s --connect-timeout 5 --max-time 10 --data-binary @- "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null || true
-daily_backup_last_run_timestamp $(date +%s)
-daily_backup_last_status ${status}
-daily_backup_bytes_synced ${bytes}
-EOF
-}
-
-# --- PVC name resolution ---
-# Map an LV name to its "namespace/pvc-name" using the cached PV list.
-# Arguments: $1 - LV name
-# Outputs:   one "namespace/name" line per PV in ${MAPPING_CACHE} whose
-#            spec.csi.volumeHandle ends with the LV name (PVs without a
-#            volumeHandle are treated as "" and so never match a non-empty
-#            name); nothing when there is no match. jq errors are suppressed.
-resolve_pvc_name() {
-    local lv="$1"
-    jq -r --arg lv "${lv}" '
-        .items[] |
-        select(.spec.csi.volumeHandle // "" | endswith($lv)) |
-        "\(.spec.claimRef.namespace)/\(.spec.claimRef.name)"
-    ' "${MAPPING_CACHE}" 2>/dev/null
-}
-
-# --- NFS Export Health Check ---
-# Verify NFS exports are healthy before starting backup.
-# Detects: missing /etc/exports, incorrect fsid=0 flag, unexpected exports.
-# Added 2026-04-14 [PM-2026-04-14]: backup script accessed NFS causing stale handle
-# propagation during the fsid=0 outage. Early check prevents cascading failures.
-# Returns: 0 when all four checks below pass, 1 as soon as one fails.
-check_nfs_exports() {
-    local exports_file="/etc/exports"
-    # NOTE(review): `status` is declared but never read in this function.
-    local status=0
-
-    if [ ! -f "${exports_file}" ]; then
-        log "WARN: ${exports_file} does not exist — NFS exports may be unconfigured"
-        return 1
-    fi
-
-    # Check for dangerous fsid=0 on /srv/nfs (breaks NFSv4 subdirectory path resolution)
-    # grep runs without -q, so the offending exports line itself is also
-    # written to stdout ahead of the two ERROR lines.
-    if grep -E '^/srv/nfs[[:space:]].*fsid=0' "${exports_file}" 2>/dev/null; then
-        log "ERROR: /etc/exports contains fsid=0 on /srv/nfs — this will break all k8s NFS mounts!"
-        log "ERROR: Remove fsid=0 and run: exportfs -ra && systemctl restart nfs-server"
-        return 1
-    fi
-
-    # Verify NFS server is active
-    if ! systemctl is-active --quiet nfs-server 2>/dev/null; then
-        log "WARN: nfs-server is not running — NFS mounts will fail"
-        return 1
-    fi
-
-    # Verify exports are actually loaded (exportfs -s lists active exports)
-    local active_exports
-    # `|| true`: grep -c exits non-zero on zero matches, which would otherwise
-    # abort the script under `set -e` / pipefail.
-    active_exports=$(exportfs -s 2>/dev/null | grep -c '/srv/nfs' || true)
-    if [ "${active_exports:-0}" -eq 0 ]; then
-        log "WARN: No /srv/nfs exports active in kernel — run: exportfs -ra"
-        return 1
-    fi
-
-    log "NFS export health check passed (${active_exports} /srv/nfs export(s) active)"
-    return 0
-}
-
-# --- Main ---
-log "=== daily-backup starting ==="
-
-if ! mountpoint -q "${BACKUP_ROOT}"; then
-    die "${BACKUP_ROOT} is not mounted"
-fi
-
-# Initialise the run state BEFORE any check that can flag a failure;
-# assigning STATUS=0 after the NFS check would silently discard its STATUS=1.
-STATUS=0
-TOTAL_BYTES=0
-
-# NFS export health check — warn but don't abort (backup can proceed with block storage PVCs)
-check_nfs_exports || {
-    log "WARN: NFS export health check failed — NFS-backed PVC backups may fail"
-    STATUS=1
-}
-
-# DO NOT truncate the manifest here.
-#
-# Truncation lives in offsite-sync-backup (only on successful sync). If
-# offsite-sync failed yesterday — Synology unreachable, transient error —
-# the manifest holds yesterday's unconsumed file list. Truncating at the
-# start of today's daily-backup would silently lose those entries; they'd
-# only reach Synology on the next monthly full sync.
-#
-# Appending duplicates across multiple runs is harmless — rsync transfers
-# each file once. If the manifest grows pathologically (Synology down for
-# weeks), the OffsiteBackupSync{Stale,Failing} alerts catch it.
-
-# NFS data is synced to Synology via two paths: nfs-mirror → sda → Step 1
-# for the curated subset, and inotify + Step 2 for the sda-bypass list.
-
-# ============================================================
-# STEP 1: PVC file-level copy from LVM thin snapshots
-# ============================================================
-log "--- Step 1: PVC file copy from snapshots ---"
-WEEK=$(date +%Y-%W)
-PREV=$(ls -1d "${BACKUP_ROOT}/pvc-data"/????-?? 2>/dev/null | tail -1 || true)
-
-# Cache LV→PVC mapping (fallback if kubectl is down next time)
-if kubectl get pv -o json > /tmp/pv-list.json 2>/dev/null; then
-    cp /tmp/pv-list.json "${MAPPING_CACHE}"
-    rm -f /tmp/pv-list.json
-fi
-
-if [ ! -f "${MAPPING_CACHE}" ]; then
-    warn "No PV mapping cache and kubectl unavailable — skipping PVC copy"
-    STATUS=1
-else
-    mkdir -p "${PVC_MOUNT}"
-    PVC_COUNT=0
-    PVC_FAIL=0
-
-    # Iterate origin LVs (not snapshots), find latest snapshot for each
-    for origin_lv in $(lvs --noheadings -o lv_name pve 2>/dev/null | grep 'vm-9999-pvc-' | grep -v '_snap_' | tr -d ' '); do
-        # Find latest snapshot for this origin
-        snap=$(lvs --noheadings -o lv_name pve 2>/dev/null | tr -d ' ' | grep "^${origin_lv}_snap_" | sort | tail -1 || true)
-        [ -z "${snap}" ] && continue
-
-        # Resolve human-readable name
-        ns_pvc=$(resolve_pvc_name "${origin_lv}")
-        if [ -z "${ns_pvc}" ] || [ "${ns_pvc}" = "null/null" ]; then
-            warn "Cannot resolve PVC name for ${origin_lv}, skipping"
-            continue
-        fi
-
-        # Skip-list: PVCs we deliberately don't keep offsite copies of.
-        #   nextcloud-data-proxmox — orphaned pre-encryption PV (Released,
-        #   Retain). Nextcloud moved to nextcloud-data-encrypted on 2026-04-13;
-        #   this old unencrypted PV lingers (Retain) and was still being backed
-        #   up weekly, filling the offsite Synology. Stop copying it (2026-06-01).
-        case "${ns_pvc}" in
-            nextcloud/nextcloud-data-proxmox)
-                log "  skip ${ns_pvc} (orphaned pre-encryption PVC)"
-                continue ;;
-        esac
-
-        # Detect LUKS-encrypted volumes and set up mount device
-        LUKS_NAME=""
-        MOUNT_DEV="/dev/pve/${snap}"
-        MOUNT_OPTS="ro"
-        if blkid -o value -s TYPE "/dev/pve/${snap}" 2>/dev/null | grep -q 'crypto_LUKS'; then
-            # Clean up any stale LUKS mapping for this snapshot from a previous crashed run
-            STALE_LUKS="pvc-snap-$(echo "${snap}" | md5sum | cut -c1-12)"
-            if [ -e "/dev/mapper/${STALE_LUKS}" ]; then
-                umount "/dev/mapper/${STALE_LUKS}" 2>/dev/null || true
-                cryptsetup close "${STALE_LUKS}" 2>/dev/null || true
-            fi
-            LUKS_KEY="/root/.luks-backup-key"
-            LUKS_NAME="pvc-snap-$(echo "${snap}" | md5sum | cut -c1-12)"
-            if [ -f "${LUKS_KEY}" ] && cryptsetup open --type luks --key-file "${LUKS_KEY}" --readonly "/dev/pve/${snap}" "${LUKS_NAME}" 2>&1; then
-                MOUNT_DEV="/dev/mapper/${LUKS_NAME}"
-                MOUNT_OPTS="ro,noload"  # noload skips ext4 journal replay on read-only LUKS
-                log "  LUKS: decrypted ${snap} → ${LUKS_NAME}"
-            else
-                warn "Failed to decrypt LUKS snapshot ${snap}"
-                PVC_FAIL=$((PVC_FAIL + 1))
-                continue
-            fi
-        fi
-
-        # Mount snapshot read-only, rsync files
-        if timeout 30 mount -o "${MOUNT_OPTS}" "${MOUNT_DEV}" "${PVC_MOUNT}" 2>&1; then
-            dst="${BACKUP_ROOT}/pvc-data/${WEEK}/${ns_pvc}"
-            mkdir -p "${dst}"
-            rsync_rc=0
-            # Per-PVC rsync timeout (30 min). Without this, a single hung
-            # PVC blocks the entire backup until systemd's TimeoutStartSec
-            # kills the script (4h ceiling), leaving every later PVC
-            # unbacked and silently triggering WeeklyBackupFailing. Picked
-            # 30 min as well above the largest PVC's normal copy time
-            # (immich-postgres ~10 GiB, ~3 min on local ext4) and well
-            # below the unit-level budget so we still have headroom to
-            # finish the rest.
-            timeout 1800 rsync -a --delete \
-                ${PREV:+--link-dest="${PREV}/${ns_pvc}/"} \
-                "${PVC_MOUNT}/" "${dst}/" 2>&1 || rsync_rc=$?
-            if [ "$rsync_rc" -eq 0 ]; then
-                PVC_COUNT=$((PVC_COUNT + 1))
-            elif [ "$rsync_rc" -eq 23 ] && [ -n "${LUKS_NAME}" ]; then
-                # rsync 23 = partial transfer; expected for LUKS noload mounts
-                # (in-flight writes have corrupt metadata from skipped journal replay)
-                PVC_COUNT=$((PVC_COUNT + 1))
-                log "  partial rsync (LUKS noload) for ${ns_pvc} — OK"
-            elif [ "$rsync_rc" -eq 124 ]; then
-                # `timeout` exit 124 = wall-clock killed the rsync. Track
-                # separately so the next run still produces a metric and
-                # doesn't pretend nothing happened.
-                warn "rsync timed out for ${ns_pvc} after 30 min — moving on"
-                PVC_FAIL=$((PVC_FAIL + 1))
-            else
-                warn "rsync failed for ${ns_pvc} (rc=$rsync_rc)"
-                PVC_FAIL=$((PVC_FAIL + 1))
-            fi
-
-            # Auto-detect and safely backup SQLite databases from snapshot
-            if command -v sqlite3 &>/dev/null; then
-                find "${PVC_MOUNT}" -maxdepth 3 \
-                    \( -name '*.db' -o -name '*.sqlite' -o -name '*.sqlite3' \) \
-                    -size +0 -type f 2>/dev/null | while read -r dbfile; do
-                    # Verify it's actually SQLite (magic number check)
-                    if head -c 15 "$dbfile" 2>/dev/null | grep -q 'SQLite format 3'; then
-                        relpath="${dbfile#${PVC_MOUNT}/}"
-                        dest_file="${BACKUP_ROOT}/sqlite-backup/${WEEK}/${ns_pvc}/${relpath}"
-                        mkdir -p "$(dirname "${dest_file}")"
-                        # 5-min sqlite timeout — same hang-prevention idea
-                        # as rsync above. A corrupted SQLite or one held
-                        # open by a writer in the snapshot can otherwise
-                        # block .backup indefinitely.
-                        if timeout 300 sqlite3 "file://${dbfile}?mode=ro" ".backup '${dest_file}'" 2>/dev/null; then
-                            log "    SQLite: ${ns_pvc}/${relpath}"
-                        else
-                            cp "${dbfile}" "${dest_file}" 2>/dev/null || true
-                        fi
-                    fi
-                done
-            fi
-
-            umount "${PVC_MOUNT}" 2>/dev/null || umount -l "${PVC_MOUNT}" 2>/dev/null || true
-        else
-            warn "Failed to mount snapshot ${snap}"
-            PVC_FAIL=$((PVC_FAIL + 1))
-        fi
-
-        # Close LUKS device if we opened one
-        if [ -n "${LUKS_NAME}" ]; then
-            cryptsetup close "${LUKS_NAME}" 2>/dev/null || true
-        fi
-    done
-
-    log "  PVC copy: ${PVC_COUNT} OK, ${PVC_FAIL} failed"
-    [ "${PVC_FAIL}" -gt 0 ] && STATUS=1
-
-    # Add PVC files to manifest (locked append)
-    if [ -d "${BACKUP_ROOT}/pvc-data/${WEEK}" ]; then
-        find "${BACKUP_ROOT}/pvc-data/${WEEK}" -type f 2>/dev/null | \
-            sed "s|^${BACKUP_ROOT}/||" | manifest_append
-    fi
-
-    # Prune old weekly versions (keep 4)
-    ls -1d "${BACKUP_ROOT}/pvc-data"/????-?? 2>/dev/null | head -n -4 | xargs rm -rf 2>/dev/null || true
-    ls -1d "${BACKUP_ROOT}/sqlite-backup"/????-?? 2>/dev/null | head -n -4 | xargs rm -rf 2>/dev/null || true
-
-    PVC_BYTES=$(du -sb "${BACKUP_ROOT}/pvc-data/${WEEK}" 2>/dev/null | cut -f1 || true)
-    TOTAL_BYTES=$((TOTAL_BYTES + ${PVC_BYTES:-0}))
-fi
-
-# ============================================================
-# STEP 3: pfsense backup (config.xml + full tar)
-# ============================================================
-log "--- Step 3: pfsense backup ---"
-PFSENSE_DEST="${BACKUP_ROOT}/pfsense"
-DATE=$(date +%Y%m%d)
-PFSENSE_STATUS=0
-mkdir -p "${PFSENSE_DEST}"
-
-if timeout 10 ssh -o BatchMode=yes -o ConnectTimeout=5 root@10.0.20.1 true 2>/dev/null; then
-    # config.xml — primary restore artifact
-    if scp -o ConnectTimeout=10 root@10.0.20.1:/cf/conf/config.xml "${PFSENSE_DEST}/config-${DATE}.xml" 2>/dev/null; then
-        log "  OK: config.xml"
-        echo "pfsense/config-${DATE}.xml" | manifest_append
-    else
-        warn "Failed to copy pfsense config.xml"
-        STATUS=1
-        PFSENSE_STATUS=1
-    fi
-
-    # Full filesystem tar — Sundays only (weekly).
-    # config.xml is the primary restore artifact and runs daily above; the
-    # full filesystem tar is for forensic / package-state recovery only and
-    # rarely-needed. Re-tarring 100M+ daily writes ~3G/month to sda + Synology
-    # for unchanged content. Keep one fresh tarball per week instead.
-    if [ "$(date +%u)" = "7" ]; then
-        if ssh -o ConnectTimeout=10 root@10.0.20.1 \
-            "tar czf - --exclude=/dev --exclude=/proc --exclude=/tmp --exclude=/var/run /" \
-            > "${PFSENSE_DEST}/pfsense-full-${DATE}.tar.gz" 2>/dev/null; then
-            log "  OK: weekly full tar ($(du -sh "${PFSENSE_DEST}/pfsense-full-${DATE}.tar.gz" | cut -f1))"
-            echo "pfsense/pfsense-full-${DATE}.tar.gz" | manifest_append
-        else
-            warn "Failed to tar pfsense filesystem"
-            STATUS=1
-            PFSENSE_STATUS=1
-        fi
-    else
-        log "  skip weekly full tar (only runs Sundays)"
-    fi
-
-    # Retention: keep 4 weekly copies
-    ls -t "${PFSENSE_DEST}"/config-*.xml 2>/dev/null | tail -n +5 | xargs rm -f 2>/dev/null || true
-    ls -t "${PFSENSE_DEST}"/pfsense-full-*.tar.gz 2>/dev/null | tail -n +5 | xargs rm -f 2>/dev/null || true
-else
-    warn "Cannot SSH to pfsense (10.0.20.1) — skipping"
-    STATUS=1
-    PFSENSE_STATUS=1
-fi
-
-# Push pfsense-backup metrics in BOTH success and failure paths so
-# PfsenseBackupStale + PfsenseBackupFailing alerts can fire instead of going
-# silent when ssh-to-pfsense is broken.
-{
-    echo "backup_last_run_timestamp $(date +%s)"
-    echo "backup_last_status ${PFSENSE_STATUS}"
-    [ "${PFSENSE_STATUS}" -eq 0 ] && echo "backup_last_success_timestamp $(date +%s)"
-} | curl -s --connect-timeout 5 --max-time 10 --data-binary @- \
-    "${PUSHGATEWAY}/metrics/job/pfsense-backup" 2>/dev/null || true
-
-# ============================================================
-# STEP 4: PVE host config backup
-# ============================================================
-log "--- Step 4: PVE host config ---"
-mkdir -p "${BACKUP_ROOT}/pve-config/scripts"
-timeout 300 rsync -a --delete /etc/pve/ "${BACKUP_ROOT}/pve-config/etc-pve/" 2>&1 || { warn "Failed to sync /etc/pve"; STATUS=1; }
-for script in /usr/local/bin/lvm-pvc-snapshot /usr/local/bin/daily-backup /usr/local/bin/offsite-sync-backup; do
-    [ -f "${script}" ] && cp "${script}" "${BACKUP_ROOT}/pve-config/scripts/" 2>/dev/null || true
-done
-find "${BACKUP_ROOT}/pve-config" -type f 2>/dev/null | sed "s|^${BACKUP_ROOT}/||" | manifest_append
-log "  OK: PVE config"
-
-check_manifest_size
-
-# ============================================================
-# STEP 5: Prune LVM snapshots older than 7 days
-# ============================================================
-log "--- Step 5: Snapshot pruning (7-day retention) ---"
-/usr/local/bin/lvm-pvc-snapshot prune 2>&1 || { warn "Snapshot prune failed"; STATUS=1; }
-
-# ============================================================
-# Done
-# ============================================================
-MANIFEST_LINES=$(wc -l < "${MANIFEST}" 2>/dev/null || echo 0)
-log "=== daily-backup complete (status=${STATUS}, ${TOTAL_BYTES} bytes, ${MANIFEST_LINES} files in manifest) ==="
-push_metrics "${STATUS}" "${TOTAL_BYTES}"
-exit "${STATUS}"
--- a/scripts/daily-backup.timer
+++ b/scripts/daily-backup.timer
@ -1,10 +0,0 @@
-[Unit]
-Description=Daily backup: PVC snapshots + SQLite + pfsense to sda
-
-[Timer]
-OnCalendar=*-*-* 05:00:00
-Persistent=true
-RandomizedDelaySec=300
-
-[Install]
-WantedBy=timers.target
--- a/scripts/extend_vm_storage.sh
+++ b/scripts/extend_vm_storage.sh
@ -1,372 +0,0 @@
-#!/usr/bin/env bash
-
-# Extend disk storage on a Kubernetes node VM.
-# Drains the node, shuts down the VM, resizes the disk in Proxmox,
-# boots the VM, expands the filesystem, and uncordons the node.
-#
-# Usage: ./scripts/extend_vm_storage.sh <node-name> <size-increment>
-# Example: ./scripts/extend_vm_storage.sh k8s-node2 +64G
-
-# --- Constants ---
-PROXMOX_HOST="root@192.168.1.127"
-VM_SSH_USER="wizard"
-KUBECTL="kubectl --kubeconfig $(pwd)/config"
-SHUTDOWN_TIMEOUT=300
-SSH_WAIT_TIMEOUT=300
-POLL_INTERVAL=5
-
-# --- Colors ---
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[0;33m'
-BLUE='\033[0;34m'
-NC='\033[0m'
-
-info()  { echo -e "${BLUE}[INFO]${NC} $*"; }
-ok()    { echo -e "${GREEN}[OK]${NC} $*"; }
-warn()  { echo -e "${YELLOW}[WARN]${NC} $*"; }
-error() { echo -e "${RED}[ERROR]${NC} $*"; }
-
-# --- Node-to-VMID mapping ---
-declare -A NODE_VMID=(
-    [k8s-master]=200
-    [k8s-node1]=201
-    [k8s-node2]=202
-    [k8s-node3]=203
-    [k8s-node4]=204
-)
-
-# --- Cleanup trap ---
-DRAINED_NODE=""
-cleanup() {
-    if [[ -n "$DRAINED_NODE" ]]; then
-        echo ""
-        error "Script exited unexpectedly!"
-        warn "The node '$DRAINED_NODE' may still be cordoned/drained."
-        warn "Recovery steps:"
-        warn "  1. Check VM status: ssh $PROXMOX_HOST 'qm status ${NODE_VMID[$DRAINED_NODE]}'"
-        warn "  2. Start VM if stopped: ssh $PROXMOX_HOST 'qm start ${NODE_VMID[$DRAINED_NODE]}'"
-        warn "  3. Uncordon node: $KUBECTL uncordon $DRAINED_NODE"
-    fi
-}
-trap cleanup EXIT
-
-# --- Input validation ---
-usage() {
-    echo "Usage: $0 <node-name> <size-increment>"
-    echo ""
-    echo "Arguments:"
-    echo "  node-name       One of: ${!NODE_VMID[*]}"
-    echo "  size-increment  Disk size increase, e.g. +64G, +128G"
-    echo ""
-    echo "Example:"
-    echo "  $0 k8s-node2 +64G"
-    exit 1
-}
-
-if [[ $# -ne 2 ]]; then
-    usage
-fi
-
-NODE_NAME="$1"
-SIZE_INCREMENT="$2"
-
-if [[ -z "${NODE_VMID[$NODE_NAME]+x}" ]]; then
-    error "Unknown node: '$NODE_NAME'"
-    echo "Valid nodes: ${!NODE_VMID[*]}"
-    exit 1
-fi
-
-if [[ ! "$SIZE_INCREMENT" =~ ^\+[0-9]+G$ ]]; then
-    error "Invalid size increment: '$SIZE_INCREMENT'"
-    echo "Must match pattern +<number>G, e.g. +64G"
-    exit 1
-fi
-
-VMID="${NODE_VMID[$NODE_NAME]}"
-
-# --- Resolve node IP via kubectl ---
-info "Resolving IP for node '$NODE_NAME'..."
-NODE_IP=$($KUBECTL get node "$NODE_NAME" -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}' 2>/dev/null)
-if [[ -z "$NODE_IP" ]]; then
-    error "Could not resolve IP for node '$NODE_NAME'. Is the cluster reachable?"
-    exit 1
-fi
-ok "Node IP: $NODE_IP"
-
-# --- Query current disk size ---
-info "Querying current disk size for VM $VMID..."
-SCSI0_LINE=$(ssh "$PROXMOX_HOST" "qm config $VMID" 2>/dev/null | grep '^scsi0:')
-if [[ -z "$SCSI0_LINE" ]]; then
-    error "Could not read scsi0 config for VM $VMID."
-    exit 1
-fi
-# Extract size value, e.g. "size=64G" from the config line
-CURRENT_SIZE=$(echo "$SCSI0_LINE" | sed -n 's/.*size=\([0-9]*G\).*/\1/p')
-if [[ -z "$CURRENT_SIZE" ]]; then
-    error "Could not parse current disk size from: $SCSI0_LINE"
-    exit 1
-fi
-CURRENT_SIZE_NUM=${CURRENT_SIZE%G}
-INCREMENT_NUM=${SIZE_INCREMENT//[+G]/}
-NEW_SIZE_NUM=$((CURRENT_SIZE_NUM + INCREMENT_NUM))
-ok "Current disk size: ${CURRENT_SIZE_NUM}G → New size: ${NEW_SIZE_NUM}G (${SIZE_INCREMENT})"
-
-if [[ $NEW_SIZE_NUM -le $CURRENT_SIZE_NUM ]]; then
-    error "New size (${NEW_SIZE_NUM}G) must be greater than current size (${CURRENT_SIZE_NUM}G)."
-    exit 1
-fi
-
-# --- Confirmation ---
-echo ""
-echo "========================================="
-echo "  Extend VM Storage"
-echo "========================================="
-echo "  Node:       $NODE_NAME"
-echo "  VMID:       $VMID"
-echo "  Node IP:    $NODE_IP"
-echo "  Current:    ${CURRENT_SIZE_NUM}G"
-echo "  Increment:  $SIZE_INCREMENT"
-echo "  New size:   ${NEW_SIZE_NUM}G"
-echo "  Proxmox:    $PROXMOX_HOST"
-echo "========================================="
-echo ""
-echo "This will:"
-echo "  1. Drain the node (evict pods)"
-echo "  2. Shut down the VM"
-echo "  3. Resize disk (scsi0) from ${CURRENT_SIZE_NUM}G to ${NEW_SIZE_NUM}G"
-echo "  4. Start the VM"
-echo "  5. Expand the filesystem inside the guest"
-echo "  6. Uncordon the node"
-echo ""
-read -rp "Proceed? [y/N] " confirm
-if [[ ! "$confirm" =~ ^[yY]$ ]]; then
-    echo "Aborted."
-    exit 0
-fi
-
-# --- Step 1: Drain node ---
-info "Step 1/7: Draining node '$NODE_NAME'..."
-DRAINED_NODE="$NODE_NAME"
-if ! $KUBECTL drain "$NODE_NAME" --ignore-daemonsets --delete-emptydir-data --force --timeout=300s; then
-    error "Failed to drain node '$NODE_NAME'."
-    exit 1
-fi
-ok "Node drained."
-
-# --- Step 2: Shutdown VM ---
-info "Step 2/7: Shutting down VM $VMID..."
-if ! ssh "$PROXMOX_HOST" "qm shutdown $VMID"; then
-    error "Failed to send shutdown command to VM $VMID."
-    exit 1
-fi
-
-info "Waiting for VM to stop (timeout: ${SHUTDOWN_TIMEOUT}s)..."
-elapsed=0
-while true; do
-    status=$(ssh "$PROXMOX_HOST" "qm status $VMID" 2>/dev/null)
-    if [[ "$status" == *"stopped"* ]]; then
-        break
-    fi
-    if [[ $elapsed -ge $SHUTDOWN_TIMEOUT ]]; then
-        error "VM $VMID did not stop within ${SHUTDOWN_TIMEOUT}s. Current status: $status"
-        exit 1
-    fi
-    sleep "$POLL_INTERVAL"
-    elapsed=$((elapsed + POLL_INTERVAL))
-done
-ok "VM stopped."
-
-# --- Step 3: Resize disk ---
-info "Step 3/7: Resizing disk scsi0 by $SIZE_INCREMENT..."
-if ! ssh "$PROXMOX_HOST" "qm resize $VMID scsi0 $SIZE_INCREMENT"; then
-    error "Failed to resize disk on VM $VMID."
-    exit 1
-fi
-ok "Disk resized."
-
-# --- Step 4: Start VM ---
-info "Step 4/7: Starting VM $VMID..."
-if ! ssh "$PROXMOX_HOST" "qm start $VMID"; then
-    error "Failed to start VM $VMID."
-    exit 1
-fi
-
-info "Waiting for SSH to become available at $NODE_IP (timeout: ${SSH_WAIT_TIMEOUT}s)..."
-elapsed=0
-while true; do
-    if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$VM_SSH_USER@$NODE_IP" "true" 2>/dev/null; then
-        break
-    fi
-    if [[ $elapsed -ge $SSH_WAIT_TIMEOUT ]]; then
-        error "SSH not reachable on $NODE_IP within ${SSH_WAIT_TIMEOUT}s."
-        exit 1
-    fi
-    sleep "$POLL_INTERVAL"
-    elapsed=$((elapsed + POLL_INTERVAL))
-done
-ok "VM is up and SSH is reachable."
-
-info "Waiting 10s for system stabilization..."
-sleep 10
-
-# --- Step 5: Expand filesystem ---
-info "Step 5/7: Expanding filesystem inside the guest..."
-ssh -o StrictHostKeyChecking=no "$VM_SSH_USER@$NODE_IP" 'bash -s' <<'REMOTE_SCRIPT'
-set -o pipefail
-
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[0;33m'
-BLUE='\033[0;34m'
-NC='\033[0m'
-
-info()  { echo -e "${BLUE}[INFO]${NC} $*"; }
-ok()    { echo -e "${GREEN}[OK]${NC} $*"; }
-warn()  { echo -e "${YELLOW}[WARN]${NC} $*"; }
-error() { echo -e "${RED}[ERROR]${NC} $*"; }
-
-ROOT_DEV=$(findmnt -n -o SOURCE /)
-ROOT_FSTYPE=$(findmnt -n -o FSTYPE /)
-info "Root device: $ROOT_DEV"
-info "Root filesystem: $ROOT_FSTYPE"
-
-# Ensure growpart is available
-if ! command -v growpart &>/dev/null; then
-    info "Installing growpart (cloud-guest-utils)..."
-    sudo apt-get update -qq && sudo apt-get install -y -qq cloud-guest-utils
-fi
-
-resize_fs() {
-    local dev="$1"
-    local fstype="$2"
-    if [[ "$fstype" == "ext4" || "$fstype" == "ext3" || "$fstype" == "ext2" ]]; then
-        info "Running resize2fs on $dev..."
-        if ! sudo resize2fs "$dev"; then
-            error "resize2fs failed on $dev"
-            return 1
-        fi
-    elif [[ "$fstype" == "xfs" ]]; then
-        info "Running xfs_growfs on /..."
-        if ! sudo xfs_growfs /; then
-            error "xfs_growfs failed"
-            return 1
-        fi
-    else
-        error "Unsupported filesystem type: $fstype"
-        return 1
-    fi
-    return 0
-}
-
-# Check if root is on LVM (device-mapper)
-if [[ "$ROOT_DEV" == /dev/mapper/* || "$ROOT_DEV" == /dev/dm-* ]]; then
-    info "LVM layout detected."
-
-    # Find the PV device
-    PV_DEV=$(sudo pvs --noheadings -o pv_name | head -1 | tr -d ' ')
-    if [[ -z "$PV_DEV" ]]; then
-        error "Could not determine PV device."
-        exit 1
-    fi
-    info "PV device: $PV_DEV"
-
-    # Parse disk and partition number (handles /dev/sdaX and /dev/nvmeXnXpX)
-    if [[ "$PV_DEV" =~ ^(/dev/nvme[0-9]+n[0-9]+)p([0-9]+)$ ]]; then
-        DISK="${BASH_REMATCH[1]}"
-        PARTNUM="${BASH_REMATCH[2]}"
-    elif [[ "$PV_DEV" =~ ^(/dev/[a-z]+)([0-9]+)$ ]]; then
-        DISK="${BASH_REMATCH[1]}"
-        PARTNUM="${BASH_REMATCH[2]}"
-    else
-        error "Could not parse disk/partition from PV: $PV_DEV"
-        exit 1
-    fi
-    info "Disk: $DISK, Partition: $PARTNUM"
-
-    # Grow partition
-    info "Growing partition $DISK partition $PARTNUM..."
-    sudo growpart "$DISK" "$PARTNUM" || echo "(growpart: partition may already be at max size)"
-
-    # Resize PV
-    info "Resizing PV $PV_DEV..."
-    if ! sudo pvresize "$PV_DEV"; then
-        error "pvresize failed on $PV_DEV"
-        exit 1
-    fi
-
-    # Resolve LV path if using /dev/dm-*
-    if [[ "$ROOT_DEV" == /dev/dm-* ]]; then
-        LV_PATH=$(sudo lvs --noheadings -o lv_path | head -1 | tr -d ' ')
-    else
-        LV_PATH="$ROOT_DEV"
-    fi
-    info "LV path: $LV_PATH"
-
-    # Extend LV
-    info "Extending LV $LV_PATH to use all free space..."
-    if ! sudo lvextend -l +100%FREE "$LV_PATH"; then
-        warn "lvextend reported no change (LV may already use all space)."
-    fi
-
-    # Resize filesystem
-    resize_fs "$LV_PATH" "$ROOT_FSTYPE"
-    if [[ $? -ne 0 ]]; then
-        exit 1
-    fi
-else
-    info "Direct partition layout detected."
-
-    # Parse disk and partition number
-    if [[ "$ROOT_DEV" =~ ^(/dev/nvme[0-9]+n[0-9]+)p([0-9]+)$ ]]; then
-        DISK="${BASH_REMATCH[1]}"
-        PARTNUM="${BASH_REMATCH[2]}"
-    elif [[ "$ROOT_DEV" =~ ^(/dev/[a-z]+)([0-9]+)$ ]]; then
-        DISK="${BASH_REMATCH[1]}"
-        PARTNUM="${BASH_REMATCH[2]}"
-    else
-        error "Could not parse disk/partition from: $ROOT_DEV"
-        exit 1
-    fi
-    info "Disk: $DISK, Partition: $PARTNUM"
-
-    # Grow partition
-    info "Growing partition $DISK partition $PARTNUM..."
-    sudo growpart "$DISK" "$PARTNUM" || echo "(growpart: partition may already be at max size)"
-
-    # Resize filesystem
-    resize_fs "$ROOT_DEV" "$ROOT_FSTYPE"
-    if [[ $? -ne 0 ]]; then
-        exit 1
-    fi
-fi
-
-ok "Filesystem expansion complete."
-df -h /
-REMOTE_SCRIPT
-
-if [[ $? -ne 0 ]]; then
-    error "Filesystem expansion failed on the guest."
-    exit 1
-fi
-ok "Filesystem expanded."
-
-# --- Step 6: Uncordon node ---
-info "Step 6/7: Uncordoning node '$NODE_NAME'..."
-if ! $KUBECTL uncordon "$NODE_NAME"; then
-    error "Failed to uncordon node '$NODE_NAME'."
-    exit 1
-fi
-DRAINED_NODE=""
-ok "Node uncordoned."
-
-# --- Step 7: Verify ---
-info "Step 7/7: Verification"
-echo ""
-info "Disk usage on $NODE_NAME:"
-ssh -o StrictHostKeyChecking=no "$VM_SSH_USER@$NODE_IP" "df -h /"
-echo ""
-info "Node status:"
-$KUBECTL get node "$NODE_NAME"
-echo ""
-ok "Storage extension complete for $NODE_NAME."
--- a/scripts/fan-control.env.example
+++ b/scripts/fan-control.env.example
@ -1,21 +0,0 @@
-# /etc/fan-control.env  —  config for the fan-control daemon (chmod 600).
-# Deployed manually to the PVE host; the real file holds a secret token and is
-# NOT committed. Copy this template, fill HA_TOKEN, scp to /etc/fan-control.env.
-
-# Long-lived ha-sofia access token (Home Assistant -> Profile -> Security ->
-# Long-lived access tokens). Empty => presence disabled, daemon runs COOL-only.
-HA_TOKEN=
-
-# --- optional overrides (defaults shown) ---
-# HA_URL=http://192.168.1.8:8123
-# GARAGE_ENTITY=sensor.garage_door_state_bg
-# GARAGE_OPEN_STATE=Отворена
-# HOLD_SECS=900            # quiet-mode hold after last garage activity (15 min)
-# LOOP_INTERVAL=15
-# PRESENCE_INTERVAL=30
-# DEADBAND=3
-# CEILING=83               # degC: hand back to Dell auto at/above this
-# RESUME_BELOW=75
-# RESUME_STABLE=120
-# MAX_IPMI_FAILS=3
-PUSHGATEWAY_URL=http://10.0.20.100:30091
--- a/scripts/fan-control.service
+++ b/scripts/fan-control.service
@ -1,21 +0,0 @@
-[Unit]
-Description=Presence-aware IPMI fan controller (Dell R730, garage)
-Documentation=https://github.com/ViktorBarzin/infra/blob/master/scripts/fan-control.sh
-After=network-online.target
-Wants=network-online.target
-
-[Service]
-Type=simple
-EnvironmentFile=-/etc/fan-control.env
-ExecStart=/usr/local/bin/fan-control
-# Belt-and-suspenders: whatever happens to the daemon, hand the fans back to
-# the iDRAC's own automatic curve so the box is never stuck in manual mode.
-ExecStopPost=/usr/bin/ipmitool raw 0x30 0x30 0x01 0x01
-Restart=on-failure
-RestartSec=10
-StandardOutput=journal
-StandardError=journal
-SyslogIdentifier=fan-control
-
-[Install]
-WantedBy=multi-user.target
--- a/scripts/fan-control.sh
+++ b/scripts/fan-control.sh
@ -1,262 +0,0 @@
-#!/usr/bin/env bash
-# Presence-aware IPMI fan controller for the Dell R730 PVE host (192.168.1.127).
-#
-# The server lives in the GARAGE (memory id=1723). Two curves, picked by
-# whether someone is physically in the garage:
-#   - COOL  : garage empty -> minimise CPU temp, noise is free.
-#   - QUIET : someone in the garage -> minimise noise, accept a warmer CPU.
-# Presence comes from the ha-sofia garage-door sensor: door open now, OR it
-# last changed within HOLD_SECS, => QUIET. Otherwise COOL.
-#
-# Safety (manual fan mode bypasses the iDRAC's own curve, so we backstop it):
-#   - On ANY exit (crash/stop/TERM) the EXIT trap hands fans back to Dell
-#     automatic control (raw 0x30 0x30 0x01 0x01). systemd ExecStopPost
-#     repeats this belt-and-suspenders.
-#   - CPU >= CEILING -> hand back to Dell auto until it recovers (RESUME_BELOW
-#     held for RESUME_STABLE s). The firmware's own emergency cooling takes over.
-#   - IPMI read failures (>= MAX_IPMI_FAILS) -> hand back to Dell auto.
-#
-# Deploy: scp to /usr/local/bin/fan-control (strip .sh) + install
-# fan-control.service + /etc/fan-control.env. Same pattern as apply-mbps-caps.
-# Tests: test-fan-control.sh (sources this file, exercises the pure functions).
-# Design: infra/docs/plans/2026-06-04-pve-fan-control-design.md
-# Runbook: infra/docs/runbooks/fan-control.md
-
-set -uo pipefail
-
-# ---- configuration (override via /etc/fan-control.env) ----
-: "${IPMITOOL:=ipmitool}"
-: "${LOOP_INTERVAL:=15}"             # seconds between temperature decisions
-: "${PRESENCE_INTERVAL:=30}"         # seconds between ha-sofia garage-door polls
-: "${DEADBAND:=3}"                   # degC hysteresis applied to downward fan steps
-: "${CEILING:=83}"                   # degC: hand back to Dell auto at/above this
-: "${RESUME_BELOW:=75}"              # degC: eligible to resume manual below this...
-: "${RESUME_STABLE:=120}"            # ...once held that long
-: "${HOLD_SECS:=900}"                # quiet-mode hold after last garage activity (15 min)
-: "${HA_URL:=http://192.168.1.8:8123}"
-: "${HA_TOKEN:=}"                    # long-lived ha-sofia token; empty => presence disabled (COOL only)
-: "${GARAGE_ENTITY:=sensor.garage_door_state_bg}"
-: "${GARAGE_OPEN_STATE:=Отворена}"   # ha state string meaning "open"
-# HA control: a mode select + manual % the user drives from Home Assistant.
-# auto => garage-presence curve (default); cool/quiet => force that curve;
-# manual => hold MANUAL_ENTITY %. Empty HA_TOKEN or unreachable HA => auto.
-: "${MODE_ENTITY:=input_select.r730_fan_mode}"
-: "${MANUAL_ENTITY:=input_number.r730_fan_manual_pct}"
-: "${PUSHGATEWAY_URL:=}"             # optional Prometheus Pushgateway base URL
-: "${MAX_IPMI_FAILS:=3}"
-: "${DRY_RUN:=0}"                    # 1 => log IPMI actions instead of executing
-: "${RUN_ONCE:=0}"                   # 1 => one iteration then exit (testing)
-
-# Continuous LINEAR fan curve (2026-06-05): fan% ramps proportionally with CPU
-# temp between (T_LO,P_LO) and (T_HI,P_HI), clamped flat outside. Replaces the old
-# discrete step-bands (which flapped at band edges — e.g. 45<->65%). Both modes
-# reach 100% right at the 83°C ceiling. Anchors are env-tunable.
-#   COOL  (garage empty):  30% @50°C .. 100% @83°C  (~2.1%/°C; equilibrium ~60°C/~51%)
-#   QUIET (someone there): 20% @68°C .. 100% @83°C  (near-silent until ~70°C)
-# Web-researched: a linear curve + 2-3°C hysteresis is the homelab standard; PID is
-# overkill for this slow thermal loop. See docs/plans/2026-06-04-pve-fan-control-design.md.
-: "${COOL_T_LO:=50}"; : "${COOL_P_LO:=30}"; : "${COOL_T_HI:=83}"; : "${COOL_P_HI:=100}"
-: "${QUIET_T_LO:=68}"; : "${QUIET_P_LO:=20}"; : "${QUIET_T_HI:=83}"; : "${QUIET_P_HI:=100}"
-: "${MIN_STEP:=3}"   # min fan-% change worth an IPMI write (anti-jitter on the smooth curve)
-
-log() { printf '%s %s\n' "$(date '+%Y-%m-%dT%H:%M:%S%z')" "$*"; }
-
-# ---- pure functions (no side effects; unit-tested) ----
-
-# fc_curve <mode> <temp> -> fan percent (continuous linear interpolation between
-# the per-mode (T_LO,P_LO)..(T_HI,P_HI) anchors; clamped flat outside the range).
-fc_curve() {
-  local mode="$1" temp="$2" tlo plo thi phi
-  if [[ "$mode" == "quiet" ]]; then tlo=$QUIET_T_LO; plo=$QUIET_P_LO; thi=$QUIET_T_HI; phi=$QUIET_P_HI
-  else tlo=$COOL_T_LO; plo=$COOL_P_LO; thi=$COOL_T_HI; phi=$COOL_P_HI; fi
-  if (( temp <= tlo )); then echo "$plo"; return 0; fi
-  if (( temp >= thi )); then echo "$phi"; return 0; fi
-  echo $(( plo + ( (temp - tlo) * (phi - plo) + (thi - tlo) / 2 ) / (thi - tlo) ))  # rounded
-}
-
-# fc_decide <mode> <temp> <current_pct> <deadband> -> fan percent
-# Ramps up immediately; only steps down once the curve still wants a lower
-# percent even DEADBAND degrees hotter (prevents flapping at band edges).
-fc_decide() {
-  local mode="$1" temp="$2" current="$3" deadband="$4" target
-  target="$(fc_curve "$mode" "$temp")"
-  if (( current < 0 || target >= current )); then echo "$target"; return 0; fi
-  if (( $(fc_curve "$mode" "$((temp + deadband))") < current )); then echo "$target"; else echo "$current"; fi
-}
-
-# fc_presence_mode <state> <last_changed_epoch> <now_epoch> <hold_secs> <open_state> -> quiet|cool
-fc_presence_mode() {
-  local state="$1" lc="$2" now="$3" hold="$4" open="$5"
-  if [[ "$state" == "$open" ]]; then echo "quiet"; return 0; fi
-  if (( now - lc < hold )); then echo "quiet"; return 0; fi
-  echo "cool"
-}
-
-# fc_parse_temp <ipmitool 'Temp' line> -> integer degC
-fc_parse_temp() {
-  echo "$1" | grep -oE '[0-9]+ degrees C' | grep -oE '^[0-9]+' | head -1
-}
-
-# fc_json_str_field <json> <key> -> string value (first match; jq-free)
-fc_json_str_field() {
-  printf '%s' "$1" | grep -oE "\"$2\"[[:space:]]*:[[:space:]]*\"[^\"]*\"" | head -1 \
-    | sed -E "s/.*:[[:space:]]*\"(.*)\"\$/\1/"
-}
-
-# fc_pct_to_hex <pct> -> 0xNN
-fc_pct_to_hex() { printf '0x%02x' "$1"; }
-
-# fc_clamp <pct> -> 0..100
-fc_clamp() { local p="$1"; (( p < 0 )) && p=0; (( p > 100 )) && p=100; echo "$p"; }
-
-# fc_fan_watts <rpm> -> estimated TOTAL fan power (W). The iDRAC reports only
-# total DCMI watts + RPM (no per-fan power), so this is a MODEL: fan power ∝ RPM³
-# (fan affinity law), calibrated to the 2026-06-05 power sweep — fits within ~3W
-# (~2W @4800rpm · ~17W @9360 · ~42W @12720 · ~99W @16920). Integer: 0.0205·(rpm/1e3)³.
-fc_fan_watts() { echo $(( $1 * $1 * $1 * 205 / 10000000000000 )); }
-
-# fc_resolve <ha_mode> <temp> <manual_pct> <presence> <current> <deadband> -> pct
-# HA mode resolution (the hard ceiling is handled by the caller):
-#   manual      -> clamp(manual_pct), no hysteresis
-#   cool|quiet  -> that curve (with hysteresis)
-#   auto (else) -> presence-driven curve (garage door)
-fc_resolve() {
-  local ha_mode="$1" temp="$2" manual_pct="$3" presence="$4" current="$5" deadband="$6"
-  if [[ "$ha_mode" == "manual" ]]; then fc_clamp "$manual_pct"; return 0; fi
-  local eff; [[ "$ha_mode" == "auto" ]] && eff="$presence" || eff="$ha_mode"
-  fc_decide "$eff" "$temp" "$current" "$deadband"
-}
-
-# ---- side-effecting wrappers ----
-
-ipmi_manual_on=0
-
-set_manual() {  # <pct>
-  local pct="$1" hex; hex="$(fc_pct_to_hex "$pct")"
-  if (( DRY_RUN == 1 )); then log "DRY set fan ${pct}% (${hex})"; ipmi_manual_on=1; return 0; fi
-  if (( ipmi_manual_on == 0 )); then
-    "$IPMITOOL" raw 0x30 0x30 0x01 0x00 >/dev/null 2>&1 || return 1
-    ipmi_manual_on=1
-  fi
-  "$IPMITOOL" raw 0x30 0x30 0x02 0xff "$hex" >/dev/null 2>&1
-}
-
-restore_auto() {
-  if (( DRY_RUN == 1 )); then log "DRY restore Dell auto fan control"; ipmi_manual_on=0; return 0; fi
-  "$IPMITOOL" raw 0x30 0x30 0x01 0x01 >/dev/null 2>&1
-  ipmi_manual_on=0
-}
-
-read_cpu_temp() {
-  fc_parse_temp "$("$IPMITOOL" sdr type temperature 2>/dev/null | grep -E '^Temp ' | head -1)"
-}
-
-read_fan_rpm() {  # Fan1 RPM — representative (all 6 fans are set together)
-  "$IPMITOOL" sdr type fan 2>/dev/null | awk -F'|' '/^Fan1/{gsub(/[^0-9]/,"",$5); print $5+0; exit}'
-}
-
-presence_cache="cool"; presence_ts=0
-get_presence() {
-  local now; now="$(date +%s)"
-  if (( now - presence_ts < PRESENCE_INTERVAL )); then echo "$presence_cache"; return 0; fi
-  presence_ts="$now"
-  [[ -z "$HA_TOKEN" ]] && { echo "$presence_cache"; return 0; }
-  local resp state lc_iso lc_epoch
-  resp="$(curl -fsS --max-time 5 -H "Authorization: Bearer $HA_TOKEN" \
-            "$HA_URL/api/states/$GARAGE_ENTITY" 2>/dev/null)" || { echo "$presence_cache"; return 0; }
-  state="$(fc_json_str_field "$resp" state)"
-  [[ -z "$state" ]] && { echo "$presence_cache"; return 0; }
-  lc_iso="$(fc_json_str_field "$resp" last_changed)"
-  lc_epoch="$(date -d "$lc_iso" +%s 2>/dev/null || echo "$now")"
-  presence_cache="$(fc_presence_mode "$state" "$lc_epoch" "$now" "$HOLD_SECS" "$GARAGE_OPEN_STATE")"
-  echo "$presence_cache"
-}
-
-# ha_entity_state <entity> -> state string (empty if HA disabled/unreachable)
-ha_entity_state() {
-  [[ -z "$HA_TOKEN" ]] && return 0
-  local resp
-  resp="$(curl -fsS --max-time 5 -H "Authorization: Bearer $HA_TOKEN" \
-            "$HA_URL/api/states/$1" 2>/dev/null)" || return 0
-  fc_json_str_field "$resp" state
-}
-
-push_metrics() {  # <temp> <pct> <mode> <ha_ok> <fallback> [fan_rpm] [fan_watts_est]
-  [[ -z "$PUSHGATEWAY_URL" ]] && return 0
-  local mode_num; case "$3" in quiet) mode_num=1;; cool) mode_num=2;; manual) mode_num=3;; *) mode_num=0;; esac
-  curl -fsS --max-time 5 --data-binary @- \
-    "$PUSHGATEWAY_URL/metrics/job/fan_control/instance/pve-r730" >/dev/null 2>&1 <<EOF || true
-# TYPE pve_fan_control_cpu_temp_celsius gauge
-pve_fan_control_cpu_temp_celsius $1
-# TYPE pve_fan_control_fan_percent gauge
-pve_fan_control_fan_percent $2
-# TYPE pve_fan_control_mode gauge
-pve_fan_control_mode $mode_num
-# TYPE pve_fan_control_ha_reachable gauge
-pve_fan_control_ha_reachable $4
-# TYPE pve_fan_control_fallback gauge
-pve_fan_control_fallback $5
-# TYPE pve_fan_control_fan_rpm gauge
-pve_fan_control_fan_rpm ${6:-0}
-# TYPE pve_fan_control_fan_watts_est gauge
-pve_fan_control_fan_watts_est ${7:-0}
-EOF
-}
-
-main() {
-  log "fan-control start (loop=${LOOP_INTERVAL}s presence=${PRESENCE_INTERVAL}s hold=${HOLD_SECS}s ceiling=${CEILING}C dry_run=${DRY_RUN})"
-  trap 'log "exit — restoring Dell auto fan control"; restore_auto' EXIT
-  local current=-1 fails=0 in_fallback=0 cool_since=0
-  while true; do
-    local temp; temp="$(read_cpu_temp)"
-    if [[ -z "$temp" ]]; then
-      fails=$((fails + 1)); log "WARN cannot read CPU temp ($fails/$MAX_IPMI_FAILS)"
-      if (( fails >= MAX_IPMI_FAILS )); then log "ERR temp unreadable — Dell auto"; restore_auto; current=-1; fi
-      (( RUN_ONCE == 1 )) && break || { sleep "$LOOP_INTERVAL"; continue; }
-    fi
-    fails=0
-
-    if (( temp >= CEILING )); then
-      (( in_fallback == 0 )) && { log "CEILING temp=${temp}≥${CEILING} — Dell auto"; restore_auto; current=-1; in_fallback=1; }
-      push_metrics "$temp" 0 fallback 1 1
-      (( RUN_ONCE == 1 )) && break || { sleep "$LOOP_INTERVAL"; continue; }
-    fi
-    if (( in_fallback == 1 )); then
-      if (( temp < RESUME_BELOW )); then
-        (( cool_since == 0 )) && cool_since="$(date +%s)"
-        if (( $(date +%s) - cool_since >= RESUME_STABLE )); then
-          log "recovered (temp<${RESUME_BELOW}C ${RESUME_STABLE}s) — resuming manual"; in_fallback=0; cool_since=0
-        else
-          push_metrics "$temp" 0 fallback 1 1; (( RUN_ONCE == 1 )) && break || { sleep "$LOOP_INTERVAL"; continue; }
-        fi
-      else
-        cool_since=0; push_metrics "$temp" 0 fallback 1 1
-        (( RUN_ONCE == 1 )) && break || { sleep "$LOOP_INTERVAL"; continue; }
-      fi
-    fi
-
-    # HA-desired mode (auto/cool/quiet/manual); unreachable/unset => auto.
-    local ha_mode ha_ok=1; ha_mode="$(ha_entity_state "$MODE_ENTITY")"; [[ -z "$HA_TOKEN" ]] && ha_ok=0
-    [[ -z "$ha_mode" ]] && ha_mode="auto"
-    case "$ha_mode" in auto|cool|quiet|manual) ;; *) ha_mode="auto" ;; esac
-    local manual_pct=0
-    if [[ "$ha_mode" == "manual" ]]; then
-      manual_pct="$(ha_entity_state "$MANUAL_ENTITY")"; manual_pct="${manual_pct%%.*}"
-      [[ "$manual_pct" =~ ^[0-9]+$ ]] || manual_pct=0
-    fi
-    local presence="cool"; [[ "$ha_mode" == "auto" ]] && presence="$(get_presence)"
-    local eff; if [[ "$ha_mode" == "manual" ]]; then eff="manual"; elif [[ "$ha_mode" == "auto" ]]; then eff="$presence"; else eff="$ha_mode"; fi
-    local pct; pct="$(fc_resolve "$ha_mode" "$temp" "$manual_pct" "$presence" "$current" "$DEADBAND")"
-    # Only write when first-run or the change clears MIN_STEP (kills 1-2% jitter
-    # on the continuous curve; fc_decide already gives asymmetric hysteresis).
-    if (( current < 0 || pct - current >= MIN_STEP || current - pct >= MIN_STEP )); then
-      if set_manual "$pct"; then log "temp=${temp}C ha_mode=${ha_mode} eff=${eff} fan=${pct}% (was ${current}%)"; current="$pct"
-      else log "WARN set_manual ${pct}% failed"; fi
-    fi
-    local rpm fan_w; rpm="$(read_fan_rpm)"; rpm="${rpm:-0}"; fan_w="$(fc_fan_watts "$rpm")"
-    push_metrics "$temp" "$current" "$eff" "$ha_ok" 0 "$rpm" "$fan_w"
-    (( RUN_ONCE == 1 )) && break || sleep "$LOOP_INTERVAL"
-  done
-}
-
-if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then main "$@"; fi
--- a/scripts/forgejo-migrate-orphan-images.sh
+++ b/scripts/forgejo-migrate-orphan-images.sh
@ -1,76 +0,0 @@
-#!/usr/bin/env bash
-# One-shot migration of every private image on registry.viktorbarzin.me to
-# Forgejo. Used as a stop-gap when the dual-push CI pipelines aren't
-# producing Forgejo images on their own (Forgejo-Woodpecker forge driver
-# context-deadline-exceeded issue, see bd code-d3y / 2026-05-07).
-#
-# Pulls each image from registry.viktorbarzin.me, retags, pushes to
-# forgejo.viktorbarzin.me/viktor/<name>:<tag> — preserving the blob bytes
-# verbatim so the cluster can flip image= without a rebuild.
-#
-# Run from any host with docker + network reach to BOTH registries. Auth
-# from `docker login` (~/.docker/config.json) — make sure both registries
-# are logged in:
-#   docker login registry.viktorbarzin.me -u viktorbarzin
-#   docker login forgejo.viktorbarzin.me -u viktor   # use viktor PAT, not ci-pusher
-#
-# (ci-pusher CANNOT push to viktor/<image> — Forgejo container packages
-# are scoped to the pushing user. Only viktor's PAT can write to viktor/*.)
-#
-# After the script, the new image lives at
-#   forgejo.viktorbarzin.me/viktor/<name>:<tag>
-# Phase 3 of the consolidation flips infra/stacks/<svc>/main.tf image=
-# to that path.
-
-set -euo pipefail
-
-OLD_REG=registry.viktorbarzin.me
-NEW_REG=forgejo.viktorbarzin.me/viktor
-
-# Image list: <name>:<tag>. Generated 2026-05-07 from `grep -rEn 'image\s*=\s*
-# "registry\.viktorbarzin\.me'` across infra/stacks/.
-#
-# Excluded:
-# - wealthfolio-sync: registry repo exists but has 0 tags (CronJob has been
-#   broken for 36+ days, separate decision needed). User to triage before
-#   migration.
-# - fire-planner: registry repo exists but has 0 tags. Dockerfile + CI added
-#   in this session (commit 8b53d99e); rebuild via Woodpecker before flipping.
-IMAGES=(
-  "chrome-service-novnc:v4"
-  "chrome-service-novnc:latest"
-  "payslip-ingest:latest"
-  "job-hunter:latest"
-  "claude-agent-service:latest"
-  "freedify:latest"
-  "beadboard:latest"
-  "infra-ci:latest"
-)
-
-# Migrate every listed image: pull from the old registry, retag, push to
-# Forgejo, then remove the local copies.
-for img in "${IMAGES[@]}"; do
-  echo "=== $img ==="
-  src="$OLD_REG/$img"
-  dst="$NEW_REG/$img"
-
-  # Capture the pull output in a variable instead of a predictable
-  # /tmp/pull-$$ file, and decide on docker's own exit status rather than on
-  # grepping the output for 'Status: ' (with pipefail, grep -q closing the
-  # pipe early could mark even a successful pull as failed).
-  if ! pull_out="$(docker pull "$src" 2>&1)"; then
-    if grep -q 'not found' <<<"$pull_out"; then
-      echo "  SKIP — image not present in source registry"
-      continue
-    fi
-    # Any other failure (auth, network, ...) is fatal: report it here instead
-    # of letting `docker tag` fail afterwards with a less helpful error.
-    printf '%s\n' "$pull_out" >&2
-    echo "  ERROR — docker pull $src failed" >&2
-    exit 1
-  fi
-
-  echo "  tag → $dst"
-  docker tag "$src" "$dst"
-
-  echo "  push $dst"
-  docker push "$dst" 2>&1 | tail -2
-
-  echo "  cleanup local copy"
-  # Best-effort: a failed rmi must not abort the remaining migrations.
-  docker rmi "$src" "$dst" 2>&1 | tail -1 || true
-done
-
-echo ""
-echo "Done. Verify in Forgejo Web UI: https://forgejo.viktorbarzin.me/viktor/-/packages?type=container"
-echo "Phase 3 of the plan flips infra/stacks/{wealthfolio,fire-planner}/main.tf image= references."
--- a/scripts/frigate-bulk-classify.js
+++ b/scripts/frigate-bulk-classify.js
@ -1,698 +0,0 @@
-// Frigate Bulk Classification Labeler
-// Paste this into the browser console on the Frigate /classification page
-// while viewing a model's training images.
-//
-// Image URL pattern: /clips/{modelName}/train/{filename}
-// Categorize API: POST /api/classification/{modelName}/dataset/categorize
-//   body: { category: "...", training_file: "..." }
-// Delete API: POST /api/classification/{modelName}/train/delete
-//   body: { ids: ["..."] }
-// Dataset API: GET /api/classification/{modelName}/dataset
-//   returns: { categories: { catName: [files...] }, training_metadata: {...} }
-
-(async () => {
-  "use strict";
-
-  // --- Configuration ---
-  const API_BASE = window.location.origin + "/api";
-  const TOOLBAR_ID = "bulk-classify-toolbar";
-  // Frigate's axios instance sends these headers on every request.
-  // X-CSRF-TOKEN is required for state-modifying (POST/PUT/DELETE) requests.
-  const API_HEADERS = {
-    "Content-Type": "application/json",
-    "X-CSRF-TOKEN": "1",
-    "X-CACHE-BYPASS": "1",
-  };
-
-  // Abort if already injected
-  if (document.getElementById(TOOLBAR_ID)) {
-    console.log("Bulk classifier already active. Refresh page to re-inject.");
-    return;
-  }
-
-  // --- Extract model name from page ---
-  // Training images use src="/clips/{modelName}/train/{filename}"
-  let modelName = null;
-
-  // Method 1: Extract from training image src on the page
-  for (const img of document.querySelectorAll("img")) {
-    const src = img.getAttribute("src") || "";
-    const m = src.match(/\/clips\/([^/]+)\/train\//);
-    if (m) { modelName = decodeURIComponent(m[1]); break; }
-  }
-
-  // Method 2: List all custom models from config and let the user pick
-  if (!modelName) {
-    try {
-      const resp = await fetch(`${API_BASE}/config`);
-      const config = await resp.json();
-      // Custom classification models are under config.classification.custom
-      const models = Object.keys(config.classification?.custom || {});
-      if (models.length === 1) {
-        modelName = models[0];
-      } else if (models.length > 1) {
-        modelName = prompt(
-          `Multiple classification models found. Enter the model name:\n\n${models.join(", ")}`,
-        );
-      }
-    } catch (_) {}
-  }
-
-  if (!modelName) {
-    alert(
-      "Could not detect model name.\nMake sure you are on the /classification page with training images visible.",
-    );
-    return;
-  }
-
-  console.log(`[bulk-classify] Detected model: "${modelName}"`);
-
-  // --- Fetch categories from the dataset API ---
-  let categories = [];
-  try {
-    const resp = await fetch(`${API_BASE}/classification/${encodeURIComponent(modelName)}/dataset`);
-    const data = await resp.json();
-    // Dataset response: { categories: { catName: [files...] }, training_metadata: {...} }
-    categories = Object.keys(data.categories || data);
-  } catch (e) {
-    console.error("Failed to fetch categories:", e);
-  }
-
-  // Deduplicate
-  categories = [...new Set(categories)];
-  console.log("[bulk-classify] Categories:", categories);
-
-  // --- Fetch all training filenames and build event groups ---
-  // Frigate groups training images by eventId (first two segments of the filename).
-  // Filename format: {timestamp}-{randomId}-{timestamp2}-{label}-{score}.webp
-  // EventId = "{timestamp}-{randomId}"
-  let allTrainFiles = [];
-  const eventGroups = {}; // eventId -> [filename, ...]
-
-  function parseEventId(filename) {
-    const base = filename.replace(/\.webp$/, "");
-    const parts = base.split("-");
-    if (parts.length >= 2) return `${parts[0]}-${parts[1]}`;
-    return filename; // fallback: treat as its own group
-  }
-
-  try {
-    const resp = await fetch(
-      `${API_BASE}/classification/${encodeURIComponent(modelName)}/train`,
-      { headers: API_HEADERS },
-    );
-    allTrainFiles = await resp.json();
-    for (const f of allTrainFiles) {
-      const eid = parseEventId(f);
-      if (!eventGroups[eid]) eventGroups[eid] = [];
-      eventGroups[eid].push(f);
-    }
-    console.log(
-      `[bulk-classify] Loaded ${allTrainFiles.length} training files in ${Object.keys(eventGroups).length} event groups.`,
-    );
-  } catch (e) {
-    console.error("[bulk-classify] Failed to fetch training files:", e);
-  }
-
-  // Get all filenames in the same event group as the given filename
-  function getGroupFiles(filename) {
-    const eid = parseEventId(filename);
-    return eventGroups[eid] || [filename];
-  }
-
-  // --- State ---
-  const selected = new Set();
-
-  // --- Inject styles ---
-  const style = document.createElement("style");
-  style.textContent = `
-    #${TOOLBAR_ID} {
-      position: fixed;
-      bottom: 20px;
-      left: 50%;
-      transform: translateX(-50%);
-      z-index: 99999;
-      background: #1e1e2e;
-      border: 1px solid #444;
-      border-radius: 12px;
-      padding: 12px 20px;
-      display: flex;
-      align-items: center;
-      gap: 12px;
-      box-shadow: 0 8px 32px rgba(0,0,0,0.5);
-      font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
-      font-size: 14px;
-      color: #cdd6f4;
-    }
-    #${TOOLBAR_ID} button {
-      padding: 6px 14px;
-      border: 1px solid #555;
-      border-radius: 6px;
-      background: #313244;
-      color: #cdd6f4;
-      cursor: pointer;
-      font-size: 13px;
-      white-space: nowrap;
-    }
-    #${TOOLBAR_ID} button:hover {
-      background: #45475a;
-    }
-    #${TOOLBAR_ID} button.primary {
-      background: #89b4fa;
-      color: #1e1e2e;
-      border-color: #89b4fa;
-      font-weight: 600;
-    }
-    #${TOOLBAR_ID} button.primary:hover {
-      background: #74c7ec;
-    }
-    #${TOOLBAR_ID} button.primary:disabled {
-      opacity: 0.5;
-      cursor: not-allowed;
-    }
-    #${TOOLBAR_ID} button.danger {
-      background: #f38ba8;
-      color: #1e1e2e;
-      border-color: #f38ba8;
-      font-weight: 600;
-    }
-    #${TOOLBAR_ID} button.danger:hover {
-      background: #eba0ac;
-    }
-    .bulk-classify-dropdown {
-      position: relative;
-      display: inline-block;
-    }
-    .bulk-classify-dropdown-btn {
-      padding: 6px 14px;
-      border: 1px solid #555;
-      border-radius: 6px;
-      background: #313244;
-      color: #cdd6f4;
-      cursor: pointer;
-      font-size: 13px;
-      white-space: nowrap;
-      min-width: 140px;
-      text-align: left;
-    }
-    .bulk-classify-dropdown-btn::after {
-      content: " ▾";
-      float: right;
-      margin-left: 8px;
-    }
-    .bulk-classify-dropdown-menu {
-      display: none;
-      position: absolute;
-      bottom: 100%;
-      left: 0;
-      margin-bottom: 4px;
-      background: #313244;
-      border: 1px solid #555;
-      border-radius: 6px;
-      max-height: 250px;
-      overflow-y: auto;
-      min-width: 180px;
-      box-shadow: 0 -4px 16px rgba(0,0,0,0.4);
-      z-index: 100000;
-    }
-    .bulk-classify-dropdown-menu.open {
-      display: block;
-    }
-    .bulk-classify-dropdown-item {
-      padding: 8px 14px;
-      cursor: pointer;
-      font-size: 13px;
-      color: #cdd6f4;
-      white-space: nowrap;
-    }
-    .bulk-classify-dropdown-item:hover {
-      background: #45475a;
-    }
-    .bulk-classify-dropdown-item.active {
-      background: #89b4fa;
-      color: #1e1e2e;
-    }
-    #${TOOLBAR_ID} .count {
-      font-weight: 600;
-      min-width: 30px;
-      text-align: center;
-    }
-    #${TOOLBAR_ID} .separator {
-      width: 1px;
-      height: 24px;
-      background: #555;
-    }
-    #${TOOLBAR_ID} .progress {
-      font-size: 12px;
-      color: #a6adc8;
-    }
-    .bulk-classify-checkbox {
-      position: absolute;
-      top: 6px;
-      left: 6px;
-      z-index: 9999;
-      width: 22px;
-      height: 22px;
-      cursor: pointer;
-      accent-color: #89b4fa;
-      pointer-events: auto;
-    }
-    .bulk-classify-selected {
-      outline: 3px solid #89b4fa !important;
-      outline-offset: -3px;
-    }
-    .bulk-classify-overlay {
-      position: fixed;
-      inset: 0;
-      z-index: 99998;
-      background: rgba(0,0,0,0.6);
-      display: flex;
-      align-items: center;
-      justify-content: center;
-    }
-    .bulk-classify-dialog {
-      background: #1e1e2e;
-      border: 1px solid #444;
-      border-radius: 12px;
-      padding: 24px;
-      min-width: 350px;
-      color: #cdd6f4;
-      font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
-    }
-    .bulk-classify-dialog h3 {
-      margin: 0 0 16px;
-      font-size: 16px;
-    }
-    .bulk-classify-dialog .progress-bar {
-      width: 100%;
-      height: 8px;
-      background: #313244;
-      border-radius: 4px;
-      overflow: hidden;
-      margin: 12px 0;
-    }
-    .bulk-classify-dialog .progress-fill {
-      height: 100%;
-      background: #89b4fa;
-      transition: width 0.2s;
-    }
-    .bulk-classify-dialog .status {
-      font-size: 13px;
-      color: #a6adc8;
-    }
-  `;
-  document.head.appendChild(style);
-
-  // --- Helper: find all training image cards ---
-  function getImageCards() {
-    // Training images use src="/clips/{modelName}/train/{filename}"
-    // Filenames are like: 1770573871.602803-in4y00-1770573889.027752-none-1.0.webp
-    const pattern = /\/clips\/[^/]+\/train\/([^/?#]+)/;
-    const imgs = document.querySelectorAll("img");
-    const cards = [];
-    const seen = new Set();
-    for (const img of imgs) {
-      const src = img.getAttribute("src") || "";
-      const match = src.match(pattern);
-      if (match && !seen.has(match[1])) {
-        seen.add(match[1]);
-        // Walk up to find the card container (Frigate uses aspect-square divs)
-        let card =
-          img.closest("[class*='aspect-']") ||
-          img.closest("[class*='card']") ||
-          img.parentElement?.parentElement ||
-          img.parentElement;
-        // Resolve the full group of filenames for this card
-        const groupFiles = getGroupFiles(match[1]);
-        cards.push({ element: card, filename: match[1], img, groupFiles });
-      }
-    }
-    return cards;
-  }
-
-  // --- Debug: log what images we found ---
-  const debugImgs = document.querySelectorAll("img");
-  const debugSrcs = Array.from(debugImgs)
-    .map((i) => i.getAttribute("src"))
-    .filter(Boolean);
-  console.log(
-    `[bulk-classify] Found ${debugSrcs.length} <img> elements. Sample srcs:`,
-    debugSrcs.slice(0, 5),
-  );
-  const initialCards = getImageCards();
-  console.log(
-    `[bulk-classify] Matched ${initialCards.length} training image cards.`,
-  );
-
-  // --- Add checkboxes to all cards ---
-  function injectCheckboxes() {
-    const cards = getImageCards();
-    for (const { element, filename, groupFiles } of cards) {
-      if (element.querySelector(".bulk-classify-checkbox")) continue;
-
-      // Ensure relative positioning for absolute checkbox
-      element.style.position = "relative";
-
-      const cb = document.createElement("input");
-      cb.type = "checkbox";
-      cb.className = "bulk-classify-checkbox";
-      cb.dataset.filename = filename;
-      cb.checked = selected.has(filename);
-
-      // Show group count badge next to checkbox if group has >1 image
-      let badge = null;
-      if (groupFiles.length > 1) {
-        badge = document.createElement("span");
-        badge.className = "bulk-classify-badge";
-        badge.textContent = groupFiles.length;
-        badge.style.cssText =
-          "position:absolute;top:6px;left:32px;z-index:9999;background:#89b4fa;color:#1e1e2e;" +
-          "font-size:11px;font-weight:700;padding:1px 5px;border-radius:8px;pointer-events:none;";
-      }
-
-      cb.addEventListener("change", (e) => {
-        e.stopPropagation();
-        if (cb.checked) {
-          // Select ALL files in this event group
-          for (const f of groupFiles) selected.add(f);
-          element.classList.add("bulk-classify-selected");
-        } else {
-          for (const f of groupFiles) selected.delete(f);
-          element.classList.remove("bulk-classify-selected");
-        }
-        updateCount();
-      });
-
-      // Also allow clicking the image to toggle
-      element.addEventListener("click", (e) => {
-        // Don't intercept if clicking the checkbox itself or a button
-        if (
-          e.target === cb ||
-          e.target.closest("button") ||
-          e.target.closest("a")
-        )
-          return;
-        e.preventDefault();
-        e.stopPropagation();
-        cb.checked = !cb.checked;
-        cb.dispatchEvent(new Event("change"));
-      });
-
-      element.prepend(cb);
-      if (badge) element.appendChild(badge);
-    }
-  }
-
-  // --- Toolbar ---
-  const toolbar = document.createElement("div");
-  toolbar.id = TOOLBAR_ID;
-
-  const countLabel = document.createElement("span");
-  countLabel.className = "count";
-  countLabel.textContent = "0";
-
-  const countText = document.createElement("span");
-  countText.textContent = "selected";
-
-  const sep1 = document.createElement("div");
-  sep1.className = "separator";
-
-  const selectAllBtn = document.createElement("button");
-  selectAllBtn.textContent = "Select All";
-  selectAllBtn.addEventListener("click", () => {
-    const cards = getImageCards();
-    for (const { element, groupFiles } of cards) {
-      for (const f of groupFiles) selected.add(f);
-      element.classList.add("bulk-classify-selected");
-      const cb = element.querySelector(".bulk-classify-checkbox");
-      if (cb) cb.checked = true;
-    }
-    updateCount();
-  });
-
-  const deselectBtn = document.createElement("button");
-  deselectBtn.textContent = "Deselect All";
-  deselectBtn.addEventListener("click", () => {
-    const cards = getImageCards();
-    for (const { element, groupFiles } of cards) {
-      for (const f of groupFiles) selected.delete(f);
-      element.classList.remove("bulk-classify-selected");
-      const cb = element.querySelector(".bulk-classify-checkbox");
-      if (cb) cb.checked = false;
-    }
-    updateCount();
-  });
-
-  const sep2 = document.createElement("div");
-  sep2.className = "separator";
-
-  // --- Custom dropdown (replaces native <select> which React intercepts) ---
-  let selectedCategory = "";
-  const dropdown = document.createElement("div");
-  dropdown.className = "bulk-classify-dropdown";
-
-  const dropdownBtn = document.createElement("div");
-  dropdownBtn.className = "bulk-classify-dropdown-btn";
-  dropdownBtn.textContent = "-- pick category --";
-
-  const dropdownMenu = document.createElement("div");
-  dropdownMenu.className = "bulk-classify-dropdown-menu";
-
-  function buildMenuItems() {
-    dropdownMenu.innerHTML = "";
-    for (const cat of categories) {
-      const item = document.createElement("div");
-      item.className = "bulk-classify-dropdown-item";
-      if (cat === selectedCategory) item.classList.add("active");
-      item.textContent = cat;
-      item.addEventListener("mousedown", (e) => {
-        e.preventDefault();
-        e.stopPropagation();
-        selectedCategory = cat;
-        dropdownBtn.textContent = cat;
-        dropdownMenu.classList.remove("open");
-        buildMenuItems(); // refresh active state
-      });
-      dropdownMenu.appendChild(item);
-    }
-  }
-  buildMenuItems();
-
-  dropdownBtn.addEventListener("mousedown", (e) => {
-    e.preventDefault();
-    e.stopPropagation();
-    dropdownMenu.classList.toggle("open");
-  });
-
-  // Close dropdown when clicking outside
-  document.addEventListener("mousedown", (e) => {
-    if (!dropdown.contains(e.target)) {
-      dropdownMenu.classList.remove("open");
-    }
-  });
-
-  dropdown.appendChild(dropdownBtn);
-  dropdown.appendChild(dropdownMenu);
-
-  // Allow typing a new category
-  const newCatInput = document.createElement("input");
-  newCatInput.type = "text";
-  newCatInput.placeholder = "or type new...";
-  newCatInput.style.cssText =
-    "padding:6px 10px;border:1px solid #555;border-radius:6px;background:#313244;color:#cdd6f4;font-size:13px;width:120px;";
-
-  const categorizeBtn = document.createElement("button");
-  categorizeBtn.className = "primary";
-  categorizeBtn.textContent = "Categorize Selected";
-
-  const deleteBtn = document.createElement("button");
-  deleteBtn.className = "danger";
-  deleteBtn.textContent = "Delete Selected";
-
-  toolbar.append(
-    countLabel,
-    countText,
-    sep1,
-    selectAllBtn,
-    deselectBtn,
-    sep2,
-    dropdown,
-    newCatInput,
-    categorizeBtn,
-    deleteBtn,
-  );
-
-  // Prevent events from bubbling out of toolbar to React's root handler
-  for (const evt of ["click", "mousedown", "mouseup", "pointerdown", "pointerup", "focus", "blur"]) {
-    toolbar.addEventListener(evt, (e) => e.stopPropagation());
-  }
-
-  document.body.appendChild(toolbar);
-
-  function updateCount() {
-    countLabel.textContent = selected.size;
-    categorizeBtn.disabled = selected.size === 0;
-  }
-
-  // --- Progress dialog ---
-  function showProgress(title, total) {
-    const overlay = document.createElement("div");
-    overlay.className = "bulk-classify-overlay";
-    const dialog = document.createElement("div");
-    dialog.className = "bulk-classify-dialog";
-    dialog.innerHTML = `
-      <h3>${title}</h3>
-      <div class="status">0 / ${total}</div>
-      <div class="progress-bar"><div class="progress-fill" style="width:0%"></div></div>
-      <div class="errors" style="color:#f38ba8;font-size:12px;margin-top:8px"></div>
-    `;
-    overlay.appendChild(dialog);
-    document.body.appendChild(overlay);
-
-    return {
-      update(current, errorMsg) {
-        const pct = Math.round((current / total) * 100);
-        dialog.querySelector(".status").textContent =
-          `${current} / ${total}`;
-        dialog.querySelector(".progress-fill").style.width = pct + "%";
-        if (errorMsg) {
-          dialog.querySelector(".errors").textContent += errorMsg + "\n";
-        }
-      },
-      close() {
-        overlay.remove();
-      },
-    };
-  }
-
-  // --- Categorize handler ---
-  // POST /api/classification/{modelName}/dataset/categorize
-  // body: { category: "...", training_file: "..." }
-  categorizeBtn.addEventListener("click", async () => {
-    const category = newCatInput.value.trim() || selectedCategory;
-    if (!category) {
-      alert("Select a category or type a new one.");
-      return;
-    }
-    if (selected.size === 0) {
-      alert("No images selected.");
-      return;
-    }
-
-    const files = Array.from(selected);
-    if (
-      !confirm(
-        `Categorize ${files.length} image(s) as "${category}"?`,
-      )
-    )
-      return;
-
-    const progress = showProgress(
-      `Categorizing as "${category}"`,
-      files.length,
-    );
-    let errors = 0;
-
-    for (let i = 0; i < files.length; i++) {
-      try {
-        const resp = await fetch(
-          `${API_BASE}/classification/${encodeURIComponent(modelName)}/dataset/categorize`,
-          {
-            method: "POST",
-            headers: API_HEADERS,
-            body: JSON.stringify({
-              category: category,
-              training_file: files[i],
-            }),
-          },
-        );
-        if (!resp.ok) {
-          const text = await resp.text();
-          progress.update(i + 1, `Failed: ${files[i]} - ${text}`);
-          errors++;
-        } else {
-          progress.update(i + 1);
-        }
-      } catch (e) {
-        progress.update(i + 1, `Error: ${files[i]} - ${e.message}`);
-        errors++;
-      }
-    }
-
-    setTimeout(() => {
-      progress.close();
-      if (errors === 0) {
-        selected.clear();
-        updateCount();
-        alert(
-          `Done! ${files.length} image(s) categorized as "${category}".\nRefreshing the training view...`,
-        );
-        window.location.reload();
-      } else {
-        alert(
-          `Completed with ${errors} error(s). Check console for details.`,
-        );
-      }
-    }, 500);
-  });
-
-  // --- Delete handler ---
-  // POST /api/classification/{modelName}/train/delete
-  // body: { ids: ["filename1", "filename2", ...] }
-  deleteBtn.addEventListener("click", async () => {
-    if (selected.size === 0) {
-      alert("No images selected.");
-      return;
-    }
-
-    const files = Array.from(selected);
-    if (
-      !confirm(
-        `DELETE ${files.length} training image(s)? This cannot be undone.`,
-      )
-    )
-      return;
-
-    const progress = showProgress("Deleting training images", 1);
-
-    try {
-      const resp = await fetch(
-        `${API_BASE}/classification/${encodeURIComponent(modelName)}/train/delete`,
-        {
-          method: "POST",
-          headers: API_HEADERS,
-          body: JSON.stringify({ ids: files }),
-        },
-      );
-      if (!resp.ok) {
-        const text = await resp.text();
-        progress.update(1, `Failed: ${text}`);
-      } else {
-        progress.update(1);
-      }
-    } catch (e) {
-      progress.update(1, `Error: ${e.message}`);
-    }
-
-    setTimeout(() => {
-      progress.close();
-      selected.clear();
-      updateCount();
-      alert(`Deleted ${files.length} training image(s).\nRefreshing...`);
-      window.location.reload();
-    }, 500);
-  });
-
-  // --- Initial injection + MutationObserver for dynamic loading ---
-  injectCheckboxes();
-
-  const observer = new MutationObserver(() => {
-    injectCheckboxes();
-  });
-  observer.observe(document.body, { childList: true, subtree: true });
-
-  updateCount();
-  console.log(
-    `Bulk classifier active for model "${modelName}". ${categories.length} categories found: [${categories.join(", ")}]`,
-  );
-})();
--- a/scripts/frigate-inspect.mjs
+++ b/scripts/frigate-inspect.mjs
@ -1,305 +0,0 @@
-#!/usr/bin/env node
-// Frigate Classification Page Inspector
-// Phase 1: Fetch API data via HTTP to understand the data model
-// Phase 2: Fetch the classification page HTML and parse its DOM structure
-// No browser needed — uses plain HTTP requests.
-
-import { spawn } from "child_process";
-import http from "http";
-
-const KUBE_CONFIG = `${process.cwd()}/config`;
-const LOCAL_PORT = 15000;
-const FRIGATE_NS = "frigate";
-const FRIGATE_SVC = "svc/frigate";
-const FRIGATE_PORT = 80;
-const BASE_URL = `http://localhost:${LOCAL_PORT}`;
-
-async function startPortForward() {
-  console.log(
-    `[port-forward] Starting: kubectl port-forward ${FRIGATE_SVC} ${LOCAL_PORT}:${FRIGATE_PORT} -n ${FRIGATE_NS}`,
-  );
-  const proc = spawn(
-    "kubectl",
-    [
-      "--kubeconfig",
-      KUBE_CONFIG,
-      "port-forward",
-      FRIGATE_SVC,
-      `${LOCAL_PORT}:${FRIGATE_PORT}`,
-      "-n",
-      FRIGATE_NS,
-    ],
-    { stdio: ["ignore", "pipe", "pipe"] },
-  );
-
-  await new Promise((resolve, reject) => {
-    const timer = setTimeout(
-      () => reject(new Error("Port-forward timed out")),
-      15000,
-    );
-    proc.stdout.on("data", (data) => {
-      if (data.toString().includes("Forwarding from")) {
-        clearTimeout(timer);
-        resolve();
-      }
-    });
-    proc.stderr.on("data", (data) => {
-      console.error(`[port-forward stderr] ${data.toString().trim()}`);
-    });
-    proc.on("error", (err) => {
-      clearTimeout(timer);
-      reject(err);
-    });
-    proc.on("exit", (code) => {
-      if (code !== null && code !== 0) {
-        clearTimeout(timer);
-        reject(new Error(`port-forward exited with code ${code}`));
-      }
-    });
-  });
-
-  console.log("[port-forward] Ready");
-  return proc;
-}
-
-function httpGet(path) {
-  return new Promise((resolve, reject) => {
-    const url = `${BASE_URL}${path}`;
-    http.get(url, (res) => {
-      let body = "";
-      res.on("data", (chunk) => (body += chunk));
-      res.on("end", () =>
-        resolve({ status: res.statusCode, body, headers: res.headers }),
-      );
-    }).on("error", (err) => reject(err));
-  });
-}
-
-async function main() {
-  let portForwardProc = null;
-
-  try {
-    portForwardProc = await startPortForward();
-
-    // ================================================================
-    // API INSPECTION
-    // ================================================================
-    console.log("\n" + "=".repeat(80));
-    console.log("API INSPECTION");
-    console.log("=".repeat(80));
-
-    // Get config to find model names
-    const configResp = await httpGet("/api/config");
-    let modelNames = [];
-    if (configResp.status === 200) {
-      try {
-        const config = JSON.parse(configResp.body);
-        // Custom classification models are under config.classification.custom
-        const classificationModels = config.classification?.custom || {};
-        modelNames = Object.keys(classificationModels);
-        console.log(
-          `\n[API] /api/config - Classification models: ${JSON.stringify(modelNames)}`,
-        );
-        console.log(
-          `[API] Classification config:\n${JSON.stringify(config.classification, null, 2)}`,
-        );
-      } catch (e) {
-        console.log(`[API] /api/config - Failed to parse: ${e.message}`);
-        console.log(
-          `[API] Raw (first 500): ${configResp.body.slice(0, 500)}`,
-        );
-      }
-    } else {
-      console.log(`[API] /api/config - HTTP ${configResp.status}`);
-    }
-
-    for (const model of modelNames) {
-      console.log(`\n--- Model: ${model} ---`);
-      const encodedModel = encodeURIComponent(model);
-
-      // Dataset endpoint
-      const datasetResp = await httpGet(
-        `/api/classification/${encodedModel}/dataset`,
-      );
-      if (datasetResp.status === 200) {
-        try {
-          const dataset = JSON.parse(datasetResp.body);
-          // Dataset response: { categories: { catName: [files...] }, training_metadata: {...} }
-          const cats = dataset.categories || dataset;
-          const categories = Object.keys(cats);
-          console.log(`[API] /api/classification/${model}/dataset`);
-          console.log(`  Categories: ${JSON.stringify(categories)}`);
-          for (const cat of categories) {
-            const items = Array.isArray(cats[cat]) ? cats[cat] : [];
-            console.log(
-              `  "${cat}": ${items.length} items, first 3: ${JSON.stringify(items.slice(0, 3))}`,
-            );
-          }
-          if (dataset.training_metadata) {
-            console.log(`  Training metadata: ${JSON.stringify(dataset.training_metadata, null, 2)}`);
-          }
-        } catch (e) {
-          console.log(`  Failed to parse dataset: ${e.message}`);
-        }
-      } else {
-        console.log(
-          `[API] /api/classification/${model}/dataset - HTTP ${datasetResp.status}: ${datasetResp.body.slice(0, 200)}`,
-        );
-      }
-
-      // Train endpoint
-      const trainResp = await httpGet(
-        `/api/classification/${encodedModel}/train`,
-      );
-      if (trainResp.status === 200) {
-        try {
-          const train = JSON.parse(trainResp.body);
-          const entries = Array.isArray(train) ? train : Object.entries(train);
-          console.log(`[API] /api/classification/${model}/train`);
-          console.log(
-            `  Type: ${Array.isArray(train) ? "array" : typeof train}, length/keys: ${Array.isArray(train) ? train.length : Object.keys(train).length}`,
-          );
-          console.log(
-            `  First 5 entries:\n${JSON.stringify(entries.slice(0, 5), null, 2)}`,
-          );
-        } catch (e) {
-          console.log(`  Failed to parse train: ${e.message}`);
-        }
-      } else {
-        console.log(
-          `[API] /api/classification/${model}/train - HTTP ${trainResp.status}: ${trainResp.body.slice(0, 200)}`,
-        );
-      }
-
-      // Try to get a thumbnail URL to understand the image src pattern
-      if (trainResp.status === 200) {
-        try {
-          const train = JSON.parse(trainResp.body);
-          const firstFile = Array.isArray(train) ? train[0] : null;
-          if (firstFile) {
-            // Try various thumbnail URL patterns
-            const patterns = [
-              `/api/classification/${encodedModel}/train/${firstFile}/thumbnail.jpg`,
-              `/api/classification/${encodedModel}/train/${firstFile}`,
-              `/clips/${encodedModel}/train/${firstFile}`,
-            ];
-            for (const p of patterns) {
-              const resp = await httpGet(p);
-              console.log(
-                `  Thumbnail URL test: ${p} -> HTTP ${resp.status} (content-type: ${resp.headers["content-type"]}, size: ${resp.body.length})`,
-              );
-            }
-          }
-        } catch (_) {}
-      }
-    }
-
-    // ================================================================
-    // HTML/DOM INSPECTION
-    // ================================================================
-    console.log("\n" + "=".repeat(80));
-    console.log("HTML / DOM INSPECTION");
-    console.log("=".repeat(80));
-
-    // Fetch the main classification page HTML
-    const classifPageResp = await httpGet("/classification");
-    console.log(
-      `\n[HTML] /classification - HTTP ${classifPageResp.status} (${classifPageResp.body.length} bytes)`,
-    );
-
-    // This is likely a React SPA, so the HTML will be minimal. Let's check.
-    const html = classifPageResp.body;
-    console.log(`[HTML] First 2000 chars:\n${html.slice(0, 2000)}`);
-
-    // Check for any JS bundle references (to find source maps or component names)
-    const scriptMatches = html.match(/<script[^>]*src="([^"]+)"[^>]*>/g) || [];
-    console.log(`\n[HTML] Script tags: ${scriptMatches.length}`);
-    for (const s of scriptMatches) {
-      console.log(`  ${s}`);
-    }
-
-    // Fetch the main JS bundle to look for classification component code
-    const jsMatch = html.match(/src="(\/assets\/[^"]+\.js)"/);
-    if (jsMatch) {
-      console.log(`\n[JS] Fetching main bundle: ${jsMatch[1]}`);
-      const jsResp = await httpGet(jsMatch[1]);
-      if (jsResp.status === 200) {
-        const js = jsResp.body;
-        console.log(`[JS] Bundle size: ${js.length} bytes`);
-
-        // Search for classification-related code patterns
-        const searchTerms = [
-          "classify image as",
-          "Classify image as",
-          "categorize",
-          "/classification/",
-          "dataset/categorize",
-          "training_file",
-          "train/delete",
-          "ModelTraining",
-          "classification",
-        ];
-        for (const term of searchTerms) {
-          const idx = js.indexOf(term);
-          if (idx !== -1) {
-            const context = js.slice(Math.max(0, idx - 200), idx + 200);
-            console.log(`\n[JS] Found "${term}" at offset ${idx}:`);
-            console.log(`  ...${context}...`);
-          }
-        }
-
-        // Look for the dropdown/select implementation
-        const selectTerms = [
-          "combobox",
-          "listbox",
-          "SelectTrigger",
-          "SelectContent",
-          "SelectItem",
-          "Select>",
-          "DropdownMenu",
-        ];
-        for (const term of selectTerms) {
-          const idx = js.indexOf(term);
-          if (idx !== -1) {
-            const context = js.slice(Math.max(0, idx - 150), idx + 150);
-            console.log(`\n[JS] Found "${term}" at offset ${idx}:`);
-            console.log(`  ...${context}...`);
-          }
-        }
-      }
-    }
-
-    // Also check if there are multiple JS chunks
-    const allJsMatches =
-      html.match(/src="(\/assets\/[^"]+\.js)"/g) || [];
-    console.log(`\n[JS] All JS assets: ${allJsMatches.length}`);
-    for (const m of allJsMatches) {
-      const path = m.match(/src="([^"]+)"/)?.[1];
-      if (path) console.log(`  ${path}`);
-    }
-
-    // Try to fetch the Frigate source for classification view from GitHub
-    console.log("\n" + "=".repeat(80));
-    console.log("FRIGATE VERSION");
-    console.log("=".repeat(80));
-
-    const versionResp = await httpGet("/api/version");
-    if (versionResp.status === 200) {
-      console.log(`[API] Frigate version: ${versionResp.body}`);
-    }
-
-    console.log("\n" + "=".repeat(80));
-    console.log("INSPECTION COMPLETE");
-    console.log("=".repeat(80));
-  } catch (err) {
-    console.error(`\n[ERROR] ${err.message}`);
-    console.error(err.stack);
-  } finally {
-    if (portForwardProc) {
-      console.log("\n[cleanup] Killing port-forward...");
-      portForwardProc.kill("SIGTERM");
-    }
-  }
-}
-
-main().catch(console.error);
--- a/scripts/gen_service_stacks.py
+++ b/scripts/gen_service_stacks.py
@ -1,511 +0,0 @@
-#!/usr/bin/env python3
-"""Generate Terragrunt service stack files for all app-level services."""
-import os
-import textwrap
-
-REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-
-# Each service: (module_name, source_dir, [(arg_name, var_expr), ...], tier)
-# var_expr is what goes on the right side of = in the module call.
-# If var_expr starts with "var.", it's a variable passthrough and we declare the variable.
-# If it's a literal string, we inline it.
-# Special: "LOCAL_TIER" means we use local.tiers.<tier>
-SERVICES = [
-    ("blog", "blog", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("descheduler", "descheduler", []),
-("f1-stream", "f1-stream", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-        ("turn_secret", "var.coturn_turn_secret"),
-        ("public_ip", "var.public_ip"),
-    ]),
-    ("coturn", "coturn", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:edge"),
-        ("turn_secret", "var.coturn_turn_secret"),
-        ("public_ip", "var.public_ip"),
-    ]),
-    ("hackmd", "hackmd", [
-        ("hackmd_db_password", "var.hackmd_db_password"),
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:edge"),
-    ]),
-    ("kms", "kms", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("k8s-dashboard", "k8s-dashboard", [
-        ("tier", "LOCAL_TIER:cluster"),
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("client_certificate_secret_name", "var.client_certificate_secret_name"),
-    ]),
-    ("privatebin", "privatebin", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:edge"),
-    ]),
-    ("reloader", "reloader", [
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("shadowsocks", "shadowsocks", [
-        ("password", "var.shadowsocks_password"),
-        ("tier", "LOCAL_TIER:edge"),
-    ]),
-    ("city-guesser", "city-guesser", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("echo", "echo", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:edge"),
-    ]),
-    ("url", "url-shortener", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("geolite_license_key", "var.url_shortener_geolite_license_key"),
-        ("api_key", "var.url_shortener_api_key"),
-        ("mysql_password", "var.url_shortener_mysql_password"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("webhook_handler", "webhook_handler", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("webhook_secret", "var.webhook_handler_secret"),
-        ("fb_verify_token", "var.webhook_handler_fb_verify_token"),
-        ("fb_page_token", "var.webhook_handler_fb_page_token"),
-        ("fb_app_secret", "var.webhook_handler_fb_app_secret"),
-        ("git_user", "var.webhook_handler_git_user"),
-        ("git_token", "var.webhook_handler_git_token"),
-        ("ssh_key", "var.webhook_handler_ssh_key"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("excalidraw", "excalidraw", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("travel_blog", "travel_blog", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("dashy", "dashy", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("send", "send", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("ytdlp", "youtube_dl", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-        ("openrouter_api_key", "var.openrouter_api_key"),
-        ("slack_bot_token", "var.slack_bot_token"),
-        ("slack_channel", "var.slack_channel"),
-    ]),
-    ("immich", "immich", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("postgresql_password", "var.immich_postgresql_password"),
-        ("frame_api_key", "var.immich_frame_api_key"),
-        ("homepage_token", 'var.homepage_credentials["immich"]["token"]'),
-        ("tier", "LOCAL_TIER:gpu"),
-    ]),
-    ("resume", "resume", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-        ("database_url", "var.resume_database_url"),
-        ("auth_secret", "var.resume_auth_secret"),
-        ("smtp_password", 'var.mailserver_accounts["info@viktorbarzin.me"]'),
-    ]),
-    ("frigate", "frigate", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:gpu"),
-    ]),
-    ("paperless-ngx", "paperless-ngx", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("db_password", "var.paperless_db_password"),
-        ("homepage_username", 'var.homepage_credentials["paperless-ngx"]["username"]'),
-        ("homepage_password", 'var.homepage_credentials["paperless-ngx"]["password"]'),
-        ("tier", "LOCAL_TIER:edge"),
-    ]),
-    ("jsoncrack", "jsoncrack", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("servarr", "servarr", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-        ("aiostreams_database_connection_string", "var.aiostreams_database_connection_string"),
-    ]),
-    ("ollama", "ollama", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:gpu"),
-        ("ollama_api_credentials", "var.ollama_api_credentials"),
-    ]),
-    ("ntfy", "ntfy", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("cyberchef", "cyberchef", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("diun", "diun", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("diun_nfty_token", "var.diun_nfty_token"),
-        ("diun_slack_url", "var.diun_slack_url"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("meshcentral", "meshcentral", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("netbox", "netbox", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("nextcloud", "nextcloud", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("db_password", "var.nextcloud_db_password"),
-        ("tier", "LOCAL_TIER:edge"),
-    ]),
-    ("homepage", "homepage", [
-        ("tier", "LOCAL_TIER:aux"),
-        ("tls_secret_name", "var.tls_secret_name"),
-    ]),
-    ("matrix", "matrix", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("linkwarden", "linkwarden", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("postgresql_password", "var.linkwarden_postgresql_password"),
-        ("authentik_client_id", "var.linkwarden_authentik_client_id"),
-        ("authentik_client_secret", "var.linkwarden_authentik_client_secret"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("actualbudget", "actualbudget", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:edge"),
-        ("credentials", "var.actualbudget_credentials"),
-    ]),
-    ("owntracks", "owntracks", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("owntracks_credentials", "var.owntracks_credentials"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("dawarich", "dawarich", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("database_password", "var.dawarich_database_password"),
-        ("geoapify_api_key", "var.geoapify_api_key"),
-        ("tier", "LOCAL_TIER:edge"),
-    ]),
-    ("changedetection", "changedetection", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("tandoor", "tandoor", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tandoor_database_password", "var.tandoor_database_password"),
-        ("tandoor_email_password", "var.tandoor_email_password"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("n8n", "n8n", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("postgresql_password", "var.n8n_postgresql_password"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("real-estate-crawler", "real-estate-crawler", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("db_password", "var.realestate_crawler_db_password"),
-        ("notification_settings", "var.realestate_crawler_notification_settings"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("osm_routing", "osm-routing", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("tor-proxy", "tor-proxy", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("onlyoffice", "onlyoffice", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("db_password", "var.onlyoffice_db_password"),
-        ("jwt_token", "var.onlyoffice_jwt_token"),
-        ("tier", "LOCAL_TIER:edge"),
-    ]),
-    ("forgejo", "forgejo", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:edge"),
-    ]),
-    ("freshrss", "freshrss", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("navidrome", "navidrome", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("networking-toolbox", "networking-toolbox", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("tuya-bridge", "tuya-bridge", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:cluster"),
-        ("tiny_tuya_api_key", "var.tiny_tuya_api_key"),
-        ("tiny_tuya_api_secret", "var.tiny_tuya_api_secret"),
-        ("tiny_tuya_service_secret", "var.tiny_tuya_service_secret"),
-        ("slack_url", "var.tiny_tuya_slack_url"),
-    ]),
-    ("stirling-pdf", "stirling-pdf", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("isponsorblocktv", "isponsorblocktv", [
-        ("tier", "LOCAL_TIER:edge"),
-    ]),
-    ("ebook2audiobook", "ebook2audiobook", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:gpu"),
-    ]),
-    ("rybbit", "rybbit", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("clickhouse_password", "var.clickhouse_password"),
-        ("postgres_password", "var.clickhouse_postgres_password"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("wealthfolio", "wealthfolio", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("wealthfolio_password_hash", "var.wealthfolio_password_hash"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("speedtest", "speedtest", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-        ("db_password", "var.speedtest_db_password"),
-    ]),
-    ("freedify", "freedify", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-        ("additional_credentials", "var.freedify_credentials"),
-    ]),
-    ("affine", "affine", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("postgresql_password", "var.affine_postgresql_password"),
-        ("smtp_password", 'var.mailserver_accounts["info@viktorbarzin.me"]'),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("plotting-book", "plotting-book", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("health", "health", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("postgresql_password", "var.health_postgresql_password"),
-        ("secret_key", "var.health_secret_key"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("whisper", "whisper", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("tier", "LOCAL_TIER:gpu"),
-    ]),
-    ("grampsweb", "grampsweb", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("smtp_password", 'var.mailserver_accounts["info@viktorbarzin.me"]'),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-    ("openclaw", "openclaw", [
-        ("tls_secret_name", "var.tls_secret_name"),
-        ("ssh_key", "var.openclaw_ssh_key"),
-        ("skill_secrets", "var.openclaw_skill_secrets"),
-        ("gemini_api_key", "var.gemini_api_key"),
-        ("llama_api_key", "var.llama_api_key"),
-        ("brave_api_key", "var.brave_api_key"),
-        ("modal_api_key", "var.modal_api_key"),
-        ("tier", "LOCAL_TIER:aux"),
-    ]),
-]
-
-# Variable type overrides (var_name -> type declaration)
-VAR_TYPES = {
-    "tls_secret_name": "string",
-    "client_certificate_secret_name": "string",
-    "public_ip": "string",
-    "hackmd_db_password": "string",
-    "shadowsocks_password": "string",
-    "openrouter_api_key": "string",
-    "slack_bot_token": "string",
-    "slack_channel": "string",
-    "ollama_api_credentials": "string",
-    "clickhouse_password": "string",
-    "clickhouse_postgres_password": "string",
-    "wealthfolio_password_hash": "string",
-    "speedtest_db_password": "string",
-    "affine_postgresql_password": "string",
-    "health_postgresql_password": "string",
-    "health_secret_key": "string",
-    "gemini_api_key": "string",
-    "llama_api_key": "string",
-    "brave_api_key": "string",
-    "modal_api_key": "string",
-    "coturn_turn_secret": "string",
-    "onlyoffice_db_password": "string",
-    "onlyoffice_jwt_token": "string",
-    "resume_database_url": "string",
-    "resume_auth_secret": "string",
-    "nextcloud_db_password": "string",
-    "paperless_db_password": "string",
-    "diun_nfty_token": "string",
-    "diun_slack_url": "string",
-    "dawarich_database_password": "string",
-    "geoapify_api_key": "string",
-    "tandoor_database_password": "string",
-    "tandoor_email_password": "string",
-    "n8n_postgresql_password": "string",
-    "realestate_crawler_db_password": "string",
-    "immich_postgresql_password": "string",
-    "immich_frame_api_key": "string",
-    "linkwarden_postgresql_password": "string",
-    "linkwarden_authentik_client_id": "string",
-    "linkwarden_authentik_client_secret": "string",
-    "aiostreams_database_connection_string": "string",
-    "tiny_tuya_api_key": "string",
-    "tiny_tuya_api_secret": "string",
-    "tiny_tuya_service_secret": "string",
-    "tiny_tuya_slack_url": "string",
-    "url_shortener_geolite_license_key": "string",
-    "url_shortener_api_key": "string",
-    "url_shortener_mysql_password": "string",
-    "webhook_handler_secret": "string",
-    "webhook_handler_fb_verify_token": "string",
-    "webhook_handler_fb_page_token": "string",
-    "webhook_handler_fb_app_secret": "string",
-    "webhook_handler_git_user": "string",
-    "webhook_handler_git_token": "string",
-    "webhook_handler_ssh_key": "string",
-    "openclaw_ssh_key": "string",
-    "openclaw_skill_secrets": "map(string)",
-    "actualbudget_credentials": "map(any)",
-    "freedify_credentials": "map(any)",
-    "realestate_crawler_notification_settings": "map(string)",
-    "homepage_credentials": "map(any)",
-    "mailserver_accounts": "map(any)",
-    "owntracks_credentials": "string",
-}
-
-TERRAGRUNT_HCL = """\
-include "root" {
-  path = find_in_parent_folders()
-}
-
-dependency "platform" {
-  config_path  = "../platform"
-  skip_outputs = true
-}
-"""
-
-TIERS_BLOCK = """\
-locals {
-  tiers = {
-    core    = "0-core"
-    cluster = "1-cluster"
-    gpu     = "2-gpu"
-    edge    = "3-edge"
-    aux     = "4-aux"
-  }
-}
-"""
-
-
-def extract_var_name(expr):
-    """Extract variable name from var.xxx or var.xxx["yyy"]["zzz"]."""
-    if not expr.startswith("var."):
-        return None
-    # Get the base variable name (before any indexing)
-    name = expr[4:]
-    bracket = name.find("[")
-    if bracket != -1:
-        name = name[:bracket]
-    return name
-
-
-def gen_main_tf(mod_name, source_dir, args):
-    """Generate main.tf content for a service stack."""
-    lines = []
-
-    # Collect variables needed
-    vars_needed = {}
-    needs_tiers = False
-    for arg_name, var_expr in args:
-        if var_expr.startswith("LOCAL_TIER:"):
-            needs_tiers = True
-            continue
-        vname = extract_var_name(var_expr)
-        if vname and vname not in vars_needed:
-            vtype = VAR_TYPES.get(vname, None)
-            vars_needed[vname] = vtype
-
-    # Variable declarations
-    for vname, vtype in vars_needed.items():
-        if vtype:
-            lines.append(f'variable "{vname}" {{ type = {vtype} }}')
-        else:
-            lines.append(f'variable "{vname}" {{}}')
-
-    if vars_needed:
-        lines.append("")
-
-    # Tiers block if needed
-    if needs_tiers:
-        lines.append(TIERS_BLOCK)
-
-    # Module call
-    lines.append(f'module "{mod_name}" {{')
-    lines.append(f'  source = "../../modules/kubernetes/{source_dir}"')
-    for arg_name, var_expr in args:
-        if var_expr.startswith("LOCAL_TIER:"):
-            tier = var_expr.split(":")[1]
-            val = f"local.tiers.{tier}"
-        else:
-            val = var_expr
-        # Pad for alignment
-        lines.append(f"  {arg_name:30s} = {val}")
-    lines.append("}")
-    lines.append("")
-
-    return "\n".join(lines)
-
-
-def main():
-    stacks_dir = os.path.join(REPO_ROOT, "stacks")
-
-    for mod_name, source_dir, args in SERVICES:
-        # Use source_dir as the stack directory name for consistency
-        # But some modules have different names than source dirs
-        # Use the module name for the stack dir
-        stack_dir = os.path.join(stacks_dir, mod_name)
-        os.makedirs(stack_dir, exist_ok=True)
-
-        # terragrunt.hcl
-        tg_path = os.path.join(stack_dir, "terragrunt.hcl")
-        with open(tg_path, "w") as f:
-            f.write(TERRAGRUNT_HCL)
-
-        # main.tf
-        main_path = os.path.join(stack_dir, "main.tf")
-        with open(main_path, "w") as f:
-            f.write(gen_main_tf(mod_name, source_dir, args))
-
-        # secrets symlink
-        secrets_link = os.path.join(stack_dir, "secrets")
-        if not os.path.exists(secrets_link):
-            os.symlink("../../secrets", secrets_link)
-
-        print(f"  Created stacks/{mod_name}/")
-
-    print(f"\nGenerated {len(SERVICES)} service stacks")
-
-
-if __name__ == "__main__":
-    main()
--- a/scripts/graceful-db-maintenance.sh
+++ b/scripts/graceful-db-maintenance.sh
@ -1,143 +0,0 @@
-#!/usr/bin/env bash
-# graceful-db-maintenance.sh — Scale down/up dependents of a service
-# based on the dependency.kyverno.io/wait-for pod annotation.
-#
-# Usage:
-#   ./scripts/graceful-db-maintenance.sh shutdown mysql.dbaas
-#   # ... perform maintenance ...
-#   ./scripts/graceful-db-maintenance.sh startup mysql.dbaas
-#
-# The shutdown action saves original replica counts to a state file
-# so startup can restore them exactly.
-
-set -euo pipefail
-
-ACTION="${1:-}"
-SERVICE="${2:-}"
-STATE_DIR="/tmp"
-
-usage() {
-  echo "Usage: $0 <shutdown|startup> <service>"
-  echo ""
-  echo "Examples:"
-  echo "  $0 shutdown mysql.dbaas      # Scale down all MySQL dependents"
-  echo "  $0 startup  mysql.dbaas      # Restore all MySQL dependents"
-  echo "  $0 shutdown postgresql.dbaas  # Scale down all PostgreSQL dependents"
-  echo "  $0 shutdown redis.redis       # Scale down all Redis dependents"
-  exit 1
-}
-
-[[ -z "$ACTION" || -z "$SERVICE" ]] && usage
-[[ "$ACTION" != "shutdown" && "$ACTION" != "startup" ]] && usage
-
-STATE_FILE="${STATE_DIR}/dep-maintenance-$(echo "$SERVICE" | tr '.' '-').json"
-KUBECONFIG="${KUBECONFIG:-$(dirname "$0")/../config}"
-export KUBECONFIG
-
-# Find all pods with the dependency annotation containing our service
-find_dependent_owners() {
-  local service="$1"
-  kubectl get pods --all-namespaces \
-    -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.annotations.dependency\.kyverno\.io/wait-for}{"\t"}{.metadata.ownerReferences[0].kind}{"\t"}{.metadata.ownerReferences[0].name}{"\n"}{end}' \
-    2>/dev/null | \
-    grep "$service" | \
-    while IFS=$'\t' read -r ns annotation owner_kind owner_name; do
-      [[ -z "$owner_kind" || -z "$owner_name" ]] && continue
-      # Resolve ReplicaSet -> Deployment
-      if [[ "$owner_kind" == "ReplicaSet" ]]; then
-        deploy_name=$(kubectl get replicaset "$owner_name" -n "$ns" \
-          -o jsonpath='{.metadata.ownerReferences[0].name}' 2>/dev/null || true)
-        if [[ -n "$deploy_name" ]]; then
-          echo "Deployment/${deploy_name}/${ns}"
-        fi
-      elif [[ "$owner_kind" == "StatefulSet" ]]; then
-        echo "StatefulSet/${owner_name}/${ns}"
-      fi
-    done | sort -u
-}
-
-do_shutdown() {
-  echo "Finding dependents of $SERVICE..."
-  local owners
-  owners=$(find_dependent_owners "$SERVICE")
-
-  if [[ -z "$owners" ]]; then
-    echo "No dependents found for $SERVICE"
-    exit 0
-  fi
-
-  echo "Dependents found:"
-  echo "$owners" | while IFS='/' read -r kind name ns; do
-    echo "  $ns/$kind/$name"
-  done
-
-  # Save current replica counts
-  local state="[]"
-  while IFS='/' read -r kind name ns; do
-    replicas=$(kubectl get "$kind" "$name" -n "$ns" \
-      -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "1")
-    state=$(echo "$state" | jq --arg kind "$kind" --arg name "$name" \
-      --arg ns "$ns" --argjson replicas "${replicas:-1}" \
-      '. + [{"kind": $kind, "name": $name, "namespace": $ns, "replicas": $replicas}]')
-  done <<< "$owners"
-
-  echo "$state" > "$STATE_FILE"
-  echo "Saved replica state to $STATE_FILE"
-
-  # Scale down
-  while IFS='/' read -r kind name ns; do
-    echo "Scaling $ns/$kind/$name to 0..."
-    kubectl scale "$kind" "$name" -n "$ns" --replicas=0
-  done <<< "$owners"
-
-  echo ""
-  echo "Waiting for pods to terminate..."
-  while IFS='/' read -r kind name ns; do
-    kubectl rollout status "$kind" "$name" -n "$ns" --timeout=120s 2>/dev/null || true
-  done <<< "$owners"
-
-  echo ""
-  echo "All dependents of $SERVICE scaled to 0."
-  echo "Run '$0 startup $SERVICE' after maintenance to restore."
-}
-
-do_startup() {
-  if [[ ! -f "$STATE_FILE" ]]; then
-    echo "Error: No state file found at $STATE_FILE"
-    echo "Did you run '$0 shutdown $SERVICE' first?"
-    exit 1
-  fi
-
-  echo "Restoring dependents of $SERVICE from $STATE_FILE..."
-
-  local count
-  count=$(jq length "$STATE_FILE")
-
-  for ((i = 0; i < count; i++)); do
-    kind=$(jq -r ".[$i].kind" "$STATE_FILE")
-    name=$(jq -r ".[$i].name" "$STATE_FILE")
-    ns=$(jq -r ".[$i].namespace" "$STATE_FILE")
-    replicas=$(jq -r ".[$i].replicas" "$STATE_FILE")
-
-    echo "Scaling $ns/$kind/$name to $replicas..."
-    kubectl scale "$kind" "$name" -n "$ns" --replicas="$replicas"
-  done
-
-  echo ""
-  echo "Waiting for rollouts..."
-  for ((i = 0; i < count; i++)); do
-    kind=$(jq -r ".[$i].kind" "$STATE_FILE")
-    name=$(jq -r ".[$i].name" "$STATE_FILE")
-    ns=$(jq -r ".[$i].namespace" "$STATE_FILE")
-    kubectl rollout status "$kind" "$name" -n "$ns" --timeout=300s 2>/dev/null || true
-  done
-
-  rm -f "$STATE_FILE"
-  echo ""
-  echo "All dependents of $SERVICE restored."
-}
-
-case "$ACTION" in
-  shutdown) do_shutdown ;;
-  startup)  do_startup ;;
-esac
--- a/scripts/image_pull.sh
+++ b/scripts/image_pull.sh
@ -1,10 +0,0 @@
-#!/usr/bin/env bash
-
-for n in $(kubectl get nodes -o wide | grep node | awk '{print $1}'); do 
-    echo $n;
-    kubectl drain $n --ignore-daemonsets --delete-emptydir-data && \
-    ssh wizard@$n < image_pull_remote.sh
-    # Check result
-    kubectl get --raw "/api/v1/nodes/$n/proxy/configz" | jq '.kubeletconfig | {serializeImagePulls, maxParallelImagePulls}'
-    kubectl uncordon $n
-done
--- a/scripts/image_pull_remote.sh
+++ b/scripts/image_pull_remote.sh
@ -1,14 +0,0 @@
-#!/usr/bin/env bash
-
-# Containerd
-sudo sed -i 's/.*max_concurrent_downloads.*/max_concurrent_downloads = 5/g' /etc/containerd/config.toml 
-sudo systemctl restart containerd
-
-# Kubelet
-#sed serializeImagePulls: false # Allow container images to be downloaded in parallel
-#maxParallelImagePulls: 20 # To limit the number of parallel image pulls.
-
-sudo sed -i '/serializeImagePulls:/d' /var/lib/kubelet/config.yaml && \
-sudo sed -i '/maxParallelImagePulls:/d' /var/lib/kubelet/config.yaml && \
-echo -e 'serializeImagePulls: false\nmaxParallelImagePulls: 5' | sudo tee -a /var/lib/kubelet/config.yaml
-sudo systemctl restart kubelet
--- a/scripts/k8s-apiserver-audit-policy.yaml
+++ b/scripts/k8s-apiserver-audit-policy.yaml
@ -1,57 +0,0 @@
-# kube-apiserver audit policy -- k8s-master (10.0.20.100), single control-plane.
-#
-# Goal: a durable "who/when/what" trail for MUTATIONS (create/update/patch/
-# delete) so resource deletions can be attributed even though direct
-# kubectl-to-apiserver calls otherwise leave no trace (see the 2026-06-06
-# novelapp incident: a dashboard delete was attributable, a direct-kubectl
-# recreate was not). Deployed OUTSIDE Terraform (the k8s VMs are not TF-managed,
-# see memory id=1575); this file is the source of truth, scp'd to
-# /etc/kubernetes/audit-policy.yaml and wired into the apiserver static-pod
-# manifest + the kubeadm-config ConfigMap (so "kubeadm upgrade" preserves it).
-#
-# Tuned for LOW WRITE VOLUME (the cluster's sdc HDD is write-sensitive, see
-# memory id=559): reads are dropped entirely, high-churn resources and probe
-# endpoints are dropped, and the verbose RequestReceived stage is omitted, so
-# only one Metadata-level line is written per mutating request.
-apiVersion: audit.k8s.io/v1
-kind: Policy
-# Only emit the post-execution stage -- halves volume vs logging both stages.
-omitStages:
-  - RequestReceived
-rules:
-  # 1. Never log read-only verbs -- the overwhelming majority of traffic and
-  #    irrelevant to "who changed/deleted X".
-  - level: None
-    verbs: ["get", "list", "watch"]
-
-  # 2. Drop high-churn / low-value resources even on writes.
-  - level: None
-    resources:
-      - group: ""
-        resources: ["events", "endpoints", "nodes/status", "pods/status"]
-      - group: "coordination.k8s.io"
-        resources: ["leases"]
-      - group: "discovery.k8s.io"
-        resources: ["endpointslices"]
-      - group: "metrics.k8s.io"
-      - group: "authentication.k8s.io"
-        resources: ["tokenreviews"]
-      - group: "authorization.k8s.io"
-        resources: ["subjectaccessreviews", "selfsubjectaccessreviews"]
-
-  # 3. Drop noisy non-resource probe / discovery URLs.
-  - level: None
-    nonResourceURLs:
-      - "/healthz*"
-      - "/readyz*"
-      - "/livez*"
-      - "/version"
-      - "/metrics"
-      - "/openapi/*"
-      - "/swagger*"
-
-  # 4. Everything else (every create/update/patch/delete on real resources):
-  #    record WHO (user + sourceIP + userAgent), WHAT (resource/namespace/name),
-  #    WHEN, and the verb -- at Metadata level (no request/response bodies, so
-  #    each entry stays small).
-  - level: Metadata
--- a/scripts/kill_ns.sh
+++ b/scripts/kill_ns.sh
@ -1,12 +0,0 @@
-#!/usr/bin/env bash
-set -e
-
-NAMESPACE=$1
-if [ -z "$NAMESPACE" ]; then
-	echo "Pass in parameter namespace"
-	exit 1
-fi
-kubectl proxy &
-kubectl get namespace $NAMESPACE -o json |jq '.spec = {"finalizers":[]}' > /tmp/kill_rogue_ns.json
-curl -k -H "Content-Type: application/json" -X PUT --data-binary @/tmp/kill_rogue_ns.json 127.0.0.1:8001/api/v1/namespaces/$NAMESPACE/finalize
-kill %1
--- a/scripts/lvm-pvc-snapshot.sh
+++ b/scripts/lvm-pvc-snapshot.sh
@ -1,469 +0,0 @@
-#!/usr/bin/env bash
-# lvm-pvc-snapshot — LVM thin snapshot management for Proxmox CSI PVCs
-# Deploy to PVE host at /usr/local/bin/lvm-pvc-snapshot
-set -euo pipefail
-
-# --- Configuration ---
-VG="pve"
-THINPOOL="data"
-SNAP_SUFFIX_FORMAT="%Y%m%d_%H%M"
-RETENTION_DAYS=7
-MIN_FREE_PCT=10
-PUSHGATEWAY="${LVM_SNAP_PUSHGATEWAY:-http://10.0.20.100:30091}"
-PUSHGATEWAY_JOB="lvm-pvc-snapshot"
-LOCKFILE="/run/lvm-pvc-snapshot.lock"
-KUBECONFIG="${KUBECONFIG:-/root/.kube/config}"
-export KUBECONFIG
-
-# Namespaces to exclude from snapshots (high-churn, have app-level dumps)
-# These PVCs cause significant CoW write amplification (~36% overhead)
-EXCLUDE_NAMESPACES="${LVM_SNAP_EXCLUDE_NS:-dbaas,monitoring}"
-
-# --- Logging ---
-# log:  print a timestamped message on stdout.
-log() {
-    local stamp
-    stamp=$(date '+%Y-%m-%d %H:%M:%S')
-    echo "[${stamp}] $*"
-}
-
-# warn: timestamped "WARN:" message on stderr.
-warn() {
-    log "WARN: $*" >&2
-}
-
-# die:  timestamped "FATAL:" message on stderr, then exit with status 1.
-die() {
-    log "FATAL: $*" >&2
-    exit 1
-}
-
-# --- Helpers ---
-
-get_thinpool_free_pct() {
-    # Print the free percentage of ${VG}/${THINPOOL}, computed as
-    # 100 minus the data_percent value reported by lvs.
-    local used_pct
-    used_pct=$(lvs --noheadings --nosuffix -o data_percent "${VG}/${THINPOOL}" 2>/dev/null | tr -d ' ')
-    bc <<< "scale=2; 100 - ${used_pct}"
-}
-
-build_exclude_lv_list() {
-    # Print the LV name (last "/"-separated component of the CSI volumeHandle)
-    # of every Proxmox CSI PV claimed from a namespace in EXCLUDE_NAMESPACES.
-    # Prints nothing when the exclude list is empty or kubectl is not installed;
-    # kubectl/jq failures are swallowed, so the result is best-effort.
-    [[ -n "${EXCLUDE_NAMESPACES}" ]] || return 0
-    command -v kubectl &>/dev/null || return 0
-
-    kubectl get pv -o json 2>/dev/null | jq -r --arg ns "${EXCLUDE_NAMESPACES}" '
-        ($ns | split(",")) as $excl |
-        .items[] |
-        select(.spec.csi.driver == "csi.proxmox.sinextra.dev") |
-        select(.spec.claimRef.namespace as $n | $excl | index($n)) |
-        .spec.csi.volumeHandle | split("/") | last
-    ' 2>/dev/null || true
-}
-
-discover_pvc_lvs() {
-    # Print the thin LVs in ${VG}/${THINPOOL} that look like PVC volumes
-    # (vm-<id>-pvc-*), leaving out snapshots, pre-restore backups and any LV
-    # reported by build_exclude_lv_list (high-churn namespaces).
-    local candidates excluded
-    candidates=$(lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \
-        | awk -v pool="${THINPOOL}" '$2 == pool { print $1 }' \
-        | grep -E '^vm-[0-9]+-pvc-' \
-        | grep -vE '_snap_|_pre_restore_')
-
-    excluded=$(build_exclude_lv_list)
-
-    # Nothing to exclude: emit the candidate list untouched.
-    if [[ -z "${excluded}" ]]; then
-        echo "${candidates}"
-        return
-    fi
-
-    # Join the excluded names into one alternation and drop matching lines.
-    local pattern
-    pattern=$(paste -sd'|' - <<< "${excluded}")
-    echo "${candidates}" | grep -vE "(${pattern})" || true
-}
-
-list_snapshots() {
-    # Print the names of LVs in ${VG}/${THINPOOL} whose name contains "_snap_".
-    # Always succeeds; prints nothing when there are none.
-    lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \
-        | awk -v pool="${THINPOOL}" '$2 == pool && $1 ~ /_snap_/ { print $1 }' \
-        || true
-}
-
-parse_snap_timestamp() {
-    # Print the epoch seconds encoded in a trailing YYYYMMDD_HHMM suffix of $1.
-    # Prints "0" when the suffix is absent or does not form a valid date.
-    local snap_name="$1"
-    local stamp_re='([0-9]{4})([0-9]{2})([0-9]{2})_([0-9]{2})([0-9]{2})$'
-
-    if [[ ! "${snap_name}" =~ ${stamp_re} ]]; then
-        echo "0"
-        return
-    fi
-
-    local day="${BASH_REMATCH[1]}-${BASH_REMATCH[2]}-${BASH_REMATCH[3]}"
-    local clock="${BASH_REMATCH[4]}:${BASH_REMATCH[5]}"
-    date -d "${day} ${clock}" +%s 2>/dev/null || echo "0"
-}
-
-get_original_lv_from_snap() {
-    # Strip a trailing _snap_YYYYMMDD_HHMM suffix:
-    #   vm-200-pvc-abc_snap_20260403_1200 -> vm-200-pvc-abc
-    # Names without that suffix are printed unchanged.
-    local suffix_re='^(.*)_snap_[0-9]{8}_[0-9]{4}$'
-    if [[ "$1" =~ ${suffix_re} ]]; then
-        echo "${BASH_REMATCH[1]}"
-    else
-        echo "$1"
-    fi
-}
-
-push_metrics() {
-    # Push the outcome of a run to the Pushgateway (best effort).
-    #   $1  status  (0=success, 1=partial failure, 2=aborted)
-    #   $2  snapshots created
-    #   $3  snapshot failures
-    #   $4  snapshots pruned
-    # A failed push only produces a warning; it never aborts the caller.
-    local status="$1" created="$2" failed="$3" pruned="$4"
-    local free_pct payload
-    local number_re='^([0-9]+\.?[0-9]*|\.[0-9]+)$'
-
-    # The free-space probe yields an empty string when lvs or bc fail. A sample
-    # line without a value makes the whole payload unparsable, which would drop
-    # the status gauge too, so the free-space gauge is only added when numeric.
-    free_pct=$(get_thinpool_free_pct 2>/dev/null) || free_pct=""
-
-    payload=$(cat <<METRICS
-# HELP lvm_snapshot_last_run_timestamp Unix timestamp of last snapshot run
-# TYPE lvm_snapshot_last_run_timestamp gauge
-lvm_snapshot_last_run_timestamp $(date +%s)
-# HELP lvm_snapshot_last_status Exit status (0=success, 1=partial failure, 2=aborted)
-# TYPE lvm_snapshot_last_status gauge
-lvm_snapshot_last_status ${status}
-# HELP lvm_snapshot_created_total Number of snapshots created in last run
-# TYPE lvm_snapshot_created_total gauge
-lvm_snapshot_created_total ${created}
-# HELP lvm_snapshot_failed_total Number of snapshot failures in last run
-# TYPE lvm_snapshot_failed_total gauge
-lvm_snapshot_failed_total ${failed}
-# HELP lvm_snapshot_pruned_total Number of snapshots pruned in last run
-# TYPE lvm_snapshot_pruned_total gauge
-lvm_snapshot_pruned_total ${pruned}
-METRICS
-    )
-
-    if [[ "${free_pct}" =~ ${number_re} ]]; then
-        payload+=$'\n'"# HELP lvm_snapshot_thinpool_free_pct Thin pool free percentage"
-        payload+=$'\n'"# TYPE lvm_snapshot_thinpool_free_pct gauge"
-        payload+=$'\n'"lvm_snapshot_thinpool_free_pct ${free_pct}"
-    else
-        warn "Thin pool free percentage unavailable; omitting it from pushed metrics"
-    fi
-
-    printf '%s\n' "${payload}" | curl -sf --connect-timeout 5 --max-time 10 --data-binary @- \
-        "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null || warn "Failed to push metrics to Pushgateway"
-}
-
-# --- Subcommands ---
-
-cmd_snapshot() {
-    # Create one thin snapshot per discovered PVC LV, prune expired snapshots,
-    # then push run metrics. Exits 1 (after pushing status 2) when the thin pool
-    # free space is unknown or below MIN_FREE_PCT, or when no PVC LV is found.
-    log "Starting PVC LVM thin snapshot run"
-
-    # Check thin pool free space. The probe prints nothing usable when lvs or bc
-    # fail; an empty value would turn the arithmetic test below into `(( ))`,
-    # which is false, and the guard would silently let the run continue. Treat
-    # an unreadable value as a reason to abort instead.
-    local free_pct
-    local number_re='^([0-9]+\.?[0-9]*|\.[0-9]+)$'
-    free_pct=$(get_thinpool_free_pct 2>/dev/null) || free_pct=""
-    if [[ ! "${free_pct}" =~ ${number_re} ]]; then
-        warn "Could not determine free space of thin pool ${VG}/${THINPOOL}. Aborting."
-        push_metrics 2 0 0 0
-        exit 1
-    fi
-    log "Thin pool free space: ${free_pct}%"
-    if (( $(echo "${free_pct} < ${MIN_FREE_PCT}" | bc -l) )); then
-        warn "Thin pool has only ${free_pct}% free (minimum: ${MIN_FREE_PCT}%). Aborting."
-        push_metrics 2 0 0 0
-        exit 1
-    fi
-
-    # Discover PVC LVs
-    local lvs_list
-    lvs_list=$(discover_pvc_lvs)
-    if [[ -z "${lvs_list}" ]]; then
-        warn "No PVC LVs found matching pattern"
-        push_metrics 2 0 0 0
-        exit 1
-    fi
-
-    local count=0 failed=0 total
-    total=$(echo "${lvs_list}" | wc -l | tr -d ' ')
-    # One timestamp for the whole run so all snapshots share the same suffix.
-    local snap_ts
-    snap_ts=$(date +"${SNAP_SUFFIX_FORMAT}")
-
-    log "Found ${total} PVC LVs to snapshot"
-
-    # A failing lvcreate is counted and the loop moves on to the next LV.
-    while IFS= read -r lv; do
-        local snap_name="${lv}_snap_${snap_ts}"
-        if lvcreate -s -kn -n "${snap_name}" "${VG}/${lv}" >/dev/null 2>&1; then
-            log "  Created: ${snap_name}"
-            count=$((count + 1))
-        else
-            warn "  Failed to create snapshot for ${lv}"
-            failed=$((failed + 1))
-        fi
-    done <<< "${lvs_list}"
-
-    log "Snapshot run complete: ${count} created, ${failed} failed out of ${total}"
-
-    # Auto-prune
-    log "Running auto-prune..."
-    local pruned
-    pruned=$(cmd_prune_count)
-
-    # Determine status
-    local status=0
-    if (( failed > 0 && count > 0 )); then
-        status=1  # partial
-    elif (( failed > 0 && count == 0 )); then
-        status=2  # all failed
-    fi
-
-    push_metrics "${status}" "${count}" "${failed}" "${pruned}"
-    log "Done"
-}
-
-cmd_list() {
-    # Print a table of snapshot and pre-restore LVs in ${VG}: the LV each one
-    # was taken from, its name, its age (from the timestamp in the name) and the
-    # data% reported by lvs.
-    printf "%-45s %-50s %8s %8s\n" "ORIGINAL LV" "SNAPSHOT" "AGE" "DATA%"
-    printf "%-45s %-50s %8s %8s\n" "-----------" "--------" "---" "-----"
-
-    local now
-    now=$(date +%s)
-
-    local snap_lines
-    snap_lines=$(lvs --noheadings --nosuffix -o lv_name,lv_size,data_percent "${VG}" 2>/dev/null \
-        | grep -E '_snap_|_pre_restore_' || true)
-
-    if [[ -z "${snap_lines}" ]]; then
-        echo "(no snapshots found)"
-        return
-    fi
-
-    # The loop runs in a pipeline subshell where errexit and pipefail are still
-    # active, so it must not contain a command that can fail for an oddly named
-    # LV. The size column is read only to keep the fields aligned.
-    echo "${snap_lines}" | while read -r name _size data_pct; do
-            local original age_str epoch
-            if [[ "${name}" == *"_pre_restore_"* ]]; then
-                original=$(echo "${name}" | sed 's/_pre_restore_[0-9]\{8\}_[0-9]\{4\}$//')
-            else
-                original=$(get_original_lv_from_snap "${name}")
-            fi
-            # parse_snap_timestamp prints 0 when the name has no usable timestamp.
-            epoch=$(parse_snap_timestamp "${name}")
-            if (( epoch > 0 )); then
-                local age_s=$(( now - epoch ))
-                local days=$(( age_s / 86400 ))
-                local hours=$(( (age_s % 86400) / 3600 ))
-                age_str="${days}d${hours}h"
-            else
-                age_str="unknown"
-            fi
-            printf "%-45s %-50s %8s %7s%%\n" "${original}" "${name}" "${age_str}" "${data_pct}"
-        done
-}
-
-cmd_prune() {
-    # Remove expired snapshots and log how many were removed.
-    local removed_count
-    removed_count=$(cmd_prune_count)
-    log "Pruned ${removed_count} expired snapshots"
-}
-
-cmd_prune_count() {
-    # Remove snapshot / pre-restore LVs in ${VG}/${THINPOOL} whose embedded
-    # timestamp is older than RETENTION_DAYS, then print the number removed.
-    # Callers capture stdout (`pruned=$(cmd_prune_count)`), so the count is the
-    # only thing written there; every log/warn line goes to stderr.
-    local removed=0 threshold candidates lv_name created_at
-    threshold=$(( $(date +%s) - RETENTION_DAYS * 86400 ))
-
-    candidates=$(lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \
-        | awk -v pool="${THINPOOL}" '$2 == pool { print $1 }' \
-        | grep -E '_snap_|_pre_restore_' || true)
-
-    if [[ -n "${candidates}" ]]; then
-        while IFS= read -r lv_name; do
-            created_at=$(parse_snap_timestamp "${lv_name}")
-            # Skip names without a parseable timestamp and anything still in retention.
-            (( created_at > 0 && created_at < threshold )) || continue
-            if lvremove -f "${VG}/${lv_name}" >/dev/null 2>&1; then
-                log "  Pruned: ${lv_name}" >&2
-                removed=$((removed + 1))
-            else
-                warn "  Failed to prune: ${lv_name}"
-            fi
-        done <<< "${candidates}"
-    fi
-
-    echo "${removed}"
-}
-
-cmd_restore() {
-    # Interactively restore a PVC from a snapshot by swapping LV names.
-    #   $1  name of the PVC's LV in ${VG}
-    #   $2  name of the snapshot LV to restore from
-    # Steps: locate the PV/PVC and its workload, show a dry-run summary, ask for
-    # confirmation, scale the workload to 0, rename the current LV to a
-    # *_pre_restore_* backup, rename the snapshot to the PVC LV name, and scale
-    # the workload back up. Dies on any validation failure or user abort.
-    local pvc_lv="${1:-}" snapshot_lv="${2:-}"
-
-    if [[ -z "${pvc_lv}" || -z "${snapshot_lv}" ]]; then
-        die "Usage: $0 restore <pvc-lv-name> <snapshot-lv-name>"
-    fi
-
-    # Validate LVs exist
-    if ! lvs "${VG}/${pvc_lv}" >/dev/null 2>&1; then
-        die "PVC LV '${pvc_lv}' not found in VG '${VG}'"
-    fi
-    if ! lvs "${VG}/${snapshot_lv}" >/dev/null 2>&1; then
-        die "Snapshot LV '${snapshot_lv}' not found in VG '${VG}'"
-    fi
-
-    # Discover K8s context
-    log "Discovering Kubernetes context for LV '${pvc_lv}'..."
-
-    # build_exclude_lv_list treats the LV name as the last "/"-separated
-    # component of the CSI volumeHandle, so the same rule is applied here. The
-    # exact "local-lvm:<lv>" form is still accepted as well.
-    local volume_handle="local-lvm:${pvc_lv}"
-    local pv_info
-    pv_info=$(kubectl get pv -o json 2>/dev/null | jq -r \
-        --arg vh "${volume_handle}" \
-        --arg lv "${pvc_lv}" \
-        '.items[]
-         | select((.spec.csi.volumeHandle // "") as $h
-                  | $h == $vh or (($h | split("/") | last) == $lv))
-         | "\(.metadata.name) \(.spec.claimRef.namespace) \(.spec.claimRef.name)"' \
-    ) || die "Failed to query PVs (is kubectl configured?)"
-
-    if [[ -z "${pv_info}" ]]; then
-        die "No PV found whose volumeHandle refers to LV '${pvc_lv}'"
-    fi
-
-    # Only the first matching line is consumed.
-    local pv_name pvc_ns pvc_name
-    read -r pv_name pvc_ns pvc_name <<< "${pv_info}"
-    log "Found: PV=${pv_name}, PVC=${pvc_ns}/${pvc_name}"
-
-    # Find the workload (Deployment or StatefulSet) that uses this PVC
-    local workload_type="" workload_name="" original_replicas=""
-
-    # Check StatefulSets first (databases use these): a StatefulSet that mounts
-    # the PVC directly through a pod-template volume.
-    local sts_info
-    sts_info=$(kubectl get statefulset -n "${pvc_ns}" -o json 2>/dev/null | jq -r \
-        --arg pvc "${pvc_name}" \
-        '.items[] | select(
-            any(.spec.template.spec.volumes[]?; .persistentVolumeClaim.claimName == $pvc)
-        ) | "\(.metadata.name) \(.spec.replicas)"' 2>/dev/null \
-    ) || true
-
-    # Otherwise match the volumeClaimTemplate naming pattern
-    # <template>-<statefulset>-<ordinal> for ordinals below the replica count.
-    if [[ -z "${sts_info}" ]]; then
-        sts_info=$(kubectl get statefulset -n "${pvc_ns}" -o json 2>/dev/null | jq -r \
-            --arg pvc "${pvc_name}" \
-            '.items[] | .metadata.name as $sts | .spec.replicas as $r |
-            select(.spec.volumeClaimTemplates != null) |
-            .spec.volumeClaimTemplates[].metadata.name as $vct |
-            [range($r)] | map("\($vct)-\($sts)-\(.)") |
-            if any(. == $pvc) then "\($sts) \($r)" else empty end' 2>/dev/null \
-        ) || true
-    fi
-
-    if [[ -n "${sts_info}" ]]; then
-        read -r workload_name original_replicas <<< "${sts_info}"
-        workload_type="statefulset"
-    else
-        # Check Deployments
-        local deploy_info
-        deploy_info=$(kubectl get deployment -n "${pvc_ns}" -o json 2>/dev/null | jq -r \
-            --arg pvc "${pvc_name}" \
-            '.items[] | select(
-                .spec.template.spec.volumes // [] | .[].persistentVolumeClaim.claimName == $pvc
-            ) | "\(.metadata.name) \(.spec.replicas)"' 2>/dev/null \
-        ) || true
-
-        if [[ -n "${deploy_info}" ]]; then
-            read -r workload_name original_replicas <<< "${deploy_info}"
-            workload_type="deployment"
-        fi
-    fi
-
-    if [[ -z "${workload_type}" ]]; then
-        warn "Could not auto-discover workload for PVC '${pvc_name}' in namespace '${pvc_ns}'."
-        warn "You may need to scale down the pod manually."
-        echo ""
-        read -rp "Continue with LV swap anyway? (yes/no): " confirm
-        [[ "${confirm}" == "yes" ]] || die "Aborted by user"
-        workload_type="manual"
-    fi
-
-    # Dry-run output
-    local backup_name
-    backup_name="${pvc_lv}_pre_restore_$(date +"${SNAP_SUFFIX_FORMAT}")"
-    echo ""
-    echo "╔══════════════════════════════════════════════════════════════╗"
-    echo "║                    RESTORE DRY-RUN                         ║"
-    echo "╠══════════════════════════════════════════════════════════════╣"
-    echo "║ PVC:       ${pvc_ns}/${pvc_name}"
-    echo "║ PV:        ${pv_name}"
-    if [[ "${workload_type}" != "manual" ]]; then
-        echo "║ Workload:  ${workload_type}/${workload_name} (replicas: ${original_replicas}→0→${original_replicas})"
-    fi
-    echo "║"
-    echo "║ Actions:"
-    if [[ "${workload_type}" != "manual" ]]; then
-        echo "║   1. Scale ${workload_type}/${workload_name} to 0 replicas"
-        echo "║   2. Wait for pod termination"
-    fi
-    echo "║   3. Rename ${pvc_lv} → ${backup_name}"
-    echo "║   4. Rename ${snapshot_lv} → ${pvc_lv}"
-    if [[ "${workload_type}" != "manual" ]]; then
-        echo "║   5. Scale ${workload_type}/${workload_name} back to ${original_replicas} replicas"
-    fi
-    echo "╚══════════════════════════════════════════════════════════════╝"
-    echo ""
-
-    # Interactive confirmation
-    read -rp "Type 'yes' to proceed with restore: " confirm
-    if [[ "${confirm}" != "yes" ]]; then
-        die "Aborted by user"
-    fi
-
-    # Scale down
-    if [[ "${workload_type}" != "manual" ]]; then
-        log "Scaling ${workload_type}/${workload_name} to 0 replicas..."
-        kubectl scale "${workload_type}/${workload_name}" -n "${pvc_ns}" --replicas=0
-
-        # The pod label is not known, so two common label keys are tried in turn.
-        log "Waiting for pod termination (timeout: 120s)..."
-        kubectl wait --for=delete pod -l "app.kubernetes.io/name=${workload_name}" -n "${pvc_ns}" --timeout=120s 2>/dev/null || \
-        kubectl wait --for=delete pod -l "app=${workload_name}" -n "${pvc_ns}" --timeout=120s 2>/dev/null || \
-            warn "Timeout waiting for pods — continuing anyway (LV may still be in use)"
-        sleep 5  # extra grace period for device detach
-    fi
-
-    # Verify LV is not active
-    local lv_active
-    lv_active=$(lvs --noheadings -o lv_active "${VG}/${pvc_lv}" 2>/dev/null | tr -d ' ')
-    if [[ "${lv_active}" == "active" ]]; then
-        warn "LV ${pvc_lv} is still active. Attempting to deactivate..."
-        # Close any LUKS mapper on the LV before deactivation
-        if dmsetup ls 2>/dev/null | grep -q "${pvc_lv}"; then
-            log "Closing LUKS mapper for ${pvc_lv}..."
-            cryptsetup luksClose "${pvc_lv}" 2>/dev/null || true
-        fi
-        lvchange -an "${VG}/${pvc_lv}" 2>/dev/null || warn "Could not deactivate — proceeding with caution"
-    fi
-
-    # LV swap
-    log "Renaming ${pvc_lv} → ${backup_name}"
-    lvrename "${VG}" "${pvc_lv}" "${backup_name}" || die "Failed to rename original LV"
-
-    log "Renaming ${snapshot_lv} → ${pvc_lv}"
-    if ! lvrename "${VG}" "${snapshot_lv}" "${pvc_lv}"; then
-        # The first rename already happened, so no LV carries the PVC's name any
-        # more. Put the original back before giving up.
-        warn "Failed to rename snapshot LV — rolling back ${backup_name} → ${pvc_lv}"
-        lvrename "${VG}" "${backup_name}" "${pvc_lv}" \
-            || warn "Rollback failed: original data is still available as ${backup_name}"
-        if [[ "${workload_type}" != "manual" ]]; then
-            warn "${workload_type}/${workload_name} is still scaled to 0 (was ${original_replicas}) — scale it back manually"
-        fi
-        die "Failed to rename snapshot LV"
-    fi
-
-    # Scale back up
-    if [[ "${workload_type}" != "manual" ]]; then
-        log "Scaling ${workload_type}/${workload_name} back to ${original_replicas} replicas..."
-        kubectl scale "${workload_type}/${workload_name}" -n "${pvc_ns}" --replicas="${original_replicas}"
-
-        log "Waiting for pod to become Ready (timeout: 300s)..."
-        kubectl wait --for=condition=Ready pod -l "app.kubernetes.io/name=${workload_name}" -n "${pvc_ns}" --timeout=300s 2>/dev/null || \
-        kubectl wait --for=condition=Ready pod -l "app=${workload_name}" -n "${pvc_ns}" --timeout=300s 2>/dev/null || \
-            warn "Timeout waiting for pod Ready — check manually"
-    fi
-
-    echo ""
-    log "Restore complete!"
-    log "Old data preserved as: ${backup_name}"
-    log "To delete old data after verification: lvremove -f ${VG}/${backup_name}"
-}
-
-# --- Main ---
-
-# Print the command overview and the environment variables the script reads.
-usage() {
-    cat <<EOF
-Usage: $(basename "$0") <command> [args]
-
-Commands:
-  snapshot              Create thin snapshots of all PVC LVs
-  list                  List existing snapshots with age and data%
-  prune                 Remove snapshots older than ${RETENTION_DAYS} days
-  restore <lv> <snap>   Restore a PVC from a snapshot (interactive)
-
-Environment:
-  LVM_SNAP_PUSHGATEWAY  Pushgateway URL (default: ${PUSHGATEWAY})
-  LVM_SNAP_EXCLUDE_NS   Comma-separated namespaces to skip (default: dbaas,monitoring)
-  KUBECONFIG            Kubeconfig path (default: /root/.kube/config)
-EOF
-}
-
-main() {
-    # Dispatch on the first argument. Every subcommand except `list` and the
-    # help variants first takes a non-blocking exclusive lock on ${LOCKFILE}
-    # (fd 200), so only one such instance runs at a time.
-    local subcommand="${1:-}"
-    shift || true
-
-    case "${subcommand}" in
-        list|""|help|--help|-h)
-            ;;  # no lock needed
-        *)
-            exec 200>"${LOCKFILE}"
-            flock -n 200 || die "Another instance is already running (lockfile: ${LOCKFILE})"
-            ;;
-    esac
-
-    case "${subcommand}" in
-        snapshot)
-            cmd_snapshot
-            ;;
-        list)
-            cmd_list
-            ;;
-        prune)
-            cmd_prune
-            ;;
-        restore)
-            cmd_restore "$@"
-            ;;
-        help|--help|-h|"")
-            usage
-            ;;
-        *)
-            die "Unknown command: ${subcommand}. Run '$0 help' for usage."
-            ;;
-    esac
-}
-
-main "$@"
--- a/scripts/lvm-pvc-snapshot.timer
+++ b/scripts/lvm-pvc-snapshot.timer
@ -1,10 +0,0 @@
-[Unit]
-Description=Daily LVM thin snapshots of Proxmox CSI PVCs
-
-[Timer]
-OnCalendar=*-*-* 03:00:00
-Persistent=true
-RandomizedDelaySec=300
-
-[Install]
-WantedBy=timers.target
--- a/scripts/migrate-state-to-pg
+++ b/scripts/migrate-state-to-pg
@ -1,117 +0,0 @@
-#!/usr/bin/env bash
-# scripts/migrate-state-to-pg — One-shot migration from local SOPS state to PG backend.
-# Prerequisites: vault login -method=oidc, PG terraform_state DB exists, Vault static role created.
-# Usage: scripts/migrate-state-to-pg [--dry-run]
-set -euo pipefail
-
-REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
-SYNC="$REPO_ROOT/scripts/state-sync"
-STACKS_DIR="$REPO_ROOT/stacks"
-STATE_DIR="$REPO_ROOT/state/stacks"
-
-TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"
-# Succeeds when $1 matches one of the space-separated TIER0_STACKS entries
-# as a whole line.
-is_tier0() {
-  tr ' ' '\n' <<< "$TIER0_STACKS" | grep -qx "$1"
-}
-
-DRY_RUN=false
-[ "${1:-}" = "--dry-run" ] && DRY_RUN=true
-
-# Fetch PG credentials from Vault
-echo "==> Fetching PG credentials from Vault..."
-PG_CREDS=$(vault read -format=json database/static-creds/pg-terraform-state) || {
-  echo "ERROR: Cannot read PG credentials. Run: vault login -method=oidc" >&2
-  exit 1
-}
-# `jq -e` exits non-zero on null, so a response without these fields stops the
-# script here instead of producing a connection string containing "null".
-PG_USER=$(echo "$PG_CREDS" | jq -er .data.username) || {
-  echo "ERROR: Vault response has no .data.username" >&2
-  exit 1
-}
-PG_PASS=$(echo "$PG_CREDS" | jq -er .data.password) || {
-  echo "ERROR: Vault response has no .data.password" >&2
-  exit 1
-}
-# Percent-encode both values: they are placed in the userinfo part of a URL,
-# where characters such as '@', ':' or '/' would otherwise change its meaning.
-PG_USER_ENC=$(jq -rn --arg v "$PG_USER" '$v | @uri')
-PG_PASS_ENC=$(jq -rn --arg v "$PG_PASS" '$v | @uri')
-export PG_CONN_STR="postgres://${PG_USER_ENC}:${PG_PASS_ENC}@10.0.20.200:5432/terraform_state?sslmode=disable"
-echo "    PG_CONN_STR set (user: $PG_USER)"
-
-# Enable provider cache
-export TF_PLUGIN_CACHE_DIR="${TF_PLUGIN_CACHE_DIR:-$HOME/.terraform.d/plugin-cache}"
-export TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1
-mkdir -p "$TF_PLUGIN_CACHE_DIR"
-
-migrated=0
-failed=0
-skipped=0
-failed_stacks=""
-
-# Counter bumps written as plain assignments: unlike a bare (( n++ )), an
-# assignment never returns status 1 when the old value is 0, so `set -e` is safe.
-inc_migrated() {
-  migrated=$((migrated + 1))
-}
-inc_failed() {
-  failed=$((failed + 1))
-}
-inc_skipped() {
-  skipped=$((skipped + 1))
-}
-
-# Iterate over all stack directories that have state
-for state_dir in "$STATE_DIR"/*/; do
-  stack="$(basename "$state_dir")"
-
-  # Skip Tier 0
-  if is_tier0 "$stack"; then
-    echo "--- SKIP (Tier 0): $stack"
-    inc_skipped
-    continue
-  fi
-
-  # Skip stacks with no state file
-  if [ ! -f "$state_dir/terraform.tfstate.enc" ] && [ ! -f "$state_dir/terraform.tfstate" ]; then
-    echo "--- SKIP (no state): $stack"
-    inc_skipped
-    continue
-  fi
-
-  # Skip stacks with no corresponding stack directory
-  if [ ! -d "$STACKS_DIR/$stack" ]; then
-    echo "--- SKIP (no stack dir): $stack"
-    inc_skipped
-    continue
-  fi
-
-  echo "==> Migrating: $stack"
-
-  if $DRY_RUN; then
-    echo "    [dry-run] Would migrate $stack"
-    inc_skipped
-    continue
-  fi
-
-  # Decrypt state if needed (call decrypt_state directly — state-sync skips Tier 1)
-  if [ -f "$state_dir/terraform.tfstate.enc" ] && [ ! -f "$state_dir/terraform.tfstate" ]; then
-    sops -d --input-type json --output-type json "$state_dir/terraform.tfstate.enc" > "$state_dir/terraform.tfstate" || {
-      echo "    WARNING: decrypt failed, skipping"
-      inc_skipped
-      continue
-    }
-  fi
-
-  # Migrate state
-  cd "$STACKS_DIR/$stack"
-  if terragrunt init -upgrade -migrate-state -force-copy -input=false 2>&1 | tee /tmp/tg-migrate-$stack.log; then
-    echo "    init OK"
-
-    # Verify — plan should show no changes
-    if terragrunt plan -detailed-exitcode -input=false 2>&1 | tail -5 | grep -q "No changes"; then
-      echo "    plan OK — no drift"
-      inc_migrated
-    else
-      echo "    WARNING: plan shows changes (may be normal drift, not migration issue)"
-      inc_migrated
-    fi
-  else
-    echo "    FAILED: init error (see /tmp/tg-migrate-$stack.log)"
-    inc_failed
-    failed_stacks="$failed_stacks $stack"
-  fi
-done
-
-# Final report. The exit status reflects the outcome so that a wrapper or CI
-# job can tell a run with failed stacks from a clean one.
-echo ""
-echo "========================================"
-echo "Migration complete"
-echo "  Migrated: $migrated"
-echo "  Failed:   $failed"
-echo "  Skipped:  $skipped"
-if [ -n "$failed_stacks" ]; then
-  echo "  Failed stacks:$failed_stacks"
-fi
-echo "========================================"
-if [ "$failed" -gt 0 ]; then
-  exit 1
-fi
--- a/scripts/migrate_service_state.sh
+++ b/scripts/migrate_service_state.sh
@ -1,112 +0,0 @@
-#!/bin/bash
-# Phase 3: Migrate all service module state from root to individual stacks
-# Each module in root state is at: module.kubernetes_cluster.module.<name>["<name>"]
-# Target: state/stacks/<name>/terraform.tfstate as module.<name>
-
-set -euo pipefail
-
-ROOT_STATE="$(pwd)/terraform.tfstate"
-STATE_DIR="$(pwd)/state/stacks"
-
-# Service modules currently in root state (61 listed; the loop derives the count from this array)
-MODULES=(
-  actualbudget
-  affine
-  blog
-  changedetection
-  city-guesser
-  coturn
-  cyberchef
-  dashy
-  dawarich
-  descheduler
-  diun
-ebook2audiobook
-  echo
-  excalidraw
-  f1-stream
-  forgejo
-  freedify
-  freshrss
-  frigate
-  hackmd
-  health
-  homepage
-  immich
-  isponsorblocktv
-  jsoncrack
-  kms
-  linkwarden
-  matrix
-  meshcentral
-  n8n
-  navidrome
-  netbox
-  networking-toolbox
-  nextcloud
-  ntfy
-  ollama
-  onlyoffice
-  openclaw
-  osm_routing
-  owntracks
-  paperless-ngx
-  plotting-book
-  privatebin
-  real-estate-crawler
-  reloader
-  resume
-  rybbit
-  send
-  servarr
-  shadowsocks
-  speedtest
-  stirling-pdf
-  tandoor
-  tor-proxy
-  travel_blog
-  tuya-bridge
-  url
-  wealthfolio
-  webhook_handler
-  whisper
-  ytdlp
-)
-
-TOTAL=${#MODULES[@]}
-SUCCESS=0
-FAIL=0
-
-# Without the root state file every `terraform state mv` below would fail one
-# by one; stop early with a single clear message instead.
-if [ ! -f "$ROOT_STATE" ]; then
-  echo "ERROR: root state not found at $ROOT_STATE (run from the repository root)" >&2
-  exit 1
-fi
-
-echo "=== Phase 3: Service State Migration ==="
-echo "Migrating $TOTAL modules from root state to individual stacks"
-echo ""
-
-for mod in "${MODULES[@]}"; do
-  idx=$((SUCCESS + FAIL + 1))
-  echo "[$idx/$TOTAL] Migrating: $mod"
-
-  # Create state directory
-  mkdir -p "$STATE_DIR/$mod"
-
-  # Source address (with for_each key)
-  SRC="module.kubernetes_cluster.module.${mod}[\"${mod}\"]"
-  DST="module.${mod}"
-  DST_STATE="$STATE_DIR/$mod/terraform.tfstate"
-
-  # A failed move is counted and the loop continues with the next module.
-  if terraform state mv \
-    -state="$ROOT_STATE" \
-    -state-out="$DST_STATE" \
-    "$SRC" "$DST" 2>&1; then
-    echo "  ✓ $mod migrated successfully"
-    SUCCESS=$((SUCCESS + 1))
-  else
-    echo "  ✗ $mod FAILED"
-    FAIL=$((FAIL + 1))
-  fi
-  echo ""
-done
-
-echo "=== Migration Summary ==="
-echo "Total:   $TOTAL"
-echo "Success: $SUCCESS"
-echo "Failed:  $FAIL"
-
-# Report failures through the exit status as well as the summary.
-if [ "$FAIL" -gt 0 ]; then
-  exit 1
-fi
--- a/scripts/nfs-change-tracker.service
+++ b/scripts/nfs-change-tracker.service
@ -1,19 +0,0 @@
-[Unit]
-Description=Track NFS filesystem changes for incremental offsite backup
-After=local-fs.target
-
-[Service]
-Type=simple
-ExecStart=/usr/bin/inotifywait -m -r \
-    --format '%%w%%f' \
-    -e create -e modify -e moved_to -e delete \
-    --exclude '(/\..*swp$|/\.nfs|/\.Trash|\.db-shm$|\.db-wal$|\.db-journal$|/stats/.*\.stat$|^/srv/nfs/anca-elements/)' \
-    /srv/nfs \
-    /srv/nfs-ssd
-StandardOutput=append:/mnt/backup/.nfs-changes.log
-StandardError=journal
-Restart=always
-RestartSec=10
-
-[Install]
-WantedBy=multi-user.target
--- a/scripts/nfs-mirror.service
+++ b/scripts/nfs-mirror.service
@ -1,15 +0,0 @@
-[Unit]
-Description=Mirror /srv/nfs (selective) to /mnt/backup (local 2nd copy of critical NFS)
-After=network-online.target local-fs.target
-Wants=network-online.target
-
-[Service]
-Type=oneshot
-ExecStart=/usr/local/bin/nfs-mirror
-StandardOutput=journal
-StandardError=journal
-SyslogIdentifier=nfs-mirror
-# Heavy sustained IO — don't compete with foreground services.
-Nice=10
-IOSchedulingClass=idle
-TimeoutStartSec=18000
--- a/scripts/nfs-mirror.sh
+++ b/scripts/nfs-mirror.sh
@ -1,179 +0,0 @@
-#!/usr/bin/env bash
-# nfs-mirror — local 2nd copy of /srv/nfs (selective) → /mnt/backup
-#
-# Deploy to PVE host at /usr/local/bin/nfs-mirror.
-# Schedule: daily 02:00 via nfs-mirror.timer (was weekly Mon 04:00 until 2026-05-26).
-#
-# ROLE in the 3-2-1 strategy:
-#   Copy 1 (sdc):       /srv/nfs/* (live PVE NFS)
-#   Copy 2 (sda, this): /mnt/backup/<svc>/  ← this script
-#   Copy 3 (Synology):  /Backup/Viki/nfs/  (via offsite-sync-backup + inotify)
-#
-# Replaces the dedicated anca-elements-mirror script; same disk, same
-# destination layout (anca-elements lives at /mnt/backup/anca-elements/),
-# but now covers every other critical NFS subtree in one pass.
-#
-# SKIP-LIST rationale (2026-05-26 simplification; REGENERABLE-SERVICE
-# CARVE-OUT added 2026-06-01 — see below):
-#   immich  — 1.5T, doesn't fit on sda; offsite-sync ships it direct to Synology
-#   frigate — camera ring buffer; intentionally NOT backed up anywhere
-#   temp    — scratch; intentionally NOT backed up
-#
-# 2026-06-01 carve-out: the offsite Synology (5.3T) hit 97% and the
-# `Backup` share had grown +670G in a week — traced to the 2026-05-26
-# change that started mirroring large *regenerable* services to sda and
-# thence to Synology pve-backup/. These are now re-excluded because they
-# cost offsite capacity for data we can rebuild on demand:
-#   ollama          (20G) — LLM model blobs, re-pullable
-#   prometheus-backup (64G) — metrics TSDB snapshots; was offsite-excluded
-#                             pre-2026-05-26 by original intent
-#   audiblez        (24G) — generated audiobooks, re-derivable from ebooks
-#   ebook2audiobook (11G) — same, generation output
-# Their live copy stays on sdc (/srv/nfs); only the sda + Synology copies
-# are dropped. `*-backup` DB dumps (sqlite-backup et al.) are intentionally
-# KEPT — they are real database safety copies, not regenerable.
-#
-# Note: /srv/nfs-ssd is intentionally NOT mirrored — its dirs (immich,
-# ollama, llamacpp) go direct to Synology nfs-ssd/ via offsite-sync
-# Step 2, which (also 2026-06-01) was narrowed to immich-only so ollama
-# + llamacpp on the SSD stop reaching Synology too.
-
-set -euo pipefail
-
-SRC=/srv/nfs/
-DST=/mnt/backup/
-LOG=/var/log/nfs-mirror.log
-LOCKFILE=/run/nfs-mirror.lock
-# Manifest of files changed under /mnt/backup since the last offsite-sync.
-# offsite-sync-backup Step 1 reads this and rsyncs the listed files to Synology
-# pve-backup/ on its next daily run. Without populating it, nfs-mirror's writes
-# would only reach Synology via the monthly full sync (1st-7th of month), and
-# the monthly --delete pass would also wipe any pre-positioned data.
-MANIFEST=/mnt/backup/.changed-files
-PUSHGATEWAY="${NFS_MIRROR_PUSHGATEWAY:-http://10.0.20.100:30091}"
-PUSHGATEWAY_JOB=nfs-mirror
-
-# rsync filter rules for the mirror. A pattern starting with "/" is anchored
-# to the transfer root (top level of $SRC / $DST); a pattern without a leading
-# "/" matches at any depth.
-EXCLUDES=(
-    # ---- /mnt/backup subtrees owned by daily-backup — leave alone ----
-    --exclude='/pvc-data/'
-    --exclude='/sqlite-backup/'
-    --exclude='/pfsense/'
-    --exclude='/pve-config/'
-    --exclude='/lost+found/'
-
-    # ---- state files used by other backup jobs ----
-    --exclude='/.changed-files'
-    --exclude='/.last-offsite-sync'
-    --exclude='/.lv-pvc-mapping.json'
-    --exclude='/.nfs-changes.log'
-
-    # ---- anca-elements: now in Immich (canonical), /mnt/backup copy deleted
-    # 2026-05-26. Kept in excludes so nfs-mirror doesn't re-populate from sdc
-    # if /srv/nfs/anca-elements is ever re-attached.
-    --exclude='/anca-elements/'
-
-    # ---- NFS paths intentionally NOT backed up ----
-    --exclude='/immich/'   # 1.5T — ships sdc → Synology direct (Step 2)
-    --exclude='/frigate/'  # ring buffer — no backup anywhere
-    --exclude='/temp/'     # scratch — no backup anywhere
-
-    # ---- regenerable services: live-only on sdc, no offsite (2026-06-01) ----
-    # See header carve-out. --delete reaps any existing copies from sda on
-    # the next run; a one-off direct delete already cleared them from Synology.
-    --exclude='/ollama/'           # LLM models — re-pullable
-    --exclude='/prometheus-backup/' # metrics TSDB snapshots
-    --exclude='/audiblez/'         # generated audiobooks
-    --exclude='/ebook2audiobook/'  # generated audiobooks
-
-    # ---- Synology / Windows / macOS cruft ----
-    # Deliberately NOT anchored: these appear inside arbitrary subdirectories,
-    # and an anchored pattern would only skip the copy at the top level.
-    # Excluded paths are also protected from --delete, so nested copies that an
-    # earlier run already mirrored stay on the destination until removed by hand.
-    --exclude='@eaDir/'
-    --exclude='*@synoeastream'
-    --exclude='.DS_Store'
-    --exclude='Thumbs.db'
-)
-
-# log:  print a UTC-timestamped line on stdout and append the same line to $LOG.
-log() {
-    local stamp
-    stamp=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
-    printf '[%s] %s\n' "${stamp}" "$*" | tee -a "$LOG"
-}
-
-# warn: same as log, with a "WARN: " prefix.
-warn() {
-    log "WARN: $*"
-}
-
-# Locked manifest append (shared with daily-backup) — see daily-backup.sh
-# for the rationale. flock prevents interleaved appends when nfs-mirror
-# (Mon 04:11) overruns into daily-backup (Mon 05:00).
-# NOTE(review): the times quoted above look like they predate the daily 02:00
-# schedule in nfs-mirror.timer — confirm and update.
-#
-# manifest_append: append everything read from stdin to ${MANIFEST} while
-# holding an exclusive flock on ${MANIFEST_LOCK}.
-MANIFEST_LOCK="${MANIFEST}.lock"
-manifest_append() {
-    # fd 200 is opened on the lock file for the lifetime of the subshell only,
-    # so the lock is released as soon as the append has finished.
-    (
-        flock -x 200
-        cat >> "${MANIFEST}"
-    ) 200>"${MANIFEST_LOCK}"
-}
-
-# Push run metrics to the Pushgateway (best effort; failures are ignored).
-#   $1  status value for nfs_mirror_last_status (default 0)
-#   $2  byte count for nfs_mirror_bytes (default 0)
-push_metrics() {
-    local status="${1:-0}" bytes="${2:-0}"
-    local endpoint="${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}"
-    printf '%s\n' \
-        "nfs_mirror_last_run_timestamp $(date +%s)" \
-        "nfs_mirror_last_status ${status}" \
-        "nfs_mirror_bytes ${bytes}" \
-        | curl -s --connect-timeout 5 --max-time 10 --data-binary @- "${endpoint}" 2>/dev/null || true
-}
-
-KILLED=""
-STAMP=""
-LOCK_HELD=""
-# EXIT handler: release what this process owns and report an aborted run.
-# The lockfile is only removed when this process created it. The handler is
-# armed before the lock is taken, so it also runs when the lock could NOT be
-# acquired — and must then leave the running instance's lockfile alone.
-cleanup() {
-    if [ -n "$LOCK_HELD" ]; then
-        rm -f "$LOCKFILE"
-    fi
-    if [ -n "$STAMP" ]; then
-        rm -f "$STAMP"
-    fi
-    if [ -n "$KILLED" ]; then
-        push_metrics 2 0  # status=2 = aborted
-    fi
-}
-trap cleanup EXIT
-trap 'KILLED=1; exit 143' TERM INT
-
-# noclobber makes the redirection fail when the lockfile already exists, so
-# creating the file doubles as the atomic "take the lock" step.
-if ! ( set -o noclobber; echo $$ > "$LOCKFILE" ) 2>/dev/null; then
-    log "FATAL: another instance running (pid $(cat "$LOCKFILE" 2>/dev/null || echo unknown))"
-    exit 1
-fi
-LOCK_HELD=1
-
-mountpoint -q /mnt/backup || { log "FATAL: /mnt/backup not mounted"; push_metrics 1 0; exit 1; }
-[ -d "$SRC" ]              || { log "FATAL: source $SRC missing"; push_metrics 1 0; exit 1; }
-
-log "=== mirror starting: $SRC → $DST ==="
-log "skip: immich (Synology direct), frigate/temp (no backup), anca-elements, ollama/prometheus-backup/audiblez/ebook2audiobook (regenerable, live-only)"
-
-# Marker file used to identify files written by this rsync run, so we can append
-# their paths to the offsite-sync manifest. Touch BEFORE rsync; `find -cnewer` AFTER.
-STAMP=$(mktemp)
-
-# PIPESTATUS[0] is rsync's own exit status; the pipeline status alone could
-# also reflect a tee failure.
-RSYNC_RC=0
-rsync \
-    -rlt --delete -H \
-    --no-perms --no-owner --no-group \
-    --info=stats2 \
-    "${EXCLUDES[@]}" \
-    "$SRC" "$DST" 2>&1 | tee -a "$LOG" || RSYNC_RC=${PIPESTATUS[0]}
-
-DST_BYTES=$(df -B1 --output=used /mnt/backup | tail -1)
-
-# Capture files that rsync created/modified and feed them to the offsite-sync
-# manifest so daily Step 1 incremental picks them up tomorrow morning.
-# Use -cnewer (ctime), not -newer (mtime): rsync -t preserves SOURCE mtime
-# on the dest, so freshly-written files with old source mtime look "older"
-# than $STAMP and -newer misses them. ctime is set when the inode is written,
-# regardless of -t, so it correctly identifies what this run created.
-# (Bug hit 2026-05-26 full bypass-list mirror: 800k files copied, manifest
-# captured only 2 entries → forced a .force-full-sync to recover.)
-#
-# This runs whether or not rsync succeeded. A partially failed run has still
-# written files, and a later run takes a new $STAMP and no longer sees them as
-# new, so skipping the manifest here would leave them to the monthly full sync.
-# `|| true` keeps the count that wc printed even if find reported an error.
-NEW_COUNT=$(find /mnt/backup -cnewer "$STAMP" -type f \
-    ! -path '/mnt/backup/.changed-files' \
-    ! -path '/mnt/backup/.changed-files.lock' \
-    ! -path '/mnt/backup/.lv-pvc-mapping.json' \
-    ! -path '/mnt/backup/.nfs-changes.log' \
-    ! -path '/mnt/backup/.last-offsite-sync' \
-    ! -path '/mnt/backup/.force-full-sync' \
-    -printf '%P\n' 2>/dev/null | tee >(manifest_append) | wc -l) || true
-
-if [ "$RSYNC_RC" -eq 0 ]; then
-    log "=== mirror complete; ${NEW_COUNT} files added to offsite manifest ==="
-    log "/mnt/backup used: $(df -h --output=used /mnt/backup | tail -1 | tr -d ' ')"
-    push_metrics 0 "$DST_BYTES"
-else
-    log "partial run: ${NEW_COUNT} files added to offsite manifest"
-    log "=== mirror failed: rsync exited $RSYNC_RC ==="
-    push_metrics 1 "$DST_BYTES"
-    exit "$RSYNC_RC"
-fi
--- a/scripts/nfs-mirror.timer
+++ b/scripts/nfs-mirror.timer
@ -1,16 +0,0 @@
-[Unit]
-Description=Daily local NFS mirror to /mnt/backup
-
-[Timer]
-# Daily 02:00 — runs 3h before daily-backup (05:00) so the .changed-files
-# manifest is populated and offsite-sync (06:00) ships both legs' deltas.
-# Switched from weekly Mon 04:00 → daily 2026-05-26: steady-state delta is
-# 10-20 min of mostly-metadata rsync, so the IO cost is negligible and it
-# cuts non-CronJob app-data RPO from 7d to ~24h (matters for nextcloud
-# shared files, audiobookshelf library, mailserver Maildir, etc.).
-OnCalendar=*-*-* 02:00:00
-Persistent=true
-RandomizedDelaySec=15min
-
-[Install]
-WantedBy=timers.target
--- a/scripts/node_registry_manager.sh
+++ b/scripts/node_registry_manager.sh
@ -1,97 +0,0 @@
-#!/bin/bash
-
-# Simple and reliable containerd registry mirror manager
-# Usage: ./node_registry_manager.sh [--add|--remove] [mirror_url]
-# Docs - https://github.com/containerd/containerd/blob/main/docs/cri/registry.md
-# To apply on all nodes (tail +3 skips master node):
-# for node in $(kubectl get nodes -o wide | awk '{print $6}' | tail -n +3); do cat node_registry_manager.sh | s wizard@$node "sudo bash -s -- --add http://10.0.20.10:5000"; done
-# for node in $(kubectl get nodes -o wide | awk '{print $6}' | tail -n +3); do cat node_registry_manager.sh | s wizard@$node "sudo bash -s -- --remove http://10.0.20.10:5000"; done
-
-set -euo pipefail
-CONFIG_FILE="/etc/containerd/config.toml"
-BACKUP_FILE="/etc/containerd/config.toml.bak"
-
-# Validate environment
-[ -f "$CONFIG_FILE" ] || { echo "Error: $CONFIG_FILE not found" >&2; exit 1; }
-[ "$(id -u)" -eq 0 ] || { echo "Error: Requires root privileges" >&2; exit 1; }
-
-# Register a docker.io mirror endpoint in $CONFIG_FILE.
-# Arguments: $1 - mirror URL (e.g. http://10.0.20.10:5000)
-# Globals:   CONFIG_FILE (rewritten in place), BACKUP_FILE (overwritten)
-# Returns:   0 when the mirror was added or was already present.
-add_mirror() {
-    local mirror_url="$1"
-    # TOML table header for the docker.io mirror: once as literal text, once
-    # as an anchored basic regex. Kept in single quotes so the embedded double
-    # quotes survive — the previous inline patterns closed the shell string at
-    # each inner quote, so sed/grep saw a pattern without quotes that could
-    # never match the real header.
-    local section='[plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]'
-    local section_re='^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\."docker\.io"\]'
-    local url_re
-
-    # Escape regex metacharacters so the URL is matched literally.
-    url_re=$(printf '%s' "$mirror_url" | sed 's/[][\.*^$]/\\&/g')
-
-    # Create backup
-    cp -p "$CONFIG_FILE" "$BACKUP_FILE"
-
-    # Check if mirror already exists
-    if grep -q "endpoint = \[.*\"${url_re}\".*\]" "$CONFIG_FILE"; then
-        echo "Mirror already exists: $mirror_url"
-        return 0
-    fi
-
-    if grep -q "${section_re}" "$CONFIG_FILE"; then
-        # Append an endpoint line right after the existing section header.
-        # NOTE(review): if the section already carries an endpoint line this
-        # adds a second one — confirm containerd accepts that, or merge the
-        # URL into the existing array instead.
-        sed -i "/${section_re}/a \  endpoint = [\"$mirror_url\"]" "$CONFIG_FILE"
-    else
-        # No docker.io section yet: append a complete one at the end of the
-        # file. (The old "insert after registry.mirrors" branch was
-        # unreachable because of the quoting bug, so this is what every run
-        # effectively did.)
-        printf '\n%s\n  endpoint = ["%s"]\n' "$section" "$mirror_url" >> "$CONFIG_FILE"
-    fi
-
-    echo "Added mirror: $mirror_url"
-}
-
-# Remove a docker.io mirror endpoint from $CONFIG_FILE.
-# Arguments: $1 - mirror URL to remove
-# Globals:   CONFIG_FILE (rewritten in place), BACKUP_FILE (overwritten)
-remove_mirror() {
-    local mirror_url="$1"
-    local url_re
-
-    # Escape regex metacharacters AND the "/" sed delimiter. Interpolating the
-    # raw URL made sed treat the first "/" of "http://" as the end of the
-    # address, so every real URL aborted the script under `set -e`.
-    url_re=$(printf '%s' "$mirror_url" | sed 's/[][\.*^$/]/\\&/g')
-
-    # Create backup
-    cp -p "$CONFIG_FILE" "$BACKUP_FILE"
-
-    # Remove the specific mirror URL
-    sed -i "/endpoint = \[.*\"${url_re}\".*\]/d" "$CONFIG_FILE"
-
-    # Drop the docker.io mirror section.
-    # NOTE(review): the first sed deletes every line between the section
-    # header and the next "[" header, whether or not other endpoint lines
-    # remain; the second then deletes the header itself. Confirm that wiping
-    # the whole section is intended when more than one mirror is configured.
-    sed -i '/^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\."docker.io"\]$/,/^\[/{//!d}' "$CONFIG_FILE"
-    sed -i '/^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\."docker.io"\]$/d' "$CONFIG_FILE"
-
-    # Clean up multiple empty lines
-    sed -i '/^$/N;/^\n$/D' "$CONFIG_FILE"
-
-    echo "Removed mirror: $mirror_url"
-}
-
-# Restart the containerd systemd unit and report the outcome.
-# Returns: 0 when systemctl succeeded, 1 otherwise.
-restart_containerd() {
-    echo "Restarting containerd..."
-    if ! systemctl restart containerd; then
-        echo "Error: Failed to restart containerd" >&2
-        return 1
-    fi
-    echo "Successfully restarted containerd"
-    return 0
-}
-
-# Entry point: dispatch on the first argument.
-# The positional parameters are read through ${N:-} because the script runs
-# under `set -u`: a bare "$1"/"$2" aborted with "unbound variable" before the
-# usage text or the "Mirror URL required" message could be printed.
-ACTION="${1:-}"
-MIRROR_URL="${2:-}"
-
-case "$ACTION" in
-    --add)
-        [ -n "$MIRROR_URL" ] || { echo "Error: Mirror URL required" >&2; exit 1; }
-        add_mirror "$MIRROR_URL"
-        restart_containerd || exit 1
-        ;;
-    --remove)
-        [ -n "$MIRROR_URL" ] || { echo "Error: Mirror URL required" >&2; exit 1; }
-        remove_mirror "$MIRROR_URL"
-        restart_containerd || exit 1
-        ;;
-    *)
-        echo "Usage: $0 [--add|--remove] [mirror_url]" >&2
-        echo "Examples:" >&2
-        echo "  Add mirror:    $0 --add https://registry.example.com" >&2
-        echo "  Remove mirror: $0 --remove https://registry.example.com" >&2
-        exit 1
-        ;;
-esac
-
-exit 0
--- a/scripts/offsite-sync-backup.service
+++ b/scripts/offsite-sync-backup.service
@ -1,11 +0,0 @@
-[Unit]
-Description=Daily offsite sync: sda + NFS changes to Synology
-After=network-online.target daily-backup.service
-
-[Service]
-Type=oneshot
-ExecStart=/usr/local/bin/offsite-sync-backup
-StandardOutput=journal
-StandardError=journal
-SyslogIdentifier=offsite-sync-backup
-TimeoutStartSec=7200
--- a/scripts/offsite-sync-backup.sh
+++ b/scripts/offsite-sync-backup.sh
@ -1,187 +0,0 @@
-#!/usr/bin/env bash
-# offsite-sync-backup — Sync backups to Synology NAS
-# Deploy to PVE host at /usr/local/bin/offsite-sync-backup
-# Schedule: Daily 06:00 via systemd timer (After=daily-backup.service)
-#
-# Two sync paths:
-#   Step 1: sda (/mnt/backup) → Synology pve-backup/ (PVC snapshots, pfsense, pve-config, sqlite)
-#   Step 2: NFS (/srv/nfs, /srv/nfs-ssd) → Synology nfs/, nfs-ssd/ (inotify change-tracked)
-set -euo pipefail
-
-# --- Configuration ---
-BACKUP_ROOT="/mnt/backup"
-SYNOLOGY="Administrator@192.168.1.13"
-PVE_BACKUP_DEST="${SYNOLOGY}:/volume1/Backup/Viki/pve-backup"
-NFS_DEST="${SYNOLOGY}:/volume1/Backup/Viki/nfs"
-NFS_SSD_DEST="${SYNOLOGY}:/volume1/Backup/Viki/nfs-ssd"
-MANIFEST="${BACKUP_ROOT}/.changed-files"
-NFS_CHANGE_LOG="${BACKUP_ROOT}/.nfs-changes.log"
-PUSHGATEWAY="${OFFSITE_SYNC_PUSHGATEWAY:-http://10.0.20.100:30091}"
-PUSHGATEWAY_JOB="offsite-backup-sync"
-LOCKFILE="/run/offsite-sync-backup.lock"
-
-# --- Logging ---
-# log: timestamped line on stdout. warn: same, prefixed WARN:, on stderr.
-log()  { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; }
-warn() { log "WARN: $*" >&2; }
-
-# --- Locking ---
-# The lockfile is created with noclobber, so the redirect fails if the file
-# already exists. The EXIT trap is installed only AFTER the lock is ours:
-# when it was installed first, an instance that lost the race removed the
-# running instance's lockfile on its way out, defeating the lock.
-if ! ( set -o noclobber; echo $$ > "${LOCKFILE}" ) 2>/dev/null; then
-    log "FATAL: Another instance running"; exit 1
-fi
-cleanup() { rm -f "${LOCKFILE}"; }
-trap cleanup EXIT
-
-# --- Main ---
-log "=== Offsite sync starting ==="
-# STATUS stays 0 until any sync leg fails; it becomes the exit code.
-STATUS=0
-
-if ! mountpoint -q "${BACKUP_ROOT}"; then
-    log "FATAL: ${BACKUP_ROOT} is not mounted"; exit 1
-fi
-
-# Preflight: the Synology must accept a non-interactive SSH login. On failure
-# the success timestamp is pushed as 0 (best-effort) before giving up.
-if ! timeout 10 ssh -o BatchMode=yes -o ConnectTimeout=5 "${SYNOLOGY}" true 2>/dev/null; then
-    log "FATAL: Cannot SSH to Synology"
-    echo "backup_last_success_timestamp 0" | \
-        curl -s --connect-timeout 5 --max-time 10 --data-binary @- \
-        "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null || true
-    exit 1
-fi
-
-DAY_OF_MONTH=$(date +%d)
-
-# ============================================================
-# STEP 1: sda → Synology pve-backup/ (PVC snapshots, pfsense, pve-config)
-# ============================================================
-log "--- Step 1: sda → Synology pve-backup/ ---"
-
-# Trigger: monthly cleanup window OR daily-backup signalled the manifest grew
-# past its cap (Synology was unreachable too long for incremental to keep up).
-# NOTE(review): the window test is "day of month <= 7" with no weekday check,
-# so on a daily schedule the full sync runs on each of the first seven days,
-# not only on the "1st Sunday" the log line mentions — confirm the intent.
-FORCE_FULL_FLAG="${BACKUP_ROOT}/.force-full-sync"
-FORCE_FULL=""
-if [ -f "${FORCE_FULL_FLAG}" ]; then
-    FORCE_FULL=1
-fi
-if [ "${DAY_OF_MONTH}" -le 7 ] || [ -n "${FORCE_FULL}" ]; then
-    if [ -n "${FORCE_FULL}" ]; then
-        log "Forced full sync (manifest size cap tripped)..."
-    else
-        log "Monthly full sync (1st Sunday)..."
-    fi
-    # No -z on LAN: gigabit hop to 192.168.1.13 doesn't benefit from compression
-    # and burns CPU on the PVE host that's already busy with cluster IO.
-    if rsync -rlt --delete --chmod=Du=rwx,Dgo=rx,Fu=rw,Fog=r \
-        --exclude='.changed-files' \
-        --exclude='.changed-files.lock' \
-        --exclude='.last-offsite-sync' \
-        --exclude='.lv-pvc-mapping.json' \
-        --exclude='.nfs-changes.log' \
-        --exclude='.force-full-sync' \
-        --exclude='/anca-elements/' \
-        "${BACKUP_ROOT}/" "${PVE_BACKUP_DEST}/" 2>&1; then
-        # Clear the force flag only once the full sync really went through;
-        # removing it after a failed rsync silently dropped the request.
-        rm -f "${FORCE_FULL_FLAG}"
-    else
-        STATUS=1
-    fi
-elif [ -s "${MANIFEST}" ]; then
-    MANIFEST_LINES=$(wc -l < "${MANIFEST}")
-    log "Incremental sync (${MANIFEST_LINES} files from manifest)..."
-    # anca-elements: now in Immich (canonical); /mnt/backup copy deleted
-    # 2026-05-26. Exclude retained as a safety belt in case it re-appears.
-    rsync -rlt --chmod=Du=rwx,Dgo=rx,Fu=rw,Fog=r --files-from="${MANIFEST}" \
-        --exclude='anca-elements/' \
-        "${BACKUP_ROOT}/" "${PVE_BACKUP_DEST}/" 2>&1 || STATUS=1
-else
-    log "No changed files in manifest, nothing to sync"
-fi
-
-# ============================================================
-# STEP 2: NFS → Synology nfs/ + nfs-ssd/ (inotify change-tracked, FILTERED)
-# ============================================================
-#
-# DESIGN: Step 2 only carries paths that BYPASS the sda mirror. As of
-# 2026-05-26 that's just /srv/nfs/immich/ (1.5T, doesn't fit on sda).
-# Everything else under /srv/nfs/ now flows through sda via nfs-mirror,
-# reaching Synology via Step 1 (sda → pve-backup/). frigate and temp are
-# excluded from both legs — intentionally NOT backed up.
-#
-# nfs-ssd: as of 2026-06-01 this leg is ALSO immich-only. ollama (59G) and
-# llamacpp (26G) on the SSD were filling the offsite Synology (5.3T hit 97%)
-# for re-pullable model blobs, so they're dropped — live copy stays on the
-# SSD, no offsite. The monthly --delete pass below reaps them from Synology
-# nfs-ssd/; a one-off direct delete cleared the bulk on 2026-06-01.
-#
-# Keep this aligned with /usr/local/bin/nfs-mirror's EXCLUDES. Both legs now
-# carry immich only; everything else is either curated through sda (Step 1)
-# or intentionally live-only (frigate, temp, ollama, llamacpp, audiblez,
-# ebook2audiobook, prometheus-backup).
-log "--- Step 2: NFS → Synology (immich-only on both nfs/ and nfs-ssd/) ---"
-
-# Regex matching paths NOT on sda (must reach Synology directly).
-NFS_SDA_BYPASS_RE='^/srv/nfs/immich/'
-
-# rsync include/exclude args for the monthly full sync (HDD).
-NFS_FULL_INCLUDES=(
-    --include='/immich/'  --include='/immich/***'
-    --exclude='*'
-)
-
-# Drops to 0 as soon as any Step 2 rsync fails. The change log is truncated
-# only while this is still 1, so events belonging to a failed transfer are
-# retried on the next run instead of being discarded.
-NFS_STEP_OK=1
-
-if [ "${DAY_OF_MONTH}" -le 7 ]; then
-    # Monthly: full sync with --delete for cleanup, restricted to bypass-list.
-    # --delete here will reap legacy dirs on Synology (frigate, ollama,
-    # audiblez, ebook2audiobook, *-backup, prometheus, loki, temp,
-    # alertmanager) since they're no longer in NFS_FULL_INCLUDES.
-    log "Monthly full NFS sync (immich-only — reaps legacy bypass dirs)..."
-    if rsync -rlt --delete "${NFS_FULL_INCLUDES[@]}" /srv/nfs/ "${NFS_DEST}/" 2>&1; then
-        log "  OK: nfs/ full sync (immich-only)"
-    else
-        warn "nfs/ full sync failed"; STATUS=1; NFS_STEP_OK=0
-    fi
-    # nfs-ssd: immich-only (2026-06-01) — --delete reaps legacy ollama/llamacpp.
-    if rsync -rlt --delete "${NFS_FULL_INCLUDES[@]}" /srv/nfs-ssd/ "${NFS_SSD_DEST}/" 2>&1; then
-        log "  OK: nfs-ssd/ full sync (immich-only)"
-    else
-        warn "nfs-ssd/ full sync failed"; STATUS=1; NFS_STEP_OK=0
-    fi
-    if [ "${NFS_STEP_OK}" -eq 1 ]; then
-        > "${NFS_CHANGE_LOG}"
-    else
-        warn "NFS change log preserved for retry"
-    fi
-elif [ -s "${NFS_CHANGE_LOG}" ]; then
-    # Incremental: only sync changed files matching the bypass leg (immich).
-    # Work files live in a private mktemp directory: this runs as a scheduled
-    # job, and fixed names under /tmp are predictable and can collide.
-    NFS_TMP=$(mktemp -d)
-    NFS_DEDUPED="${NFS_TMP}/nfs-changes-deduped"
-    NFS_LIST="${NFS_TMP}/sync-nfs.list"
-    NFS_SSD_LIST="${NFS_TMP}/sync-nfs-ssd.list"
-    sort -u "${NFS_CHANGE_LOG}" > "${NFS_DEDUPED}"
-
-    # HDD NFS — include only /srv/nfs/immich/ paths.
-    # `|| true` is REQUIRED: if the last iteration's `[ -f "$f" ]` is false
-    # (file was deleted between inotify capture and now — e.g., immich
-    # encoded-video temp file that got cleaned up), the while loop returns
-    # 1, pipefail propagates, and `set -e` kills the script silently before
-    # reaching the rsync. Matches the SSD section's pattern below.
-    grep -E "${NFS_SDA_BYPASS_RE}" "${NFS_DEDUPED}" | \
-        while IFS= read -r f; do [ -f "$f" ] && echo "${f#/srv/nfs/}"; done \
-        > "${NFS_LIST}" 2>/dev/null || true
-    NFS_COUNT=$(wc -l < "${NFS_LIST}" 2>/dev/null || echo 0)
-    if [ "${NFS_COUNT:-0}" -gt 0 ]; then
-        if rsync -rlt --files-from="${NFS_LIST}" /srv/nfs/ "${NFS_DEST}/" 2>&1; then
-            log "  OK: nfs/ (${NFS_COUNT} immich files)"
-        else
-            warn "nfs/ incremental failed"; STATUS=1; NFS_STEP_OK=0
-        fi
-    fi
-
-    # SSD NFS — immich-only (2026-06-01); ollama/llamacpp are live-only, no offsite.
-    grep '^/srv/nfs-ssd/immich/' "${NFS_DEDUPED}" | \
-        while IFS= read -r f; do [ -f "$f" ] && echo "${f#/srv/nfs-ssd/}"; done \
-        > "${NFS_SSD_LIST}" 2>/dev/null || true
-    SSD_COUNT=$(wc -l < "${NFS_SSD_LIST}" 2>/dev/null || echo 0)
-    if [ "${SSD_COUNT:-0}" -gt 0 ]; then
-        if rsync -rlt --files-from="${NFS_SSD_LIST}" /srv/nfs-ssd/ "${NFS_SSD_DEST}/" 2>&1; then
-            log "  OK: nfs-ssd/ (${SSD_COUNT} files)"
-        else
-            warn "nfs-ssd/ incremental failed"; STATUS=1; NFS_STEP_OK=0
-        fi
-    fi
-
-    TOTAL=$(wc -l < "${NFS_DEDUPED}")
-    log "  Processed ${TOTAL} change events (${NFS_COUNT} nfs/immich + ${SSD_COUNT} nfs-ssd files synced)"
-    if [ "${NFS_STEP_OK}" -eq 1 ]; then
-        > "${NFS_CHANGE_LOG}"
-    else
-        warn "NFS change log preserved for retry"
-    fi
-    rm -rf -- "${NFS_TMP}"
-else
-    log "  No NFS changes to sync"
-fi
-
-# ============================================================
-# Finish
-# ============================================================
-if [ "${STATUS}" -eq 0 ]; then
-    touch "${BACKUP_ROOT}/.last-offsite-sync"
-    > "${MANIFEST}"
-    log "=== Offsite sync complete (success) ==="
-else
-    warn "Offsite sync had errors — manifest preserved for retry"
-    log "=== Offsite sync complete (with errors) ==="
-fi
-
-# Push metrics (best-effort). backup_last_success_timestamp is advanced only
-# when the run succeeded; it used to be set to "now" on every run, so a
-# failing sync still looked fresh to anything watching that metric. The push
-# is a POST, which replaces only the metric names it carries, so on failure
-# the previously pushed success timestamp is left untouched.
-{
-    if [ "${STATUS}" -eq 0 ]; then
-        echo "backup_last_success_timestamp $(date +%s)"
-    fi
-    echo "offsite_sync_last_status ${STATUS}"
-} | curl -s --connect-timeout 5 --max-time 10 --data-binary @- "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" 2>/dev/null || true
-
-exit "${STATUS}"
--- a/scripts/offsite-sync-backup.timer
+++ b/scripts/offsite-sync-backup.timer
@ -1,10 +0,0 @@
-[Unit]
-Description=Daily offsite sync: sda + NFS changes to Synology
-
-[Timer]
-OnCalendar=*-*-* 06:00:00
-Persistent=true
-RandomizedDelaySec=300
-
-[Install]
-WantedBy=timers.target
--- a/scripts/parse-postmortem-todos.sh
+++ b/scripts/parse-postmortem-todos.sh
@ -1,89 +0,0 @@
-#!/bin/sh
-# parse-postmortem-todos.sh — Extract auto-implementable TODOs from a post-mortem markdown file
-# Usage: bash scripts/parse-postmortem-todos.sh docs/post-mortems/2026-04-14-foo.md
-# Output: JSON with file path and list of TODOs
-#
-# Supports two table formats:
-#   New: | Priority | Action | Type | Details | Status |
-#   Old: | Action | Status | Details |  (infers type from action text)
-set -eu
-
-PM_FILE="${1:?Usage: $0 <post-mortem.md>}"
-
-if [ ! -f "$PM_FILE" ]; then
-  echo '{"file": "", "todos": [], "error": "File not found"}' >&2
-  exit 1
-fi
-
-# Parse the post-mortem with an inline Python program and print a JSON report
-# on stdout. The program sits inside a double-quoted shell string, so it must
-# not contain \$, backticks or double quotes.
-python3 -c "
-import re, json, sys
-
-pm_file = sys.argv[1]
-with open(pm_file) as f:
-    content = f.read()
-
-# Only TODOs of these types are reported under 'todos'; the rest go to 'skipped'.
-safe_types = {'Alert', 'Config', 'Monitor'}
-
-todos = []
-
-# Format 1 (new template): | Priority | Action | Type | Details | Status |
-pattern_new = r'\|\s*(P[0-3])\s*\|\s*(.+?)\s*\|\s*(\w+)\s*\|\s*(.+?)\s*\|\s*TODO\s*\|'
-for priority, action, todo_type, details in re.findall(pattern_new, content):
-    todos.append({
-        'priority': priority.strip(),
-        'action': action.strip(),
-        'type': todo_type.strip(),
-        'details': details.strip(),
-        'safe': todo_type.strip() in safe_types
-    })
-
-# Format 2 (old): | Action | TODO/Done | Details | or | Action | Owner | Status |
-# Look for rows with TODO in any column
-if not todos:
-    pattern_old = r'\|\s*(.+?)\s*\|\s*TODO\s*\|\s*(.+?)\s*\|'
-    for action, details in re.findall(pattern_old, content):
-        action = action.strip()
-        details = details.strip()
-        # Skip header rows and clean up leading pipes
-        if action.startswith('--') or action.lower() == 'action':
-            continue
-        action = action.lstrip('| ').strip()
-        # Infer type from action text
-        action_lower = action.lower()
-        if any(kw in action_lower for kw in ['prometheusrule', 'alert', 'alerting']):
-            todo_type = 'Alert'
-        elif any(kw in action_lower for kw in ['uptime kuma', 'monitor', 'ping', 'tcp check']):
-            todo_type = 'Monitor'
-        # 'add.*option' is a regex, so it needs re.search: as a plain
-        # substring inside the keyword list it could never match.
-        elif (any(kw in action_lower for kw in ['config', 'manage', 'document', 'nfs.conf'])
-              or re.search(r'add.*option', action_lower)):
-            todo_type = 'Config'
-        elif any(kw in action_lower for kw in ['migrate', 'move']):
-            todo_type = 'Migration'
-        elif any(kw in action_lower for kw in ['review', 'investigate', 'verify']):
-            todo_type = 'Investigation'
-        else:
-            todo_type = 'Config'  # default to Config for ambiguous items
-
-        # The old format has no priority column, so every row gets the default.
-        priority = 'P2'  # default
-        todos.append({
-            'priority': priority,
-            'action': action,
-            'type': todo_type,
-            'details': details,
-            'safe': todo_type in safe_types
-        })
-
-safe_todos = [t for t in todos if t['safe']]
-unsafe_todos = [t for t in todos if not t['safe']]
-
-result = {
-    'file': pm_file,
-    'todos': safe_todos,
-    'skipped': unsafe_todos,
-    'total_todos_in_doc': len(todos),
-    'safe_todos': len(safe_todos),
-    'skipped_todos': len(unsafe_todos)
-}
-
-print(json.dumps(result, indent=2))
-" "$PM_FILE"
--- a/scripts/pfsense-haproxy-bootstrap.php
+++ b/scripts/pfsense-haproxy-bootstrap.php
@ -1,236 +0,0 @@
-<?php
-// pfSense HAProxy bootstrap — configures the mailserver PROXY-v2 path
-// (bd code-yiu, Phases 2/3 + 5).
-//
-// WHY THIS EXISTS
-//   pfSense HAProxy config is stored XML-in-`/cf/conf/config.xml` under
-//   `<installedpackages><haproxy>`. That file IS picked up by the nightly
-//   `daily-backup` on the PVE host (see `scripts/daily-backup.sh` → `scp
-//   root@10.0.20.1:/cf/conf/config.xml`) and synced to Synology. This script
-//   is the canonical reproducer: run it to rebuild the pfSense HAProxy config
-//   from scratch (DR restore, fresh pfSense install, etc.).
-//
-// WHAT IT BUILDS
-//   4 backend pools — one per mail port:
-//     mailserver_nodes_smtp  → k8s-node1..4:30125 (container :2525 postscreen)
-//     mailserver_nodes_smtps → k8s-node1..4:30126 (container :4465 smtps)
-//     mailserver_nodes_sub   → k8s-node1..4:30127 (container :5587 submission)
-//     mailserver_nodes_imaps → k8s-node1..4:30128 (container :10993 IMAPS)
-//   Each server uses `send-proxy-v2` and is health-checked every 5s (checkinter 5000).
-//   4 frontends on pfSense 10.0.20.1:{25,465,587,993} TCP mode.
-//   + 1 legacy test frontend on :2525 (kept for validation; safe to remove later).
-//
-// USAGE (on pfSense host, via SSH as admin)
-//   scp infra/scripts/pfsense-haproxy-bootstrap.php admin@10.0.20.1:/tmp/
-//   ssh admin@10.0.20.1 'php /tmp/pfsense-haproxy-bootstrap.php'
-//
-// IDEMPOTENCY
-//   Removes any existing entries named mailserver_* before re-adding, so
-//   repeat runs are safe and behave as reset-to-declared.
-
-require_once('/etc/inc/config.inc');
-require_once('/usr/local/pkg/haproxy/haproxy.inc');
-require_once('/usr/local/pkg/haproxy/haproxy_utils.inc');
-
-global $config;
-parse_config(true);
-
-if (!is_array($config['installedpackages']['haproxy'])) {
-    $config['installedpackages']['haproxy'] = [];
-}
-$h = &$config['installedpackages']['haproxy'];
-
-$h['enable']  = 'yes';
-$h['maxconn'] = '1000';
-
-// Our declared object names (anything starting with mailserver_ is ours)
-$POOL_NAMES = [
-    'mailserver_nodes',          // legacy (Phase 2/3 test)
-    'mailserver_nodes_smtp',
-    'mailserver_nodes_smtps',
-    'mailserver_nodes_sub',
-    'mailserver_nodes_imaps',
-];
-$FRONTEND_NAMES = [
-    'mailserver_proxy_test',     // legacy (Phase 2/3 test, :2525)
-    'mailserver_proxy_25',
-    'mailserver_proxy_465',
-    'mailserver_proxy_587',
-    'mailserver_proxy_993',
-];
-
-// k8s workers only. Deliberately left out of this list: the master
-// (control-plane) node, and node5 (doesn't exist in this topology).
-$NODES = [
-    ['k8s-node1', '10.0.20.101'],
-    ['k8s-node2', '10.0.20.102'],
-    ['k8s-node3', '10.0.20.103'],
-    ['k8s-node4', '10.0.20.104'],
-];
-
-// Build a pool with optional split healthcheck path.
-//
-// $check_port: if non-null, HAProxy sends health probes to that NodePort
-//   (which Service `mailserver-proxy` maps to the pod's stock no-PROXY
-//   listener — see infra/stacks/mailserver/.../mailserver_proxy ports
-//   30145/30146/30147). Real client traffic still goes to $nodeport with
-//   PROXY v2 framing.
-// $check_type: 'TCP' for plain accept-on-port checks, 'ESMTP' for
-//   `option smtpchk EHLO <monitor_domain>` (real SMTP banner+EHLO+250).
-//
-// Why split: smtpd-proxy587/4465 fatal on every PROXY-v2-aware health
-// probe with `smtpd_peer_hostaddr_to_sockaddr: ... Servname not supported`
-// — the daemon respawns get throttled by Postfix master and real clients
-// land mid-respawn → 6s TCP timeout. Routing health probes to the stock
-// no-PROXY port sidesteps the bug entirely while data path still gets
-// PROXY v2 for CrowdSec/Postfix client-IP visibility. The HAProxy package
-// has no `checkport` field, so `port N` is appended via the server's
-// `advanced` string (HAProxy parses server keywords in any order).
-/**
- * Build one HAProxy backend-pool config array with a server entry per node.
- *
- * @param string      $name           Pool name.
- * @param string      $nodeport       Port set on every server entry.
- * @param array       $nodes          List of [server name, address] pairs.
- * @param string      $check_type     Stored as the pool's 'check_type'.
- * @param string|null $check_port     When non-null, "port N" is appended to
- *                                    each server's 'advanced' string.
- * @param string      $monitor_domain Stored as the pool's 'monitor_domain'.
- * @return array Pool definition; the servers sit under ['ha_servers']['item'].
- */
-function build_pool(
-    string $name,
-    string $nodeport,
-    array $nodes,
-    string $check_type = 'TCP',
-    ?string $check_port = null,
-    string $monitor_domain = ''
-): array {
-    // Every server sends PROXY v2; the optional "port N" keyword points the
-    // health check at a different port than the data path.
-    $advanced_check = $check_port !== null
-        ? "send-proxy-v2 port {$check_port}"
-        : 'send-proxy-v2';
-    $servers = [];
-    foreach ($nodes as $n) {
-        $servers[] = [
-            'name'       => $n[0],
-            'address'    => $n[1],
-            'port'       => $nodeport,
-            'weight'     => '10',
-            'ssl'        => '',
-            // 5s = sub-block-window failover when a NodePort goes sour.
-            // Safe to be aggressive once health probes don't fatal smtpd.
-            'checkinter' => '5000',
-            'advanced'   => $advanced_check,
-            'status'     => 'active',
-        ];
-    }
-    return [
-        'name'                   => $name,
-        'balance'                => 'roundrobin',
-        'check_type'             => $check_type,
-        'monitor_domain'         => $monitor_domain,
-        'checkinter'             => '5000',
-        'retries'                => '3',
-        'ha_servers'             => ['item' => $servers],
-        'advanced_bind'          => '',
-        'persist_cookie_enabled' => '',
-        'transparent_clientip'   => '',
-        'advanced'               => '',
-    ];
-}
-
-/**
- * Build one HAProxy frontend config array: a TCP-mode listener with a single
- * bind address that hands connections to the named backend pool.
- *
- * @param string $name    Frontend name.
- * @param string $descr   Free-text description stored with the frontend.
- * @param string $extaddr Address to listen on.
- * @param string $port    Port to listen on.
- * @param string $pool    Name of the backend pool to use.
- * @return array Frontend definition.
- */
-function build_frontend(string $name, string $descr, string $extaddr, string $port, string $pool): array {
-    return [
-        'name'      => $name,
-        'descr'     => $descr,
-        'status'    => 'active',
-        'secondary' => '',
-        'type'      => 'tcp',
-        // Exactly one listen address/port pair per frontend.
-        'a_extaddr' => ['item' => [[
-            'extaddr'          => $extaddr,
-            'extaddr_port'     => $port,
-            'extaddr_ssl'      => '',
-            'extaddr_advanced' => '',
-        ]]],
-        'backend_serverpool' => $pool,
-        'ha_acls'    => '',
-        'dontlognull'=> '',
-        'httpclose'  => '',
-        'forwardfor' => '',
-        'advanced'   => '',
-    ];
-}
-
-// ── Backend pools ───────────────────────────────────────────────────────
-if (!is_array($h['ha_pools']))         $h['ha_pools']         = ['item' => []];
-if (!is_array($h['ha_pools']['item'])) $h['ha_pools']['item'] = [];
-$h['ha_pools']['item'] = array_values(array_filter(
-    $h['ha_pools']['item'],
-    fn($p) => !in_array($p['name'] ?? '', $POOL_NAMES, true)
-));
-
-// Legacy test pool (still used by the :2525 test frontend for manual SMTP roundtrip).
-$h['ha_pools']['item'][] = build_pool('mailserver_nodes',       '30125', $NODES);
-
-// Production pools — one per mail port.
-//
-// All SMTP/SMTPS/Submission backends use plain TCP checks against
-// dedicated non-PROXY healthcheck NodePorts (30145/30146/30147 → pod
-// stock 25/465/587) so probes hit the no-PROXY listeners and avoid
-// the smtpd_peer_hostaddr_to_sockaddr fatal that fires on PROXY-v2
-// LOCAL frames. Real client traffic still goes to 30125-30128 with
-// PROXY v2 for client-IP visibility.
-//
-// We tried `option smtpchk EHLO` initially — it works on the plain
-// `submission` daemon (587) but flaps the `postscreen` listener on
-// port 25 (multi-line greet + DNSBL silence + anti-pre-greet
-// detection makes HAProxy's simple smtpchk parser hit L7RSP). A
-// plain TCP accept-on-port check is enough for both: HAProxy still
-// gets fast failover when the listener actually goes away, and we
-// stop triggering the Postfix fatal entirely.
-//
-// IMAPS stays on its existing TCP-check-with-PROXY-frame for now —
-// Dovecot's PROXY parser doesn't show the same fatal pattern; adding
-// a separate IMAP healthcheck path would require another svc port.
-$h['ha_pools']['item'][] = build_pool('mailserver_nodes_smtp',  '30125', $NODES, 'TCP', '30145');
-$h['ha_pools']['item'][] = build_pool('mailserver_nodes_smtps', '30126', $NODES, 'TCP', '30146');
-$h['ha_pools']['item'][] = build_pool('mailserver_nodes_sub',   '30127', $NODES, 'TCP', '30147');
-$h['ha_pools']['item'][] = build_pool('mailserver_nodes_imaps', '30128', $NODES);
-
-// ── Frontends ───────────────────────────────────────────────────────────
-if (!is_array($h['ha_backends']))         $h['ha_backends']         = ['item' => []];
-if (!is_array($h['ha_backends']['item'])) $h['ha_backends']['item'] = [];
-$h['ha_backends']['item'] = array_values(array_filter(
-    $h['ha_backends']['item'],
-    fn($f) => !in_array($f['name'] ?? '', $FRONTEND_NAMES, true)
-));
-
-// Legacy test frontend — :2525 — retained so SMTP roundtrip tests keep working
-// without touching the real :25. Safe to remove once fully validated.
-$h['ha_backends']['item'][] = build_frontend(
-    'mailserver_proxy_test',
-    'code-yiu Phase 2/3 test — PROXY v2 to k8s mailserver NodePort 30125 (alt port :2525)',
-    '10.0.20.1', '2525',
-    'mailserver_nodes'
-);
-
-// Production frontends — 4 ports listening on pfSense VLAN20 IP 10.0.20.1.
-$h['ha_backends']['item'][] = build_frontend(
-    'mailserver_proxy_25',
-    'code-yiu Phase 4/5 — external SMTP (:25) via PROXY v2 → pod :2525 postscreen',
-    '10.0.20.1', '25',
-    'mailserver_nodes_smtp'
-);
-$h['ha_backends']['item'][] = build_frontend(
-    'mailserver_proxy_465',
-    'code-yiu Phase 4/5 — external SMTPS (:465) via PROXY v2 → pod :4465 smtpd',
-    '10.0.20.1', '465',
-    'mailserver_nodes_smtps'
-);
-$h['ha_backends']['item'][] = build_frontend(
-    'mailserver_proxy_587',
-    'code-yiu Phase 4/5 — external submission (:587) via PROXY v2 → pod :5587 smtpd',
-    '10.0.20.1', '587',
-    'mailserver_nodes_sub'
-);
-$h['ha_backends']['item'][] = build_frontend(
-    'mailserver_proxy_993',
-    'code-yiu Phase 4/5 — external IMAPS (:993) via PROXY v2 → pod :10993 Dovecot',
-    '10.0.20.1', '993',
-    'mailserver_nodes_imaps'
-);
-
-write_config('code-yiu: mailserver HAProxy — 4 production frontends + legacy :2525 test');
-
-$messages = '';
-$rc = haproxy_check_and_run($messages, true);
-echo 'haproxy_check_and_run rc=' . ($rc ? 'OK' : 'FAIL') . "\n";
-echo "messages: $messages\n";
--- a/scripts/pfsense-nat-mailserver-haproxy-flip.php
+++ b/scripts/pfsense-nat-mailserver-haproxy-flip.php
@ -1,68 +0,0 @@
-<?php
-// pfSense NAT redirect flip — mail ports 25/465/587/993 from
-// <mailserver> alias (10.0.20.202 MetalLB LB) to pfSense's own HAProxy
-// listener (10.0.20.1). bd code-yiu.
-//
-// THIS IS THE CUTOVER. After this script:
-//   Internet → pfSense WAN:{25,465,587,993} → rdr → 10.0.20.1:{...}
-//   (pfSense HAProxy) → send-proxy-v2 → k8s-node:{30125..30128} NodePort
-//   → kube-proxy → mailserver pod alt listeners (2525/4465/5587/10993)
-//   → Postfix/Dovecot parse PROXY v2 → real client IP recovered.
-//
-// Internal clients (Roundcube, email-roundtrip-monitor CronJob) continue
-// using the existing mailserver ClusterIP Service on the stock ports
-// (25/465/587/993) which hit container stock listeners WITHOUT PROXY.
-// No change to internal traffic paths.
-//
-// USAGE
-//   scp infra/scripts/pfsense-nat-mailserver-haproxy-flip.php admin@10.0.20.1:/tmp/
-//   ssh admin@10.0.20.1 'php /tmp/pfsense-nat-mailserver-haproxy-flip.php'
-//
-// REVERT — run pfsense-nat-mailserver-haproxy-unflip.php (companion script).
-//
-// IDEMPOTENT — re-runs converge. Flips nothing if already pointed at 10.0.20.1.
-
-require_once('/etc/inc/config.inc');
-require_once('/etc/inc/filter.inc');
-
-global $config;
-parse_config(true);
-
-$PORTS_TO_FLIP = ['25', '465', '587', '993'];
-$OLD_TARGET    = 'mailserver';
-$NEW_TARGET    = '10.0.20.1';
-
-// Fail loudly when the config has no NAT rule list at all. Without this the
-// by-reference foreach below ran over null, emitted a PHP warning, and the
-// script then reported "Already flipped?", which hides the real problem.
-if (!isset($config['nat']['rule']) || !is_array($config['nat']['rule'])) {
-    fwrite(STDERR, "No NAT rules found in config; nothing to flip.\n");
-    exit(1);
-}
-
-// Retarget every WAN rule whose local port is a mail port and whose target
-// is still the old alias; report and leave alone anything else on those ports.
-$changed = 0;
-foreach ($config['nat']['rule'] as $i => &$r) {
-    $iface = $r['interface'] ?? '';
-    $lport = $r['local-port'] ?? '';
-    $tgt   = $r['target'] ?? '';
-
-    if ($iface !== 'wan') continue;
-    if (!in_array($lport, $PORTS_TO_FLIP, true)) continue;
-    if ($tgt !== $OLD_TARGET) {
-        printf("rule %d (dport=%s) target=%s — not flipping (already %s or unexpected)\n",
-               $i, $lport, $tgt, $NEW_TARGET);
-        continue;
-    }
-
-    $r['target'] = $NEW_TARGET;
-    // Also unset the 'associated-rule-id' linked filter rule target if any —
-    // actually pfSense regenerates the associated rule from NAT rule on apply,
-    // so leaving associated-rule-id intact is fine.
-    $changed++;
-    printf("rule %d (dport=%s): target %s → %s\n", $i, $lport, $OLD_TARGET, $NEW_TARGET);
-}
-// Break the reference so later writes to $r cannot touch the last rule.
-unset($r);
-
-if ($changed === 0) {
-    echo "No changes. (Already flipped? Run unflip script to revert.)\n";
-    exit(0);
-}
-
-write_config("code-yiu: NAT rdr — mail ports {$changed} flipped to HAProxy (10.0.20.1)");
-
-// Rebuild pf rules & reload.
-$rc = filter_configure();
-printf("filter_configure rc=%s\n", var_export($rc, true));
-echo "done.\n";
--- a/scripts/pfsense-nat-mailserver-haproxy-unflip.php
+++ b/scripts/pfsense-nat-mailserver-haproxy-unflip.php
@ -1,48 +0,0 @@
-<?php
-// REVERT of pfsense-nat-mailserver-haproxy-flip.php.
-// Moves mail-port NAT rdr target from 10.0.20.1 (pfSense HAProxy) back to
-// <mailserver> alias (10.0.20.202 MetalLB LB IP). bd code-yiu rollback.
-//
-// USE THIS IF: external mail breaks after the flip, any postscreen
-// PROXY timeouts show up in logs, or you need to back out before Phase 6.
-
-require_once('/etc/inc/config.inc');
-require_once('/etc/inc/filter.inc');
-
-global $config;
-parse_config(true);
-
-$PORTS_TO_REVERT = ['25', '465', '587', '993'];
-$OLD_TARGET      = '10.0.20.1';
-$NEW_TARGET      = 'mailserver';
-
-// Fail loudly when the config has no NAT rule list at all. Without this the
-// by-reference foreach below ran over null, emitted a PHP warning, and the
-// script then reported "Already reverted.", which hides the real problem.
-if (!isset($config['nat']['rule']) || !is_array($config['nat']['rule'])) {
-    fwrite(STDERR, "No NAT rules found in config; nothing to revert.\n");
-    exit(1);
-}
-
-// Point every WAN rule whose local port is a mail port and whose target is
-// the HAProxy address back at the alias; report and skip anything else.
-$changed = 0;
-foreach ($config['nat']['rule'] as $i => &$r) {
-    $iface = $r['interface'] ?? '';
-    $lport = $r['local-port'] ?? '';
-    $tgt   = $r['target'] ?? '';
-
-    if ($iface !== 'wan') continue;
-    if (!in_array($lport, $PORTS_TO_REVERT, true)) continue;
-    if ($tgt !== $OLD_TARGET) {
-        printf("rule %d (dport=%s) target=%s — not reverting (already %s or unexpected)\n",
-               $i, $lport, $tgt, $NEW_TARGET);
-        continue;
-    }
-
-    $r['target'] = $NEW_TARGET;
-    $changed++;
-    printf("rule %d (dport=%s): target %s → %s\n", $i, $lport, $OLD_TARGET, $NEW_TARGET);
-}
-// Break the reference so later writes to $r cannot touch the last rule.
-unset($r);
-
-if ($changed === 0) {
-    echo "No changes. (Already reverted.)\n";
-    exit(0);
-}
-
-write_config("code-yiu: NAT rdr — mail ports {$changed} reverted to <mailserver> alias");
-
-// Rebuild pf rules & reload.
-$rc = filter_configure();
-printf("filter_configure rc=%s\n", var_export($rc, true));
-echo "done.\n";
--- a/scripts/postmortem-pipeline.sh
+++ b/scripts/postmortem-pipeline.sh
@ -1,81 +0,0 @@
-#!/bin/sh
-# postmortem-pipeline.sh — Woodpecker pipeline step for post-mortem TODO automation
-# Called from .woodpecker/postmortem-todos.yml
-#
-# Flow: pick the first post-mortem that still has a '| TODO |' row, parse its
-# TODOs, log in to Vault with the pod's service-account JWT, fetch the agent
-# API token, submit a job to claude-agent-service and poll until it finishes.
-# Exit status: 0 when there is nothing to do or the job completed, 1 otherwise.
-set -e
-
-VAULT_URL="http://vault-active.vault.svc.cluster.local:8200"
-AGENT_URL="http://claude-agent-service.claude-agent.svc.cluster.local:8080"
-
-# 1. Find post-mortem(s) with TODO items
-# Scan all post-mortems — don't rely on git diff (Woodpecker shallow clone breaks HEAD~1)
-PM_FILE=""
-for f in docs/post-mortems/*.md; do
-  if grep -q '| TODO |' "$f" 2>/dev/null; then
-    PM_FILE="$f"
-    break
-  fi
-done
-if [ -z "$PM_FILE" ]; then
-  echo "No post-mortem with pending TODOs found"
-  exit 0
-fi
-echo "Post-mortem with TODOs: $PM_FILE"
-
-# 2. Parse TODOs
-sh scripts/parse-postmortem-todos.sh "$PM_FILE" > /tmp/todos.json
-cat /tmp/todos.json
-TODO_COUNT=$(jq '.safe_todos' /tmp/todos.json)
-# A missing or non-numeric count (e.g. "null") must stop the step: a failing
-# numeric test inside `if` would otherwise be read as "not zero" and a job
-# would be submitted for nothing.
-case "$TODO_COUNT" in
-  '' | *[!0-9]*)
-    echo "ERROR: unexpected safe_todos value: $TODO_COUNT"
-    exit 1
-    ;;
-esac
-echo "$TODO_COUNT safe TODO(s) found"
-if [ "$TODO_COUNT" -eq 0 ]; then
-  echo "No auto-implementable TODOs — skipping"
-  exit 0
-fi
-
-# 3. Authenticate to Vault via K8s SA JWT
-SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
-# Swallow the curl failure here so the explicit error below is what gets
-# printed, instead of `set -e` ending the step without a message.
-VAULT_RESP=$(curl -sf -X POST "$VAULT_URL/v1/auth/kubernetes/login" \
-  -d "{\"role\":\"ci\",\"jwt\":\"$SA_TOKEN\"}") || VAULT_RESP=""
-VAULT_TOKEN=$(printf '%s' "$VAULT_RESP" | jq -r .auth.client_token)
-if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then
-  echo "ERROR: Vault authentication failed"
-  exit 1
-fi
-echo "Vault authenticated"
-
-# 4. Fetch API token for claude-agent-service
-AGENT_TOKEN=$(curl -sf -H "X-Vault-Token: $VAULT_TOKEN" \
-  "$VAULT_URL/v1/secret/data/claude-agent-service" | \
-  jq -r '.data.data.api_bearer_token')
-if [ -z "$AGENT_TOKEN" ] || [ "$AGENT_TOKEN" = "null" ]; then
-  echo "ERROR: Failed to fetch agent API token"
-  exit 1
-fi
-echo "Agent token fetched"
-
-# 5. Submit to claude-agent-service
-TODOS=$(cat /tmp/todos.json)
-PAYLOAD=$(jq -n \
-  --arg prompt "Implement the auto-implementable TODOs from $PM_FILE. Parsed TODO list: $TODOS" \
-  --arg agent ".claude/agents/postmortem-todo-resolver" \
-  '{prompt: $prompt, agent: $agent, max_budget_usd: 5, timeout_seconds: 900}')
-
-RESP=$(curl -sf -X POST \
-  -H "Authorization: Bearer $AGENT_TOKEN" \
-  -H "Content-Type: application/json" \
-  -d "$PAYLOAD" \
-  "$AGENT_URL/execute") || {
-  echo "ERROR: Failed to submit job to claude-agent-service"
-  exit 1
-}
-JOB_ID=$(printf '%s' "$RESP" | jq -r '.job_id')
-if [ -z "$JOB_ID" ] || [ "$JOB_ID" = "null" ]; then
-  echo "ERROR: claude-agent-service did not return a job_id"
-  exit 1
-fi
-echo "Job submitted: $JOB_ID"
-
-# 6. Poll for completion (15min max: 60 polls, 15s apart)
-i=0
-while [ "$i" -lt 60 ]; do
-  i=$((i + 1))
-  sleep 15
-  # One failed status request must not fail the whole step; retry on the
-  # next tick and let the overall timeout bound the wait.
-  if ! RESULT=$(curl -sf \
-    -H "Authorization: Bearer $AGENT_TOKEN" \
-    "$AGENT_URL/jobs/$JOB_ID"); then
-    echo "[$i/60] Status request failed — retrying"
-    continue
-  fi
-  STATUS=$(printf '%s' "$RESULT" | jq -r '.status')
-  echo "[$i/60] Status: $STATUS"
-  if [ "$STATUS" != "running" ]; then
-    printf '%s\n' "$RESULT" | jq .
-    if [ "$STATUS" = "completed" ]; then exit 0; else exit 1; fi
-  fi
-done
-echo "ERROR: Job timed out after 15 minutes"
-exit 1
--- a/scripts/provision-k8s-worker
+++ b/scripts/provision-k8s-worker
@ -1,109 +0,0 @@
-#!/usr/bin/env bash
-# provision-k8s-worker NAME VMID IP[/CIDR]
-#
-# Clone PVE template 2000 (ubuntu-2404-cloudinit-k8s-template) into a new
-# VM, configure resources to match k8s-node3/4 (32G RAM, 8 vCPU, host CPU,
-# 256G disk, VLAN 20 on vmbr1), attach the shared cicustom snippet
-# (/var/lib/vz/snippets/k8s_cloud_init.yaml), and start it. Cloud-init
-# inside the VM installs containerd + kubelet, applies the bundled
-# setup script, and runs the kubeadm join. No manual steps after this.
-#
-# Hostname comes from a per-node meta-data snippet written by this script
-# (see META_SNIPPET_FILE below) — DO NOT hard-code it in the shared
-# user-data snippet.
-#
-# Safe to re-run: aborts if VMID already exists or IP already answers ping.
-#
-# Usage:
-#   ssh root@192.168.1.127 bash -s -- k8s-node6 206 10.0.20.106 < provision-k8s-worker
-# or, if the script lives on the PVE host:
-#   provision-k8s-worker k8s-node6 206 10.0.20.106
-# The IP may carry an explicit prefix length (10.0.20.106/22); /22 is the
-# default when it is omitted.
-#
-# Run on the PVE host (needs qm + /var/lib/vz/snippets access).
-set -euo pipefail
-
-if [ $# -ne 3 ]; then
-    echo "usage: $0 NAME VMID IP[/CIDR]" >&2
-    echo "  e.g. $0 k8s-node6 206 10.0.20.106" >&2
-    exit 2
-fi
-
-NAME=$1
-VMID=$2
-# Split "addr[/prefix]": the bare address is what gets pinged, the prefix is
-# only used for the cloud-init ipconfig (a fixed "/22" suffix would turn
-# "10.0.20.106/22" into "10.0.20.106/22/22").
-IP=${3%%/*}
-case "$3" in
-    */*) PREFIX=${3#*/} ;;
-    *)   PREFIX=22 ;;
-esac
-CIDR_IP="${IP}/${PREFIX}"
-GW="10.0.20.1"
-DNS="10.0.20.201"
-SEARCH="viktorbarzin.lan"
-TEMPLATE_ID=2000
-STORAGE="local-lvm"
-USER_SNIPPET="local:snippets/k8s_cloud_init.yaml"
-# Per-node meta-data snippet — written below — supplies local-hostname.
-# Proxmox's auto-generated metadata DOESN'T include hostname when
-# cicustom user=… is set, so the shared user-data snippet alone leaves
-# nodes joining as "ubuntu" (image default). Per-node meta-data is the
-# clean fix.
-META_SNIPPET_FILE="/var/lib/vz/snippets/${NAME}-meta.yaml"
-META_SNIPPET="local:snippets/${NAME}-meta.yaml"
-BRIDGE="vmbr1"
-VLAN=20
-
-# Sanity: VMID must be free
-if qm status "$VMID" >/dev/null 2>&1; then
-    echo "ERROR: VM $VMID already exists. Refusing to clobber." >&2
-    qm status "$VMID" >&2
-    exit 1
-fi
-
-# Sanity: IP must not be pingable
-if ping -c 1 -W 1 "$IP" >/dev/null 2>&1; then
-    echo "ERROR: $IP is already responding to ping. Refusing to assign." >&2
-    exit 1
-fi
-
-# Sanity: snippet must exist
-if [ ! -f "/var/lib/vz/snippets/k8s_cloud_init.yaml" ]; then
-    echo "ERROR: /var/lib/vz/snippets/k8s_cloud_init.yaml missing." >&2
-    # Single quotes, not backticks: backticks inside a double-quoted string
-    # are command substitution and would actually run "tg apply" here.
-    echo "  Run 'tg apply' in infra/stacks/infra/ to regenerate it." >&2
-    exit 1
-fi
-
-# Sanity: template must be a template
-if ! qm config "$TEMPLATE_ID" | grep -q '^template: 1'; then
-    echo "ERROR: VMID $TEMPLATE_ID is not a template." >&2
-    exit 1
-fi
-
-echo "[1/6] write per-node meta-data snippet ($META_SNIPPET_FILE)"
-# instance-id embeds the current epoch, so every run yields a new id.
-cat > "$META_SNIPPET_FILE" <<META
-local-hostname: $NAME
-instance-id: $NAME-$(date +%s)
-META
-
-echo "[2/6] qm clone $TEMPLATE_ID -> $VMID ($NAME)"
-qm clone "$TEMPLATE_ID" "$VMID" --name "$NAME" --full true --storage "$STORAGE"
-
-echo "[3/6] qm set $VMID — VM resources + network + cicustom"
-qm set "$VMID" \
-    --agent 1 \
-    --balloon 32768 \
-    --cores 8 \
-    --cpu host \
-    --memory 32768 \
-    --net0 "virtio,bridge=$BRIDGE,tag=$VLAN" \
-    --ipconfig0 "ip=$CIDR_IP,gw=$GW" \
-    --nameserver "$DNS" \
-    --searchdomain "$SEARCH" \
-    --onboot 1 \
-    --startup 'order=5,up=45,down=420' \
-    --cicustom "user=$USER_SNIPPET,meta=$META_SNIPPET"
-
-echo "[4/6] qm resize $VMID scsi0 256G"
-qm resize "$VMID" scsi0 256G
-
-echo "[5/6] qm start $VMID"
-qm start "$VMID"
-
-echo "[6/6] Done. Cloud-init runs now; node should appear in 'kubectl get nodes' within ~6-10 min."
-echo "  Tail cloud-init: socat -u UNIX-CONNECT:/var/run/qemu-server/$VMID.serial0 STDOUT | strings"
-echo "  Final config:"
-qm config "$VMID" | grep -E '^(name|cores|memory|net0|ipconfig0|cicustom|scsi0|onboot):'
--- a/scripts/pve-nfs-exports
+++ b/scripts/pve-nfs-exports
@ -1,26 +0,0 @@
-# /etc/exports — NFS export configuration for Proxmox VE host
-# Managed in git: infra/scripts/pve-nfs-exports
-# Deploy: scp scripts/pve-nfs-exports root@192.168.1.127:/etc/exports && ssh root@192.168.1.127 exportfs -ra
-#
-# CRITICAL NOTES (learned from 2026-04-14 outage [PM-2026-04-14]):
-#   - NEVER add fsid=0 to /srv/nfs or /srv/nfs-ssd exports. fsid=0 designates the
-#     NFSv4 pseudo-root which changes path resolution for ALL subdirectory mounts.
-#     When CSI mounts use paths like /srv/nfs/technitium, fsid=0 makes them resolve
-#     as the root itself, causing ENOENT on all subdirectory mounts.
-#   - fsid=1 is acceptable on /srv/nfs-ssd (unique ID, not root).
-#   - The NFS CSI driver mounts subdirectories — never use fsid=0 on any export
-#     that serves dynamic path mounts.
-#   - NFSv3 is disabled on this host (vers3=n in /etc/nfs.conf) — all k8s mounts
-#     must use nfsvers=4 mount option.
-#
-# Export options explanation (these are /etc/exports options, not client mount options):
-#   rw           — read/write access (required for PVCs)
-#   async        — async writes safe: UPS protects host + Vault Raft replication +
-#                  databases on block storage. Only NFS metadata at risk.
-#   no_subtree_check — disable subtree checking for performance and reliability
-#   no_root_squash   — k8s CSI driver runs as root; squashing breaks PVC writes
-#   insecure         — allow source ports >1024 (required: pfSense VLAN NAT uses
-#                      unprivileged ports for VLAN 10 → 192.168.1.x traffic)
-#
-/srv/nfs *(rw,async,no_subtree_check,no_root_squash,insecure)
-/srv/nfs-ssd *(rw,sync,no_subtree_check,no_root_squash,insecure,fsid=1)
--- a/scripts/renew_worker_certs.sh
+++ b/scripts/renew_worker_certs.sh
@ -1,9 +0,0 @@
-#!/usr/bin/env bash
-
-echo 'KUBELET_KUBEADM_ARGS="--container-runtime-endpoint=unix:///var/run/containerd/containerd.sock --pod-infra-container-image=k8s.gcr.io/pause:3.7 --rotate-certificates=true --rotate-server-certificates=true"' | sudo tee /var/lib/kubelet/kubeadm-flags.env
-
-sudo systemctl daemon-reload
-sudo systemctl restart kubelet
-
-# Aprprove all csrs:
-# for csr in $(kb get csr | grep Pending | awk '{print $1}'); do echo $csr; kb certificate approve $csr; done
--- a/scripts/server_safe_poweroff/deploy_to_nas.sh
+++ b/scripts/server_safe_poweroff/deploy_to_nas.sh
@ -1,3 +0,0 @@
-#!/usr/bin/env bash
-# Cross-compile the power-check binary for linux/arm64 and copy it, together
-# with the synology_main.sh wrapper, to ~/server-power-cycle/ on the NAS.
-# Stops at the first failing step so the wrapper is never deployed without a
-# freshly built binary.
-set -euo pipefail
-
-# Private build directory instead of a fixed /tmp path; removed on any exit.
-build_dir=$(mktemp -d)
-trap 'rm -rf -- "$build_dir"' EXIT
-
-CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -o "$build_dir/powercheck-armv8" .
-rsync "$build_dir/powercheck-armv8" Administrator@nas:~/server-power-cycle/
-rsync synology_main.sh Administrator@nas:~/server-power-cycle/
--- a/scripts/server_safe_poweroff/go.mod
+++ b/scripts/server_safe_poweroff/go.mod
@ -1,12 +0,0 @@
-module viktorbarzin/server-lifecycle
-
-go 1.22.0
-
-toolchain go1.23.6
-
-require (
-	github.com/gosnmp/gosnmp v1.39.0
-	github.com/nightlyone/lockfile v1.0.0
-)
-
-require github.com/golang/glog v1.2.4 // indirect
--- a/scripts/server_safe_poweroff/go.sum
+++ b/scripts/server_safe_poweroff/go.sum
@ -1,14 +0,0 @@
-github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
-github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/golang/glog v1.2.4 h1:CNNw5U8lSiiBk7druxtSHHTsRWcxKoac6kZKm2peBBc=
-github.com/golang/glog v1.2.4/go.mod h1:6AhwSGph0fcJtXVM/PEHPqZlFeoLxhs7/t5UDAwmO+w=
-github.com/gosnmp/gosnmp v1.39.0 h1:mPJtSWFLkEemo2bz4fdNztZIFHYG86MC6c6veocq0ZE=
-github.com/gosnmp/gosnmp v1.39.0/go.mod h1:CxVS6bXqmWZlafUj9pZUnQX5e4fAltqPcijxWpCitDo=
-github.com/nightlyone/lockfile v1.0.0 h1:RHep2cFKK4PonZJDdEl4GmkabuhbsRMgk/k3uAmxBiA=
-github.com/nightlyone/lockfile v1.0.0/go.mod h1:rywoIealpdNse2r832aiD9jRk8ErCatROs6LzC841CI=
-github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
-github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
-github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
-gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
-gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
--- a/scripts/server_safe_poweroff/idrac_utils.go
+++ b/scripts/server_safe_poweroff/idrac_utils.go
@ -1,125 +0,0 @@
-package main
-
-import (
-	"bytes"
-	"crypto/tls"
-	"encoding/json"
-	"fmt"
-	"io"
-	"io/ioutil"
-	"net/http"
-
-	"github.com/golang/glog"
-)
-
-// PowerStateResponse is the part of the Redfish system resource that
-// checkPowerState decodes: only the "PowerState" field is read.
-type PowerStateResponse struct {
-	PowerState string `json:"PowerState"`
-}
-// ResetType is the string sent as the "ResetType" value by performResetType.
-type ResetType string
-
-// Reset types used by performPowerOn and performGracefulShutdown.
-const (
-	On               ResetType = "On"
-	GracefulShutdown ResetType = "GracefulShutdown"
-)
-
-// checkPowerState GETs <url>/redfish/v1/Systems/System.Embedded.1 with basic
-// auth and returns the "PowerState" field of the JSON reply. Any transport
-// error, non-200 status, read error or JSON error is returned as an error.
-// TLS certificate verification is disabled for this request.
-func checkPowerState(creds idracCredentials) (string, error) {
-	endpoint := creds.url + "/redfish/v1/Systems/System.Embedded.1"
-
-	req, err := http.NewRequest("GET", endpoint, nil)
-	if err != nil {
-		return "", fmt.Errorf("failed to create request: %v", err)
-	}
-	req.SetBasicAuth(creds.username, creds.password)
-	req.Header.Set("Accept", "application/json")
-
-	insecureTransport := &http.Transport{
-		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
-	}
-	httpClient := &http.Client{Transport: insecureTransport}
-
-	resp, err := httpClient.Do(req)
-	if err != nil {
-		return "", fmt.Errorf("failed to send request: %v", err)
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != http.StatusOK {
-		// Best effort: include whatever body could be read in the error.
-		errBody, _ := io.ReadAll(resp.Body)
-		return "", fmt.Errorf("unexpected status code: %d, response: %s", resp.StatusCode, string(errBody))
-	}
-
-	raw, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return "", fmt.Errorf("failed to read response body: %v", err)
-	}
-
-	var parsed PowerStateResponse
-	if err := json.Unmarshal(raw, &parsed); err != nil {
-		return "", fmt.Errorf("failed to parse JSON response: %v", err)
-	}
-	return parsed.PowerState, nil
-}
-
-// performGracefulShutdown calls performResetType with GracefulShutdown and
-// returns its error unchanged.
-func performGracefulShutdown(idracCredentials idracCredentials) error {
-	return performResetType(idracCredentials, GracefulShutdown)
-}
-
-// performPowerOn calls performResetType with On and returns its error
-// unchanged.
-func performPowerOn(idracCredentials idracCredentials) error {
-	return performResetType(idracCredentials, On)
-}
-
-// performResetType POSTs {"ResetType": <resetType>} to the Redfish
-// ComputerSystem.Reset action of System.Embedded.1 using basic auth.
-// It returns nil when the iDRAC answers 200, 202 or 204 and an error for any
-// marshalling, transport or unexpected-status failure.
-func performResetType(idracCredentials idracCredentials, resetType ResetType) error {
-	glog.Warningf("Starting graceful reset type %s!\n", resetType)
-
-	// idracCredentials.url holds only scheme+host (checkPowerState appends its
-	// own resource path to it), so the action path must be appended here;
-	// posting to the bare URL never reaches the reset action.
-	resetURL := idracCredentials.url + "/redfish/v1/Systems/System.Embedded.1/Actions/ComputerSystem.Reset"
-
-	// Define the payload for the reset request
-	payload := map[string]string{
-		"ResetType": string(resetType), // Only ResetType is needed
-	}
-	payloadBytes, err := json.Marshal(payload)
-	if err != nil {
-		return fmt.Errorf("failed to marshal payload: %v", err)
-	}
-
-	// Create a new HTTP request
-	req, err := http.NewRequest("POST", resetURL, bytes.NewBuffer(payloadBytes))
-	if err != nil {
-		return fmt.Errorf("failed to create request: %v", err)
-	}
-
-	// Set headers
-	req.Header.Set("Content-Type", "application/json")
-	req.SetBasicAuth(idracCredentials.username, idracCredentials.password)
-
-	// Use the same TLS settings as checkPowerState, which talks to the same
-	// host with certificate verification disabled; a default client here
-	// would reject a certificate that the status check accepts.
-	client := &http.Client{
-		Transport: &http.Transport{
-			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
-		},
-	}
-	resp, err := client.Do(req)
-	if err != nil {
-		return fmt.Errorf("failed to send request: %v", err)
-	}
-	defer resp.Body.Close()
-
-	// Check the response status code; 204 No Content is a success reply that
-	// simply carries no body.
-	switch resp.StatusCode {
-	case http.StatusOK, http.StatusAccepted, http.StatusNoContent:
-		// success
-	default:
-		body, _ := ioutil.ReadAll(resp.Body)
-		return fmt.Errorf("unexpected status code: %d, response: %s", resp.StatusCode, string(body))
-	}
-
-	glog.Infof("Reset type %s initiated successfully.\n", resetType)
-	return nil
-
-}
--- a/scripts/server_safe_poweroff/main.go
+++ b/scripts/server_safe_poweroff/main.go
@ -1,107 +0,0 @@
-package main
-
-import (
-	"flag"
-	"log"
-
-	"github.com/golang/glog"
-	"github.com/nightlyone/lockfile"
-)
-
-// upsMinutesRemainingThreshold is the UPS "minutes remaining" value below
-// which the server is shut down (on battery) or not yet powered on (on AC).
-const upsMinutesRemainingThreshold = 20
-
-// idracCredentials bundles the iDRAC base URL (scheme + host, no path) with
-// the basic-auth username and password used for Redfish requests.
-type idracCredentials = struct {
-	url      string
-	username string
-	password string
-}
-
-// main reads the server power state from the iDRAC and the UPS state over
-// SNMP, then delegates to handleWhenServerOn or handleWhenServerOff. Any
-// failure to obtain either state, or an unrecognised power state, is fatal.
-func main() {
-	idracUsername := flag.String("idracUsername", "root", "iDRAC username")
-	idracPassword := flag.String("idracPassword", "calvin", "iDRAC password")
-	idracHost := flag.String("idracHost", "192.168.1.4", "iDRAC host")
-	flag.Parse()
-	defer glog.Flush()
-	// lock, err := tryGetLock()
-	// if err != nil {
-	// 	glog.Fatalf("Failed to acquire lock:  %v", err)
-	// }
-	// defer lock.Unlock()
-
-	glog.Info("Checking server power state")
-	idracCredentials := idracCredentials{
-		url:      "https://" + *idracHost,
-		username: *idracUsername,
-		password: *idracPassword,
-	}
-	powerState, err := checkPowerState(idracCredentials)
-	if err != nil {
-		glog.Fatalf("Failed to check power state: %v", err)
-	}
-	glog.Infof("Server power state: %s", powerState)
-
-	glog.Info("Checking UPS state")
-	snmp := getSNMPClient()
-	// Connect to the SNMP agent
-	err = snmp.Connect()
-	if err != nil {
-		// glog.Fatalf, like every other fatal path here: the standard log
-		// package would exit without flushing the glog messages buffered
-		// above (deferred Flush does not run on os.Exit) and would bypass
-		// the -log_dir files.
-		glog.Fatalf("Failed to connect to UPS SNMP agent: %v", err)
-	}
-	defer snmp.Conn.Close()
-
-	upsState, err := getPowerState(snmp)
-	if err != nil {
-		glog.Fatalf("Failed to get UPS power state: %v", err)
-	}
-
-	if powerState == "On" {
-		handleWhenServerOn(upsState, idracCredentials)
-	} else if powerState == "Off" {
-		handleWhenServerOff(upsState, idracCredentials)
-	} else {
-		glog.Fatalf("Unknown server state %s", powerState)
-	}
-}
-// handleWhenServerOn decides what to do while the server is powered on:
-// nothing when the UPS reports input voltage > 0 (AC power), and a graceful
-// shutdown when it is on battery with fewer than
-// upsMinutesRemainingThreshold minutes remaining. A failed shutdown request
-// is logged as an error instead of being dropped.
-func handleWhenServerOn(upsState UPSPowerState, idracCredentials idracCredentials) {
-	if upsState.inputVoltage > 0 {
-		glog.Infof("UPS is on AC power: %d. Nothing to do.\n", upsState.inputVoltage)
-		return
-	}
-
-	glog.Warningln("UPS is on Battery power")
-	if upsState.minutesRemaining >= upsMinutesRemainingThreshold {
-		glog.Warningf("Minutes remaining is %d. Server will not be shutdown yet.", upsState.minutesRemaining)
-		return
-	}
-
-	glog.Warningf("Minutes remaining is too low - %d Turning off server.", upsState.minutesRemaining)
-	// Perform a graceful shutdown of the server
-	if err := performGracefulShutdown(idracCredentials); err != nil {
-		glog.Errorf("Failed to request graceful shutdown: %v", err)
-	}
-}
-
-// handleWhenServerOff decides what to do while the server is powered off:
-// nothing while the UPS is still on battery (input voltage <= 0) or while it
-// is on AC with fewer than upsMinutesRemainingThreshold minutes of battery,
-// and a power-on request otherwise. A failed power-on request is logged as
-// an error instead of being dropped.
-func handleWhenServerOff(upsState UPSPowerState, idracCredentials idracCredentials) {
-	if upsState.inputVoltage <= 0 {
-		glog.Warningln("UPS is still on battery power")
-		return
-	}
-
-	glog.Infof("UPS is on AC power: %d\n", upsState.inputVoltage)
-	if upsState.minutesRemaining < upsMinutesRemainingThreshold {
-		glog.Infof("UPS battery is still too low - %d minutes remaining. Not turning on server yet.\n", upsState.minutesRemaining)
-		return
-	}
-
-	glog.Infof("UPS is on AC power and battery has charged - %d minutes remaining. Turning on server...\n", upsState.minutesRemaining)
-	// Perform startup of the server
-	if err := performPowerOn(idracCredentials); err != nil {
-		glog.Errorf("Failed to request power on: %v", err)
-	}
-}
-// tryGetLock creates a lockfile handle for /tmp/server_safe_poweroff.pid and
-// attempts to take the lock without blocking. It returns the lock on success
-// and the TryLock error otherwise.
-// NOTE(review): a failure to create the handle terminates the process via
-// log.Fatalf rather than being returned, and the only call site visible in
-// this file (in main) is commented out.
-func tryGetLock() (*lockfile.Lockfile, error) {
-	lock, err := lockfile.New("/tmp/server_safe_poweroff.pid")
-	if err != nil {
-		log.Fatalf("Failed to create lock file: %v", err)
-	}
-	err = lock.TryLock()
-	if err != nil {
-		return nil, err
-	}
-	return &lock, nil
-}
--- a/scripts/server_safe_poweroff/synology_main.sh
+++ b/scripts/server_safe_poweroff/synology_main.sh
@ -1,23 +0,0 @@
-#!/usr/bin/env bash
-
-# This is used to run the main program on synology nas and log all messages to synology's log system
-
-# Everything below uses relative paths, so a failed cd must stop the script
-# (otherwise the final `find ./logs ... rm` would run in the wrong directory).
-cd /var/services/homes/Administrator/server-power-cycle || exit 1
-echo "Starting powercheck"
-if ./powercheck-armv8 -log_dir=./logs; then
-    echo "script completed successfully, logging to synlogy's logs"
-else
-    # Still forward the log below: it is what explains the failure.
-    echo "powercheck exited with status $?, forwarding its log anyway" >&2
-fi
-
-log_file="./logs/powercheck-armv8.INFO"
-if [[ -f "$log_file" ]]; then
-    while IFS= read -r line; do
-        # Keep only glog-formatted lines and drop their first four fields,
-        # leaving the message text. Quoted expansions keep glob characters in
-        # log text from being expanded against the working directory.
-        msg=$(printf '%s\n' "$line" | grep -E '^[IWEF][0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{6}'| awk '{$1=$2=$3=$4=""; print $0}' | sed 's/^ *//')
-        printf '%s\n' "$msg"
-        if [[ -n $msg ]]; then
-            synologset1 sys info 0x11800000 "$msg"
-        fi
-    done < "$log_file"
-else
-    echo "no log file at $log_file, nothing to forward" >&2
-fi
-
-# Cleanup logs
-find ./logs -type f -mtime +7 -exec rm {} \;
--- a/scripts/server_safe_poweroff/ups_utils.go
+++ b/scripts/server_safe_poweroff/ups_utils.go
@ -1,46 +0,0 @@
-package main
-
-import (
-	"time"
-
-	"github.com/golang/glog"
-	"github.com/gosnmp/gosnmp"
-)
-
-type UPSPowerState = struct {
-	inputVoltage     int
-	minutesRemaining uint
-}
-
-func getSNMPClient() *gosnmp.GoSNMP {
-
-	// Define SNMP connection parameters
-	target := "192.168.1.5"
-	community := "Public0"
-
-	// Create a new SNMP client
-	snmp := &gosnmp.GoSNMP{
-		Target:    target,
-		Port:      161, // Default SNMP port
-		Community: community,
-		Version:   gosnmp.Version2c, // Use SNMP v2c
-		Timeout:   time.Duration(5) * time.Second,
-	}
-	return snmp
-}
-func getPowerState(snmp *gosnmp.GoSNMP) (UPSPowerState, error) {
-	oids := []string{
-		// "1.3.6.1.2.1.33.1.2.2.0",     // seconds on battery
-		"1.3.6.1.2.1.33.1.3.3.1.3.1", // input voltage
-		"1.3.6.1.2.1.33.1.2.3.0",     // minutes remaining
-	}
-	// Perform an SNMP GET request to retrieve the values for the specified OIDs
-	result, err := snmp.Get(oids)
-	if err != nil {
-		glog.Fatalf("Failed to perform SNMP GET request: %v", err)
-	}
-
-	inputVoltage := (result.Variables[0].Value).(int)
-	minutesRemaining := result.Variables[1].Value.(uint)
-	return UPSPowerState{inputVoltage, minutesRemaining}, nil
-}
--- a/scripts/setup-containerd-pullthrough.sh
+++ b/scripts/setup-containerd-pullthrough.sh
@ -1,115 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-############################################
-# CONFIGURATION
-############################################
-
-# Internal pull-through registry endpoint
-# Examples:
-#   http://registry.internal:5000
-#   https://registry.internal
-INTERNAL_REGISTRY="http://10.0.20.10:5002"
-
-# Path where containerd reads registry configs
-CERTS_DIR="/etc/containerd/certs.d"
-
-# Optional: path to CA file if INTERNAL_REGISTRY uses HTTPS with custom CA
-# Leave empty if not needed
-INTERNAL_CA_PATH=""
-
-# Restart containerd at the end
-RESTART_CONTAINERD=true
-
-############################################
-# REGISTRIES TO MIRROR
-############################################
-
-REGISTRIES=(
-  "docker.io"
-  "registry-1.docker.io"
-  "registry.k8s.io"
-  "quay.io"
-  "ghcr.io"
-  "gcr.io"
-  "us-docker.pkg.dev"
-  "public.ecr.aws"
-  "mcr.microsoft.com"
-)
-
-############################################
-# FUNCTIONS
-############################################
-
-require_root() {
-  if [[ "$(id -u)" -ne 0 ]]; then
-    echo "ERROR: must be run as root" >&2
-    exit 1
-  fi
-}
-
-# Make sure /etc/containerd/config.toml exists and that its CRI registry
-# section points config_path at /etc/containerd/certs.d.
-#   - no config file       -> generate the default one first
-#   - already set          -> nothing to do
-#   - config_path = ""     -> rewrite that line in place
-#   - registry section     -> append config_path right after the header
-#   - no registry section  -> append section + config_path at end of file
-ensure_containerd_config_path() {
-  local cfg="/etc/containerd/config.toml"
-
-  if [[ ! -f "$cfg" ]]; then
-    echo "Generating default containerd config"
-    # The directory may not exist yet on a fresh node.
-    mkdir -p "${cfg%/*}"
-    containerd config default > "$cfg"
-  fi
-
-  if grep -q 'config_path *= *"/etc/containerd/certs.d"' "$cfg"; then
-    return 0
-  fi
-
-  echo "Enabling config_path in containerd config"
-
-  if grep -q '^[[:space:]]*config_path *= *""' "$cfg"; then
-    # An empty config_path key is already present. Appending a second
-    # config_path to the same table would be a duplicate TOML key, so
-    # rewrite the existing line and keep its indentation.
-    sed -i 's|^\([[:space:]]*\)config_path *= *""|\1config_path = "/etc/containerd/certs.d"|' "$cfg"
-  elif grep -q '\[plugins\."io.containerd.grpc.v1.cri".registry\]' "$cfg"; then
-    # Minimal and safe append if section exists
-    sed -i '/\[plugins\."io.containerd.grpc.v1.cri".registry\]/a \  config_path = "/etc/containerd/certs.d"' "$cfg"
-  else
-    cat >> "$cfg" <<'EOF'
-
-[plugins."io.containerd.grpc.v1.cri".registry]
-  config_path = "/etc/containerd/certs.d"
-EOF
-  fi
-}
-
-write_hosts_toml() {
-  local registry="$1"
-  local dir="$CERTS_DIR/$registry"
-  local file="$dir/hosts.toml"
-
-  mkdir -p "$dir"
-
-  cat > "$file" <<EOF
-server = "https://$registry"
-
-[host."$INTERNAL_REGISTRY"]
-  capabilities = ["pull", "resolve"]
-EOF
-
-  if [[ -n "$INTERNAL_CA_PATH" ]]; then
-    cat >> "$file" <<EOF
-  ca = "$INTERNAL_CA_PATH"
-EOF
-  fi
-}
-
-############################################
-# MAIN
-############################################
-
-# Root check, config_path setup, one hosts.toml per entry in REGISTRIES, then
-# an optional containerd restart.
-require_root
-ensure_containerd_config_path
-
-echo "Creating registry mirror configurations..."
-
-for registry in "${REGISTRIES[@]}"; do
-  echo "  - $registry"
-  write_hosts_toml "$registry"
-done
-
-case "$RESTART_CONTAINERD" in
-  true)
-    echo "Restarting containerd"
-    systemctl restart containerd
-    ;;
-esac
-
-echo "Done."
--- a/scripts/setup-forgejo-containerd-mirror.sh
+++ b/scripts/setup-forgejo-containerd-mirror.sh
@ -1,60 +0,0 @@
-#!/usr/bin/env bash
-# One-shot deployment of the forgejo.viktorbarzin.me containerd hosts.toml
-# entry across every k8s node. Cloud-init only fires on VM provision, so
-# existing nodes need this manual rollout.
-#
-# What it does, per node:
-#   1. drain (ignore-daemonsets, delete-emptydir-data, force, 60s grace period)
-#   2. ssh in: mkdir + write /etc/containerd/certs.d/forgejo.viktorbarzin.me/hosts.toml
-#   3. systemctl restart containerd
-#   4. uncordon, then wait (up to ~60s) for the node to report Ready
-#
-# hosts.toml is documented as hot-reloaded but the post-2026-04-19
-# containerd corruption playbook calls for an explicit restart so the
-# config is unambiguously in effect. Running drain/uncordon around it
-# avoids pulling against an in-flight containerd restart.
-#
-# The rollout stops at the first node that fails or does not become Ready,
-# so at most one node is out of service at a time.
-#
-# Re-run is safe: writes are idempotent.
-
-set -euo pipefail
-
-CERTS_DIR=/etc/containerd/certs.d/forgejo.viktorbarzin.me
-HOSTS_TOML='server = "https://forgejo.viktorbarzin.me"
-
-[host."https://10.0.20.203"]
-  capabilities = ["pull", "resolve"]
-  skip_verify = true
-'
-
-# Node being worked on; empty outside the per-node section. Used only to
-# tell the operator which node to inspect when the script dies mid-rollout.
-current_node=""
-report_failure() {
-  local rc=$?
-  if [[ $rc -ne 0 && -n "$current_node" ]]; then
-    echo "ERROR: rollout aborted on $current_node (exit $rc)." >&2
-    echo "  The node may still be cordoned; once it is healthy run: kubectl uncordon $current_node" >&2
-  fi
-}
-trap report_failure EXIT
-
-NODES=$(kubectl get nodes -o name | sed 's|^node/||')
-if [[ -z "$NODES" ]]; then
-  echo "ERROR: no nodes returned from kubectl get nodes" >&2
-  exit 1
-fi
-
-for n in $NODES; do
-  echo "=== $n ==="
-  current_node=$n
-  kubectl drain "$n" --ignore-daemonsets --delete-emptydir-data --force --grace-period=60
-
-  ssh -o StrictHostKeyChecking=accept-new "wizard@$n" sudo bash <<EOF
-set -euo pipefail
-mkdir -p "$CERTS_DIR"
-cat > "$CERTS_DIR/hosts.toml" <<'TOML'
-$HOSTS_TOML
-TOML
-systemctl restart containerd
-EOF
-
-  kubectl uncordon "$n"
-
-  # Wait for the node to report Ready before moving to the next one.
-  node_ready=false
-  for i in {1..30}; do
-    if kubectl get node "$n" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' | grep -q True; then
-      echo "    node Ready"
-      node_ready=true
-      break
-    fi
-    sleep 2
-  done
-
-  # Do not drain the next node while this one is still NotReady: that would
-  # take two nodes out of service at once.
-  if [[ "$node_ready" != true ]]; then
-    echo "ERROR: $n did not become Ready within 60s; stopping rollout" >&2
-    exit 1
-  fi
-  current_node=""
-done
-
-echo "All nodes updated."
--- a/scripts/setup-task-pipeline.sh
+++ b/scripts/setup-task-pipeline.sh
@ -1,231 +0,0 @@
-#!/usr/bin/env bash
-#
-# Setup script for the Forgejo task ingestion pipeline.
-# Creates Authentik OAuth2 provider/application, configures Forgejo OAuth2 auth source,
-# creates "tasks" repo, and sets up webhook to n8n.
-#
-# Prerequisites:
-#   - Authentik admin API token
-#   - Forgejo admin API token (create at https://forgejo.viktorbarzin.me/user/settings/applications)
-#
-# Usage:
-#   AUTHENTIK_TOKEN="..." FORGEJO_TOKEN="..." bash scripts/setup-task-pipeline.sh
-
-set -euo pipefail
-
-AUTHENTIK_URL="${AUTHENTIK_URL:-https://authentik.viktorbarzin.me}"
-FORGEJO_URL="${FORGEJO_URL:-https://forgejo.viktorbarzin.me}"
-N8N_WEBHOOK_URL="${N8N_WEBHOOK_URL:-https://n8n.viktorbarzin.me/webhook/forgejo-tasks}"
-FORGEJO_ADMIN_USER="${FORGEJO_ADMIN_USER:-viktor}"
-
-: "${AUTHENTIK_TOKEN:?Set AUTHENTIK_TOKEN (Authentik admin API token)}"
-: "${FORGEJO_TOKEN:?Set FORGEJO_TOKEN (Forgejo admin API token)}"
-
-ak_api() { curl -sf -H "Authorization: Bearer $AUTHENTIK_TOKEN" -H "Content-Type: application/json" "$@"; }
-fg_api() { curl -sf -H "Authorization: token $FORGEJO_TOKEN" -H "Content-Type: application/json" "$@"; }
-
-echo "=== Step 1: Create Authentik group 'Task Submitters' ==="
-GROUP_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/core/groups/" -d '{
-  "name": "Task Submitters",
-  "is_superuser": false,
-  "parent": null
-}' 2>/dev/null) || {
-  echo "  Group may already exist, checking..."
-  GROUP_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/core/groups/?name=Task+Submitters" | python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(json.dumps(r[0]) if r else '')")
-  if [ -z "$GROUP_RESP" ]; then echo "ERROR: Failed to create or find group"; exit 1; fi
-}
-GROUP_PK=$(echo "$GROUP_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['pk'])")
-echo "  Group PK: $GROUP_PK"
-
-echo ""
-echo "=== Step 2: Create Authentik OAuth2 Provider for Forgejo ==="
-# Find the explicit consent authorization flow
-AUTH_FLOW=$(ak_api "$AUTHENTIK_URL/api/v3/flows/instances/?designation=authorization&search=explicit" | \
-  python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(r[0]['pk'] if r else '')")
-if [ -z "$AUTH_FLOW" ]; then
-  echo "  WARNING: Could not find explicit consent flow, using implicit"
-  AUTH_FLOW=$(ak_api "$AUTHENTIK_URL/api/v3/flows/instances/?designation=authorization" | \
-    python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(r[0]['pk'] if r else '')")
-fi
-echo "  Authorization flow: $AUTH_FLOW"
-
-PROVIDER_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/providers/oauth2/" -d "{
-  \"name\": \"Forgejo\",
-  \"authorization_flow\": \"$AUTH_FLOW\",
-  \"client_type\": \"confidential\",
-  \"redirect_uris\": \"$FORGEJO_URL/user/oauth2/Authentik/callback\",
-  \"property_mappings\": [],
-  \"sub_mode\": \"hashed_user_id\",
-  \"include_claims_in_id_token\": true,
-  \"access_code_validity\": \"minutes=1\",
-  \"access_token_validity\": \"minutes=5\",
-  \"refresh_token_validity\": \"days=30\"
-}" 2>/dev/null) || {
-  echo "  Provider may already exist, checking..."
-  PROVIDER_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/providers/oauth2/?name=Forgejo" | \
-    python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(json.dumps(r[0]) if r else '')")
-  if [ -z "$PROVIDER_RESP" ]; then echo "ERROR: Failed to create or find provider"; exit 1; fi
-}
-PROVIDER_PK=$(echo "$PROVIDER_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['pk'])")
-CLIENT_ID=$(echo "$PROVIDER_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['client_id'])")
-CLIENT_SECRET=$(echo "$PROVIDER_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('client_secret','<already-created>'))")
-echo "  Provider PK: $PROVIDER_PK"
-echo "  Client ID: $CLIENT_ID"
-echo "  Client Secret: $CLIENT_SECRET"
-
-echo ""
-echo "=== Step 3: Create Authentik Application for Forgejo ==="
-APP_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/core/applications/" -d "{
-  \"name\": \"Forgejo\",
-  \"slug\": \"forgejo\",
-  \"provider\": $PROVIDER_PK,
-  \"meta_launch_url\": \"$FORGEJO_URL\",
-  \"policy_engine_mode\": \"any\"
-}" 2>/dev/null) || {
-  echo "  Application may already exist, checking..."
-  APP_RESP=$(ak_api "$AUTHENTIK_URL/api/v3/core/applications/?slug=forgejo" | \
-    python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(json.dumps(r[0]) if r else '')")
-}
-APP_SLUG=$(echo "$APP_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['slug'])")
-echo "  Application slug: $APP_SLUG"
-
-echo ""
-echo "=== Step 4: Bind 'Task Submitters' group to Forgejo application ==="
-# Create a policy binding that restricts access to the Task Submitters group
-ak_api "$AUTHENTIK_URL/api/v3/policies/bindings/" -d "{
-  \"target\": \"$(echo "$APP_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['pk'])")\",
-  \"group\": \"$GROUP_PK\",
-  \"enabled\": true,
-  \"order\": 0,
-  \"negate\": false,
-  \"timeout\": 30
-}" > /dev/null 2>&1 || echo "  Binding may already exist (OK)"
-echo "  Group binding created"
-
-echo ""
-echo "=== Step 5: Add users to 'Task Submitters' group ==="
-echo "  Adding Viktor Barzin..."
-VIKTOR_PK=$(ak_api "$AUTHENTIK_URL/api/v3/core/users/?search=vbarzin" | \
-  python3 -c "import sys,json; r=json.load(sys.stdin)['results']; print(r[0]['pk'] if r else '')")
-# Best effort: a missing user or a failed add is reported but does not stop
-# the rest of the setup.
-if [ -z "$VIKTOR_PK" ]; then
-  echo "  WARNING: no Authentik user matched 'vbarzin' — add them to the group manually"
-elif ak_api -X POST "$AUTHENTIK_URL/api/v3/core/groups/$GROUP_PK/add_user/" -d "{\"pk\": $VIKTOR_PK}" > /dev/null 2>&1; then
-  echo "  Added Viktor (PK: $VIKTOR_PK)"
-else
-  echo "  WARNING: could not add Viktor (PK: $VIKTOR_PK) to the group"
-fi
-
-echo ""
-echo "=== Step 6: Configure Forgejo OAuth2 authentication source ==="
-fg_api "$FORGEJO_URL/api/v1/admin/identity-sources" -d "{
-  \"authentication_source\": {
-    \"name\": \"Authentik\",
-    \"type\": \"oauth2\",
-    \"is_active\": true,
-    \"is_sync_enabled\": false,
-    \"oauth2\": {
-      \"provider\": \"openidConnect\",
-      \"client_id\": \"$CLIENT_ID\",
-      \"client_secret\": \"$CLIENT_SECRET\",
-      \"open_id_connect_auto_discovery_url\": \"$AUTHENTIK_URL/application/o/forgejo/.well-known/openid-configuration\",
-      \"scopes\": [\"openid\", \"profile\", \"email\"],
-      \"required_claim_name\": \"\",
-      \"required_claim_value\": \"\",
-      \"group_claim_name\": \"\",
-      \"admin_group\": \"\",
-      \"restricted_group\": \"\",
-      \"icon_url\": \"\",
-      \"skip_local_2fa\": true,
-      \"attribute_ssn\": \"\"
-    }
-  }
-}" > /dev/null 2>&1 && echo "  OAuth2 source created" || {
-  echo "  Forgejo identity-sources API may not be available."
-  echo "  Falling back to legacy authentication-source API..."
-  fg_api "$FORGEJO_URL/api/v1/admin/auths" -d "{
-    \"name\": \"Authentik\",
-    \"type\": 6,
-    \"is_active\": true,
-    \"is_sync_enabled\": false,
-    \"cfg\": {
-      \"Provider\": \"openidConnect\",
-      \"ClientID\": \"$CLIENT_ID\",
-      \"ClientSecret\": \"$CLIENT_SECRET\",
-      \"OpenIDConnectAutoDiscoveryURL\": \"$AUTHENTIK_URL/application/o/forgejo/.well-known/openid-configuration\",
-      \"Scopes\": [\"openid\", \"profile\", \"email\"],
-      \"SkipLocalTwoFA\": true
-    }
-  }" > /dev/null 2>&1 && echo "  OAuth2 source created (legacy API)" || {
-    echo "  ERROR: Could not create OAuth2 source via API."
-    echo "  Please create it manually in Forgejo admin panel:"
-    echo "    1. Go to $FORGEJO_URL/-/admin/auths/new"
-    echo "    2. Auth Type: OAuth2"
-    echo "    3. Name: Authentik"
-    echo "    4. OAuth2 Provider: OpenID Connect"
-    echo "    5. Client ID: $CLIENT_ID"
-    echo "    6. Client Secret: $CLIENT_SECRET"
-    echo "    7. Discovery URL: $AUTHENTIK_URL/application/o/forgejo/.well-known/openid-configuration"
-    echo "    8. Scopes: openid profile email"
-  }
-}
-
-echo ""
-echo "=== Step 7: Create 'tasks' repository in Forgejo ==="
-REPO_RESP=$(fg_api "$FORGEJO_URL/api/v1/user/repos" -d '{
-  "name": "tasks",
-  "description": "Task queue for OpenClaw AI agent. Create an issue to submit a task.",
-  "private": false,
-  "auto_init": true,
-  "default_branch": "main"
-}' 2>/dev/null) && echo "  Repository created" || {
-  echo "  Repository may already exist (OK)"
-  REPO_RESP=$(fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_ADMIN_USER/tasks")
-}
-echo "  Repo: $FORGEJO_URL/$FORGEJO_ADMIN_USER/tasks"
-
-echo ""
-echo "=== Step 8: Disable non-issue features on tasks repo ==="
-fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_ADMIN_USER/tasks" -X PATCH -d '{
-  "has_pull_requests": false,
-  "has_wiki": false,
-  "has_projects": false,
-  "has_releases": false,
-  "has_packages": false,
-  "has_actions": false
-}' > /dev/null 2>&1 && echo "  Disabled PRs, wiki, projects, releases, packages, actions" || echo "  Some features may not be disableable (OK)"
-
-echo ""
-echo "=== Step 9: Create issue labels ==="
-for label_data in \
-  '{"name":"pending","color":"#0075ca","description":"Task waiting to be processed"}' \
-  '{"name":"processing","color":"#e4e669","description":"Task currently being processed by OpenClaw"}' \
-  '{"name":"completed","color":"#0e8a16","description":"Task completed successfully"}' \
-  '{"name":"failed","color":"#d73a4a","description":"Task failed during processing"}'; do
-  fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_ADMIN_USER/tasks/labels" -d "$label_data" > /dev/null 2>&1 || true
-done
-echo "  Labels created: pending, processing, completed, failed"
-
-echo ""
-echo "=== Step 10: Create webhook on tasks repo → n8n ==="
-fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_ADMIN_USER/tasks/hooks" -d "{
-  \"type\": \"gitea\",
-  \"config\": {
-    \"url\": \"$N8N_WEBHOOK_URL\",
-    \"content_type\": \"json\",
-    \"secret\": \"\"
-  },
-  \"events\": [\"issues\"],
-  \"active\": true
-}" > /dev/null 2>&1 && echo "  Webhook created → $N8N_WEBHOOK_URL" || echo "  Webhook may already exist (OK)"
-
-echo ""
-echo "=========================================="
-echo "Setup complete!"
-echo ""
-echo "Next steps:"
-echo "  1. Add SOPS secrets:"
-echo "     forgejo_authentik_client_id = \"$CLIENT_ID\""
-echo "     forgejo_authentik_client_secret = \"$CLIENT_SECRET\""
-echo "  2. Run: scripts/tg apply -target=module.forgejo"
-echo "  3. Create n8n workflow (webhook trigger → OpenClaw exec → Forgejo comment)"
-echo "  4. Add more users to 'Task Submitters' group in Authentik"
-echo "  5. Test: Create an issue at $FORGEJO_URL/$FORGEJO_ADMIN_USER/tasks/issues/new"
-echo "=========================================="
--- a/scripts/setup_containerd_mirrors.sh
+++ b/scripts/setup_containerd_mirrors.sh
@ -1,54 +0,0 @@
-#!/bin/bash
-# setup_containerd_mirrors.sh
-# Replaces deprecated wildcard registry mirror with per-registry hosts.toml config.
-# Run on each K8s WORKER node: ssh wizard@<node-ip> 'sudo bash -s' < scripts/setup_containerd_mirrors.sh
-# NOTE: Do NOT run on k8s-master (containerd 1.6.x has conflicts with config_path + mirrors coexisting)
-
-set -euo pipefail
-
-TIMESTAMP=$(date +%s)
-CONFIG="/etc/containerd/config.toml"
-CERTS_DIR="/etc/containerd/certs.d"
-
-echo "=== Backing up containerd config ==="
-cp "$CONFIG" "${CONFIG}.bak.${TIMESTAMP}"
-
-echo "=== Removing deprecated mirror entries ==="
-# Remove wildcard mirror and its endpoint
-sed -i '/\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\."\*"\]/d' "$CONFIG"
-sed -i '/endpoint = \["http:\/\/10\.0\.20\.10:5000"\]/d' "$CONFIG"
-# Remove any other per-registry mirror sections (e.g. docker.io) to avoid config_path conflict
-sed -i '/\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\."docker\.io"\]/d' "$CONFIG"
-sed -i '/endpoint = \["https:\/\/registry-1\.docker\.io"\]/d' "$CONFIG"
-# Remove the mirrors parent section header if it's now empty
-sed -i '/\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\]$/d' "$CONFIG"
-
-echo "=== Setting config_path ==="
-# Replace empty config_path with certs.d path
-if grep -q 'config_path = ""' "$CONFIG"; then
-  sed -i 's|config_path = ""|config_path = "/etc/containerd/certs.d"|' "$CONFIG"
-elif grep -q 'config_path = "/etc/containerd/certs.d"' "$CONFIG"; then
-  echo "config_path already set, skipping"
-else
-  # If config_path line doesn't exist at all, add it under [plugins."io.containerd.grpc.v1.cri".registry]
-  sed -i '/\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\]/a\      config_path = "/etc/containerd/certs.d"' "$CONFIG"
-fi
-
-echo "=== Creating hosts.toml files ==="
-
-# docker.io (Docker Hub) — proxy first, upstream fallback
-mkdir -p "$CERTS_DIR/docker.io"
-printf 'server = "https://registry-1.docker.io"\n\n[host."http://10.0.20.10:5000"]\n  capabilities = ["pull", "resolve"]\n\n[host."https://registry-1.docker.io"]\n  capabilities = ["pull", "resolve"]\n' > "$CERTS_DIR/docker.io/hosts.toml"
-
-# ghcr.io — proxy first, upstream fallback
-mkdir -p "$CERTS_DIR/ghcr.io"
-printf 'server = "https://ghcr.io"\n\n[host."http://10.0.20.10:5010"]\n  capabilities = ["pull", "resolve"]\n\n[host."https://ghcr.io"]\n  capabilities = ["pull", "resolve"]\n' > "$CERTS_DIR/ghcr.io/hosts.toml"
-
-# Low-traffic registries (quay.io, registry.k8s.io, reg.kyverno.io) pull directly — no proxy.
-# Remove stale hosts.toml from previous config if present.
-rm -f "$CERTS_DIR/quay.io/hosts.toml" "$CERTS_DIR/registry.k8s.io/hosts.toml" "$CERTS_DIR/reg.kyverno.io/hosts.toml"
-rmdir "$CERTS_DIR/quay.io" "$CERTS_DIR/registry.k8s.io" "$CERTS_DIR/reg.kyverno.io" 2>/dev/null || true
-
-# No containerd restart needed — hosts.toml is re-read on each pull
-
-echo "=== Done ==="
--- a/scripts/state-sync
+++ b/scripts/state-sync
@ -1,129 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
-STATE_DIR="$REPO_ROOT/state/stacks"
-VAULT_ADDR="${VAULT_ADDR:-https://vault.viktorbarzin.me}"
-
-cmd="${1:-help}"
-stack="${2:-}"  # optional: operate on single stack
-
-# Check if Vault token is valid
-vault_available() {
-  VAULT_ADDR="$VAULT_ADDR" vault token lookup &>/dev/null 2>&1
-}
-
-# Per-stack Transit key URI
-transit_uri() {
-  local stack_name="$1"
-  echo "${VAULT_ADDR}/v1/transit/keys/sops-state-${stack_name}"
-}
-
-# Extract stack name from directory path
-stack_name_from_dir() {
-  basename "$1"
-}
-
-# Tier 0 stacks keep SOPS-encrypted local state; Tier 1 uses PG backend
-TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"
-# Return 0 when $1 is exactly one of the space-separated names in TIER0_STACKS,
-# 1 otherwise. The comparison is literal: the previous `grep -qx "$1"` treated
-# the argument as a regex (so "infr." or ".*" matched a Tier 0 stack) and as a
-# grep option when it started with "-".
-is_tier0() {
-  local candidate="${1:-}" tier0_name
-  for tier0_name in $TIER0_STACKS; do   # intentional word-splitting of the list
-    [ "$tier0_name" = "$candidate" ] && return 0
-  done
-  return 1
-}
-
-# Read age recipients from .sops.yaml
-AGE_RECIPIENTS="$(python3 -c "
-import yaml, sys
-with open('$REPO_ROOT/.sops.yaml') as f: c = yaml.safe_load(f)
-for r in c.get('creation_rules', []):
-    age = r.get('age', '')
-    if age:
-        print(age.replace('\n', '').strip())
-        break
-" 2>/dev/null || echo "")"
-
-# Encrypt <dir>/terraform.tfstate to <dir>/terraform.tfstate.enc with sops,
-# using the stack's Vault Transit key URI plus the age recipients read from
-# .sops.yaml. No-op when the plaintext state is absent or is not newer than the
-# encrypted copy.
-#   $1 - stack state directory (its basename is used as the stack name)
-# Returns non-zero when sops fails; the existing .enc file is left untouched.
-encrypt_state() {
-  local dir="$1"
-  local src="$dir/terraform.tfstate"
-  local dst="$dir/terraform.tfstate.enc"
-  local name tmp
-  name="$(stack_name_from_dir "$dir")"
-  [ -f "$src" ] || return 0
-  # Only re-encrypt if state is newer than encrypted version
-  if [ ! -f "$dst" ] || [ "$src" -nt "$dst" ]; then
-    # Encrypt into a sibling temp file and rename on success. Redirecting
-    # straight into $dst truncated it before sops ran, so a failed run left an
-    # empty .enc that was newer than the state and therefore never retried.
-    tmp="$(mktemp "$dst.XXXXXX")"
-    if ! sops -e --input-type json --output-type json \
-      --hc-vault-transit "$(transit_uri "$name")" \
-      --age "$AGE_RECIPIENTS" \
-      "$src" > "$tmp"; then
-      rm -f "$tmp"
-      return 1
-    fi
-    mv -f "$tmp" "$dst"
-  fi
-}
-
-# Decrypt <dir>/terraform.tfstate.enc to <dir>/terraform.tfstate with sops.
-# When `vault_available` succeeds sops runs with the ambient Vault login;
-# otherwise it falls back to the age key file ($SOPS_AGE_KEY_FILE, default
-# ~/.config/sops/age/keys.txt). No-op when no encrypted state exists.
-#   $1 - stack state directory
-# Returns non-zero when no key source is available or sops fails; in both cases
-# any existing plaintext state is left untouched.
-decrypt_state() {
-  local dir="$1"
-  local src="$dir/terraform.tfstate.enc"
-  local dst="$dir/terraform.tfstate"
-  local age_key_file="${SOPS_AGE_KEY_FILE:-$HOME/.config/sops/age/keys.txt}"
-  local use_age_key=0 rc=0 tmp
-  [ -f "$src" ] || return 0
-
-  if vault_available; then
-    : # Vault Transit — per-stack key, no local key needed
-  elif [ -f "$age_key_file" ]; then
-    # Fallback: age key on disk (bootstrap / Vault down)
-    echo "state-sync: Vault unavailable, falling back to age key" >&2
-    use_age_key=1
-  else
-    # Report the path that was actually checked, not just the default one.
-    echo "state-sync: ERROR — no Vault token and no age key at $age_key_file" >&2
-    return 1
-  fi
-
-  # Decrypt into a sibling temp file and rename on success: redirecting
-  # straight into $dst truncates the existing plaintext state before sops
-  # runs, destroying it whenever decryption fails.
-  tmp="$(mktemp "$dst.XXXXXX")"
-  if [ "$use_age_key" = 1 ]; then
-    SOPS_AGE_KEY_FILE="$age_key_file" \
-      sops -d --input-type json --output-type json "$src" > "$tmp" || rc=$?
-  else
-    sops -d --input-type json --output-type json "$src" > "$tmp" || rc=$?
-  fi
-  if [ "$rc" -ne 0 ]; then
-    rm -f "$tmp"
-    return "$rc"
-  fi
-  mv -f "$tmp" "$dst"
-}
-
-# Sub-command dispatch. $cmd defaults to "help"; $stack (optional second
-# argument) restricts encrypt/decrypt to a single stack. Only Tier 0 stacks are
-# processed; a Tier 1 stack name is reported on stderr and skipped.
-case "$cmd" in
-  encrypt)
-    if [ -n "$stack" ]; then
-      if is_tier0 "$stack"; then
-        encrypt_state "$STATE_DIR/$stack"
-      else
-        echo "state-sync: skipping Tier 1 stack '$stack' (PG backend)" >&2
-      fi
-    else
-      for dir in "$STATE_DIR"/*/; do
-        _name="$(stack_name_from_dir "$dir")"
-        if is_tier0 "$_name"; then
-          encrypt_state "$dir"
-        fi
-      done
-    fi
-    ;;
-  decrypt)
-    if [ -n "$stack" ]; then
-      if is_tier0 "$stack"; then
-        decrypt_state "$STATE_DIR/$stack"
-      else
-        echo "state-sync: skipping Tier 1 stack '$stack' (PG backend)" >&2
-      fi
-    else
-      for dir in "$STATE_DIR"/*/; do
-        _name="$(stack_name_from_dir "$dir")"
-        if is_tier0 "$_name"; then
-          decrypt_state "$dir"
-        fi
-      done
-    fi
-    ;;
-  commit)
-    # Only Tier 0 stacks have encrypted state in git
-    "$0" encrypt
-    cd "$REPO_ROOT"
-    git add state/stacks/*/terraform.tfstate.enc
-    # Commit only when staging actually changed something.
-    if ! git diff --cached --quiet; then
-      git commit -m "state: update encrypted terraform state"
-    fi
-    ;;
-  help|-h|--help)
-    echo "Usage: state-sync {encrypt|decrypt|commit} [stack-name]"
-    echo "Operates on Tier 0 stacks only (infra, platform, cnpg, vault, dbaas, external-secrets)."
-    echo "Tier 1 stacks use the PG backend and don't need local state sync."
-    echo "Encrypt uses per-stack Vault Transit key (transit/keys/sops-state-<stack>)."
-    echo "Decrypt uses Vault Transit if logged in, falls back to age key."
-    ;;
-  *)
-    # Reject unknown sub-commands instead of silently exiting 0, so a typo
-    # such as "encyrpt" cannot pass for a successful sync.
-    echo "state-sync: unknown command '$cmd'" >&2
-    echo "Usage: state-sync {encrypt|decrypt|commit} [stack-name]" >&2
-    exit 2
-    ;;
-esac
--- a/scripts/stop_storage_services.sh
+++ b/scripts/stop_storage_services.sh
@ -1,48 +0,0 @@
-#!/usr/bin/env bash
-
-# Stop services that may become in a corrupted state if storage is suddenly disconnected
-
-
-set -euxo pipefail
-
-# Scale a deployment via kubectl.
-#   $1 - namespace   $2 - deployment name   $3 - desired replica count
-# Arguments are quoted so an empty or whitespace-containing value cannot shift
-# the kubectl argument list.
-scale() {
-  local namespace="$1" deployment="$2" replicas="$3"
-  kubectl scale deployment --replicas="$replicas" --namespace "$namespace" "$deployment"
-}
-
-### ============================
-### MAIN
-### ============================
-cmd="${1:-stop}"
-case "$cmd" in
-  stop)
-    scale redis redis 0
-    scale uptime-kuma uptime-kuma 0
-    scale paperless-ngx paperless-ngx 0
-    scale vaultwarden vaultwarden 0
-    scale immich immich-postgresql 0
-    scale nextcloud nextcloud 0
-    scale monitoring prometheus-server 0
-
-    scale technitium technitium 0
-    scale dbaas mysql 0
-    scale dbaas postgresql 0
-    ;;
-  start)
-    scale dbaas mysql 1
-    scale dbaas postgresql 1
-    scale technitium technitium 1
-    scale immich immich-postgresql 1
-    scale nextcloud nextcloud 1
-    scale paperless-ngx paperless-ngx 1
-    scale monitoring prometheus-server 1
-    scale redis redis 1
-    scale uptime-kuma uptime-kuma 1
-    scale vaultwarden vaultwarden 1
-    ;;
-    # echo "[!] Cleanup only removes links (not flushing all iptables to avoid surprises)."
-    # ip netns list | grep -qw "$NS_NAME" && sudo ip netns del "$NS_NAME" || true
-    # has_link "$HOST_VETH" && sudo ip link del "$HOST_VETH" || true
-    # ;;
-  *)
-    echo "Usage: $0 [stop|start]"
-    exit 1
-    ;;
-esac
--- a/scripts/sudoers-t3-autopair
+++ b/scripts/sudoers-t3-autopair
@ -1,6 +0,0 @@
-# The t3-dispatch service (unprivileged user t3-dispatch) may run ONLY the
-# t3-mint wrapper, as root. t3-mint validates the target user against
-# /etc/ttyd-user-map and mints a one-time t3 pairing token as that user.
-# A compromise of the network-facing dispatch service can therefore mint
-# pairing tokens for already-mapped users at most — never arbitrary root.
-t3-dispatch ALL=(root) NOPASSWD: /usr/local/bin/t3-mint
--- a/scripts/t3-autoupdate.service
+++ b/scripts/t3-autoupdate.service
@ -1,8 +0,0 @@
-[Unit]
-Description=Track latest t3 nightly (health-checked, idle-only restart)
-After=network-online.target
-Wants=network-online.target
-
-[Service]
-Type=oneshot
-ExecStart=/usr/local/bin/t3-autoupdate
--- a/scripts/t3-autoupdate.sh
+++ b/scripts/t3-autoupdate.sh
@ -1,49 +0,0 @@
-#!/usr/bin/env bash
-# Track the latest t3 nightly — with a health-check + auto-rollback (lesson from
-# the Keel auto-update incidents: never blindly trust a new build) and idle-only
-# restarts (never kill an in-flight coding session). Runs as root via the unit.
-set -uo pipefail
-LOG() { logger -t t3-autoupdate "$*"; echo "t3-autoupdate: $*"; }
-
-ver() { t3 --version 2>/dev/null | awk '{print $NF}' | sed 's/^v//'; }
-
-before=$(ver); LOG "current: ${before:-unknown}"
-npm i -g t3@nightly >/dev/null 2>&1 || { LOG "npm install failed; staying on ${before:-current}"; exit 0; }
-after=$(ver)
-
-if [[ -z "$after" || "$after" == "$before" ]]; then
-  LOG "already latest (${before:-?}); nothing to do"; exit 0
-fi
-LOG "installed $after (was $before); health-checking…"
-
-# Health-check the NEW binary on a throwaway port/base-dir before trusting it.
-SMOKE_PORT=3799; SMOKE_DIR=$(mktemp -d)
-t3 serve --host 127.0.0.1 --port "$SMOKE_PORT" --base-dir "$SMOKE_DIR" >/dev/null 2>&1 &
-smoke=$!; ok=0
-for _ in $(seq 1 15); do
-  [[ "$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://127.0.0.1:$SMOKE_PORT/" 2>/dev/null)" == "200" ]] && { ok=1; break; }
-  sleep 2
-done
-kill "$smoke" 2>/dev/null; wait "$smoke" 2>/dev/null; rm -rf "$SMOKE_DIR"
-
-if [[ "$ok" != "1" ]]; then
-  LOG "HEALTH-CHECK FAILED for $after — rolling back to $before"
-  if [[ -n "$before" ]] && npm i -g "t3@$before" >/dev/null 2>&1; then
-    LOG "rolled back to $before"
-  else
-    LOG "ROLLBACK FAILED — manual fix needed (t3 may be broken)"
-  fi
-  exit 1
-fi
-LOG "health OK; restarting idle instances"
-
-# Restart only IDLE per-user instances; defer any with an active agent child.
-for unit in $(systemctl list-units --type=service --state=running --no-legend 't3-serve@*' | awk '{print $1}'); do
-  pid=$(systemctl show -p MainPID --value "$unit")
-  if [[ -n "$pid" && "$pid" != 0 ]] && pgrep -aP "$pid" 2>/dev/null | grep -qiE 'claude|codex|opencode'; then
-    LOG "deferring $unit (active agent) — updates next cycle when idle"
-  else
-    systemctl restart "$unit" && LOG "restarted $unit -> $after"
-  fi
-done
-LOG "update complete: $after"
--- a/scripts/t3-autoupdate.timer
+++ b/scripts/t3-autoupdate.timer
@ -1,10 +0,0 @@
-[Unit]
-Description=Daily t3 nightly auto-update
-
-[Timer]
-OnCalendar=*-*-* 04:00:00
-RandomizedDelaySec=1h
-Persistent=true
-
-[Install]
-WantedBy=timers.target
--- a/scripts/t3-dispatch.service
+++ b/scripts/t3-dispatch.service
@ -1,15 +0,0 @@
-[Unit]
-Description=t3 per-user dispatch + auto-pair (X-authentik-username -> user instance)
-After=network.target
-
-[Service]
-Type=simple
-# Unprivileged dedicated user; the only privileged action is `sudo t3-mint`
-# (scoped in /etc/sudoers.d/t3-autopair). Compromise => mint tokens at most.
-User=t3-dispatch
-ExecStart=/usr/local/bin/t3-dispatch
-Restart=on-failure
-RestartSec=5
-
-[Install]
-WantedBy=multi-user.target
--- a/scripts/t3-dispatch/go.mod
+++ b/scripts/t3-dispatch/go.mod
@ -1,3 +0,0 @@
-module t3-dispatch
-
-go 1.22
--- a/scripts/t3-dispatch/main.go
+++ b/scripts/t3-dispatch/main.go
@ -1,139 +0,0 @@
-// t3-dispatch: per-user dispatch + auto-pair for t3code.
-// Sits behind Traefik+Authentik (which injects X-authentik-username) and routes
-// each authenticated user to their own `t3 serve` instance. On a user's first
-// visit (no t3 session cookie) it mints a pairing token for that user's instance
-// and exchanges it for the session cookie, which it injects into the browser —
-// so an Authentik login lands straight in the user's workspace.
-package main
-
-import (
-	"bytes"
-	"encoding/json"
-	"fmt"
-	"log"
-	"net/http"
-	"net/http/httputil"
-	"net/url"
-	"os"
-	"os/exec"
-	"strings"
-	"sync"
-	"time"
-)
-
-type entry struct {
-	OsUser string `json:"os_user"`
-	Port   int    `json:"port"`
-}
-
-const (
-	cookieName   = "t3_session" // discovered: apps/server/src/auth/utils.ts (web mode)
-	listenAddr   = ":3780"
-	dispatchFile = "/etc/t3-serve/dispatch.json"
-)
-
-var (
-	mu    sync.RWMutex
-	table map[string]entry
-)
-
-func loadTable() error {
-	b, err := os.ReadFile(dispatchFile)
-	if err != nil {
-		return err
-	}
-	m := map[string]entry{}
-	if err := json.Unmarshal(b, &m); err != nil {
-		return err
-	}
-	mu.Lock()
-	table = m
-	mu.Unlock()
-	return nil
-}
-
-func lookup(ak string) (entry, bool) {
-	mu.RLock()
-	defer mu.RUnlock()
-	e, ok := table[ak]
-	return e, ok
-}
-
-// autoPair mints a one-time pairing token for the user's instance (as that OS
-// user, via the scoped sudoers entry) and exchanges it at the instance's
-// /api/auth/bootstrap, relaying the returned t3_session Set-Cookie to the browser.
-//
-// On any failure it writes an HTTP error to w and returns. On success it copies
-// every cookie from the bootstrap response onto w and redirects the browser
-// to "/".
-func autoPair(e entry, w http.ResponseWriter, r *http.Request) {
-	// t3-mint (root, via scoped sudoers) validates the OS user is in
-	// /etc/ttyd-user-map, then mints as that user. The dispatch service itself
-	// runs unprivileged and can invoke nothing else.
-	out, err := exec.Command("sudo", "-n", "/usr/local/bin/t3-mint", e.OsUser).Output()
-	if err != nil {
-		log.Printf("mint for %s failed: %v", e.OsUser, err)
-		http.Error(w, "pairing mint failed", http.StatusInternalServerError)
-		return
-	}
-	var pc struct {
-		Credential string `json:"credential"` // CLI returns the token under "credential"
-	}
-	if err := json.Unmarshal(out, &pc); err != nil || pc.Credential == "" {
-		// Log the failure (never the output itself: it may contain the token).
-		log.Printf("mint for %s returned unparseable output: %v", e.OsUser, err)
-		http.Error(w, "unparseable pairing output", http.StatusInternalServerError)
-		return
-	}
-	body, _ := json.Marshal(map[string]string{"credential": pc.Credential})
-	// http.Post goes through http.DefaultClient, which has no timeout, so a
-	// hung instance would pin this request's goroutine indefinitely. Use a
-	// bounded client for the loopback bootstrap call instead.
-	client := &http.Client{Timeout: 10 * time.Second}
-	resp, err := client.Post(fmt.Sprintf("http://127.0.0.1:%d/api/auth/bootstrap", e.Port),
-		"application/json", bytes.NewReader(body))
-	if err != nil {
-		log.Printf("bootstrap for %s failed: %v", e.OsUser, err)
-		http.Error(w, "bootstrap request failed", http.StatusBadGateway)
-		return
-	}
-	defer resp.Body.Close()
-	if resp.StatusCode != http.StatusOK {
-		log.Printf("bootstrap for %s returned %d", e.OsUser, resp.StatusCode)
-		http.Error(w, "bootstrap rejected", http.StatusBadGateway)
-		return
-	}
-	for _, c := range resp.Cookies() {
-		http.SetCookie(w, c) // relays t3_session (HttpOnly; Path=/; SameSite=Lax)
-	}
-	http.Redirect(w, r, "/", http.StatusFound)
-}
-
-func handler(w http.ResponseWriter, r *http.Request) {
-	ak := r.Header.Get("X-authentik-username")
-	// Authentik injects the full email (e.g. vbarzin@gmail.com); /etc/ttyd-user-map
-	// (and thus dispatch.json) keys on the local part. Strip @domain, matching the
-	// terminal stack's tmux-attach.sh (`${auth_user%%@*}`).
-	if i := strings.IndexByte(ak, '@'); i >= 0 {
-		ak = ak[:i]
-	}
-	e, ok := lookup(ak)
-	if !ok {
-		http.Error(w, "no t3 instance provisioned for this user", http.StatusForbidden)
-		return
-	}
-	if _, err := r.Cookie(cookieName); err != nil {
-		autoPair(e, w, r)
-		return
-	}
-	// Steady state: reverse-proxy (incl. WebSocket upgrade) to the user's instance.
-	target, _ := url.Parse(fmt.Sprintf("http://127.0.0.1:%d", e.Port))
-	httputil.NewSingleHostReverseProxy(target).ServeHTTP(w, r)
-}
-
-func main() {
-	if err := loadTable(); err != nil {
-		log.Fatalf("load %s: %v", dispatchFile, err)
-	}
-	go func() {
-		for range time.Tick(60 * time.Second) {
-			if err := loadTable(); err != nil {
-				log.Printf("reload %s: %v", dispatchFile, err)
-			}
-		}
-	}()
-	mux := http.NewServeMux()
-	mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { _, _ = w.Write([]byte("ok\n")) })
-	mux.HandleFunc("/", handler)
-	log.Printf("t3-dispatch listening on %s", listenAddr)
-	log.Fatal(http.ListenAndServe(listenAddr, mux))
-}
--- a/scripts/t3-mint
+++ b/scripts/t3-mint
@ -1,13 +0,0 @@
-#!/usr/bin/env bash
-# Mint a one-time t3 pairing token for a mapped OS user.
-# Runs as root via the scoped sudoers entry for the t3-dispatch service user.
-# Validates the requested user is an actual t3 OS user (a value on the RHS of
-# /etc/ttyd-user-map) before minting as that user. Prints the t3 CLI JSON.
-set -euo pipefail
-os_user="${1:-}"
-[[ "$os_user" =~ ^[a-z_][a-z0-9_-]{0,31}$ ]] || { echo "invalid user" >&2; exit 2; }
-# Must be a mapped t3 OS user (RHS of a non-comment "authentik=os" line).
-awk -F= '!/^[[:space:]]*#/ && NF==2 { gsub(/[[:space:]]/, "", $2); print $2 }' /etc/ttyd-user-map \
-  | grep -qxF "$os_user" || { echo "user not mapped" >&2; exit 3; }
-exec runuser -u "$os_user" -- /usr/bin/t3 auth pairing create \
-  --base-dir "/home/${os_user}/.t3" --ttl 5m --json
--- a/scripts/t3-provision-users.service
+++ b/scripts/t3-provision-users.service
@ -1,7 +0,0 @@
-[Unit]
-Description=Reconcile per-user t3 instances from /etc/ttyd-user-map
-After=network.target
-
-[Service]
-Type=oneshot
-ExecStart=/usr/local/bin/t3-provision-users
--- a/scripts/t3-provision-users.sh
+++ b/scripts/t3-provision-users.sh
@ -1,171 +0,0 @@
-#!/usr/bin/env bash
-# Reconcile per-user t3 Workstation instances from roster.yaml (the single source
-# of truth). roster_engine.py derives the desired state (accounts, per-tier groups,
-# sticky ports, /etc/ttyd-user-map, dispatch.json); this script APPLIES it.
-#
-# ADDITIVE-ONLY for existing users: never removes a group, never replaces a home,
-# never re-locks/re-chmods an existing account — so a routine (hourly) reconcile is
-# always safe for live users. Destructive offboarding (userdel) is a SEPARATE, gated
-# path, never here. Runs hourly as root via t3-provision-users.timer; root has no
-# Vault token, so tier validation is best-effort (skipped when k8s_users is unreachable).
-#
-# DRY_RUN=1 prints actions without mutating. WORKSTATION_DIR overrides the roster/engine location.
-set -euo pipefail
-
-WORKSTATION_DIR="${WORKSTATION_DIR:-/home/wizard/code/infra/scripts/workstation}"
-ENGINE="$WORKSTATION_DIR/roster_engine.py"
-ROSTER="$WORKSTATION_DIR/roster.yaml"
-ENVDIR=/etc/t3-serve
-MAP=/etc/ttyd-user-map
-DRY_RUN="${DRY_RUN:-0}"
-# Public infra repo for the locked clone (no auth; the monorepo has no remote).
-INFRA_REMOTE="${INFRA_REMOTE:-https://github.com/ViktorBarzin/infra.git}"
-# Per-user OIDC kubeconfig (kubelogin/PKCE; cluster server+CA copied from the admin kubeconfig).
-OIDC_ISSUER="${OIDC_ISSUER:-https://authentik.viktorbarzin.me/application/o/kubernetes/}"
-ADMIN_KUBECONFIG="${ADMIN_KUBECONFIG:-/home/wizard/.kube/config}"
-
-log() { echo "[t3-provision] $*"; }
-run() { if [[ "$DRY_RUN" == 1 ]]; then echo "[dry-run] $*"; else "$@"; fi; }
-
-# Per-non-admin writable, git-crypt-LOCKED infra clone at ~/code. Keyless +
-# filter=cat ⇒ code/docs are plaintext, git-crypt'd secret files stay ciphertext.
-# Writable + ungated (push != apply; applies are admin-only). NEVER touches an
-# existing ~/code (so emo's symlink survives until the gated cutover).
-#   $1 - OS user to provision
-# Returns 0 without acting when the user has no passwd entry, when ~/code
-# already exists (file, dir or symlink), or in DRY_RUN mode.
-install_locked_clone() {
-  local user="$1" home
-  # `|| home=""`: under `set -o pipefail` a failing getent (user not created
-  # yet, e.g. during DRY_RUN=1) would otherwise abort the whole script via
-  # `set -e` before the empty-home guard below is ever reached.
-  home="$(getent passwd "$user" | cut -d: -f6)" || home=""
-  [[ -z "$home" ]] && return 0
-  [[ -e "$home/code" || -L "$home/code" ]] && return 0
-  if [[ "$DRY_RUN" == 1 ]]; then echo "[dry-run] locked infra clone -> $user:$home/code"; return 0; fi
-  log "clone locked infra -> $user:~/code"
-  # Clone without a checkout first so the pass-through filters are configured
-  # before any git-crypt'd file is written to the working tree.
-  runuser -u "$user" -- git clone --quiet --no-checkout "$INFRA_REMOTE" "$home/code"
-  runuser -u "$user" -- git -C "$home/code" config filter.git-crypt.smudge cat
-  runuser -u "$user" -- git -C "$home/code" config filter.git-crypt.clean cat
-  runuser -u "$user" -- git -C "$home/code" config filter.git-crypt.required false
-  runuser -u "$user" -- git -C "$home/code" checkout --quiet master
-}
-
-# Per-user OIDC kubeconfig (kubelogin/PKCE — the `kubernetes` Authentik client is
-# public, no secret). Identical for all users: identity comes from each user's own
-# interactive OIDC login, which the apiserver maps (email claim) to their RBAC.
-# Cluster server + CA are copied from the admin kubeconfig. If-absent, never clobber.
-#   $1 - OS user to provision
-# Returns 0 without writing when the user has no passwd entry, when
-# ~/.kube/config already exists, when the admin kubeconfig is unreadable or
-# yields no server/CA, or in DRY_RUN mode.
-install_user_kubeconfig() {
-  local user="$1" home kc server ca
-  # `|| home=""`: under `set -o pipefail` a failing getent (user not created
-  # yet, e.g. during DRY_RUN=1) would otherwise abort the whole script via
-  # `set -e` before the empty-home guard below is ever reached.
-  home="$(getent passwd "$user" | cut -d: -f6)" || home=""
-  [[ -z "$home" ]] && return 0
-  kc="$home/.kube/config"
-  [[ -f "$kc" ]] && return 0
-  [[ -r "$ADMIN_KUBECONFIG" ]] || { log "WARN: $ADMIN_KUBECONFIG unreadable -> skip kubeconfig for $user"; return 0; }
-  if [[ "$DRY_RUN" == 1 ]]; then echo "[dry-run] OIDC kubeconfig -> $user:$kc"; return 0; fi
-  server="$(KUBECONFIG="$ADMIN_KUBECONFIG" kubectl config view --raw --minify -o jsonpath='{.clusters[0].cluster.server}')"
-  ca="$(KUBECONFIG="$ADMIN_KUBECONFIG" kubectl config view --raw --minify -o jsonpath='{.clusters[0].cluster.certificate-authority-data}')"
-  [[ -n "$server" && -n "$ca" ]] || { log "WARN: could not read cluster server/CA -> skip kubeconfig for $user"; return 0; }
-  install -d -o "$user" -g "$user" -m 0700 "$home/.kube"
-  # clusters / contexts / users are YAML *lists* of named entries in a
-  # kubeconfig, so each entry must start with "- name:".
-  cat > "$kc" <<EOF
-apiVersion: v1
-kind: Config
-clusters:
-- name: homelab
-  cluster:
-    server: $server
-    certificate-authority-data: $ca
-contexts:
-- name: oidc@homelab
-  context:
-    cluster: homelab
-    user: oidc
-current-context: oidc@homelab
-users:
-- name: oidc
-  user:
-    exec:
-      apiVersion: client.authentication.k8s.io/v1beta1
-      command: kubectl
-      args:
-      - oidc-login
-      - get-token
-      - --oidc-issuer-url=$OIDC_ISSUER
-      - --oidc-client-id=kubernetes
-      - --oidc-extra-scope=email
-      - --oidc-extra-scope=profile
-      - --oidc-extra-scope=groups
-      interactiveMode: IfAvailable
-EOF
-  chown "$user:$user" "$kc"; chmod 0600 "$kc"
-  log "wrote OIDC kubeconfig -> $user:~/.kube/config"
-}
-
-[[ $EUID -eq 0 ]] || { echo "t3-provision-users: must run as root" >&2; exit 1; }
-for bin in python3 jq; do command -v "$bin" >/dev/null || { echo "missing $bin" >&2; exit 1; }; done
-[[ -f "$ROSTER" && -f "$ENGINE" ]] || { echo "roster/engine not under $WORKSTATION_DIR" >&2; exit 1; }
-install -d -m 0755 "$ENVDIR"
-
-# 1) current sticky ports from existing .env files -> {os_user: port}
-ports_file="$(mktemp)"; trap 'rm -f "$ports_file" "${desired_file:-}"' EXIT
-{ echo "{}"; for f in "$ENVDIR"/*.env; do
-    [[ -e "$f" ]] || continue
-    u="$(basename "$f" .env)"; p="$(grep -oE 'T3_PORT=[0-9]+' "$f" | cut -d= -f2)"
-    [[ -n "$p" ]] && jq -n --arg u "$u" --argjson p "$p" '{($u): $p}'
-  done; } | jq -s 'add' > "$ports_file"
-
-# 2) tier validation vs live k8s_users (best-effort; aborts only on a real conflict)
-if command -v vault >/dev/null; then
-  export VAULT_ADDR="${VAULT_ADDR:-https://vault.viktorbarzin.me}"
-  if k8s_raw="$(vault kv get -field=k8s_users secret/platform 2>/dev/null)"; then
-    k8s_file="$(mktemp)"; echo "$k8s_raw" | jq -c 'map_values(.role)' > "$k8s_file"
-    if ! python3 "$ENGINE" validate --roster "$ROSTER" --k8s-users-json "$k8s_file"; then
-      rm -f "$k8s_file"; echo "[t3-provision] ABORT: roster tier conflicts with k8s_users" >&2; exit 1
-    fi
-    rm -f "$k8s_file"
-  else
-    log "WARN: k8s_users unreachable (no Vault token?) -> skipping tier validation"
-  fi
-fi
-
-# 3) derive desired state
-desired_file="$(mktemp)"
-python3 "$ENGINE" derive --roster "$ROSTER" --ports-json "$ports_file" > "$desired_file"
-jq -e . "$desired_file" >/dev/null || { echo "[t3-provision] derive produced invalid JSON" >&2; exit 1; }
-
-# 4) per-account: create-if-absent + ADDITIVE tier groups (never strip) + locked clone
-while IFS=$'\t' read -r os_user tier shell groups_csv; do
-  if ! id "$os_user" >/dev/null 2>&1; then
-    log "create account: $os_user (shell $shell)"
-    run useradd -m -s "$shell" "$os_user"
-    run passwd -l "$os_user"           # SSO/t3 only — no local password
-    run chmod 700 "/home/$os_user"
-  fi
-  if [[ -n "$groups_csv" ]]; then
-    current="$(id -nG "$os_user" 2>/dev/null | tr ' ' '\n')"
-    IFS=',' read -ra want <<< "$groups_csv"
-    for g in "${want[@]}"; do
-      grep -qx "$g" <<< "$current" && continue         # already a member -> skip
-      getent group "$g" >/dev/null 2>&1 || continue     # group must exist
-      log "add $os_user -> group $g"; run gpasswd -a "$os_user" "$g" >/dev/null
-    done
-  fi
-  if [[ "$tier" != admin ]]; then            # non-admins: locked ~/code clone + OIDC kubeconfig
-    install_locked_clone "$os_user"
-    install_user_kubeconfig "$os_user"
-  fi
-done < <(jq -r '.accounts[] | [.os_user, .tier, .shell, (.groups|join(","))] | @tsv' "$desired_file")
-
-# 5) per-user .env (sticky port) + enable t3-serve@
-# Reads "<os_user>\t<port>" rows from the derived state. The .env file is only
-# rewritten when it is missing or does not already contain the exact port line.
-while IFS=$'\t' read -r os_user port; do
-  envf="$ENVDIR/$os_user.env"
-  if [[ ! -f "$envf" ]] || ! grep -qx "T3_PORT=$port" "$envf"; then
-    # Write directly rather than through `bash -c "... '$port' > '$envf'"`:
-    # that string was re-parsed by a second shell, so a quote character in
-    # either value would have broken out of the command.
-    if [[ "$DRY_RUN" == 1 ]]; then
-      echo "[dry-run] write T3_PORT=$port -> $envf"
-    else
-      printf 'T3_PORT=%s\n' "$port" > "$envf"
-    fi
-  fi
-  # Best-effort: only enable the unit for accounts that exist; failures are ignored.
-  id "$os_user" >/dev/null 2>&1 && run systemctl enable --now "t3-serve@$os_user.service" >/dev/null 2>&1 || true
-done < <(jq -r '.ports | to_entries[] | [.key, .value] | @tsv' "$desired_file")
-
-# 6) regenerate /etc/ttyd-user-map + dispatch.json from the desired state (SSoT:
-#    a roster entry removed here DISAPPEARS, which is what the offboarding cut relies on)
-if [[ "$DRY_RUN" == 1 ]]; then
-  log "[dry-run] would regenerate $MAP + $ENVDIR/dispatch.json"
-else
-  jq -r '.ttyd_user_map' "$desired_file" > "$MAP.tmp" && install -m 0644 "$MAP.tmp" "$MAP" && rm -f "$MAP.tmp"
-  jq -c '.dispatch' "$desired_file" > "$ENVDIR/dispatch.json.tmp" && install -m 0644 "$ENVDIR/dispatch.json.tmp" "$ENVDIR/dispatch.json" && rm -f "$ENVDIR/dispatch.json.tmp"
-fi
-
-log "reconcile complete ($([[ "$DRY_RUN" == 1 ]] && echo DRY-RUN || echo applied))"
--- a/scripts/t3-provision-users.timer
+++ b/scripts/t3-provision-users.timer
@ -1,10 +0,0 @@
-[Unit]
-Description=Periodic t3 per-user reconcile
-
-[Timer]
-OnBootSec=2min
-OnCalendar=hourly
-Persistent=true
-
-[Install]
-WantedBy=timers.target
--- a/scripts/t3-serve@.service
+++ b/scripts/t3-serve@.service
@ -1,20 +0,0 @@
-[Unit]
-Description=T3 Code server for %i (t3 serve, per-user)
-Documentation=https://github.com/pingdotgg/t3code
-After=network.target
-
-[Service]
-Type=simple
-User=%i
-Group=%i
-Environment=HOME=/home/%i
-Environment=PATH=/usr/local/bin:/usr/bin:/bin:/home/%i/.local/bin
-Environment=NODE_ENV=production
-EnvironmentFile=/etc/t3-serve/%i.env
-WorkingDirectory=/home/%i
-ExecStart=/usr/bin/t3 serve --host 0.0.0.0 --port ${T3_PORT} --base-dir /home/%i/.t3
-Restart=on-failure
-RestartSec=5
-
-[Install]
-WantedBy=multi-user.target
--- a/scripts/task-processor.sh
+++ b/scripts/task-processor.sh
@ -1,261 +0,0 @@
-#!/usr/bin/env bash
-#
-# Task processor for the Forgejo → OpenClaw pipeline.
-# Polls Forgejo for new issues in the tasks repo, sends them to OpenClaw
-# for processing, and posts results back as comments.
-#
-# Runs inside the OpenClaw pod via kubectl exec from a CronJob.
-#
-# Environment:
-#   FORGEJO_TOKEN   — Forgejo API token with repo access (required)
-#   FORGEJO_URL     — Forgejo base URL (default: https://forgejo.viktorbarzin.me)
-#   FORGEJO_REPO    — Repo in format "owner/repo" (default: viktor/tasks)
-#   OPENCLAW_URL    — OpenClaw gateway URL (default: https://integrate.api.nvidia.com)
-#   OPENCLAW_TOKEN  — OpenClaw gateway token (required)
-#   FORGEJO_BOT_USER — Forgejo login whose comments count as the bot's own (default: viktor)
-#   SLACK_WEBHOOK_URL — Optional Slack webhook for notifications
-
-set -euo pipefail
-
-FORGEJO_URL="${FORGEJO_URL:-https://forgejo.viktorbarzin.me}"
-FORGEJO_REPO="${FORGEJO_REPO:-viktor/tasks}"
-OPENCLAW_URL="${OPENCLAW_URL:-https://integrate.api.nvidia.com}"
-SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL:-}"
-
-: "${FORGEJO_TOKEN:?FORGEJO_TOKEN is required}"
-: "${OPENCLAW_TOKEN:?OPENCLAW_TOKEN is required}"
-FORGEJO_BOT_USER="${FORGEJO_BOT_USER:-viktor}"
-
-fg_api() {
-  curl -sf -H "Authorization: token $FORGEJO_TOKEN" -H "Content-Type: application/json" "$@"
-}
-
-# Print the numeric id of the repo label named $1.
-# Prints 0 when no label has that name, or when the label list cannot be
-# fetched or is not a JSON list. Only the first 50 labels are queried.
-get_label_id() {
-  local label_name="$1"
-  local labels_json
-  # Fetch first so an API failure degrades to "not found" (0). Piping a failed
-  # request straight into the parser made it crash on empty input and print
-  # nothing, leaving callers with an empty id.
-  labels_json="$(fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/labels?limit=50")" || labels_json='[]'
-  printf '%s' "$labels_json" | python3 -c "
-import sys, json
-name = sys.argv[1]
-try:
-    labels = json.load(sys.stdin)
-except ValueError:
-    labels = []
-if not isinstance(labels, list):
-    labels = []
-for l in labels:
-    if isinstance(l, dict) and l.get('name') == name:
-        print(l.get('id', 0))
-        break
-else:
-    print(0)
-" "$label_name"
-}
-
-# Attach the label named $2 to issue $1. Best-effort: does nothing when the
-# label id cannot be resolved, and ignores API errors.
-add_label() {
-  local issue_id="$1" label_name="$2"
-  local label_id
-  label_id=$(get_label_id "$label_name") || label_id=""
-  # Require a positive integer: a failed lookup can yield an empty string,
-  # which previously passed the != "0" check and posted {"labels":[]}.
-  if [[ "$label_id" =~ ^[1-9][0-9]*$ ]]; then
-    fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id/labels" \
-      -d "{\"labels\":[$label_id]}" > /dev/null 2>&1 || true
-  fi
-}
-
-# Detach the label named $2 from issue $1. Best-effort: does nothing when the
-# label id cannot be resolved, and ignores API errors.
-remove_label() {
-  local issue_id="$1" label_name="$2"
-  local label_id
-  label_id=$(get_label_id "$label_name") || label_id=""
-  # Require a positive integer: with an empty id the URL below would end in
-  # ".../labels/", i.e. a DELETE aimed at the issue's label collection rather
-  # than at one specific label.
-  if [[ "$label_id" =~ ^[1-9][0-9]*$ ]]; then
-    fg_api -X DELETE "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id/labels/$label_id" > /dev/null 2>&1 || true
-  fi
-}
-
-post_comment() {
-  local issue_id="$1"
-  # Read comment body from stdin to avoid quoting issues
-  python3 -c "
-import sys, json
-body = sys.stdin.read()
-print(json.dumps({'body': body}))
-" | fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id/comments" -d @- > /dev/null 2>&1
-}
-
-close_issue() {
-  local issue_id="$1"
-  fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id" \
-    -X PATCH -d '{"state": "closed"}' > /dev/null 2>&1
-}
-
-get_comment_history() {
-  local issue_id="$1"
-  fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues/$issue_id/comments?limit=20" 2>/dev/null | \
-    python3 -c "
-import sys, json
-bot_user = sys.argv[1]
-comments = json.load(sys.stdin)
-history = []
-for c in comments:
-    user = c.get('user', {}).get('login', 'unknown')
-    body = c.get('body', '')
-    # Skip bot's own comments to keep context clean
-    if user == bot_user:
-        # Include a short summary of previous responses
-        if '## OpenClaw Task Result' in body:
-            # Extract just the result content (skip header/footer)
-            lines = body.split('\n')
-            content = [l for l in lines if not l.startswith('## ') and not l.startswith('---') and not l.startswith('*Processed')]
-            summary = '\n'.join(content).strip()[:500]
-            if summary:
-                history.append(f'[Previous AI response]: {summary}')
-    else:
-        history.append(f'[{user}]: {body}')
-print('\n\n'.join(history))
-" "$FORGEJO_BOT_USER" 2>/dev/null
-}
-
-notify_slack() {
-  if [ -n "$SLACK_WEBHOOK_URL" ]; then
-    python3 -c "
-import json, sys
-print(json.dumps({'text': sys.argv[1]}))
-" "$1" | curl -sf -X POST "$SLACK_WEBHOOK_URL" \
-      -H "Content-Type: application/json" -d @- > /dev/null 2>&1 || true
-  fi
-}
-
-# Run one queued issue through the OpenClaw chat-completions API and record the
-# outcome on the issue.
-# Arguments: $1 issue number, $2 title, $3 body, $4 author login
-# Reads:     OPENCLAW_URL, OPENCLAW_TOKEN
-# Flow:      label "processing" -> build the prompt from the issue text plus
-#            comment history -> POST to $OPENCLAW_URL/v1/chat/completions ->
-#            post the reply as a comment -> label "completed" and close.
-# Returns:   1 when the API call fails (the issue gets a comment and the
-#            "failed" label, and a Slack notice is sent).
-# NOTE(review): a reply that cannot be parsed is still posted as the task
-# result, and the issue is still labelled "completed" and closed — confirm
-# that is intended.
-process_issue() {
-  local issue_id="$1" title="$2" body="$3" author="$4"
-
-  echo "Processing issue #$issue_id: $title (by $author)"
-
-  # Mark as processing
-  add_label "$issue_id" "processing"
-  remove_label "$issue_id" "pending"
-  remove_label "$issue_id" "completed"
-
-  # Fetch comment history for context
-  local comment_history
-  comment_history=$(get_comment_history "$issue_id")
-
-  # Call OpenClaw gateway API (OpenAI-compatible chat completions)
-  # Use python to safely build the JSON payload
-  # (title/body/author/history travel as argv, never as Python source text).
-  local response
-  response=$(python3 -c "
-import json, sys
-title = sys.argv[1]
-body = sys.argv[2]
-author = sys.argv[3]
-comment_history = sys.argv[4]
-
-prompt = f'''You are processing a task submitted by {author} via the Forgejo task queue.
-
-Task title: {title}
-
-Task description:
-{body}'''
-
-if comment_history.strip():
-    prompt += f'''
-
-Conversation history (follow-up comments):
-{comment_history}
-
-The latest comment is the most recent request. Address it in context of the original task and prior conversation.'''
-
-prompt += '''
-
-Please execute this task. When done, provide a clear summary of what was done and any results.
-If the task requires infrastructure changes, describe what changes would be needed but do NOT apply them automatically — list the commands/changes for review.'''
-
-payload = {
-    'model': 'mistralai/mistral-large-3-675b-instruct-2512',
-    'messages': [
-        {'role': 'system', 'content': 'You are an infrastructure AI assistant. Process the task and provide actionable results. Be concise.'},
-        {'role': 'user', 'content': prompt}
-    ],
-    'max_tokens': 8192,
-    'temperature': 0.3
-}
-print(json.dumps(payload))
-" "$title" "$body" "$author" "$comment_history" | \
-    curl -sf --max-time 300 \
-      -H "Authorization: Bearer $OPENCLAW_TOKEN" \
-      -H "Content-Type: application/json" \
-      "$OPENCLAW_URL/v1/chat/completions" \
-      -d @- 2>&1) || {
-    # Failure path: tell the issue and Slack, swap the labels, and bail out.
-    echo "  ERROR: OpenClaw API call failed"
-    echo "Failed to process this task. OpenClaw API returned an error. Please check the CronJob logs or process manually." | \
-      post_comment "$issue_id"
-    add_label "$issue_id" "failed"
-    remove_label "$issue_id" "processing"
-    notify_slack ":x: Task #$issue_id failed: $title"
-    return 1
-  }
-
-  # Extract the response content and post as comment
-  python3 -c "
-import sys, json
-try:
-    data = json.load(sys.stdin)
-    msg = data['choices'][0]['message']
-    # Some models put content in reasoning_content instead of content
-    result = msg.get('content') or msg.get('reasoning_content') or msg.get('reasoning') or 'No response generated.'
-except Exception as e:
-    result = f'Error parsing OpenClaw response: {e}'
-
-body = f'## OpenClaw Task Result\n\n{result}\n\n---\n*Processed automatically by the OpenClaw task pipeline.*'
-print(body)
-" <<< "$response" | post_comment "$issue_id"
-
-  # Update labels and close
-  add_label "$issue_id" "completed"
-  remove_label "$issue_id" "processing"
-  close_issue "$issue_id"
-
-  echo "  Issue #$issue_id processed and closed"
-  notify_slack ":white_check_mark: Task #$issue_id completed: $title"
-}
-
-# --- Main ---
-
-echo "=== Task Processor $(date -u +%Y-%m-%dT%H:%M:%SZ) ==="
-
-# List open issues
-ISSUES=$(fg_api "$FORGEJO_URL/api/v1/repos/$FORGEJO_REPO/issues?state=open&type=issues&limit=10&sort=created&direction=asc" 2>/dev/null) || {
-  echo "ERROR: Could not fetch issues from Forgejo"
-  exit 1
-}
-
-# Parse pending issues into a temp file (avoids delimiter issues)
-PENDING_FILE=$(mktemp)
-trap 'rm -f "$PENDING_FILE"' EXIT
-
-python3 -c "
-import sys, json
-issues = json.load(sys.stdin)
-for issue in issues:
-    labels = [l['name'] for l in issue.get('labels', [])]
-    # Process if: no processing label AND (no completed label OR issue was reopened)
-    if 'processing' not in labels:
-        # Write each issue as a JSON line
-        print(json.dumps({
-            'id': issue['number'],
-            'title': issue['title'],
-            'body': (issue.get('body') or '')[:4000],
-            'author': issue['user']['login']
-        }))
-" <<< "$ISSUES" > "$PENDING_FILE"
-
-ISSUE_COUNT=$(wc -l < "$PENDING_FILE" | tr -d ' ')
-
-if [ "$ISSUE_COUNT" = "0" ]; then
-  echo "No pending issues to process"
-  exit 0
-fi
-
-echo "Found $ISSUE_COUNT pending issue(s)"
-
-# Process each pending issue (one JSON object per line)
-# Every field is extracted by its own python3 call; the JSON line is passed as
-# argv[1] rather than being interpolated into the Python source.
-while IFS= read -r line; do
-  issue_id=$(python3 -c "import json,sys; print(json.loads(sys.argv[1])['id'])" "$line")
-  title=$(python3 -c "import json,sys; print(json.loads(sys.argv[1])['title'])" "$line")
-  body=$(python3 -c "import json,sys; print(json.loads(sys.argv[1])['body'])" "$line")
-  author=$(python3 -c "import json,sys; print(json.loads(sys.argv[1])['author'])" "$line")
-  # `|| true` discards the status so one failing issue does not end the loop
-  # (relevant if the script runs under `set -e`).
-  process_issue "$issue_id" "$title" "$body" "$author" || true
-done < "$PENDING_FILE"
-
-echo "=== Task processing complete ==="
--- a/scripts/test-fan-control.sh
+++ b/scripts/test-fan-control.sh
@ -1,85 +0,0 @@
-#!/usr/bin/env bash
-# Unit tests for the pure functions in fan-control.sh.
-# Sources the script (main is guarded), exercises curve/decide/resolve/presence/parse.
-# Run: bash infra/scripts/test-fan-control.sh
-
-set -uo pipefail
-DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-# shellcheck source=/dev/null
-source "$DIR/fan-control.sh"
-
-pass=0 fail=0
-# Assert that two strings are identical (literal comparison, no globbing).
-# Arguments: <description> <expected> <actual>
-# Updates the global pass/fail counters and prints a FAIL line on mismatch.
-eq() {
-  local test_desc=$1 want=$2 got=$3
-  if [[ "$want" != "$got" ]]; then
-    fail=$((fail + 1))
-    printf 'FAIL: %s — expected [%s] got [%s]\n' "$test_desc" "$want" "$got"
-    return
-  fi
-  pass=$((pass + 1))
-}
-
-# --- COOL curve (continuous linear: 30% @50C .. 100% @83C) ---
-eq "cool <=T_LO clamps" 30  "$(fc_curve cool 40)"
-eq "cool 50 -> 30"      30  "$(fc_curve cool 50)"
-eq "cool 55 -> 41"      41  "$(fc_curve cool 55)"
-eq "cool 60 -> 51"      51  "$(fc_curve cool 60)"
-eq "cool 64 -> 60"      60  "$(fc_curve cool 64)"
-eq "cool 70 -> 72"      72  "$(fc_curve cool 70)"
-eq "cool 75 -> 83"      83  "$(fc_curve cool 75)"
-eq "cool 83 -> 100"     100 "$(fc_curve cool 83)"
-eq "cool >=T_HI clamps" 100 "$(fc_curve cool 90)"
-
-# --- QUIET curve (continuous linear: 20% @68C .. 100% @83C) ---
-eq "quiet <=T_LO clamps" 20  "$(fc_curve quiet 60)"
-eq "quiet 68 -> 20"      20  "$(fc_curve quiet 68)"
-eq "quiet 70 -> 31"      31  "$(fc_curve quiet 70)"
-eq "quiet 75 -> 57"      57  "$(fc_curve quiet 75)"
-eq "quiet 80 -> 84"      84  "$(fc_curve quiet 80)"
-eq "quiet 83 -> 100"     100 "$(fc_curve quiet 83)"
-
-# --- decide: asymmetric hysteresis (ramp up now, ease down only past the deadband) ---
-eq "decide uninit -> target" 68 "$(fc_decide cool 68 -1 3)"
-eq "decide ramp up now"      68 "$(fc_decide cool 68 25 3)"
-eq "decide equal holds"      62 "$(fc_decide cool 65 62 3)"
-eq "decide down held"        72 "$(fc_decide cool 68 72 3)"   # curve(68)=68<72 but curve(71)=75 !<72 -> hold
-eq "decide down past"        60 "$(fc_decide cool 64 72 3)"   # curve(64)=60, curve(67)=66<72 -> drop
-
-# --- fc_clamp / fc_resolve: HA mode resolution ---
-eq "clamp over 100"   100 "$(fc_clamp 150)"
-eq "clamp under 0"      0 "$(fc_clamp -5)"
-eq "clamp passthrough" 45 "$(fc_clamp 45)"
-eq "resolve manual=slider"      42 "$(fc_resolve manual 64 42 cool -1 3)"
-eq "resolve manual clamped"    100 "$(fc_resolve manual 64 150 cool -1 3)"
-eq "resolve cool=cool curve"    51 "$(fc_resolve cool 60 0 cool -1 3)"
-eq "resolve quiet=quiet curve"  73 "$(fc_resolve quiet 78 0 cool -1 3)"
-eq "resolve auto+empty=cool"    51 "$(fc_resolve auto 60 0 cool -1 3)"
-eq "resolve auto+present=quiet" 31 "$(fc_resolve auto 70 0 quiet -1 3)"
-
-# --- fc_fan_watts: estimated fan power from RPM (cube-law, calibrated to the sweep) ---
-eq "fan_watts 0"     0  "$(fc_fan_watts 0)"
-eq "fan_watts 4800"  2  "$(fc_fan_watts 4800)"
-eq "fan_watts 9360"  16 "$(fc_fan_watts 9360)"
-eq "fan_watts 12720" 42 "$(fc_fan_watts 12720)"
-eq "fan_watts 16920" 99 "$(fc_fan_watts 16920)"
-
-# --- presence ---
-now=1000000
-eq "presence open -> quiet"          quiet "$(fc_presence_mode Отворена 0 $now 900 Отворена)"
-eq "presence closed recent -> quiet" quiet "$(fc_presence_mode Затворена $((now - 100)) $now 900 Отворена)"
-eq "presence closed stale -> cool"   cool  "$(fc_presence_mode Затворена $((now - 1000)) $now 900 Отворена)"
-eq "presence closed edge -> cool"    cool  "$(fc_presence_mode Затворена $((now - 900)) $now 900 Отворена)"
-
-# --- temp parsing ---
-eq "parse temp line" 74 "$(fc_parse_temp 'Temp             | 0Eh | ok  |  3.1 | 74 degrees C')"
-eq "parse temp 7C"   72 "$(fc_parse_temp 'Temp             | 0Eh | ok  |  3.1 | 72 degrees C')"
-
-# --- json field (jq-free) ---
-J='{"entity_id":"sensor.garage_door_state_bg","state":"Отворена","attributes":{"friendly_name":"Garage Door State BG"},"last_changed":"2026-06-04T16:55:20.517745+00:00","last_updated":"2026-06-04T16:55:20.517745+00:00"}'
-eq "json state"        "Отворена"                          "$(fc_json_str_field "$J" state)"
-eq "json last_changed" "2026-06-04T16:55:20.517745+00:00"  "$(fc_json_str_field "$J" last_changed)"
-
-# --- hex conversion ---
-eq "hex 20"  0x14 "$(fc_pct_to_hex 20)"
-eq "hex 45"  0x2d "$(fc_pct_to_hex 45)"
-eq "hex 100" 0x64 "$(fc_pct_to_hex 100)"
-eq "hex 5"   0x05 "$(fc_pct_to_hex 5)"
-
-printf '\n%d passed, %d failed\n' "$pass" "$fail"
-(( fail == 0 ))
--- a/scripts/test-vault-token-renew.sh
+++ b/scripts/test-vault-token-renew.sh
@ -1,57 +0,0 @@
-#!/usr/bin/env bash
-# Unit tests for the pure drift-guard functions in vault-token-renew.sh.
-# Sources the script (vtr_main is guarded) and exercises the decision logic that
-# decides whether ~/.vault-token is OUR periodic admin token (renew) or a foreign
-# token that clobbered the file (refuse, fail loud). This is exactly the logic
-# whose ABSENCE let the 2026-06-05 woodpecker-token clobber be silently renewed
-# for two days. Run: bash infra/scripts/test-vault-token-renew.sh
-set -uo pipefail
-DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-# shellcheck source=/dev/null
-source "$DIR/vault-token-renew.sh"
-
-pass=0 fail=0
-# Expect a command to succeed (renew-OK).
-# Arguments: <description> <cmd...>
-# Bumps the global pass counter on success; on failure bumps fail and prints a FAIL line.
-ok() {
-  local test_desc=$1
-  shift
-  if ! "$@"; then
-    fail=$((fail + 1))
-    printf 'FAIL: %s — expected OK, got refuse\n' "$test_desc"
-    return
-  fi
-  pass=$((pass + 1))
-}
-# Expect a command to fail (drift detected, renewal refused).
-# Arguments: <description> <cmd...>
-# Bumps the global pass counter when the command fails; otherwise bumps fail
-# and prints a FAIL line.
-no() {
-  local test_desc=$1
-  shift
-  if ! "$@"; then
-    pass=$((pass + 1))
-    return
-  fi
-  fail=$((fail + 1))
-  printf 'FAIL: %s — expected DRIFT, got OK\n' "$test_desc"
-}
-# Expect two strings to be identical (compared literally).
-# Arguments: <description> <expected> <actual>
-# Updates the global pass/fail counters; a mismatch prints both values.
-eq() {
-  local test_desc=$1 expected=$2 actual=$3
-  if [[ "$expected" == "$actual" ]]; then
-    pass=$((pass + 1))
-    return
-  fi
-  fail=$((fail + 1))
-  printf 'FAIL: %s — expected [%s] got [%s]\n' "$test_desc" "$expected" "$actual"
-}
-
-# --- vtr_drift_ok: ONLY our periodic admin token (right name AND vault-admin) renews ---
-ok "our token renews"                vtr_drift_ok token-devvm-wizard "default,sops-admin,vault-admin"
-ok "vault-admin anywhere in list"    vtr_drift_ok token-devvm-wizard "default,vault-admin"
-ok "policy order irrelevant"         vtr_drift_ok token-devvm-wizard "vault-admin,default"
-no "woodpecker clobber refused"      vtr_drift_ok kubernetes-woodpecker-default "ci,default,terraform-state"
-no "oidc token (admin but wrong dn)" vtr_drift_ok oidc-vbarzin "default,sops-admin,vault-admin"
-no "right name, no vault-admin"      vtr_drift_ok token-devvm-wizard "default,sops-admin"
-no "empty display_name"              vtr_drift_ok "" "vault-admin"
-no "empty policies"                  vtr_drift_ok token-devvm-wizard ""
-no "no substring false-positive"     vtr_drift_ok token-devvm-wizard "default,vault-admin-ro"
-
-# --- vtr_display_name / vtr_policies_csv: parse real `vault token lookup -format=json` ---
-LOOKUP_OURS='{"data":{"display_name":"token-devvm-wizard","policies":["default","sops-admin","vault-admin"],"identity_policies":null}}'
-LOOKUP_OIDC='{"data":{"display_name":"oidc-vbarzin","policies":["default"],"identity_policies":["sops-admin","vault-admin"]}}'
-LOOKUP_WP='{"data":{"display_name":"kubernetes-woodpecker-default","policies":["ci","default","terraform-state"],"identity_policies":[]}}'
-eq "dn ours"  "token-devvm-wizard" "$(vtr_display_name "$LOOKUP_OURS")"
-eq "dn oidc"  "oidc-vbarzin"       "$(vtr_display_name "$LOOKUP_OIDC")"
-eq "pols ours"                       "default,sops-admin,vault-admin" "$(vtr_policies_csv "$LOOKUP_OURS")"
-eq "pols oidc merges token+identity" "default,sops-admin,vault-admin" "$(vtr_policies_csv "$LOOKUP_OIDC")"
-eq "pols woodpecker"                 "ci,default,terraform-state"     "$(vtr_policies_csv "$LOOKUP_WP")"
-
-# --- parse + decide end-to-end (the real lookup-JSON -> renew/refuse path) ---
-ok "ours: parse+decide renews"        vtr_drift_ok "$(vtr_display_name "$LOOKUP_OURS")" "$(vtr_policies_csv "$LOOKUP_OURS")"
-no "woodpecker: parse+decide refused" vtr_drift_ok "$(vtr_display_name "$LOOKUP_WP")"   "$(vtr_policies_csv "$LOOKUP_WP")"
-no "oidc: parse+decide refused"       vtr_drift_ok "$(vtr_display_name "$LOOKUP_OIDC")" "$(vtr_policies_csv "$LOOKUP_OIDC")"
-
-printf '\n%d passed, %d failed\n' "$pass" "$fail"
-(( fail == 0 ))
--- a/scripts/tg
+++ b/scripts/tg
@ -1,169 +0,0 @@
-#!/usr/bin/env bash
-# scripts/tg — wrapper: decrypt state before, encrypt+commit after mutating ops
-# Usage: scripts/tg apply --non-interactive
-#        scripts/tg plan
-# Auth: `vault login -method=oidc` (token at ~/.vault-token)
-set -euo pipefail
-
-REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
-SYNC="$REPO_ROOT/scripts/state-sync"
-
-# Enable provider cache (shared across stacks)
-export TF_PLUGIN_CACHE_DIR="${TF_PLUGIN_CACHE_DIR:-$HOME/.terraform.d/plugin-cache}"
-export TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1
-mkdir -p "$TF_PLUGIN_CACHE_DIR"
-
-# Determine stack name from cwd (relative to stacks/)
-STACK_NAME=""
-cwd="$(pwd)"
-stacks_dir="$REPO_ROOT/stacks"
-if [[ "$cwd" == "$stacks_dir"/* ]]; then
-  rel="${cwd#$stacks_dir/}"
-  STACK_NAME="${rel%%/*}"
-fi
-
-# ── Tier detection ──
-TIER0_STACKS="infra platform cnpg vault dbaas external-secrets"
-# Succeed when the given stack name is one of the whitespace-separated names in
-# TIER0_STACKS.
-# Arguments: $1 - stack name
-# Returns:   0 if listed, 1 otherwise (including for an empty name).
-# The name is compared literally: regex metacharacters such as "." and a
-# leading "-" have no special meaning.
-is_tier0() {
-  local wanted="${1:-}" stack
-  local -a tier0=()
-  [[ -n "$wanted" ]] || return 1
-  read -r -a tier0 <<< "$TIER0_STACKS"
-  for stack in ${tier0[@]+"${tier0[@]}"}; do
-    if [[ "$stack" == "$wanted" ]]; then
-      return 0
-    fi
-  done
-  return 1
-}
-
-# ── Advisory lock via Vault KV ──
-LOCK_MAX_AGE=1800  # 30 minutes — stale lock threshold
-# Take the advisory lock for a stack, stored as a Vault KV secret at
-# secret/locks/<stack>.
-# Arguments: $1 - stack name
-# Globals:   LOCK_MAX_AGE (read) - age in seconds after which a lock is stale
-# Outputs:   refusal and stale-lock messages on stderr
-# Returns:   0 once the lock record is written; 1 if a fresh lock is held by
-#            someone else or the existing record cannot be parsed.
-# NOTE: the read-then-write sequence is not atomic, so two callers racing each
-# other can both end up "holding" the lock.
-acquire_lock() {
-  local stack="$1"
-  # Informational only; kept as a single declaration so a failing hostname or
-  # whoami lookup cannot abort the script under `set -e`.
-  local holder="pid=$$,host=$(hostname -s),user=$(whoami)"
-  local existing locked acquired existing_holder now age
-
-  # A missing secret (or any read failure) is treated as "no lock".
-  existing=$(vault kv get -format=json "secret/locks/$stack" 2>/dev/null || echo '{}')
-
-  # Assignments are separate from `local` so a jq failure is seen, not masked.
-  if ! locked=$(jq -r '.data.data.locked // "false"' <<< "$existing") ||
-     ! acquired=$(jq -r '.data.data.acquired // "0"' <<< "$existing") ||
-     ! existing_holder=$(jq -r '.data.data.holder // ""' <<< "$existing"); then
-    echo "ERROR: Could not parse lock record for stack '$stack'" >&2
-    return 1
-  fi
-
-  # A non-numeric timestamp would break the arithmetic below; fall back to the
-  # same value used for a missing timestamp (epoch 0, i.e. stale).
-  [[ "$acquired" =~ ^[0-9]+$ ]] || acquired=0
-
-  if [ "$locked" = "true" ]; then
-    now=$(date +%s)
-    # 10# forces base 10 so a zero-padded value is not read as octal.
-    age=$((now - 10#$acquired))
-    if [ "$age" -lt "$LOCK_MAX_AGE" ]; then
-      echo "ERROR: Stack '$stack' is locked by: $existing_holder (${age}s ago)" >&2
-      echo "       Wait for it to finish or run: vault kv delete secret/locks/$stack" >&2
-      return 1
-    fi
-    echo "WARNING: Breaking stale lock on '$stack' (held ${age}s by $existing_holder)" >&2
-  fi
-
-  vault kv put "secret/locks/$stack" locked=true holder="$holder" acquired="$(date +%s)" >/dev/null
-}
-
-# Drop the advisory lock for a stack.
-# Arguments: $1 - stack name
-# Returns:   always 0; output is discarded and a failed delete is ignored.
-release_lock() {
-  local lock_key="secret/locks/$1"
-  if ! vault kv delete "$lock_key" >/dev/null 2>&1; then
-    :  # best-effort: nothing to do when the delete fails
-  fi
-}
-
-# ── Pre-flight: decrypt state (Tier 0) or fetch PG creds (Tier 1) ──
-if [ -n "$STACK_NAME" ]; then
-  if is_tier0 "$STACK_NAME"; then
-    # Tier 0: SOPS-encrypted local state
-    if [ -f "$REPO_ROOT/state/stacks/$STACK_NAME/terraform.tfstate.enc" ]; then
-      "$SYNC" decrypt "$STACK_NAME"
-    fi
-  else
-    # Tier 1: PG backend — fetch credentials from Vault
-    if [ -z "${PG_CONN_STR:-}" ]; then
-      # Pre-flight: vault CLI must be available. Previously CI failed with a
-      # misleading "Cannot read PG credentials" message because the Alpine CI
-      # image lacked the vault binary — the 2>/dev/null below swallowed the
-      # real "vault: not found" error. Fail fast with a clear message instead.
-      if ! command -v vault >/dev/null 2>&1; then
-        echo "ERROR: vault CLI not found on PATH. Install it or use an image that includes it (ci/Dockerfile)." >&2
-        exit 1
-      fi
-      VAULT_OUT=$(vault read -format=json database/static-creds/pg-terraform-state 2>&1) || {
-        echo "ERROR: Cannot read PG credentials from Vault. Vault output follows:" >&2
-        echo "$VAULT_OUT" >&2
-        echo "" >&2
-        echo "Hint: humans run 'vault login -method=oidc'; CI auths via K8s SA (role=ci)." >&2
-        exit 1
-      }
-      PG_USER=$(echo "$VAULT_OUT" | jq -r .data.username)
-      PG_PASS=$(echo "$VAULT_OUT" | jq -r .data.password)
-      export PG_CONN_STR="postgres://${PG_USER}:${PG_PASS}@10.0.20.200:5432/terraform_state?sslmode=disable"
-    fi
-  fi
-fi
-
-# Detect if this is a mutating operation
-is_mutating=false
-for arg in "$@"; do
-  case "$arg" in
-    apply|destroy|import|state) is_mutating=true ;;
-  esac
-done
-
-# Detect if this is a plan/apply/destroy/refresh — anything that reads or
-# writes infra state. Cheap pre-flight check below scans only the current
-# stack's .tf files for the ingress_factory auth-comment convention. Other
-# tg verbs (init, fmt, validate) skip the check.
-is_tf_op=false
-for arg in "$@"; do
-  case "$arg" in
-    plan|apply|destroy|refresh) is_tf_op=true ;;
-  esac
-done
-
-# Anti-exposure guard: every `auth = "app"` or `auth = "none"` in this stack
-# must have a preceding `# auth = "<tier>":` comment documenting what gates
-# the app or why the endpoint is intentionally public. See:
-# - infra/modules/kubernetes/ingress_factory/main.tf (variable description)
-# - infra/.claude/CLAUDE.md "Auth" section
-# Stack-scoped: untouched stacks aren't blocked from future applies until
-# they're actually edited, at which point the convention applies.
-if $is_tf_op && [ -n "$STACK_NAME" ]; then
-  if ! "$REPO_ROOT/scripts/check-ingress-auth-comments.py" "$REPO_ROOT/stacks/$STACK_NAME"; then
-    exit 1
-  fi
-fi
-
-# Acquire lock for mutating operations (Tier 0 only — Tier 1 uses pg_advisory_lock)
-if $is_mutating && [ -n "$STACK_NAME" ] && is_tier0 "$STACK_NAME"; then
-  if command -v vault &>/dev/null && [ -n "${VAULT_TOKEN:-}" ]; then
-    acquire_lock "$STACK_NAME"
-    trap 'release_lock "$STACK_NAME"' EXIT
-  fi
-fi
-
-# If running apply with --non-interactive, add -auto-approve for Terraform.
-# The flag is inserted directly after every literal "apply" argument; without
-# --non-interactive the arguments are passed through untouched.
-has_non_interactive=false
-for arg in "$@"; do
-  if [[ "$arg" == "--non-interactive" ]]; then
-    has_non_interactive=true
-  fi
-done
-
-tg_args=()
-for arg in "$@"; do
-  tg_args+=("$arg")
-  if $has_non_interactive && [[ "$arg" == "apply" ]]; then
-    tg_args+=("-auto-approve")
-  fi
-done
-# The ${arr[@]+...} form keeps an empty argument list from tripping `set -u`
-# on older bash releases.
-terragrunt ${tg_args[@]+"${tg_args[@]}"}
-
-# After mutating operations: encrypt+commit (Tier 0) or no-op (Tier 1 — PG is authoritative)
-# NOTE(review): under `set -e` a failed terragrunt run exits before this point,
-# so state written by a partially applied run is not re-encrypted here —
-# confirm that is intended.
-if $is_mutating && [ -n "$STACK_NAME" ] && is_tier0 "$STACK_NAME"; then
-  state_enc="state/stacks/$STACK_NAME/terraform.tfstate.enc"
-  "$SYNC" encrypt "$STACK_NAME"
-  cd "$REPO_ROOT"
-  git add -- "$state_enc"
-  # Both the check and the commit are limited to the state file, so unrelated
-  # changes that happen to be staged are neither detected nor committed.
-  if ! git diff --cached --quiet -- "$state_enc"; then
-    git commit -m "state($STACK_NAME): update encrypted state" -- "$state_enc"
-  fi
-fi
--- a/scripts/update-istio-injection.sh
+++ b/scripts/update-istio-injection.sh
@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-# Flip the "istio-injection" label value in the Terraform sources below the
-# current directory, apply the change, then restart the deployments of every
-# affected namespace.
-#
-# Usage: update-istio-injection.sh <from> <to>
-set -euo pipefail
-
-from=${1:-}
-to=${2:-}
-
-if [[ -z "$from" || -z "$to" ]]; then
-	echo 'pass 2 positional parameters - $from and $to' >&2
-	exit 1
-fi
-
-# Files containing the label on a line without '#' (commented-out lines are
-# skipped), each listed once. Read line-by-line so paths are never word-split.
-mapfile -t files < <(
-	grep -rni -- "\"istio-injection\" : \"$from\"" . | grep -v '#' | cut -d: -f1 | sort -u
-)
-
-namespaces=()
-# Update terraform modules
-for file in ${files[@]+"${files[@]}"}; do
-	echo "$file"
-	sed -i "s/istio-injection\" : \"$from\"/istio-injection\" : \"$to\"/" "$file"
-
-	# The 4th '/'-separated path field is used as the namespace name.
-	ns=$(cut -d/ -f4 <<< "$file")
-	if [[ -z "$ns" ]]; then
-		echo "WARNING: cannot derive a namespace from $file; skipping restart" >&2
-		continue
-	fi
-
-	# Remember each namespace once so its deployments are restarted a single time.
-	already_listed=false
-	for known in ${namespaces[@]+"${namespaces[@]}"}; do
-		if [[ "$known" == "$ns" ]]; then
-			already_listed=true
-			break
-		fi
-	done
-	if ! $already_listed; then
-		namespaces+=("$ns")
-	fi
-done
-
-# Apply changes
-terraform apply -auto-approve
-
-# Restart deployments. With no deployment name, `rollout restart deployment`
-# restarts every deployment in the namespace.
-for ns in ${namespaces[@]+"${namespaces[@]}"}; do
-	echo "kubectl -n $ns rollout restart deployment"
-	kubectl -n "$ns" rollout restart deployment
-done
--- a/scripts/update_k8s.sh
+++ b/scripts/update_k8s.sh
@ -1,123 +0,0 @@
-#!/usr/bin/env bash
-#
-# K8s component upgrader. Run on a single node (master OR worker) at a time.
-# The caller is responsible for:
-#   - draining + uncordoning the node (this script does not touch kubectl)
-#   - sequencing nodes (master first, then workers one at a time)
-#   - pre-flight checks (etcd snapshot, halt-on-alert, etc)
-#
-# Used by:
-#   - the k8s-version-upgrade agent (infra/.claude/agents/k8s-version-upgrade.md)
-#   - manual operators following the runbook (infra/docs/runbooks/k8s-version-upgrade.md)
-#
-# Old manual orchestration loop (kept for reference — the agent does the
-# equivalent now):
-#   for n in $(kbn | grep 'k8s-node' | awk '{print $1}'); do
-#     kb drain $n --ignore-daemonsets --delete-emptydir-data
-#     s wizard@$n 'bash -s' < update_k8s.sh --role worker --release 1.34.5
-#     kb uncordon $n
-#   done
-
-set -euo pipefail
-
-ROLE=""
-RELEASE=""
-
-# Print the command-line help to stdout. The here-document is unquoted, so $0
-# expands to the script path, while \$MINOR is escaped and printed literally.
-usage() {
-    cat <<EOF
-Usage: $0 --role <master|worker> --release <X.Y.Z>
-
-  --role     master|worker  (required)
-  --release  kubeadm/kubelet/kubectl target patch version, e.g. 1.34.5
-
-Behavior:
-  - Rewrites /etc/apt/sources.list.d/kubernetes.list to the v\$MINOR/deb repo
-    derived from --release (so a 1.34.x release uses v1.34/deb, 1.35.x uses
-    v1.35/deb, etc).
-  - apt-get install kubeadm=<release>-* (apt-mark unhold first).
-  - master: kubeadm upgrade plan && kubeadm upgrade apply v<release> -y
-  - worker: kubeadm upgrade node
-  - apt-get install kubelet=<release>-* kubectl=<release>-* then re-hold.
-  - systemctl daemon-reload && systemctl restart kubelet
-EOF
-}
-
-# Parse --role / --release. Exit status 2 signals a usage error.
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --role|--release)
-            # Check for the value first: under `set -u` a trailing flag would
-            # otherwise die with a bare "unbound variable" message.
-            if [[ $# -lt 2 ]]; then
-                echo "ERROR: $1 requires a value" >&2
-                usage
-                exit 2
-            fi
-            if [[ "$1" == "--role" ]]; then ROLE="$2"; else RELEASE="$2"; fi
-            shift 2;;
-        -h|--help) usage; exit 0;;
-        *) echo "Unknown arg: $1" >&2; usage; exit 2;;
-    esac
-done
-
-if [[ -z "$ROLE" || -z "$RELEASE" ]]; then
-    echo "ERROR: --role and --release are required" >&2
-    usage
-    exit 2
-fi
-
-if [[ "$ROLE" != "master" && "$ROLE" != "worker" ]]; then
-    echo "ERROR: --role must be 'master' or 'worker' (got: $ROLE)" >&2
-    exit 2
-fi
-
-# Later steps build "v$RELEASE", "kubeadm=$RELEASE-*" and the minor track from
-# this value, all of which assume a plain X.Y.Z version without a "v" prefix.
-if [[ ! "$RELEASE" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
-    echo "ERROR: --release must look like X.Y.Z, e.g. 1.34.5 (got: $RELEASE)" >&2
-    exit 2
-fi
-
-# Derive minor track (e.g. 1.34.5 → 1.34)
-STABLE_VERSION="$(echo "$RELEASE" | awk -F. '{print $1"."$2}')"
-
-echo "==> Upgrading $(hostname) ($ROLE) to v$RELEASE (track v$STABLE_VERSION)"
-
-# Apt repo URL is pinned per minor track. Rewrite + re-import the signing key
-# every run — cheap, idempotent, and handles the minor-bump case where the
-# old track's repo no longer carries the target version.
-echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v$STABLE_VERSION/deb/ /" \
-    | sudo tee /etc/apt/sources.list.d/kubernetes.list
-sudo mkdir -p /etc/apt/keyrings
-curl -fsSL "https://pkgs.k8s.io/core:/stable:/v$STABLE_VERSION/deb/Release.key" \
-    | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg --batch --yes
-
-sudo apt-mark unhold kubeadm kubelet kubectl
-sudo apt-get update
-sudo apt-get install -y "kubeadm=$RELEASE-*"
-
-if [[ "$ROLE" == "master" ]]; then
-    echo "==> Master path: kubeadm upgrade plan + apply"
-    sudo kubeadm upgrade plan
-    # Up to 3 attempts, 30s apart. The first attempt is a full kubeadm upgrade
-    # (incl. etcd). It may fail with "static Pod hash for component <X> did
-    # not change after 5m0s": kubeadm's 5min wait for the kubelet to reload a
-    # static pod is too tight on our cluster (apiserver-to-kubelet status sync
-    # latency post-master-reboot can exceed it), and for patch upgrades where
-    # etcd's image doesn't change kubeadm writes an identical manifest, so the
-    # hash never changes and the wait can only time out.
-    # Every retry therefore adds --etcd-upgrade=false. Skipping the etcd phase
-    # is safe IF etcd is already on the right version (which is the only case
-    # where this timeout fires).
-    attempt=1
-    # Array rather than a string so the flags never rely on word-splitting.
-    extra_flags=()
-    while ! sudo kubeadm upgrade apply "v$RELEASE" -y ${extra_flags[@]+"${extra_flags[@]}"}; do
-        if (( attempt >= 3 )); then
-            echo "ERROR: kubeadm upgrade apply failed after 3 attempts" >&2
-            exit 1
-        fi
-        echo "==> kubeadm apply attempt $attempt failed. Retrying with --etcd-upgrade=false (etcd image is unchanged for patch upgrades; kubeadm's static-pod-hash watch is the only thing failing)."
-        extra_flags=(--etcd-upgrade=false)
-        sleep 30
-        attempt=$(( attempt + 1 ))
-    done
-    echo "==> kubeadm upgrade apply succeeded on attempt $attempt (flags: '${extra_flags[*]-}')"
-else
-    echo "==> Worker path: kubeadm upgrade node"
-    sudo kubeadm upgrade node
-fi
-
-sudo apt-get install -y "kubelet=$RELEASE-*" "kubectl=$RELEASE-*"
-sudo apt-mark hold kubeadm kubelet kubectl
-
-sudo systemctl daemon-reload
-sudo systemctl restart kubelet
-
-echo "==> Done: $(hostname) is on v$RELEASE"
--- a/scripts/update_node.sh
+++ b/scripts/update_node.sh
@ -1,14 +0,0 @@
-#!/usr/bin/env bash
-#
-# OS-major upgrade (Ubuntu do-release-upgrade). NOT in the auto-upgrade
-# pipeline — minor apt patches are handled by unattended-upgrades + kured;
-# K8s component bumps are handled by the k8s-version-upgrade agent. Run this
-# script manually when bumping Ubuntu LTS major versions.
-#
-# See:
-#   - infra/docs/runbooks/k8s-node-auto-upgrades.md  (apt + reboot)
-#   - infra/docs/runbooks/k8s-version-upgrade.md     (kubeadm/kubelet/kubectl)
-
-# sudo apt update && sudo apt autoremove -y && sudo apt upgrade -y
-sudo do-release-upgrade
-sudo apt update && sudo apt autoremove -y && sudo apt upgrade -y
--- a/scripts/upgrade_state.sh
+++ b/scripts/upgrade_state.sh
@ -1,619 +0,0 @@
-#!/usr/bin/env bash
-#
-# upgrade_state.sh — survey the three autonomous-upgrade pipelines.
-#
-# Companion to cluster_healthcheck.sh, surfaced via the /upgrade-state skill.
-# Read-only by design — no --fix.
-#
-# The three pipelines:
-#   1. Apps  — Keel polls registries hourly and rolls Deployments tagged
-#              keel.sh/policy. Metrics on container :9300/metrics.
-#   2. OS    — unattended-upgrades patches in-release per node; kured
-#              reboots within a daily 02:00-06:00 London window.
-#   3. K8s   — k8s-version-check CronJob (Sun 12:00 UTC) detects new
-#              kubeadm patch/minor releases; Job-chain drains+upgrades
-#              node-by-node. Pushgateway holds k8s_upgrade_* gauges.
-#
-# Exit codes: 0 healthy, 1 attention warranted, 2 something stalled.
-
-set -euo pipefail
-
-# --- Colors ---
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[0;33m'
-BLUE='\033[0;34m'
-BOLD='\033[1m'
-NC='\033[0m'
-
-# --- Globals ---
-JSON=false
-KUBECONFIG_PATH="${KUBECONFIG:-${HOME}/.kube/config}"
-[[ -f "$KUBECONFIG_PATH" ]] || KUBECONFIG_PATH="/home/wizard/code/infra/config"
-KUBECTL=""
-NODES=(k8s-master:10.0.20.100 k8s-node1:10.0.20.101 k8s-node2:10.0.20.102 k8s-node3:10.0.20.103 k8s-node4:10.0.20.104)
-SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no)
-NOW_EPOCH=$(date -u +%s)
-HIGHEST_EXIT=0  # 0 healthy, 1 attention, 2 stalled
-
-# Results — collectors fill these.
-APPS_STATUS_ICON=""; APPS_STATUS_TEXT=""
-APPS_LAST_CHECK=""; APPS_NEXT=""; APPS_NOTES=""
-APPS_ENROLLED=0; APPS_PENDING=0; APPS_UPDATES_LINE=""; APPS_ERROR_LINE=""
-
-OS_STATUS_ICON=""; OS_STATUS_TEXT=""
-OS_LAST_CHECK=""; OS_NEXT=""; OS_NOTES=""
-OS_DISTRO_SUMMARY=""; OS_KERNEL_SUMMARY=""
-OS_PENDING_REBOOT_NODES=""; OS_HELD_DETAIL=""
-OS_LAST_UU=""; OS_LAST_KURED=""
-
-K8S_STATUS_ICON=""; K8S_STATUS_TEXT=""
-K8S_LAST_CHECK=""; K8S_NEXT=""; K8S_NOTES=""
-K8S_RUNNING=""; K8S_PATCH=""; K8S_MINOR=""
-K8S_LAST_DETECT_LINE=""; K8S_IN_FLIGHT="no"; K8S_LAST_CHAIN=""
-
-# --- Helpers ---
-# Print a message (interpreting backslash escapes) unless JSON output was
-# requested, in which case the call is a silent no-op.
-log() {
-    if [[ "$JSON" == true ]]; then
-        return 0
-    fi
-    echo -e "$*"
-}
-
-# Raise the script's pending exit code: HIGHEST_EXIT only ever moves upward.
-# Arguments: $1 - candidate exit code
-# Returns:   always 0.
-raise_exit() {
-    local level="$1"
-    if (( level > HIGHEST_EXIT )); then
-        HIGHEST_EXIT="$level"
-    fi
-    return 0
-}
-
-# Print the command-line help to stdout. The here-document is unquoted, so $0
-# expands to the path the script was invoked with.
-usage() {
-    cat <<EOF
-Usage: $0 [--json] [--kubeconfig <path>]
-
-Read-only audit of the three autonomous-upgrade pipelines (apps, OS, k8s).
-
-  --json              machine-readable JSON
-  --kubeconfig PATH   override kubeconfig
-
-Exit codes: 0 healthy, 1 attention warranted, 2 something stalled.
-EOF
-}
-
-# Parse command-line options into the JSON and KUBECONFIG_PATH globals, then
-# derive the KUBECTL command string from the final kubeconfig path.
-# Arguments: the script's "$@"
-# Exits:     0 after printing help; 1 on an unknown option or when
-#            --kubeconfig is given without a path.
-parse_args() {
-    while [[ $# -gt 0 ]]; do
-        case "$1" in
-            --json)       JSON=true; shift ;;
-            --kubeconfig)
-                # Without this check a trailing --kubeconfig aborts with an
-                # "unbound variable" message under `set -u`, and without
-                # `set -u` the failing `shift 2` would loop forever.
-                if [[ $# -lt 2 || -z "$2" ]]; then
-                    echo "Option --kubeconfig requires a path" >&2
-                    exit 1
-                fi
-                KUBECONFIG_PATH="$2"; shift 2 ;;
-            -h|--help)    usage; exit 0 ;;
-            *) echo "Unknown option: $1" >&2; exit 1 ;;
-        esac
-    done
-    # Used unquoted by callers, so it is expanded by word-splitting.
-    KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
-}
-
-# Prometheus query — Prometheus + reload + backup share a network namespace,
-# so reaching localhost:9090 works from any of the three sidecars.
-# Run an instant query against Prometheus by exec-ing wget inside the
-# prometheus-server container and print the raw API response.
-# Arguments: $1 - query text, appended to the URL as given
-# Returns:   always 0; stderr is discarded and failures are ignored.
-prom_q() {
-    local endpoint="http://localhost:9090/api/v1/query?query=${1}"
-    if ! $KUBECTL -n monitoring exec deploy/prometheus-server -c prometheus-server -- \
-            wget -qO- "$endpoint" 2>/dev/null; then
-        true
-    fi
-}
-
-# Print the Pushgateway /metrics page, fetched with wget from inside the
-# prometheus-server container.
-# Returns: always 0; stderr is discarded and failures are ignored.
-pg_metrics() {
-    local metrics_url="http://prometheus-prometheus-pushgateway:9091/metrics"
-    if ! $KUBECTL -n monitoring exec deploy/prometheus-server -c prometheus-server -- \
-            wget -qO- "$metrics_url" 2>/dev/null; then
-        true
-    fi
-}
-
-# Run a command on a node over SSH as user "wizard", using SSH_OPTS.
-# Arguments: $1 - node IP; remaining arguments - the remote command
-# Returns:   always 0; stderr is discarded and failures are ignored.
-ssh_node() {
-    local target="wizard@$1"
-    shift
-    if ! ssh "${SSH_OPTS[@]}" "$target" "$@" 2>/dev/null; then
-        true
-    fi
-}
-
-# Render an age in seconds as a short phrase in the largest whole unit:
-# "<n>s ago", "<n>m ago", "<n>h ago" or "<n>d ago" (no trailing newline).
-# Arguments: $1 - age in seconds
-human_age() {
-    local age="$1"
-    if (( age < 60 )); then
-        printf '%ds ago' "$age"
-        return
-    fi
-    if (( age < 3600 )); then
-        printf '%dm ago' $((age / 60))
-        return
-    fi
-    if (( age < 86400 )); then
-        printf '%dh ago' $((age / 3600))
-        return
-    fi
-    printf '%dd ago' $((age / 86400))
-}
-
-# Pushgateway emits floats and scientific notation — coerce to integer
-# epoch seconds. Returns 0 if the input is empty / zero / unparseable.
-# Coerce a metric value (plain float or scientific notation) to a whole number
-# of epoch seconds, truncating the fraction.
-# Arguments: $1 - value text (may be empty or omitted)
-# Outputs:   the integer, or 0 when the value is empty, "0" or unparseable.
-to_epoch_int() {
-    local raw="${1:-}"
-    case "$raw" in
-        "" | 0)
-            echo 0
-            return
-            ;;
-    esac
-    python3 -c "import sys; v=sys.argv[1]; print(int(float(v)))" "$raw" 2>/dev/null || echo 0
-}
-
-# --- 1. Apps (Keel) ---
-collect_apps() {
-    local pending tracked enrolled updates_24h errors
-
-    # Enrolled: count Deployments with keel.sh/policy != never (Keel itself
-    # is policy=never). The Kyverno auto-injection labels namespaces
-    # keel.sh/enrolled=true, but the annotation is what Keel watches.
-    enrolled=$($KUBECTL get deploy -A -o json 2>/dev/null | python3 -c '
-import json, sys
-data = json.load(sys.stdin)
-n = sum(1 for d in data["items"]
-        if (d["metadata"].get("annotations") or {}).get("keel.sh/policy", "never") != "never")
-print(n)
-' 2>/dev/null || echo 0)
-    APPS_ENROLLED="$enrolled"
-
-    # Pending approvals (sum across Keel pods).
-    pending=$(prom_q 'sum(pending_approvals)' | python3 -c '
-import json, sys
-try:
-    r = json.load(sys.stdin)["data"]["result"]
-    print(int(float(r[0]["value"][1])) if r else 0)
-except Exception:
-    print(0)
-' 2>/dev/null || echo 0)
-    APPS_PENDING="$pending"
-
-    # Tracked images — proxy for "is the scrape live?".
-    tracked=$(prom_q 'count(count by (image) (registries_scanned_total))' | python3 -c '
-import json, sys
-try:
-    r = json.load(sys.stdin)["data"]["result"]
-    print(int(float(r[0]["value"][1])) if r else 0)
-except Exception:
-    print(0)
-' 2>/dev/null || echo 0)
-
-    # Last scrape age — `up{job="kubernetes-pods", app="keel"}` is 1 if the
-    # most recent scrape succeeded. We surface the wallclock age via a tiny
-    # `time() - timestamp(up{...})` query.
-    APPS_LAST_CHECK=$(prom_q 'time()-timestamp(up{job="kubernetes-pods",app="keel"})' | python3 -c '
-import json, sys
-try:
-    r = json.load(sys.stdin)["data"]["result"]
-    if not r: print("scrape not live")
-    else:
-        secs = int(float(r[0]["value"][1]))
-        if secs < 60:  print(f"{secs}s ago")
-        elif secs < 3600: print(f"{secs//60}m ago")
-        else: print(f"{secs//3600}h ago")
-except Exception:
-    print("?")
-' 2>/dev/null || echo "?")
-
-    # Recent updates: count lines in Keel logs that report a successful
-    # rollout. Keel logs an "update completed" message per rollout.
-    local log_24h
-    log_24h=$($KUBECTL -n keel logs deploy/keel --since=24h --tail=2000 2>/dev/null || true)
-    updates_24h=$(echo "$log_24h" | grep -cE 'update completed|successfully updated|deployment updated' 2>/dev/null || true)
-    [[ -z "$updates_24h" ]] && updates_24h=0
-    APPS_UPDATES_LINE="$updates_24h in last 24h (tracked images: $tracked)"
-
-    # Known-benign Keel error patterns to suppress. Each is a real error
-    # line Keel emits, but the surrounding behaviour is fine, so flagging
-    # them in /upgrade-state is just noise.
-    #   - `bot.Run(): can not get configuration for bot [slack]` — Keel
-    #     1.2.0 registers a Slack socket-mode bot whenever SLACK_BOT_TOKEN
-    #     is set, then fails because we don't supply an `xapp-` app-level
-    #     token. We don't want the interactive bot (no approvals; opt-out
-    #     auto-update). The Slack NOTIFICATION sender works independently
-    #     of the bot, so rollout messages still post to #general.
-    #   - `failed to check digest` with a transient network error —
-    #     Keel polls ~175 image manifests against public registries
-    #     hourly. Occasional `i/o timeout` / `connection refused` /
-    #     `TLS handshake timeout` / `no such host` / `EOF` /
-    #     `context deadline exceeded` are inherent to public-internet
-    #     polling at that scale and auto-recover on the next poll.
-    #     Actionable digest-check failures surface as HTTP 401/404
-    #     (auth, removed-tag) — those are NOT filtered.
-    #   - `failed to check digest` with HTTP 5xx — upstream registry
-    #     having a problem (DockerHub maintenance, Forgejo restart,
-    #     etc.). Same recovery pattern as network errors: next hourly
-    #     poll succeeds once upstream is back. Persistent 5xx for >24h
-    #     would indicate a real registry-side issue, but that surfaces
-    #     via the registry's own monitoring (e.g. forgejo-integrity-probe
-    #     + RegistryCatalogInaccessible), not via Keel logs.
-    local benign_re='bot\.Run\(\): can not get configuration for bot \[slack\]'
-    benign_re+='|SLACK_APP_TOKEN must have the (previf|prefix)'
-    benign_re+='|failed to check digest.*(i/o timeout|connection refused|connection reset|context deadline exceeded|TLS handshake timeout|no such host|: EOF)'
-    benign_re+='|failed to check digest.*non-successful response \(status=5[0-9][0-9]'
-    errors=$(echo "$log_24h" | grep -iE '"level":"(error|fatal)"|level=error' | grep -vE "$benign_re" | tail -3 || true)
-    if [[ -z "$errors" ]]; then
-        APPS_ERROR_LINE="(none in last 24h)"
-    else
-        APPS_ERROR_LINE="$(echo "$errors" | wc -l | tr -d ' ') error(s); newest: $(echo "$errors" | tail -1 | cut -c1-120)"
-    fi
-
-    # Keel pod state.
-    local pod_status
-    pod_status=$($KUBECTL -n keel get pods -l app=keel -o jsonpath='{.items[*].status.phase}' 2>/dev/null || true)
-
-    if [[ "$pod_status" != *"Running"* ]]; then
-        APPS_STATUS_ICON="✗"; APPS_STATUS_TEXT="down"
-        APPS_NOTES="Keel pod not Running ($pod_status)"
-        raise_exit 2
-    elif [[ "$pending" -gt 0 || -n "$errors" ]]; then
-        APPS_STATUS_ICON="⚠"; APPS_STATUS_TEXT="attn"
-        APPS_NOTES="$enrolled enrolled; $pending pending; $(echo "$errors" | wc -l | tr -d ' ') recent error(s)"
-        raise_exit 1
-    else
-        APPS_STATUS_ICON="✓"; APPS_STATUS_TEXT="healthy"
-        APPS_NOTES="$enrolled enrolled, 0 pending, 0 errors"
-    fi
-
-    APPS_NEXT="rolling, hourly poll"
-}
-
-# --- 2. OS (apt + kured) ---
-# Collect OS-layer upgrade state (node distro/kernel, pending reboots, held
-# packages with available bumps, unattended-upgrades recency, kured health)
-# into the OS_* globals consumed by the renderers.
-# Globals read:    KUBECTL, NODES (entries of the form "name:ip"), NOW_EPOCH
-# Globals written: OS_DISTRO_SUMMARY, OS_KERNEL_SUMMARY, OS_PENDING_REBOOT_NODES,
-#                  OS_HELD_DETAIL (only when something is held), OS_LAST_UU,
-#                  OS_LAST_CHECK, OS_LAST_KURED, OS_NEXT, OS_STATUS_ICON,
-#                  OS_STATUS_TEXT, OS_NOTES
-# Helpers called:  ssh_node, human_age, raise_exit (defined elsewhere in this file)
-collect_os() {
-    local distros kernels distro_uniq kernel_uniq
-    distros=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.osImage}{"\n"}{end}' 2>/dev/null)
-    kernels=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.kernelVersion}{"\n"}{end}' 2>/dev/null)
-    distro_uniq=$(echo "$distros" | sort -u | tr '\n' ',' | sed 's/,$//; s/,/, /g')
-    kernel_uniq=$(echo "$kernels" | sort -u | tr '\n' ',' | sed 's/,$//; s/,/, /g')
-    OS_DISTRO_SUMMARY="$distro_uniq"
-    OS_KERNEL_SUMMARY="$kernel_uniq"
-
-    # SSH fan-out — parallel background subshells, write per-node results to tmp files.
-    local tmpdir; tmpdir=$(mktemp -d)
-    trap 'rm -rf "$tmpdir"' RETURN
-    local entry name ip
-    for entry in "${NODES[@]}"; do
-        name="${entry%%:*}"; ip="${entry##*:}"
-        (
-            local reboot held upgradable uu_log
-            reboot=$(ssh_node "$ip" 'test -f /var/run/reboot-required && echo yes || echo no')
-            held=$(ssh_node "$ip" 'apt-mark showhold 2>/dev/null')
-            upgradable=$(ssh_node "$ip" 'apt list --upgradable 2>/dev/null | tail -n +2')
-            uu_log=$(ssh_node "$ip" 'tail -1 /var/log/unattended-upgrades/unattended-upgrades.log 2>/dev/null')
-            # Multi-line values are fenced between "<key><<<EOF" and "EOF" marker
-            # lines so the aggregation pass below can slice them back out.
-            printf 'reboot=%s\n' "$reboot"      >  "$tmpdir/$name"
-            printf 'held<<<EOF\n%s\nEOF\n' "$held"         >> "$tmpdir/$name"
-            printf 'upgradable<<<EOF\n%s\nEOF\n' "$upgradable" >> "$tmpdir/$name"
-            printf 'uu_log=%s\n' "$uu_log"     >> "$tmpdir/$name"
-        ) &
-    done
-    wait
-
-    # Aggregate.
-    local pending_reboots=() held_with_bumps_lines=() newest_uu_ts=0 newest_uu_iso=""
-    for entry in "${NODES[@]}"; do
-        name="${entry%%:*}"
-        [[ -f "$tmpdir/$name" ]] || continue
-        local reboot held upgradable uu_log uu_ts
-        reboot=$(awk -F= '/^reboot=/{print $2}' "$tmpdir/$name")
-        held=$(awk '/^held<<<EOF$/,/^EOF$/' "$tmpdir/$name" | sed '1d;$d')
-        upgradable=$(awk '/^upgradable<<<EOF$/,/^EOF$/' "$tmpdir/$name" | sed '1d;$d')
-        uu_log=$(awk -F= '/^uu_log=/{sub(/^uu_log=/,""); print}' "$tmpdir/$name")
-
-        [[ "$reboot" == "yes" ]] && pending_reboots+=("$name")
-
-        # Held + upgradable, excluding k8s components (managed by k8s pipeline).
-        local pkg from to bump
-        while IFS= read -r line; do
-            [[ -z "$line" ]] && continue
-            pkg=$(echo "$line" | awk -F/ '{print $1}')
-            # Skip k8s and kernel/linux-image — the chain handles those.
-            case "$pkg" in
-                kubeadm|kubectl|kubelet) continue ;;
-                linux-image-*|linux-headers-*|linux-modules-*|linux-generic|linux-headers-generic|linux-image-generic) continue ;;
-            esac
-            # Only flag if the package is held. -F: package names contain regex
-            # metacharacters such as "." so they must be matched literally.
-            if grep -qxF -- "$pkg" <<<"$held"; then
-                to=$(echo "$line" | awk '{print $2}')
-                # Stop at "]" as well as space so the closing bracket of
-                # "[upgradable from: X]" is never captured into the version.
-                from=$(echo "$line" | sed -n 's/.*from: \([^] ]*\).*/\1/p')
-                bump="$pkg ${from%-*}→${to%-*}"
-                held_with_bumps_lines+=("$name: $bump")
-            fi
-        done <<<"$upgradable"
-
-        # Newest uu timestamp (ISO at start of log line).
-        uu_ts=$(echo "$uu_log" | sed -E 's/^([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}).*/\1/')
-        if [[ -n "$uu_ts" ]]; then
-            local epoch; epoch=$(date -u -d "$uu_ts" +%s 2>/dev/null || echo 0)
-            if [[ "$epoch" -gt "$newest_uu_ts" ]]; then
-                newest_uu_ts="$epoch"; newest_uu_iso="$uu_ts"
-            fi
-        fi
-    done
-
-    OS_PENDING_REBOOT_NODES="${pending_reboots[*]:-}"
-    if [[ ${#held_with_bumps_lines[@]} -gt 0 ]]; then
-        # paste -d takes a LIST of delimiters that it cycles through, so a
-        # two-character "; " would alternate ";" and " ". Join on ";" and then
-        # widen each separator to "; ".
-        OS_HELD_DETAIL=$(printf '%s\n' "${held_with_bumps_lines[@]}" | sort -u | paste -sd ';' - | sed 's/;/; /g')
-    fi
-
-    if [[ "$newest_uu_ts" -gt 0 ]]; then
-        local age=$((NOW_EPOCH - newest_uu_ts))
-        OS_LAST_UU="$newest_uu_iso UTC ($(human_age "$age"))"
-        OS_LAST_CHECK="$(human_age "$age") (uu daily)"
-    else
-        OS_LAST_UU="(no uu log accessible)"
-        OS_LAST_CHECK="?"
-    fi
-
-    # Last kured reboot — newest Ready transition across all nodes returned by
-    # `kubectl get nodes`. `Ready -> True` is what kured causes when the node
-    # returns; we surface the most recent timestamp and the node it belongs to.
-    local kured_raw kured_iso kured_node kured_ep kured_age
-    kured_raw=$($KUBECTL get nodes -o json 2>/dev/null | python3 -c '
-import json, sys
-from datetime import datetime, timezone
-data = json.load(sys.stdin)
-best = (0, "", "")
-for n in data["items"]:
-    name = n["metadata"]["name"]
-    for c in n["status"].get("conditions", []):
-        if c["type"] == "Ready":
-            # lastTransitionTime is UTC (trailing "Z"); attach tzinfo so that
-            # timestamp() does not interpret the naive value as local time.
-            dt = datetime.strptime(c["lastTransitionTime"], "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
-            ep = int(dt.timestamp())
-            if ep > best[0]:
-                best = (ep, name, c["lastTransitionTime"])
-print(f"{best[0]}|{best[1]}|{best[2]}")
-' 2>/dev/null || echo "0||")
-    kured_ep="${kured_raw%%|*}"
-    kured_node=$(echo "$kured_raw" | cut -d'|' -f2)
-    kured_iso=$(echo "$kured_raw" | cut -d'|' -f3)
-    if [[ "$kured_ep" -gt 0 ]]; then
-        kured_age=$((NOW_EPOCH - kured_ep))
-        OS_LAST_KURED="$kured_iso ($kured_node, $(human_age "$kured_age"))"
-    else
-        OS_LAST_KURED="?"
-    fi
-
-    OS_NEXT="daily 02:00-06:00 London"
-
-    # Kured pod health: count pod phases that are anything other than Running.
-    local kured_pods kured_unhealthy
-    kured_pods=$($KUBECTL -n kured get pods -l app.kubernetes.io/name=kured -o jsonpath='{range .items[*]}{.status.phase}{"\n"}{end}' 2>/dev/null)
-    kured_unhealthy=$(echo "$kured_pods" | grep -cv '^Running$' 2>/dev/null || true)
-
-    local notes=()
-    [[ -n "$OS_HELD_DETAIL" ]]            && notes+=("held with bumps: $OS_HELD_DETAIL")
-    [[ -n "$OS_PENDING_REBOOT_NODES" ]]   && notes+=("pending reboot: $OS_PENDING_REBOOT_NODES")
-
-    if [[ "$kured_unhealthy" -gt 0 ]]; then
-        OS_STATUS_ICON="✗"; OS_STATUS_TEXT="kured down"
-        OS_NOTES="kured pods not all Running"
-        raise_exit 2
-    elif [[ ${#notes[@]} -gt 0 ]]; then
-        OS_STATUS_ICON="⚠"; OS_STATUS_TEXT="attn"
-        OS_NOTES="${notes[*]}"
-        raise_exit 1
-    else
-        OS_STATUS_ICON="✓"; OS_STATUS_TEXT="healthy"
-        OS_NOTES="distros uniform; no held bumps; no pending reboots"
-    fi
-}
-
-# --- 3. K8s (kubeadm/kubelet/kubectl) ---
-# Collect Kubernetes version/upgrade-chain state into the K8S_* globals
-# consumed by the renderers.
-# Globals read:    KUBECTL, NOW_EPOCH
-# Globals written: K8S_RUNNING, K8S_LAST_CHECK, K8S_LAST_DETECT_LINE, K8S_PATCH,
-#                  K8S_MINOR, K8S_IN_FLIGHT, K8S_LAST_CHAIN, K8S_NEXT,
-#                  K8S_STATUS_ICON, K8S_STATUS_TEXT, K8S_NOTES
-# Helpers called:  pg_metrics, to_epoch_int, human_age, next_daily_noon_utc,
-#                  raise_exit
-collect_k8s() {
-    local kver_list kver_uniq metrics target_patch target_minor last_run in_flight started
-
-    kver_list=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.kubeletVersion}{"\n"}{end}' 2>/dev/null)
-    kver_uniq=$(echo "$kver_list" | sort -u)
-    local n_uniq; n_uniq=$(echo "$kver_uniq" | wc -l | tr -d ' ')
-    if [[ "$n_uniq" -eq 1 ]]; then
-        K8S_RUNNING="$kver_uniq across $(echo "$kver_list" | wc -l | tr -d ' ')/$(echo "$kver_list" | wc -l | tr -d ' ') nodes"
-    else
-        # paste -d takes a LIST of delimiters it cycles through, so ', ' would
-        # alternate "," and " " between versions. Join on "," then widen to ", ".
-        K8S_RUNNING="mixed: $(echo "$kver_uniq" | paste -sd ',' - | sed 's/,/, /g')"
-    fi
-    local running_ver; running_ver=$(echo "$kver_uniq" | head -1)
-
-    metrics=$(pg_metrics)
-    # All five may legitimately be absent (cluster never ran the upgrade
-    # chain, kind="minor" not detected, etc.) — `|| true` keeps pipefail
-    # from killing the script on no-match.
-    target_patch=$(echo "$metrics" | { grep -E '^k8s_upgrade_available\{[^}]*kind="patch"' || true; } | sed -n 's/.*target="\([^"]*\)".*/\1/p' | head -1)
-    target_minor=$(echo "$metrics" | { grep -E '^k8s_upgrade_available\{[^}]*kind="minor"' || true; } | sed -n 's/.*target="\([^"]*\)".*/\1/p' | head -1)
-    # Pushgateway emits these with `{instance="",job="..."}` labels — the
-    # `awk '$1 ~ /^name(\{|$)/'` form matches both bare and labelled metrics.
-    last_run=$(echo "$metrics"  | awk '$1 ~ /^k8s_version_check_last_run_timestamp(\{|$)/{print $2}' | head -1 || true)
-    in_flight=$(echo "$metrics" | awk '$1 ~ /^k8s_upgrade_in_flight(\{|$)/{print $2}' | head -1 || true)
-    started=$(echo "$metrics"   | awk '$1 ~ /^k8s_upgrade_started_timestamp(\{|$)/{print $2}' | head -1 || true)
-
-    # Pushgateway timestamps come back in scientific notation
-    # (e.g. 1.779052159e+09) — convert to plain integer seconds.
-    local last_run_int started_int
-    last_run_int=$(to_epoch_int "$last_run")
-    started_int=$(to_epoch_int "$started")
-
-    if [[ "$last_run_int" -gt 0 ]]; then
-        local age=$((NOW_EPOCH - last_run_int))
-        K8S_LAST_CHECK="$(human_age "$age") (daily cron)"
-        if [[ -n "$target_patch" ]]; then
-            K8S_LAST_DETECT_LINE="last run $(human_age "$age"): available v$target_patch (patch)"
-        elif [[ -n "$target_minor" ]]; then
-            K8S_LAST_DETECT_LINE="last run $(human_age "$age"): available v$target_minor (minor)"
-        else
-            K8S_LAST_DETECT_LINE="last run $(human_age "$age"): no upgrade available"
-        fi
-    else
-        K8S_LAST_CHECK="(metric missing)"
-        K8S_LAST_DETECT_LINE="(no k8s_version_check_last_run_timestamp in Pushgateway)"
-    fi
-    K8S_PATCH="${target_patch:-none}"
-    K8S_MINOR="${target_minor:-none}"
-
-    # In-flight / last chain.
-    if [[ "${in_flight:-0}" == "1" ]]; then
-        K8S_IN_FLIGHT="yes"
-        local since=0
-        [[ "$started_int" -gt 0 ]] && since=$((NOW_EPOCH - started_int))
-        K8S_LAST_CHAIN="in-flight (started $(human_age "$since"))"
-    else
-        K8S_IN_FLIGHT="no"
-        if [[ "$started_int" -gt 0 ]]; then
-            local age=$((NOW_EPOCH - started_int))
-            K8S_LAST_CHAIN="$(human_age "$age")"
-        else
-            K8S_LAST_CHAIN="never (or zeroed)"
-        fi
-    fi
-
-    K8S_NEXT="$(next_daily_noon_utc)"
-
-    # Status logic.
-    local stalled=0
-    if [[ "${in_flight:-0}" == "1" && "$started_int" -gt 0 ]]; then
-        # K8sUpgradeStalled fires after 5400s (90m) per monitoring stack.
-        local since=$((NOW_EPOCH - started_int))
-        [[ "$since" -gt 5400 ]] && stalled=1
-    fi
-    # A missing last-run metric is treated as "infinitely old" so it lands in
-    # the "detection stale" branch below.
-    local last_run_age=999999999
-    [[ "$last_run_int" -gt 0 ]] && last_run_age=$((NOW_EPOCH - last_run_int))
-
-    if [[ "$stalled" == "1" ]]; then
-        K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="stalled"
-        K8S_NOTES="K8sUpgradeStalled would fire — chain in-flight >90m"
-        raise_exit 2
-    elif [[ "$last_run_age" -gt $((9*86400)) ]]; then
-        K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="detection stale"
-        K8S_NOTES="last detection >9d ago"
-        raise_exit 2
-    elif [[ "${in_flight:-0}" == "1" ]]; then
-        K8S_STATUS_ICON="…"; K8S_STATUS_TEXT="in-flight"
-        K8S_NOTES="upgrade chain running"
-        raise_exit 1
-    elif [[ -n "$target_patch" ]]; then
-        K8S_STATUS_ICON="→"; K8S_STATUS_TEXT="$target_patch"
-        K8S_NOTES="running $running_ver → v$target_patch (patch) available"
-        raise_exit 1
-    elif [[ -n "$target_minor" ]]; then
-        K8S_STATUS_ICON="→"; K8S_STATUS_TEXT="$target_minor"
-        K8S_NOTES="running $running_ver → v$target_minor (minor) available"
-        raise_exit 1
-    else
-        K8S_STATUS_ICON="✓"; K8S_STATUS_TEXT="current"
-        K8S_NOTES="running $running_ver, nothing newer"
-    fi
-}
-
-# Next daily 12:00 UTC — pure bash date math, no croniter. Schedule was
-# weekly Sunday until 2026-05-18; now `0 12 * * *` in the
-# k8s-version-upgrade stack. If we're still before today's 12:00 UTC,
-# the next run is today; otherwise it's tomorrow.
-# Outputs: one line on stdout, formatted "%a %Y-%m-%d 12:00 UTC".
-next_daily_noon_utc() {
-    local hr days_ahead
-    hr=$(date -u +%H)
-    # %H is zero-padded. Bash evaluates integer-comparison operands as
-    # arithmetic expressions, where a leading 0 means octal, so "08" and "09"
-    # are invalid and the test would error out (falling through to "tomorrow").
-    # The 10# prefix forces base-10 interpretation.
-    if (( 10#$hr < 12 )); then days_ahead=0; else days_ahead=1; fi
-    date -u -d "+$days_ahead days" +"%a %Y-%m-%d 12:00 UTC"
-}
-
-# --- Renderers ---
-# The table uses `column -t` so we don't have to compute visual widths
-# manually (the status icons are multi-byte UTF-8 and ANSI escapes don't
-# play nice with `printf %-Xs`). Trade-off: no in-cell colour, but the
-# icon character already carries the signal.
-# Print the human-readable report: a three-row summary table (Apps / OS / K8s)
-# followed by one detail section per layer. Reads the APPS_*, OS_* and K8S_*
-# globals plus the BOLD / NC formatting variables; writes only to stdout.
-render_table() {
-    # Title, framed by blank lines.
-    printf '\n'
-    printf "${BOLD}Upgrade state — %s${NC}\n" "$(date -u +'%Y-%m-%d %H:%M UTC')"
-    printf '\n'
-
-    # Summary table. Cells are emitted "|"-separated and aligned by column(1).
-    {
-        printf '%s\n' \
-            'Layer|Status|Last check|Next upgrade|Notes' \
-            '-----|------|----------|------------|-----'
-        # The format is re-applied once per group of six arguments (one row each).
-        printf '%s|%s %s|%s|%s|%s\n' \
-            'Apps' "$APPS_STATUS_ICON" "$APPS_STATUS_TEXT" "$APPS_LAST_CHECK" "$APPS_NEXT" "$APPS_NOTES" \
-            'OS  ' "$OS_STATUS_ICON" "$OS_STATUS_TEXT" "$OS_LAST_CHECK" "$OS_NEXT" "$OS_NOTES" \
-            'K8s ' "$K8S_STATUS_ICON" "$K8S_STATUS_TEXT" "$K8S_LAST_CHECK" "$K8S_NEXT" "$K8S_NOTES"
-    } | column -t -s '|' -o ' | '
-
-    # Apps detail.
-    printf '\n'
-    printf "${BOLD}--- Apps (Keel) ---${NC}\n"
-    printf '%s\n' \
-        "Enrolled deployments: $APPS_ENROLLED" \
-        "Recent rollouts: $APPS_UPDATES_LINE" \
-        "Pending approvals: $APPS_PENDING" \
-        "Last Keel error: $APPS_ERROR_LINE"
-
-    # OS detail. Empty reboot/held values fall back to a "none" placeholder.
-    printf '\n'
-    printf "${BOLD}--- OS (apt + kured) ---${NC}\n"
-    printf '%s\n' \
-        "Ubuntu per node: $OS_DISTRO_SUMMARY" \
-        "Kernel per node: $OS_KERNEL_SUMMARY" \
-        "Pending reboot: ${OS_PENDING_REBOOT_NODES:-none}" \
-        "Held packages with upstream bumps: ${OS_HELD_DETAIL:-none (excluding k8s components)}" \
-        "Last uu run (newest across nodes): $OS_LAST_UU" \
-        "Last kured reboot (newest Ready transition): $OS_LAST_KURED" \
-        "Next kured window: $OS_NEXT"
-
-    # K8s detail, closed by a trailing blank line.
-    printf '\n'
-    printf "${BOLD}--- K8s (kubeadm/kubelet/kubectl) ---${NC}\n"
-    printf '%s\n' \
-        "Running: $K8S_RUNNING" \
-        "Latest patch (apt): ${K8S_PATCH}" \
-        "Next minor available: ${K8S_MINOR}" \
-        "Detection: $K8S_LAST_DETECT_LINE" \
-        "In-flight: $K8S_IN_FLIGHT  |  Last chain start: $K8S_LAST_CHAIN" \
-        "Next detection: $K8S_NEXT" \
-        ''
-}
-
-# Print the collected state as a single indented JSON document on stdout.
-# Every value is handed to Python through a one-shot environment assignment
-# (scoped to the python3 process only), so quotes and backslashes inside
-# error lines never need shell- or JSON-escaping here.
-render_json() {
-    APPS_STATUS_ICON="$APPS_STATUS_ICON" APPS_STATUS_TEXT="$APPS_STATUS_TEXT" \
-    APPS_LAST_CHECK="$APPS_LAST_CHECK" APPS_NEXT="$APPS_NEXT" APPS_NOTES="$APPS_NOTES" \
-    APPS_ENROLLED="$APPS_ENROLLED" APPS_PENDING="$APPS_PENDING" \
-    APPS_UPDATES_LINE="$APPS_UPDATES_LINE" APPS_ERROR_LINE="$APPS_ERROR_LINE" \
-    OS_STATUS_ICON="$OS_STATUS_ICON" OS_STATUS_TEXT="$OS_STATUS_TEXT" \
-    OS_LAST_CHECK="$OS_LAST_CHECK" OS_NEXT="$OS_NEXT" OS_NOTES="$OS_NOTES" \
-    OS_DISTRO_SUMMARY="$OS_DISTRO_SUMMARY" OS_KERNEL_SUMMARY="$OS_KERNEL_SUMMARY" \
-    OS_PENDING_REBOOT_NODES="$OS_PENDING_REBOOT_NODES" OS_HELD_DETAIL="$OS_HELD_DETAIL" \
-    OS_LAST_UU="$OS_LAST_UU" OS_LAST_KURED="$OS_LAST_KURED" \
-    K8S_STATUS_ICON="$K8S_STATUS_ICON" K8S_STATUS_TEXT="$K8S_STATUS_TEXT" \
-    K8S_LAST_CHECK="$K8S_LAST_CHECK" K8S_NEXT="$K8S_NEXT" K8S_NOTES="$K8S_NOTES" \
-    K8S_RUNNING="$K8S_RUNNING" K8S_PATCH="$K8S_PATCH" K8S_MINOR="$K8S_MINOR" \
-    K8S_LAST_DETECT_LINE="$K8S_LAST_DETECT_LINE" K8S_IN_FLIGHT="$K8S_IN_FLIGHT" K8S_LAST_CHAIN="$K8S_LAST_CHAIN" \
-    HIGHEST_EXIT="$HIGHEST_EXIT" \
-    python3 -c '
-import json, os
-from datetime import datetime, timezone
-
-# Fields every layer shares: (json key, env-var suffix), in output order.
-COMMON = [
-    ("status", "STATUS_ICON"),
-    ("status_text", "STATUS_TEXT"),
-    ("last_check", "LAST_CHECK"),
-    ("next_upgrade", "NEXT"),
-    ("notes", "NOTES"),
-]
-
-def layer(prefix, extra):
-    # Build one layer object from <prefix>_<suffix> env vars; unset -> "".
-    return {key: os.environ.get(prefix + "_" + suffix, "") for key, suffix in COMMON + extra}
-
-report = {
-    "as_of_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
-    "highest_exit": int(os.environ.get("HIGHEST_EXIT", "")),
-}
-
-apps = layer("APPS", [
-    ("enrolled", "ENROLLED"),
-    ("pending_approvals", "PENDING"),
-    ("updates_line", "UPDATES_LINE"),
-    ("errors_line", "ERROR_LINE"),
-])
-# The two counters are numeric in the output; an empty value counts as 0.
-for counter in ("enrolled", "pending_approvals"):
-    apps[counter] = int(apps[counter] or 0)
-report["apps"] = apps
-
-report["os"] = layer("OS", [
-    ("distros", "DISTRO_SUMMARY"),
-    ("kernels", "KERNEL_SUMMARY"),
-    ("pending_reboot_nodes", "PENDING_REBOOT_NODES"),
-    ("held_with_bumps", "HELD_DETAIL"),
-    ("last_uu_run", "LAST_UU"),
-    ("last_kured_reboot", "LAST_KURED"),
-])
-
-report["k8s"] = layer("K8S", [
-    ("running", "RUNNING"),
-    ("patch_target", "PATCH"),
-    ("minor_target", "MINOR"),
-    ("last_detection_line", "LAST_DETECT_LINE"),
-    ("in_flight", "IN_FLIGHT"),
-    ("last_chain", "LAST_CHAIN"),
-])
-
-print(json.dumps(report, indent=2))
-'
-}
-
-# Entry point: parse flags, run the three collectors in order (Apps, OS, K8s),
-# render in the requested format, then exit with the highest severity that
-# was recorded in HIGHEST_EXIT.
-main() {
-    parse_args "$@"
-
-    local collector
-    for collector in collect_apps collect_os collect_k8s; do
-        "$collector"
-    done
-
-    # JSON output only when the JSON flag is exactly "true"; table otherwise.
-    case "$JSON" in
-        true) render_json ;;
-        *)    render_table ;;
-    esac
-
-    exit "$HIGHEST_EXIT"
-}
-
-main "$@"
--- a/scripts/vault-kubeconfig
+++ b/scripts/vault-kubeconfig
@ -1,10 +0,0 @@
-#!/usr/bin/env bash
-# Generate a short-lived kubeconfig from Vault K8s secrets engine.
-# Requires: vault login -method=oidc (or VAULT_TOKEN set)
-#
-# Writes a `vault-admin` user and a `vault` context into the current
-# kubeconfig and switches to that context.
-set -euo pipefail
-
-# `// ""` maps a missing/null field to an empty string; without it jq would
-# print the literal text "null", which would then be stored as the token.
-TOKEN=$(vault write -format=json kubernetes/creds/local-admin kubernetes_namespace=default | jq -r '.data.service_account_token // ""')
-if [[ -z "$TOKEN" ]]; then
-  echo "ERROR: Vault response contained no service_account_token" >&2
-  exit 1
-fi
-kubectl config set-credentials vault-admin --token="$TOKEN"
-kubectl config set-context vault --cluster=kubernetes --user=vault-admin
-kubectl config use-context vault
-echo "Kubeconfig set with 1h token"
--- a/scripts/vault-token-renew.service
+++ b/scripts/vault-token-renew.service
@ -1,9 +0,0 @@
-[Unit]
-Description=Renew the periodic Vault/OpenBao token in ~/.vault-token
-Documentation=https://github.com/ViktorBarzin/infra/blob/master/scripts/vault-token-renew.sh
-Wants=network-online.target
-After=network-online.target
-
-[Service]
-Type=oneshot
-ExecStart=%h/.local/bin/vault-token-renew
--- a/scripts/vault-token-renew.sh
+++ b/scripts/vault-token-renew.sh
@ -1,90 +0,0 @@
-#!/usr/bin/env bash
-# Renew the long-lived PERIODIC Vault/OpenBao token stored in ~/.vault-token.
-#
-# Background: wizard@devvm used to hold a 7-day OIDC login token (re-auth weekly
-# via `vault login -method=oidc`). On 2026-06-05 that was replaced with a
-# periodic, orphan token so it never expires. Periodic tokens have no max-TTL;
-# they only need renewing within each `period` (768h / 32d here). This unit
-# renews daily, so the token stays alive indefinitely with huge margin. If the
-# box is ever decommissioned and this stops running, the token self-expires
-# within ~32 days (unlike a root token, which would live forever).
-#
-# Token was minted with (vault-admin = path "*" sudo; sops-admin = transit for SOPS):
-#   vault token create -orphan -period=768h \
-#     -policy=vault-admin -policy=sops-admin -display-name=devvm-wizard
-# To recreate if ever lost: `vault login -method=oidc`, run the above with
-#   `-field=token > ~/.vault-token`, then `chmod 600 ~/.vault-token`.
-#
-# Source of truth: infra/scripts/vault-token-renew.sh (deployed to
-# ~/.local/bin/vault-token-renew). Driven by the systemd USER units
-# vault-token-renew.{service,timer}. Deploy + recovery runbook:
-# infra/docs/runbooks/vault-token-renew-devvm.md
-
-# Identity of the one token this script is willing to renew. vtr_drift_ok
-# compares the looked-up display name and policy list against these values.
-EXPECTED_DN="token-devvm-wizard"
-REQUIRED_POLICY="vault-admin"
-
-# vtr_display_name <lookup-json>
-# Prints the token's .data.display_name; prints an empty line when the field
-# is missing or null.
-vtr_display_name() {
-  local lookup_json="$1"
-  jq -r '.data.display_name // ""' <<<"$lookup_json"
-}
-
-# vtr_policies_csv <lookup-json>
-# Prints .data.policies followed by .data.identity_policies as one
-# comma-separated line (either list may be absent).
-# Both lists are merged because a token minted via OIDC carries vault-admin
-# only in identity_policies while .data.policies shows just [default]
-# (misleading on its own — see memory id=4211). Our periodic token carries
-# them as token policies.
-vtr_policies_csv() {
-  local lookup_json="$1"
-  jq -r '((.data.policies // []) + (.data.identity_policies // [])) | join(",")' <<<"$lookup_json"
-}
-
-# vtr_drift_ok <display_name> <policies-csv> -> 0 if this is OUR periodic admin
-# token (display name equals EXPECTED_DN AND REQUIRED_POLICY is present in the
-# list), 1 otherwise. The comma fencing makes the policy match exact (so
-# "vault-admin-ro" never matches). The policy is compared as a literal string:
-# the quoted expansion inside the [[ ]] pattern disables glob/regex meaning, so
-# a policy name containing "." or "*" cannot over-match.
-vtr_drift_ok() {
-  local dn="$1" pols="$2"
-  [[ "$dn" == "$EXPECTED_DN" ]] || return 1
-  [[ ",$pols," == *",$REQUIRED_POLICY,"* ]]
-}
-
-# vtr_main: look up the token the vault CLI is currently using, refuse to
-# continue unless it is our periodic admin token, then renew it. Exactly one
-# line is appended to the log per run (OK / DRIFT / FAIL); every failure path
-# exits 1 so the systemd unit is marked failed.
-vtr_main() {
-  set -euo pipefail
-  export PATH="/usr/local/bin:/usr/bin:/bin:${PATH:-}"
-  export VAULT_ADDR="${VAULT_ADDR:-https://vault.viktorbarzin.me}"
-
-  local state_dir logfile lookup name policies renewal lease
-  state_dir="${XDG_STATE_HOME:-$HOME/.local/state}"
-  logfile="$state_dir/vault-token-renew.log"
-  mkdir -p "$state_dir"
-
-  # stderr is folded into the capture so a failed lookup's message gets logged.
-  lookup=$(vault token lookup -format=json 2>&1) || {
-    printf '%s FAIL: token lookup: %s\n' "$(date -Is)" "$lookup" >>"$logfile"
-    exit 1
-  }
-  name=$(vtr_display_name "$lookup")
-  policies=$(vtr_policies_csv "$lookup")
-
-  # Drift guard (added 2026-06-07): the renewer must NOT keep a FOREIGN token alive.
-  # On 2026-06-05 a stray `vault login -method=kubernetes` overwrote ~/.vault-token
-  # with a read-only woodpecker token, and this script then silently renewed THAT
-  # for two days — masking the loss of write access. So the token must be confirmed
-  # as our periodic admin token before renewing; on drift, fail loudly instead of
-  # keeping someone else's token alive.
-  vtr_drift_ok "$name" "$policies" || {
-    printf '%s DRIFT: ~/.vault-token is dn=%q policies=%q (expected dn=%q with %q). Refusing to renew a foreign token. Re-mint: vault login -method=oidc && vault token create -orphan -period=768h -policy=vault-admin -policy=sops-admin -display-name=devvm-wizard -field=token > ~/.vault-token && chmod 600 ~/.vault-token\n' \
-      "$(date -Is)" "$name" "$policies" "$EXPECTED_DN" "$REQUIRED_POLICY" >>"$logfile"
-    exit 1
-  }
-
-  # `vault token renew` with no argument renews the calling token (renew-self).
-  if ! renewal=$(vault token renew -format=json 2>&1); then
-    printf '%s FAIL: %s\n' "$(date -Is)" "$renewal" >>"$logfile"
-    exit 1
-  fi
-
-  # Log only the new TTL (never the raw JSON — it contains the token). If the
-  # lease cannot be extracted, "?" is logged in its place.
-  lease=$(printf '%s' "$renewal" | jq -r '.auth.lease_duration' 2>/dev/null || echo '?')
-  printf '%s OK renewed (dn=%s ttl=%ss)\n' "$(date -Is)" "$name" "$lease" >>"$logfile"
-}
-
-# Run main only when executed directly, so the test can source the pure functions.
-# BASH_SOURCE[0] is this file's path and $0 is the invoked script's path; the two
-# are equal only when this file itself is the script being run.
-if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
-  vtr_main "$@"
-fi
--- a/scripts/vault-token-renew.timer
+++ b/scripts/vault-token-renew.timer
@ -1,10 +0,0 @@
-[Unit]
-Description=Daily renewal of the periodic Vault token in ~/.vault-token
-
-[Timer]
-OnCalendar=daily
-Persistent=true
-RandomizedDelaySec=300
-
-[Install]
-WantedBy=timers.target
--- a/scripts/woodpecker-register-forgejo-repo.sh
+++ b/scripts/woodpecker-register-forgejo-repo.sh
@ -1,121 +0,0 @@
-#!/usr/bin/env bash
-# Programmatically register a Forgejo repo in Woodpecker without needing the
-# Web UI's OAuth flow.
-#
-# Earlier we believed only the OAuth login could create a working webhook
-# because the webhook URL contains a JWT signed with a server-side key.
-# That's true for the JWT, BUT the webhook is created server-side when the
-# repo is activated through POST /api/repos — Woodpecker handles the JWT
-# generation internally. We just need to call that endpoint as the right
-# user (the one whose forge OAuth token can read the repo).
-#
-# The Woodpecker admin token (mine, ViktorBarzin@github) is a session JWT
-# of the form `{"type":"user","user-id":"1"}` signed with the user's
-# `hash` column (per-user, stored in the `users` table). Forge-API calls
-# made on behalf of that user use the user's stored OAuth `access_token`
-# from the same row. My GitHub admin can't read Forgejo repos, so the
-# admin token can't activate Forgejo repos.
-#
-# The fix: mint a session JWT for the Forgejo `viktor` user (user_id=2)
-# using `viktor`'s `hash`. Then POST /api/repos as viktor — viktor's
-# stored Forgejo OAuth token has the access needed.
-#
-# Usage:
-#   ./woodpecker-register-forgejo-repo.sh <forgejo-org/repo> [<forgejo-org/repo> ...]
-# Example:
-#   ./woodpecker-register-forgejo-repo.sh viktor/broker-sync viktor/freedify
-#
-# Requires:
-# - vault CLI logged in (oidc or token), with read access to
-#   database/static-creds/pg-woodpecker AND a Forgejo PAT in the
-#   forgejo_admin_token field of secret/viktor (or pass FORGEJO_TOKEN env var)
-# - kubectl with cluster access (for the temporary psql pod)
-# - openssl
-
-set -euo pipefail
-
-# Overridable via the environment; defaults target the homelab instances.
-NS=${NS:-woodpecker}
-WP_URL=${WP_URL:-https://ci.viktorbarzin.me}
-FORGEJO_URL=${FORGEJO_URL:-https://forgejo.viktorbarzin.me}
-FORGEJO_USER_LOGIN=${FORGEJO_USER_LOGIN:-viktor}
-
-if [ "$#" -lt 1 ]; then
-  echo "usage: $0 <org/repo> [<org/repo> ...]" >&2
-  exit 1
-fi
-
-# Fetch the Woodpecker DB credentials used below to read the target user's
-# `hash` (the key that signs the session JWT). Read the static role ONCE and
-# take both fields from the same response, so a rotation between two separate
-# reads can never pair an old username with a new password (or vice versa).
-WP_DB_CREDS=$(vault read -format=json database/static-creds/pg-woodpecker)
-WP_DB_USER=$(jq -r .data.username <<<"$WP_DB_CREDS")
-WP_DB_PASS=$(jq -r .data.password <<<"$WP_DB_CREDS")
-unset WP_DB_CREDS
-
-# Start a throwaway pod carrying a psql client; it is deleted by the EXIT trap.
-# NOTE(review): the DB password is interpolated into the pod spec as a
-# double-quoted YAML string, so it is visible to anyone who can read pods in
-# $NS and would break on a password containing `"` or `\` — confirm acceptable.
-PG_POD=tmp-wp-register-$$
-cat <<EOF | kubectl apply -f - >/dev/null
-apiVersion: v1
-kind: Pod
-metadata: { name: $PG_POD, namespace: $NS }
-spec:
-  restartPolicy: Never
-  containers:
-  - name: psql
-    image: postgres:15
-    env: [{name: PGPASSWORD, value: "$WP_DB_PASS"}]
-    command: ["sleep", "300"]
-EOF
-trap "kubectl delete pod -n $NS $PG_POD --wait=false >/dev/null 2>&1 || true" EXIT
-
-# Poll (about once per second, 30 attempts) until the pod reports Running.
-PHASE=""
-for _ in {1..30}; do
-  PHASE=$(kubectl get pod -n $NS $PG_POD -o jsonpath='{.status.phase}' 2>/dev/null || true)
-  [ "$PHASE" = "Running" ] && break
-  sleep 1
-done
-# Fail with a clear message instead of falling through to a `kubectl exec`
-# against a pod that never started.
-if [ "$PHASE" != "Running" ]; then
-  echo "ERROR: pod $PG_POD in namespace $NS did not reach Running (phase: ${PHASE:-unknown})" >&2
-  exit 1
-fi
-
-# Read the target user's `hash` column through the temporary psql pod; it is
-# used below as the HMAC key for the session JWT. All whitespace is stripped
-# from the query output.
-VIKTOR_HASH=$(kubectl exec -n $NS $PG_POD -- psql -h pg-cluster-rw.dbaas -U "$WP_DB_USER" -d woodpecker -tA -c \
-  "SELECT hash FROM users WHERE login='$FORGEJO_USER_LOGIN' AND forge_id=2" | tr -d '[:space:]')
-
-# An empty result means no row matched login + forge_id=2.
-if [ -z "$VIKTOR_HASH" ]; then
-  echo "ERROR: no woodpecker user found for forge_id=2 login=$FORGEJO_USER_LOGIN" >&2
-  echo "       (have they ever logged in via Forgejo OAuth?)" >&2
-  exit 1
-fi
-
-# Mint a session JWT (HS256) for that user.
-# b64: base64url-encode stdin on a single line ("+/" mapped to "-_", "=" padding removed).
-b64() { openssl base64 -A | tr '+/' '-_' | tr -d '='; }
-HEADER=$(printf '%s' '{"alg":"HS256","typ":"JWT"}' | b64)
-# Payload carries the user's numeric id (looked up with the same WHERE clause) as a string.
-PAYLOAD=$(printf '{"type":"user","user-id":"%s"}' \
-  "$(kubectl exec -n $NS $PG_POD -- psql -h pg-cluster-rw.dbaas -U "$WP_DB_USER" -d woodpecker -tA -c \
-       "SELECT id FROM users WHERE login='$FORGEJO_USER_LOGIN' AND forge_id=2" | tr -d '[:space:]')" | b64)
-# Signature: HMAC-SHA256 over "<header>.<payload>", keyed with the user's hash.
-SIG=$(printf '%s.%s' "$HEADER" "$PAYLOAD" | openssl dgst -sha256 -hmac "$VIKTOR_HASH" -binary | b64)
-TOKEN="$HEADER.$PAYLOAD.$SIG"
-
-# Sanity check: am I really logged in as viktor?
-# Ask Woodpecker who the token authenticates as and compare with the expected login.
-ME=$(curl -sf "$WP_URL/api/user" -H "Authorization: Bearer $TOKEN" | jq -r '.login')
-if [ "$ME" != "$FORGEJO_USER_LOGIN" ]; then
-  echo "ERROR: minted token authenticates as '$ME', not '$FORGEJO_USER_LOGIN'" >&2
-  exit 1
-fi
-echo "Authenticated as: $ME"
-
-# Activate each repo via POST /api/repos?forge_remote_id=N
-# Forgejo repo ID is fetched via the Forgejo API.
-# The Forgejo token comes from FORGEJO_TOKEN, else from the forgejo_admin_token
-# field of secret/viktor in Vault.
-FORGEJO_AUTH="${FORGEJO_TOKEN:-$(vault kv get -field=forgejo_admin_token secret/viktor 2>/dev/null || true)}"
-if [ -z "$FORGEJO_AUTH" ]; then
-  echo "ERROR: set FORGEJO_TOKEN env or seed secret/viktor/forgejo_admin_token in vault" >&2
-  exit 1
-fi
-
-for repo in "$@"; do
-  # A repo whose id cannot be resolved is reported and skipped; the rest still run.
-  FRID=$(curl -sf "$FORGEJO_URL/api/v1/repos/$repo" -H "Authorization: token $FORGEJO_AUTH" | jq -r .id 2>/dev/null || true)
-  if [ -z "$FRID" ] || [ "$FRID" = "null" ]; then
-    echo "  $repo: ERROR resolving Forgejo repo id" >&2
-    continue
-  fi
-  # Response body goes to a mktemp file rather than a predictable /tmp name,
-  # so another local user cannot pre-create or symlink the path.
-  RESP=$(mktemp)
-  HTTP=$(curl -s -X POST "$WP_URL/api/repos?forge_remote_id=$FRID" \
-    -H "Authorization: Bearer $TOKEN" \
-    -o "$RESP" -w "%{http_code}") || { rc=$?; rm -f -- "$RESP"; exit "$rc"; }
-  case "$HTTP" in
-    200) echo "  $repo: activated (id=$(jq -r .id "$RESP"))" ;;
-    409) echo "  $repo: already active" ;;
-    *)   echo "  $repo: HTTP $HTTP — $(cat "$RESP")" ;;
-  esac
-  rm -f -- "$RESP"
-done
--- a/scripts/workstation/.gitignore
+++ b/scripts/workstation/.gitignore
@ -1,3 +0,0 @@
-__pycache__/
-.pytest_cache/
-*.pyc
--- a/scripts/workstation/managed-settings.json
+++ b/scripts/workstation/managed-settings.json
@ -1,3 +0,0 @@
-{
-  "claudeMd": "# Viktor Barzin homelab — shared multi-user Claude Code Workstation (devvm)\n\nYou are running as a specific OS user on a SHARED devvm Workstation, not as the admin. These org-wide rules apply to EVERY user and sit at the top of settings precedence (they cannot be overridden by a user's own config):\n\n- Respect your permission tier. Your kubectl, Vault, and infra access are scoped to your RBAC tier (admin / power-user / namespace-owner). Do not attempt to escalate privileges or reach another user's resources.\n- Secrets are per-user. Never read another user's home directory, credentials, tokens, or ~/.claude secrets. Your own secrets live in your home at mode 600.\n- Infrastructure changes go through Terraform/Terragrunt (scripts/tg apply) — never direct kubectl apply/edit/patch. Pushing to git does NOT deploy; applies are manual and admin-gated, so your edits cannot take effect without an admin apply.\n- Follow the engineering rules in ~/.claude/rules/ (execution, planning, quality) and every CLAUDE.md in the repo tree.\n- The monorepo is at ~/code. Non-admins get a git-crypt-LOCKED clone: secret files read as ciphertext — that is expected, not an error."
-}
--- a/scripts/workstation/packages.txt
+++ b/scripts/workstation/packages.txt
@ -1,26 +0,0 @@
-# Declarative host toolset for the devvm Workstation (apt packages, one per line).
-# Consumed by setup-devvm.sh, which reads it with
-#   mapfile -t PKGS < <(grep -vE '^[[:space:]]*(#|$)' packages.txt)
-# and then runs:  apt-get install -y "${PKGS[@]}"
-# Comments (#) and blank lines are ignored. Tools NOT in the standard apt repos
-# are listed below as comments with their real install path (handled explicitly
-# in setup-devvm.sh) so this manifest stays a safe argument to `apt-get install`.
-git
-zsh
-tmux
-ripgrep
-fd-find
-jq
-curl
-ca-certificates
-python3
-python3-yaml
-python3-pip
-podman
-
-# --- installed by setup-devvm.sh via NON-apt paths (not apt-installable) ---
-# nodejs + npm                -> NodeSource repo (claude-code needs node >= 18; distro nodejs is too old)
-# @anthropic-ai/claude-code   -> npm install -g
-# kubectl                     -> k8s apt repo OR pinned binary (already present on devvm)
-# vault                       -> HashiCorp apt repo OR pinned binary (already present on devvm)
-# kubelogin (kubectl oidc-login) -> `kubectl krew install oidc-login` or int128/kubelogin release.
-#                                NOTE: the apt package literally named "kubelogin" is the AZURE
-#                                tool, NOT the OIDC one we need -- do not apt-install it.
--- a/scripts/workstation/roster.yaml
+++ b/scripts/workstation/roster.yaml
@ -1,21 +0,0 @@
-# THE single source of truth for the devvm Workstation lifecycle (onboard -> offboard).
-# Consumed by roster_engine.py (derive/validate) + t3-provision-users.sh (apply).
-#
-# os_user (the map KEY, pinned) -> authentik_user . k8s_user . tier . namespaces
-# The three identifiers differ per person (verified 2026-06-08) -- no email->username
-# derivation; record each explicitly.
-#
-# Tiers: admin | power-user | namespace-owner
-#   admin           - cluster-admin, unlocked tree, secrets (groups: sudo,docker,code-shared)
-#   power-user      - cluster-wide READ (no Secrets) via oidc-power-user-readonly; locked clone
-#   namespace-owner - admin in their own namespace(s) only; locked clone
-#
-# wizard IS listed (as admin): the reconcile REGENERATES /etc/ttyd-user-map +
-# dispatch.json from this file, so omitting him would drop his t3 instance. The
-# provisioner skips account/group/clone mutations for already-existing users, so
-# listing him is safe (he keeps his unlocked tree + cluster-admin untouched).
-users:
-  wizard:    {authentik_user: vbarzin,     k8s_user: wizard, tier: admin}                                          # base config author + cluster-admin
-  emo:       {authentik_user: emil.barzin, k8s_user: emo,    tier: power-user}                                     # NET-NEW k8s_users entry (add as power-user before provisioning)
-  ancamilea: {authentik_user: ancaelena98, k8s_user: anca,   tier: namespace-owner, namespaces: [plotting-book]}   # ALREADY provisioned in-cluster -- assert, don't re-create
-# gheorghe:  {authentik_user: vabbit81,    k8s_user: vabbit81, tier: namespace-owner, namespaces: [vabbit81]}      # already a cluster ns-owner; uncomment to give him a devvm workstation
--- a/scripts/workstation/roster_engine.py
+++ b/scripts/workstation/roster_engine.py
@ -1,299 +0,0 @@
-#!/usr/bin/env python3
-"""Pure derivation + offboarding-diff engine for the devvm Workstation roster.
-
-Functional core (this module, unit-tested) / imperative shell (the bash
-provisioner that consumes the JSON this emits and performs the host mutations).
-No host I/O lives in the tested functions. See PRD ViktorBarzin/infra#9.
-
-The roster (`roster.yaml`) is the single source of truth for the workstation
-lifecycle. `os_user` is the pinned key; `authentik_user` / `k8s_user` differ
-per person and are recorded explicitly (no email->username derivation).
-"""
-
-from __future__ import annotations
-
-import json
-import sys
-from dataclasses import dataclass, field
-from typing import Iterable
-
-import yaml
-
-BASE_PORT = 3773
-VALID_TIERS = ("admin", "power-user", "namespace-owner")
-# Tier -> supplementary groups the reconcile ENSURES (additive-only; never stripped).
-TIER_GROUPS: dict[str, tuple[str, ...]] = {
-    "admin": ("code-shared", "docker", "sudo"),
-    "power-user": (),
-    "namespace-owner": (),
-}
-DEFAULT_SHELL = "/bin/zsh"
-_REVERSIBLE_OFFBOARD_KINDS = (
-    "disable_instance",
-    "unmap_dispatch",
-    "remove_from_t3_group",
-    "lock_login",
-    "revoke_cluster_rbac",
-)
-
-
-class RosterError(ValueError):
-    """Raised when the roster is structurally invalid.
-
-    `_parse_user` raises it for a missing required field, an unknown tier, or
-    a `namespaces` value that does not fit the user's tier.
-    """
-
-
-@dataclass(frozen=True)
-class User:
-    """One roster entry: a person's three identifiers plus their access tier."""
-    os_user: str  # the roster map key for this entry
-    authentik_user: str  # left-hand side of the ttyd map lines; key of the dispatch table
-    k8s_user: str  # name looked up in the k8s_users tier map by validate_tiers
-    tier: str  # one of VALID_TIERS
-    namespaces: tuple[str, ...] = ()  # non-empty only for the namespace-owner tier
-
-
-@dataclass(frozen=True)
-class Roster:
-    """The parsed roster: `User` records keyed by their os_user."""
-    users: dict[str, User] = field(default_factory=dict)
-
-
-@dataclass(frozen=True)
-class Account:
-    """Desired account settings for one roster user, as derived by `derive_desired_state`."""
-    os_user: str
-    tier: str
-    shell: str  # derive_desired_state always sets DEFAULT_SHELL
-    login_locked: bool  # derive_desired_state always sets True
-    groups: tuple[str, ...]  # TIER_GROUPS[tier]; meant to be ensured additively
-
-
-@dataclass(frozen=True)
-class DesiredState:
-    """Everything `derive_desired_state` computes from a roster plus the existing ports."""
-    accounts: dict[str, Account]  # keyed by os_user
-    ttyd_user_map: str  # full file text: generated header + "<authentik_user>=<os_user>" lines
-    dispatch: dict[str, dict]  # authentik_user -> {"os_user": ..., "port": ...}
-    ports: dict[str, int]  # os_user -> port
-
-
-@dataclass(frozen=True)
-class OffboardAction:
-    """One step of the offboarding plan produced by `offboard_plan` for a removed user."""
-    os_user: str
-    kind: str  # one of _REVERSIBLE_OFFBOARD_KINDS, or "userdel_archive"
-    reversible: bool  # offboard_plan sets False only for "userdel_archive"
-
-
-# --------------------------------------------------------------------------
-# Parsing + structural validation
-# --------------------------------------------------------------------------
-
-
-def _parse_user(os_user: str, spec: dict) -> User:
-    """Validate one roster entry and convert it into a `User`.
-
-    Args:
-        os_user: the roster map key for this entry.
-        spec: the entry's mapping; must carry `authentik_user`, `k8s_user` and
-            `tier`, plus `namespaces` (a list) for namespace-owners only.
-
-    Returns:
-        The parsed `User`, with `namespaces` normalised to a tuple.
-
-    Raises:
-        RosterError: if the entry is not a mapping, a required field is
-            missing, the tier is not in `VALID_TIERS`, `namespaces` is not a
-            list, or `namespaces` is set/unset inconsistently with the tier.
-    """
-    # A bare `name:` line parses to None and a scalar to str/int; report those as
-    # roster errors instead of leaking a TypeError from the `in` checks below.
-    if not isinstance(spec, dict):
-        raise RosterError(
-            f"user {os_user!r}: entry must be a mapping, got {type(spec).__name__}"
-        )
-    for required in ("authentik_user", "k8s_user", "tier"):
-        if required not in spec:
-            raise RosterError(f"user {os_user!r}: missing required field {required!r}")
-    tier = spec["tier"]
-    if tier not in VALID_TIERS:
-        raise RosterError(
-            f"user {os_user!r}: unknown tier {tier!r} (valid: {list(VALID_TIERS)})"
-        )
-    raw_namespaces = spec.get("namespaces") or ()
-    # A scalar such as `namespaces: plotting-book` would otherwise be split into
-    # one-character "namespaces" by tuple().
-    if not isinstance(raw_namespaces, (list, tuple)):
-        raise RosterError(
-            f"user {os_user!r}: namespaces must be a list, got "
-            f"{type(raw_namespaces).__name__}"
-        )
-    namespaces = tuple(raw_namespaces)
-    if tier == "namespace-owner" and not namespaces:
-        raise RosterError(f"user {os_user!r}: namespace-owner requires namespaces")
-    if tier != "namespace-owner" and namespaces:
-        raise RosterError(f"user {os_user!r}: only namespace-owner may set namespaces")
-    return User(os_user, spec["authentik_user"], spec["k8s_user"], tier, namespaces)
-
-
-def load_roster(text: str) -> Roster:
-    """Parse roster YAML text into a `Roster`.
-
-    Empty text, a missing `users` key, or an empty `users` value all yield an
-    empty roster. Each entry under `users` is validated by `_parse_user`.
-
-    Raises:
-        RosterError: if the document or its `users` value is not a mapping, or
-            if any entry fails `_parse_user` validation.
-    """
-    data = yaml.safe_load(text) or {}
-    # A top-level list/scalar has no .get(); report it as a roster error rather
-    # than an AttributeError.
-    if not isinstance(data, dict):
-        raise RosterError(
-            f"roster: top level must be a mapping, got {type(data).__name__}"
-        )
-    users_raw = data.get("users") or {}
-    if not isinstance(users_raw, dict):
-        raise RosterError(
-            f"roster: 'users' must be a mapping, got {type(users_raw).__name__}"
-        )
-    return Roster({name: _parse_user(name, spec) for name, spec in users_raw.items()})
-
-
-def load_roster_file(path: str) -> Roster:
-    """Read the roster YAML at `path` as UTF-8 and parse it with `load_roster`."""
-    with open(path, encoding="utf-8") as roster_fh:
-        roster_text = roster_fh.read()
-    return load_roster(roster_text)
-
-
-# --------------------------------------------------------------------------
-# Tier validation against live k8s_users (fail-loud)
-# --------------------------------------------------------------------------
-
-
-@dataclass(frozen=True)
-class ValidationIssue:
-    """One finding from `validate_tiers` about a single roster user."""
-    os_user: str
-    severity: str  # "error" = tier conflict (abort) | "warn" = absent (grant pending)
-    message: str  # human-readable text; the CLI prints it to stderr after the severity
-
-
-def validate_tiers(
-    roster: Roster, k8s_user_tiers: dict[str, str]
-) -> list[ValidationIssue]:
-    """Check every non-admin roster user against the live `k8s_users` tier map.
-
-    Args:
-        roster: the parsed roster.
-        k8s_user_tiers: map of k8s_user -> tier as currently granted.
-
-    Returns:
-        One issue per inconsistent user: "warn" when the k8s_user has no entry
-        yet (grant pending, onboarding proceeds), "error" when the recorded tier
-        differs from the roster tier. Admins are skipped (cluster-admin is
-        granted out of band). An empty list means roster and cluster agree.
-    """
-    found: list[ValidationIssue] = []
-    for entry in roster.users.values():
-        if entry.tier == "admin":
-            continue
-        live_tier = k8s_user_tiers.get(entry.k8s_user)
-        if live_tier is None:
-            severity = "warn"
-            message = (
-                f"{entry.os_user}: tier {entry.tier} but k8s_user {entry.k8s_user!r} "
-                f"absent from k8s_users (kubectl grant pending — add the entry)"
-            )
-        elif live_tier != entry.tier:
-            severity = "error"
-            message = (
-                f"{entry.os_user}: roster tier {entry.tier} != k8s_users tier "
-                f"{live_tier} for {entry.k8s_user!r}"
-            )
-        else:
-            # Tiers agree: nothing to report for this user.
-            continue
-        found.append(ValidationIssue(entry.os_user, severity, message))
-    return found
-
-
-def has_blocking_errors(issues: list[ValidationIssue]) -> bool:
-    """Return True when at least one issue has severity "error" (warnings never block)."""
-    for issue in issues:
-        if issue.severity == "error":
-            return True
-    return False
-
-
-# --------------------------------------------------------------------------
-# Desired-state derivation (sticky ports, ttyd map, dispatch, accounts)
-# --------------------------------------------------------------------------
-
-
-def _allocate_ports(roster: Roster, existing_ports: dict[str, int]) -> dict[str, int]:
-    """Assign a port to every roster user, keeping ports that are already assigned.
-
-    Users present in `existing_ports` keep their port. The remaining users are
-    handled in sorted os_user order and each gets the lowest port >= BASE_PORT
-    not already taken. Entries in `existing_ports` for users outside the roster
-    are ignored.
-    """
-    assigned: dict[str, int] = {}
-    for name in roster.users:
-        if name in existing_ports:
-            assigned[name] = existing_ports[name]
-    taken = set(assigned.values())
-    for name in sorted(roster.users):
-        if name in assigned:
-            continue
-        port = BASE_PORT
-        while port in taken:
-            port += 1
-        assigned[name] = port
-        taken.add(port)
-    return assigned
-
-
-_TTYD_MAP_HEADER = (
-    "# Generated from roster.yaml by roster_engine.py — DO NOT EDIT BY HAND.\n"
-    "# <authentik_user>=<os_user>; consumed by t3-dispatch.\n"
-)
-
-
-def derive_desired_state(
-    roster: Roster, existing_ports: dict[str, int]
-) -> DesiredState:
-    """Derive accounts, the ttyd user map, the dispatch table and ports from the roster.
-
-    Ports come from `_allocate_ports`. The ttyd map lines and dispatch entries
-    are emitted in ascending port order; accounts follow roster order.
-    """
-    ports = _allocate_ports(roster, existing_ports)
-    by_port = sorted(roster.users.values(), key=lambda entry: ports[entry.os_user])
-
-    mapping_lines: list[str] = []
-    dispatch: dict[str, dict] = {}
-    for entry in by_port:
-        mapping_lines.append(f"{entry.authentik_user}={entry.os_user}")
-        dispatch[entry.authentik_user] = {
-            "os_user": entry.os_user,
-            "port": ports[entry.os_user],
-        }
-    ttyd_user_map = _TTYD_MAP_HEADER + "\n".join(mapping_lines) + "\n"
-
-    accounts: dict[str, Account] = {}
-    for entry in roster.users.values():
-        accounts[entry.os_user] = Account(
-            os_user=entry.os_user,
-            tier=entry.tier,
-            shell=DEFAULT_SHELL,
-            login_locked=True,
-            groups=TIER_GROUPS[entry.tier],
-        )
-    return DesiredState(accounts, ttyd_user_map, dispatch, ports)
-
-
-def groups_to_add(desired: Iterable[str], current: Iterable[str]) -> list[str]:
-    """Return, sorted, the desired groups the user is not yet in (for `gpasswd -a`).
-
-    Additive-only: groups in `current` but not in `desired` are never reported,
-    so a routine reconcile cannot strip a pre-existing user's legacy groups.
-    """
-    wanted = set(desired)
-    already = set(current)
-    return sorted(group for group in wanted if group not in already)
-
-
-# --------------------------------------------------------------------------
-# Offboarding diff (staged: reversible cut, then gated destructive removal)
-# --------------------------------------------------------------------------
-
-
-def to_deprovision(old: Roster, new: Roster) -> list[str]:
-    """Return, sorted, the os_users present in `old` but missing from `new`."""
-    return sorted(name for name in set(old.users) if name not in new.users)
-
-
-def offboard_plan(
-    old: Roster, new: Roster, *, include_destructive: bool
-) -> list[OffboardAction]:
-    """Build the staged offboarding plan for users dropped between `old` and `new`.
-
-    For each removed user (in sorted order) every reversible action in
-    `_REVERSIBLE_OFFBOARD_KINDS` is emitted. The irreversible `userdel_archive`
-    step is appended only when `include_destructive` is true, so a plain
-    reconcile never receives it.
-    """
-    plan: list[OffboardAction] = []
-    for os_user in to_deprovision(old, new):
-        for kind in _REVERSIBLE_OFFBOARD_KINDS:
-            plan.append(OffboardAction(os_user, kind, True))
-        if include_destructive:
-            plan.append(OffboardAction(os_user, "userdel_archive", False))
-    return plan
-
-
-# --------------------------------------------------------------------------
-# CLI adapter (imperative shell entrypoint — consumed by t3-provision-users.sh)
-# --------------------------------------------------------------------------
-
-
-def _desired_state_to_dict(ds: DesiredState) -> dict:
-    """Flatten a `DesiredState` into plain dicts/lists so it can be JSON-serialised."""
-    accounts = {}
-    for name, account in ds.accounts.items():
-        accounts[name] = dict(
-            os_user=account.os_user,
-            tier=account.tier,
-            shell=account.shell,
-            login_locked=account.login_locked,
-            groups=list(account.groups),  # tuple -> list
-        )
-    return dict(
-        accounts=accounts,
-        ttyd_user_map=ds.ttyd_user_map,
-        dispatch=ds.dispatch,
-        ports=ds.ports,
-    )
-
-
-def _main(argv: list[str]) -> int:
-    """CLI entry point: `validate` or `derive` against a roster file.
-
-    `validate` prints every issue to stderr and returns 1 when any is an
-    error, else 0. `derive` writes the desired state to stdout as indented,
-    key-sorted JSON followed by a newline, and returns 0.
-    """
-    import argparse
-
-    cli = argparse.ArgumentParser(description="Workstation roster engine")
-    commands = cli.add_subparsers(dest="cmd", required=True)
-
-    validate_cmd = commands.add_parser(
-        "validate", help="exit 1 if roster tiers diverge from k8s_users"
-    )
-    validate_cmd.add_argument("--roster", required=True)
-    validate_cmd.add_argument(
-        "--k8s-users-json", required=True, help="JSON map {k8s_user: tier}"
-    )
-
-    derive_cmd = commands.add_parser("derive", help="emit desired state as JSON")
-    derive_cmd.add_argument("--roster", required=True)
-    derive_cmd.add_argument(
-        "--ports-json", required=True, help="JSON map {os_user: port}"
-    )
-
-    opts = cli.parse_args(argv)
-    roster = load_roster_file(opts.roster)
-
-    if opts.cmd == "validate":
-        with open(opts.k8s_users_json, encoding="utf-8") as tiers_fh:
-            issues = validate_tiers(roster, json.load(tiers_fh))
-        for issue in issues:
-            print(f"{issue.severity.upper()}: {issue.message}", file=sys.stderr)
-        if has_blocking_errors(issues):
-            return 1
-        return 0
-
-    # Only other subcommand: derive.
-    with open(opts.ports_json, encoding="utf-8") as ports_fh:
-        desired = derive_desired_state(roster, json.load(ports_fh))
-    json.dump(_desired_state_to_dict(desired), sys.stdout, indent=2, sort_keys=True)
-    sys.stdout.write("\n")
-    return 0
-
-
-if __name__ == "__main__":
-    raise SystemExit(_main(sys.argv[1:]))
--- a/scripts/workstation/setup-devvm.sh
+++ b/scripts/workstation/setup-devvm.sh
@ -1,80 +0,0 @@
-#!/usr/bin/env bash
-# Idempotent machine-wide host base for the devvm Claude Code Workstation.
-# Run as root. Sets up ONLY machine-wide state: the apt toolset, node + claude-code,
-# kubelogin, the ENFORCED managed Claude config, and /etc/skel defaults (launcher,
-# tmux UX, and live config-inheritance symlinks into the shared config base).
-#
-# PER-USER provisioning (accounts, per-tier groups, kubeconfig, secrets, infra
-# clone) lives in t3-provision-users.sh — NOT here. Safe to re-run.
-set -euo pipefail
-HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-# The shared config base every user inherits from (live, chezmoi-versioned).
-# Coupled to the admin's home today; override to relocate to a neutral path.
-CONFIG_BASE="${WORKSTATION_CONFIG_BASE:-/home/wizard/.claude}"
-[[ $EUID -eq 0 ]] || { echo "setup-devvm.sh: must run as root" >&2; exit 1; }
-log() { echo "[setup-devvm] $*"; }
-
-# 1) apt toolset (declarative manifest; comments/blank lines stripped)
-mapfile -t PKGS < <(grep -vE '^[[:space:]]*(#|$)' "$HERE/packages.txt")
-log "apt: ensuring ${#PKGS[@]} packages present"
-export DEBIAN_FRONTEND=noninteractive
-apt-get update -qq
-apt-get install -y "${PKGS[@]}" >/dev/null
-
-# 2) node >= 18 + claude-code (claude-code requires node >= 18)
-need_node=1
-if command -v node >/dev/null; then
-  [[ "$(node -v | sed 's/^v\([0-9]*\).*/\1/')" -ge 18 ]] && need_node=0
-fi
-if [[ $need_node -eq 1 ]]; then
-  log "node: installing NodeSource 22.x"
-  curl -fsSL https://deb.nodesource.com/setup_22.x | bash - >/dev/null
-  apt-get install -y nodejs >/dev/null
-fi
-command -v claude >/dev/null || { log "npm: installing @anthropic-ai/claude-code"; npm install -g @anthropic-ai/claude-code >/dev/null; }
-
-# 3) kubelogin (kubectl oidc-login) system-wide — NOT the apt 'kubelogin' (= Azure tool)
-if [[ ! -x /usr/local/bin/kubelogin ]]; then
-  log "kubelogin: installing int128/kubelogin"
-  tmp="$(mktemp -d)"
-  # Under `set -e` a failed download/extract/install aborts the script before the
-  # explicit rm below, so also register the scratch-dir cleanup on EXIT.
-  trap 'rm -rf -- "$tmp"' EXIT
-  curl -fsSL -o "$tmp/kl.zip" https://github.com/int128/kubelogin/releases/latest/download/kubelogin_linux_amd64.zip
-  # unzip is not in packages.txt, so fall back to python3's zipfile module.
-  ( cd "$tmp" && { unzip -o kl.zip kubelogin >/dev/null 2>&1 || python3 -m zipfile -e kl.zip .; } )
-  install -m 0755 "$tmp/kubelogin" /usr/local/bin/kubelogin
-  ln -sf /usr/local/bin/kubelogin /usr/local/bin/kubectl-oidc_login
-  rm -rf -- "$tmp"
-  trap - EXIT
-fi
-
-# 4) machine-wide ENFORCED Claude config (org claudeMd; top precedence; NO secrets)
-install -d -m 0755 /etc/claude-code
-install -m 0644 "$HERE/managed-settings.json" /etc/claude-code/managed-settings.json
-log "managed-settings.json -> /etc/claude-code/ (enforced org claudeMd)"
-
-# 5) /etc/skel for NEW accounts: launcher + tmux UX + live-inheritance symlinks.
-#    A symlink placed in /etc/skel is copied (as a symlink) into each new home by
-#    `useradd -m`, so new users' ~/.claude/{skills,rules,...} resolve to the shared
-#    base and pick up the admin's edits live. Secrets + hooks are per-user (written
-#    by the provisioner), NEVER symlinked here.
-install -d -m 0755 /etc/skel
-install -m 0755 "$HERE/skel/start-claude.sh" /etc/skel/start-claude.sh
-install -m 0644 "$HERE/skel/tmux.conf" /etc/skel/.tmux.conf
-install -d -m 0755 /etc/skel/.claude
-for d in skills rules agents commands; do
-  [[ -d "$CONFIG_BASE/$d" ]] && ln -sfn "$CONFIG_BASE/$d" "/etc/skel/.claude/$d"
-done
-log "skel: launcher + tmux + inheritance symlinks (base=$CONFIG_BASE)"
-
-# 6) deploy the roster-driven provisioner to /usr/local/bin (run hourly by
-#    t3-provision-users.timer). Re-deployed here so its logic is reproducible.
-install -m 0755 "$HERE/../t3-provision-users.sh" /usr/local/bin/t3-provision-users
-log "t3-provision-users -> /usr/local/bin/ (roster-driven)"
-
-# 7) harden the admin's unlocked tree: it holds git-crypt-DECRYPTED secrets, so it
-#    must NOT be world-readable — only the admin + code-shared. Without this, ANY
-#    devvm user (even outside code-shared) could read decrypted secrets by path.
-ADMIN_CODE="${ADMIN_CODE:-/home/wizard/code}"
-if [[ -d "$ADMIN_CODE" ]]; then
-  chmod o-rx "$ADMIN_CODE"
-  log "hardened $ADMIN_CODE (o-rx — not world-readable)"
-fi
-
-log "OK (idempotent)"
--- a/scripts/workstation/skel/start-claude.sh
+++ b/scripts/workstation/skel/start-claude.sh
@ -1,42 +0,0 @@
-#!/bin/bash
-# Per-user Claude Code Workstation launcher (devvm). Lands the user in their OWN
-# ~/code clone (NOT a hardcoded /home/wizard/code) and names the Claude session
-# after the tmux session so /resume, the prompt box, and the terminal title line
-# up. Deployed via /etc/skel by setup-devvm.sh, so new accounts get it on
-# `useradd -m`. Existing users are repointed to this during their migration.
-echo ""
-echo "  Welcome, $(id -un)! 🚀"
-echo ""
-echo "  Starting Claude Code in $HOME/code ..."
-echo "  (Right-click for tmux menu, or Ctrl+B then | or - to split)"
-echo ""
-
-name_args=()
-if [ -n "${TMUX:-}" ]; then
-  sess="$(tmux display-message -p '#{session_name}' 2>/dev/null)"
-  [ -n "$sess" ] && name_args=(--name "$sess")
-fi
-
-cd "$HOME/code" 2>/dev/null || cd "$HOME"
-
-# Run Claude Code with the given arguments: use the `claude` found on PATH
-# (installed system-wide by setup-devvm.sh) when there is one, otherwise go
-# through npx. The exit status is that of whichever command ran.
-launch() {
-  if ! command -v claude >/dev/null 2>&1; then
-    npx @anthropic-ai/claude-code "$@"
-    return
-  fi
-  claude "$@"
-}
-
-# Deliberately not `exec` so we can branch on the exit code: clean quit ends the
-# pane (ttyd closes the terminal); a crash drops to a shell so the tmux session
-# isn't destroyed-and-recreated in a ttyd auto-reconnect loop.
-launch --dangerously-skip-permissions --model claude-opus-4-8 "${name_args[@]}"
-code=$?
-[ "$code" -eq 0 ] && exit 0
-
-echo ""
-echo "  claude exited abnormally (status $code). Dropping to a shell — your tmux session is preserved."
-echo "  Re-launch any time with: ~/start-claude.sh"
-echo ""
-exec "${SHELL:-/bin/bash}" -l
--- a/scripts/workstation/skel/tmux.conf
+++ b/scripts/workstation/skel/tmux.conf
@ -1,51 +0,0 @@
-# Workstation base tmux config (deployed to /etc/skel/.tmux.conf by
-# setup-devvm.sh; new accounts inherit it). Uses $HOME (expanded by the shell at
-# run time) so it works for ANY user — never a hardcoded /home/<name>.
-#
-# NOTE: the tmux-resurrect/continuum "persistence" block is owned by the separate
-# terminal-lobby tool, which appends its own managed section + installs tpm. This
-# base file intentionally omits it so a fresh account isn't left with broken
-# `run ~/.tmux/plugins/tpm/tpm` references before terminal-lobby runs.
-
-# Launch the per-user Claude launcher in every new pane/window (lands in ~/code).
-set -g default-command "$HOME/start-claude.sh"
-
-# Mouse support — click panes, drag to resize, scroll with wheel
-set -g mouse on
-
-# Easy splits: Ctrl+b then | for vertical, - for horizontal
-bind | split-window -h -c "#{pane_current_path}"
-bind - split-window -v -c "#{pane_current_path}"
-bind c new-window -c "#{pane_current_path}"
-
-# Right-click context menu — clickable actions popup
-bind -n MouseDown3Pane display-menu -T "#[align=centre]Terminal Menu" -x M -y M \
-    "New Claude"          w "new-window -c '#{pane_current_path}'" \
-    "Split Horizontal"    h "split-window -v -c '#{pane_current_path}'" \
-    "Split Vertical"      v "split-window -h -c '#{pane_current_path}'" \
-    "" \
-    "Shell"               s "split-window -v -c '#{pane_current_path}' /bin/zsh" \
-    "" \
-    "Close Pane"          x "confirm-before -p 'Close pane? (y/n)' kill-pane" \
-    "Close Window"        X "confirm-before -p 'Close window? (y/n)' kill-window" \
-    "" \
-    "Detach"              d "detach-client"
-
-# Clickable [+] button in the status bar — left-click to open the same menu
-set -g status-right '#[fg=black bg=green] [+] #[default] #[fg=cyan]Right-click for menu '
-set -g status-right-length 60
-bind -n MouseDown1StatusRight display-menu -T "#[align=centre]Terminal Menu" -x M -y S \
-    "New Claude"          w "new-window -c '#{pane_current_path}'" \
-    "Split Horizontal"    h "split-window -v -c '#{pane_current_path}'" \
-    "Split Vertical"      v "split-window -h -c '#{pane_current_path}'" \
-    "" \
-    "Shell"               s "split-window -v -c '#{pane_current_path}' /bin/zsh" \
-    "" \
-    "Close Pane"          x "confirm-before -p 'Close pane? (y/n)' kill-pane" \
-    "Close Window"        X "confirm-before -p 'Close window? (y/n)' kill-window"
-
-# Status bar styling + 1-based numbering
-set -g status-style 'bg=colour235 fg=colour136'
-set -g status-left '#[fg=green][#S] '
-set -g base-index 1
-setw -g pane-base-index 1
--- a/scripts/workstation/test_roster_engine.py
+++ b/scripts/workstation/test_roster_engine.py
@ -1,280 +0,0 @@
-"""Unit tests for the pure roster derivation + offboarding-diff engine.
-
-These exercise external behaviour only (parse -> validate -> derive -> diff);
-no host I/O is touched. Mirrors the pure-core pytest style used elsewhere in
-the monorepo. See PRD ViktorBarzin/infra#9 (modules #1 roster engine, #5
-offboarding diff).
-"""
-
-import textwrap
-
-import pytest
-
-import roster_engine as eng
-
-
-def _roster(yaml_text: str) -> "eng.Roster":
-    """Dedent an inline YAML snippet and parse it with `eng.load_roster`."""
-    return eng.load_roster(textwrap.dedent(yaml_text))
-
-
-# --------------------------------------------------------------------------
-# load_roster: parsing + structural validation (module #1)
-# --------------------------------------------------------------------------
-
-
-def test_parses_user_fields_and_tier():
-    """A power-user entry parses with every field set and an empty namespaces tuple."""
-    r = _roster(
-        """
-        users:
-          emo: {authentik_user: emil.barzin, k8s_user: emo, tier: power-user}
-        """
-    )
-    u = r.users["emo"]
-    assert u.os_user == "emo"
-    assert u.authentik_user == "emil.barzin"
-    assert u.k8s_user == "emo"
-    assert u.tier == "power-user"
-    assert u.namespaces == ()
-
-
-def test_namespace_owner_carries_namespaces():
-    """A namespace-owner's namespaces list is kept, as a tuple."""
-    r = _roster(
-        """
-        users:
-          ancamilea: {authentik_user: ancaelena98, k8s_user: anca,
-                      tier: namespace-owner, namespaces: [plotting-book]}
-        """
-    )
-    assert r.users["ancamilea"].namespaces == ("plotting-book",)
-
-
-def test_admin_tier_is_accepted():
-    """`admin` is a valid tier."""
-    r = _roster(
-        "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
-    )
-    assert r.users["wizard"].tier == "admin"
-
-
-def test_rejects_unknown_tier():
-    """A tier outside the valid set raises RosterError mentioning the tier."""
-    with pytest.raises(eng.RosterError, match="tier"):
-        _roster("users: {bob: {authentik_user: b, k8s_user: b, tier: wizard-king}}")
-
-
-def test_rejects_missing_required_field():
-    """Omitting authentik_user raises RosterError naming the missing field."""
-    with pytest.raises(eng.RosterError, match="authentik_user"):
-        _roster("users: {bob: {k8s_user: b, tier: power-user}}")
-
-
-def test_namespace_owner_requires_namespaces():
-    """A namespace-owner without namespaces is rejected."""
-    with pytest.raises(eng.RosterError, match="namespace"):
-        _roster("users: {bob: {authentik_user: b, k8s_user: b, tier: namespace-owner}}")
-
-
-def test_non_namespace_owner_must_not_set_namespaces():
-    """Any other tier that sets namespaces is rejected."""
-    with pytest.raises(eng.RosterError, match="namespace"):
-        _roster(
-            "users: {bob: {authentik_user: b, k8s_user: b, tier: power-user, "
-            "namespaces: [x]}}"
-        )
-
-
-def test_empty_roster_is_valid():
-    """An explicit empty `users` map yields an empty roster."""
-    assert _roster("users: {}").users == {}
-
-
-def test_missing_users_key_is_valid_empty():
-    """A document with no `users` key yields an empty roster."""
-    assert _roster("{}").users == {}
-
-
-# --------------------------------------------------------------------------
-# validate_tiers: roster tier vs live k8s_users (fail-loud, module #1)
-# --------------------------------------------------------------------------
-
-
-def test_validate_ok_when_tiers_match():
-    """No issues are reported when the roster tier equals the k8s_users tier."""
-    r = _roster(
-        "users: {ancamilea: {authentik_user: a, k8s_user: anca, "
-        "tier: namespace-owner, namespaces: [plotting-book]}}"
-    )
-    assert eng.validate_tiers(r, {"anca": "namespace-owner"}) == []
-
-
-def test_validate_flags_tier_mismatch_as_error():
-    """A differing tier yields one "error" issue whose message names both tiers."""
-    # roster says power-user, cluster says namespace-owner -> a real conflict -> ERROR (abort).
-    r = _roster(
-        "users: {ancamilea: {authentik_user: a, k8s_user: anca, tier: power-user}}"
-    )
-    issues = eng.validate_tiers(r, {"anca": "namespace-owner"})
-    assert len(issues) == 1
-    assert issues[0].severity == "error"
-    assert issues[0].os_user == "ancamilea"
-    assert "power-user" in issues[0].message and "namespace-owner" in issues[0].message
-
-
-def test_validate_flags_netnew_absent_as_warn():
-    """A user absent from k8s_users yields one "warn" issue, not an error."""
-    # emo is power-user in the roster but has no k8s_users entry yet. Onboarding the
-    # workstation should still proceed; the kubectl grant is pending -> WARN, not error.
-    r = _roster("users: {emo: {authentik_user: e, k8s_user: emo, tier: power-user}}")
-    issues = eng.validate_tiers(r, {})
-    assert len(issues) == 1
-    assert issues[0].severity == "warn"
-    assert "emo" in issues[0].message and "k8s_users" in issues[0].message
-
-
-def test_validate_skips_admin_tier():
-    """Admin users are never checked against k8s_users."""
-    # wizard (admin) is cluster-admin via a separate mechanism, not k8s_users.
-    r = _roster(
-        "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
-    )
-    assert eng.validate_tiers(r, {}) == []
-
-
-def test_has_blocking_errors_distinguishes_mismatch_from_absent():
-    """A tier mismatch blocks; an absent k8s_users entry does not."""
-    mismatch = _roster(
-        "users: {ancamilea: {authentik_user: a, k8s_user: anca, tier: power-user}}"
-    )
-    absent = _roster(
-        "users: {emo: {authentik_user: e, k8s_user: emo, tier: power-user}}"
-    )
-    assert (
-        eng.has_blocking_errors(
-            eng.validate_tiers(mismatch, {"anca": "namespace-owner"})
-        )
-        is True
-    )
-    assert eng.has_blocking_errors(eng.validate_tiers(absent, {})) is False
-
-
-# --------------------------------------------------------------------------
-# derive_desired_state: accounts, sticky ports, ttyd map, dispatch (module #1)
-# --------------------------------------------------------------------------
-
-THREE = """
-    users:
-      wizard:    {authentik_user: vbarzin,     k8s_user: wizard, tier: admin}
-      emo:       {authentik_user: emil.barzin, k8s_user: emo,    tier: power-user}
-      ancamilea: {authentik_user: ancaelena98, k8s_user: anca,   tier: namespace-owner, namespaces: [plotting-book]}
-"""
-
-LIVE_PORTS = {"wizard": 3773, "emo": 3774, "ancamilea": 3775}
-
-
-def test_derive_preserves_existing_sticky_ports():
-    """Users that already have a port keep exactly that port."""
-    ds = eng.derive_desired_state(_roster(THREE), LIVE_PORTS)
-    assert ds.ports == {"wizard": 3773, "emo": 3774, "ancamilea": 3775}
-
-
-def test_derive_allocates_next_free_port_for_new_user():
-    """New users receive the next free ports after the ones already taken."""
-    ds = eng.derive_desired_state(_roster(THREE), {"wizard": 3773})
-    # emo + ancamilea are new -> next free from 3773 skipping the used 3773
-    assert ds.ports["wizard"] == 3773
-    assert sorted([ds.ports["emo"], ds.ports["ancamilea"]]) == [3774, 3775]
-
-
-def test_derive_dispatch_keyed_by_authentik_user():
-    """The dispatch table maps authentik_user to that user's os_user and port."""
-    ds = eng.derive_desired_state(_roster(THREE), LIVE_PORTS)
-    assert ds.dispatch == {
-        "vbarzin": {"os_user": "wizard", "port": 3773},
-        "emil.barzin": {"os_user": "emo", "port": 3774},
-        "ancaelena98": {"os_user": "ancamilea", "port": 3775},
-    }
-
-
-def test_derive_ttyd_map_has_one_mapping_per_user():
-    """Ignoring comments and blanks, the ttyd map has one `authentik=os` line per user."""
-    ds = eng.derive_desired_state(_roster(THREE), LIVE_PORTS)
-    body = [
-        line
-        for line in ds.ttyd_user_map.splitlines()
-        if line.strip() and not line.lstrip().startswith("#")
-    ]
-    assert set(body) == {"vbarzin=wizard", "emil.barzin=emo", "ancaelena98=ancamilea"}
-
-
-def test_derive_accounts_assign_tier_groups_and_shell():
-    """Admins get the admin groups, other tiers none; the shell is /bin/zsh."""
-    ds = eng.derive_desired_state(_roster(THREE), LIVE_PORTS)
-    assert ds.accounts["wizard"].groups == ("code-shared", "docker", "sudo")
-    assert ds.accounts["emo"].groups == ()
-    assert ds.accounts["ancamilea"].groups == ()
-    assert ds.accounts["emo"].shell == "/bin/zsh"
-
-
-def test_derive_is_deterministic():
-    """Deriving twice from the same inputs gives equal results."""
-    r = _roster(THREE)
-    assert eng.derive_desired_state(r, LIVE_PORTS) == eng.derive_desired_state(
-        r, LIVE_PORTS
-    )
-
-
-# --------------------------------------------------------------------------
-# groups_to_add: the additive-only invariant (module #1)
-# --------------------------------------------------------------------------
-
-
-def test_groups_to_add_returns_only_missing():
-    """Only the desired groups the user lacks are returned, sorted."""
-    assert eng.groups_to_add(("sudo", "docker", "code-shared"), ("docker",)) == [
-        "code-shared",
-        "sudo",
-    ]
-
-
-def test_groups_to_add_never_proposes_removal_of_extra_groups():
-    """Extra current groups produce no output: the helper only ever adds."""
-    # emo currently has code-shared+docker (legacy). A power-user reconcile wants
-    # no groups -> must NOT strip anything (additive-only invariant).
-    assert eng.groups_to_add((), ("code-shared", "docker")) == []
-
-
-def test_groups_to_add_idempotent_when_all_present():
-    """Nothing is returned when every desired group is already present."""
-    assert eng.groups_to_add(("sudo",), ("sudo", "docker")) == []
-
-
-# --------------------------------------------------------------------------
-# offboarding diff: staged plan, destructive never auto (module #5)
-# --------------------------------------------------------------------------
-
-
-def test_to_deprovision_is_old_minus_new():
-    """A user present in the old roster but not the new one is reported."""
-    old = _roster(THREE)
-    new = _roster(
-        """
-        users:
-          wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}
-          emo:    {authentik_user: emil.barzin, k8s_user: emo, tier: power-user}
-        """
-    )
-    assert eng.to_deprovision(old, new) == ["ancamilea"]
-
-
-def test_to_deprovision_empty_when_nothing_removed():
-    """Comparing a roster with itself reports nobody."""
-    r = _roster(THREE)
-    assert eng.to_deprovision(r, r) == []
-
-
-def test_offboard_plan_reversible_cut_targets_exactly_the_removed_user():
-    """Without the destructive flag, the plan covers exactly the removed users and is all reversible."""
-    old = _roster(THREE)
-    new = _roster(
-        "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
-    )
-    plan = eng.offboard_plan(old, new, include_destructive=False)
-    cut_users = {a.os_user for a in plan}
-    assert cut_users == {"emo", "ancamilea"}
-    assert all(a.reversible for a in plan)
-
-
-def test_offboard_plan_excludes_destructive_by_default():
-    """With include_destructive=False no `userdel_archive` action appears."""
-    old = _roster(THREE)
-    new = _roster(
-        "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
-    )
-    auto = eng.offboard_plan(old, new, include_destructive=False)
-    assert all(a.kind != "userdel_archive" for a in auto)
-
-
-def test_offboard_plan_includes_destructive_only_when_explicitly_requested():
-    """With include_destructive=True each removed user gets a non-reversible `userdel_archive`."""
-    old = _roster(THREE)
-    new = _roster(
-        "users: {wizard: {authentik_user: vbarzin, k8s_user: wizard, tier: admin}}"
-    )
-    full = eng.offboard_plan(old, new, include_destructive=True)
-    destructive = [a for a in full if a.kind == "userdel_archive"]
-    assert {a.os_user for a in destructive} == {"emo", "ancamilea"}
-    assert all(not a.reversible for a in destructive)