From 43e4f3f68e279db6aeea926de161ffd5aba08ac5 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 25 Apr 2026 15:47:30 +0000 Subject: [PATCH] immich: migrate PostgreSQL off NFS to proxmox-lvm-encrypted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live PG data moves to a 10Gi LUKS-encrypted RWO PVC. WAL fsync per commit on NFS contributed to the 2026-04-22 NFS writeback storm (2h43m recovery, 3 of 4 nodes hard-reset). Backups remain on NFS (append-only, NFS-tolerant). The init container that writes postgresql.override.conf is now gated on PG_VERSION presence — on a fresh PVC the file would otherwise make initdb refuse the non-empty PGDATA. First boot skips the override and initdb's cleanly; second boot (after a forced restart) writes the override so vchord/vectors/pg_prewarm load before the dump restore. Idempotent on initialised PVCs. Migration executed: pg_dumpall (1.9GB) → restore on encrypted PVC → REINDEX clip_index/face_index → 111,843 assets verified, external HTTP 200, all 10 extensions present (vector minor 0.8.0→0.8.1 only). LV created on PVE host, picked up by lvm-pvc-snapshot. See docs/plans/2026-04-25-nfs-hostile-migration-{design,plan}.md. Phase 2 (Vault Raft) follows under code-gy7h. 
Closes: code-ahr7 Co-Authored-By: Claude Opus 4.7 --- ...2026-04-25-nfs-hostile-migration-design.md | 119 ++++++++++++++++++ .../2026-04-25-nfs-hostile-migration-plan.md | 116 +++++++++++++++++ stacks/immich/main.tf | 33 ++++- 3 files changed, 267 insertions(+), 1 deletion(-) create mode 100644 docs/plans/2026-04-25-nfs-hostile-migration-design.md create mode 100644 docs/plans/2026-04-25-nfs-hostile-migration-plan.md diff --git a/docs/plans/2026-04-25-nfs-hostile-migration-design.md b/docs/plans/2026-04-25-nfs-hostile-migration-design.md new file mode 100644 index 00000000..0b74d050 --- /dev/null +++ b/docs/plans/2026-04-25-nfs-hostile-migration-design.md @@ -0,0 +1,119 @@ +# NFS-Hostile Workload Migration — Design + +**Date**: 2026-04-25 +**Author**: Viktor (with Claude) +**Status**: Phase 1 done, Phase 2 in progress +**Beads**: code-gy7h (Vault), code-ahr7 (Immich PG) + +## Problem + +The 2026-04-22 Vault Raft leader deadlock (post-mortem +`2026-04-22-vault-raft-leader-deadlock.md`) traced to NFS client +writeback stalls poisoning kernel state. Recovery took 2h43m and +required hard-resetting 3 of 4 cluster VMs. Two workload classes on +NFS are NFS-hostile per the criteria in +`infra/.claude/CLAUDE.md` ("Critical services MUST NOT use NFS"): + +1. **Postgres with WAL fsync per commit** — Immich primary +2. **Vault Raft consensus log** — fsync per append-entry, 3 replicas + +Everything else on NFS (47 PVCs, ~455 GiB) is correctly placed: +RWX media libraries, append-only backups, ML caches. + +## Decision + +Migrate exactly those two workload classes to +`proxmox-lvm-encrypted` (LUKS2 LVM-thin via Proxmox CSI). No iSCSI, +no RWX media migration, no backup-target migration. + +## Rationale + +- Block storage decouples PG / Raft fsync from NFS client kernel + state. Failure mode that triggered the post-mortem cannot recur for + these workloads. 
+- `proxmox-lvm-encrypted` is the documented default for sensitive data + (`infra/.claude/CLAUDE.md` storage decision rule). It already backs + ~28 PVCs across the cluster — pattern is proven. +- Existing nightly `lvm-pvc-snapshot` PVE host script (03:00, 7-day + retention) auto-picks-up new PVCs via thin snapshots — no extra + backup wiring needed for the live data side. +- LUKS2 satisfies "encrypted at rest for sensitive data" requirement. + +## Out of scope + +- iSCSI evaluation (already retired 2026-04-13). +- RWX media (Immich library, music, ebooks) — correct placement. +- Backup target PVCs (`*-backup` on NFS) — append-only, NFS-tolerant. +- Prometheus 200 GiB — already on `proxmox-lvm`. + +## Pattern per workload + +### Immich PG (single replica, Deployment, Recreate strategy) + +- Add new RWO PVC on `proxmox-lvm-encrypted`. +- Quiesce app pods (server + ML + frame). +- `pg_dumpall` from running NFS pod → local file. +- Swap deployment `claim_name` → encrypted PVC. +- PG bootstraps fresh on empty PVC; restore dump. +- REINDEX vector indexes (`clip_index`, `face_index`). +- Backup CronJob keeps writing to NFS module (correct: append-only). + +### Vault Raft (3 replicas, StatefulSet, helm-managed) + +- Change `dataStorage.storageClass` and `auditStorage.storageClass` + from `nfs-proxmox` → `proxmox-lvm-encrypted`. +- StatefulSet `volumeClaimTemplates` is immutable → use + `kubectl delete sts vault --cascade=orphan` then re-apply (memory + pattern for VCT swaps). +- Per-pod rolling: delete pod + PVCs, controller recreates with new + template. Auto-unseal sidecar handles unseal; raft `retry_join` + rejoins cluster. +- 24h validation window between pods. Migrate non-leader pods first; + step-down current leader before migrating it last. +- Backup target (`vault-backup-host` on NFS) stays on NFS. + +## Risks and rollbacks + +### Immich PG + +- pg_dumpall captures schema + data, not file-level state. 
Vector + index versions matter (vchord 0.3.0 unchanged; vector 0.8.0 → + 0.8.1 is a minor automatic bump on `CREATE EXTENSION` — confirmed + benign). Rollback: revert `claim_name`, scale apps; old NFS PVC + retained for 7 days post-migration. + +### Vault Raft + +- Cluster keeps quorum from 2 standby replicas while one pod is + swapped. Migrating the leader last avoids quorum churn. +- Recovery anchor: pre-migration `vault operator raft snapshot save` + + nightly `vault-raft-backup` CronJob. RTO < 1h via snapshot + restore. + +## Init container chicken-and-egg (Immich PG, discovered during execution) + +The pre-existing `write-pg-override-conf` init container on the +Immich PG deployment writes `postgresql.override.conf` directly to +`PGDATA`. On a populated NFS PVC this was a no-op (init was already +run). On the fresh encrypted PVC, the file made `initdb` refuse the +non-empty directory and the pod CrashLoopBackOff'd. + +Resolution: gate the init container on `PG_VERSION` presence — first +boot skips the override write, PG `initdb`s cleanly; force a pod +restart and the second boot writes the override and PG loads +`vchord` / `vectors` / `pg_prewarm` before the dump restore. Change +is permanent and idempotent (correct on both fresh and initialised +PVCs). One restart pre-migration only. + +## Verification + +End-to-end DONE when: + +- `kubectl get pvc -A | grep nfs-proxmox` returns only the + `vault-backup-host` PVC (or zero, if backup PVC moves elsewhere). +- `vault operator raft list-peers` shows 3 voters on + `proxmox-lvm-encrypted`, leader elected. +- Immich PG `\dx` matches pre-migration extensions (vector minor + drift OK). +- `lvm-pvc-snapshot` captures new LVs in next 03:00 run. +- 7 consecutive days of clean backup CronJob runs and no new alerts. 
diff --git a/docs/plans/2026-04-25-nfs-hostile-migration-plan.md b/docs/plans/2026-04-25-nfs-hostile-migration-plan.md new file mode 100644 index 00000000..e36041c3 --- /dev/null +++ b/docs/plans/2026-04-25-nfs-hostile-migration-plan.md @@ -0,0 +1,116 @@ +# NFS-Hostile Workload Migration — Plan + +**Date**: 2026-04-25 +**Design**: `2026-04-25-nfs-hostile-migration-design.md` +**Beads**: code-gy7h (Vault, epic), code-ahr7 (Immich PG) + +## Phase 1 — Immich PG (DONE 2026-04-25) + +| Step | Done | +|---|---| +| Snapshot extensions + row counts to `/tmp/immich-pre-migration-*` | ✓ | +| Quiesce `immich-server` + `immich-machine-learning` + `immich-frame` | ✓ | +| `pg_dumpall` → `/tmp/immich-pre-migration-<timestamp>.sql` (1.9 GB) | ✓ | +| Add `kubernetes_persistent_volume_claim.immich_postgresql_encrypted` (10Gi, autoresize 20Gi cap) | ✓ | +| Swap `claim_name` at `infra/stacks/immich/main.tf` deployment | ✓ | +| Patch init container to gate on `PG_VERSION` (chicken-and-egg fix) | ✓ | +| Force pod restart so override.conf gets written | ✓ | +| Restore dump | ✓ | +| `REINDEX clip_index`, `REINDEX face_index` | ✓ | +| Scale apps back up | ✓ | +| Verify: `\dx`, row counts (~111k assets), HTTP 200 internal/external | ✓ | +| LV present on PVE host (`vm-9999-pvc-...`) | ✓ | + +### Phase 1 follow-ups (not blocking) + +- Old NFS PVC `immich-postgresql-data-host` retained 7 days for + rollback. After 2026-05-02: remove `module.nfs_postgresql_host` + from `infra/stacks/immich/main.tf` and the CronJob's reference. +- Backup CronJob (`postgresql-backup`) still writes to the NFS + module. After cleanup, point it at a dedicated backup PVC or to + the existing `immich-backups` NFS share. + +## Phase 2 — Vault Raft (IN PROGRESS) + +### Pre-flight (T-0) + +- [ ] Verify all 3 vault pods sealed=false, raft healthy. +- [ ] Take fresh `vault operator raft snapshot save` (anchor). +- [ ] Optional: scale ESO to 0 to reduce mid-migration churn. 
+- [ ] Step-down leader if it's not vault-0 (current leader: vault-2 — needs step-down). +- [ ] Verify thin pool headroom on PVE. + +### Step 0 — Helm values + StatefulSet swap + +- [ ] Edit `infra/stacks/vault/main.tf`: change + `dataStorage.storageClass` and `auditStorage.storageClass` + from `nfs-proxmox` → `proxmox-lvm-encrypted`. +- [ ] `kubectl -n vault delete sts vault --cascade=orphan` (StatefulSet + `volumeClaimTemplates` is immutable; orphan keeps pods+PVCs + alive while we recreate the controller with the new template). +- [ ] `tg apply` → recreates StatefulSet with new VCT. Existing pods + still on old NFS PVCs. + +### Step 1 — Roll vault-2 (T+0) + +- [ ] `kubectl -n vault delete pod vault-2 --grace-period=30` +- [ ] `kubectl -n vault delete pvc data-vault-2 audit-vault-2` +- [ ] STS controller recreates pod; new PVCs auto-provision on + `proxmox-lvm-encrypted`. +- [ ] Wait Ready; auto-unseal sidecar unseals; `retry_join` rejoins + raft cluster. +- [ ] Verify: `vault operator raft list-peers` shows 3 voters, + vault-2 reachable. + +### Step 2 — 24h soak + +Wait 24h. Confirm no Raft alarms, no Vault errors, downstream +healthy. Rollback window for vault-2 closes here. + +### Step 3 — Roll vault-1 (T+24h) + +Same shape as Step 1. + +### Step 4 — 24h soak + +### Step 5 — Roll vault-0 (T+48h) + +- [ ] If vault-0 is leader at this point, step-down first: + `kubectl -n vault exec vault-0 -- vault operator step-down`. +- [ ] Then delete pod + PVCs as Step 1. + +### Step 6 — Cleanup + +- [ ] Re-enable ESO if disabled: `kubectl -n external-secrets scale deploy external-secrets --replicas=2`. +- [ ] Verify `kubectl get pvc -A | grep nfs-proxmox` returns zero + live-data results (only backup-host should remain, if any). +- [ ] If no consumers: remove inline `kubernetes_storage_class.nfs_proxmox` + from `infra/stacks/vault/main.tf` (lines 29-42). + +### Verify (after each pod, then again at the end) + +- [ ] All 3 PVC pairs on `proxmox-lvm-encrypted`. 
+- [ ] `vault operator raft autopilot state` healthy=true. +- [ ] External `https://vault.viktorbarzin.me/v1/sys/health` = 200. +- [ ] `vault-raft-backup` CronJob completes overnight (writes to NFS, + stays NFS — correct). +- [ ] No Prometheus alerts (`VaultSealed`, `VaultLeaderless`). + +## Phase 3 — Released-PV cleanup (FOLLOW-UP) + +After Phase 1+2 land cleanly, ~30 PVs in `Released` hold dead LVs. +Reclaim by: + +1. List Released PVs, confirm LV exists on PVE. +2. `kubectl delete pv <pv-name>` (CSI removes underlying LV when PV is + orphaned with `Retain` reclaim policy and no PVC reference). +3. If LV survives: manual `lvremove pve/vm-9999-pvc-<uuid>`. + +## Rollback + +| Phase | Trigger | Action | +|---|---|---| +| 1 | Immich UI broken / data loss | Revert `claim_name`; restore from `/tmp/immich-pre-migration-*.sql` to old NFS PVC | +| 2 (mid-rolling) | Single pod broken | Delete the encrypted PVC; recreate with NFS SC explicitly; cluster keeps quorum from 2 healthy pods | +| 2 (post-rolling, raft corrupt) | Cluster-wide failure | `vault operator raft snapshot restore <snapshot-file>` | +| Catastrophic | All Vault data lost | Restore from latest `/srv/nfs/vault-backup/` snapshot via CronJob output | diff --git a/stacks/immich/main.tf b/stacks/immich/main.tf index 8cf5162a..9775a87e 100644 --- a/stacks/immich/main.tf +++ b/stacks/immich/main.tf @@ -85,6 +85,30 @@ module "nfs_postgresql_host" { nfs_path = "/srv/nfs/immich/postgresql" } +# Migrated 2026-04-25: PG live data moved off NFS to LUKS-encrypted block. +# WAL fsync per commit on NFS contributed to the 2026-04-22 NFS writeback storm +# (see post-mortems/2026-04-22-vault-raft-leader-deadlock.md). +# Backup CronJob still writes to module.nfs_postgresql_host (NFS append-only). 
+resource "kubernetes_persistent_volume_claim" "immich_postgresql_encrypted" { + wait_until_bound = false + metadata { + name = "immich-postgresql-data-encrypted" + namespace = kubernetes_namespace.immich.metadata[0].name + annotations = { + "resize.topolvm.io/threshold" = "80%" + "resize.topolvm.io/increase" = "100%" + "resize.topolvm.io/storage_limit" = "20Gi" + } + } + spec { + access_modes = ["ReadWriteOnce"] + storage_class_name = "proxmox-lvm-encrypted" + resources { + requests = { storage = "10Gi" } + } + } +} + module "nfs_ml_cache_host" { source = "../../modules/kubernetes/nfs_volume" name = "immich-ml-cache-host" @@ -462,6 +486,13 @@ resource "kubernetes_deployment" "immich-postgres" { name = "write-pg-override-conf" image = "busybox:1.36" command = ["sh", "-c", <<-EOT + # Skip write on uninitialised PGDATA — initdb refuses non-empty dirs. + # On first boot the override is absent; trigger a pod restart after + # initdb completes so the override is applied before extension load. + if [ ! -f /data/PG_VERSION ]; then + echo "PGDATA uninitialised, skipping override conf (will write on next pod start)" + exit 0 + fi cat > /data/postgresql.override.conf <<'PGCONF' # Immich vector search performance tuning shared_buffers = 2048MB @@ -481,7 +512,7 @@ resource "kubernetes_deployment" "immich-postgres" { volume { name = "postgresql-persistent-storage" persistent_volume_claim { - claim_name = module.nfs_postgresql_host.claim_name + claim_name = kubernetes_persistent_volume_claim.immich_postgresql_encrypted.metadata[0].name } } }