diff --git a/.claude/reference/proxmox-inventory.md b/.claude/reference/proxmox-inventory.md index 1d1ab9bb..60dfab0b 100644 --- a/.claude/reference/proxmox-inventory.md +++ b/.claude/reference/proxmox-inventory.md @@ -122,8 +122,9 @@ Channel 3: A4 [32G] ──── A8 [32G] ──── A12[ 8G ] = 72 GB | `offsite-sync-backup.timer` | Timer | Daily 06:00 | Two-step rsync to Synology (sda + NFS via inotify) | | `nfs-change-tracker.service` | Service | Continuous | inotifywait on `/srv/nfs` + `/srv/nfs-ssd`, logs to `/mnt/backup/.nfs-changes.log` | -## GPU Node (k8s-node1) -- **VMID**: 201, **PCIe**: `0000:06:00.0` (NVIDIA Tesla T4) -- **Taint**: `nvidia.com/gpu=true:NoSchedule`, **Label**: `gpu=true` -- GPU workloads need: `node_selector = { "gpu": "true" }` + nvidia toleration -- Taint applied via `null_resource.gpu_node_taint` in `modules/kubernetes/nvidia/main.tf` +## GPU Node (currently k8s-node1) +- **VMID**: 201, **PCIe**: `0000:06:00.0` (NVIDIA Tesla T4) — physical passthrough, no Terraform pin +- **Taint**: `nvidia.com/gpu=true:PreferNoSchedule` (applied dynamically to every NFD-discovered GPU node) +- **Label**: `nvidia.com/gpu.present=true` (auto-applied by gpu-feature-discovery; also `feature.node.kubernetes.io/pci-10de.present=true` from NFD) +- GPU workloads need: `node_selector = { "nvidia.com/gpu.present" : "true" }` + nvidia toleration +- Taint applied via `null_resource.gpu_node_config` in `stacks/nvidia/modules/nvidia/main.tf`; node discovery keyed on the NFD `pci-10de.present` label so the taint follows the card to whichever host is carrying it diff --git a/.woodpecker/default.yml b/.woodpecker/default.yml index 9e0d1fe5..fa6ffc4a 100644 --- a/.woodpecker/default.yml +++ b/.woodpecker/default.yml @@ -128,7 +128,7 @@ steps: # ── Pre-warm provider cache ── - | if [ -s .platform_apply ] || [ -s .app_apply ]; then - FIRST_STACK=$(head -1 .platform_apply .app_apply 2>/dev/null | head -1) + FIRST_STACK=$(cat .platform_apply .app_apply 2>/dev/null | head -1) if [ -n "$FIRST_STACK" ]; then echo "Pre-warming provider cache from stacks/$FIRST_STACK..." cd "stacks/$FIRST_STACK" && terragrunt init --terragrunt-non-interactive -input=false 2>&1 | tail -3 && cd ../.. @@ -150,7 +150,7 @@ steps: if echo "$OUTPUT" | grep -q "is locked by"; then echo "[$stack] SKIPPED (locked by another session)" else - echo "$OUTPUT" | tail -5 + echo "$OUTPUT" | tail -50 echo "[$stack] FAILED (exit $EXIT)" FAILED_PLATFORM_STACKS="$FAILED_PLATFORM_STACKS $stack" fi @@ -178,7 +178,7 @@ steps: if echo "$OUTPUT" | grep -q "is locked by"; then echo "[$stack] SKIPPED (locked by another session)" else - echo "$OUTPUT" | tail -5 + echo "$OUTPUT" | tail -50 echo "[$stack] FAILED (exit $EXIT)" FAILED_APP_STACKS="$FAILED_APP_STACKS $stack" fi diff --git a/AGENTS.md b/AGENTS.md index 0f1794f1..5f9c0839 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -163,10 +163,10 @@ lifecycle { ## Infrastructure - **Proxmox**: 192.168.1.127 (Dell R730, 22c/44t, 142GB RAM) - **Nodes**: k8s-master (10.0.20.100), node1 (GPU, Tesla T4), node2-4 -- **GPU**: `node_selector = { "gpu": "true" }` + toleration `nvidia.com/gpu` +- **GPU**: `node_selector = { "nvidia.com/gpu.present" : "true" }` + toleration `nvidia.com/gpu`. The label is auto-applied by NFD/gpu-feature-discovery on any node with an NVIDIA PCI device — nothing is hostname-pinned, so the GPU card can move between nodes without Terraform edits. - **Pull-through cache**: 10.0.20.10 — docker.io (:5000), ghcr.io (:5010) only. 
Caches stale manifests for :latest tags — use versioned tags or pre-pull with `ctr --hosts-dir ''` to bypass. - **pfSense**: 10.0.20.1 (gateway, firewall, DNS forwarding) -- **MySQL InnoDB Cluster**: 1 instance on proxmox-lvm (scaled from 3 — only Uptime Kuma + phpIPAM remain), PriorityClass `mysql-critical` + PDB, anti-affinity excludes k8s-node1 (GPU node) +- **MySQL InnoDB Cluster**: 1 instance on proxmox-lvm (scaled from 3 — only Uptime Kuma + phpIPAM remain), PriorityClass `mysql-critical` + PDB, anti-affinity excludes any GPU node (`nvidia.com/gpu.present=true`) so MySQL moves off the GPU host automatically if the card is relocated - **SMTP**: `var.mail_host` port 587 STARTTLS (not internal svc address — cert mismatch) ## Contributor Onboarding diff --git a/ci/Dockerfile b/ci/Dockerfile index ea534d6e..2a02b586 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -1,12 +1,11 @@ FROM alpine:3.20 -# Rebuild 2026-04-19 — previous :latest index referenced missing blobs (404 on 98f718c8 / 27d5ab83) - # Pin versions to match CI requirements ARG TERRAFORM_VERSION=1.5.7 ARG TERRAGRUNT_VERSION=0.99.4 ARG SOPS_VERSION=3.9.4 ARG KUBECTL_VERSION=1.34.0 +ARG VAULT_VERSION=1.18.1 # Install system packages (single layer) RUN apk add --no-cache \ @@ -36,6 +35,16 @@ RUN curl -fsSL "https://dl.k8s.io/release/v${KUBECTL_VERSION}/bin/linux/amd64/ku -o /usr/local/bin/kubectl \ && chmod +x /usr/local/bin/kubectl +# Vault CLI — required by scripts/tg for Tier 1 stack PG credential reads +# and Tier 0 advisory locks. Pinned to server version (1.18.1). Without this +# the CI pipeline surfaces the misleading "Cannot read PG credentials" error +# because scripts/tg swallows stderr ("vault: not found"). +RUN curl -fsSL "https://releases.hashicorp.com/vault/${VAULT_VERSION}/vault_${VAULT_VERSION}_linux_amd64.zip" \ + -o /tmp/vault.zip \ + && unzip /tmp/vault.zip -d /usr/local/bin/ \ + && rm /tmp/vault.zip \ + && vault version + # Provider cache directory (shared across stacks) ENV TF_PLUGIN_CACHE_DIR=/tmp/terraform-plugin-cache ENV TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1 diff --git a/config.tfvars b/config.tfvars index 6db48575..790a48ae 100644 Binary files a/config.tfvars and b/config.tfvars differ diff --git a/docs/architecture/backup-dr.md b/docs/architecture/backup-dr.md index 2c992c20..b307ec6c 100644 --- a/docs/architecture/backup-dr.md +++ b/docs/architecture/backup-dr.md @@ -217,7 +217,7 @@ graph LR Native LVM thin snapshots provide crash-consistent point-in-time recovery for 62 Proxmox CSI PVCs. These are CoW snapshots — instant creation, minimal overhead, sharing the thin pool's free space. -**Script**: `/usr/local/bin/lvm-pvc-snapshot` on PVE host (source: `infra/scripts/lvm-pvc-snapshot`) +**Script**: `/usr/local/bin/lvm-pvc-snapshot` on PVE host (source: `infra/scripts/lvm-pvc-snapshot.sh`). Deploy: `scp infra/scripts/lvm-pvc-snapshot.sh root@192.168.1.127:/usr/local/bin/lvm-pvc-snapshot` **Schedule**: Daily 03:00 via systemd timer, 7-day retention **Discovery**: Auto-discovers PVC LVs matching `vm-*-pvc-*` pattern in VG `pve` thin pool `data` @@ -226,7 +226,7 @@ Native LVM thin snapshots provide crash-consistent point-in-time recovery for 62 - They already have app-level dumps (Layer 2) - Including them causes ~36% write amplification; excluding them reduces overhead to ~0% -**Monitoring**: Pushes metrics to Pushgateway via NodePort (30091). Alerts: `LVMSnapshotStale` (>24h), `LVMSnapshotFailing`, `LVMThinPoolLow` (<15% free). 
+**Monitoring**: Pushes metrics to Pushgateway via NodePort (30091). Alerts: `LVMSnapshotStale` (>30h since last run + 30m `for:`), `LVMSnapshotFailing`, `LVMThinPoolLow` (<15% free). **Restore**: `lvm-pvc-snapshot restore ` — auto-discovers K8s workload, scales down, swaps LVs, scales back up. See `docs/runbooks/restore-lvm-snapshot.md`. @@ -234,7 +234,7 @@ Native LVM thin snapshots provide crash-consistent point-in-time recovery for 62 **Backup disk**: sda (1.1TB RAID1 SAS) → VG `backup` → LV `data` → ext4 → mounted at `/mnt/backup` on PVE host. Dedicated backup disk, independent of live storage. -**Script**: `/usr/local/bin/daily-backup` on PVE host (source: `infra/scripts/daily-backup`) +**Script**: `/usr/local/bin/daily-backup` on PVE host (source: `infra/scripts/daily-backup.sh`) **Schedule**: Daily 05:00 via systemd timer **Retention**: 4 weekly versions (weeks 0-3 via `--link-dest` hardlink dedup) @@ -673,7 +673,7 @@ module "nfs_backup" { │ ~~CloudSyncNeverRun~~ REMOVED (TrueNAS decommissioned) │ │ ~~CloudSyncFailing~~ REMOVED (TrueNAS decommissioned) │ │ VaultwardenIntegrityFail integrity_ok == 0 │ -│ LVMSnapshotStale > 24h since last snapshot │ +│ LVMSnapshotStale > 30h since last snapshot │ │ LVMSnapshotFailing snapshot creation failed │ │ LVMThinPoolLow < 15% free space in thin pool │ │ WeeklyBackupStale > 8d since last success │ @@ -692,6 +692,16 @@ module "nfs_backup" { - ~~CloudSync monitor~~: Removed (TrueNAS decommissioned) - Vaultwarden integrity: Pushes `vaultwarden_sqlite_integrity_ok` hourly +**Pushgateway persistence**: The Pushgateway is configured with +`--persistence.file=/data/pushgateway.bin --persistence.interval=1m` +on a 2Gi `proxmox-lvm-encrypted` PVC (helm values: +`prometheus-pushgateway.persistentVolume`). Without this, every pod +restart drops in-memory metrics. Once-per-day pushers (offsite-sync, +weekly backup) are otherwise invisible for up to 24h if the +Pushgateway restarts between pushes — which is exactly what triggered +the 2026-04-22 backup_offsite_sync FAIL (node3 kubelet hiccup at +11:42 UTC terminated the Pushgateway 8h after the 03:12 UTC push). + **Alert routing**: - All backup alerts → Slack `#infra-alerts` - Vaultwarden integrity fail → Slack `#infra-critical` (immediate action required) diff --git a/docs/architecture/compute.md b/docs/architecture/compute.md index bf456030..cc9c4786 100644 --- a/docs/architecture/compute.md +++ b/docs/architecture/compute.md @@ -18,7 +18,7 @@ graph TB subgraph Proxmox["Proxmox VE"] direction TB MASTER["VM 200: k8s-master
8c / 32GB<br/>10.0.20.100"] - NODE1["VM 201: k8s-node1<br/>16c / 32GB<br/>GPU Passthrough<br/>nvidia.com/gpu=true:NoSchedule"] + NODE1["VM 201: k8s-node1<br/>16c / 32GB<br/>GPU Passthrough<br/>nvidia.com/gpu=true:PreferNoSchedule"] NODE2["VM 202: k8s-node2<br/>8c / 32GB"] NODE3["VM 203: k8s-node3<br/>8c / 32GB"] NODE4["VM 204: k8s-node4
8c / 32GB"] @@ -72,7 +72,7 @@ graph TB | VM | VMID | vCPUs | RAM | Network | Role | Taints | |----|------|-------|-----|---------|------|--------| | k8s-master | 200 | 8 | 32GB | vmbr1:vlan20 (10.0.20.100) | Control Plane | `node-role.kubernetes.io/control-plane:NoSchedule` | -| k8s-node1 | 201 | 16 | 32GB | vmbr1:vlan20 | GPU Worker | `nvidia.com/gpu=true:NoSchedule` | +| k8s-node1 | 201 | 16 | 32GB | vmbr1:vlan20 | GPU Worker | `nvidia.com/gpu=true:PreferNoSchedule` (applied dynamically to whichever node carries the GPU) | | k8s-node2 | 202 | 8 | 32GB | vmbr1:vlan20 | Worker | None | | k8s-node3 | 203 | 8 | 32GB | vmbr1:vlan20 | Worker | None | | k8s-node4 | 204 | 8 | 32GB | vmbr1:vlan20 | Worker | None | @@ -85,9 +85,9 @@ graph TB |-----------|-------| | Device | NVIDIA Tesla T4 (16GB GDDR6) | | PCIe Address | 0000:06:00.0 | -| Assigned VM | VMID 201 (k8s-node1) | -| Node Label | `gpu=true` | -| Node Taint | `nvidia.com/gpu=true:NoSchedule` | +| Assigned VM | VMID 201 (k8s-node1) — physical location only, no Terraform pin | +| Node Label | `nvidia.com/gpu.present=true` (auto-applied by gpu-feature-discovery; also `feature.node.kubernetes.io/pci-10de.present=true` from NFD) | +| Node Taint | `nvidia.com/gpu=true:PreferNoSchedule` (applied by `null_resource.gpu_node_config` to every NFD-tagged GPU node) | | Driver | NVIDIA GPU Operator | | Resource Name | `nvidia.com/gpu` | @@ -273,8 +273,8 @@ resources { ### GPU Resource Management **Node Selection**: GPU pods must: -1. Tolerate `nvidia.com/gpu=true:NoSchedule` taint -2. Select `gpu=true` label +1. Tolerate `nvidia.com/gpu=true:PreferNoSchedule` taint +2. Select `nvidia.com/gpu.present=true` label (auto-applied by gpu-feature-discovery wherever the card is) 3. Request `nvidia.com/gpu: 1` resource **Example**: @@ -286,7 +286,7 @@ spec: value: "true" effect: NoSchedule nodeSelector: - gpu: "true" + nvidia.com/gpu.present: "true" containers: - name: app resources: @@ -294,6 +294,14 @@ spec: nvidia.com/gpu: 1 ``` +**Portability**: No Terraform code references a specific hostname for +GPU scheduling. If the GPU card is physically moved to a different +node, gpu-feature-discovery moves the `nvidia.com/gpu.present=true` +label with it, and `null_resource.gpu_node_config` re-applies the +`nvidia.com/gpu=true:PreferNoSchedule` taint to the new host on the +next apply (discovery keyed on +`feature.node.kubernetes.io/pci-10de.present=true`). + **GPU Workloads**: - Ollama (LLM inference) - ComfyUI (Stable Diffusion workflows) @@ -529,7 +537,7 @@ kubectl describe pod -n ``` 0/5 nodes are available: 5 Insufficient nvidia.com/gpu. ``` - **Fix**: Verify GPU node (201) is Ready and labeled `gpu=true`. + **Fix**: Verify the GPU-carrying node is Ready and has the `nvidia.com/gpu.present=true` label. Check `kubectl get nodes -l nvidia.com/gpu.present=true` — if empty, gpu-feature-discovery hasn't labeled any node (operator not running, driver not loaded, or PCI passthrough broken). 
### Pods OOMKilled repeatedly @@ -614,7 +622,7 @@ spec: value: "true" effect: NoSchedule nodeSelector: - gpu: "true" + nvidia.com/gpu.present: "true" containers: - name: app resources: diff --git a/docs/architecture/databases.md b/docs/architecture/databases.md index 810fe85c..c47fcb3d 100644 --- a/docs/architecture/databases.md +++ b/docs/architecture/databases.md @@ -127,9 +127,13 @@ Single shared cluster for all 17 consumers (Immich, Authentik, Nextcloud, Paperl 3 pods in StatefulSet `redis-v2`, each co-locating redis + sentinel + redis_exporter, using `docker.io/library/redis:8-alpine` (8.6.2). HAProxy (3 replicas, PDB minAvailable=2) routes clients to the current master via 1s `INFO replication` tcp-checks. Full context behind the April 2026 rework in beads `code-v2b`. - 3 redis pods + 3 co-located sentinels (quorum=2). Odd sentinel count eliminates split-brain. +- **Pod anti-affinity is `required` (hard)** — each redis pod must land on a distinct node. Soft anti-affinity previously let the scheduler co-locate 2/3 pods on the same node; when that node (`k8s-node3`) went `NotReady→Ready` at 11:42 UTC on 2026-04-22 it took 2 redis pods with it and the cluster lost quorum. Cluster-wide PV `nodeAffinity` matches one zone (`topology.kubernetes.io/region=pve, zone=pve`), so PVCs rebind freely on reschedule. - `podManagementPolicy=Parallel` + init container that regenerates `sentinel.conf` on every boot by probing peer sentinels for consensus master (priority: sentinel vote → peer role:master with slaves → deterministic pod-0 fallback). No persistent sentinel runtime state — can't drift out of sync with reality (root cause of 2026-04-19 PM incident). - redis.conf has `include /shared/replica.conf`; the init container writes either an empty file (master) or `replicaof 6379` (replicas), so pods come up already in the right role — no bootstrap race. - **Sentinel hostname persistence**: `sentinel resolve-hostnames yes` + `sentinel announce-hostnames yes` in the init-generated sentinel.conf are mandatory — without them, sentinel stores resolved IPs in its rewritten config, and pod-IP churn on restart breaks failover. The MONITOR command itself must be issued with a hostname and the flags must be active before MONITOR, otherwise sentinel stores an IP that goes stale the next time the pod is deleted. +- **Failover timing (tuned 2026-04-22)**: `sentinel down-after-milliseconds=15000` + `sentinel failover-timeout=60000`. Redis liveness probe `timeout_seconds=10, failure_threshold=5`; sentinel liveness probe same. LUKS-encrypted LVM + BGSAVE fork can briefly stall master I/O >5s, which under the old 5s/30s sentinel timings + 3s/3 probes induced spurious `+sdown`→`+odown`→`+switch-master` cycles every 1-2 minutes. The new values absorb normal BGSAVE pauses without triggering failover. +- **HAProxy check smoothing (tuned 2026-04-22)**: `check inter 2s fall 3 rise 2` (was `1s / 2 / 2`) + `timeout check 5s` (was `3s`). The aggressive 1s polling used to race sentinel failovers — during a legitimate promote, HAProxy could catch the old master serving `role:slave` in the 1-3s window before re-probing the new master, leaving the backend empty and clients receiving `ReadOnlyError`. +- **Headless service `publish_not_ready_addresses=false`** (flipped 2026-04-22). Previously `true` meant HAProxy's DNS resolver saw not-yet-ready pods during rollouts, compounding the check-race above. Sentinel peer discovery is unaffected because sentinels announce to each other explicitly via `sentinel announce-hostnames yes`. 
- Memory: master + replicas `requests=limits=768Mi`. Concurrent BGSAVE + AOF-rewrite fork can double RSS via COW, so headroom must cover it. `auto-aof-rewrite-percentage=200` + `auto-aof-rewrite-min-size=128mb` tune down rewrite frequency. - Persistence: RDB (`save 900 1 / 300 100 / 60 10000`) + AOF `appendfsync=everysec`. Disk-wear analysis on 2026-04-19 (sdb Samsung 850 EVO 1TB, 150 TBW): Redis contributes <1 GB/day cluster-wide → 40+ year runway at the 20% TBW budget. - `maxmemory=640mb` (83% of 768Mi limit), `maxmemory-policy=allkeys-lru`. @@ -138,7 +142,7 @@ Single shared cluster for all 17 consumers (Immich, Authentik, Nextcloud, Paperl **Observability** (redis-v2 only): `oliver006/redis_exporter:v1.62.0` sidecar per pod on port 9121, auto-scraped via Prometheus pod annotation. Alerts: `RedisDown`, `RedisMemoryPressure`, `RedisEvictions`, `RedisReplicationLagHigh`, `RedisForkLatencyHigh`, `RedisAOFRewriteLong`, `RedisReplicasMissing`, `RedisBackupStale`, `RedisBackupNeverSucceeded`. -**Why this design** — three incidents in April 2026 drove the rework: (a) 2026-04-04 service selector routed reads+writes to master+replica causing `READONLY` errors; (b) 2026-04-19 AM master OOMKilled during BGSAVE+PSYNC with the 256Mi limit too tight for a 204 MB working set under COW amplification; (c) 2026-04-19 PM sentinel runtime state drifted (only 2 sentinels, no majority) and routed writes to a slave. See beads epic `code-v2b` for the full plan and linked challenger analyses. +**Why this design** — four incidents in April 2026 drove the rework: (a) 2026-04-04 service selector routed reads+writes to master+replica causing `READONLY` errors; (b) 2026-04-19 AM master OOMKilled during BGSAVE+PSYNC with the 256Mi limit too tight for a 204 MB working set under COW amplification; (c) 2026-04-19 PM sentinel runtime state drifted (only 2 sentinels, no majority) and routed writes to a slave; (d) 2026-04-22 five-factor flap cascade — soft anti-affinity let 2/3 pods co-locate on `k8s-node3`, node bounced NotReady→Ready and took quorum with it; aggressive sentinel/probe timing (5s/30s + 3s/3) amplified disk-I/O stalls under LUKS-encrypted LVM into spurious `+switch-master` loops; HAProxy's 1s polling raced sentinel failovers and routed writes to demoted masters; `publish_not_ready_addresses=true` fed not-yet-ready pods into HAProxy DNS; downstream `realestate-crawler-celery` CrashLoopBackOff closed the feedback loop. See beads epic `code-v2b` for the full plan and linked challenger analyses. 
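+
+A quick spot-check of the 2026-04-22 tuning, assuming a `redis` namespace, `redis`/`sentinel` container names inside each `redis-v2` pod, and the default `mymaster` monitor name; none of those identifiers are pinned down above, so adjust as needed:
+
+```sh
+# Which pod do the sentinels currently agree is master?
+kubectl exec -n redis redis-v2-0 -c sentinel -- \
+  redis-cli -p 26379 sentinel get-master-addr-by-name mymaster
+
+# Confirm the tuned failover timings actually landed (expect 15000 / 60000).
+kubectl exec -n redis redis-v2-0 -c sentinel -- \
+  redis-cli -p 26379 sentinel master mymaster | grep -A1 -E 'down-after-milliseconds|failover-timeout'
+
+# Every pod should report a consistent role split: one master, two replicas (role:slave).
+for i in 0 1 2; do
+  kubectl exec -n redis redis-v2-$i -c redis -- redis-cli info replication | grep '^role'
+done
+```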
### SQLite (Per-App) diff --git a/docs/architecture/mailserver.md b/docs/architecture/mailserver.md index 21b2f957..0026b932 100644 --- a/docs/architecture/mailserver.md +++ b/docs/architecture/mailserver.md @@ -231,7 +231,7 @@ Push secrets (`BREVO_API_KEY`, `EMAIL_MONITOR_IMAP_PASSWORD`) come from External |-------|-----------|----------| | MailServerDown | No replicas for 5m | warning | | EmailRoundtripFailing | Probe failing for 30m | warning | -| EmailRoundtripStale | No success in >40m | warning | +| EmailRoundtripStale | No success in >80m (60m threshold + for:20m) | warning | | EmailRoundtripNeverRun | Metric absent for 40m | warning | ### Uptime Kuma Monitors diff --git a/docs/architecture/monitoring.md b/docs/architecture/monitoring.md index 0de2a219..5fa3bbba 100644 --- a/docs/architecture/monitoring.md +++ b/docs/architecture/monitoring.md @@ -158,7 +158,7 @@ spec: #### Email Monitoring Alerts - **EmailRoundtripFailing**: E2E email probe returning failure for >30m -- **EmailRoundtripStale**: No successful email round-trip in >40m +- **EmailRoundtripStale**: No successful email round-trip in >80m (60m threshold + for:20m) - **EmailRoundtripNeverRun**: Email probe has never reported (40m) #### Registry Integrity Alerts diff --git a/docs/architecture/overview.md b/docs/architecture/overview.md index 9e0fe7be..cb0f8e6d 100644 --- a/docs/architecture/overview.md +++ b/docs/architecture/overview.md @@ -139,7 +139,7 @@ The Kubernetes cluster consists of 5 nodes: - **k8s-node1 (201)**: 16c/32GB GPU node with Tesla T4 passthrough, tainted for GPU workloads only - **k8s-node2-4 (202-204)**: 8c/32GB workers running general-purpose workloads -GPU passthrough on node1 uses PCIe device 0000:06:00.0, with Kubernetes taint `nvidia.com/gpu=true:NoSchedule` and label `gpu=true` to ensure only GPU-requesting pods schedule there. +GPU passthrough on node1 uses PCIe device 0000:06:00.0. The NVIDIA GPU Operator's gpu-feature-discovery auto-labels whichever node carries the card with `nvidia.com/gpu.present=true`; `null_resource.gpu_node_config` taints the same set of nodes with `nvidia.com/gpu=true:PreferNoSchedule`. No hostname is hardcoded — moving the card to a different node requires no Terraform edits. ### Service Organization diff --git a/docs/architecture/storage.md b/docs/architecture/storage.md index 69b32a1a..df1e89f9 100644 --- a/docs/architecture/storage.md +++ b/docs/architecture/storage.md @@ -129,7 +129,9 @@ graph TB 5. **Passphrase management**: ExternalSecret syncs passphrase from Vault KV (`secret/viktor/proxmox_csi_encryption_passphrase`) → K8s Secret. Backup key at `/root/.luks-backup-key` on PVE host. **Services on encrypted storage (2026-04-15 migration):** -vaultwarden, dbaas (mysql+pg+pgadmin), mailserver, nextcloud, forgejo, matrix, n8n, affine, health, hackmd, redis, headscale, frigate, meshcentral, technitium, actualbudget, grampsweb, owntracks, paperless-ngx, wealthfolio, monitoring (alertmanager) +vaultwarden, dbaas (mysql+pg+pgadmin), mailserver, nextcloud, forgejo, matrix, n8n, affine, health, hackmd, redis, headscale, frigate, meshcentral, technitium, actualbudget, grampsweb, owntracks, wealthfolio, monitoring (alertmanager) + +**Services migrated later** (post-audit catch-up): paperless-ngx (2026-04-25 — sensitive document scans had been left on plain `proxmox-lvm` by an abandoned attempt; rsync swap cleaned up the orphan and re-did via Terraform). 
Vault raft cluster (2026-04-25 — all 3 voters migrated from `nfs-proxmox` to `proxmox-lvm-encrypted` after the 2026-04-22 raft-leader-deadlock post-mortem found NFS fsync semantics incompatible with raft consensus log; rolled non-leader-first with force-finalize on the pvc-protection finalizer to avoid pod-recreating on the old PVCs). **CSI node plugin memory**: Requires 1280Mi limit for LUKS2 Argon2id key derivation (~1GiB). Set via `node.plugin.resources` in Helm values (not `node.resources`). diff --git a/docs/plans/2026-04-25-nfs-hostile-migration-design.md b/docs/plans/2026-04-25-nfs-hostile-migration-design.md new file mode 100644 index 00000000..832064ea --- /dev/null +++ b/docs/plans/2026-04-25-nfs-hostile-migration-design.md @@ -0,0 +1,142 @@ +# NFS-Hostile Workload Migration — Design + +**Date**: 2026-04-25 +**Author**: Viktor (with Claude) +**Status**: Phase 1 done, Phase 2 in progress +**Beads**: code-gy7h (Vault), code-ahr7 (Immich PG) + +## Problem + +The 2026-04-22 Vault Raft leader deadlock (post-mortem +`2026-04-22-vault-raft-leader-deadlock.md`) traced to NFS client +writeback stalls poisoning kernel state. Recovery took 2h43m and +required hard-resetting 3 of 4 cluster VMs. Two workload classes on +NFS are NFS-hostile per the criteria in +`infra/.claude/CLAUDE.md` ("Critical services MUST NOT use NFS"): + +1. **Postgres with WAL fsync per commit** — Immich primary +2. **Vault Raft consensus log** — fsync per append-entry, 3 replicas + +Everything else on NFS (47 PVCs, ~455 GiB) is correctly placed: +RWX media libraries, append-only backups, ML caches. + +## Decision + +Migrate exactly those two workload classes to +`proxmox-lvm-encrypted` (LUKS2 LVM-thin via Proxmox CSI). No iSCSI, +no RWX media migration, no backup-target migration. + +## Rationale + +- Block storage decouples PG / Raft fsync from NFS client kernel + state. Failure mode that triggered the post-mortem cannot recur for + these workloads. +- `proxmox-lvm-encrypted` is the documented default for sensitive data + (`infra/.claude/CLAUDE.md` storage decision rule). It already backs + ~28 PVCs across the cluster — pattern is proven. +- Existing nightly `lvm-pvc-snapshot` PVE host script (03:00, 7-day + retention) auto-picks-up new PVCs via thin snapshots — no extra + backup wiring needed for the live data side. +- LUKS2 satisfies "encrypted at rest for sensitive data" requirement. + +## Out of scope + +- iSCSI evaluation (already retired 2026-04-13). +- RWX media (Immich library, music, ebooks) — correct placement. +- Backup target PVCs (`*-backup` on NFS) — append-only, NFS-tolerant. +- Prometheus 200 GiB — already on `proxmox-lvm`. + +## Pattern per workload + +### Immich PG (single replica, Deployment, Recreate strategy) + +- Add new RWO PVC on `proxmox-lvm-encrypted`. +- Quiesce app pods (server + ML + frame). +- `pg_dumpall` from running NFS pod → local file. +- Swap deployment `claim_name` → encrypted PVC. +- PG bootstraps fresh on empty PVC; restore dump. +- REINDEX vector indexes (`clip_index`, `face_index`). +- Backup CronJob keeps writing to NFS module (correct: append-only). + +### Vault Raft (3 replicas, StatefulSet, helm-managed) + +- Change `dataStorage.storageClass` and `auditStorage.storageClass` + from `nfs-proxmox` → `proxmox-lvm-encrypted`. +- StatefulSet `volumeClaimTemplates` is immutable → use + `kubectl delete sts vault --cascade=orphan` then re-apply (memory + pattern for VCT swaps). +- Per-pod rolling: delete pod + PVCs, controller recreates with new + template. 
Auto-unseal sidecar handles unseal; raft `retry_join` + rejoins cluster. +- 24h validation window between pods. Migrate non-leader pods first; + step-down current leader before migrating it last. +- Backup target (`vault-backup-host` on NFS) stays on NFS. + +## Risks and rollbacks + +### Immich PG + +- pg_dumpall captures schema + data, not file-level state. Vector + index versions matter (vchord 0.3.0 unchanged; vector 0.8.0 → + 0.8.1 is a minor automatic bump on `CREATE EXTENSION` — confirmed + benign). Rollback: revert `claim_name`, scale apps; old NFS PVC + retained for 7 days post-migration. + +### Vault Raft + +- Cluster keeps quorum from 2 standby replicas while one pod is + swapped. Migrating the leader last avoids quorum churn. +- Recovery anchor: pre-migration `vault operator raft snapshot save` + + nightly `vault-raft-backup` CronJob. RTO < 1h via snapshot + restore. + +## Helm `securityContext.pod` replace-not-merge (Vault, discovered during execution) + +The Vault helm chart sets pod-level securityContext defaults +(`fsGroup=1000, runAsGroup=1000, runAsUser=100, runAsNonRoot=true`) +from chart templates, not from values.yaml. When `main.tf` provided +its own `server.statefulSet.securityContext.pod = {fsGroupChangePolicy += "OnRootMismatch"}` the helm rendering REPLACED the chart defaults +rather than merging into them. On NFS this was harmless (`async, +insecure` exports made the volume world-writable enough for any UID), +but on a fresh ext4 LV via Proxmox CSI the volume root is `root:root` +and vault user (UID 100) cannot open `/vault/data/vault.db`. + +vault-1 and vault-2 happened to be Running with the correct +securityContext because their pod specs were written into etcd +**before** the customization landed; helm chart upgrades don't +restart pods, so the broken values lay dormant until vault-0 was +recreated by the orphan-deleted STS during this migration. + +Resolution: provide all five fields (`fsGroup`, `fsGroupChangePolicy`, +`runAsGroup`, `runAsUser`, `runAsNonRoot`) explicitly in main.tf so +`runAsGroup=1000` etc. survive future chart bumps. Idempotent on +both fresh PVCs and existing pods. + +## Init container chicken-and-egg (Immich PG, discovered during execution) + +The pre-existing `write-pg-override-conf` init container on the +Immich PG deployment writes `postgresql.override.conf` directly to +`PGDATA`. On a populated NFS PVC this was a no-op (init was already +run). On the fresh encrypted PVC, the file made `initdb` refuse the +non-empty directory and the pod CrashLoopBackOff'd. + +Resolution: gate the init container on `PG_VERSION` presence — first +boot skips the override write, PG `initdb`s cleanly; force a pod +restart and the second boot writes the override and PG loads +`vchord` / `vectors` / `pg_prewarm` before the dump restore. Change +is permanent and idempotent (correct on both fresh and initialised +PVCs). One restart pre-migration only. + +## Verification + +End-to-end DONE when: + +- `kubectl get pvc -A | grep nfs-proxmox` returns only the + `vault-backup-host` PVC (or zero, if backup PVC moves elsewhere). +- `vault operator raft list-peers` shows 3 voters on + `proxmox-lvm-encrypted`, leader elected. +- Immich PG `\dx` matches pre-migration extensions (vector minor + drift OK). +- `lvm-pvc-snapshot` captures new LVs in next 03:00 run. +- 7 consecutive days of clean backup CronJob runs and no new alerts. 
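+
+A rough shell rendering of these checks (the Immich namespace/workload name and the psql user are assumptions; the Vault and PVC checks use the names documented above):
+
+```sh
+# Any PVC still on the NFS class? Expect only vault-backup-host, or nothing.
+kubectl get pvc -A | grep nfs-proxmox || echo "no nfs-proxmox PVCs left"
+
+# Raft: 3 voters on the new storage, leader elected.
+kubectl exec -n vault vault-0 -c vault -- vault operator raft list-peers
+
+# Immich PG extensions match the pre-migration snapshot (names are assumptions).
+kubectl exec -n immich deploy/immich-postgresql -- psql -U postgres -c '\dx'
+
+# New encrypted LVs visible on the PVE host, so the 03:00 snapshot run picks them up.
+ssh root@192.168.1.127 "lvs pve --noheadings -o lv_name | grep 'vm-9999-pvc' | tail -10"
+```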
diff --git a/docs/plans/2026-04-25-nfs-hostile-migration-plan.md b/docs/plans/2026-04-25-nfs-hostile-migration-plan.md new file mode 100644 index 00000000..f24c562a --- /dev/null +++ b/docs/plans/2026-04-25-nfs-hostile-migration-plan.md @@ -0,0 +1,169 @@ +# NFS-Hostile Workload Migration — Plan + +**Date**: 2026-04-25 +**Design**: `2026-04-25-nfs-hostile-migration-design.md` +**Beads**: code-gy7h (Vault, epic), code-ahr7 (Immich PG) + +## Phase 1 — Immich PG (DONE 2026-04-25) + +| Step | Done | +|---|---| +| Snapshot extensions + row counts to `/tmp/immich-pre-migration-*` | ✓ | +| Quiesce `immich-server` + `immich-machine-learning` + `immich-frame` | ✓ | +| `pg_dumpall` → `/tmp/immich-pre-migration-.sql` (1.9 GB) | ✓ | +| Add `kubernetes_persistent_volume_claim.immich_postgresql_encrypted` (10Gi, autoresize 20Gi cap) | ✓ | +| Swap `claim_name` at `infra/stacks/immich/main.tf` deployment | ✓ | +| Patch init container to gate on `PG_VERSION` (chicken-and-egg fix) | ✓ | +| Force pod restart so override.conf gets written | ✓ | +| Restore dump | ✓ | +| `REINDEX clip_index`, `REINDEX face_index` | ✓ | +| Scale apps back up | ✓ | +| Verify: `\dx`, row counts (~111k assets), HTTP 200 internal/external | ✓ | +| LV present on PVE host (`vm-9999-pvc-...`) | ✓ | + +### Phase 1 follow-ups (not blocking) + +- Old NFS PVC `immich-postgresql-data-host` retained 7 days for + rollback. After 2026-05-02: remove `module.nfs_postgresql_host` + from `infra/stacks/immich/main.tf` and the CronJob's reference. +- Backup CronJob (`postgresql-backup`) still writes to the NFS + module. After cleanup, point it at a dedicated backup PVC or to + the existing `immich-backups` NFS share. + +## Phase 2 — Vault Raft (DONE 2026-04-25) + +**Phase 2 complete 2026-04-25; all 3 voters on `proxmox-lvm-encrypted`.** + +### Pre-flight (T-0) — DONE 2026-04-25 15:50 UTC + +- [x] Verify all 3 vault pods sealed=false, raft healthy. +- [x] Take fresh `vault operator raft snapshot save` (anchor saved at + `/tmp/vault-pre-migration-20260425-155029.snap`, 1.5 MB). +- [ ] Optional: scale ESO to 0 — skipped (auto-unseal sidecar is + independent; ESO refresh churn is non-disruptive for one swap). +- [x] Confirmed leader is **vault-2** → migrate vault-0 first + (non-leader), vault-1 next, vault-2 last (with step-down). + Plan originally assumed vault-0 was leader; same intent + (non-leader first). +- [x] Thin pool headroom: 54.63% used, plenty for 6 × 2 GiB LVs. + +### Step 0 — Helm values + StatefulSet swap — DONE 2026-04-25 16:08 UTC + +- [x] Edit `infra/stacks/vault/main.tf`: change + `dataStorage.storageClass` and `auditStorage.storageClass` + from `nfs-proxmox` → `proxmox-lvm-encrypted`. +- [x] `kubectl -n vault delete sts vault --cascade=orphan` (StatefulSet + `volumeClaimTemplates` is immutable; orphan keeps pods+PVCs + alive while we recreate the controller with the new template). +- [x] `tg apply -target=helm_release.vault` → recreates STS with new + VCT (full-stack `tg plan` blocks on unrelated for_each-with- + apply-time-keys errors at lines 848/865/909/917; targeted + apply on the helm release alone is the right scope here). + Existing pods still on old NFS PVCs. 
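+
+Condensed as commands (a sketch; the exact `scripts/tg` wrapper invocation follows however it is normally run for the vault stack):
+
+```sh
+# Recreate the StatefulSet with the new volumeClaimTemplates without touching
+# running pods or PVCs (volumeClaimTemplates are immutable in place).
+kubectl -n vault delete sts vault --cascade=orphan
+
+# Targeted apply on the helm release only; the full-stack plan is blocked on
+# unrelated for_each apply-time-key errors.
+tg apply -target=helm_release.vault
+
+# Sanity: STS recreated, pods still Running on the old NFS PVCs for now.
+kubectl -n vault get sts vault && kubectl -n vault get pvc
+```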
+ +### Step 1 — Roll vault-0 first (non-leader) — DONE 2026-04-25 16:18 UTC + +- [x] `kubectl -n vault delete pod vault-0 --grace-period=30` +- [x] `kubectl -n vault delete pvc data-vault-0 audit-vault-0` +- [x] STS controller recreated pod; new PVCs auto-provisioned on + `proxmox-lvm-encrypted` (LVs `vm-9999-pvc-fb732fd7-...` data + 4.12%, `vm-9999-pvc-36451f42-...` audit 3.99%). +- [x] **Hit and fixed**: vault-0 CrashLoopBackOff'd with + `permission denied` on `/vault/data/vault.db`. The helm chart's + `statefulSet.securityContext.pod` block in main.tf only set + `fsGroupChangePolicy`, replacing (not merging) the chart's + defaults `fsGroup=1000, runAsGroup=1000, runAsUser=100, + runAsNonRoot=true`. NFS exports made the missing fsGroup a + no-op; ext4 LV needs it to chown the volume root for the + vault user. Old vault-1/vault-2 pods were created before that + block was added so they still had the chart-default + securityContext from their original spec. Fix: provide all + five fields explicitly in main.tf and re-apply. Same root + cause will affect vault-1 and vault-2 swaps unless this stays + in place. +- [x] Wait Ready; auto-unseal sidecar unsealed; `retry_join` rejoined + raft cluster. +- [x] Verify: `vault operator raft list-peers` shows 3 voters, + vault-0 follower, leader=vault-2. External HTTPS 200. + +### Step 2 — 24h soak (SKIPPED per user direction 2026-04-25) + +User instructed "continue with all the remaining actions" — soak +gates compressed to per-pod settle windows + raft-state verification +between rollings. No Raft alarms, no Vault errors observed at each +verification gate. + +### Step 3 — Roll vault-1 — DONE 2026-04-25 + +- [x] Force-finalize PVCs to break re-mount race: + `kubectl -n vault patch pvc data-vault-1 audit-vault-1 -p '{"metadata":{"finalizers":null}}' --type=merge`. + (Initial pod-then-PVC delete recreated pod on the OLD NFS PVCs + because pvc-protection finalizer hadn't cleared. Lesson learned + and applied to vault-2 below.) +- [x] Pod recreated on encrypted PVCs; auto-unsealed; rejoined raft. + +### Step 4 — Settle window — DONE 2026-04-25 + +3-check verification over 90s; raft index advancing (2730010→2730012), +all 3 voters healthy. + +### Step 5 — Roll vault-2 (leader) — DONE 2026-04-25 + +- [x] `vault operator step-down` on vault-2; vault-0 took leadership. + Confirmed vault-0 active, vault-1+vault-2 standby before delete. +- [x] Snapshot anchor at `/tmp/vault-pre-vault2.snap` (1.5 MB) from new + leader vault-0. +- [x] Force-finalize + delete PVCs + delete pod (lesson from vault-1). +- [x] Pod recreated on encrypted PVCs; auto-unsealed; rejoined raft. +- [x] `vault operator raft list-peers` shows 3 voters all healthy on + encrypted storage; leader vault-0. + +### Step 6 — Cleanup — DONE 2026-04-25 + +- [x] `kubectl get pvc -A` cross-cluster shows zero PVCs on + `nfs-proxmox` SC (only Released PVs remain → Phase 3). +- [x] Removed inline `kubernetes_storage_class.nfs_proxmox` from + `infra/stacks/vault/main.tf` (was lines 29–42). +- [x] All 3 PVC pairs on `proxmox-lvm-encrypted`. +- [x] `vault operator raft autopilot state` healthy=true. +- [x] External `https://vault.viktorbarzin.me/v1/sys/health` = 200. 
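+
+The per-pod roll used in Steps 1/3/5, distilled into a reusable sketch (pod and PVC names follow the chart's `data-`/`audit-` convention used above; roll the leader last, after `vault operator step-down`):
+
+```sh
+POD=vault-1   # repeat for each voter; leader last
+
+# Break the pvc-protection re-mount race BEFORE deleting anything, otherwise the
+# recreated pod can land back on the old NFS PVCs (lesson from the vault-1 roll).
+kubectl -n vault patch pvc data-$POD audit-$POD \
+  -p '{"metadata":{"finalizers":null}}' --type=merge
+kubectl -n vault delete pvc data-$POD audit-$POD
+kubectl -n vault delete pod $POD --grace-period=30
+
+# Wait for the recreated pod, then confirm storage class and raft membership.
+kubectl -n vault wait pod/$POD --for=condition=Ready --timeout=10m
+kubectl -n vault get pvc data-$POD audit-$POD \
+  -o custom-columns=NAME:.metadata.name,SC:.spec.storageClassName
+kubectl exec -n vault vault-0 -c vault -- vault operator raft list-peers
+```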
+ +## Phase 3 — Released-PV cleanup (FOLLOW-UP) + +### Step 3.1 — vault Released PVs — DONE 2026-04-25 + +6 vault NFS PVs (Released, `nfs-proxmox` SC, Retain policy) deleted +along with their NFS subdirectories on PVE host (~1.5 GB reclaimed): + +| PV | Claim | Size on disk | +|---|---|---| +| pvc-004a5d3b-… | data-vault-2 | 45M | +| pvc-808a78ec-… | audit-vault-1 | 1.4M | +| pvc-918ee7c1-… | audit-vault-0 | 3.2M | +| pvc-9d2ddcb4-… | data-vault-0 | 46M | +| pvc-a659711d-… | data-vault-1 | 46M | +| pvc-d2e65109-… | audit-vault-2 | 1.4G | + +Procedure: `kubectl delete pv ` (cluster object only — Retain +policy means CSI never touches NFS) then `rm -rf /srv/nfs/` on +192.168.1.127. + +### Step 3.2 — Cluster-wide Released PV sweep (DEFERRED) + +~50 other Released PVs persist across the cluster (~200 GiB on +`proxmox-lvm` and `proxmox-lvm-encrypted`). Out of scope for the +2026-04-25 NFS-hostile session per user direction. To reclaim: + +1. List Released PVs, confirm LV exists on PVE. +2. `kubectl delete pv ` (CSI removes underlying LV when PV is + orphaned with `Retain` reclaim policy and no PVC reference). +3. If LV survives: manual `lvremove pve/vm-9999-pvc-`. + +## Rollback + +| Phase | Trigger | Action | +|---|---|---| +| 1 | Immich UI broken / data loss | Revert `claim_name`; restore from `/tmp/immich-pre-migration-*.sql` to old NFS PVC | +| 2 (mid-rolling) | Single pod broken | Delete the encrypted PVC; recreate with NFS SC explicitly; cluster keeps quorum from 2 healthy pods | +| 2 (post-rolling, raft corrupt) | Cluster-wide failure | `vault operator raft snapshot restore ` | +| Catastrophic | All Vault data lost | Restore from latest `/srv/nfs/vault-backup/` snapshot via CronJob output | diff --git a/docs/post-mortems/2026-04-19-registry-orphan-index.md b/docs/post-mortems/2026-04-19-registry-orphan-index.md index da883760..ee596c63 100644 --- a/docs/post-mortems/2026-04-19-registry-orphan-index.md +++ b/docs/post-mortems/2026-04-19-registry-orphan-index.md @@ -190,3 +190,57 @@ unaddressed. - **Runbook**: `docs/runbooks/registry-rebuild-image.md` (new). - **Hot-fix commits**: `a05d63ee`, `6371e75e`, `c113be4d`. - **Upstream bug class**: `distribution/distribution#3324`. + +## 2026-04-19 — Bulk cleanup sweep (beads code-8hk + code-jh3c) + +Same failure class, broader scope. The `registry-integrity-probe` +surfaced 38 broken manifest references persisting after the 04-19 +infra-ci fix. `beads-dispatcher` + `beads-reaper` CronJobs were stuck +`ImagePullBackOff` on `claude-agent-service:0c24c9b6` for >6h. All 34 +affected `repo:tag` pairs were OCI indexes whose `linux/amd64` child +manifests were absent from blob storage (same orphan pattern). + +**Action taken**: +1. Bumped `beads-server/main.tf` var default `claude_agent_service_image_tag` + from `0c24c9b6` → `2fd7670d` (the canonical tag in + `claude-agent-service/main.tf`), reused — same image already healthy + on the registry. `scripts/tg apply` on `beads-server`. Deleted the + stuck Jobs so new CronJob ticks could fire. +2. Enumerated 34 broken `(repo, tag, parent_digest)` triples via HTTP + probe using `registry-probe-credentials` K8s Secret. Deleted each + via `DELETE /v2//manifests/` (33× 202, 1× 404 — + claude-agent-service:latest pointed at an already-deleted digest). +3. Ran `docker exec registry-private /bin/registry garbage-collect + /etc/docker/registry/config.yml` — reclaimed ~3GB of orphan blob + storage. +4. 
Rebuilt the 3 in-use broken tags (all 3 OCI-index parents pointed + at missing children, so no cached copies would survive pod + reschedule): + - `freedify:latest` / `freedify:c803de02` — built on registry VM + directly (no CI pipeline exists for this image; python FastAPI). + - `beadboard:17a38e43` / `beadboard:latest` — GHA + `workflow_dispatch` failed at registry login (missing + `REGISTRY_USERNAME`/`REGISTRY_PASSWORD` GH secrets). Built on + registry VM directly as the fallback. GitHub secret gap is a + follow-up — beads `code-8hk` notes it. + - `priority-pass-backend:ae1420a0` / `priority-pass-frontend:ae1420a0` + — Woodpecker pipeline #8 on repo 81. Pipeline `kubectl set image`'d + the Deployment to `ae1420a0` (drift vs TF `v5`/`v8` defaults, but + that drift is pre-existing, not introduced by this cleanup). + - `wealthfolio-sync:latest` — **not rebuilt**. Monthly CronJob (next + run 2026-05-01), no source tree or CI pipeline available in the + monorepo; deferred for separate follow-up. + +**Post-cleanup state**: +- Probe: 39 tags, 0 failures. `registry_manifest_integrity_failures{} = 0`. +- Alert `RegistryManifestIntegrityFailure` cleared (was firing for + 5h 32m). +- No `ImagePullBackOff` pods anywhere in the cluster. +- 28 of 34 deleted manifests were **dangling tags not referenced by any + workload** — old `382d6b1*`, `v2`-`v7`, `yt-fallback`, etc. Safe + deletes, no rebuilds needed. + +**Permanent fix still in flight**: Phase 2/3 of this post-mortem +(post-push verification in CI, atomic `cleanup-tags.sh`) — not +addressed by this cleanup. The probe continues to be the +authoritative detector. diff --git a/docs/post-mortems/2026-04-22-vault-raft-leader-deadlock.md b/docs/post-mortems/2026-04-22-vault-raft-leader-deadlock.md new file mode 100644 index 00000000..dcbb8e02 --- /dev/null +++ b/docs/post-mortems/2026-04-22-vault-raft-leader-deadlock.md @@ -0,0 +1,155 @@ +# Post-Mortem: Vault Raft Leader Deadlock + NFS Kernel Client Corruption Cascade + +> **Resolution status (2026-04-25):** Resolved structurally by code-gy7h +> migration. All 3 vault voters now on `proxmox-lvm-encrypted` block +> storage; the NFS fsync incompatibility that triggered the original +> raft hang is no longer reachable. See +> `docs/plans/2026-04-25-nfs-hostile-migration-plan.md` Phase 2. + +| Field | Value | +|-------|-------| +| **Date** | 2026-04-22 | +| **Duration** | External endpoint 503 from ~09:00 UTC to ~11:43 UTC (~2h 43m). vault-2 became active leader 11:43:28 UTC. | +| **Severity** | SEV1 (Vault — single source of secrets for 40+ services) | +| **Affected Services** | All ESO-backed services (password rotation paused). CronJobs that read plan-time secrets (14 stacks). Woodpecker CI (blocked pipeline `d39770b3`). Everything with `ExternalSecret` refresh interval ≤ 2h. | +| **Status** | Vault HA operational with vault-0 + vault-2 quorum. vault-1 still stuck ContainerCreating on node2 (third node2 reboot pending; workload can accept 2/3 quorum). Terraform fix committed as `2f1f9107`; apply pending. | + +## Summary + +A Vault raft leader (`vault-2`) entered a stuck goroutine state where its cluster port (8201) accepted TCP but never completed msgpack RPC. Standbys could not detect leader death because the TCP layer looked healthy, so no re-election fired. The only recovery was to kill the leader. 
During recovery, abrupt `kubectl delete --force` of the stuck Vault pods left kernel-side NFS client state on k8s-node1/node3/node4 in a corrupted state — **all new NFS mounts from those nodes timed out at 110s**, while existing mounts kept working. This created a cascade: the stuck leader blocked quorum, killing the leader broke NFS on the destination node for the recreated pod, force-killing the stuck pods left zombie `containerd-shim` processes kubelet couldn't clean up, and the resulting volume-manager loops pegged kubelet into 2-minute timeouts. Recovery required a VM hard-reset for node2 and node3 (kubelet was zombie on both). vault-0 remains down pending node4 reboot. + +## Impact + +- **User-facing**: `vault.viktorbarzin.me` returned HTTP 503 for ~2h. Any service that needed a Vault token during that window was degraded; Woodpecker CI pipeline blocked. +- **Blast radius**: 3/3 Vault pods affected (raft deadlock blocked re-election even with standbys up). Three k8s nodes degraded simultaneously with kernel NFS client stuck state (node1, node3, node4). Two nodes required VM hard-reset to recover kubelet (node2, node3). +- **Duration**: Degraded ~2h; resolution required sequential hard reboots. +- **Data loss**: None. Raft data integrity preserved on NFS. vault-1 came up with index 2475732, caught up to 2476009+ once leader was elected. +- **Observability gap**: No alert fired for the stuck raft leader. Standbys report `HA Mode: standby, Active Node Address: ` as if healthy even when leader is hung. + +## Timeline (UTC) + +| Time | Event | +|------|-------| +| **~09:00** | `vault-2` (original raft leader) enters hung state — port 8201 open but msgpack RPCs hang. Its own logs go silent. Standbys continue heartbeat/appendEntries with `msgpack decode error [pos 0]: i/o timeout`. Neither standby triggers re-election because raft transport does not distinguish "TCP open + silent" from "TCP open + healthy". | +| **~09:15** | External endpoint starts serving 503. Woodpecker CI pipeline `d39770b3` blocks waiting for Vault. | +| **09:59** | Operator force-deletes `vault-2` pod — replacement comes up on node3 and enters candidate loop (term=32), cannot get quorum because DNS for `vault-0` is NXDOMAIN (ContainerCreating) and vault-1 does not respond (its raft goroutine also hung). | +| **10:07** | Operator force-deletes `vault-1` — new `vault-1` gets scheduled to node2. Its raft would be fine, but kubelet on node2 hangs in the pod cleanup path for the old pod's NFS mount. Concurrently, a new `vault-0` pod is attempted on node4, but **NFS mount from node4 times out at 110s** — the host kernel NFS client is in a degraded state that blocks all new mounts (including to completely different NFS paths like `/srv/nfs/ytdlp`). | +| **10:09** | Diagnostic test: from node1 and node4 CSI pods, `mount -t nfs -o nfsvers=4 192.168.1.127:/srv/nfs/ytdlp /tmp/test` times out. From node2 and node3 the same mount succeeds. NFS server is healthy (`showmount -e` works; `rpcinfo` shows all programs registered). The common factor on the broken nodes: they had a force-terminated Vault pod earlier in the session, leaving stuck `mount.nfs` processes in D-state. | +| **10:18** | Manual unmount of stale NFS mount from the force-deleted old vault-0 pod on node4. New mount attempts from CSI still time out — clearing the old mount did not recover kernel NFS client state. 
| +| **10:22** | Workaround discovered: mounting with `nfsvers=4.0` or `nfsvers=4.1` (instead of default `nfsvers=4` which negotiates to 4.2) succeeds on broken nodes. Confirms the stuck state is version-specific (NFSv4.2 session state), not a general NFS issue. Decision: rather than change CSI mount options cluster-wide (risk of remounting existing 48+ PVs), fix the nodes directly. | +| **10:31** | Investigated node2 kubelet state: old `vault-1` container shows `vault` process in **Z (zombie)** state with its `sh` wrapper stuck in `do_wait` in kernel (`zap_pid_ns_processes`). Containerd-shim PID killed manually — `sh` and zombie reparented to init but remained stuck (uninterruptible kernel wait tied to NFS). | +| **10:34** | Attempted `systemctl restart kubelet` on node2 — kubelet itself went into Z (zombie) with 2 tasks still attached. Classic NFS-related kernel deadlock. | +| **10:42** | **Decision: hard-reset node2 VM** (`qm reset 202`). Disruption: 22 pods evicted. | +| **10:43** | node2 back up (Ready). CSI registered. New `vault-1` scheduled to node2. NFS mount succeeded (fresh kernel state). Kubelet began chowning volume — **extremely slow, ~3 files per minute over NFS**. | +| **10:48** | `vault-1` (2/2 Running) unsealed. **Raft leader elected: `vault-2` wins term 32, election tally=2** (vault-1 voted yes once it came up, vault-0 unreachable). However vault-2's vault-layer (HA active/standby) never transitioned to active — raft leader with `active_time: 0001-01-01T00:00:00Z` and `/sys/ha-status` returning 500. | +| **10:50** | Restarted `vault-2` pod to force clean leader transition. New `vault-2` stuck in chown loop on node3 (same pattern as node2 earlier). | +| **10:54** | Patched the Vault `StatefulSet` with `fsGroupChangePolicy: OnRootMismatch` so subsequent recreations skip the recursive chown. | +| **10:57** | Force-deleted `vault-2` and `06fa940b` pod directory on node3. New pod spawned but kubelet again stuck on phantom state from the old pod. | +| **11:01** | **Hard-reset node3 VM** (`qm reset 203`). | +| **11:03** | First 200 response: vault-1 elected leader, vault-2 standby. Premature celebration — vault-1's audit log on node2 NFS starts timing out; `/sys/ha-status` returns 500 even though raft thinks vault-1 is active. | +| **~11:18** | Service regresses. `vault-1` audit writes hanging (`event not processed by enough 'sink' nodes, context deadline exceeded`). Readiness probe fails; pod goes 1/2; `vault-active` endpoint stays pointed at vault-1's IP but backend unresponsive → 503. | +| **11:22** | Force-restart `vault-1` to trigger re-election with new pod. Delete + containerd-shim cleanup leaves yet another zombie on node2. Same pattern: force-delete → zombie. | +| **11:29** | **Hard-reset node4 VM** (`qm reset 204`). Rationale: vault-0 was still blocked there; 74 pods on node4 contribute to NFS server load (load avg 16 on PVE). After reboot, vault-0 mounts its PVCs on fresh kernel state and comes up 2/2 Running 11:31. | +| **11:31** | Increased PVE NFS threads from 16 to 64 (`echo 64 > /proc/fs/nfsd/threads`). Did not help immediate mount failures — the stuck state is per-client kernel, not server capacity. | +| **11:38** | Discover DNS resolution issue: vault-2's Go resolver returns NXDOMAIN for short names `vault-0.vault-internal` even though glibc resolver works. CoreDNS restart issued earlier didn't fix. Restart vault-2 pod to force fresh resolver state. | +| **11:42** | **Second hard-reset of node3 VM** (`qm reset 203`). 
Kubelet+CSI re-register; vault-2 scheduled, NFS mounts finally succeed on fresh kernel state. | +| **11:43:28** | **vault-2 becomes active leader.** External endpoint returns 200 and stays there. vault-0 follower, catches up to index 2477632+. vault-1 still stuck on node2; left for later recovery. | + +## Root Cause Chain + +``` +[1] Vault-2 raft goroutine hang (root cause — upstream Vault bug or infra-induced) + └─> Cluster port 8201 accepts TCP but never responds to msgpack RPCs + └─> Standbys' appendEntries calls return `msgpack decode error [pos 0]: i/o timeout` + └─> Raft protocol: no re-election because leader is heartbeating at the TCP level + └─> External endpoint returns 503 because HA layer has no active leader + +[2] Recovery complication — abrupt pod termination + └─> `kubectl delete --force --grace-period=0` on vault-0/1/2 + └─> containerd-shim fails to kill container cleanly (NFS I/O in D-state) + └─> vault process ends as zombie; sh wrapper stuck in do_wait + └─> Kubelet retries forever, cannot tear down old pod volumes + └─> NFS-CSI unmount requests succeed at the NFS layer but kubelet's + volume state-machine never marks the volume as unmounted + (stale 0000-mode mount directory blocks teardown completion) + +[3] Kernel NFS client corruption on node1/node4 + └─> Force-terminated Vault pod left stuck `mount.nfs` processes in D-state + └─> Kernel NFS4.2 client session state corrupted (held open mount slot) + └─> All subsequent mount syscalls for nfsvers=4 block 110s+ waiting for + session slot that will never be freed + └─> Manual workaround: nfsvers=4.1 bypasses the corrupted session state + +[4] Kubelet starvation + └─> Combination of (2) and (3) means kubelet is stuck in a 2-minute volume-setup + context deadline loop — each iteration times out, new iteration restarts, + infinite loop + └─> Hard VM reset is the only exit + └─> After reset, kubelet starts clean, CSI re-registers, mounts succeed + +[5] Slow recursive chown amplifies impact + └─> Default fsGroupChangePolicy: Always (Vault Helm chart 0.29.1 default) + └─> Kubelet walks every file on NFS setting gid=1000 + └─> Over a 1GB audit log and a 47MB raft.db on NFS with timeo=30,retrans=3, + each chown syscall takes seconds; kubelet 2-minute deadline runs out + before the walk finishes + └─> Loop never exits even when ownership is already correct +``` + +## Why This Failed + +1. **Raft transport does not detect stuck leaders.** If TCP is open and the process is alive enough to hold the port, standbys assume the leader is healthy. A stuck goroutine that never responds to RPCs appears to raft as "leader with high RTT" and does not trigger re-election. This is an upstream Vault bug (or at least a missing liveness check). + +2. **Abrupt pod termination + NFS = kernel-level zombie.** When a Vault pod holding an NFS mount is force-killed before it cleanly closes file handles, the kernel's NFS4.2 client session state enters a corrupted state. This blocks all new mounts from that node — not just to the same NFS path, but to ANY NFS path on the same server. The fix is a kernel reboot; there is no userspace recovery. + +3. **Vault data on NFS violates the documented rule.** `infra/.claude/CLAUDE.md` explicitly states: *"Critical services MUST NOT use NFS storage — circular dependency risk."* Vault currently uses `nfs-proxmox` for both `dataStorage` and `auditStorage`. If Vault had been on `proxmox-lvm-encrypted`, none of the NFS corruption cascade would have happened. + +4. 
**fsGroupChangePolicy: Always is the Helm default.** Every pod restart walks every file over NFS. On a 1GB audit log with degraded NFS RTT, this takes longer than kubelet's internal 2-minute deadline, causing infinite restart loops. `OnRootMismatch` makes chown a no-op when the root is already correct (which it always is after first setup). + +5. **No alert for this failure mode.** Prometheus alerts exist for `VaultSealed`, `VaultDown` (`up` metric), and backup staleness, but none for "raft leader has been running without advancing commit index" or "standby reports leader but leader's `/sys/ha-status` returns 500". + +## Remediation (Applied) + +- [x] Hard-reset node2 and node3 VMs to clear kernel NFS state and kubelet zombies. +- [x] Manually patched live `StatefulSet vault/vault` with `fsGroupChangePolicy: OnRootMismatch` to stop the chown loop. +- [x] Lazy-unmounted stale NFS mounts from force-deleted pod directories on node2 and node3. +- [x] Removed stale kubelet pod directories (`/var/lib/kubelet/pods/`) that had 0000-mode mount subdirectories blocking teardown. +- [x] Updated `stacks/vault/main.tf` with the `fsGroupChangePolicy` setting so the next `scripts/tg apply vault` makes it durable. + +## Remediation (Pending) + +- [ ] **Hard-reset node4** to recover vault-0 (same NFS kernel corruption pattern). +- [ ] **Run `scripts/tg apply` on the vault stack** to persist the fsGroupChangePolicy change. +- [ ] **Add Prometheus alert `VaultRaftLeaderStuck`** — fire when `vault_raft_last_index_gauge` (or derivation from `vault_runtime_total_gc_runs`) stops advancing for >2 minutes while `vault_core_active` is 1. +- [ ] **Add Prometheus alert `VaultHAStatusUnavailable`** — fire when `vault_core_active{}` reports 0 across all pods but `up{job="vault"}` reports 1 (HA layer broken but pods alive). +- [ ] **Migrate Vault to `proxmox-lvm-encrypted` block storage** — eliminates the entire NFS failure class. This follows the rule already documented in `infra/.claude/CLAUDE.md`. Tracked as beads task (open after Dolt is back up; currently down on node4). +- [ ] **Consider raising kubelet volume-manager deadline** for large-volume chown scenarios, or document the `fsGroupChangePolicy: OnRootMismatch` requirement for all NFS-backed StatefulSets. +- [ ] **Runbook**: `docs/runbooks/vault-raft-leader-deadlock.md` — how to detect stuck leader, safe force-restart procedure that avoids zombie pods, NFS kernel state recovery. + +## Contributing Factors + +1. **NFS mount options use bare `nfsvers=4`**. This negotiates to the highest version the server supports (NFSv4.2). When 4.2 session state corrupts, mounts fail; 4.1 works. Pinning to `nfsvers=4.1` in the `nfs-proxmox` StorageClass would make the failure mode recoverable without node reboot, but would also require recreating 48+ existing PVs (volumeAttributes are immutable). Deferred. + +2. **`kubectl delete --force` is the default for stuck pods**. Operators reach for force-delete when a pod won't terminate, but this leaves containerd in an inconsistent state when the underlying storage is hung. Better approach: identify the stuck process (typically `mount.nfs` or a kernel NFS callback) and fix the root cause before force-deleting. + +3. **Beads / Dolt server was on node4**, so beads task tracking went offline during this incident and couldn't be used to log progress cross-session. + +4. 
**node1 was cordoned mid-incident** to prevent rescheduling to a node with confirmed NFS issues, but this reduced the scheduling surface for anti-affinity-sensitive StatefulSets. + +## Learnings + +1. **NFS for stateful critical services is structurally unsafe.** When NFS breaks, the recovery involves killing pods → which can break NFS further → until a reboot. The rule exists for a reason; Vault should never have been on NFS. + +2. **Raft liveness needs application-layer probing, not TCP.** Every time we've seen a "stuck leader" issue in the homelab, TCP was fine and the app was unresponsive. A lightweight RPC probe with a short timeout and Prometheus alert would catch this in minutes instead of hours. + +3. **kubelet volume-manager is fragile against stuck NFS.** Once kubelet enters a chown loop with a context deadline shorter than the chown duration, it cannot make progress — even when the filesystem is otherwise healthy. `OnRootMismatch` is effectively mandatory for any pod with `fsGroup` and a volume >100MB. + +4. **VM hard-reset is cheap but disruptive.** The two reboots took ~60 seconds each but evicted 22+44 = 66 pods. Doing this twice in one session is a lot of churn. A post-mortem-driven improvement: pre-prepare "hot-standby" capacity so we can cordon+drain instead of hard-reset when kubelet zombies appear. + +5. **Documentation of this rule is worth more than the rule itself.** The CLAUDE.md already says "critical services must not use NFS". The vault stack violates it. The rule without enforcement (validation, linting, CI) is ignored during the rush to ship. + +## References + +- Related: `docs/post-mortems/2026-04-14-nfs-fsid0-dns-vault-outage.md` — previous Vault+NFS incident (different root cause, similar blast pattern). +- Vault helm chart 0.29.1 default `fsGroupChangePolicy` is unset (behaves as `Always`). +- Upstream Vault HA layer: raft leader → vault-active transition is in `vault/external_tests/raft`. Stuck goroutine pattern not documented as a known issue. diff --git a/docs/runbooks/vault-raft-leader-deadlock.md b/docs/runbooks/vault-raft-leader-deadlock.md new file mode 100644 index 00000000..5b4f1ece --- /dev/null +++ b/docs/runbooks/vault-raft-leader-deadlock.md @@ -0,0 +1,217 @@ +# Runbook: Vault Raft Leader Deadlock + Safe Pod Restart + +Captures the 2026-04-22 incident pattern. When a Vault raft leader enters a +stuck goroutine state (port 8201 accepts TCP but RPCs never return), the +recovery is *not* `kubectl delete --force`. Force-deleting a Vault pod that +holds a stuck NFS mount leaves kernel NFS client state corrupted, which +blocks all subsequent NFS mounts from the node and usually requires a VM +hard-reset to clear. + +**Related**: [post-mortems/2026-04-22-vault-raft-leader-deadlock.md](../post-mortems/2026-04-22-vault-raft-leader-deadlock.md). + +## Symptoms + +- `https://vault.viktorbarzin.me/v1/sys/health` returns HTTP 503. +- Standbys log `msgpack decode error [pos 0]: i/o timeout` every 2s. +- `kubectl exec` into a standby shows raft thinks the leader is alive + (peers list all `Voter`, leader address populated) but `vault operator + raft autopilot state` stalls or errors. +- The "leader" pod's logs go silent — no heartbeats, no audit writes, + nothing. TCP on 8201 still accepts connections. +- ESO-backed secrets stop refreshing (ExternalSecret `SecretSyncedError`). +- Woodpecker CI pipelines that read from Vault at plan time hang. + +## 0. Confirm the diagnosis (before touching anything) + +Don't jump to force-delete. 
Verify the leader is actually stuck, not just +slow: + +```sh +# 1. Who does raft think the leader is? +kubectl exec -n vault vault-0 -c vault -- vault status 2>&1 | \ + grep -E 'HA Mode|Active Node|Leader|Raft' + +# 2. Is the leader's port open but unresponsive? +LEADER_POD=vault-2 # or whichever vault status reports +kubectl exec -n vault $LEADER_POD -c vault -- sh -c \ + 'timeout 3 nc -zv 127.0.0.1 8200 2>&1; echo; timeout 3 vault status' + +# 3. Is the active vault service pointing at a real pod? +kubectl get endpoints -n vault vault-active -o yaml | \ + grep -E 'addresses|notReadyAddresses' -A2 + +# 4. What do standby logs say? +kubectl logs -n vault vault-0 -c vault --tail=40 | grep -iE 'msgpack|decode|rpc' +``` + +If (2) hangs and (4) shows repeated msgpack errors → stuck leader. + +## 1. Identify the stuck pod precisely + +```sh +# Find the pod whose vault_core_active would be 1 if it were scraping +# (currently no telemetry — use logs as proxy until telemetry is enabled). +for p in vault-0 vault-1 vault-2; do + echo "=== $p ===" + kubectl logs -n vault $p -c vault --tail=5 2>&1 | head -5 +done | grep -B1 'no recent output' +``` + +The pod whose logs have been silent for minutes while the others are +actively erroring is the stuck leader. + +## 2. The safe restart sequence (avoids zombie containers) + +**DO NOT** `kubectl delete pod --force --grace-period=0` as the first +step. On NFS-backed Vault that's the exact move that leaves the kernel +NFS client corrupted on the node where the stuck pod ran. + +Instead: + +### 2a. Graceful delete first (30s grace) + +```sh +kubectl delete pod -n vault vault-2 +``` + +Wait 30 seconds. Most of the time the TERM → SIGKILL path works and the +new pod schedules cleanly. The remaining leaders re-elect and the external +endpoint recovers. + +### 2b. If the pod is Terminating after 60s, find the stuck process + +```sh +NODE=$(kubectl get pod -n vault vault-2- -o jsonpath='{.spec.nodeName}') +POD_UID=$(kubectl get pod -n vault vault-2- -o jsonpath='{.metadata.uid}') + +ssh $NODE "sudo ps auxf | grep -A2 $POD_UID | head -20" +# Look for: mount.nfs (D-state), vault (Z-state), or the sh wrapper in do_wait +``` + +### 2c. Unmount stale NFS before force-deleting + +If the old pod's NFS mount is still present, lazy-unmount it FIRST so +the kernel can release NFS session state cleanly: + +```sh +ssh $NODE "sudo mount | grep $POD_UID | awk '{print \$3}' | xargs -I{} sudo umount -l {}" +``` + +Verify no mount.nfs processes are in D-state on the node: + +```sh +ssh $NODE "ps -eo state,pid,comm | grep '^D' | head -5" +``` + +### 2d. Only NOW force-delete if needed + +```sh +kubectl delete pod -n vault vault-2- --force --grace-period=0 +``` + +## 3. Recovery when the node is already stuck + +If you force-deleted before reading this runbook and NFS is now broken +on the node: + +**Diagnostic — confirm NFS client state is corrupted:** + +```sh +NODE=k8s-node2 # node where the force-delete happened +ssh $NODE "sudo mkdir -p /tmp/nfstest && sudo timeout 30 \ + mount -t nfs 192.168.1.127:/srv/nfs /tmp/nfstest && echo MOUNT_OK" +``` + +If the mount times out at 30-110s, kernel NFS client state is stuck. +No userspace recovery exists — only a VM reboot clears it. + +**Workaround before rebooting**: mounting with `nfsvers=4.1` succeeds +on broken nodes (the corruption is NFSv4.2 session-state specific). +This is useful for diagnostic mounts, but does NOT fix CSI pods — +their mount options come from the `nfs-proxmox` StorageClass and can't +be overridden per-pod. 
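+
+As a concrete check — a sketch only, reusing the `$NODE` and `/tmp/nfstest` mount point
+from the diagnostic above — comparing the default-negotiated mount against an explicit
+4.1 mount confirms the corruption is version-specific:
+
+```sh
+# Default negotiation (v4.2 on this server) — hangs/times out on a corrupted node
+ssh $NODE "sudo timeout 30 mount -t nfs 192.168.1.127:/srv/nfs /tmp/nfstest || echo DEFAULT_MOUNT_FAILED"
+
+# Explicit 4.1 — expected to succeed even while the 4.2 session state is stuck
+ssh $NODE "sudo timeout 30 mount -t nfs -o nfsvers=4.1 192.168.1.127:/srv/nfs /tmp/nfstest && echo V41_MOUNT_OK"
+
+# Clean up the diagnostic mount afterwards
+ssh $NODE "sudo umount -l /tmp/nfstest 2>/dev/null || true"
+```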
+ +**Reboot the affected node VM:** + +```sh +# Find PVE VM ID — nodes numbered 201-204 for k8s-node1..4 +ssh root@192.168.1.127 "qm reset 20" + +# If qm reset leaves the VM PID unchanged (it didn't actually reboot), +# use qm stop/start: +ssh root@192.168.1.127 "qm stop 20 && qm start 20" +``` + +Wait for the node to become Ready (`kubectl get node k8s-node -w`) +and CSI driver to register (`kubectl get pods -n nfs-csi -o wide`). + +**Gotcha — `qm reset` can be a no-op.** On the 2026-04-22 incident, +`qm reset 201` returned exit 0 but did NOT restart the VM (same QEMU PID +before and after). `qm status` reported "running" throughout. Always +verify by checking the QEMU PID or VM uptime post-reset. If uptime is +unchanged, escalate to `qm stop && qm start`. + +**Gotcha — check boot order before stop/start.** Long-running VMs +(630+ day uptime) may have stale `bootdisk:` config that's been hidden +by never rebooting. On 2026-04-22, k8s-node1's config had `bootdisk: +scsi0` but the actual OS disk was on `scsi1`, so the first boot after +stop attempted iPXE and failed. Before stopping, verify: + +```sh +ssh root@192.168.1.127 "grep -E 'boot|scsi[0-9]+:' /etc/pve/qemu-server/20.conf" +``` + +If `bootdisk` references a disk ID that doesn't exist, fix it first +with `qm set 20 --boot "order=scsi"` (use the ID of the main +OS disk). + +## 4. Prevent re-infection — the chown loop + +After the node comes back, the vault pod's PV chown walk can still +peg kubelet. The durable fix is in `stacks/vault/main.tf`: + +```hcl +statefulSet = { + securityContext = { + pod = { + fsGroupChangePolicy = "OnRootMismatch" + } + } +} +``` + +This was applied in commit `2f1f9107` (2026-04-22). If you find +yourself editing this in a kubectl patch for live recovery, follow +up with a Terraform apply the same session — leaving the cluster +ahead of Terraform state is technical debt that re-triggers on the +next apply. + +## 5. Verify end-to-end + +```sh +# External endpoint — the user-facing health check +curl -sk -o /dev/null -w "%{http_code}\n" https://vault.viktorbarzin.me/v1/sys/health +# expect: 200 + +# Raft peers (needs VAULT_TOKEN with operator capability) +kubectl exec -n vault vault-0 -c vault -- vault operator raft list-peers + +# All pods 2/2 +kubectl get pods -n vault -l app.kubernetes.io/name=vault -o wide + +# No alerts fired (once VaultRaftLeaderStuck + VaultHAStatusUnavailable are live) +curl -s https://alertmanager.viktorbarzin.me/api/v2/alerts | \ + jq '.[] | select(.labels.alertname | test("Vault"))' +``` + +## Known limitations + +- **No alert for stuck leaders yet.** `VaultRaftLeaderStuck` and + `VaultHAStatusUnavailable` require Vault telemetry enabled + (`telemetry { unauthenticated_metrics_access = true }`) and a + scrape job. Alerts are defined in `prometheus_chart_values.tpl` + but stay silent until telemetry lands — tracked as a beads task. +- **Vault on NFS violates the documented rule.** `infra/.claude/CLAUDE.md` + says critical services must use `proxmox-lvm-encrypted`. The + `dataStorage`/`auditStorage` still use `nfs-proxmox`. Migration + tracked as an epic-level beads task. 
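+
+Once telemetry does land, the candidate expressions for `VaultRaftLeaderStuck` and
+`VaultHAStatusUnavailable` can be smoke-tested against the Prometheus HTTP API before
+wiring them into `prometheus_chart_values.tpl`. This is a sketch under assumptions:
+the Prometheus URL below and the exact metric names (`vault_raft_last_index_gauge`,
+`vault_core_active`) are not confirmed until the Vault scrape job exists.
+
+```sh
+PROM=https://prometheus.viktorbarzin.me   # assumed internal Prometheus endpoint
+
+# VaultRaftLeaderStuck: node claims active but the raft index has not moved in 2m
+curl -sG "$PROM/api/v1/query" \
+  --data-urlencode 'query=(vault_core_active == 1) and (delta(vault_raft_last_index_gauge[2m]) == 0)' \
+  | jq '.data.result'
+
+# VaultHAStatusUnavailable: HA layer reports no active node while all pods are up
+curl -sG "$PROM/api/v1/query" \
+  --data-urlencode 'query=(sum(vault_core_active) == 0) and (min(up{job="vault"}) == 1)' \
+  | jq '.data.result'
+```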
diff --git a/scripts/cluster_healthcheck.sh b/scripts/cluster_healthcheck.sh index 997c0b7d..b5237378 100755 --- a/scripts/cluster_healthcheck.sh +++ b/scripts/cluster_healthcheck.sh @@ -1242,9 +1242,17 @@ check_overcommit() { HA_CACHE_DIR="" ha_sofia_available() { - if [[ -z "${HOME_ASSISTANT_SOFIA_URL:-}" ]] || [[ -z "${HOME_ASSISTANT_SOFIA_TOKEN:-}" ]]; then - return 1 + if [[ -z "${HOME_ASSISTANT_SOFIA_URL:-}" ]]; then + export HOME_ASSISTANT_SOFIA_URL="https://ha-sofia.viktorbarzin.me" fi + if [[ -z "${HOME_ASSISTANT_SOFIA_TOKEN:-}" ]]; then + if command -v vault >/dev/null 2>&1 && [[ -n "${VAULT_TOKEN:-}${HOME:-}" ]]; then + local t + t=$(vault kv get -field=haos_api_token secret/viktor 2>/dev/null || true) + [[ -n "$t" ]] && export HOME_ASSISTANT_SOFIA_TOKEN="$t" + fi + fi + [[ -n "${HOME_ASSISTANT_SOFIA_TOKEN:-}" ]] || return 1 return 0 } @@ -1752,14 +1760,25 @@ else: json_add "hardware_exporters" "$status" "${detail:-All healthy}" } +# Returns 0 if cert-manager CRDs are installed, 1 otherwise. +cert_manager_installed() { + $KUBECTL get crd certificates.cert-manager.io -o name >/dev/null 2>&1 +} + # --- 31. cert-manager: Certificate Readiness --- check_cert_manager_certificates() { section 31 "cert-manager — Certificate Readiness" local certs not_ready detail="" status="PASS" + if ! cert_manager_installed; then + pass "cert-manager not installed — N/A" + json_add "certmanager_certificates" "PASS" "N/A (cert-manager not installed)" + return 0 + fi + certs=$($KUBECTL get certificates.cert-manager.io -A -o json 2>/dev/null) || { - warn "cert-manager CRDs not installed or inaccessible" - json_add "certmanager_certificates" "WARN" "CRDs unavailable" + warn "cert-manager CRDs installed but API query failed" + json_add "certmanager_certificates" "WARN" "API query failed" return 0 } @@ -1797,9 +1816,15 @@ check_cert_manager_expiry() { section 32 "cert-manager — Certificate Expiry (<14d)" local certs expiring detail="" status="PASS" + if ! cert_manager_installed; then + pass "cert-manager not installed — N/A" + json_add "certmanager_expiry" "PASS" "N/A (cert-manager not installed)" + return 0 + fi + certs=$($KUBECTL get certificates.cert-manager.io -A -o json 2>/dev/null) || { - warn "cert-manager CRDs not installed or inaccessible" - json_add "certmanager_expiry" "WARN" "CRDs unavailable" + warn "cert-manager CRDs installed but API query failed" + json_add "certmanager_expiry" "WARN" "API query failed" return 0 } @@ -1852,9 +1877,15 @@ check_cert_manager_requests() { section 33 "cert-manager — Failed CertificateRequests" local requests failed detail="" status="PASS" + if ! 
cert_manager_installed; then + pass "cert-manager not installed — N/A" + json_add "certmanager_requests" "PASS" "N/A (cert-manager not installed)" + return 0 + fi + requests=$($KUBECTL get certificaterequests.cert-manager.io -A -o json 2>/dev/null) || { - warn "cert-manager CRDs not installed or inaccessible" - json_add "certmanager_requests" "WARN" "CRDs unavailable" + warn "cert-manager CRDs installed but API query failed" + json_add "certmanager_requests" "WARN" "API query failed" return 0 } @@ -1998,7 +2029,7 @@ check_backup_lvm_snapshots() { local snap_output detail="" status="PASS" snap_output=$(ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no \ - root@192.168.1.127 "lvs -o lv_name,lv_time --noheadings 2>/dev/null | grep -- -snap" 2>/dev/null || true) + root@192.168.1.127 "lvs -o lv_name,lv_time --noheadings 2>/dev/null | grep _snap" 2>/dev/null || true) if [[ -z "$snap_output" ]]; then [[ "$QUIET" == true ]] && section_always 36 "Backup Freshness — LVM PVC Snapshots" diff --git a/scripts/lvm-pvc-snapshot.sh b/scripts/lvm-pvc-snapshot.sh new file mode 100755 index 00000000..6ec5dc34 --- /dev/null +++ b/scripts/lvm-pvc-snapshot.sh @@ -0,0 +1,469 @@ +#!/usr/bin/env bash +# lvm-pvc-snapshot — LVM thin snapshot management for Proxmox CSI PVCs +# Deploy to PVE host at /usr/local/bin/lvm-pvc-snapshot +set -euo pipefail + +# --- Configuration --- +VG="pve" +THINPOOL="data" +SNAP_SUFFIX_FORMAT="%Y%m%d_%H%M" +RETENTION_DAYS=7 +MIN_FREE_PCT=10 +PUSHGATEWAY="${LVM_SNAP_PUSHGATEWAY:-http://10.0.20.100:30091}" +PUSHGATEWAY_JOB="lvm-pvc-snapshot" +LOCKFILE="/run/lvm-pvc-snapshot.lock" +KUBECONFIG="${KUBECONFIG:-/root/.kube/config}" +export KUBECONFIG + +# Namespaces to exclude from snapshots (high-churn, have app-level dumps) +# These PVCs cause significant CoW write amplification (~36% overhead) +EXCLUDE_NAMESPACES="${LVM_SNAP_EXCLUDE_NS:-dbaas,monitoring}" + +# --- Logging --- +log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; } +warn() { log "WARN: $*" >&2; } +die() { log "FATAL: $*" >&2; exit 1; } + +# --- Helpers --- + +get_thinpool_free_pct() { + local data_pct + data_pct=$(lvs --noheadings --nosuffix -o data_percent "${VG}/${THINPOOL}" 2>/dev/null | tr -d ' ') + echo "scale=2; 100 - ${data_pct}" | bc +} + +build_exclude_lv_list() { + # Query K8s for PVs in excluded namespaces, extract their LV names + if [[ -z "${EXCLUDE_NAMESPACES}" ]] || ! 
command -v kubectl &>/dev/null; then
+    return
+  fi
+  kubectl get pv -o json 2>/dev/null | jq -r --arg ns "${EXCLUDE_NAMESPACES}" '
+    ($ns | split(",")) as $excl |
+    .items[] |
+    select(.spec.csi.driver == "csi.proxmox.sinextra.dev") |
+    select(.spec.claimRef.namespace as $n | $excl | index($n)) |
+    .spec.csi.volumeHandle | split("/") | last
+  ' 2>/dev/null || true
+}
+
+discover_pvc_lvs() {
+  # List thin LVs matching PVC pattern, excluding snapshots, pre-restore backups,
+  # and LVs belonging to excluded namespaces (high-churn databases/metrics)
+  local all_lvs exclude_lvs
+  all_lvs=$(lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \
+    | awk -v pool="${THINPOOL}" '$2 == pool { print $1 }' \
+    | grep -E '^vm-[0-9]+-pvc-' \
+    | grep -v '_snap_' \
+    | grep -v '_pre_restore_')
+
+  exclude_lvs=$(build_exclude_lv_list)
+
+  if [[ -n "${exclude_lvs}" ]]; then
+    # Filter out excluded LVs
+    local exclude_pattern
+    exclude_pattern=$(echo "${exclude_lvs}" | paste -sd'|' -)
+    echo "${all_lvs}" | grep -vE "(${exclude_pattern})" || true
+  else
+    echo "${all_lvs}"
+  fi
+}
+
+list_snapshots() {
+  lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \
+    | awk -v pool="${THINPOOL}" '$2 == pool { print $1 }' \
+    | grep '_snap_' || true
+}
+
+parse_snap_timestamp() {
+  # Extract YYYYMMDD_HHMM from snapshot name, convert to epoch
+  local snap_name="$1"
+  local ts_str
+  ts_str=$(echo "${snap_name}" | grep -oE '[0-9]{8}_[0-9]{4}$')
+  if [[ -z "${ts_str}" ]]; then
+    echo "0"
+    return
+  fi
+  local ymd="${ts_str:0:8}"
+  local hm="${ts_str:9:4}"
+  date -d "${ymd:0:4}-${ymd:4:2}-${ymd:6:2} ${hm:0:2}:${hm:2:2}" +%s 2>/dev/null || echo "0"
+}
+
+get_original_lv_from_snap() {
+  # vm-200-pvc-abc_snap_20260403_1200 -> vm-200-pvc-abc
+  echo "$1" | sed 's/_snap_[0-9]\{8\}_[0-9]\{4\}$//'
+}
+
+push_metrics() {
+  local status="$1" created="$2" failed="$3" pruned="$4"
+  local free_pct
+  free_pct=$(get_thinpool_free_pct)
+
+  # Push run metrics in Prometheus text format to the Pushgateway configured above
+  cat <<METRICS | curl -s --data-binary @- "${PUSHGATEWAY}/metrics/job/${PUSHGATEWAY_JOB}" >/dev/null || warn "Failed to push metrics to Pushgateway"
+# HELP lvm_snapshot_last_run_timestamp Unix timestamp of last snapshot run
+# TYPE lvm_snapshot_last_run_timestamp gauge
+lvm_snapshot_last_run_timestamp $(date +%s)
+# HELP lvm_snapshot_last_status Exit status (0=success, 1=partial failure, 2=aborted)
+# TYPE lvm_snapshot_last_status gauge
+lvm_snapshot_last_status ${status}
+# HELP lvm_snapshot_created_total Number of snapshots created in last run
+# TYPE lvm_snapshot_created_total gauge
+lvm_snapshot_created_total ${created}
+# HELP lvm_snapshot_failed_total Number of snapshot failures in last run
+# TYPE lvm_snapshot_failed_total gauge
+lvm_snapshot_failed_total ${failed}
+# HELP lvm_snapshot_pruned_total Number of snapshots pruned in last run
+# TYPE lvm_snapshot_pruned_total gauge
+lvm_snapshot_pruned_total ${pruned}
+# HELP lvm_snapshot_thinpool_free_pct Thin pool free percentage
+# TYPE lvm_snapshot_thinpool_free_pct gauge
+lvm_snapshot_thinpool_free_pct ${free_pct}
+METRICS
+}
+
+# --- Subcommands ---
+
+cmd_snapshot() {
+  log "Starting PVC LVM thin snapshot run"
+
+  # Check thin pool free space
+  local free_pct
+  free_pct=$(get_thinpool_free_pct)
+  log "Thin pool free space: ${free_pct}%"
+  if (( $(echo "${free_pct} < ${MIN_FREE_PCT}" | bc -l) )); then
+    warn "Thin pool has only ${free_pct}% free (minimum: ${MIN_FREE_PCT}%). Aborting."
+ push_metrics 2 0 0 0 + exit 1 + fi + + # Discover PVC LVs + local lvs_list + lvs_list=$(discover_pvc_lvs) + if [[ -z "${lvs_list}" ]]; then + warn "No PVC LVs found matching pattern" + push_metrics 2 0 0 0 + exit 1 + fi + + local count=0 failed=0 total + total=$(echo "${lvs_list}" | wc -l | tr -d ' ') + local snap_ts + snap_ts=$(date +"${SNAP_SUFFIX_FORMAT}") + + log "Found ${total} PVC LVs to snapshot" + + while IFS= read -r lv; do + local snap_name="${lv}_snap_${snap_ts}" + if lvcreate -s -kn -n "${snap_name}" "${VG}/${lv}" >/dev/null 2>&1; then + log " Created: ${snap_name}" + count=$((count + 1)) + else + warn " Failed to create snapshot for ${lv}" + failed=$((failed + 1)) + fi + done <<< "${lvs_list}" + + log "Snapshot run complete: ${count} created, ${failed} failed out of ${total}" + + # Auto-prune + log "Running auto-prune..." + local pruned + pruned=$(cmd_prune_count) + + # Determine status + local status=0 + if (( failed > 0 && count > 0 )); then + status=1 # partial + elif (( failed > 0 && count == 0 )); then + status=2 # all failed + fi + + push_metrics "${status}" "${count}" "${failed}" "${pruned}" + log "Done" +} + +cmd_list() { + printf "%-45s %-50s %8s %8s\n" "ORIGINAL LV" "SNAPSHOT" "AGE" "DATA%" + printf "%-45s %-50s %8s %8s\n" "-----------" "--------" "---" "-----" + + local now + now=$(date +%s) + + local snap_lines + snap_lines=$(lvs --noheadings --nosuffix -o lv_name,lv_size,data_percent "${VG}" 2>/dev/null \ + | grep -E '_snap_|_pre_restore_' || true) + + if [[ -z "${snap_lines}" ]]; then + echo "(no snapshots found)" + return + fi + + echo "${snap_lines}" | while read -r name size data_pct; do + local original age_str ts epoch + if [[ "${name}" == *"_pre_restore_"* ]]; then + original=$(echo "${name}" | sed 's/_pre_restore_[0-9]\{8\}_[0-9]\{4\}$//') + ts=$(echo "${name}" | grep -oE '[0-9]{8}_[0-9]{4}$') + else + original=$(get_original_lv_from_snap "${name}") + ts=$(echo "${name}" | grep -oE '[0-9]{8}_[0-9]{4}$') + fi + epoch=$(parse_snap_timestamp "${name}") + if (( epoch > 0 )); then + local age_s=$(( now - epoch )) + local days=$(( age_s / 86400 )) + local hours=$(( (age_s % 86400) / 3600 )) + age_str="${days}d${hours}h" + else + age_str="unknown" + fi + printf "%-45s %-50s %8s %7s%%\n" "${original}" "${name}" "${age_str}" "${data_pct}" + done +} + +cmd_prune() { + local pruned + pruned=$(cmd_prune_count) + log "Pruned ${pruned} expired snapshots" +} + +cmd_prune_count() { + # NOTE: stdout of this function is captured by callers (`pruned=$(cmd_prune_count)`), + # so all log/warn output must go to stderr — the only thing on stdout is the count. + local now cutoff pruned=0 + now=$(date +%s) + cutoff=$(( now - RETENTION_DAYS * 86400 )) + + local snaps + snaps=$(lvs --noheadings -o lv_name,pool_lv "${VG}" 2>/dev/null \ + | awk -v pool="${THINPOOL}" '$2 == pool { print $1 }' \ + | grep -E '_snap_|_pre_restore_' || true) + + if [[ -z "${snaps}" ]]; then + echo "0" + return + fi + + while IFS= read -r snap; do + local epoch + epoch=$(parse_snap_timestamp "${snap}") + if (( epoch > 0 && epoch < cutoff )); then + if lvremove -f "${VG}/${snap}" >/dev/null 2>&1; then + log " Pruned: ${snap}" >&2 + pruned=$((pruned + 1)) + else + warn " Failed to prune: ${snap}" + fi + fi + done <<< "${snaps}" + + echo "${pruned}" +} + +cmd_restore() { + local pvc_lv="${1:-}" snapshot_lv="${2:-}" + + if [[ -z "${pvc_lv}" || -z "${snapshot_lv}" ]]; then + die "Usage: $0 restore " + fi + + # Validate LVs exist + if ! 
lvs "${VG}/${pvc_lv}" >/dev/null 2>&1; then + die "PVC LV '${pvc_lv}' not found in VG '${VG}'" + fi + if ! lvs "${VG}/${snapshot_lv}" >/dev/null 2>&1; then + die "Snapshot LV '${snapshot_lv}' not found in VG '${VG}'" + fi + + # Discover K8s context + log "Discovering Kubernetes context for LV '${pvc_lv}'..." + + local volume_handle="local-lvm:${pvc_lv}" + local pv_info + pv_info=$(kubectl get pv -o json 2>/dev/null | jq -r \ + --arg vh "${volume_handle}" \ + '.items[] | select(.spec.csi.volumeHandle == $vh) | "\(.metadata.name) \(.spec.claimRef.namespace) \(.spec.claimRef.name)"' \ + ) || die "Failed to query PVs (is kubectl configured?)" + + if [[ -z "${pv_info}" ]]; then + die "No PV found with volumeHandle '${volume_handle}'" + fi + + local pv_name pvc_ns pvc_name + read -r pv_name pvc_ns pvc_name <<< "${pv_info}" + log "Found: PV=${pv_name}, PVC=${pvc_ns}/${pvc_name}" + + # Find the workload (Deployment or StatefulSet) that uses this PVC + local workload_type="" workload_name="" original_replicas="" + + # Check StatefulSets first (databases use these) + local sts_info + sts_info=$(kubectl get statefulset -n "${pvc_ns}" -o json 2>/dev/null | jq -r \ + --arg pvc "${pvc_name}" \ + '.items[] | select( + (.spec.template.spec.volumes // [] | .[].persistentVolumeClaim.claimName == $pvc) or + (.spec.volumeClaimTemplates // [] | .[].metadata.name as $vct | + .spec.replicas as $r | range($r) | "\($vct)-\(.metadata.name)-\(.)" ) == $pvc + ) | "\(.metadata.name) \(.spec.replicas)"' 2>/dev/null \ + ) || true + + # If not found via simple volume check, try matching VCT naming pattern + if [[ -z "${sts_info}" ]]; then + sts_info=$(kubectl get statefulset -n "${pvc_ns}" -o json 2>/dev/null | jq -r \ + --arg pvc "${pvc_name}" \ + '.items[] | .metadata.name as $sts | .spec.replicas as $r | + select(.spec.volumeClaimTemplates != null) | + .spec.volumeClaimTemplates[].metadata.name as $vct | + [range($r)] | map("\($vct)-\($sts)-\(.)") | + if any(. == $pvc) then "\($sts) \($r)" else empty end' 2>/dev/null \ + ) || true + fi + + if [[ -n "${sts_info}" ]]; then + read -r workload_name original_replicas <<< "${sts_info}" + workload_type="statefulset" + else + # Check Deployments + local deploy_info + deploy_info=$(kubectl get deployment -n "${pvc_ns}" -o json 2>/dev/null | jq -r \ + --arg pvc "${pvc_name}" \ + '.items[] | select( + .spec.template.spec.volumes // [] | .[].persistentVolumeClaim.claimName == $pvc + ) | "\(.metadata.name) \(.spec.replicas)"' 2>/dev/null \ + ) || true + + if [[ -n "${deploy_info}" ]]; then + read -r workload_name original_replicas <<< "${deploy_info}" + workload_type="deployment" + fi + fi + + if [[ -z "${workload_type}" ]]; then + warn "Could not auto-discover workload for PVC '${pvc_name}' in namespace '${pvc_ns}'." + warn "You may need to scale down the pod manually." + echo "" + read -rp "Continue with LV swap anyway? 
(yes/no): " confirm + [[ "${confirm}" == "yes" ]] || die "Aborted by user" + workload_type="manual" + fi + + # Dry-run output + local backup_name="${pvc_lv}_pre_restore_$(date +"${SNAP_SUFFIX_FORMAT}")" + echo "" + echo "╔══════════════════════════════════════════════════════════════╗" + echo "║ RESTORE DRY-RUN ║" + echo "╠══════════════════════════════════════════════════════════════╣" + echo "║ PVC: ${pvc_ns}/${pvc_name}" + echo "║ PV: ${pv_name}" + if [[ "${workload_type}" != "manual" ]]; then + echo "║ Workload: ${workload_type}/${workload_name} (replicas: ${original_replicas}→0→${original_replicas})" + fi + echo "║" + echo "║ Actions:" + if [[ "${workload_type}" != "manual" ]]; then + echo "║ 1. Scale ${workload_type}/${workload_name} to 0 replicas" + echo "║ 2. Wait for pod termination" + fi + echo "║ 3. Rename ${pvc_lv} → ${backup_name}" + echo "║ 4. Rename ${snapshot_lv} → ${pvc_lv}" + if [[ "${workload_type}" != "manual" ]]; then + echo "║ 5. Scale ${workload_type}/${workload_name} back to ${original_replicas} replicas" + fi + echo "╚══════════════════════════════════════════════════════════════╝" + echo "" + + # Interactive confirmation + read -rp "Type 'yes' to proceed with restore: " confirm + if [[ "${confirm}" != "yes" ]]; then + die "Aborted by user" + fi + + # Scale down + if [[ "${workload_type}" != "manual" ]]; then + log "Scaling ${workload_type}/${workload_name} to 0 replicas..." + kubectl scale "${workload_type}/${workload_name}" -n "${pvc_ns}" --replicas=0 + + log "Waiting for pod termination (timeout: 120s)..." + kubectl wait --for=delete pod -l "app.kubernetes.io/name=${workload_name}" -n "${pvc_ns}" --timeout=120s 2>/dev/null || \ + kubectl wait --for=delete pod -l "app=${workload_name}" -n "${pvc_ns}" --timeout=120s 2>/dev/null || \ + warn "Timeout waiting for pods — continuing anyway (LV may still be in use)" + sleep 5 # extra grace period for device detach + fi + + # Verify LV is not active + local lv_active + lv_active=$(lvs --noheadings -o lv_active "${VG}/${pvc_lv}" 2>/dev/null | tr -d ' ') + if [[ "${lv_active}" == "active" ]]; then + warn "LV ${pvc_lv} is still active. Attempting to deactivate..." + # Close any LUKS mapper on the LV before deactivation + if dmsetup ls 2>/dev/null | grep -q "${pvc_lv}"; then + log "Closing LUKS mapper for ${pvc_lv}..." + cryptsetup luksClose "${pvc_lv}" 2>/dev/null || true + fi + lvchange -an "${VG}/${pvc_lv}" 2>/dev/null || warn "Could not deactivate — proceeding with caution" + fi + + # LV swap + log "Renaming ${pvc_lv} → ${backup_name}" + lvrename "${VG}" "${pvc_lv}" "${backup_name}" || die "Failed to rename original LV" + + log "Renaming ${snapshot_lv} → ${pvc_lv}" + lvrename "${VG}" "${snapshot_lv}" "${pvc_lv}" || die "Failed to rename snapshot LV" + + # Scale back up + if [[ "${workload_type}" != "manual" ]]; then + log "Scaling ${workload_type}/${workload_name} back to ${original_replicas} replicas..." + kubectl scale "${workload_type}/${workload_name}" -n "${pvc_ns}" --replicas="${original_replicas}" + + log "Waiting for pod to become Ready (timeout: 300s)..." + kubectl wait --for=condition=Ready pod -l "app.kubernetes.io/name=${workload_name}" -n "${pvc_ns}" --timeout=300s 2>/dev/null || \ + kubectl wait --for=condition=Ready pod -l "app=${workload_name}" -n "${pvc_ns}" --timeout=300s 2>/dev/null || \ + warn "Timeout waiting for pod Ready — check manually" + fi + + echo "" + log "Restore complete!" 
+  log "Old data preserved as: ${backup_name}"
+  log "To delete old data after verification: lvremove -f ${VG}/${backup_name}"
+}
+
+# --- Main ---
+
+usage() {
+  cat <<EOF
+Usage: $0 <command> [args]
+
+Commands:
+  snapshot    Create thin snapshots of all PVC LVs
+  list        List existing snapshots with age and data%
+  prune       Remove snapshots older than ${RETENTION_DAYS} days
+  restore     Restore a PVC from a snapshot (interactive)
+
+Environment:
+  LVM_SNAP_PUSHGATEWAY   Pushgateway URL (default: ${PUSHGATEWAY})
+  KUBECONFIG             Kubeconfig path (default: /root/.kube/config)
+EOF
+}
+
+main() {
+  local cmd="${1:-}"
+  shift || true
+
+  # Acquire lock (except for list which is read-only)
+  if [[ "${cmd}" != "list" && "${cmd}" != "" && "${cmd}" != "help" && "${cmd}" != "--help" && "${cmd}" != "-h" ]]; then
+    exec 200>"${LOCKFILE}"
+    if ! flock -n 200; then
+      die "Another instance is already running (lockfile: ${LOCKFILE})"
+    fi
+  fi
+
+  case "${cmd}" in
+    snapshot) cmd_snapshot ;;
+    list) cmd_list ;;
+    prune) cmd_prune ;;
+    restore) cmd_restore "$@" ;;
+    help|--help|-h|"") usage ;;
+    *) die "Unknown command: ${cmd}. Run '$0 help' for usage." ;;
+  esac
+}
+
+main "$@"
diff --git a/scripts/tg b/scripts/tg
index 8cb38e20..15cea845 100755
--- a/scripts/tg
+++ b/scripts/tg
@@ -72,12 +72,23 @@ if [ -n "$STACK_NAME" ]; then
 else
   # Tier 1: PG backend — fetch credentials from Vault
   if [ -z "${PG_CONN_STR:-}" ]; then
-    PG_CREDS=$(vault read -format=json database/static-creds/pg-terraform-state 2>/dev/null) || {
-      echo "ERROR: Cannot read PG credentials from Vault. Run: vault login -method=oidc" >&2
+    # Pre-flight: vault CLI must be available. Previously CI failed with a
+    # misleading "Cannot read PG credentials" message because the Alpine CI
+    # image lacked the vault binary — the 2>/dev/null below swallowed the
+    # real "vault: not found" error. Fail fast with a clear message instead.
+    if ! command -v vault >/dev/null 2>&1; then
+      echo "ERROR: vault CLI not found on PATH. Install it or use an image that includes it (ci/Dockerfile)." >&2
+      exit 1
+    fi
+    VAULT_OUT=$(vault read -format=json database/static-creds/pg-terraform-state 2>&1) || {
+      echo "ERROR: Cannot read PG credentials from Vault. Vault output follows:" >&2
+      echo "$VAULT_OUT" >&2
+      echo "" >&2
+      echo "Hint: humans run 'vault login -method=oidc'; CI auths via K8s SA (role=ci)." >&2
       exit 1
     }
-    PG_USER=$(echo "$PG_CREDS" | jq -r .data.username)
-    PG_PASS=$(echo "$PG_CREDS" | jq -r .data.password)
+    PG_USER=$(echo "$VAULT_OUT" | jq -r .data.username)
+    PG_PASS=$(echo "$VAULT_OUT" | jq -r .data.password)
     export PG_CONN_STR="postgres://${PG_USER}:${PG_PASS}@10.0.20.200:5432/terraform_state?sslmode=disable"
   fi
 fi
diff --git a/secrets/fullchain.pem b/secrets/fullchain.pem
index be5b1a00..e4bc0d60 100644
Binary files a/secrets/fullchain.pem and b/secrets/fullchain.pem differ
diff --git a/secrets/privkey.pem b/secrets/privkey.pem
index b488a0d6..1f38edfe 100644
Binary files a/secrets/privkey.pem and b/secrets/privkey.pem differ
diff --git a/stacks/beads-server/main.tf b/stacks/beads-server/main.tf
index 01f75ff4..e11b0ac7 100644
--- a/stacks/beads-server/main.tf
+++ b/stacks/beads-server/main.tf
@@ -14,7 +14,7 @@ variable "beadboard_image_tag" {
 # already ships.
 variable "claude_agent_service_image_tag" {
   type = string
-  default = "0c24c9b6"
+  default = "2fd7670d"
 }
 
 # Kill switch for auto-dispatch. When false, both CronJobs are suspended.
The diff --git a/stacks/broker-sync/main.tf b/stacks/broker-sync/main.tf index b3c71905..7c99a916 100644 --- a/stacks/broker-sync/main.tf +++ b/stacks/broker-sync/main.tf @@ -105,7 +105,7 @@ resource "kubernetes_cron_job_v1" "version_probe" { metadata {} spec { backoff_limit = 1 - ttl_seconds_after_finished = 300 + ttl_seconds_after_finished = 86400 template { metadata { labels = { app = "broker-sync", component = "version-probe" } @@ -246,7 +246,12 @@ resource "kubernetes_cron_job_v1" "imap" { concurrency_policy = "Forbid" successful_jobs_history_limit = 3 failed_jobs_history_limit = 5 - suspend = true # enable in Phase 2 + # Unsuspended 2026-04-19 for RSU vest ground-truth ingestion — the parser + # now detects Schwab Release Confirmations and scaffolds VestEvents; the + # postgres sink that persists them into payslip_ingest.rsu_vest_events is + # pending a real-email fixture and cross-service DB grant (see + # follow-up beads task filed under the RSU tax spike fix epic). + suspend = false job_template { metadata {} spec { diff --git a/stacks/crowdsec/modules/crowdsec/main.tf b/stacks/crowdsec/modules/crowdsec/main.tf index cf59ea47..ca7b1998 100644 --- a/stacks/crowdsec/modules/crowdsec/main.tf +++ b/stacks/crowdsec/modules/crowdsec/main.tf @@ -96,6 +96,21 @@ resource "kubernetes_config_map" "crowdsec_whitelist" { reason: "Trusted IP - never block" ip: - "176.12.22.76" + --- + name: viktor/immich-asset-paths-whitelist + description: "Don't penalise legit Immich timeline bursts (mobile scrub, web grid)" + whitelist: + reason: "Immich asset endpoints are auth-gated; mobile scrub legitimately bursts" + expression: + - > + evt.Parsed.target_fqdn == "immich.viktorbarzin.me" && + (evt.Parsed.request startsWith "/api/assets/" || + evt.Parsed.request startsWith "/api/timeline/" || + evt.Parsed.request startsWith "/api/asset/" || + evt.Parsed.request startsWith "/api/search/" || + evt.Parsed.request startsWith "/api/memories" || + evt.Parsed.request startsWith "/api/albums" || + evt.Parsed.request startsWith "/api/activities") YAML } } diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf index 8389aa93..1ae6f415 100644 --- a/stacks/dbaas/modules/dbaas/main.tf +++ b/stacks/dbaas/modules/dbaas/main.tf @@ -157,9 +157,9 @@ resource "kubernetes_stateful_set_v1" "mysql_standalone" { required_during_scheduling_ignored_during_execution { node_selector_term { match_expressions { - key = "kubernetes.io/hostname" + key = "nvidia.com/gpu.present" operator = "NotIn" - values = ["k8s-node1"] + values = ["true"] } } } @@ -1209,6 +1209,61 @@ resource "null_resource" "pg_job_hunter_db" { } } +# Create wealthfolio_sync database for the SQLite→PG ETL sidecar that mirrors +# Wealthfolio's daily_account_valuation/accounts/activities into PG so Grafana +# can chart net worth, contributions, and growth. +# Role password is managed by Vault Database Secrets Engine (static role `pg-wealthfolio-sync`, 7d rotation). 
+resource "null_resource" "pg_wealthfolio_sync_db" { + depends_on = [null_resource.pg_cluster] + + triggers = { + db_name = "wealthfolio_sync" + username = "wealthfolio_sync" + } + + provisioner "local-exec" { + command = <<-EOT + PRIMARY=$(kubectl --kubeconfig ${var.kube_config_path} get cluster -n dbaas pg-cluster -o jsonpath='{.status.currentPrimary}') + kubectl --kubeconfig ${var.kube_config_path} exec -n dbaas $PRIMARY -c postgres -- \ + bash -c ' + psql -U postgres -tc "SELECT 1 FROM pg_catalog.pg_roles WHERE rolname = '"'"'wealthfolio_sync'"'"'" | grep -q 1 || \ + psql -U postgres -c "CREATE ROLE wealthfolio_sync WITH LOGIN PASSWORD '"'"'changeme-vault-will-rotate'"'"'" + psql -U postgres -tc "SELECT 1 FROM pg_catalog.pg_database WHERE datname = '"'"'wealthfolio_sync'"'"'" | grep -q 1 || \ + psql -U postgres -c "CREATE DATABASE wealthfolio_sync OWNER wealthfolio_sync" + psql -U postgres -c "GRANT ALL PRIVILEGES ON DATABASE wealthfolio_sync TO wealthfolio_sync" + ' + EOT + } +} + +# Create fire_planner database for the FIRE retirement-planning service. +# Role password is managed by Vault Database Secrets Engine +# (static role `pg-fire-planner`, 7d rotation). +# fire_planner reads from payslip_ingest + wealthfolio_sync (read-only) +# and writes its own MC results into schema fire_planner. +resource "null_resource" "pg_fire_planner_db" { + depends_on = [null_resource.pg_cluster] + + triggers = { + db_name = "fire_planner" + username = "fire_planner" + } + + provisioner "local-exec" { + command = <<-EOT + PRIMARY=$(kubectl --kubeconfig ${var.kube_config_path} get cluster -n dbaas pg-cluster -o jsonpath='{.status.currentPrimary}') + kubectl --kubeconfig ${var.kube_config_path} exec -n dbaas $PRIMARY -c postgres -- \ + bash -c ' + psql -U postgres -tc "SELECT 1 FROM pg_catalog.pg_roles WHERE rolname = '"'"'fire_planner'"'"'" | grep -q 1 || \ + psql -U postgres -c "CREATE ROLE fire_planner WITH LOGIN PASSWORD '"'"'changeme-vault-will-rotate'"'"'" + psql -U postgres -tc "SELECT 1 FROM pg_catalog.pg_database WHERE datname = '"'"'fire_planner'"'"'" | grep -q 1 || \ + psql -U postgres -c "CREATE DATABASE fire_planner OWNER fire_planner" + psql -U postgres -c "GRANT ALL PRIVILEGES ON DATABASE fire_planner TO fire_planner" + ' + EOT + } +} + # Old PostgreSQL deployment — kept commented for rollback reference # resource "kubernetes_deployment" "postgres" { # metadata { diff --git a/stacks/ebook2audiobook/main.tf b/stacks/ebook2audiobook/main.tf index f9871882..8492991f 100644 --- a/stacks/ebook2audiobook/main.tf +++ b/stacks/ebook2audiobook/main.tf @@ -72,7 +72,7 @@ resource "kubernetes_deployment" "ebook2audiobook" { spec { node_selector = { - "gpu" : "true" + "nvidia.com/gpu.present" : "true" } toleration { key = "nvidia.com/gpu" @@ -290,7 +290,7 @@ resource "kubernetes_deployment" "audiblez" { } spec { node_selector = { - "gpu" : "true" + "nvidia.com/gpu.present" : "true" } toleration { key = "nvidia.com/gpu" @@ -356,7 +356,7 @@ resource "kubernetes_deployment" "audiblez-web" { } spec { node_selector = { - "gpu" : "true" + "nvidia.com/gpu.present" : "true" } toleration { key = "nvidia.com/gpu" diff --git a/stacks/fire-planner/main.tf b/stacks/fire-planner/main.tf new file mode 100644 index 00000000..09e1177b --- /dev/null +++ b/stacks/fire-planner/main.tf @@ -0,0 +1,383 @@ +variable "image_tag" { + type = string + default = "latest" + description = "fire-planner image tag. Use 8-char git SHA in CI; :latest only for local trials." 
+} + +variable "postgresql_host" { type = string } + +locals { + namespace = "fire-planner" + image = "registry.viktorbarzin.me/fire-planner:${var.image_tag}" + labels = { + app = "fire-planner" + } +} + +resource "kubernetes_namespace" "fire_planner" { + metadata { + name = local.namespace + labels = { + tier = local.tiers.aux + "istio-injection" = "disabled" + } + } + lifecycle { + # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps + # this label on every namespace. + ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] + } +} + +# App secrets — the recompute-API bearer token (manual seed in Vault). +# Seed before applying: +# secret/fire-planner -> property `recompute_bearer_token` +resource "kubernetes_manifest" "external_secret" { + manifest = { + apiVersion = "external-secrets.io/v1beta1" + kind = "ExternalSecret" + metadata = { + name = "fire-planner-secrets" + namespace = local.namespace + } + spec = { + refreshInterval = "15m" + secretStoreRef = { + name = "vault-kv" + kind = "ClusterSecretStore" + } + target = { + name = "fire-planner-secrets" + template = { + metadata = { + annotations = { + "reloader.stakater.com/match" = "true" + } + } + } + } + data = [ + { + secretKey = "RECOMPUTE_BEARER_TOKEN" + remoteRef = { + key = "fire-planner" + property = "recompute_bearer_token" + } + }, + ] + } + } + depends_on = [kubernetes_namespace.fire_planner] +} + +# DB credentials from Vault database engine (rotated every 7 days). +# Template builds the asyncpg DSN consumed by the FastAPI app + CronJob +# as DB_CONNECTION_STRING. +resource "kubernetes_manifest" "db_external_secret" { + manifest = { + apiVersion = "external-secrets.io/v1beta1" + kind = "ExternalSecret" + metadata = { + name = "fire-planner-db-creds" + namespace = local.namespace + } + spec = { + refreshInterval = "15m" + secretStoreRef = { + name = "vault-database" + kind = "ClusterSecretStore" + } + target = { + name = "fire-planner-db-creds" + template = { + metadata = { + annotations = { + "reloader.stakater.com/match" = "true" + } + } + data = { + DB_CONNECTION_STRING = "postgresql+asyncpg://fire_planner:{{ .password }}@${var.postgresql_host}:5432/fire_planner" + DB_PASSWORD = "{{ .password }}" + } + } + } + data = [{ + secretKey = "password" + remoteRef = { + key = "static-creds/pg-fire-planner" + property = "password" + } + }] + } + } + depends_on = [kubernetes_namespace.fire_planner] +} + +resource "kubernetes_deployment" "fire_planner" { + metadata { + name = "fire-planner" + namespace = kubernetes_namespace.fire_planner.metadata[0].name + labels = merge(local.labels, { + tier = local.tiers.aux + }) + annotations = { + "reloader.stakater.com/search" = "true" + } + } + + spec { + replicas = 1 + strategy { + type = "Recreate" + } + + selector { + match_labels = local.labels + } + + template { + metadata { + labels = local.labels + annotations = { + "dependency.kyverno.io/wait-for" = "postgresql.dbaas:5432" + } + } + + spec { + image_pull_secrets { + name = "registry-credentials" + } + + init_container { + name = "alembic-migrate" + image = local.image + command = ["python", "-m", "fire_planner", "migrate"] + + env_from { + secret_ref { + name = "fire-planner-db-creds" + } + } + + resources { + requests = { + cpu = "50m" + memory = "256Mi" + } + limits = { + memory = "512Mi" + } + } + } + + container { + name = "fire-planner" + image = local.image + + command = ["python", "-m", "fire_planner", "serve"] + + port { + container_port = 8080 + } + + env_from { + secret_ref { + 
name = "fire-planner-secrets" + } + } + env_from { + secret_ref { + name = "fire-planner-db-creds" + } + } + + readiness_probe { + http_get { + path = "/healthz" + port = 8080 + } + initial_delay_seconds = 5 + period_seconds = 10 + } + + liveness_probe { + http_get { + path = "/healthz" + port = 8080 + } + initial_delay_seconds = 5 + period_seconds = 10 + } + + resources { + requests = { + cpu = "100m" + memory = "512Mi" + } + limits = { + memory = "1024Mi" + } + } + } + } + } + } + + lifecycle { + ignore_changes = [spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1 + } + + depends_on = [ + kubernetes_manifest.external_secret, + kubernetes_manifest.db_external_secret, + ] +} + +# ClusterIP-only — /recompute is cluster-internal (operator triggers +# via kubectl port-forward or ad-hoc CronJob). +resource "kubernetes_service" "fire_planner" { + metadata { + name = "fire-planner" + namespace = kubernetes_namespace.fire_planner.metadata[0].name + labels = local.labels + } + + spec { + type = "ClusterIP" + selector = local.labels + + port { + name = "http" + port = 8080 + target_port = 8080 + } + } +} + +# Monthly recompute on the 2nd at 09:00 UTC. Wealthfolio-sync runs on +# the 1st at 08:00, so account_snapshot is fresh by the time the +# planner picks up. +resource "kubernetes_cron_job_v1" "fire_planner_recompute" { + metadata { + name = "fire-planner-recompute" + namespace = kubernetes_namespace.fire_planner.metadata[0].name + } + spec { + schedule = "0 9 2 * *" + concurrency_policy = "Forbid" + successful_jobs_history_limit = 3 + failed_jobs_history_limit = 5 + starting_deadline_seconds = 600 + + job_template { + metadata { + labels = local.labels + } + spec { + backoff_limit = 1 + ttl_seconds_after_finished = 86400 + template { + metadata { + labels = local.labels + } + spec { + restart_policy = "OnFailure" + image_pull_secrets { + name = "registry-credentials" + } + container { + name = "recompute" + image = local.image + command = ["python", "-m", "fire_planner", "recompute-all"] + + env_from { + secret_ref { + name = "fire-planner-secrets" + } + } + env_from { + secret_ref { + name = "fire-planner-db-creds" + } + } + + resources { + requests = { + cpu = "200m" + memory = "1Gi" + } + limits = { + memory = "2Gi" + } + } + } + } + } + } + } + } + + lifecycle { + # KYVERNO_LIFECYCLE_V1 + ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] + } + + depends_on = [ + kubernetes_manifest.external_secret, + kubernetes_manifest.db_external_secret, + ] +} + +# Plan-time read of the ESO-created K8s Secret for Grafana datasource +# password. First-apply gotcha: must +# `terragrunt apply -target=kubernetes_manifest.db_external_secret` so +# the Secret exists before this data source plans. +data "kubernetes_secret" "fire_planner_db_creds" { + metadata { + name = "fire-planner-db-creds" + namespace = kubernetes_namespace.fire_planner.metadata[0].name + } + depends_on = [kubernetes_manifest.db_external_secret] +} + +# Grafana datasource for fire_planner PostgreSQL DB. +# Lives in the monitoring namespace so the grafana sidecar +# (label grafana_datasource=1) picks it up. +# +# Grafana 11.2+ Postgres plugin reads the DB name from jsonData.database; +# the top-level `database` field is silently ignored by the frontend and +# triggers "you do not have default database" on every panel. +# See github.com/grafana/grafana#112418 — same fix as the payslip-ingest +# datasource (commit cc56ba29). 
+resource "kubernetes_config_map" "grafana_fire_planner_datasource" { + metadata { + name = "grafana-fire-planner-datasource" + namespace = "monitoring" + labels = { + grafana_datasource = "1" + } + } + data = { + "fire-planner-datasource.yaml" = yamlencode({ + apiVersion = 1 + datasources = [{ + name = "FirePlanner" + type = "postgres" + access = "proxy" + url = "${var.postgresql_host}:5432" + user = "fire_planner" + uid = "fire-planner-pg" + jsonData = { + database = "fire_planner" + sslmode = "disable" + postgresVersion = 1600 + timescaledb = false + } + secureJsonData = { + password = data.kubernetes_secret.fire_planner_db_creds.data["DB_PASSWORD"] + } + editable = true + }] + }) + } +} diff --git a/stacks/fire-planner/terragrunt.hcl b/stacks/fire-planner/terragrunt.hcl new file mode 100644 index 00000000..c1d2e468 --- /dev/null +++ b/stacks/fire-planner/terragrunt.hcl @@ -0,0 +1,28 @@ +include "root" { + path = find_in_parent_folders() +} + +dependency "platform" { + config_path = "../platform" + skip_outputs = true +} + +dependency "vault" { + config_path = "../vault" + skip_outputs = true +} + +dependency "external-secrets" { + config_path = "../external-secrets" + skip_outputs = true +} + +dependency "dbaas" { + config_path = "../dbaas" + skip_outputs = true +} + +inputs = { + # fire-planner repo HEAD — bump on every deploy. + image_tag = "latest" +} diff --git a/stacks/frigate/main.tf b/stacks/frigate/main.tf index 31079be9..489daa63 100644 --- a/stacks/frigate/main.tf +++ b/stacks/frigate/main.tf @@ -87,7 +87,7 @@ resource "kubernetes_deployment" "frigate" { } spec { node_selector = { - "gpu" : true + "nvidia.com/gpu.present" : "true" } toleration { key = "nvidia.com/gpu" diff --git a/stacks/hermes-agent/main.tf b/stacks/hermes-agent/main.tf index 89de6d6b..0881932f 100644 --- a/stacks/hermes-agent/main.tf +++ b/stacks/hermes-agent/main.tf @@ -220,7 +220,8 @@ resource "kubernetes_deployment" "hermes_agent" { strategy { type = "Recreate" } - replicas = 1 + # Disabled 2026-04-22 — main container fails with "mkdir: cannot create directory '/opt/data': Permission denied" (fsGroup/runAsUser mismatch vs init container). Re-enable after fixing PVC permissions. + replicas = 0 selector { match_labels = { app = "hermes-agent" diff --git a/stacks/immich/main.tf b/stacks/immich/main.tf index b17e7d55..3ee56d1f 100644 --- a/stacks/immich/main.tf +++ b/stacks/immich/main.tf @@ -85,6 +85,30 @@ module "nfs_postgresql_host" { nfs_path = "/srv/nfs/immich/postgresql" } +# Migrated 2026-04-25: PG live data moved off NFS to LUKS-encrypted block. +# WAL fsync per commit on NFS contributed to the 2026-04-22 NFS writeback storm +# (see post-mortems/2026-04-22-vault-raft-leader-deadlock.md). +# Backup CronJob still writes to module.nfs_postgresql_host (NFS append-only). 
+resource "kubernetes_persistent_volume_claim" "immich_postgresql_encrypted" { + wait_until_bound = false + metadata { + name = "immich-postgresql-data-encrypted" + namespace = kubernetes_namespace.immich.metadata[0].name + annotations = { + "resize.topolvm.io/threshold" = "80%" + "resize.topolvm.io/increase" = "100%" + "resize.topolvm.io/storage_limit" = "20Gi" + } + } + spec { + access_modes = ["ReadWriteOnce"] + storage_class_name = "proxmox-lvm-encrypted" + resources { + requests = { storage = "10Gi" } + } + } +} + module "nfs_ml_cache_host" { source = "../../modules/kubernetes/nfs_volume" name = "immich-ml-cache-host" @@ -164,7 +188,7 @@ resource "kubernetes_deployment" "immich_server" { } strategy { - type = "RollingUpdate" + type = "Recreate" } template { @@ -287,10 +311,10 @@ resource "kubernetes_deployment" "immich_server" { resources { requests = { cpu = "100m" - memory = "2000Mi" + memory = "4096Mi" } limits = { - memory = "3500Mi" + memory = "4096Mi" } } } @@ -462,6 +486,13 @@ resource "kubernetes_deployment" "immich-postgres" { name = "write-pg-override-conf" image = "busybox:1.36" command = ["sh", "-c", <<-EOT + # Skip write on uninitialised PGDATA — initdb refuses non-empty dirs. + # On first boot the override is absent; trigger a pod restart after + # initdb completes so the override is applied before extension load. + if [ ! -f /data/PG_VERSION ]; then + echo "PGDATA uninitialised, skipping override conf (will write on next pod start)" + exit 0 + fi cat > /data/postgresql.override.conf <<'PGCONF' # Immich vector search performance tuning shared_buffers = 2048MB @@ -481,7 +512,7 @@ resource "kubernetes_deployment" "immich-postgres" { volume { name = "postgresql-persistent-storage" persistent_volume_claim { - claim_name = module.nfs_postgresql_host.claim_name + claim_name = kubernetes_persistent_volume_claim.immich_postgresql_encrypted.metadata[0].name } } } @@ -548,7 +579,7 @@ resource "kubernetes_deployment" "immich-machine-learning" { } } strategy { - type = "RollingUpdate" + type = "Recreate" } template { metadata { @@ -559,7 +590,7 @@ resource "kubernetes_deployment" "immich-machine-learning" { spec { priority_class_name = "gpu-workload" node_selector = { - "gpu" : "true" + "nvidia.com/gpu.present" : "true" } toleration { key = "nvidia.com/gpu" diff --git a/stacks/job-hunter/terragrunt.hcl b/stacks/job-hunter/terragrunt.hcl index 93df44f1..8f4a32fb 100644 --- a/stacks/job-hunter/terragrunt.hcl +++ b/stacks/job-hunter/terragrunt.hcl @@ -18,8 +18,9 @@ dependency "external-secrets" { } inputs = { - # 8-char SHA from the Forgejo commit viktor/job-hunter@9c42eac9 - # (first image built locally + pushed 2026-04-19 due to a Woodpecker - # v3.13 Forgejo webhook bug; bump on every deploy once CI recovers). - image_tag = "48f8615d" + # 92afc38d = master HEAD with levels.fyi scraper + comp_table COALESCE + # fix + Frankfurter FX backend (exchangerate.host free tier deprecated + # in 2026). Built + pushed locally 2026-04-19 while the Woodpecker + # Forgejo webhook remains broken. 
+ image_tag = "92afc38d" } diff --git a/stacks/k8s-portal/modules/k8s-portal/files/src/routes/agent/+server.ts b/stacks/k8s-portal/modules/k8s-portal/files/src/routes/agent/+server.ts index f96f4d56..21405a94 100644 --- a/stacks/k8s-portal/modules/k8s-portal/files/src/routes/agent/+server.ts +++ b/stacks/k8s-portal/modules/k8s-portal/files/src/routes/agent/+server.ts @@ -138,7 +138,7 @@ Kyverno auto-generates LimitRange + ResourceQuota per namespace based on tier la - **Proxmox**: 192.168.1.127 (Dell R730, 22c/44t, 142GB RAM) - **Nodes**: k8s-master (10.0.20.100), node1 (GPU, Tesla T4), node2-4 -- **GPU workloads**: \`node_selector = { "gpu": "true" }\` + toleration \`nvidia.com/gpu\` +- **GPU workloads**: \`node_selector = { "nvidia.com/gpu.present" : "true" }\` + toleration \`nvidia.com/gpu\` (label auto-applied by gpu-feature-discovery, no hostname pins) - **Pull-through cache**: 10.0.20.10 — use versioned image tags (cache serves stale :latest manifests) - **MySQL InnoDB Cluster**: 3 instances on iSCSI - **SMTP**: \`var.mail_host\` port 587 STARTTLS diff --git a/stacks/monitoring/main.tf b/stacks/monitoring/main.tf index c4961fdd..0c207aa0 100644 --- a/stacks/monitoring/main.tf +++ b/stacks/monitoring/main.tf @@ -30,6 +30,7 @@ module "monitoring" { haos_api_token = data.vault_kv_secret_v2.secrets.data["haos_api_token"] pve_password = data.vault_kv_secret_v2.secrets.data["pve_password"] grafana_admin_password = data.vault_kv_secret_v2.secrets.data["grafana_admin_password"] + kube_config_path = var.kube_config_path registry_user = data.vault_kv_secret_v2.viktor.data["registry_user"] registry_password = data.vault_kv_secret_v2.viktor.data["registry_password"] tier = local.tiers.cluster diff --git a/stacks/monitoring/modules/monitoring/dashboards/fire-planner.json b/stacks/monitoring/modules/monitoring/dashboards/fire-planner.json new file mode 100644 index 00000000..9dba9e11 --- /dev/null +++ b/stacks/monitoring/modules/monitoring/dashboards/fire-planner.json @@ -0,0 +1,226 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": {"type": "datasource", "uid": "grafana"}, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "FIRE Retirement Planner — risk-adjusted, tax-minimised Monte Carlo over jurisdictions, withdrawal strategies, and UK-departure years. 
Backed by fire_planner schema on pg-cluster-rw.", + "editable": true, + "fiscalYearStartMonth": 0, + "id": null, + "templating": { + "list": [ + { + "name": "scenario", + "type": "query", + "label": "Scenario", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "query": "SELECT external_id FROM fire_planner.scenario ORDER BY external_id", + "refresh": 1, + "includeAll": false, + "multi": false, + "current": {"selected": false, "text": "cyprus-vpw-leave-y3-glide-rising", "value": "cyprus-vpw-leave-y3-glide-rising"} + } + ] + }, + "links": [], + "panels": [ + { + "id": 1, + "title": "Net worth over time (real + nominal)", + "type": "timeseries", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 0}, + "fieldConfig": { + "defaults": {"unit": "currencyGBP", "decimals": 0}, + "overrides": [] + }, + "options": {"legend": {"displayMode": "table", "showLegend": true}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "time_series", + "rawSql": "SELECT snapshot_date AS time, account_name AS metric, SUM(market_value_gbp) AS value FROM fire_planner.account_snapshot WHERE snapshot_date >= NOW() - INTERVAL '10 years' GROUP BY snapshot_date, account_name ORDER BY snapshot_date" + } + ] + }, + { + "id": 2, + "title": "Monte Carlo fan chart — selected scenario", + "type": "timeseries", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 8}, + "description": "P10/p25/p50/p75/p90 portfolio value across MC paths, for the scenario picked in the selector at the top.", + "fieldConfig": {"defaults": {"unit": "currencyGBP", "decimals": 0}, "overrides": []}, + "options": {"legend": {"displayMode": "table", "showLegend": true}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "time_series", + "rawSql": "SELECT (DATE_TRUNC('year', NOW()) + (year_idx || ' years')::interval) AS time, 'p10' AS metric, p10_portfolio_gbp AS value FROM fire_planner.projection_yearly p JOIN fire_planner.mc_run r ON r.id = p.mc_run_id JOIN fire_planner.scenario s ON s.id = r.scenario_id WHERE s.external_id = '$scenario' UNION ALL SELECT (DATE_TRUNC('year', NOW()) + (year_idx || ' years')::interval), 'p25', p25_portfolio_gbp FROM fire_planner.projection_yearly p JOIN fire_planner.mc_run r ON r.id = p.mc_run_id JOIN fire_planner.scenario s ON s.id = r.scenario_id WHERE s.external_id = '$scenario' UNION ALL SELECT (DATE_TRUNC('year', NOW()) + (year_idx || ' years')::interval), 'p50', p50_portfolio_gbp FROM fire_planner.projection_yearly p JOIN fire_planner.mc_run r ON r.id = p.mc_run_id JOIN fire_planner.scenario s ON s.id = r.scenario_id WHERE s.external_id = '$scenario' UNION ALL SELECT (DATE_TRUNC('year', NOW()) + (year_idx || ' years')::interval), 'p75', p75_portfolio_gbp FROM fire_planner.projection_yearly p JOIN fire_planner.mc_run r ON r.id = p.mc_run_id JOIN fire_planner.scenario s ON s.id = r.scenario_id WHERE s.external_id = '$scenario' UNION ALL SELECT (DATE_TRUNC('year', NOW()) + (year_idx || ' years')::interval), 'p90', p90_portfolio_gbp FROM fire_planner.projection_yearly p JOIN fire_planner.mc_run r ON r.id = p.mc_run_id JOIN 
fire_planner.scenario s ON s.id = r.scenario_id WHERE s.external_id = '$scenario' ORDER BY time" + } + ] + }, + { + "id": 3, + "title": "Confidence heatmap — jurisdiction × strategy", + "type": "table", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 18}, + "description": "Median success rate by (jurisdiction, strategy), averaged across leave-UK years and glide paths.", + "fieldConfig": { + "defaults": {"custom": {"align": "left", "displayMode": "auto"}, "unit": "percentunit", "decimals": 2}, + "overrides": [] + }, + "options": {"showHeader": true}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "SELECT jurisdiction, strategy, AVG(success_rate) AS avg_success FROM fire_planner.scenario_summary GROUP BY jurisdiction, strategy ORDER BY jurisdiction, strategy" + } + ] + }, + { + "id": 4, + "title": "Median lifetime tax — by jurisdiction", + "type": "barchart", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 18}, + "fieldConfig": {"defaults": {"unit": "currencyGBP", "decimals": 0}, "overrides": []}, + "options": {"orientation": "horizontal", "showValue": "auto", "stacking": "none", "legend": {"displayMode": "list"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "SELECT jurisdiction, AVG(median_lifetime_tax_gbp) AS lifetime_tax FROM fire_planner.scenario_summary GROUP BY jurisdiction ORDER BY lifetime_tax DESC" + } + ] + }, + { + "id": 5, + "title": "Withdrawal runway — years to ruin (failing paths)", + "type": "table", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 26}, + "description": "Among scenarios where some MC paths failed, the median year-to-ruin. 
Empty where every path survives.", + "fieldConfig": {"defaults": {"unit": "y", "decimals": 1}, "overrides": []}, + "options": {"showHeader": true}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "SELECT jurisdiction, strategy, leave_uk_year, glide_path, median_years_to_ruin FROM fire_planner.scenario_summary WHERE median_years_to_ruin IS NOT NULL ORDER BY median_years_to_ruin ASC LIMIT 20" + } + ] + }, + { + "id": 6, + "title": "Optimal leave-UK year", + "type": "stat", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "gridPos": {"h": 4, "w": 6, "x": 12, "y": 26}, + "description": "leave_uk_year that maximises success_rate − lifetime_tax (tax in £M; small weighting).", + "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []}, + "options": {"colorMode": "value", "reduceOptions": {"calcs": ["lastNotNull"]}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "SELECT leave_uk_year FROM fire_planner.scenario_summary WHERE jurisdiction <> 'uk' ORDER BY (success_rate - median_lifetime_tax_gbp / 1000000.0) DESC LIMIT 1" + } + ] + }, + { + "id": 7, + "title": "Median ending wealth — selected scenario", + "type": "stat", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "gridPos": {"h": 4, "w": 6, "x": 18, "y": 26}, + "fieldConfig": {"defaults": {"unit": "currencyGBP", "decimals": 0}, "overrides": []}, + "options": {"colorMode": "value", "reduceOptions": {"calcs": ["lastNotNull"]}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "SELECT p50_ending_gbp FROM fire_planner.scenario_summary WHERE scenario_id = (SELECT id FROM fire_planner.scenario WHERE external_id = '$scenario')" + } + ] + }, + { + "id": 8, + "title": "Success rate vs spend (UK-stay)", + "type": "barchart", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 30}, + "description": "Sanity gauge — UK success rate by strategy, helps anchor expectations against published cFIREsim numbers.", + "fieldConfig": {"defaults": {"unit": "percentunit", "decimals": 2}, "overrides": []}, + "options": {"orientation": "horizontal", "showValue": "auto", "legend": {"displayMode": "list"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "SELECT strategy, AVG(success_rate) AS success FROM fire_planner.scenario_summary WHERE jurisdiction = 'uk' GROUP BY strategy ORDER BY success DESC" + } + ] + }, + { + "id": 9, + "title": "Sequence-of-returns sensitivity (top failing scenarios)", + "type": "table", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 30}, + "description": "Pearson correlation between year-1 portfolio drawdown and overall success — strongly negative ⇒ scenario is sequence-of-returns sensitive (case for the rising-equity glide).", + "fieldConfig": {"defaults": {"unit": "none", "decimals": 4}, "overrides": []}, + "options": 
{"showHeader": true}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "fire-planner-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "SELECT s.external_id, r.sequence_risk_correlation, r.success_rate FROM fire_planner.mc_run r JOIN fire_planner.scenario s ON s.id = r.scenario_id WHERE r.id IN (SELECT MAX(id) FROM fire_planner.mc_run GROUP BY scenario_id) ORDER BY r.sequence_risk_correlation ASC LIMIT 15" + } + ] + } + ], + "schemaVersion": 39, + "tags": ["finance", "fire", "retirement", "monte-carlo"], + "title": "FIRE Planner", + "uid": "fire-planner", + "version": 1, + "weekStart": "" +} diff --git a/stacks/monitoring/modules/monitoring/dashboards/job-hunter.json b/stacks/monitoring/modules/monitoring/dashboards/job-hunter.json index d38bc40c..526e2fe0 100644 --- a/stacks/monitoring/modules/monitoring/dashboards/job-hunter.json +++ b/stacks/monitoring/modules/monitoring/dashboards/job-hunter.json @@ -197,12 +197,192 @@ ], "title": "Top roles", "type": "table" + }, + { + "datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"}, + "description": "Per-company median base salary broken out by seniority level (comp_points, GBP).", + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "custom": { + "align": "auto", + "cellOptions": {"type": "auto"}, + "filterable": true, + "inspect": false + }, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": []}, + "unit": "currencyGBP" + }, + "overrides": [] + }, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 29}, + "id": 6, + "options": { + "cellHeight": "sm", + "footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false}, + "showHeader": true + }, + "targets": [ + { + "datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"}, + "format": "table", + "rawQuery": true, + "rawSql": "SELECT c.display_name AS company, l.slug AS level, percentile_cont(0.5) WITHIN GROUP (ORDER BY cp.base_gbp) AS p50_base_gbp, COUNT(*) AS n FROM job_hunter.comp_points cp JOIN job_hunter.companies c ON cp.company_id = c.id LEFT JOIN job_hunter.levels l ON cp.level_id = l.id WHERE cp.base_gbp IS NOT NULL AND cp.location_bucket IN (${location:sqlstring}) AND (c.slug = ANY(string_to_array(${company:sqlstring}, ',')) OR ${company:sqlstring} = 'all') GROUP BY c.display_name, l.slug ORDER BY c.display_name, l.rank NULLS LAST", + "refId": "A" + } + ], + "title": "Per-company salary by level (p50 base)", + "type": "table" + }, + { + "datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"}, + "description": "p50 total comp (base + bonus + RSU/year + sign-on/year) per (company, level).", + "fieldConfig": { + "defaults": { + "color": {"mode": "continuous-GrYlRd"}, + "custom": {"align": "center", "cellOptions": {"type": "color-background"}}, + "unit": "currencyGBP" + }, + "overrides": [] + }, + "gridPos": {"h": 10, "w": 12, "x": 0, "y": 39}, + "id": 7, + "options": { + "cellHeight": "sm", + "footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false}, + "showHeader": true + }, + "targets": [ + { + "datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"}, + "format": "table", + "rawQuery": true, + "rawSql": "SELECT c.display_name AS company, l.slug AS level, percentile_cont(0.5) WITHIN GROUP (ORDER BY COALESCE(cp.base_gbp, 0) + COALESCE(cp.bonus_gbp, 0) + COALESCE(cp.rsu_annual_gbp, 0) + COALESCE(cp.signon_gbp, 0)) AS p50_total_gbp FROM 
job_hunter.comp_points cp JOIN job_hunter.companies c ON cp.company_id = c.id LEFT JOIN job_hunter.levels l ON cp.level_id = l.id WHERE cp.base_gbp IS NOT NULL AND cp.location_bucket IN (${location:sqlstring}) GROUP BY c.display_name, l.slug ORDER BY c.display_name", + "refId": "A" + } + ], + "title": "Total comp heatmap (p50, GBP)", + "type": "table" + }, + { + "datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"}, + "description": "Comp-datapoint ingestion volume by source.", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "drawStyle": "bars", + "fillOpacity": 60, + "lineWidth": 1, + "stacking": {"mode": "normal"} + } + }, + "overrides": [] + }, + "gridPos": {"h": 10, "w": 12, "x": 12, "y": 39}, + "id": 8, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, + "tooltip": {"mode": "single", "sort": "none"} + }, + "targets": [ + { + "datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"}, + "format": "time_series", + "rawQuery": true, + "rawSql": "SELECT date_trunc('day', fetched_at) AT TIME ZONE 'UTC' AS time, source, COUNT(*) AS value FROM job_hunter.comp_points WHERE $__timeFilter(fetched_at) GROUP BY 1, 2 ORDER BY 1", + "refId": "A" + } + ], + "title": "Comp-point volume by source", + "type": "timeseries" + }, + { + "datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"}, + "description": "p50 base salary trend by (company, level) for top 5 companies.", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 6, + "showPoints": "auto" + }, + "unit": "currencyGBP" + }, + "overrides": [] + }, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 49}, + "id": 9, + "options": { + "legend": {"displayMode": "table", "placement": "right", "showLegend": true}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"}, + "format": "time_series", + "rawQuery": true, + "rawSql": "WITH ranked AS (SELECT c.slug AS company_slug, COUNT(*) AS n FROM job_hunter.comp_points cp JOIN job_hunter.companies c ON cp.company_id = c.id WHERE cp.base_gbp IS NOT NULL AND cp.location_bucket IN (${location:sqlstring}) GROUP BY c.slug ORDER BY n DESC LIMIT 5) SELECT date_trunc('month', cp.effective_date)::timestamp AS time, c.display_name || ' / ' || COALESCE(l.slug, 'unknown') AS metric, percentile_cont(0.5) WITHIN GROUP (ORDER BY cp.base_gbp) AS value FROM job_hunter.comp_points cp JOIN job_hunter.companies c ON cp.company_id = c.id LEFT JOIN job_hunter.levels l ON cp.level_id = l.id WHERE cp.base_gbp IS NOT NULL AND cp.effective_date IS NOT NULL AND cp.location_bucket IN (${location:sqlstring}) AND c.slug IN (SELECT company_slug FROM ranked) AND (l.slug = ${level:sqlstring} OR ${level:sqlstring} = 'all') GROUP BY 1, 2 ORDER BY 1", + "refId": "A" + } + ], + "title": "Base-salary trend (p50) — top 5 companies", + "type": "timeseries" } ], "refresh": "", "schemaVersion": 39, "tags": ["job-hunter", "jobs", "careers"], - "templating": {"list": []}, + "templating": {"list": [ + { + "current": {"selected": true, "text": ["london"], "value": ["london"]}, + "datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"}, + "definition": "SELECT DISTINCT location_bucket FROM job_hunter.comp_points ORDER BY 1", + "includeAll": false, + "label": 
"Location", + "multi": true, + "name": "location", + "options": [], + "query": "SELECT DISTINCT location_bucket FROM job_hunter.comp_points ORDER BY 1", + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {"selected": true, "text": "senior", "value": "senior"}, + "datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"}, + "definition": "SELECT slug FROM job_hunter.levels WHERE company_id IS NULL ORDER BY rank", + "includeAll": true, + "allValue": "all", + "label": "Level", + "multi": false, + "name": "level", + "options": [], + "query": "SELECT slug FROM job_hunter.levels WHERE company_id IS NULL ORDER BY rank", + "refresh": 1, + "regex": "", + "type": "query" + }, + { + "current": {"selected": true, "text": "all", "value": "all"}, + "datasource": {"type": "grafana-postgresql-datasource", "uid": "job-hunter-pg"}, + "definition": "SELECT slug FROM job_hunter.companies ORDER BY slug", + "includeAll": true, + "allValue": "all", + "label": "Company", + "multi": true, + "name": "company", + "options": [], + "query": "SELECT slug FROM job_hunter.companies ORDER BY slug", + "refresh": 1, + "regex": "", + "type": "query" + } + ]}, "time": {"from": "now-30d", "to": "now"}, "timepicker": {}, "timezone": "browser", diff --git a/stacks/monitoring/modules/monitoring/dashboards/uk-payslip.json b/stacks/monitoring/modules/monitoring/dashboards/uk-payslip.json index 9b0c2644..226a3c43 100644 --- a/stacks/monitoring/modules/monitoring/dashboards/uk-payslip.json +++ b/stacks/monitoring/modules/monitoring/dashboards/uk-payslip.json @@ -179,7 +179,7 @@ { "id": 7, "title": "YTD uses \u2014 where gross went", - "description": "Year-to-date cumulative breakdown of where the gross went. Stacked \u2014 top equals gross_pay minus student loan and RSU offset (both small; shown on Panel 8 Sankey). Green = take-home; red = cash income tax; orange = RSU-attributed income tax + NI; purple = pension.", + "description": "Year-to-date cumulative breakdown of where the gross went. Stacked \u2014 top equals gross_pay minus student loan and RSU offset (both small; shown on Panel 8 Sankey). RSU vest tax broken out at the exact band-aware marginal (PA-taper aware: 60% in the \u00a3100k\u2013\u00a3125,140 zone, 47% additional-rate, etc.) \u2014 see SQL for full bands. 
Green = take-home; red = cash income tax; orange = tax on RSU vest; orange = cash NI; purple = pension.", "type": "timeseries", "datasource": { "type": "grafana-postgresql-datasource", @@ -258,14 +258,14 @@ }, { "id": "displayName", - "value": "Income Tax (cash pay)" + "value": "Income Tax (cash)" } ] }, { "matcher": { "id": "byName", - "options": "ytd_rsu_income_tax" + "options": "ytd_rsu_tax_marginal" }, "properties": [ { @@ -277,14 +277,14 @@ }, { "id": "displayName", - "value": "Income Tax (RSU-attributed)" + "value": "Tax on RSU vest (band-aware marginal)" } ] }, { "matcher": { "id": "byName", - "options": "ytd_ni" + "options": "ytd_cash_ni" }, "properties": [ { @@ -296,7 +296,7 @@ }, { "id": "displayName", - "value": "National Insurance" + "value": "National Insurance (cash)" } ] }, @@ -341,7 +341,7 @@ "type": "grafana-postgresql-datasource", "uid": "payslips-pg" }, - "rawSql": "SELECT pay_date AS \"time\", SUM(net_pay) OVER w AS ytd_net, SUM(COALESCE(cash_income_tax, income_tax)) OVER w AS ytd_cash_income_tax, SUM(income_tax - COALESCE(cash_income_tax, income_tax)) OVER w AS ytd_rsu_income_tax, SUM(national_insurance) OVER w AS ytd_ni, SUM(pension_employee) OVER w AS ytd_pension_employee FROM payslip_ingest.payslip WHERE $__timeFilter(pay_date) WINDOW w AS (PARTITION BY tax_year ORDER BY pay_date) ORDER BY pay_date", + "rawSql": "WITH r AS (SELECT * FROM payslip_ingest.payslip WHERE $__timeFilter(pay_date)), ani AS (SELECT *, COALESCE(SUM(gross_pay - COALESCE(pension_sacrifice, 0)) OVER (PARTITION BY tax_year ORDER BY pay_date ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0) AS ani_prior FROM r), slice AS (SELECT *, ani_prior + gross_pay - COALESCE(rsu_vest, 0) - COALESCE(pension_sacrifice, 0) AS ani_pre, ani_prior + gross_pay - COALESCE(pension_sacrifice, 0) AS ani_post FROM ani), m AS (SELECT *, GREATEST(0, LEAST(ani_post, 12570) - GREATEST(ani_pre, 0)) * 0.00 + GREATEST(0, LEAST(ani_post, 50270) - GREATEST(ani_pre, 12570)) * 0.20 + GREATEST(0, LEAST(ani_post, 100000) - GREATEST(ani_pre, 50270)) * 0.40 + GREATEST(0, LEAST(ani_post, 125140) - GREATEST(ani_pre, 100000)) * 0.60 + GREATEST(0, ani_post - GREATEST(ani_pre, 125140)) * 0.45 AS rsu_paye_marginal, GREATEST(0, LEAST(ani_post, 12570) - GREATEST(ani_pre, 0)) * 0.00 + GREATEST(0, LEAST(ani_post, 50270) - GREATEST(ani_pre, 12570)) * 0.08 + GREATEST(0, ani_post - GREATEST(ani_pre, 50270)) * 0.02 AS rsu_ni_marginal FROM slice) SELECT pay_date AS \"time\", SUM(net_pay) OVER w AS ytd_net, SUM(GREATEST(0, income_tax - rsu_paye_marginal)) OVER w AS ytd_cash_income_tax, SUM(rsu_paye_marginal + rsu_ni_marginal) OVER w AS ytd_rsu_tax_marginal, SUM(GREATEST(0, national_insurance - rsu_ni_marginal)) OVER w AS ytd_cash_ni, SUM(pension_employee) OVER w AS ytd_pension_employee FROM m WINDOW w AS (PARTITION BY tax_year ORDER BY pay_date) ORDER BY pay_date", "format": "time_series", "refId": "A", "rawQuery": true, @@ -352,6 +352,7 @@ { "id": 2, "title": "Monthly cash flow (RSU stripped)", + "description": "Cash-only view: gross pay minus the RSU vest (cash_gross) and the bank-deposited net_pay. 
Tax and NI are not shown here because UK cumulative PAYE genuinely takes a YTD true-up chunk in vest months on top of the marginal RSU PAYE \u2014 see Panel 11 for the full tax breakdown with the band-aware RSU split.", "type": "timeseries", "datasource": { "type": "grafana-postgresql-datasource", @@ -422,7 +423,7 @@ "type": "grafana-postgresql-datasource", "uid": "payslips-pg" }, - "rawSql": "SELECT pay_date AS \"time\", (gross_pay - rsu_vest) AS cash_gross, net_pay, COALESCE(cash_income_tax, income_tax) AS income_tax, national_insurance FROM payslip_ingest.payslip WHERE $__timeFilter(pay_date) ORDER BY pay_date", + "rawSql": "SELECT pay_date AS \"time\", (gross_pay - rsu_vest) AS cash_gross, net_pay FROM payslip_ingest.payslip WHERE $__timeFilter(pay_date) ORDER BY pay_date", "format": "time_series", "refId": "A", "rawQuery": true, @@ -433,7 +434,7 @@ { "id": 3, "title": "Effective rate & take-home % (YTD cumulative)", - "description": "YTD-cumulative rates. PAYE rate uses reported taxable_pay as the base; all-deductions rate uses gross_pay. Computed from cumulative SUM over the tax year, so vest-month RSU tax is blended proportionally with RSU value \u2014 no per-slip attribution hack, no spikes.", + "description": "YTD-cumulative rates \u2014 three angles on take-home. (1) PAYE rate = SUM(income_tax) / SUM(taxable_pay): the audit number HMRC uses, converges to ~marginal in the additional-rate band. (2) Cash take-home % = SUM(net_pay) / SUM(gross_pay - rsu_vest): what fraction of cash earnings becomes a bank deposit; useful for cash-flow planning. (3) Total keep % = (SUM(net_pay) + SUM(rsu_vest - rsu_paye_marginal - rsu_ni_marginal)) / SUM(gross_pay): true 'what I actually keep' including post-tax RSU shares with the exact band-aware marginal (PA-taper aware). 
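For reference, the tax maths these payslip panels lean on: the 60% figure in the £100k–£125,140 zone is the personal-allowance taper — each extra £1 there attracts 40p of tax and removes 50p of allowance, which is itself taxed at 40% for a further 20p, hence 60p in the pound; 47% is the 45% additional rate plus 2% NI. A minimal standalone sketch of the band-slicing the panel SQL performs, using hypothetical numbers (prior adjusted net income £95,000, a £50,000 RSU slice); band edges are copied from the queries, and the zero-rate band is omitted because it contributes nothing at this income:

SELECT
    GREATEST(0, LEAST(145000, 50270)  - GREATEST(95000, 12570))  * 0.20   -- £0      @ 20%
  + GREATEST(0, LEAST(145000, 100000) - GREATEST(95000, 50270))  * 0.40   -- £5,000  @ 40% = £2,000
  + GREATEST(0, LEAST(145000, 125140) - GREATEST(95000, 100000)) * 0.60   -- £25,140 @ 60% = £15,084 (taper zone)
  + GREATEST(0, 145000 - GREATEST(95000, 125140))                * 0.45   -- £19,860 @ 45% = £8,937
    AS rsu_paye_marginal,                                                  -- = £26,021
  GREATEST(0, 145000 - GREATEST(95000, 50270)) * 0.02 AS rsu_ni_marginal;  -- = £1,000 (2% NI above £50,270)

Roughly £27,021 combined on the £50,000 slice — an effective marginal of about 54% — which is why vest months dominate the stacked tax panels.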
Resets on 6-April tax year boundary.", "type": "timeseries", "datasource": { "type": "grafana-postgresql-datasource", @@ -484,7 +485,65 @@ } } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "ytd_paye_rate_pct" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#C4162A" + } + }, + { + "id": "displayName", + "value": "PAYE rate (HMRC, on taxable_pay)" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ytd_cash_take_home_pct" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "green" + } + }, + { + "id": "displayName", + "value": "Cash take-home % (net / cash_gross)" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ytd_total_keep_pct" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "blue" + } + }, + { + "id": "displayName", + "value": "Total keep % (cash + post-tax shares)" + } + ] + } + ] }, "options": { "legend": { @@ -506,7 +565,7 @@ "type": "grafana-postgresql-datasource", "uid": "payslips-pg" }, - "rawSql": "SELECT pay_date AS \"time\", ROUND(((SUM(income_tax) OVER w)::numeric / NULLIF(SUM(COALESCE(taxable_pay, gross_pay)) OVER w, 0)) * 100, 2) AS \"ytd_paye_rate_pct\", ROUND((((SUM(income_tax) OVER w) + (SUM(national_insurance) OVER w) + (SUM(student_loan) OVER w))::numeric / NULLIF(SUM(gross_pay) OVER w, 0)) * 100, 2) AS \"ytd_all_deductions_pct\", ROUND(((SUM(net_pay) OVER w)::numeric / NULLIF(SUM(gross_pay) OVER w, 0)) * 100, 2) AS \"ytd_take_home_pct\" FROM payslip_ingest.payslip WHERE $__timeFilter(pay_date) WINDOW w AS (PARTITION BY tax_year ORDER BY pay_date) ORDER BY pay_date", + "rawSql": "WITH r AS (SELECT * FROM payslip_ingest.payslip WHERE $__timeFilter(pay_date)), ani AS (SELECT *, COALESCE(SUM(gross_pay - COALESCE(pension_sacrifice, 0)) OVER (PARTITION BY tax_year ORDER BY pay_date ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0) AS ani_prior FROM r), slice AS (SELECT *, ani_prior + gross_pay - COALESCE(rsu_vest, 0) - COALESCE(pension_sacrifice, 0) AS ani_pre, ani_prior + gross_pay - COALESCE(pension_sacrifice, 0) AS ani_post FROM ani), m AS (SELECT *, GREATEST(0, LEAST(ani_post, 12570) - GREATEST(ani_pre, 0)) * 0.00 + GREATEST(0, LEAST(ani_post, 50270) - GREATEST(ani_pre, 12570)) * 0.20 + GREATEST(0, LEAST(ani_post, 100000) - GREATEST(ani_pre, 50270)) * 0.40 + GREATEST(0, LEAST(ani_post, 125140) - GREATEST(ani_pre, 100000)) * 0.60 + GREATEST(0, ani_post - GREATEST(ani_pre, 125140)) * 0.45 AS rsu_paye_marginal, GREATEST(0, LEAST(ani_post, 12570) - GREATEST(ani_pre, 0)) * 0.00 + GREATEST(0, LEAST(ani_post, 50270) - GREATEST(ani_pre, 12570)) * 0.08 + GREATEST(0, ani_post - GREATEST(ani_pre, 50270)) * 0.02 AS rsu_ni_marginal FROM slice) SELECT pay_date AS \"time\", ROUND(((SUM(income_tax) OVER w)::numeric / NULLIF(SUM(COALESCE(taxable_pay, gross_pay)) OVER w, 0)) * 100, 2) AS \"ytd_paye_rate_pct\", ROUND(((SUM(net_pay) OVER w)::numeric / NULLIF(SUM(gross_pay - rsu_vest) OVER w, 0)) * 100, 2) AS \"ytd_cash_take_home_pct\", ROUND((((SUM(net_pay) OVER w) + (SUM(rsu_vest - rsu_paye_marginal - rsu_ni_marginal) OVER w))::numeric / NULLIF(SUM(gross_pay) OVER w, 0)) * 100, 2) AS \"ytd_total_keep_pct\" FROM m WINDOW w AS (PARTITION BY tax_year ORDER BY pay_date) ORDER BY pay_date", "format": "time_series", "refId": "A", "rawQuery": true, @@ -517,7 +576,7 @@ { "id": 11, "title": "Tax & pension \u2014 monthly", - "description": "Per-month deductions and pension contributions. 
Stacked \u2014 top equals total tax + pension (both sides). Red = cash income tax; orange = RSU-attributed income tax; amber = NI; brown = student loan; purple = employee pension; light purple = employer pension (paid on top of salary).", + "description": "Per-month RSU vest tax + recurring deductions. Cash-side PAYE/NI hidden because UK cumulative PAYE makes them inherently bumpy in vest months despite the marginal RSU strip \u2014 see Panel 12 (YTD cumulative) for the smoothed totals or Panel 3 for the effective rate. Orange = tax on RSU vest at the exact band-aware marginal (PA-taper aware: 60% in \u00a3100k\u2013\u00a3125,140 zone); brown = student loan; purple = employee pension; light purple = employer pension (paid on top of salary).", "type": "timeseries", "datasource": { "type": "grafana-postgresql-datasource", @@ -565,26 +624,7 @@ { "matcher": { "id": "byName", - "options": "cash_income_tax" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "#C4162A" - } - }, - { - "id": "displayName", - "value": "Income Tax (cash pay)" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "rsu_income_tax" + "options": "rsu_tax_marginal" }, "properties": [ { @@ -596,26 +636,7 @@ }, { "id": "displayName", - "value": "Income Tax (RSU-attributed)" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "ni" - }, - "properties": [ - { - "id": "color", - "value": { - "mode": "fixed", - "fixedColor": "orange" - } - }, - { - "id": "displayName", - "value": "National Insurance" + "value": "Tax on RSU vest (band-aware marginal)" } ] }, @@ -698,7 +719,7 @@ "type": "grafana-postgresql-datasource", "uid": "payslips-pg" }, - "rawSql": "SELECT pay_date AS \"time\", COALESCE(cash_income_tax, income_tax) AS cash_income_tax, income_tax - COALESCE(cash_income_tax, income_tax) AS rsu_income_tax, national_insurance AS ni, student_loan, pension_employee, pension_employer FROM payslip_ingest.payslip WHERE $__timeFilter(pay_date) ORDER BY pay_date", + "rawSql": "WITH r AS (SELECT * FROM payslip_ingest.payslip WHERE $__timeFilter(pay_date)), ani AS (SELECT *, COALESCE(SUM(gross_pay - COALESCE(pension_sacrifice, 0)) OVER (PARTITION BY tax_year ORDER BY pay_date ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0) AS ani_prior FROM r), slice AS (SELECT *, ani_prior + gross_pay - COALESCE(rsu_vest, 0) - COALESCE(pension_sacrifice, 0) AS ani_pre, ani_prior + gross_pay - COALESCE(pension_sacrifice, 0) AS ani_post FROM ani), m AS (SELECT *, GREATEST(0, LEAST(ani_post, 12570) - GREATEST(ani_pre, 0)) * 0.00 + GREATEST(0, LEAST(ani_post, 50270) - GREATEST(ani_pre, 12570)) * 0.20 + GREATEST(0, LEAST(ani_post, 100000) - GREATEST(ani_pre, 50270)) * 0.40 + GREATEST(0, LEAST(ani_post, 125140) - GREATEST(ani_pre, 100000)) * 0.60 + GREATEST(0, ani_post - GREATEST(ani_pre, 125140)) * 0.45 AS rsu_paye_marginal, GREATEST(0, LEAST(ani_post, 12570) - GREATEST(ani_pre, 0)) * 0.00 + GREATEST(0, LEAST(ani_post, 50270) - GREATEST(ani_pre, 12570)) * 0.08 + GREATEST(0, ani_post - GREATEST(ani_pre, 50270)) * 0.02 AS rsu_ni_marginal FROM slice) SELECT pay_date AS \"time\", (rsu_paye_marginal + rsu_ni_marginal) AS rsu_tax_marginal, student_loan, pension_employee, pension_employer FROM m ORDER BY pay_date", "format": "time_series", "refId": "A", "rawQuery": true, @@ -709,7 +730,7 @@ { "id": 12, "title": "Tax & pension \u2014 YTD cumulative", - "description": "Year-to-date cumulative tax and pension. 
Same series and colors as the monthly panel; resets on 6-April tax year boundary.", + "description": "Year-to-date cumulative tax and pension. Same series and colors as the monthly panel \u2014 RSU vest tax broken out at the exact band-aware marginal (PA-taper aware: 60% in \u00a3100k\u2013\u00a3125,140 zone, 47% additional-rate, etc.). Resets on 6-April tax year boundary.", "type": "timeseries", "datasource": { "type": "grafana-postgresql-datasource", @@ -769,14 +790,14 @@ }, { "id": "displayName", - "value": "Income Tax (cash pay)" + "value": "Income Tax (cash)" } ] }, { "matcher": { "id": "byName", - "options": "ytd_rsu_income_tax" + "options": "ytd_rsu_tax_marginal" }, "properties": [ { @@ -788,14 +809,14 @@ }, { "id": "displayName", - "value": "Income Tax (RSU-attributed)" + "value": "Tax on RSU vest (band-aware marginal)" } ] }, { "matcher": { "id": "byName", - "options": "ytd_ni" + "options": "ytd_cash_ni" }, "properties": [ { @@ -807,7 +828,7 @@ }, { "id": "displayName", - "value": "National Insurance" + "value": "National Insurance (cash)" } ] }, @@ -890,7 +911,7 @@ "type": "grafana-postgresql-datasource", "uid": "payslips-pg" }, - "rawSql": "SELECT pay_date AS \"time\", SUM(COALESCE(cash_income_tax, income_tax)) OVER w AS ytd_cash_income_tax, SUM(income_tax - COALESCE(cash_income_tax, income_tax)) OVER w AS ytd_rsu_income_tax, SUM(national_insurance) OVER w AS ytd_ni, SUM(student_loan) OVER w AS ytd_student_loan, SUM(pension_employee) OVER w AS ytd_pension_employee, SUM(pension_employer) OVER w AS ytd_pension_employer FROM payslip_ingest.payslip WHERE $__timeFilter(pay_date) WINDOW w AS (PARTITION BY tax_year ORDER BY pay_date) ORDER BY pay_date", + "rawSql": "WITH r AS (SELECT * FROM payslip_ingest.payslip WHERE $__timeFilter(pay_date)), ani AS (SELECT *, COALESCE(SUM(gross_pay - COALESCE(pension_sacrifice, 0)) OVER (PARTITION BY tax_year ORDER BY pay_date ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0) AS ani_prior FROM r), slice AS (SELECT *, ani_prior + gross_pay - COALESCE(rsu_vest, 0) - COALESCE(pension_sacrifice, 0) AS ani_pre, ani_prior + gross_pay - COALESCE(pension_sacrifice, 0) AS ani_post FROM ani), m AS (SELECT *, GREATEST(0, LEAST(ani_post, 12570) - GREATEST(ani_pre, 0)) * 0.00 + GREATEST(0, LEAST(ani_post, 50270) - GREATEST(ani_pre, 12570)) * 0.20 + GREATEST(0, LEAST(ani_post, 100000) - GREATEST(ani_pre, 50270)) * 0.40 + GREATEST(0, LEAST(ani_post, 125140) - GREATEST(ani_pre, 100000)) * 0.60 + GREATEST(0, ani_post - GREATEST(ani_pre, 125140)) * 0.45 AS rsu_paye_marginal, GREATEST(0, LEAST(ani_post, 12570) - GREATEST(ani_pre, 0)) * 0.00 + GREATEST(0, LEAST(ani_post, 50270) - GREATEST(ani_pre, 12570)) * 0.08 + GREATEST(0, ani_post - GREATEST(ani_pre, 50270)) * 0.02 AS rsu_ni_marginal FROM slice) SELECT pay_date AS \"time\", SUM(GREATEST(0, income_tax - rsu_paye_marginal)) OVER w AS ytd_cash_income_tax, SUM(rsu_paye_marginal + rsu_ni_marginal) OVER w AS ytd_rsu_tax_marginal, SUM(GREATEST(0, national_insurance - rsu_ni_marginal)) OVER w AS ytd_cash_ni, SUM(student_loan) OVER w AS ytd_student_loan, SUM(pension_employee) OVER w AS ytd_pension_employee, SUM(pension_employer) OVER w AS ytd_pension_employer FROM m WINDOW w AS (PARTITION BY tax_year ORDER BY pay_date) ORDER BY pay_date", "format": "time_series", "refId": "A", "rawQuery": true, @@ -911,7 +932,7 @@ "h": 9, "w": 24, "x": 0, - "y": 29 + "y": 39 }, "fieldConfig": { "defaults": { @@ -921,20 +942,20 @@ "unit": "currencyGBP", "custom": { "axisPlacement": "auto", - "drawStyle": "line", - "fillOpacity": 70, + 
"drawStyle": "bars", + "fillOpacity": 100, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "lineWidth": 1, + "lineWidth": 0, "pointSize": 4, "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", + "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", @@ -1028,7 +1049,8 @@ "legend": { "calcs": [ "last", - "max" + "max", + "sum" ], "displayMode": "table", "placement": "bottom" @@ -1064,7 +1086,7 @@ "h": 6, "w": 24, "x": 0, - "y": 38 + "y": 62 }, "fieldConfig": { "defaults": { @@ -1238,7 +1260,7 @@ "h": 14, "w": 24, "x": 0, - "y": 44 + "y": 68 }, "fieldConfig": { "defaults": { @@ -1414,7 +1436,7 @@ "h": 12, "w": 24, "x": 0, - "y": 58 + "y": 82 }, "fieldConfig": { "defaults": { @@ -1724,7 +1746,7 @@ "h": 14, "w": 24, "x": 0, - "y": 70 + "y": 48 }, "options": { "monochrome": false, @@ -1745,7 +1767,7 @@ "rawQuery": true, "editorMode": "code", "format": "table", - "rawSql": "WITH agg AS (SELECT COALESCE(SUM(salary), 0) AS salary, COALESCE(SUM(bonus), 0) AS bonus, COALESCE(SUM(rsu_vest), 0) AS rsu_vest, COALESCE(SUM(GREATEST(gross_pay - salary - bonus - rsu_vest, 0)), 0) AS other_income, COALESCE(SUM(net_pay), 0) AS net_pay, COALESCE(SUM(COALESCE(cash_income_tax, income_tax)), 0) AS cash_income_tax, COALESCE(SUM(income_tax - COALESCE(cash_income_tax, income_tax)), 0) AS rsu_income_tax, COALESCE(SUM(national_insurance), 0) AS ni, COALESCE(SUM(pension_employee), 0) AS pension, COALESCE(SUM(student_loan), 0) AS student_loan, COALESCE(SUM(rsu_offset), 0) AS rsu_offset FROM payslip_ingest.payslip WHERE $__timeFilter(pay_date)) SELECT 'Salary' AS source, 'Gross' AS target, salary AS value FROM agg WHERE salary > 0 UNION ALL SELECT 'Bonus', 'Gross', bonus FROM agg WHERE bonus > 0 UNION ALL SELECT 'RSU', 'Gross', rsu_vest FROM agg WHERE rsu_vest > 0 UNION ALL SELECT 'Other income', 'Gross', other_income FROM agg WHERE other_income > 0 UNION ALL SELECT 'Gross', 'Net pay', net_pay FROM agg WHERE net_pay > 0 UNION ALL SELECT 'Gross', 'Income Tax (cash)', cash_income_tax FROM agg WHERE cash_income_tax > 0 UNION ALL SELECT 'Gross', 'Income Tax (RSU)', rsu_income_tax FROM agg WHERE rsu_income_tax > 0 UNION ALL SELECT 'Gross', 'National Insurance', ni FROM agg WHERE ni > 0 UNION ALL SELECT 'Gross', 'Pension', pension FROM agg WHERE pension > 0 UNION ALL SELECT 'Gross', 'Student Loan', student_loan FROM agg WHERE student_loan > 0 UNION ALL SELECT 'Gross', 'RSU Offset', rsu_offset FROM agg WHERE rsu_offset > 0" + "rawSql": "WITH r AS (SELECT * FROM payslip_ingest.payslip WHERE $__timeFilter(pay_date)), ani AS (SELECT *, COALESCE(SUM(gross_pay - COALESCE(pension_sacrifice, 0)) OVER (PARTITION BY tax_year ORDER BY pay_date ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0) AS ani_prior FROM r), slice AS (SELECT *, ani_prior + gross_pay - COALESCE(rsu_vest, 0) - COALESCE(pension_sacrifice, 0) AS ani_pre, ani_prior + gross_pay - COALESCE(pension_sacrifice, 0) AS ani_post FROM ani), m AS (SELECT *, GREATEST(0, LEAST(ani_post, 12570) - GREATEST(ani_pre, 0)) * 0.00 + GREATEST(0, LEAST(ani_post, 50270) - GREATEST(ani_pre, 12570)) * 0.20 + GREATEST(0, LEAST(ani_post, 100000) - GREATEST(ani_pre, 50270)) * 0.40 + GREATEST(0, LEAST(ani_post, 125140) - GREATEST(ani_pre, 100000)) * 0.60 + GREATEST(0, ani_post - GREATEST(ani_pre, 125140)) * 0.45 AS rsu_paye_marginal, GREATEST(0, LEAST(ani_post, 12570) - GREATEST(ani_pre, 0)) * 0.00 + GREATEST(0, LEAST(ani_post, 50270) - GREATEST(ani_pre, 12570)) * 0.08 + GREATEST(0, ani_post - GREATEST(ani_pre, 50270)) 
* 0.02 AS rsu_ni_marginal FROM slice), agg AS (SELECT COALESCE(SUM(salary), 0) AS salary, COALESCE(SUM(bonus), 0) AS bonus, COALESCE(SUM(rsu_vest), 0) AS rsu_vest, COALESCE(SUM(GREATEST(gross_pay - salary - bonus - rsu_vest, 0)), 0) AS other_income, COALESCE(SUM(net_pay), 0) AS net_pay, COALESCE(SUM(GREATEST(0, income_tax - rsu_paye_marginal)), 0) AS cash_income_tax, COALESCE(SUM(rsu_paye_marginal + rsu_ni_marginal), 0) AS rsu_tax_marginal, COALESCE(SUM(GREATEST(0, national_insurance - rsu_ni_marginal)), 0) AS cash_ni, COALESCE(SUM(pension_employee), 0) AS pension, COALESCE(SUM(student_loan), 0) AS student_loan, COALESCE(SUM(rsu_offset), 0) AS rsu_offset FROM m) SELECT 'Salary' AS source, 'Gross' AS target, salary AS value FROM agg WHERE salary > 0 UNION ALL SELECT 'Bonus', 'Gross', bonus FROM agg WHERE bonus > 0 UNION ALL SELECT 'RSU', 'Gross', rsu_vest FROM agg WHERE rsu_vest > 0 UNION ALL SELECT 'Other income', 'Gross', other_income FROM agg WHERE other_income > 0 UNION ALL SELECT 'Gross', 'Net pay', net_pay FROM agg WHERE net_pay > 0 UNION ALL SELECT 'Gross', 'Income Tax (cash)', cash_income_tax FROM agg WHERE cash_income_tax > 0 UNION ALL SELECT 'Gross', 'Tax on RSU vest', rsu_tax_marginal FROM agg WHERE rsu_tax_marginal > 0 UNION ALL SELECT 'Gross', 'National Insurance (cash)', cash_ni FROM agg WHERE cash_ni > 0 UNION ALL SELECT 'Gross', 'Pension', pension FROM agg WHERE pension > 0 UNION ALL SELECT 'Gross', 'Student Loan', student_loan FROM agg WHERE student_loan > 0 UNION ALL SELECT 'Gross', 'RSU Offset', rsu_offset FROM agg WHERE rsu_offset > 0" } ] }, @@ -1762,7 +1784,7 @@ "h": 10, "w": 24, "x": 0, - "y": 84 + "y": 94 }, "fieldConfig": { "defaults": { @@ -1908,116 +1930,142 @@ ] }, { - "id": 10, - "title": "HMRC Tax Year Reconciliation \u2014 Individual Tax API", - "description": "Latest snapshot from HMRC Individual Tax API v1.1 vs SUM(payslip.income_tax) per tax year. Delta > \u00a310 turns red \u2014 that's parser drift vs HMRC's held figures, the authoritative ground truth. Shown only for years where hmrc-sync has pulled a snapshot.", - "type": "table", + "id": 16, + "title": "Yearly receipt \u2014 gross income per tax year", + "description": "One stacked bar per tax year showing all gross income components: salary (cash, post-pension-sacrifice), pension (salary-sacrifice \u2014 untaxed but real income), bonus, and RSU vest gross. Bar total = pre-sacrifice gross compensation. Aligns with P60: bar \u2212 pension_sacrifice \u2248 ytd_gross reported on the final March payslip / P60. 
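The payslip queries above all build the same ani_prior running total; a minimal toy illustration, assuming nothing beyond inline VALUES (the real queries additionally partition by tax_year and subtract pension_sacrifice):

-- ani_prior = income accrued *before* the current row, which is what positions
-- a vest month's RSU slice within the tax bands.
SELECT pay_date, gross,
       COALESCE(SUM(gross) OVER (ORDER BY pay_date
                 ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0) AS ani_prior
FROM (VALUES (DATE '2025-04-25', 10000),
             (DATE '2025-05-25', 10000),
             (DATE '2025-06-25', 60000)) AS t(pay_date, gross);
-- ani_prior comes out 0, 10000, 20000: the June vest is banded on top of the
-- £20,000 already earned, not from £0.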
Where the parser correctly captured bonus into gross_pay (every year except 2023/24 and 2024/25 \u2014 March payslip parsing bug), the match is exact.", + "type": "barchart", "datasource": { "type": "grafana-postgresql-datasource", "uid": "payslips-pg" }, "gridPos": { "h": 10, - "w": 24, + "w": 12, "x": 0, - "y": 94 + "y": 29 }, "fieldConfig": { "defaults": { + "color": { + "mode": "palette-classic" + }, "unit": "currencyGBP", "custom": { - "align": "right", - "displayMode": "auto" + "axisPlacement": "auto", + "axisLabel": "", + "axisCenteredZero": false, + "fillOpacity": 80, + "gradientMode": "none", + "lineWidth": 1, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "thresholdsStyle": { + "mode": "off" + } } }, "overrides": [ { "matcher": { - "id": "byRegexp", - "options": "^delta_" + "id": "byName", + "options": "salary_cash" }, "properties": [ { - "id": "custom.displayMode", - "value": "color-background" - }, - { - "id": "thresholds", + "id": "color", "value": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "green", - "value": -10 - }, - { - "color": "red", - "value": 10 - }, - { - "color": "red", - "value": -10 - } - ] + "mode": "fixed", + "fixedColor": "green" } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "tax_year" - }, - "properties": [ - { - "id": "unit", - "value": "string" }, { - "id": "custom.align", - "value": "left" + "id": "displayName", + "value": "Salary (cash, post-sacrifice)" } ] }, { "matcher": { "id": "byName", - "options": "employer_paye_ref" + "options": "pension_sacrifice" }, "properties": [ { - "id": "unit", - "value": "string" + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#CE96D8" + } }, { - "id": "custom.align", - "value": "left" + "id": "displayName", + "value": "Pension (salary sacrifice, untaxed)" } ] }, { "matcher": { "id": "byName", - "options": "snapshot_date" + "options": "bonus" }, "properties": [ { - "id": "unit", - "value": "dateTimeAsIso" + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#FADE2A" + } + }, + { + "id": "displayName", + "value": "Bonus (gross)" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "rsu_gross" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed", + "fixedColor": "#3274D9" + } + }, + { + "id": "displayName", + "value": "RSU vest (gross)" } ] } ] }, "options": { - "showHeader": true, - "cellHeight": "sm", - "footer": { - "show": false + "barRadius": 0, + "barWidth": 0.6, + "groupWidth": 0.7, + "orientation": "auto", + "showValue": "auto", + "stacking": "normal", + "xField": "tax_year", + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0, + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" } }, "targets": [ @@ -2030,7 +2078,74 @@ "rawQuery": true, "editorMode": "code", "format": "table", - "rawSql": "WITH latest AS (SELECT DISTINCT ON (tax_year, employer_paye_ref) tax_year, employer_paye_ref, snapshot_date, gross_pay, income_tax, ni_contributions FROM hmrc_sync.tax_year_snapshot ORDER BY tax_year, employer_paye_ref, snapshot_date DESC), summed AS (SELECT tax_year, COALESCE(SUM(gross_pay), 0) AS sum_gross, COALESCE(SUM(income_tax), 0) AS sum_tax, COALESCE(SUM(national_insurance), 0) AS sum_ni FROM payslip_ingest.payslip GROUP BY tax_year) SELECT l.tax_year, l.employer_paye_ref, l.snapshot_date, l.gross_pay AS hmrc_gross, s.sum_gross AS computed_gross, (l.gross_pay - 
s.sum_gross) AS delta_gross, l.income_tax AS hmrc_tax, s.sum_tax AS computed_tax, (l.income_tax - s.sum_tax) AS delta_tax, l.ni_contributions AS hmrc_ni, s.sum_ni AS computed_ni, (l.ni_contributions - s.sum_ni) AS delta_ni FROM latest l LEFT JOIN summed s ON s.tax_year = l.tax_year ORDER BY l.tax_year DESC" + "rawSql": "SELECT tax_year, SUM(salary - COALESCE(pension_sacrifice, 0)) AS salary_cash, SUM(COALESCE(pension_sacrifice, 0)) AS pension_sacrifice, SUM(bonus) AS bonus, SUM(rsu_vest) AS rsu_gross FROM payslip_ingest.payslip GROUP BY tax_year ORDER BY tax_year" + } + ] + }, + { + "id": 17, + "title": "YTD gross salary \u2014 year-over-year comparison", + "description": "Cumulative gross pay built up month by month within each UK tax year (April \u2192 March). One line per tax year. Pay dates are projected onto a sliding 12-month window ending now, so years overlay cleanly without falling outside the dashboard's time range. X-axis shows month-of-tax-year (April first, March last).", + "type": "timeseries", + "timeFrom": "13M", + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "payslips-pg" + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 29 + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "currencyGBP", + "decimals": 0, + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 0, + "pointSize": 5, + "showPoints": "auto", + "spanNulls": true, + "axisPlacement": "auto", + "stacking": { + "group": "A", + "mode": "none" + } + } + }, + "overrides": [] + }, + "options": { + "legend": { + "calcs": [ + "last", + "max" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "payslips-pg" + }, + "rawQuery": true, + "editorMode": "code", + "format": "time_series", + "rawSql": "WITH projected AS (SELECT ((CURRENT_DATE - INTERVAL '12 months')::date + (pay_date - MAKE_DATE(SUBSTRING(tax_year, 1, 4)::int, 4, 6)))::timestamp AS t, tax_year, SUM(gross_pay) OVER (PARTITION BY tax_year ORDER BY pay_date) AS ytd FROM payslip_ingest.payslip) SELECT t AS \"time\", tax_year AS metric, ytd AS ytd_gross FROM projected ORDER BY t, tax_year" } ] } diff --git a/stacks/monitoring/modules/monitoring/dashboards/wealth.json b/stacks/monitoring/modules/monitoring/dashboards/wealth.json new file mode 100644 index 00000000..1b8a8aed --- /dev/null +++ b/stacks/monitoring/modules/monitoring/dashboards/wealth.json @@ -0,0 +1,671 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": {"type": "datasource", "uid": "grafana"}, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Wealth — net worth, contributions, and growth over time. 
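A quick sanity check of the projection arithmetic in the year-over-year panel above — a hedged sketch using a hypothetical 2023/24 pay date; MAKE_DATE(2023, 4, 6) is the 6-April year start, exactly as in the panel query:

SELECT ((CURRENT_DATE - INTERVAL '12 months')::date
        + (DATE '2023-07-25' - MAKE_DATE(2023, 4, 6)))::timestamp AS projected;
-- 2023-07-25 is 110 days after 6 April 2023, so it lands 110 days into the
-- sliding window regardless of which tax year it came from — which is what
-- lets the per-year lines overlay on a shared x-axis.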
Backed by the wealthfolio_sync PG mirror of Wealthfolio's SQLite, refreshed hourly by the pg-sync sidecar.", + "editable": true, + "fiscalYearStartMonth": 0, + "id": null, + "links": [], + "panels": [ + { + "id": 1, + "title": "Net worth (current)", + "type": "stat", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "gridPos": {"h": 4, "w": 5, "x": 0, "y": 0}, + "fieldConfig": { + "defaults": { + "unit": "currencyGBP", + "color": {"mode": "fixed", "fixedColor": "green"}, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "textMode": "auto" + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "SELECT SUM(total_value) AS net_worth FROM daily_account_valuation WHERE valuation_date = (SELECT MAX(valuation_date) FROM daily_account_valuation)" + } + ] + }, + { + "id": 2, + "title": "Net contribution (cumulative)", + "description": "Total deposits minus withdrawals across all accounts.", + "type": "stat", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "gridPos": {"h": 4, "w": 5, "x": 5, "y": 0}, + "fieldConfig": { + "defaults": { + "unit": "currencyGBP", + "color": {"mode": "fixed", "fixedColor": "blue"}, + "decimals": 0 + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "textMode": "auto" + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "SELECT SUM(net_contribution) AS contribution FROM daily_account_valuation WHERE valuation_date = (SELECT MAX(valuation_date) FROM daily_account_valuation)" + } + ] + }, + { + "id": 3, + "title": "Growth (unrealised)", + "description": "Net worth minus net contribution — the gain on everything you've put in.", + "type": "stat", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "gridPos": {"h": 4, "w": 5, "x": 10, "y": 0}, + "fieldConfig": { + "defaults": { + "unit": "currencyGBP", + "color": {"mode": "thresholds"}, + "decimals": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "green", "value": 0} + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "textMode": "auto" + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "SELECT (SUM(total_value) - SUM(net_contribution)) AS growth FROM daily_account_valuation WHERE valuation_date = (SELECT MAX(valuation_date) FROM daily_account_valuation)" + } + ] + }, + { + "id": 4, + "title": "ROI %", + "description": "Growth / net contribution × 100. 
Excludes accounts with zero/negative contribution (Schwab) to avoid distortion.", + "type": "stat", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "gridPos": {"h": 4, "w": 5, "x": 15, "y": 0}, + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": {"mode": "thresholds"}, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "yellow", "value": 0}, + {"color": "green", "value": 5} + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "textMode": "auto" + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "WITH latest AS (SELECT * FROM daily_account_valuation WHERE valuation_date = (SELECT MAX(valuation_date) FROM daily_account_valuation) AND net_contribution > 0) SELECT (SUM(total_value - net_contribution) / NULLIF(SUM(net_contribution), 0) * 100) AS roi_pct FROM latest" + } + ] + }, + { + "id": 5, + "title": "Net worth — total over time", + "description": "Daily total_value summed across all accounts (base GBP).", + "type": "timeseries", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 4}, + "fieldConfig": { + "defaults": { + "color": {"mode": "fixed", "fixedColor": "green"}, + "unit": "currencyGBP", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 20, + "pointSize": 4, + "showPoints": "never", + "spanNulls": true, + "axisPlacement": "auto", + "stacking": {"group": "A", "mode": "none"} + } + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "net_worth"}, + "properties": [{"id": "displayName", "value": "Net worth"}] + } + ] + }, + "options": { + "legend": {"calcs": ["last", "max"], "displayMode": "table", "placement": "bottom"}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "time_series", + "rawSql": "SELECT valuation_date::timestamp AS \"time\", SUM(total_value) AS net_worth FROM daily_account_valuation WHERE $__timeFilter(valuation_date) GROUP BY valuation_date ORDER BY valuation_date" + } + ] + }, + { + "id": 6, + "title": "Net contribution vs market value", + "description": "Net contribution = cumulative deposits − withdrawals. Market value = total_value (cash + investments). 
Gap between the two = unrealised growth.", + "type": "timeseries", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "gridPos": {"h": 10, "w": 12, "x": 0, "y": 14}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "unit": "currencyGBP", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 0, + "pointSize": 4, + "showPoints": "never", + "spanNulls": true, + "axisPlacement": "auto", + "stacking": {"group": "A", "mode": "none"} + } + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "market_value"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "green"}}, + {"id": "displayName", "value": "Market value"} + ] + }, + { + "matcher": {"id": "byName", "options": "net_contribution"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "blue"}}, + {"id": "displayName", "value": "Net contribution"} + ] + } + ] + }, + "options": { + "legend": {"calcs": ["last"], "displayMode": "table", "placement": "bottom"}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "time_series", + "rawSql": "SELECT valuation_date::timestamp AS \"time\", SUM(net_contribution) AS net_contribution, SUM(total_value) AS market_value FROM daily_account_valuation WHERE $__timeFilter(valuation_date) GROUP BY valuation_date ORDER BY valuation_date" + } + ] + }, + { + "id": 7, + "title": "Growth (market value − contribution) over time", + "description": "Unrealised gain across all accounts. Filled area to emphasise the wealth created above the contributed capital.", + "type": "timeseries", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "gridPos": {"h": 10, "w": 12, "x": 12, "y": 14}, + "fieldConfig": { + "defaults": { + "color": {"mode": "fixed", "fixedColor": "#56A64B"}, + "unit": "currencyGBP", + "custom": { + "drawStyle": "line", + "lineWidth": 2, + "fillOpacity": 50, + "gradientMode": "opacity", + "pointSize": 4, + "showPoints": "never", + "spanNulls": true, + "axisPlacement": "auto", + "stacking": {"group": "A", "mode": "none"} + } + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "growth"}, + "properties": [{"id": "displayName", "value": "Growth"}] + } + ] + }, + "options": { + "legend": {"calcs": ["last", "max"], "displayMode": "table", "placement": "bottom"}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "time_series", + "rawSql": "SELECT valuation_date::timestamp AS \"time\", (SUM(total_value) - SUM(net_contribution)) AS growth FROM daily_account_valuation WHERE $__timeFilter(valuation_date) GROUP BY valuation_date ORDER BY valuation_date" + } + ] + }, + { + "id": 8, + "title": "Per-account stacked — total value", + "description": "Stacked area showing each account's contribution to total net worth over time. 
Useful for spotting which account drives the trajectory.", + "type": "timeseries", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "gridPos": {"h": 11, "w": 24, "x": 0, "y": 24}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "unit": "currencyGBP", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 70, + "pointSize": 3, + "showPoints": "never", + "spanNulls": true, + "axisPlacement": "auto", + "stacking": {"group": "A", "mode": "normal"} + } + }, + "overrides": [] + }, + "options": { + "legend": {"calcs": ["last"], "displayMode": "table", "placement": "bottom"}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "time_series", + "rawSql": "SELECT d.valuation_date::timestamp AS \"time\", a.name AS metric, d.total_value AS value FROM daily_account_valuation d JOIN accounts a ON a.id = d.account_id WHERE $__timeFilter(d.valuation_date) ORDER BY d.valuation_date, a.name" + } + ] + }, + { + "id": 9, + "title": "Cash vs invested (stacked)", + "description": "Daily breakdown of uninvested broker cash vs market value of investments. WORKPLACE_PENSION accounts (Fidelity) are reclassified entirely as invested — Wealthfolio dumps pension wrappers into cash_balance because it doesn't track the underlying fund holdings, but they are not actually cash.", + "type": "timeseries", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 35}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "unit": "currencyGBP", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 70, + "pointSize": 3, + "showPoints": "never", + "spanNulls": true, + "axisPlacement": "auto", + "stacking": {"group": "A", "mode": "normal"} + } + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "cash"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "#FADE2A"}}, + {"id": "displayName", "value": "Cash"} + ] + }, + { + "matcher": {"id": "byName", "options": "invested"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "#56A64B"}}, + {"id": "displayName", "value": "Invested"} + ] + } + ] + }, + "options": { + "legend": {"calcs": ["last"], "displayMode": "table", "placement": "bottom"}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "time_series", + "rawSql": "SELECT d.valuation_date::timestamp AS \"time\", SUM(CASE WHEN a.account_type = 'WORKPLACE_PENSION' THEN 0 ELSE d.cash_balance END) AS cash, SUM(CASE WHEN a.account_type = 'WORKPLACE_PENSION' THEN d.cash_balance + d.investment_market_value ELSE d.investment_market_value END) AS invested FROM daily_account_valuation d JOIN accounts a ON a.id = d.account_id WHERE $__timeFilter(d.valuation_date) GROUP BY d.valuation_date ORDER BY d.valuation_date" + } + ] + }, + { + "id": 10, + "title": "Activity log", + "description": "Recent activities (BUY / SELL / DEPOSIT / WITHDRAWAL / DIVIDEND / etc.) across all accounts. 
Limited to 100 most recent.", + "type": "table", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "gridPos": {"h": 14, "w": 24, "x": 0, "y": 77}, + "fieldConfig": { + "defaults": { + "custom": {"align": "auto", "displayMode": "auto"} + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "amount"}, + "properties": [{"id": "unit", "value": "currencyGBP"}] + } + ] + }, + "options": { + "cellHeight": "sm", + "footer": {"show": false} + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "SELECT a.activity_date AS \"date\", acc.name AS \"account\", a.activity_type AS \"type\", a.asset_id AS \"asset\", a.quantity AS \"qty\", a.unit_price AS \"unit_price\", a.amount AS \"amount\", a.currency AS \"ccy\", a.notes AS \"notes\" FROM activities a LEFT JOIN accounts acc ON acc.id = a.account_id WHERE $__timeFilter(a.activity_date) ORDER BY a.activity_date DESC LIMIT 100" + } + ] + }, + { + "id": 11, + "title": "12mo return", + "description": "Modified-Dietz return over the trailing 12 months: market_gain / (nw_12mo_ago + 0.5 × contributions_12mo). Excludes new money in — answers 'how did my investments perform' rather than 'how much did my net worth change'.", + "type": "stat", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "gridPos": {"h": 4, "w": 4, "x": 20, "y": 0}, + "fieldConfig": { + "defaults": { + "unit": "percent", + "color": {"mode": "thresholds"}, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "yellow", "value": 0}, + {"color": "green", "value": 5} + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "textMode": "auto" + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "WITH bounds AS (SELECT (SELECT MAX(valuation_date) FROM daily_account_valuation) AS d_now, (SELECT MIN(valuation_date) FROM daily_account_valuation WHERE valuation_date >= (SELECT MAX(valuation_date) - INTERVAL '12 months' FROM daily_account_valuation)) AS d_ago), agg AS (SELECT (SELECT SUM(total_value) FROM daily_account_valuation WHERE valuation_date = b.d_now) AS nw_now, (SELECT SUM(net_contribution) FROM daily_account_valuation WHERE valuation_date = b.d_now) AS contrib_now, (SELECT SUM(total_value) FROM daily_account_valuation WHERE valuation_date = b.d_ago) AS nw_ago, (SELECT SUM(net_contribution) FROM daily_account_valuation WHERE valuation_date = b.d_ago) AS contrib_ago FROM bounds b) SELECT ROUND((((nw_now - nw_ago - (contrib_now - contrib_ago)) / NULLIF(nw_ago + 0.5 * (contrib_now - contrib_ago), 0)) * 100)::numeric, 2) AS pct_12mo FROM agg" + } + ] + }, + { + "id": 12, + "title": "Yearly investment return %", + "description": "Modified-Dietz return per calendar year: market_gain / (nw_start + 0.5 × contributions). Pure investment performance — excludes new contributions, so a £100k vest doesn't show as 100% growth. 
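A worked example of the Modified-Dietz formula those two return panels use, with hypothetical figures — net worth £400k a year ago, £520k now, £60k of net contributions in between:

SELECT ROUND(((520000 - 400000 - 60000)::numeric   -- market gain: £60,000
            / (400000 + 0.5 * 60000)) * 100, 2)    -- capital base: £430,000
       AS return_pct;                               -- ≈ 13.95%, not the 30% a naive net-worth delta would suggest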
Negative bars = market losses (e.g., 2022 bear market).", + "type": "barchart", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "gridPos": {"h": 11, "w": 24, "x": 0, "y": 45}, + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "unit": "percent", + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "yellow", "value": 0}, + {"color": "green", "value": 5} + ] + }, + "custom": { + "axisPlacement": "auto", + "axisLabel": "", + "fillOpacity": 80, + "gradientMode": "none", + "lineWidth": 1 + } + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "year"}, + "properties": [ + {"id": "unit", "value": "string"} + ] + } + ] + }, + "options": { + "barRadius": 0, + "barWidth": 0.6, + "groupWidth": 0.7, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "xField": "year", + "xTickLabelRotation": 0, + "legend": {"displayMode": "list", "placement": "bottom"}, + "tooltip": {"mode": "single", "sort": "none"} + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "WITH yearly AS (SELECT EXTRACT(YEAR FROM valuation_date)::int AS yr, valuation_date, SUM(total_value) AS nw, SUM(net_contribution) AS contrib FROM daily_account_valuation GROUP BY valuation_date), endpoints AS (SELECT yr, (array_agg(nw ORDER BY valuation_date ASC))[1] AS nw_start, (array_agg(nw ORDER BY valuation_date DESC))[1] AS nw_end, (array_agg(contrib ORDER BY valuation_date ASC))[1] AS contrib_start, (array_agg(contrib ORDER BY valuation_date DESC))[1] AS contrib_end FROM yearly GROUP BY yr) SELECT yr::text AS year, ROUND((((nw_end - nw_start - (contrib_end - contrib_start)) / NULLIF(nw_start + 0.5 * (contrib_end - contrib_start), 0)) * 100)::numeric, 2) AS return_pct FROM endpoints WHERE (nw_start + 0.5 * (contrib_end - contrib_start)) > 0 ORDER BY yr" + } + ] + }, + { + "id": 13, + "title": "Annual change decomposition — contributions vs market gain", + "description": "Each calendar year's net worth change split into 'new money in' (contributions − withdrawals) and 'market gain' (everything else: price appreciation, dividends, etc.). Shows whether you grew because you saved or because the market did the work. 
Negative bars = withdrawals or market losses.", + "type": "barchart", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "gridPos": {"h": 11, "w": 24, "x": 0, "y": 56}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "unit": "currencyGBP", + "decimals": 0, + "custom": { + "axisPlacement": "auto", + "axisLabel": "", + "fillOpacity": 80, + "gradientMode": "none", + "lineWidth": 1 + } + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "year"}, + "properties": [ + {"id": "unit", "value": "string"} + ] + }, + { + "matcher": {"id": "byName", "options": "contributions"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "blue"}}, + {"id": "displayName", "value": "Net contributions"} + ] + }, + { + "matcher": {"id": "byName", "options": "market_gain"}, + "properties": [ + {"id": "color", "value": {"mode": "fixed", "fixedColor": "#56A64B"}}, + {"id": "displayName", "value": "Market gain"} + ] + } + ] + }, + "options": { + "barRadius": 0, + "barWidth": 0.6, + "groupWidth": 0.7, + "orientation": "auto", + "showValue": "auto", + "stacking": "normal", + "xField": "year", + "xTickLabelRotation": 0, + "legend": {"calcs": ["sum"], "displayMode": "table", "placement": "bottom"}, + "tooltip": {"mode": "multi", "sort": "desc"} + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "WITH yearly AS (SELECT EXTRACT(YEAR FROM valuation_date)::int AS yr, valuation_date, SUM(total_value) AS nw, SUM(net_contribution) AS contrib FROM daily_account_valuation GROUP BY valuation_date), endpoints AS (SELECT yr, (array_agg(nw ORDER BY valuation_date ASC))[1] AS nw_start, (array_agg(nw ORDER BY valuation_date DESC))[1] AS nw_end, (array_agg(contrib ORDER BY valuation_date ASC))[1] AS contrib_start, (array_agg(contrib ORDER BY valuation_date DESC))[1] AS contrib_end FROM yearly GROUP BY yr) SELECT yr::text AS year, ROUND((contrib_end - contrib_start)::numeric, 0) AS contributions, ROUND((nw_end - nw_start - (contrib_end - contrib_start))::numeric, 0) AS market_gain FROM endpoints ORDER BY yr" + } + ] + }, + { + "id": 14, + "title": "Per-account ROI %", + "description": "(market value − net contribution) / net contribution × 100, latest snapshot. Excludes accounts with zero/negative net contribution (Schwab — RSU vests sold = negative contribution distorts the ratio). 
Pension shows 0% because Wealthfolio doesn't track underlying fund holdings, so cost_basis = 0 and 'growth' is just the cash balance reported.", + "type": "barchart", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 67}, + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "unit": "percent", + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "yellow", "value": 0}, + {"color": "green", "value": 10} + ] + }, + "custom": { + "axisPlacement": "auto", + "axisLabel": "", + "fillOpacity": 80, + "gradientMode": "none", + "lineWidth": 1 + } + }, + "overrides": [] + }, + "options": { + "barRadius": 0, + "barWidth": 0.6, + "groupWidth": 0.7, + "orientation": "horizontal", + "showValue": "always", + "stacking": "none", + "xField": "account", + "legend": {"displayMode": "list", "placement": "bottom"}, + "tooltip": {"mode": "single", "sort": "none"} + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "grafana-postgresql-datasource", "uid": "wealth-pg"}, + "rawQuery": true, + "editorMode": "code", + "format": "table", + "rawSql": "SELECT a.name AS account, ROUND(((d.total_value - d.net_contribution) / NULLIF(d.net_contribution, 0) * 100)::numeric, 2) AS roi_pct FROM daily_account_valuation d JOIN accounts a ON a.id = d.account_id WHERE d.valuation_date = (SELECT MAX(valuation_date) FROM daily_account_valuation) AND d.net_contribution > 0 ORDER BY roi_pct DESC" + } + ] + } + ], + "refresh": "5m", + "schemaVersion": 39, + "tags": ["finance", "personal", "wealth"], + "templating": {"list": []}, + "time": {"from": "now-5y", "to": "now"}, + "timepicker": {}, + "timezone": "browser", + "title": "Wealth", + "uid": "wealth", + "version": 1 +} diff --git a/stacks/monitoring/modules/monitoring/grafana.tf b/stacks/monitoring/modules/monitoring/grafana.tf index 2c5089ee..b5a5f249 100644 --- a/stacks/monitoring/modules/monitoring/grafana.tf +++ b/stacks/monitoring/modules/monitoring/grafana.tf @@ -134,9 +134,19 @@ locals { # Applications "qbittorrent.json" = "Applications" "realestate-crawler.json" = "Applications" - "uk-payslip.json" = "Finance" + "uk-payslip.json" = "Finance (Personal)" + "wealth.json" = "Finance (Personal)" "job-hunter.json" = "Finance" + "fire-planner.json" = "Finance" } + + # Folders restricted to the Grafana admin user (anonymous Viewer + any future + # non-admin users are denied). Permission set by null_resource below via the + # Grafana folder permissions API after the dashboard sidecar auto-creates the + # folder. Server-admin always retains access regardless of folder ACL. + admin_only_folders = [ + "Finance (Personal)", + ] } resource "kubernetes_config_map" "grafana_dashboards" { @@ -157,6 +167,60 @@ resource "kubernetes_config_map" "grafana_dashboards" { } } +# Lock down "admin only" folders via Grafana folder permissions API. +# Default org-role inheritance gives Viewer + Editor read access to every +# folder; explicitly setting the folder ACL to {Admin: 4} overrides that +# inheritance so Viewer/Editor (incl. anonymous-Viewer) get no access. +# The Grafana super-admin (`admin` user) always retains access regardless. +resource "null_resource" "grafana_admin_only_folder_acl" { + for_each = toset(local.admin_only_folders) + + # Re-runs on tg apply (cheap, idempotent API call). Catches drift if anyone + # edits permissions via the UI or the folder is rebuilt. 
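+  # Spot-check after apply (endpoint and expected shape are assumptions based on the
+  # Grafana folder-permissions API): GET /api/folders/<uid>/permissions with the same
+  # admin credentials used in the provisioner below should list only the single
+  # {role: "Admin", permission: 4} item.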
+ triggers = { + folder = each.value + always = timestamp() + } + + provisioner "local-exec" { + interpreter = ["/bin/bash", "-c"] + command = <<-EOT + set -euo pipefail + FOLDER='${each.value}' + KUBECONFIG_FLAG='--kubeconfig ${var.kube_config_path}' + POD=$(kubectl $KUBECONFIG_FLAG get pod -n monitoring -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}') + ADMIN_PW=$(kubectl $KUBECONFIG_FLAG get secret -n monitoring grafana -o jsonpath='{.data.admin-password}' | base64 -d) + + # Wait up to 60s for the dashboard sidecar to materialise the folder. + for i in $(seq 1 12); do + FOLDER_UID=$(kubectl $KUBECONFIG_FLAG exec -n monitoring "$POD" -c grafana -- \ + curl -sf -u "admin:$ADMIN_PW" "http://localhost:3000/api/folders" \ + | python3 -c "import json,sys; folders=json.load(sys.stdin); print(next((f['uid'] for f in folders if f['title']==sys.argv[1]), ''))" "$FOLDER" || true) + if [ -n "$FOLDER_UID" ]; then break; fi + sleep 5 + done + + if [ -z "$FOLDER_UID" ]; then + echo "ERROR: folder '$FOLDER' not found in Grafana after 60s" + exit 1 + fi + + # Admin-only ACL. permission codes: 1=View, 2=Edit, 4=Admin. + kubectl $KUBECONFIG_FLAG exec -n monitoring "$POD" -c grafana -- \ + curl -sf -u "admin:$ADMIN_PW" -X POST \ + -H "Content-Type: application/json" \ + -d '{"items":[{"role":"Admin","permission":4}]}' \ + "http://localhost:3000/api/folders/$FOLDER_UID/permissions" >/dev/null + echo "set admin-only ACL on folder '$FOLDER' (uid=$FOLDER_UID)" + EOT + } + + depends_on = [ + helm_release.grafana, + kubernetes_config_map.grafana_dashboards, + ] +} + resource "helm_release" "grafana" { namespace = kubernetes_namespace.monitoring.metadata[0].name create_namespace = true diff --git a/stacks/monitoring/modules/monitoring/main.tf b/stacks/monitoring/modules/monitoring/main.tf index db0c798e..d55ac703 100644 --- a/stacks/monitoring/modules/monitoring/main.tf +++ b/stacks/monitoring/modules/monitoring/main.tf @@ -27,6 +27,10 @@ variable "grafana_admin_password" { type = string sensitive = true } +variable "kube_config_path" { + type = string + sensitive = true +} variable "tier" { type = string } variable "mysql_host" { type = string } variable "registry_user" { diff --git a/stacks/monitoring/modules/monitoring/prometheus.tf b/stacks/monitoring/modules/monitoring/prometheus.tf index 7e998b91..c317775e 100644 --- a/stacks/monitoring/modules/monitoring/prometheus.tf +++ b/stacks/monitoring/modules/monitoring/prometheus.tf @@ -40,8 +40,11 @@ resource "helm_release" "prometheus" { # version = "15.0.2" version = "25.8.2" - timeout = 900 # 15 min — Recreate strategy + iSCSI reattach is slow - force_update = true # Required for StatefulSet volumeClaimTemplate changes (immutable field) + timeout = 900 # 15 min — Recreate strategy + iSCSI reattach is slow + # force_update disabled 2026-04-23: caused Helm to try replacing the bound + # pushgateway PVC (added in rev 188, see commit e51c104), which is immutable. + # Re-enable temporarily only when a StatefulSet volumeClaimTemplate change needs --force. 
+ force_update = false values = [templatefile("${path.module}/prometheus_chart_values.tpl", { alertmanager_mail_pass = var.alertmanager_account_password, alertmanager_slack_api_url = var.alertmanager_slack_api_url, tuya_api_key = var.tiny_tuya_service_secret, haos_api_token = var.haos_api_token })] } diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index b0233985..0051982e 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -73,7 +73,7 @@ alertmanager: - source_matchers: - alertname = NodeDown target_matchers: - - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|RedisDown|HeadscaleDown|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing" + - alertname =~ "NodeNotReady|NodeConditionBad|PodCrashLooping|ContainerOOMKilled|DeploymentReplicasMismatch|StatefulSetReplicasMismatch|DaemonSetMissingPods|ScrapeTargetDown|NodeLowFreeMemory|PostgreSQLDown|RedisDown|HeadscaleDown|HeadscaleReplicasMismatch|AuthentikDown|PoisonFountainDown|HackmdDown|PrivatebinDown|MailServerDown|EmailRoundtripFailing|EmailRoundtripStale|NodeExporterDown|DockerRegistryDown|HomeAssistantDown|CloudflaredDown|TechnitiumDNSDown|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|HomeAssistantMetricsMissing" # NFS down causes mass pod failures and NFS-dependent service outages - source_matchers: - alertname = NFSServerUnresponsive @@ -98,7 +98,7 @@ alertmanager: - source_matchers: - alertname = PowerOutage target_matchers: - - alertname =~ "NodeDown|NFSServerUnresponsive|NodeExporterDown|CloudflaredDown|MetalLBSpeakerDown|MetalLBControllerDown|UPSMetricsMissing|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|ATSMetricsMissing|HomeAssistantMetricsMissing|FuseMainMetricsMissing|FuseGarageMetricsMissing|ProxmoxMetricsMissing|iDRACSystemUnhealthy|iDRACServerPoweredOff|ProxmoxExporterDown" + - alertname =~ "NodeDown|NFSServerUnresponsive|NodeExporterDown|CloudflaredDown|MetalLBSpeakerDown|MetalLBControllerDown|UPSMetricsMissing|iDRACRedfishMetricsMissing|iDRACSNMPMetricsMissing|ATSMetricsMissing|HomeAssistantMetricsMissing|FuseMainMetricsMissing|FuseGarageMetricsMissing|ThermostatHolMetricsMissing|ThermostatMasterBedroomMetricsMissing|ThermostatOfficeMetricsMissing|ThermostatKidsRoomMetricsMissing|ProxmoxMetricsMissing|iDRACSystemUnhealthy|iDRACServerPoweredOff|ProxmoxExporterDown" # iDRAC system-level unhealthy suppresses component-level alerts - source_matchers: - alertname = iDRACSystemUnhealthy @@ -113,6 +113,11 @@ alertmanager: - alertname = FuseGarageFault target_matchers: - alertname = FuseGarageMetricsMissing + # Tuya Cloud API down suppresses all per-device metrics-missing alerts + - source_matchers: + - alertname = TuyaCloudDown + target_matchers: + - alertname =~ "ATSMetricsMissing|FuseMainMetricsMissing|FuseGarageMetricsMissing|ThermostatHolMetricsMissing|ThermostatMasterBedroomMetricsMissing|ThermostatOfficeMetricsMissing|ThermostatKidsRoomMetricsMissing" # Containerd broken suppresses downstream pod alerts - source_matchers: - alertname = 
KubeletImagePullErrors @@ -160,6 +165,27 @@ prometheus-node-exporter: memory: 100Mi limits: memory: 100Mi +# NOTE: The parent chart forwards subchart values under `prometheus-pushgateway:`, +# not `pushgateway:` — using the wrong key silently no-ops. +prometheus-pushgateway: + # Without persistence the pushgateway's in-memory metrics are lost on restart. + # Once-per-day pushers (offsite-backup-sync) stay invisible until their next run, + # which is why backup_last_success_timestamp{job="offsite-backup-sync"} vanished + # after the 2026-04-22 node3 kubelet hiccup. + persistentVolume: + enabled: true + size: 2Gi + storageClass: proxmox-lvm-encrypted + mountPath: /data + extraArgs: + - --persistence.file=/data/pushgateway.bin + - --persistence.interval=1m + resources: + requests: + cpu: 10m + memory: 64Mi + limits: + memory: 256Mi server: # Enable me to delete metrics extraFlags: @@ -726,6 +752,7 @@ serverFiles: for: 30m labels: severity: info + subsystem: gpu annotations: summary: "GPU power: {{ $value | printf \"%.0f\" }}W (threshold: 50W)" - alert: HighUtilization @@ -749,6 +776,14 @@ serverFiles: severity: critical annotations: summary: "NVIDIA GPU exporter is down - no GPU metrics available" + - alert: GPUNodeUnschedulable + expr: kube_node_spec_unschedulable{node="k8s-node1"} == 1 + for: 5m + labels: + severity: critical + subsystem: gpu + annotations: + summary: "GPU node {{ $labels.node }} is cordoned — Frigate and GPU workloads cannot schedule" - name: Power rules: - alert: OnBattery @@ -777,6 +812,7 @@ serverFiles: for: 60m labels: severity: info + subsystem: r730 annotations: summary: "Server power: {{ $value | printf \"%.0f\" }}W (threshold: 300W)" - alert: UsingInverterEnergyForTooLong @@ -914,6 +950,83 @@ serverFiles: severity: critical annotations: summary: "Garage fuse panel fault detected" + - alert: FuseMainHighLeakage + expr: fuse_main_leakage_current > 30 + for: 5m + labels: + severity: critical + annotations: + summary: "Main fuse leakage current: {{ $value }}mA (threshold: 30mA)" + - alert: FuseGarageHighLeakage + expr: fuse_garage_leakage_current > 30 + for: 5m + labels: + severity: critical + annotations: + summary: "Garage fuse leakage current: {{ $value }}mA (threshold: 30mA)" + - alert: FuseMainOvertemperature + expr: fuse_main_temperature > 70 + for: 5m + labels: + severity: warning + annotations: + summary: "Main fuse temperature: {{ $value }}°C (threshold: 70°C)" + - alert: FuseGarageOvertemperature + expr: fuse_garage_temperature > 70 + for: 5m + labels: + severity: warning + annotations: + summary: "Garage fuse temperature: {{ $value }}°C (threshold: 70°C)" + - alert: FuseMainVoltageAbnormal + expr: fuse_main_voltage / 10 < 200 or fuse_main_voltage / 10 > 260 + for: 5m + labels: + severity: critical + annotations: + summary: "Main fuse voltage: {{ $value }}V (expected 200-260V)" + - alert: FuseGarageVoltageAbnormal + expr: fuse_garage_voltage / 10 < 200 or fuse_garage_voltage / 10 > 260 + for: 5m + labels: + severity: critical + annotations: + summary: "Garage fuse voltage: {{ $value }}V (expected 200-260V)" + - name: Thermostats + rules: + - alert: ThermostatOverheating + expr: > + thermostat_hol_temp_current > 400 + or thermostat_master_bedroom_temp_current > 400 + or thermostat_office_temp_current > 400 + or thermostat_kids_room_temp_current > 400 + for: 10m + labels: + severity: warning + annotations: + summary: "Thermostat temperature {{ $value | printf \"%.1f\" }} (x10 °C) exceeds 40°C" + - alert: ThermostatFreezing + expr: > + 
thermostat_hol_temp_current < 50 + or thermostat_master_bedroom_temp_current < 50 + or thermostat_office_temp_current < 50 + or thermostat_kids_room_temp_current < 50 + for: 15m + labels: + severity: critical + annotations: + summary: "Thermostat temperature {{ $value | printf \"%.1f\" }} (x10 °C) below 5°C — risk of freezing" + - alert: ThermostatHumidityHigh + expr: > + thermostat_hol_humidity > 80 + or thermostat_master_bedroom_humidity > 80 + or thermostat_office_humidity > 80 + or thermostat_kids_room_humidity > 80 + for: 30m + labels: + severity: warning + annotations: + summary: "Thermostat humidity {{ $value }}% exceeds 80%" - name: Metric Staleness rules: - alert: UPSMetricsMissing @@ -958,6 +1071,41 @@ serverFiles: severity: warning annotations: summary: "Fuse garage panel metrics missing for 15m - check tuya-bridge pod" + - alert: ThermostatHolMetricsMissing + expr: absent(thermostat_hol_temp_current) + for: 15m + labels: + severity: warning + annotations: + summary: "Thermostat hol metrics missing for 15m - check tuya-bridge pod" + - alert: ThermostatMasterBedroomMetricsMissing + expr: absent(thermostat_master_bedroom_temp_current) + for: 15m + labels: + severity: warning + annotations: + summary: "Thermostat master bedroom metrics missing for 15m - check tuya-bridge pod" + - alert: ThermostatOfficeMetricsMissing + expr: absent(thermostat_office_temp_current) + for: 15m + labels: + severity: warning + annotations: + summary: "Thermostat office metrics missing for 15m - check tuya-bridge pod" + - alert: ThermostatKidsRoomMetricsMissing + expr: absent(thermostat_kids_room_temp_current) + for: 15m + labels: + severity: warning + annotations: + summary: "Thermostat kids room metrics missing for 15m - check tuya-bridge pod" + - alert: TuyaCloudDown + expr: count(({__name__=~".*_tuya_cloud_up"}) == 0) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Tuya Cloud API rejecting calls ({{ $value }} devices affected) — renew subscription at iot.tuya.com (code 28841002 = expired trial) or rotate TINYTUYA_API_KEY" - alert: ProxmoxMetricsMissing expr: absent(pve_up) for: 10m @@ -1071,6 +1219,14 @@ serverFiles: severity: warning annotations: summary: "Home Assistant down: {{ $labels.instance }}" + - alert: HomeAssistantCriticalSensorUnavailable + expr: haos_entity_available{entity=~"sensor\\.(tesla_t4_gpu_(temperature|power_usage|utilization|memory_used)|r730_(cpu_temperature|power_consumption|power_supply_input_voltage_[12]|system_board_(exhaust|inlet)_temperature)|ups_(input_voltage|output_voltage|load|battery_remaining|output_source))"} == 0 + for: 15m + labels: + severity: critical + annotations: + summary: "HA sensor unavailable: {{ $labels.friendly_name }} ({{ $labels.entity }})" + description: "{{ $labels.entity }} on {{ $labels.instance }} has been unavailable for 15+ minutes. Common cause: REST sensor needs HA restart (reload_all doesn't rebuild rest: platform). Verify exporter endpoint from HA: `ssh vbarzin@192.168.1.8` → `curl -sk `. Fix: `curl -X POST -H \"Authorization: Bearer $HOME_ASSISTANT_SOFIA_TOKEN\" $HOME_ASSISTANT_SOFIA_URL/api/services/homeassistant/restart`." 
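The thermostat rules above all hinge on Tuya's x10 temperature scaling (a raw reading of 50 means 5.0 °C), which is easy to get wrong when adjusting thresholds. A promtool unit test is a cheap guard; this is a minimal sketch that assumes the Thermostats rule group has been extracted from the templated values file into a plain rules file, and both file names below are illustrative:

```sh
# Hypothetical file names; the rule group itself comes from prometheus_chart_values.tpl above.
cat > thermostat-rules-test.yml <<'EOF'
rule_files:
  - thermostat-rules.yml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # 40 on the x10 scale = 4.0 °C, held flat for 20 minutes
      - series: 'thermostat_office_temp_current'
        values: '40+0x20'
    alert_rule_test:
      - eval_time: 16m   # just past the 15m "for:" window
        alertname: ThermostatFreezing
        exp_alerts:
          - exp_labels:
              severity: critical
EOF
promtool test rules thermostat-rules-test.yml
```

The staleness rules can be covered the same way by letting the input series end partway through the test window.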
- alert: CoreDNSErrors expr: rate(coredns_dns_responses_total{rcode="SERVFAIL"}[5m]) > 1 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 for: 10m @@ -1155,6 +1311,28 @@ serverFiles: severity: critical annotations: summary: "Vault backup CronJob has never completed successfully" + - alert: VaultRaftLeaderStuck + expr: | + (vault_core_active == 1) + and on(instance) + (rate(vault_raft_last_index_gauge[5m]) == 0) + for: 2m + labels: + severity: critical + annotations: + summary: "Vault raft leader {{ $labels.instance }} is active but commit index has not advanced for >2m" + description: "The raft leader is reachable on TCP but its commit index has stalled — likely a stuck goroutine hang (see 2026-04-22 post-mortem). External /v1/sys/health will be 503. Recovery: graceful delete of the stuck pod (see docs/runbooks/vault-raft-leader-deadlock.md). NOTE: silent until vault telemetry + scrape job are enabled." + - alert: VaultHAStatusUnavailable + expr: | + (count(up{job="vault"} == 1) > 0) + and + (sum(vault_core_active) == 0) + for: 5m + labels: + severity: critical + annotations: + summary: "Vault pods are Up but no pod reports HA active leader" + description: "At least one Vault pod is scraping healthy, but no pod has vault_core_active=1. HA layer is broken — external endpoint will be 503 even though the pods themselves are alive. See docs/runbooks/vault-raft-leader-deadlock.md. NOTE: silent until vault telemetry + scrape job are enabled." - alert: VaultwardenBackupStale expr: (time() - kube_cronjob_status_last_successful_time{cronjob="vaultwarden-backup", namespace="vaultwarden"}) > 86400 for: 30m @@ -1247,12 +1425,13 @@ serverFiles: annotations: summary: "Backup job failed: {{ $labels.namespace }}/{{ $labels.job_name }}" - alert: LVMSnapshotStale - expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 172800 + expr: (time() - lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) > 108000 for: 30m labels: severity: critical annotations: summary: "LVM PVC snapshots are {{ $value | humanizeDuration }} old (expected daily)" + description: "Timer lvm-pvc-snapshot.timer on 192.168.1.127 hasn't pushed fresh metrics. Runbook: docs/runbooks/restore-lvm-snapshot.md" - alert: LVMSnapshotNeverRun expr: absent(lvm_snapshot_last_run_timestamp{job="lvm-pvc-snapshot"}) for: 48h @@ -1411,7 +1590,7 @@ serverFiles: severity: warning annotations: summary: "Redis master {{ $labels.pod }} has only {{ $value }} connected replicas (expected 2)" - - alert: HeadscaleDown + - alert: HeadscaleReplicasMismatch expr: (kube_deployment_status_replicas_available{namespace="headscale"} or on() vector(0)) < 1 for: 5m labels: @@ -1815,7 +1994,7 @@ serverFiles: summary: "Email round-trip probe failing. Check MX DNS, Postfix, Mailgun API, and IMAP."
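While the vault telemetry endpoint and scrape job are still pending, the VaultRaftLeaderStuck condition can be approximated by hand: `vault status` is unauthenticated and, on integrated storage, reports the raft commit index. A rough sketch, assuming the pod is `vault-0` in the `vault` namespace, `jq` is available locally, and the JSON field is exposed as `raft_committed_index`:

```sh
# Read the raft committed index twice, two minutes apart; a stalled index on the
# active node matches the VaultRaftLeaderStuck condition above.
IDX1=$(kubectl exec -n vault vault-0 -- vault status -format=json | jq -r '.raft_committed_index')
sleep 120
IDX2=$(kubectl exec -n vault vault-0 -- vault status -format=json | jq -r '.raft_committed_index')
if [ "$IDX1" = "$IDX2" ]; then
  echo "commit index stuck at $IDX1 - matches the VaultRaftLeaderStuck condition"
else
  echo "commit index advanced: $IDX1 -> $IDX2"
fi
```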
- alert: EmailRoundtripStale expr: (time() - email_roundtrip_last_success_timestamp{job="email-roundtrip-monitor"}) > 3600 - for: 10m + for: 20m labels: severity: warning annotations: @@ -1980,7 +2159,7 @@ serverFiles: annotations: summary: "Technitium zone-sync has not run successfully in >1h (last: {{ $value | humanizeDuration }} ago)" - alert: TechnitiumZoneCountMismatch - expr: (max(technitium_zone_count) - min(technitium_zone_count)) > 0 + expr: (max(technitium_zone_count{instance!="primary"}) - min(technitium_zone_count{instance!="primary"})) > 0 for: 15m labels: severity: warning @@ -1995,13 +2174,6 @@ serverFiles: summary: "CoreDNS forward SERVFAIL/REFUSED rate: {{ $value | printf \"%.2f\" }}/s — upstream DNS (pfSense/public) may be unhealthy" - name: qbittorrent rules: - - alert: MAMMouseClass - expr: mam_class_code == 0 - for: 1h - labels: - severity: critical - annotations: - summary: "MAM account is in Mouse class — tracker is refusing announces, ratio cannot recover" - alert: MAMCookieExpired expr: mam_farming_cookie_expired > 0 for: 0m @@ -2040,13 +2212,6 @@ serverFiles: severity: critical annotations: summary: "qBittorrent is disconnected from the network" - - alert: QBittorrentMAMUnsatisfied - expr: qbt_tracker_unsatisfied{tracker="mam"} > 15 - for: 10m - labels: - severity: warning - annotations: - summary: "{{ $value | printf \"%.0f\" }} MAM torrents not yet seeded 72h (limit: 20 for new members)" - name: Headscale VPN rules: @@ -2361,6 +2526,58 @@ extraScrapeConfigs: | action: replace regex: '(.*)' replacement: 'fuse_main_$${1}' + - job_name: 'thermostat-hol' + static_configs: + - targets: + - "tuya-bridge.tuya-bridge.svc.cluster.local:80" + metrics_path: '/metrics/bf7efce9519bd508df431s' + params: + api-key: ['${tuya_api_key}'] + metric_relabel_configs: + - source_labels: [ __name__ ] + target_label: '__name__' + action: replace + regex: '(.*)' + replacement: 'thermostat_hol_$${1}' + - job_name: 'thermostat-master-bedroom' + static_configs: + - targets: + - "tuya-bridge.tuya-bridge.svc.cluster.local:80" + metrics_path: '/metrics/bf70e80159641f61a5lzho' + params: + api-key: ['${tuya_api_key}'] + metric_relabel_configs: + - source_labels: [ __name__ ] + target_label: '__name__' + action: replace + regex: '(.*)' + replacement: 'thermostat_master_bedroom_$${1}' + - job_name: 'thermostat-office' + static_configs: + - targets: + - "tuya-bridge.tuya-bridge.svc.cluster.local:80" + metrics_path: '/metrics/bf9597a0064f0349d4b09x' + params: + api-key: ['${tuya_api_key}'] + metric_relabel_configs: + - source_labels: [ __name__ ] + target_label: '__name__' + action: replace + regex: '(.*)' + replacement: 'thermostat_office_$${1}' + - job_name: 'thermostat-kids-room' + static_configs: + - targets: + - "tuya-bridge.tuya-bridge.svc.cluster.local:80" + metrics_path: '/metrics/bfe64da91577117e0annt5' + params: + api-key: ['${tuya_api_key}'] + metric_relabel_configs: + - source_labels: [ __name__ ] + target_label: '__name__' + action: replace + regex: '(.*)' + replacement: 'thermostat_kids_room_$${1}' - job_name: 'haos' static_configs: - targets: diff --git a/stacks/nextcloud/main.tf b/stacks/nextcloud/main.tf index 14a5122d..d737fa5c 100644 --- a/stacks/nextcloud/main.tf +++ b/stacks/nextcloud/main.tf @@ -493,6 +493,25 @@ resource "kubernetes_cron_job_v1" "nextcloud-backup" { spec { restart_policy = "OnFailure" + # Backup mounts the same RWO PVC (proxmox-lvm-encrypted) as the + # main nextcloud pod, so it MUST schedule on the same node — the + # volume cannot attach to two nodes 
simultaneously. Without this + # the backup pod is stuck in ContainerCreating until cron retries. + affinity { + pod_affinity { + required_during_scheduling_ignored_during_execution { + label_selector { + match_labels = { + "app.kubernetes.io/name" = "nextcloud" + "app.kubernetes.io/instance" = "nextcloud" + } + } + topology_key = "kubernetes.io/hostname" + namespaces = [kubernetes_namespace.nextcloud.metadata[0].name] + } + } + } + container { name = "backup" image = "alpine:latest" diff --git a/stacks/nvidia/modules/nvidia/main.tf b/stacks/nvidia/modules/nvidia/main.tf index f11bd2c3..720f6daf 100644 --- a/stacks/nvidia/modules/nvidia/main.tf +++ b/stacks/nvidia/modules/nvidia/main.tf @@ -63,18 +63,25 @@ resource "kubernetes_resource_quota" "nvidia_quota" { } } -# Apply GPU taint and label to ensure only GPU workloads run on GPU node +# Apply GPU taint dynamically based on NFD-discovered GPU nodes. The +# NFD label `feature.node.kubernetes.io/pci-10de.present=true` is +# auto-applied on any node with an NVIDIA PCI device (vendor 0x10de), +# so the taint follows the card if it moves between nodes. Workload +# nodeSelectors key off `nvidia.com/gpu.present=true` (applied by +# gpu-feature-discovery once the operator is up). resource "null_resource" "gpu_node_config" { provisioner "local-exec" { command = <<-EOT - kubectl taint nodes k8s-node1 nvidia.com/gpu=true:PreferNoSchedule --overwrite - kubectl label nodes k8s-node1 gpu=true --overwrite + set -euo pipefail + for node in $(kubectl get nodes -l feature.node.kubernetes.io/pci-10de.present=true -o jsonpath='{.items[*].metadata.name}'); do + kubectl taint nodes "$node" nvidia.com/gpu=true:PreferNoSchedule --overwrite + done EOT } - # Re-run if namespace changes (proxy for cluster changes) triggers = { - namespace = kubernetes_namespace.nvidia.metadata[0].name + namespace = kubernetes_namespace.nvidia.metadata[0].name + command_hash = "dynamic-taint-v1" } } @@ -141,7 +148,7 @@ resource "kubernetes_deployment" "nvidia-exporter" { } spec { node_selector = { - "gpu" : "true" + "nvidia.com/gpu.present" : "true" } toleration { key = "nvidia.com/gpu" @@ -604,7 +611,7 @@ resource "kubernetes_daemonset" "gpu_pod_exporter" { service_account_name = kubernetes_service_account.gpu_pod_exporter.metadata[0].name node_selector = { - "gpu" : "true" + "nvidia.com/gpu.present" : "true" } toleration { diff --git a/stacks/paperless-ngx/main.tf b/stacks/paperless-ngx/main.tf index bceafaf2..4bafe6ce 100644 --- a/stacks/paperless-ngx/main.tf +++ b/stacks/paperless-ngx/main.tf @@ -86,6 +86,28 @@ resource "kubernetes_persistent_volume_claim" "data_proxmox" { } } +resource "kubernetes_persistent_volume_claim" "data_encrypted" { + wait_until_bound = false + metadata { + name = "paperless-ngx-data-encrypted" + namespace = kubernetes_namespace.paperless-ngx.metadata[0].name + annotations = { + "resize.topolvm.io/threshold" = "80%" + "resize.topolvm.io/increase" = "100%" + "resize.topolvm.io/storage_limit" = "5Gi" + } + } + spec { + access_modes = ["ReadWriteOnce"] + storage_class_name = "proxmox-lvm-encrypted" + resources { + requests = { + storage = "1Gi" + } + } + } +} + resource "kubernetes_deployment" "paperless-ngx" { metadata { @@ -196,7 +218,7 @@ resource "kubernetes_deployment" "paperless-ngx" { volume { name = "data" persistent_volume_claim { - claim_name = kubernetes_persistent_volume_claim.data_proxmox.metadata[0].name + claim_name = kubernetes_persistent_volume_claim.data_encrypted.metadata[0].name } } } diff --git a/stacks/payslip-ingest/main.tf 
b/stacks/payslip-ingest/main.tf index 7e4d0006..8c313c25 100644 --- a/stacks/payslip-ingest/main.tf +++ b/stacks/payslip-ingest/main.tf @@ -32,7 +32,20 @@ resource "kubernetes_namespace" "payslip_ingest" { # Seed these manually in Vault before applying: # secret/paperless-ngx -> property `api_token` # secret/claude-agent-service -> property `api_bearer_token` -# secret/payslip-ingest -> property `webhook_bearer_token` +# secret/payslip-ingest -> properties: +# - `webhook_bearer_token` +# - `actualbudget_api_key` (same value as +# actualbudget-http-api-viktor random +# api-key — fetch via `kubectl get pods +# -n actualbudget -l +# app=actualbudget-http-api-viktor -o +# jsonpath={.items[0].spec.containers[0].env}` +# and grep API_KEY) +# - `actualbudget_encryption_password` +# (same as Viktor's budget password in +# secret/actualbudget/credentials[viktor]) +# - `actualbudget_budget_sync_id` +# (same as Viktor's sync_id) resource "kubernetes_manifest" "external_secret" { manifest = { apiVersion = "external-secrets.io/v1beta1" @@ -79,6 +92,27 @@ resource "kubernetes_manifest" "external_secret" { property = "webhook_bearer_token" } }, + { + secretKey = "ACTUALBUDGET_API_KEY" + remoteRef = { + key = "payslip-ingest" + property = "actualbudget_api_key" + } + }, + { + secretKey = "ACTUALBUDGET_ENCRYPTION_PASSWORD" + remoteRef = { + key = "payslip-ingest" + property = "actualbudget_encryption_password" + } + }, + { + secretKey = "ACTUALBUDGET_BUDGET_SYNC_ID" + remoteRef = { + key = "payslip-ingest" + property = "actualbudget_budget_sync_id" + } + }, ] } } @@ -288,6 +322,85 @@ resource "kubernetes_service" "payslip_ingest" { } } +# Daily sync of Meta payroll deposits from ActualBudget's http-api sidecar. +# Populates payslip_ingest.external_meta_deposits so Panel 14 can overlay bank +# deposits against payslip.net_pay — catches parser drift on net_pay. +resource "kubernetes_cron_job_v1" "actualbudget_payroll_sync" { + metadata { + name = "actualbudget-payroll-sync" + namespace = kubernetes_namespace.payslip_ingest.metadata[0].name + } + spec { + schedule = "0 2 * * *" + concurrency_policy = "Forbid" + successful_jobs_history_limit = 3 + failed_jobs_history_limit = 5 + starting_deadline_seconds = 300 + + job_template { + metadata { + labels = local.labels + } + spec { + backoff_limit = 1 + ttl_seconds_after_finished = 86400 + template { + metadata { + labels = local.labels + } + spec { + restart_policy = "OnFailure" + image_pull_secrets { + name = "registry-credentials" + } + container { + name = "sync" + image = local.image + command = ["python", "-m", "payslip_ingest", "sync-meta-deposits"] + + env_from { + secret_ref { + name = "payslip-ingest-secrets" + } + } + env_from { + secret_ref { + name = "payslip-ingest-db-creds" + } + } + + env { + name = "ACTUALBUDGET_HTTP_API_URL" + value = "http://budget-http-api-viktor.actualbudget.svc.cluster.local" + } + + resources { + requests = { + cpu = "50m" + memory = "128Mi" + } + limits = { + memory = "256Mi" + } + } + } + } + } + } + } + } + + lifecycle { + # KYVERNO_LIFECYCLE_V1 + ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] + } + + depends_on = [ + kubernetes_manifest.external_secret, + kubernetes_manifest.db_external_secret, + ] +} + # Plan-time read of the ESO-created K8s Secret for Grafana datasource password. # First apply: -target=kubernetes_manifest.db_external_secret first so the Secret exists. 
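First bring-up of this stack is order-sensitive: the Vault properties must exist before ESO can materialise the Secret, and the Secret must exist before the plan-time data source below can read it. A rough sequence, assuming the KV mount at `secret/` described in the seeding comment above and placeholder values (the real values come from the sources listed there); the repo's `scripts/tg` wrapper can stand in for the bare terragrunt calls:

```sh
# 1. Seed the Vault properties the ExternalSecret references (values are placeholders).
vault kv put secret/payslip-ingest \
  webhook_bearer_token='<token>' \
  actualbudget_api_key='<api-key>' \
  actualbudget_encryption_password='<password>' \
  actualbudget_budget_sync_id='<sync-id>'

# 2. Create just the DB ExternalSecret so ESO writes payslip-ingest-db-creds.
cd stacks/payslip-ingest
terragrunt apply -target=kubernetes_manifest.db_external_secret

# 3. Full apply once the Secret exists.
terragrunt apply
```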
data "kubernetes_secret" "payslip_ingest_db_creds" { diff --git a/stacks/poison-fountain/main.tf b/stacks/poison-fountain/main.tf index 64e7ae21..870a1675 100644 --- a/stacks/poison-fountain/main.tf +++ b/stacks/poison-fountain/main.tf @@ -219,6 +219,10 @@ module "ingress" { skip_default_rate_limit = true exclude_crowdsec = true anti_ai_scraping = false + # Deployment is scaled to 0 (see replicas above). Opt the ingress out of + # Uptime Kuma external monitoring so the sync CronJob deletes the orphaned + # `[External] poison` monitor instead of flapping DOWN. + external_monitor = false extra_annotations = { "gethomepage.dev/enabled" = "true" "gethomepage.dev/name" = "Poison Fountain" @@ -252,6 +256,13 @@ resource "kubernetes_cron_job_v1" "poison_fetcher" { name = "poison-fountain-fetcher" } spec { + security_context { + # curlimages/curl defaults to uid 100, but the NFS mount at /data is + # owned root:root 755 (writes from the main Deployment which runs as + # root). Align the CronJob with the Deployment so mkdir /data/cache + # succeeds. no_root_squash is set on the /srv/nfs export. + run_as_user = 0 + } container { name = "fetcher" image = "curlimages/curl:latest" diff --git a/stacks/poison-fountain/providers.tf b/stacks/poison-fountain/providers.tf index b337a2e9..012af700 100644 --- a/stacks/poison-fountain/providers.tf +++ b/stacks/poison-fountain/providers.tf @@ -9,6 +9,10 @@ terraform { source = "cloudflare/cloudflare" version = "~> 4" } + authentik = { + source = "goauthentik/authentik" + version = "~> 2024.10" + } } } diff --git a/stacks/redis/modules/redis/main.tf b/stacks/redis/modules/redis/main.tf index 91a938bc..c7eb9245 100644 --- a/stacks/redis/modules/redis/main.tf +++ b/stacks/redis/modules/redis/main.tf @@ -43,7 +43,7 @@ resource "kubernetes_config_map" "haproxy" { timeout connect 5s timeout client 30s timeout server 30s - timeout check 3s + timeout check 5s # Dynamic DNS resolution via cluster CoreDNS. 
Without this, haproxy # resolves server hostnames once at startup and caches forever, so @@ -82,9 +82,9 @@ resource "kubernetes_config_map" "haproxy" { tcp-check expect rstring role:master tcp-check send "QUIT\r\n" tcp-check expect string +OK - server redis-v2-0 redis-v2-0.redis-v2-headless.redis.svc.cluster.local:6379 check inter 1s fall 2 rise 2 resolvers kubernetes init-addr last,libc,none - server redis-v2-1 redis-v2-1.redis-v2-headless.redis.svc.cluster.local:6379 check inter 1s fall 2 rise 2 resolvers kubernetes init-addr last,libc,none - server redis-v2-2 redis-v2-2.redis-v2-headless.redis.svc.cluster.local:6379 check inter 1s fall 2 rise 2 resolvers kubernetes init-addr last,libc,none + server redis-v2-0 redis-v2-0.redis-v2-headless.redis.svc.cluster.local:6379 check inter 2s fall 3 rise 2 resolvers kubernetes init-addr last,libc,none + server redis-v2-1 redis-v2-1.redis-v2-headless.redis.svc.cluster.local:6379 check inter 2s fall 3 rise 2 resolvers kubernetes init-addr last,libc,none + server redis-v2-2 redis-v2-2.redis-v2-headless.redis.svc.cluster.local:6379 check inter 2s fall 3 rise 2 resolvers kubernetes init-addr last,libc,none backend redis_sentinel balance roundrobin @@ -362,8 +362,8 @@ resource "kubernetes_config_map" "redis_v2_sentinel_bootstrap" { sentinel resolve-hostnames yes sentinel announce-hostnames yes sentinel monitor mymaster $MASTER_HOST 6379 2 - sentinel down-after-milliseconds mymaster 5000 - sentinel failover-timeout mymaster 30000 + sentinel down-after-milliseconds mymaster 15000 + sentinel failover-timeout mymaster 60000 sentinel parallel-syncs mymaster 1 EOF @@ -396,7 +396,7 @@ resource "kubernetes_service" "redis_v2_headless" { } spec { cluster_ip = "None" - publish_not_ready_addresses = true + publish_not_ready_addresses = false selector = { app = "redis-v2" } @@ -451,18 +451,15 @@ resource "kubernetes_stateful_set_v1" "redis_v2" { affinity { pod_anti_affinity { - preferred_during_scheduling_ignored_during_execution { - weight = 100 - pod_affinity_term { - label_selector { - match_expressions { - key = "app" - operator = "In" - values = ["redis-v2"] - } + required_during_scheduling_ignored_during_execution { + label_selector { + match_expressions { + key = "app" + operator = "In" + values = ["redis-v2"] } - topology_key = "kubernetes.io/hostname" } + topology_key = "kubernetes.io/hostname" } } } @@ -535,8 +532,8 @@ resource "kubernetes_stateful_set_v1" "redis_v2" { } initial_delay_seconds = 15 period_seconds = 10 - timeout_seconds = 3 - failure_threshold = 3 + timeout_seconds = 10 + failure_threshold = 5 } readiness_probe { exec { @@ -580,8 +577,8 @@ resource "kubernetes_stateful_set_v1" "redis_v2" { } initial_delay_seconds = 20 period_seconds = 10 - timeout_seconds = 3 - failure_threshold = 3 + timeout_seconds = 10 + failure_threshold = 5 } readiness_probe { exec { diff --git a/stacks/technitium/modules/technitium/ha.tf b/stacks/technitium/modules/technitium/ha.tf index bd90cbfa..71097afe 100644 --- a/stacks/technitium/modules/technitium/ha.tf +++ b/stacks/technitium/modules/technitium/ha.tf @@ -434,12 +434,17 @@ resource "kubernetes_cron_job_v1" "technitium_zone_sync" { while read -r zone; do if grep -qx "$zone" /tmp/replica_zones.txt; then - # Zone exists — just resync + # Zone exists — reconcile primaryNameServerAddresses to the + # stable FQDN before resync. Without this, a zone created + # against an old pod IP (pre-service-ClusterIP era) stays + # pinned to that dead IP forever and zone transfers fail + # silently. 
Idempotent — Technitium accepts identical values. + curl -sf "$REPLICA/api/zones/options/set?token=$R_TOKEN&zone=$zone&primaryNameServerAddresses=$PRIMARY_HOST" > /dev/null || true curl -sf "$REPLICA/api/zones/resync?token=$R_TOKEN&zone=$zone" > /dev/null || true else # New zone — create as Secondary and validate response echo "NEW: Creating $zone on $REPLICA" - RESP=$(curl -sf "$REPLICA/api/zones/create?token=$R_TOKEN&zone=$zone&type=Secondary&primaryNameServerAddresses=$PRIMARY_IP" || echo '{"status":"error"}') + RESP=$(curl -sf "$REPLICA/api/zones/create?token=$R_TOKEN&zone=$zone&type=Secondary&primaryNameServerAddresses=$PRIMARY_HOST" || echo '{"status":"error"}') if echo "$RESP" | grep -q '"status":"ok"'; then SYNCED=$((SYNCED + 1)) else @@ -486,7 +491,14 @@ resource "kubernetes_cron_job_v1" "technitium_zone_sync" { value = var.technitium_password } env { - name = "PRIMARY_IP" + # Service ClusterIP — Terraform tracks it on every apply, and the + # reconcile loop below re-applies it to every existing zone on + # every run (*/30m), so any drift (e.g. service recreate → new + # ClusterIP, or historical pod-IP values still pinned on replicas) + # self-heals within a sync cycle. Hostname form was tried but + # Technitium's own resolver doesn't forward svc.cluster.local, + # so `primaryNameServerAddresses` must be a literal IP. + name = "PRIMARY_HOST" value = kubernetes_service.technitium_primary.spec[0].cluster_ip } } diff --git a/stacks/traefik/modules/traefik/main.tf b/stacks/traefik/modules/traefik/main.tf index 788b1678..14d0e907 100644 --- a/stacks/traefik/modules/traefik/main.tf +++ b/stacks/traefik/modules/traefik/main.tf @@ -200,7 +200,7 @@ resource "helm_release" "traefik" { # Explicit entrypoint timeouts to bound tail latency from slow clients "--entryPoints.websecure.transport.respondingTimeouts.readTimeout=60s", "--entryPoints.websecure.transport.respondingTimeouts.writeTimeout=60s", - "--entryPoints.websecure.transport.respondingTimeouts.idleTimeout=180s", + "--entryPoints.websecure.transport.respondingTimeouts.idleTimeout=600s", # Use forwarded headers from trusted proxies "--entryPoints.websecure.forwardedHeaders.insecure=false", "--entryPoints.web.forwardedHeaders.insecure=false", diff --git a/stacks/traefik/modules/traefik/middleware.tf b/stacks/traefik/modules/traefik/middleware.tf index 9cfac0a3..2c8ae8c4 100644 --- a/stacks/traefik/modules/traefik/middleware.tf +++ b/stacks/traefik/modules/traefik/middleware.tf @@ -244,8 +244,8 @@ resource "kubernetes_manifest" "middleware_immich_rate_limit" { } spec = { rateLimit = { - average = 500 - burst = 5000 + average = 1000 + burst = 20000 } } } diff --git a/stacks/tuya-bridge/main.tf b/stacks/tuya-bridge/main.tf index 4d87f8aa..574ed95d 100644 --- a/stacks/tuya-bridge/main.tf +++ b/stacks/tuya-bridge/main.tf @@ -118,6 +118,26 @@ resource "kubernetes_deployment" "tuya-bridge" { } } } + liveness_probe { + http_get { + path = "/health" + port = 8080 + } + initial_delay_seconds = 60 + period_seconds = 30 + timeout_seconds = 5 + failure_threshold = 6 + } + readiness_probe { + http_get { + path = "/health" + port = 8080 + } + initial_delay_seconds = 10 + period_seconds = 15 + timeout_seconds = 5 + failure_threshold = 2 + } resources { requests = { cpu = "10m" diff --git a/stacks/vault/main.tf b/stacks/vault/main.tf index 0b8ef993..a47379ff 100644 --- a/stacks/vault/main.tf +++ b/stacks/vault/main.tf @@ -25,22 +25,6 @@ module "tls_secret" { tls_secret_name = var.tls_secret_name } -# NFS StorageClass pointing to Proxmox host (replaces 
nfs-truenas for vault) -resource "kubernetes_storage_class" "nfs_proxmox" { - metadata { - name = "nfs-proxmox" - } - storage_provisioner = "nfs.csi.k8s.io" - reclaim_policy = "Retain" - volume_binding_mode = "Immediate" - allow_volume_expansion = true - parameters = { - server = "192.168.1.127" - share = "/srv/nfs" - } - mount_options = ["soft", "actimeo=5", "retrans=3", "timeo=30"] -} - resource "helm_release" "vault" { name = "vault" namespace = kubernetes_namespace.vault.metadata[0].name @@ -72,13 +56,13 @@ resource "helm_release" "vault" { dataStorage = { enabled = true size = "2Gi" - storageClass = "nfs-proxmox" # Proxmox host NFS (was nfs-truenas) + storageClass = "proxmox-lvm-encrypted" # Migrated 2026-04-25 from nfs-proxmox; raft fsync is NFS-hostile (post-mortems/2026-04-22-vault-raft-leader-deadlock.md) } auditStorage = { enabled = true size = "2Gi" - storageClass = "nfs-proxmox" # Proxmox host NFS (was nfs-truenas) + storageClass = "proxmox-lvm-encrypted" # Migrated 2026-04-25 from nfs-proxmox } standalone = { enabled = false } @@ -117,6 +101,24 @@ resource "helm_release" "vault" { } } + # fsGroupChangePolicy=OnRootMismatch skips recursive chown on restart. + # Without this, kubelet walks every file over NFS each restart; during + # 2026-04-22 outage this looped for 10m+ and blocked quorum recovery. + # The other four fields restore the chart defaults — providing pod{} + # replaces them, and missing fsGroup left vault unable to write to + # the freshly-formatted ext4 PVC during the 2026-04-25 migration. + statefulSet = { + securityContext = { + pod = { + fsGroupChangePolicy = "OnRootMismatch" + fsGroup = 1000 + runAsGroup = 1000 + runAsUser = 100 + runAsNonRoot = true + } + } + } + # Mount unseal key secret extraVolumes = [{ type = "secret" @@ -536,7 +538,8 @@ resource "vault_database_secret_backend_connection" "postgresql" { # "pg-trading", # Commented out 2026-04-06 - trading-bot disabled "pg-health", "pg-linkwarden", "pg-affine", "pg-woodpecker", "pg-claude-memory", - "pg-terraform-state", "pg-payslip-ingest", "pg-job-hunter" + "pg-terraform-state", "pg-payslip-ingest", "pg-job-hunter", + "pg-wealthfolio-sync", "pg-fire-planner" ] postgresql { @@ -690,6 +693,22 @@ resource "vault_database_secret_backend_static_role" "pg_job_hunter" { rotation_period = 604800 } +resource "vault_database_secret_backend_static_role" "pg_wealthfolio_sync" { + backend = vault_mount.database.path + db_name = vault_database_secret_backend_connection.postgresql.name + name = "pg-wealthfolio-sync" + username = "wealthfolio_sync" + rotation_period = 604800 +} + +resource "vault_database_secret_backend_static_role" "pg_fire_planner" { + backend = vault_mount.database.path + db_name = vault_database_secret_backend_connection.postgresql.name + name = "pg-fire-planner" + username = "fire_planner" + rotation_period = 604800 +} + # ============================================================================= # Kubernetes Secrets Engine — Dynamic K8s Credentials # ============================================================================= diff --git a/stacks/wealthfolio/main.tf b/stacks/wealthfolio/main.tf index a469e9b3..df4dca48 100644 --- a/stacks/wealthfolio/main.tf +++ b/stacks/wealthfolio/main.tf @@ -3,6 +3,7 @@ variable "tls_secret_name" { sensitive = true } variable "nfs_server" { type = string } +variable "postgresql_host" { type = string } resource "kubernetes_namespace" "wealthfolio" { metadata { @@ -45,6 +46,52 @@ resource "kubernetes_manifest" "external_secret" { depends_on = 
[kubernetes_namespace.wealthfolio] } +# DB credentials for the SQLite→PG ETL sidecar. Vault DB engine static role +# `pg-wealthfolio-sync` rotates this every 7 days; ExternalSecret refreshes +# the K8s Secret every 15m so the sidecar always has a valid password. +resource "kubernetes_manifest" "wealthfolio_sync_db_external_secret" { + manifest = { + apiVersion = "external-secrets.io/v1beta1" + kind = "ExternalSecret" + metadata = { + name = "wealthfolio-sync-db-creds" + namespace = "wealthfolio" + } + spec = { + refreshInterval = "15m" + secretStoreRef = { + name = "vault-database" + kind = "ClusterSecretStore" + } + target = { + name = "wealthfolio-sync-db-creds" + template = { + metadata = { + annotations = { + "reloader.stakater.com/match" = "true" + } + } + data = { + PGHOST = var.postgresql_host + PGPORT = "5432" + PGDATABASE = "wealthfolio_sync" + PGUSER = "wealthfolio_sync" + PGPASSWORD = "{{ .password }}" + } + } + } + data = [{ + secretKey = "password" + remoteRef = { + key = "static-creds/pg-wealthfolio-sync" + property = "password" + } + }] + } + } + depends_on = [kubernetes_namespace.wealthfolio] +} + module "tls_secret" { source = "../../modules/kubernetes/setup_tls_secret" namespace = kubernetes_namespace.wealthfolio.metadata[0].name @@ -214,6 +261,181 @@ resource "kubernetes_deployment" "wealthfolio" { limits = { memory = "64Mi" } } } + + # pg-sync sidecar — mirrors a small subset of SQLite into PG every hour + # so Grafana can chart net worth / contributions / growth via the + # `wealthfolio_sync` database. Mounts /data RO; writes to a tmp dir + # for the sqlite3 .backup snapshot to avoid blocking writers. Bootstrap + # DDL runs each iteration (CREATE TABLE IF NOT EXISTS — idempotent). + # Truncate-and-reload pattern: tables are small (~10k DAV rows, ~500 + # activities, 6 accounts), so a full reload each hour is simpler than + # incremental upserts and gives clean cold-start behaviour. + container { + name = "pg-sync" + image = "alpine:3.20" + env { + name = "PGHOST" + value_from { + secret_key_ref { + name = "wealthfolio-sync-db-creds" + key = "PGHOST" + } + } + } + env { + name = "PGPORT" + value_from { + secret_key_ref { + name = "wealthfolio-sync-db-creds" + key = "PGPORT" + } + } + } + env { + name = "PGDATABASE" + value_from { + secret_key_ref { + name = "wealthfolio-sync-db-creds" + key = "PGDATABASE" + } + } + } + env { + name = "PGUSER" + value_from { + secret_key_ref { + name = "wealthfolio-sync-db-creds" + key = "PGUSER" + } + } + } + env { + name = "PGPASSWORD" + value_from { + secret_key_ref { + name = "wealthfolio-sync-db-creds" + key = "PGPASSWORD" + } + } + } + command = ["/bin/sh", "-c", <<-EOT + set -eu + apk add --no-cache --quiet sqlite postgresql-client busybox-suid + mkdir -p /etc/crontabs /scripts /tmp/wf-sync + cat >/etc/crontabs/root <<'CRON' + # Hourly: snapshot SQLite, reload PG mirror. + 7 * * * * /scripts/sync.sh >>/proc/1/fd/1 2>&1 + CRON + cat >/scripts/sync.sh <<'SCRIPT' + #!/bin/sh + set -eu + TS=$(date -u +%Y-%m-%dT%H:%M:%SZ) + echo "[$TS] wealthfolio-pg-sync: starting" + + # Bootstrap schema (idempotent). 
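+          # ON_ERROR_STOP makes psql exit non-zero on the first failed statement, so with
+          # `set -eu` above the whole run aborts and the next hourly cron tick retries.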
+ psql -v ON_ERROR_STOP=1 <<'SQL' + CREATE TABLE IF NOT EXISTS accounts ( + id TEXT PRIMARY KEY, + name TEXT, + account_type TEXT, + currency TEXT, + is_active BOOLEAN + ); + CREATE TABLE IF NOT EXISTS daily_account_valuation ( + id TEXT PRIMARY KEY, + account_id TEXT NOT NULL, + valuation_date DATE NOT NULL, + account_currency TEXT, + base_currency TEXT, + fx_rate_to_base NUMERIC, + cash_balance NUMERIC, + investment_market_value NUMERIC, + total_value NUMERIC, + cost_basis NUMERIC, + net_contribution NUMERIC + ); + CREATE INDEX IF NOT EXISTS idx_dav_acct_date ON daily_account_valuation(account_id, valuation_date); + CREATE INDEX IF NOT EXISTS idx_dav_date ON daily_account_valuation(valuation_date); + CREATE TABLE IF NOT EXISTS activities ( + id TEXT PRIMARY KEY, + account_id TEXT, + asset_id TEXT, + activity_type TEXT, + activity_date TIMESTAMPTZ, + quantity NUMERIC, + unit_price NUMERIC, + amount NUMERIC, + fee NUMERIC, + currency TEXT, + fx_rate NUMERIC, + notes TEXT + ); + CREATE INDEX IF NOT EXISTS idx_act_date ON activities(activity_date); + SQL + + # Snapshot SQLite (online backup — non-blocking). + rm -f /tmp/wf-sync/snapshot.db + sqlite3 /data/wealthfolio.db ".backup /tmp/wf-sync/snapshot.db" + + # Dump source rows to TSV. + sqlite3 -separator $'\t' /tmp/wf-sync/snapshot.db \ + "SELECT id, name, account_type, currency, is_active FROM accounts;" \ + > /tmp/wf-sync/accounts.tsv + + sqlite3 -separator $'\t' /tmp/wf-sync/snapshot.db <<'SQ' > /tmp/wf-sync/dav.tsv + SELECT id, account_id, valuation_date, account_currency, base_currency, + CAST(fx_rate_to_base AS REAL), + CAST(cash_balance AS REAL), + CAST(investment_market_value AS REAL), + CAST(total_value AS REAL), + CAST(cost_basis AS REAL), + CAST(net_contribution AS REAL) + FROM daily_account_valuation + WHERE account_id != 'TOTAL'; -- synthetic pre-aggregated row; would double-count when summed + SQ + + sqlite3 -separator $'\t' /tmp/wf-sync/snapshot.db <<'SQ' > /tmp/wf-sync/activities.tsv + SELECT id, account_id, asset_id, activity_type, activity_date, + CAST(quantity AS REAL), + CAST(unit_price AS REAL), + CAST(amount AS REAL), + CAST(fee AS REAL), + currency, + CAST(fx_rate AS REAL), + notes + FROM activities WHERE status='POSTED'; + SQ + + # Truncate-and-reload (small tables; simpler than upserts). + psql -v ON_ERROR_STOP=1 <