From 43254ccd3fb4db46b8911af4c49f499fdcd95de3 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Sat, 18 Apr 2026 23:21:36 +0000
Subject: [PATCH 1/3] [infra] Add Woodpecker pipeline to deploy PVE
 /etc/exports (Wave 6b)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Context

Wave 6b of the state-drift consolidation plan. `scripts/pve-nfs-exports` is
the git-managed source of truth for the Proxmox host's NFS export table
(file header documents this since the 2026-04-14 NFS outage post-mortem).
Deploying it was runbook-only — `scp` then `ssh ... exportfs -ra` — which
means a change could sit unpushed-to-PVE indefinitely, and nothing alerted
on divergence between git and host.

Wave 6b closes that loop: a Woodpecker pipeline watches
`scripts/pve-nfs-exports` on the `master` branch, diffs against the
current host file, and scp's the new content followed by `exportfs -ra`.
The same 2-shell-command runbook, now a CI step that runs on every push
and is manually triggerable.

## This change

- New pipeline `.woodpecker/pve-nfs-exports-sync.yml` — path-filtered push
  trigger + manual.
- SSH credentials provisioned 2026-04-18:
  - ed25519 keypair `woodpecker-pve-nfs-exports-sync`
  - Public key in `root@192.168.1.127:~/.ssh/authorized_keys`
  - Private key in Vault `secret/woodpecker/pve_ssh_key` (a known_hosts
    entry is stored alongside it for future host-key pinning; this pipeline
    does not read it yet — see "What is NOT in this change")
  - Woodpecker repo-level secret `pve_ssh_key` (id 139) bound to the infra
    repo's `push`/`manual`/`cron` events
- Pipeline steps: install openssh + curl (alpine image) → stage private
  key from secret → ssh-keyscan the PVE host into known_hosts → diff
  current vs. proposed exports (shown in pipeline log) → scp → exportfs
  -ra → Slack notify status.

## What is NOT in this change

- Drift detection (git-truth vs. host-truth) via cron: this pipeline only
  fires on *push*, so a host-side edit wouldn't be caught. Could add a
  daily cron that just runs the diff step and alerts if non-empty. Left
  as a refinement if drift becomes an issue.
- Pulling known_hosts from Vault rather than ssh-keyscan on each run: the
  keyscan is simpler and works against key rotation without needing a
  Vault round-trip. Pulling from Vault is the right answer the moment we
  add MITM risk, which the internal network doesn't have today.

## Reproduce locally
Edit `scripts/pve-nfs-exports`, push to master. Watch the pipeline in
Woodpecker. Verify on PVE: `ssh root@192.168.1.127 "md5sum /etc/exports"`
matches `md5sum scripts/pve-nfs-exports` in the repo.

Closes: code-dne

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .woodpecker/pve-nfs-exports-sync.yml | 63 ++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 .woodpecker/pve-nfs-exports-sync.yml

diff --git a/.woodpecker/pve-nfs-exports-sync.yml b/.woodpecker/pve-nfs-exports-sync.yml
new file mode 100644
index 00000000..2c26df45
--- /dev/null
+++ b/.woodpecker/pve-nfs-exports-sync.yml
@@ -0,0 +1,63 @@
+# Sync infra/scripts/pve-nfs-exports → PVE host /etc/exports on change.
+#
+# Wave 6b of the state-drift consolidation plan: move the "scp + exportfs -ra"
+# deploy step out of runbook-human-hands and into CI so the Proxmox NFS export
+# table tracks git.
+#
+# Trigger: push to master that touches `scripts/pve-nfs-exports`, or a manual
+# run. The file header documents the deploy invocation; this pipeline codifies it.
+#
+# Credentials:
+#   - pve_ssh_key: Woodpecker repo-secret (ed25519 keypair provisioned
+#     2026-04-18 as `woodpecker-pve-nfs-exports-sync`). Public key lives in
+#     /root/.ssh/authorized_keys on the PVE host. Private key mirrored in
+#     Vault `secret/woodpecker/pve_ssh_key` for recovery.
+
+when:
+  - event: push  # automatic deploy: a push event that matches both filters below
+    branch: master
+    path: scripts/pve-nfs-exports  # only run when the exports file itself changed
+  - event: manual  # on-demand re-sync; no branch or path filter applies here
+
+clone:
+  git:
+    image: woodpeckerci/plugin-git  # NOTE(review): untagged image — consider pinning a version
+    settings:
+      depth: 1  # shallow clone; the deploy step only reads the checked-out exports file
+      attempts: 3  # presumably the clone retry count — confirm against the plugin-git docs
+
+steps:
+  # Copy scripts/pve-nfs-exports to the PVE host and reload the NFS export table.
+  - name: deploy
+    image: alpine:3.20
+    environment:
+      PVE_SSH_KEY:
+        from_secret: pve_ssh_key
+    commands:
+      - apk add --no-cache openssh-client curl
+      - mkdir -p ~/.ssh && chmod 700 ~/.ssh
+      - printf '%s\n' "$PVE_SSH_KEY" > ~/.ssh/id_ed25519
+      - chmod 600 ~/.ssh/id_ed25519
+      # Trust-on-first-use, not pinning: known_hosts is rebuilt from a live keyscan on every run.
+      - ssh-keyscan -t ed25519 192.168.1.127 >> ~/.ssh/known_hosts 2>/dev/null
+      # Best-effort diff so the log shows the intended change; a failed fetch must not block the deploy.
+      - echo '---diff---' && ssh -o BatchMode=yes root@192.168.1.127 "cat /etc/exports" > /tmp/remote.exports || true
+      - diff -u /tmp/remote.exports scripts/pve-nfs-exports || true
+      - echo '---applying---'
+      # Upload next to the target, then rename, so /etc/exports is never left half-written.
+      - scp -o BatchMode=yes scripts/pve-nfs-exports root@192.168.1.127:/etc/exports.new
+      - ssh -o BatchMode=yes root@192.168.1.127 "mv /etc/exports.new /etc/exports && exportfs -ra && exportfs -s | head -5"
+      - echo '---done---'
+
+  - name: slack
+    image: curlimages/curl:8.11.0
+    environment:
+      SLACK_WEBHOOK:
+        from_secret: slack_webhook
+    commands:
+      # The doubled dollar sign defers expansion to the step's shell instead of pipeline compile time.
+      - |
+        curl -s -X POST -H 'Content-type: application/json' "$SLACK_WEBHOOK" \
+          --data "{\"channel\":\"general\",\"text\":\"PVE /etc/exports sync: $${CI_PIPELINE_STATUS}\"}" || true
+    when:
+      status: [success, failure]

From f6cff262f04fe6f3f520c32ce0fe51f5d4251c10 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Sat, 18 Apr 2026 23:22:43 +0000
Subject: [PATCH 2/3] broker-sync: chown fidelity_storage_state to broker uid
 in init container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Context

First end-to-end test of the broker-sync-fidelity CronJob failed with
`PermissionError: [Errno 13] Permission denied:
'/data/fidelity_storage_state.json'`. Init container runs as root (uid
0) but the broker-sync container runs as uid 10001; chmod 600 without
chown made the file unreadable from the main container.

## This change

Added `chown 10001:10001` before the existing `chmod 600` in the
`stage-storage-state` init container command. Init container has
CAP_CHOWN by default as root, so this succeeds.

## Verification

$ kubectl apply -f test-pod.yaml   # same init + main pattern
$ kubectl logs fidelity-debug -c broker-sync
...
broker_sync.providers.fidelity_planviewer.FidelitySessionError:
    PlanViewer session stale — run `broker-sync fidelity-seed`

Init container succeeded + main container read the file + Playwright
launched Chromium + navigated to PlanViewer + hit the 15-min idle page
→ exactly the intended behaviour for a stale session. Next step
(out-of-band): Viktor pastes a fresh SMS OTP and re-seeds via
fidelity-seed on Viktor's laptop or the existing chat-driven flow.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 stacks/broker-sync/main.tf | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/stacks/broker-sync/main.tf b/stacks/broker-sync/main.tf
index fb5915f8..b3c71905 100644
--- a/stacks/broker-sync/main.tf
+++ b/stacks/broker-sync/main.tf
@@ -669,7 +669,9 @@ resource "kubernetes_cron_job_v1" "fidelity" {
           spec {
             restart_policy = "OnFailure"
             # Materialise the JSON storage_state from the projected Secret
-            # onto the PVC where Playwright expects to read it.
+            # onto the PVC where Playwright expects to read it. Init container
+            # runs as root; the main broker-sync container runs as uid 10001,
+            # so we chown+chmod 600 to grant read access to the broker user.
             init_container {
               name  = "stage-storage-state"
               image = "busybox:1.36"
@@ -677,6 +679,7 @@ resource "kubernetes_cron_job_v1" "fidelity" {
               set -eu
               mkdir -p /data
               cp /secrets/fidelity_storage_state /data/fidelity_storage_state.json
+              chown 10001:10001 /data/fidelity_storage_state.json
               chmod 600 /data/fidelity_storage_state.json
               EOT
               ]

From cc56ba29394a6724753c7165939870d9d7ebfff8 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <me@viktorbarzin.me>
Date: Sat, 18 Apr 2026 23:23:07 +0000
Subject: [PATCH 3/3] [payslip-ingest] Move Payslips datasource 'database' into
 jsonData
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Grafana 11.2+ Postgres plugin reads the DB name from jsonData.database
(see grafana/grafana#112418). The top-level 'database' field is silently
ignored by the frontend — datasource health checks and POST /api/ds/query
still work because the backend honors it, but every dashboard panel fails
with 'you do not have default database'.

Rolling back to the supported shape fixes rendering for all 4 uk-payslip
panels.
---
 stacks/payslip-ingest/main.tf | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/stacks/payslip-ingest/main.tf b/stacks/payslip-ingest/main.tf
index 0f13e036..7e4d0006 100644
--- a/stacks/payslip-ingest/main.tf
+++ b/stacks/payslip-ingest/main.tf
@@ -312,14 +312,18 @@ resource "kubernetes_config_map" "grafana_payslips_datasource" {
     "payslips-datasource.yaml" = yamlencode({
       apiVersion = 1
       datasources = [{
-        name     = "Payslips"
-        type     = "postgres"
-        access   = "proxy"
-        url      = "${var.postgresql_host}:5432"
-        database = "payslip_ingest"
-        user     = "payslip_ingest"
-        uid      = "payslips-pg"
+        name   = "Payslips"
+        type   = "postgres"
+        access = "proxy"
+        url    = "${var.postgresql_host}:5432"
+        user   = "payslip_ingest"
+        uid    = "payslips-pg"
+        # Grafana 11.2+ Postgres plugin reads the DB name from jsonData.database;
+        # the top-level `database` field is silently ignored by the frontend and
+        # triggers "you do not have default database" on every panel.
+        # See github.com/grafana/grafana#112418.
         jsonData = {
+          database        = "payslip_ingest"
           sslmode         = "disable"
           postgresVersion = 1600
           timescaledb     = false