diff --git a/docs/architecture/backup-dr.md b/docs/architecture/backup-dr.md
index eb1d11d7..5d4ebf64 100644
--- a/docs/architecture/backup-dr.md
+++ b/docs/architecture/backup-dr.md
@@ -692,6 +692,16 @@ module "nfs_backup" {
 - ~~CloudSync monitor~~: Removed (TrueNAS decommissioned)
 - Vaultwarden integrity: Pushes `vaultwarden_sqlite_integrity_ok` hourly
 
+**Pushgateway persistence**: The Pushgateway is configured with
+`--persistence.file=/data/pushgateway.bin --persistence.interval=1m`
+on a 2Gi `proxmox-lvm-encrypted` PVC (helm values:
+`prometheus-pushgateway.persistentVolume`). Without this, every pod
+restart drops in-memory metrics. Infrequent pushers (daily offsite-sync,
+weekly backup) are otherwise invisible until their next push (up to 24h
+or 7 days) if the Pushgateway restarts in between — exactly what caused
+the 2026-04-22 backup_offsite_sync FAIL (node3 kubelet hiccup at
+11:42 UTC terminated the Pushgateway ~8.5h after the 03:12 UTC push).
+
 **Alert routing**:
 - All backup alerts → Slack `#infra-alerts`
 - Vaultwarden integrity fail → Slack `#infra-critical` (immediate action required)
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index ac28733e..777ec1df 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -160,6 +160,27 @@ prometheus-node-exporter:
       memory: 100Mi
     limits:
       memory: 100Mi
+# NOTE: subchart values must live under `prometheus-pushgateway:`. A bare
+# `pushgateway:` key is the wrong key for the parent chart and silently no-ops.
+prometheus-pushgateway:
+  resources:
+    limits:
+      memory: 256Mi
+    requests:
+      cpu: 10m
+      memory: 64Mi
+  # Pushed metrics live only in memory unless persisted, so a pod restart wipes
+  # them. Jobs that push once a day (offsite-backup-sync) then stay absent until
+  # their next run; that is how backup_last_success_timestamp{job="offsite-backup-sync"}
+  # disappeared after the node3 kubelet hiccup on 2026-04-22.
+  extraArgs:
+    - --persistence.interval=1m
+    - --persistence.file=/data/pushgateway.bin
+  persistentVolume:
+    mountPath: /data
+    storageClass: proxmox-lvm-encrypted
+    size: 2Gi
+    enabled: true
 server:
   # Enable me to delete metrics
   extraFlags:
diff --git a/stacks/poison-fountain/main.tf b/stacks/poison-fountain/main.tf
index 64e7ae21..639d504f 100644
--- a/stacks/poison-fountain/main.tf
+++ b/stacks/poison-fountain/main.tf
@@ -252,6 +252,13 @@ resource "kubernetes_cron_job_v1" "poison_fetcher" {
             name = "poison-fountain-fetcher"
           }
           spec {
+            security_context {
+              # curlimages/curl defaults to uid 100, but the NFS mount at /data is
+              # owned root:root 755 (it is written by the main Deployment, which
+              # runs as root). Run the CronJob as root too so mkdir /data/cache
+              # succeeds. no_root_squash is set on the /srv/nfs export.
+              run_as_user = 0
+            }
             container {
               name    = "fetcher"
               image   = "curlimages/curl:latest"
diff --git a/stacks/poison-fountain/providers.tf b/stacks/poison-fountain/providers.tf
index b337a2e9..012af700 100644
--- a/stacks/poison-fountain/providers.tf
+++ b/stacks/poison-fountain/providers.tf
@@ -9,6 +9,10 @@ terraform {
       source  = "cloudflare/cloudflare"
       version = "~> 4"
     }
+    authentik = {
+      source  = "goauthentik/authentik"
+      version = "~> 2024.10"
+    }
   }
 }