From 344fce3692d6fc218d75519a510435b50f616fc1 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Wed, 22 Apr 2026 18:32:29 +0000
Subject: [PATCH] [monitoring][poison-fountain] pushgateway persistence +
 cronjob uid-0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two independent root-cause fixes surfaced by the 2026-04-22 cluster
health check:

1. Pushgateway lost all in-memory metrics when node3 kubelet hiccuped
   at 11:42 UTC, hiding
   backup_last_success_timestamp{job="offsite-backup-sync"} until the
   next 06:01 UTC push — a ~18h false-negative window. Enable
   persistence on a 2Gi proxmox-lvm-encrypted PVC with
   --persistence.interval=1m. Chart note: values key is
   `prometheus-pushgateway:` (subchart alias), not `pushgateway:`.

2. poison-fountain-fetcher CronJob runs curlimages/curl as UID 100
   but the NFS mount /srv/nfs/poison-fountain is root:root 755 and
   the main Deployment runs as root, so mkdir /data/cache fails
   every 6h. Set run_as_user=0 in the CronJob's pod-level
   security_context (no_root_squash is set on the export).

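Closes the backup_offsite_sync FAIL on the next 06:01 UTC offsite
sync; closes the recurring poison-fountain evicted-pod noise on the
next 00:00 UTC cron tick.

Also adds the goauthentik/authentik provider requirement (~> 2024.10)
to stacks/poison-fountain/providers.tf.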

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/architecture/backup-dr.md                | 10 +++++++++
 .../monitoring/prometheus_chart_values.tpl    | 21 +++++++++++++++++++
 stacks/poison-fountain/main.tf                |  7 +++++++
 stacks/poison-fountain/providers.tf           |  4 ++++
 4 files changed, 42 insertions(+)

diff --git a/docs/architecture/backup-dr.md b/docs/architecture/backup-dr.md
index eb1d11d7..5d4ebf64 100644
--- a/docs/architecture/backup-dr.md
+++ b/docs/architecture/backup-dr.md
@@ -692,6 +692,16 @@ module "nfs_backup" {
 - ~~CloudSync monitor~~: Removed (TrueNAS decommissioned)
 - Vaultwarden integrity: Pushes `vaultwarden_sqlite_integrity_ok` hourly
 
+**Pushgateway persistence**: The Pushgateway is configured with
+`--persistence.file=/data/pushgateway.bin --persistence.interval=1m`
+on a 2Gi `proxmox-lvm-encrypted` PVC (helm values:
+`prometheus-pushgateway.persistentVolume`). Without this, every pod
+restart drops in-memory metrics. Infrequent pushers (daily offsite-sync,
+weekly backup) are otherwise invisible until their next push if the
+Pushgateway restarts between pushes — which is exactly what triggered
+the 2026-04-22 backup_offsite_sync FAIL (node3 kubelet hiccup at
+11:42 UTC terminated the Pushgateway ~8.5h after the 03:12 UTC push).
+
 **Alert routing**:
 - All backup alerts → Slack `#infra-alerts`
 - Vaultwarden integrity fail → Slack `#infra-critical` (immediate action required)
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index ac28733e..777ec1df 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -160,6 +160,27 @@ prometheus-node-exporter:
       memory: 100Mi
     limits:
       memory: 100Mi
+# NOTE: The parent chart forwards subchart values under `prometheus-pushgateway:`,
+# not `pushgateway:` — using the wrong key silently no-ops.
+prometheus-pushgateway:
+  # Without persistence the pushgateway's in-memory metrics are lost on restart.
+  # Once-per-day pushers (offsite-backup-sync) stay invisible until their next run,
+  # which is why backup_last_success_timestamp{job="offsite-backup-sync"} vanished
+  # after the 2026-04-22 node3 kubelet hiccup.
+  persistentVolume:
+    enabled: true
+    size: 2Gi
+    storageClass: proxmox-lvm-encrypted
+    mountPath: /data  # --persistence.file below must point inside this mount
+  extraArgs:
+    - --persistence.file=/data/pushgateway.bin  # lives on the PVC mounted at /data
+    - --persistence.interval=1m  # minimum interval between persistence-file writes
+  resources:
+    requests:
+      cpu: 10m
+      memory: 64Mi
+    limits:
+      memory: 256Mi  # memory-only limit; no CPU limit is set
 server:
   # Enable me to delete metrics
   extraFlags:
diff --git a/stacks/poison-fountain/main.tf b/stacks/poison-fountain/main.tf
index 64e7ae21..639d504f 100644
--- a/stacks/poison-fountain/main.tf
+++ b/stacks/poison-fountain/main.tf
@@ -252,6 +252,13 @@ resource "kubernetes_cron_job_v1" "poison_fetcher" {
             name = "poison-fountain-fetcher"
           }
           spec {
+            security_context {
+              # curlimages/curl defaults to uid 100, but the NFS mount at /data is
+              # owned root:root 755 (populated by the main Deployment, which runs as
+              # root). Run the CronJob pod as root too so mkdir /data/cache succeeds;
+              # this relies on no_root_squash being set on the NFS export.
+              run_as_user = 0
+            }
             container {
               name    = "fetcher"
               image   = "curlimages/curl:latest"
diff --git a/stacks/poison-fountain/providers.tf b/stacks/poison-fountain/providers.tf
index b337a2e9..012af700 100644
--- a/stacks/poison-fountain/providers.tf
+++ b/stacks/poison-fountain/providers.tf
@@ -9,6 +9,10 @@ terraform {
       source  = "cloudflare/cloudflare"
       version = "~> 4"
     }
+    authentik = {
+      source  = "goauthentik/authentik"
+      version = "~> 2024.10"
+    }
   }
 }