From 344fce3692d6fc218d75519a510435b50f616fc1 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Wed, 22 Apr 2026 18:32:29 +0000 Subject: [PATCH] [monitoring][poison-fountain] pushgateway persistence + cronjob uid-0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two independent root-cause fixes surfaced by the 2026-04-22 cluster health check: 1. Pushgateway lost all in-memory metrics when the node3 kubelet hiccuped at 11:42 UTC, hiding backup_last_success_timestamp{job="offsite-backup-sync"} until the next 06:01 UTC push — a ~18h false-negative window. Enable persistence on a 2Gi proxmox-lvm-encrypted PVC with --persistence.interval=1m. Chart note: values key is `prometheus-pushgateway:` (subchart alias), not `pushgateway:`. 2. poison-fountain-fetcher CronJob runs curlimages/curl as UID 100 but the NFS mount /srv/nfs/poison-fountain is root:root 755 and the main Deployment runs as root, so mkdir /data/cache fails every 6h. Set run_as_user=0 in the CronJob's pod-level security_context (no_root_squash is set on the export). Closes the backup_offsite_sync FAIL on the next 06:01 UTC offsite sync; closes the recurring poison-fountain evicted-pod noise on the next 00:00 UTC cron tick. Also declares the goauthentik/authentik provider (~> 2024.10) in stacks/poison-fountain/providers.tf. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/architecture/backup-dr.md | 10 +++++++++ .../monitoring/prometheus_chart_values.tpl | 21 +++++++++++++++++++ stacks/poison-fountain/main.tf | 7 +++++++ stacks/poison-fountain/providers.tf | 4 ++++ 4 files changed, 42 insertions(+) diff --git a/docs/architecture/backup-dr.md b/docs/architecture/backup-dr.md index eb1d11d7..5d4ebf64 100644 --- a/docs/architecture/backup-dr.md +++ b/docs/architecture/backup-dr.md @@ -692,6 +692,16 @@ module "nfs_backup" { - ~~CloudSync monitor~~: Removed (TrueNAS decommissioned) - Vaultwarden integrity: Pushes `vaultwarden_sqlite_integrity_ok` hourly +**Pushgateway persistence**: The Pushgateway is configured with +`--persistence.file=/data/pushgateway.bin --persistence.interval=1m` +on a 2Gi `proxmox-lvm-encrypted` PVC (helm values: +`prometheus-pushgateway.persistentVolume`). Without this, every pod +restart drops in-memory metrics. Once-per-day pushers (offsite-sync, +weekly backup) are otherwise invisible for up to 24h if the +Pushgateway restarts between pushes — which is exactly what triggered +the 2026-04-22 backup_offsite_sync FAIL (node3 kubelet hiccup at +11:42 UTC terminated the Pushgateway 8h after the 03:12 UTC push). + **Alert routing**: - All backup alerts → Slack `#infra-alerts` - Vaultwarden integrity fail → Slack `#infra-critical` (immediate action required) diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index ac28733e..777ec1df 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -160,6 +160,27 @@ prometheus-node-exporter: memory: 100Mi limits: memory: 100Mi +# NOTE: The parent chart forwards subchart values under `prometheus-pushgateway:`, +# not `pushgateway:` — using the wrong key silently no-ops. 
+prometheus-pushgateway: + # Without persistence the pushgateway's in-memory metrics are lost on restart. + # Once-per-day pushers (offsite-backup-sync) stay invisible until their next run, + # which is why backup_last_success_timestamp{job="offsite-backup-sync"} vanished + # after the 2026-04-22 node3 kubelet hiccup. + persistentVolume: + enabled: true + size: 2Gi + storageClass: proxmox-lvm-encrypted + mountPath: /data + extraArgs: + - --persistence.file=/data/pushgateway.bin + - --persistence.interval=1m + resources: + requests: + cpu: 10m + memory: 64Mi + limits: + memory: 256Mi server: # Enable me to delete metrics extraFlags: diff --git a/stacks/poison-fountain/main.tf b/stacks/poison-fountain/main.tf index 64e7ae21..639d504f 100644 --- a/stacks/poison-fountain/main.tf +++ b/stacks/poison-fountain/main.tf @@ -252,6 +252,13 @@ resource "kubernetes_cron_job_v1" "poison_fetcher" { name = "poison-fountain-fetcher" } spec { + security_context { + # curlimages/curl defaults to uid 100, but the NFS mount at /data is + # owned root:root 755 (writes from the main Deployment which runs as + # root). Align the CronJob with the Deployment so mkdir /data/cache + # succeeds. no_root_squash is set on the /srv/nfs export. + run_as_user = 0 + } container { name = "fetcher" image = "curlimages/curl:latest" diff --git a/stacks/poison-fountain/providers.tf b/stacks/poison-fountain/providers.tf index b337a2e9..012af700 100644 --- a/stacks/poison-fountain/providers.tf +++ b/stacks/poison-fountain/providers.tf @@ -9,6 +9,10 @@ terraform { source = "cloudflare/cloudflare" version = "~> 4" } + authentik = { + source = "goauthentik/authentik" + version = "~> 2024.10" + } } }