diff --git a/docs/architecture/backup-dr.md b/docs/architecture/backup-dr.md index eb1d11d7..5d4ebf64 100644 --- a/docs/architecture/backup-dr.md +++ b/docs/architecture/backup-dr.md @@ -692,6 +692,16 @@ module "nfs_backup" { - ~~CloudSync monitor~~: Removed (TrueNAS decommissioned) - Vaultwarden integrity: Pushes `vaultwarden_sqlite_integrity_ok` hourly +**Pushgateway persistence**: The Pushgateway is configured with +`--persistence.file=/data/pushgateway.bin --persistence.interval=1m` +on a 2Gi `proxmox-lvm-encrypted` PVC (helm values: +`prometheus-pushgateway.persistentVolume`). Without this, every pod +restart drops in-memory metrics. Infrequent pushers (daily offsite-sync, +weekly backup) otherwise stay invisible until their next run (up to 24h or 7d) if the +Pushgateway restarts between pushes — which is exactly what triggered +the 2026-04-22 backup_offsite_sync FAIL (node3 kubelet hiccup at +11:42 UTC terminated the Pushgateway 8.5h after the 03:12 UTC push). + **Alert routing**: - All backup alerts → Slack `#infra-alerts` - Vaultwarden integrity fail → Slack `#infra-critical` (immediate action required) diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index ac28733e..777ec1df 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -160,6 +160,27 @@ prometheus-node-exporter: memory: 100Mi limits: memory: 100Mi +# NOTE: The parent chart forwards subchart values under `prometheus-pushgateway:`, +# not `pushgateway:` — using the wrong key silently no-ops. +prometheus-pushgateway: + # Without persistence the pushgateway's in-memory metrics are lost on restart. + # Once-per-day pushers (offsite-backup-sync) stay invisible until their next run, + # which is why backup_last_success_timestamp{job="offsite-backup-sync"} vanished + # after the 2026-04-22 node3 kubelet hiccup. 
+ persistentVolume: + enabled: true + size: 2Gi + storageClass: proxmox-lvm-encrypted + mountPath: /data + extraArgs: + - --persistence.file=/data/pushgateway.bin + - --persistence.interval=1m + resources: + requests: + cpu: 10m + memory: 64Mi + limits: + memory: 256Mi server: # Enable me to delete metrics extraFlags: diff --git a/stacks/poison-fountain/main.tf b/stacks/poison-fountain/main.tf index 64e7ae21..639d504f 100644 --- a/stacks/poison-fountain/main.tf +++ b/stacks/poison-fountain/main.tf @@ -252,6 +252,13 @@ resource "kubernetes_cron_job_v1" "poison_fetcher" { name = "poison-fountain-fetcher" } spec { + security_context { + # curlimages/curl defaults to uid 100, but the NFS mount at /data is + # owned root:root 755 (it is written by the main Deployment, which runs as + # root). Run the CronJob as root too so mkdir /data/cache succeeds; this + # relies on no_root_squash being set on the /srv/nfs export. + run_as_user = 0 + } container { name = "fetcher" image = "curlimages/curl:latest" diff --git a/stacks/poison-fountain/providers.tf b/stacks/poison-fountain/providers.tf index b337a2e9..012af700 100644 --- a/stacks/poison-fountain/providers.tf +++ b/stacks/poison-fountain/providers.tf @@ -9,6 +9,10 @@ terraform { source = "cloudflare/cloudflare" version = "~> 4" } + authentik = { + source = "goauthentik/authentik" + version = "~> 2024.10" + } } }