Two independent root-cause fixes surfaced by the 2026-04-22 cluster
health check:
1. Pushgateway lost all in-memory metrics when node3 kubelet hiccuped
at 11:42 UTC, hiding backup_last_success_timestamp{job="offsite-
backup-sync"} until the next 06:01 UTC push — a ~18h false-negative
window. Enable persistence on a 2Gi proxmox-lvm-encrypted PVC with
--persistence.interval=1m. Chart note: values key is
`prometheus-pushgateway:` (subchart alias), not `pushgateway:`.
2. poison-fountain-fetcher CronJob runs curlimages/curl as UID 100
but the NFS mount /srv/nfs/poison-fountain is root:root 755 and
the main Deployment runs as root, so mkdir /data/cache fails
every 6h. Set run_as_user=0 on the CronJob container (no_root_squash
is set on the export).
Closes the backup_offsite_sync FAIL on the next 06:01 UTC offsite
sync; closes the recurring poison-fountain evicted-pod noise on the
next 00:00 UTC cron tick.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
37 lines
677 B
HCL
37 lines
677 B
HCL
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
|
|
terraform {
|
|
required_providers {
|
|
vault = {
|
|
source = "hashicorp/vault"
|
|
version = "~> 4.0"
|
|
}
|
|
cloudflare = {
|
|
source = "cloudflare/cloudflare"
|
|
version = "~> 4"
|
|
}
|
|
authentik = {
|
|
source = "goauthentik/authentik"
|
|
version = "~> 2024.10"
|
|
}
|
|
}
|
|
}
|
|
|
|
variable "kube_config_path" {
|
|
type = string
|
|
default = "~/.kube/config"
|
|
}
|
|
|
|
provider "kubernetes" {
|
|
config_path = var.kube_config_path
|
|
}
|
|
|
|
provider "helm" {
|
|
kubernetes = {
|
|
config_path = var.kube_config_path
|
|
}
|
|
}
|
|
|
|
provider "vault" {
|
|
address = "https://vault.viktorbarzin.me"
|
|
skip_child_token = true
|
|
}
|