From dd2b7de291c7b0e4753e0964427fb8d00750b04b Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 10 May 2026 21:48:29 +0000 Subject: [PATCH] fix: HA Sofia REST sensors + PVC drift safety MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two real issues found while triaging HomeAssistantCriticalSensorUnavailable alerts and the prometheus + technitium PVC Terminating-but-in-use state from the earlier session. 1. idrac-redfish-exporter + snmp-exporter ingresses: auth=required → auth=none. HA Sofia REST sensors scrape these endpoints programmatically; with Authentik forward-auth in front, every request got a 302 to authentik.viktorbarzin.me and the REST sensors parsed the HTML login page instead of metrics — leaving the R730, UPS, and ~20 other sensors permanently unavailable. The allow_local_access_only IP allowlist (192.168.0.0/16 + 10.0.0.0/8) already gates external access, so Authentik on top was breaking machine-to-machine traffic for no security gain. 2. prometheus_server_pvc + technitium primary_config_encrypted: add lifecycle.ignore_changes = [spec[0].resources[0].requests]. The autoresizer expands these PVCs; PVCs can't shrink. Without the ignore_changes, every TF apply tried to revert the live size back to the TF spec value, hit K8s's shrink-forbidden rule, and force-replaced the PVC. Because the pod still mounted it, the PVC went into Terminating-but-protected limbo — harmless only until the next pod restart, which would have orphaned the volume. Root cause of the 2026-05-10 PVC Terminating incident. Bonus: prometheus_server_pvc threshold was the inverted "90%" (the same bug the bulk fecfa211 sweep fixed elsewhere; my regex only matched "80%" so this one slipped through). Now "10%", i.e. expand when 90% used. 
Co-Authored-By: Claude Opus 4.7 --- stacks/monitoring/modules/monitoring/idrac.tf | 10 ++++++++-- .../monitoring/modules/monitoring/prometheus.tf | 16 +++++++++++++++- .../modules/monitoring/snmp_exporter.tf | 8 ++++++-- stacks/technitium/modules/technitium/main.tf | 7 +++++++ 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/stacks/monitoring/modules/monitoring/idrac.tf b/stacks/monitoring/modules/monitoring/idrac.tf index bff50ec2..76149a66 100644 --- a/stacks/monitoring/modules/monitoring/idrac.tf +++ b/stacks/monitoring/modules/monitoring/idrac.tf @@ -124,8 +124,14 @@ resource "kubernetes_service" "idrac-redfish-exporter" { } module "idrac-redfish-exporter-ingress" { - source = "../../../../modules/kubernetes/ingress_factory" - auth = "required" + source = "../../../../modules/kubernetes/ingress_factory" + # Auth disabled: HA Sofia + Prometheus scrape this endpoint + # programmatically (no browser, no SSO cookie). The + # allow_local_access_only middleware (192.168.0.0/16 + 10.0.0.0/8) + # already gates external access, so layering Authentik on top only + # breaks the REST sensor in HA Sofia (it gets a 302 to authentik.viktorbarzin.me + # and parses HTML instead of metrics). + auth = "none" namespace = kubernetes_namespace.monitoring.metadata[0].name name = "idrac-redfish-exporter" root_domain = "viktorbarzin.lan" diff --git a/stacks/monitoring/modules/monitoring/prometheus.tf b/stacks/monitoring/modules/monitoring/prometheus.tf index c317775e..75b931df 100644 --- a/stacks/monitoring/modules/monitoring/prometheus.tf +++ b/stacks/monitoring/modules/monitoring/prometheus.tf @@ -5,7 +5,11 @@ resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" { name = "prometheus-data-proxmox" namespace = kubernetes_namespace.monitoring.metadata[0].name annotations = { - "resize.topolvm.io/threshold" = "90%" + # threshold = free-space % below which autoresizer expands. + # 10% means "expand when 90% used" (the conventional knob). 
+ # WAS 90% — that's "expand when 10% used", which would + # autoresize this volume from 200Gi → 500Gi in ~10 cycles (10% of current size each). + "resize.topolvm.io/threshold" = "10%" "resize.topolvm.io/increase" = "10%" "resize.topolvm.io/storage_limit" = "500Gi" } @@ -20,6 +24,16 @@ resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" { } } } + lifecycle { + # The autoresizer expands requests.storage up to storage_limit and + # PVCs can't shrink. Without this ignore_changes, every TF apply + # tries to revert the live size back to 200Gi, hits the + # K8s shrink-forbidden rule, and forces a destroy+recreate that + # leaves the PVC stuck in Terminating until the pod releases it. + # (Root cause of the prometheus-data-proxmox + technitium-primary-config-encrypted + # Terminating-but-in-use incident on 2026-05-10.) + ignore_changes = [spec[0].resources[0].requests] + } } module "nfs_prometheus_backup_host" { diff --git a/stacks/monitoring/modules/monitoring/snmp_exporter.tf b/stacks/monitoring/modules/monitoring/snmp_exporter.tf index e38fab1b..d6297f7c 100644 --- a/stacks/monitoring/modules/monitoring/snmp_exporter.tf +++ b/stacks/monitoring/modules/monitoring/snmp_exporter.tf @@ -123,8 +123,12 @@ resource "kubernetes_service" "snmp-exporter" { } module "snmp-exporter-ingress" { - source = "../../../../modules/kubernetes/ingress_factory" - auth = "required" + source = "../../../../modules/kubernetes/ingress_factory" + # Auth disabled — same rationale as idrac-redfish-exporter-ingress: + # HA Sofia REST sensors scrape the /snmp endpoint programmatically and + # can't follow the Authentik OIDC flow. The local-only IP allowlist + # already gates external access. 
+ auth = "none" namespace = kubernetes_namespace.monitoring.metadata[0].name name = "snmp-exporter" root_domain = "viktorbarzin.lan" diff --git a/stacks/technitium/modules/technitium/main.tf b/stacks/technitium/modules/technitium/main.tf index fc7fd30e..416b6520 100644 --- a/stacks/technitium/modules/technitium/main.tf +++ b/stacks/technitium/modules/technitium/main.tf @@ -116,6 +116,13 @@ resource "kubernetes_persistent_volume_claim" "primary_config_encrypted" { } } } + lifecycle { + # Autoresizer expands; PVCs can't shrink. Without this, TF apply + # plans destroy+recreate which leaves the PVC in Terminating while + # the technitium primary pod still uses it. See incident on + # 2026-05-10 (both prometheus-data-proxmox + this PVC). + ignore_changes = [spec[0].resources[0].requests] + } } resource "kubernetes_deployment" "technitium" {