diff --git a/stacks/monitoring/modules/monitoring/idrac.tf b/stacks/monitoring/modules/monitoring/idrac.tf index bff50ec2..76149a66 100644 --- a/stacks/monitoring/modules/monitoring/idrac.tf +++ b/stacks/monitoring/modules/monitoring/idrac.tf @@ -124,8 +124,14 @@ resource "kubernetes_service" "idrac-redfish-exporter" { } module "idrac-redfish-exporter-ingress" { - source = "../../../../modules/kubernetes/ingress_factory" - auth = "required" + source = "../../../../modules/kubernetes/ingress_factory" + # Auth disabled: HA Sofia + Prometheus scrape this endpoint + # programmatically (no browser, no SSO cookie). The + # allow_local_access_only middleware (192.168.0.0/16 + 10.0.0.0/8) + # already gates external access, so layering Authentik on top only + # breaks the REST sensor in HA Sofia (it gets a 302 to authentik.viktorbarzin.me + # and parses HTML instead of metrics). + auth = "none" namespace = kubernetes_namespace.monitoring.metadata[0].name name = "idrac-redfish-exporter" root_domain = "viktorbarzin.lan" diff --git a/stacks/monitoring/modules/monitoring/prometheus.tf b/stacks/monitoring/modules/monitoring/prometheus.tf index c317775e..75b931df 100644 --- a/stacks/monitoring/modules/monitoring/prometheus.tf +++ b/stacks/monitoring/modules/monitoring/prometheus.tf @@ -5,7 +5,11 @@ resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" { name = "prometheus-data-proxmox" namespace = kubernetes_namespace.monitoring.metadata[0].name annotations = { - "resize.topolvm.io/threshold" = "90%" + # threshold = free-space % below which autoresizer expands. + # 10% means "expand when 90% used" (the conventional knob). + # WAS 90% — that's "expand when 10% used", which would + # autoresize this volume from 200Gi → 500Gi in ~10 cycles (10% compounding). 
+ "resize.topolvm.io/threshold" = "10%" "resize.topolvm.io/increase" = "10%" "resize.topolvm.io/storage_limit" = "500Gi" } @@ -20,6 +24,16 @@ resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" { } } } + lifecycle { + # The autoresizer expands requests.storage up to storage_limit and + # PVCs can't shrink. Without this ignore_changes, every TF apply + # tries to revert the live size back to 200Gi, hits the + # K8s shrink-forbidden rule, and forces a destroy+recreate that + # leaves the PVC stuck in Terminating until the pod releases it. + # (Root cause of the prometheus-data-proxmox + technitium-primary-config-encrypted + # Terminating-but-in-use incident on 2026-05-10.) + ignore_changes = [spec[0].resources[0].requests] + } } module "nfs_prometheus_backup_host" { diff --git a/stacks/monitoring/modules/monitoring/snmp_exporter.tf b/stacks/monitoring/modules/monitoring/snmp_exporter.tf index e38fab1b..d6297f7c 100644 --- a/stacks/monitoring/modules/monitoring/snmp_exporter.tf +++ b/stacks/monitoring/modules/monitoring/snmp_exporter.tf @@ -123,8 +123,12 @@ resource "kubernetes_service" "snmp-exporter" { } module "snmp-exporter-ingress" { - source = "../../../../modules/kubernetes/ingress_factory" - auth = "required" + source = "../../../../modules/kubernetes/ingress_factory" + # Auth disabled — same rationale as idrac-redfish-exporter-ingress: + # HA Sofia REST sensors scrape /snmp endpoint programmatically and + # can't follow the Authentik OIDC flow. local-only IP allowlist + # already gates external access. 
+ auth = "none" namespace = kubernetes_namespace.monitoring.metadata[0].name name = "snmp-exporter" root_domain = "viktorbarzin.lan" diff --git a/stacks/technitium/modules/technitium/main.tf b/stacks/technitium/modules/technitium/main.tf index fc7fd30e..416b6520 100644 --- a/stacks/technitium/modules/technitium/main.tf +++ b/stacks/technitium/modules/technitium/main.tf @@ -116,6 +116,13 @@ resource "kubernetes_persistent_volume_claim" "primary_config_encrypted" { } } } + lifecycle { + # Autoresizer expands; PVCs can't shrink. Without this, TF apply + # plans destroy+recreate which leaves the PVC in Terminating while + # the technitium primary pod still uses it. See incident on + # 2026-05-10 (both prometheus-data-proxmox + this PVC). + ignore_changes = [spec[0].resources[0].requests] + } } resource "kubernetes_deployment" "technitium" {