fix: HA Sofia REST sensors + PVC drift safety
Two real issues found while triaging HomeAssistantCriticalSensorUnavailable alerts and the prometheus + technitium PVC Terminating-but-in-use state from the earlier session.

1. idrac-redfish-exporter + snmp-exporter ingresses: auth=required → auth=none. HA Sofia REST sensors scrape these endpoints programmatically; with Authentik forward-auth in front, every request got a 302 to authentik.viktorbarzin.me and the REST sensors parsed the HTML login page instead of metrics — leaving the R730, UPS, and ~20 other sensors permanently unavailable. The allow_local_access_only IP allowlist (192.168.0.0/16 + 10.0.0.0/8) already gates external access, so Authentik on top was breaking machine-to-machine traffic for no security gain.

2. prometheus_server_pvc + technitium primary_config_encrypted: add lifecycle.ignore_changes = [spec[0].resources[0].requests]. The autoresizer expands these PVCs; PVCs can't shrink. Without the ignore, every TF apply tried to revert the live size back to the TF spec value, hit K8s's shrink-forbidden rule, and force-replaced the PVC. Because the pod still mounted it, the PVC went into Terminating-but-protected limbo — harmless until the next pod restart, which would have orphaned the volume. Root cause of the 2026-05-10 PVC Terminating incident.

Bonus: the prometheus_server_pvc threshold was the inverted "90%" (the same bug the bulk fecfa211 sweep fixed elsewhere; my regex only matched "80%", so this one slipped through). It is now "10%".

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
7e69951cb9
commit
dd2b7de291
4 changed files with 36 additions and 5 deletions
|
|
@ -124,8 +124,14 @@ resource "kubernetes_service" "idrac-redfish-exporter" {
|
|||
}
|
||||
|
||||
module "idrac-redfish-exporter-ingress" {
|
||||
source = "../../../../modules/kubernetes/ingress_factory"
|
||||
auth = "required"
|
||||
source = "../../../../modules/kubernetes/ingress_factory"
|
||||
# Auth disabled: HA Sofia + Prometheus scrape this endpoint
|
||||
# programmatically (no browser, no SSO cookie). The
|
||||
# allow_local_access_only middleware (192.168.0.0/16 + 10.0.0.0/8)
|
||||
# already gates external access, so layering Authentik on top only
|
||||
# breaks the REST sensor in HA Sofia (it gets a 302 to authentik.viktorbarzin.me
|
||||
# and parses HTML instead of metrics).
|
||||
auth = "none"
|
||||
namespace = kubernetes_namespace.monitoring.metadata[0].name
|
||||
name = "idrac-redfish-exporter"
|
||||
root_domain = "viktorbarzin.lan"
|
||||
|
|
|
|||
|
|
@ -5,7 +5,11 @@ resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" {
|
|||
name = "prometheus-data-proxmox"
|
||||
namespace = kubernetes_namespace.monitoring.metadata[0].name
|
||||
annotations = {
|
||||
"resize.topolvm.io/threshold" = "90%"
|
||||
# threshold = free-space % below which autoresizer expands.
|
||||
# 10% means "expand when 90% used" (the conventional knob).
|
||||
# WAS 90% — that's "expand when 10% used", which would
|
||||
# autoresize this volume from 200Gi → 500Gi in ~10 cycles (10% compounding per cycle).
|
||||
"resize.topolvm.io/threshold" = "10%"
|
||||
"resize.topolvm.io/increase" = "10%"
|
||||
"resize.topolvm.io/storage_limit" = "500Gi"
|
||||
}
|
||||
|
|
@ -20,6 +24,16 @@ resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" {
|
|||
}
|
||||
}
|
||||
}
|
||||
lifecycle {
|
||||
# The autoresizer expands requests.storage up to storage_limit and
|
||||
# PVCs can't shrink. Without this ignore_changes, every TF apply
|
||||
# tries to revert the live size back to 200Gi, hits the
|
||||
# K8s shrink-forbidden rule, and forces a destroy+recreate that
|
||||
# leaves the PVC stuck in Terminating until the pod releases it.
|
||||
# (Root cause of the prometheus-data-proxmox + technitium-primary-config-encrypted
|
||||
# Terminating-but-in-use incident on 2026-05-10.)
|
||||
ignore_changes = [spec[0].resources[0].requests]
|
||||
}
|
||||
}
|
||||
|
||||
module "nfs_prometheus_backup_host" {
|
||||
|
|
|
|||
|
|
@ -123,8 +123,12 @@ resource "kubernetes_service" "snmp-exporter" {
|
|||
}
|
||||
|
||||
module "snmp-exporter-ingress" {
|
||||
source = "../../../../modules/kubernetes/ingress_factory"
|
||||
auth = "required"
|
||||
source = "../../../../modules/kubernetes/ingress_factory"
|
||||
# Auth disabled — same rationale as idrac-redfish-exporter-ingress:
|
||||
# HA Sofia REST sensors scrape /snmp endpoint programmatically and
|
||||
# can't follow the Authentik OIDC flow. local-only IP allowlist
|
||||
# already gates external access.
|
||||
auth = "none"
|
||||
namespace = kubernetes_namespace.monitoring.metadata[0].name
|
||||
name = "snmp-exporter"
|
||||
root_domain = "viktorbarzin.lan"
|
||||
|
|
|
|||
|
|
@ -116,6 +116,13 @@ resource "kubernetes_persistent_volume_claim" "primary_config_encrypted" {
|
|||
}
|
||||
}
|
||||
}
|
||||
lifecycle {
|
||||
# Autoresizer expands; PVCs can't shrink. Without this, TF apply
|
||||
# plans destroy+recreate which leaves the PVC in Terminating while
|
||||
# the technitium primary pod still uses it. See incident on
|
||||
# 2026-05-10 (both prometheus-data-proxmox + this PVC).
|
||||
ignore_changes = [spec[0].resources[0].requests]
|
||||
}
|
||||
}
|
||||
|
||||
resource "kubernetes_deployment" "technitium" {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue