Two real issues found while triaging HomeAssistantCriticalSensorUnavailable
alerts and the prometheus + technitium PVC Terminating-but-in-use
state from the earlier session.
1. idrac-redfish-exporter + snmp-exporter ingresses: auth=required →
auth=none. HA Sofia REST sensors scrape these endpoints
programmatically; with Authentik forward-auth in front, every
request got a 302 to authentik.viktorbarzin.me and the REST
sensors parsed the HTML login page instead of metrics — leaving
the R730, UPS, and ~20 other sensors permanently unavailable.
The allow_local_access_only IP allowlist (192.168.0.0/16 +
10.0.0.0/8) already gates external access, so authentik on top
was breaking machine-to-machine traffic for no security gain.
2. prometheus_server_pvc + technitium primary_config_encrypted:
add lifecycle.ignore_changes = [spec[0].resources[0].requests].
The autoresizer expands these PVCs; PVCs can't shrink. Without
the ignore, every TF apply tried to revert the live size back
to the TF spec value, hit K8s's shrink-forbidden rule, and
force-replaced the PVC. Because the pod still mounted it, the
PVC sat in Terminating-but-protected limbo — harmless while the
pod kept running, but the next pod restart would have orphaned
the volume. This was the root cause of the 2026-05-10 PVC
Terminating incident.
Bonus: prometheus_server_pvc threshold was the inverted "90%" (the
same bug the bulk fecfa211 sweep fixed elsewhere; my regex only
matched "80%" so this one slipped through). Now "10%".
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
64 lines
2.5 KiB
HCL
64 lines
2.5 KiB
HCL
|
|
|
|
# PVC "prometheus-data-proxmox" in the monitoring namespace, provisioned from the
# "proxmox-lvm" storage class. The resize.topolvm.io/* annotations hand size
# management to the PVC autoresizer.
# NOTE(review): presumably mounted by the Prometheus server through the chart
# values template — confirm there.
resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "prometheus-data-proxmox"

    annotations = {
      # Upper bound the autoresizer may grow the claim to.
      "resize.topolvm.io/storage_limit" = "500Gi"

      # Amount added on each expansion.
      "resize.topolvm.io/increase" = "10%"

      # threshold is the FREE-space percentage below which the autoresizer
      # expands the volume, so "10%" means "expand once 90% is used".
      # This used to be "90%", i.e. "expand once 10% is used", which would
      # have kept growing the volume from 200Gi toward the 500Gi limit.
      "resize.topolvm.io/threshold" = "10%"
    }
  }

  spec {
    storage_class_name = "proxmox-lvm"
    access_modes       = ["ReadWriteOnce"]

    resources {
      requests = {
        # Initial size only; the live value is owned by the autoresizer
        # (see ignore_changes in the lifecycle block of this resource).
        storage = "200Gi"
      }
    }
  }

  lifecycle {
    # The autoresizer raises requests.storage (up to storage_limit) and a PVC
    # can never shrink. If Terraform tracked this field, every apply would try
    # to put the size back to 200Gi, be rejected by the Kubernetes
    # shrink-forbidden rule, and fall back to destroy+recreate — leaving the
    # PVC stuck in Terminating until the pod lets go of it.
    # (Root cause of the prometheus-data-proxmox + technitium-primary-config-encrypted
    # Terminating-but-in-use incident on 2026-05-10.)
    ignore_changes = [spec[0].resources[0].requests]
  }
}
|
|
|
|
# Instantiates the shared nfs_volume module for the export at
# 192.168.1.127:/srv/nfs/prometheus-backup, inside the monitoring namespace.
# NOTE(review): judging by the name this is the Prometheus backup target;
# what the module actually creates is defined in the module source, not here.
module "nfs_prometheus_backup_host" {
  source = "../../../../modules/kubernetes/nfs_volume"

  namespace = kubernetes_namespace.monitoring.metadata[0].name
  name      = "monitoring-prometheus-backup-host"

  nfs_path   = "/srv/nfs/prometheus-backup"
  nfs_server = "192.168.1.127"
}
|
|
|
|
# Installs the prometheus-community "prometheus" chart as release "prometheus"
# in the monitoring namespace. Chart values are rendered from
# prometheus_chart_values.tpl with the credentials passed in below.
resource "helm_release" "prometheus" {
  name             = "prometheus"
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true

  chart      = "prometheus"
  repository = "https://prometheus-community.github.io/helm-charts"
  version    = "25.8.2"
  # version = "15.0.2"

  # 900 s = 15 min — Recreate strategy + iSCSI reattach is slow.
  timeout = 900

  # force_update was turned off on 2026-04-23: it made Helm try to replace the
  # bound pushgateway PVC (added in rev 188, see commit e51c104), which is
  # immutable. Re-enable it only temporarily, when a StatefulSet
  # volumeClaimTemplate change needs --force.
  force_update = false

  values = [
    templatefile("${path.module}/prometheus_chart_values.tpl", {
      alertmanager_mail_pass     = var.alertmanager_account_password
      alertmanager_slack_api_url = var.alertmanager_slack_api_url
      tuya_api_key               = var.tiny_tuya_service_secret
      haos_api_token             = var.haos_api_token
    })
  ]
}
|