fix: restore pvc-autoresizer by allow-listing kubelet_volume_stats_available_bytes

The Prometheus scrape config for the kubernetes-nodes job kept
kubelet_volume_stats_capacity_bytes and kubelet_volume_stats_used_bytes
but dropped kubelet_volume_stats_available_bytes. pvc-autoresizer
computes utilization from available/capacity, so without that metric it
silently did nothing for every PVC in the cluster — including mailserver,
which filled to 89% (1.7G/2.0G) and started rejecting all inbound mail with
'452 4.3.1 Insufficient system storage' (15+ hours, all real senders:
Brevo, Gmail, Facebook).

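Also bumps the floors of the mailserver (2Gi -> 5Gi, storage_limit
5Gi -> 10Gi) and forgejo (15Gi -> 30Gi) PVCs to recover from the immediate
outage, and adds ignore_changes on spec.resources.requests so future
autoresizer expansions don't cause TF drift.

Note: ignore_changes also suppresses the floor bump on an already-existing
PVC, so the new sizes have to be applied before the lifecycle block takes
effect (or the PVCs patched directly).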
This commit is contained in:
Viktor Barzin 2026-05-09 10:49:17 +00:00
parent 352586f711
commit 9d5da4d8e0
3 changed files with 19 additions and 4 deletions

View file

@ -40,10 +40,16 @@ resource "kubernetes_persistent_volume_claim" "data_encrypted" {
storage_class_name = "proxmox-lvm-encrypted"
resources {
requests = {
storage = "15Gi"
storage = "30Gi"
}
}
}
lifecycle {
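# pvc-autoresizer expands this PVC up to storage_limit, so ignore drift on
# the whole requests map (in practice: requests.storage). While this block
# is present, edits to `storage` above are NOT applied to an existing PVC.
# To bump the floor manually: temporarily remove this block, apply the new
# size, re-add the block, apply again.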
ignore_changes = [spec[0].resources[0].requests]
}
}
resource "kubernetes_deployment" "forgejo" {

View file

@ -293,7 +293,7 @@ resource "kubernetes_persistent_volume_claim" "data_encrypted" {
annotations = {
"resize.topolvm.io/threshold" = "80%"
"resize.topolvm.io/increase" = "100%"
"resize.topolvm.io/storage_limit" = "5Gi"
"resize.topolvm.io/storage_limit" = "10Gi"
}
}
spec {
@ -301,10 +301,16 @@ resource "kubernetes_persistent_volume_claim" "data_encrypted" {
storage_class_name = "proxmox-lvm-encrypted"
resources {
requests = {
storage = "2Gi"
storage = "5Gi"
}
}
}
lifecycle {
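# pvc-autoresizer expands this PVC up to storage_limit, so ignore drift on
# the whole requests map (in practice: requests.storage). While this block
# is present, edits to `storage` above are NOT applied to an existing PVC.
# To bump the floor manually: temporarily remove this block, apply the new
# size, re-add the block, apply again.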
ignore_changes = [spec[0].resources[0].requests]
}
}
resource "kubernetes_deployment" "mailserver" {

View file

@ -380,8 +380,11 @@ serverFiles:
regex: 'kubernetes_feature_enabled|kubelet_container_log_filesystem_used_bytes'
action: drop
# Whitelist: only keep essential kubelet metrics
# kubelet_volume_stats_available_bytes is required by pvc-autoresizer
# (it computes utilization as 1 - available/capacity). Without it the
# autoresizer is silent for every PVC in the cluster.
- source_labels: [__name__]
regex: 'kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_used_bytes|kubelet_volume_stats_inodes_used|kubelet_running_containers|kubelet_runtime_operations_errors_total|process_cpu_seconds_total|process_resident_memory_bytes|process_start_time_seconds|go_memstats_alloc_bytes|up'
regex: 'kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_used_bytes|kubelet_volume_stats_available_bytes|kubelet_volume_stats_inodes_used|kubelet_running_containers|kubelet_runtime_operations_errors_total|process_cpu_seconds_total|process_resident_memory_bytes|process_start_time_seconds|go_memstats_alloc_bytes|up'
action: keep
- job_name: kubernetes-nodes-cadvisor
scheme: https