From 9d5da4d8e0d3ccf28c49dda208258aaf1a5e53e7 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 9 May 2026 10:49:17 +0000 Subject: [PATCH] fix: restore pvc-autoresizer by allow-listing kubelet_volume_stats_available_bytes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Prometheus scrape config for the kubernetes-nodes job kept capacity_bytes + used_bytes but dropped available_bytes. pvc-autoresizer computes utilization from available/capacity, so without that metric it was silent for every PVC in the cluster — including mailserver, which filled to 89% (1.7G/2.0G) and started rejecting all inbound mail with '452 4.3.1 Insufficient system storage' (15+ hours, all real senders: Brevo, Gmail, Facebook). Also bumps the floors of mailserver (2Gi -> 5Gi, limit 10Gi) and forgejo (15Gi -> 30Gi) PVCs to recover from the immediate outage, and adds ignore_changes on requests.storage so future autoresizer expansions don't cause TF drift. --- stacks/forgejo/main.tf | 8 +++++++- stacks/mailserver/modules/mailserver/main.tf | 10 ++++++++-- .../modules/monitoring/prometheus_chart_values.tpl | 5 ++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/stacks/forgejo/main.tf b/stacks/forgejo/main.tf index fdd2fadf..54c60685 100644 --- a/stacks/forgejo/main.tf +++ b/stacks/forgejo/main.tf @@ -40,10 +40,16 @@ resource "kubernetes_persistent_volume_claim" "data_encrypted" { storage_class_name = "proxmox-lvm-encrypted" resources { requests = { - storage = "15Gi" + storage = "30Gi" } } } + lifecycle { + # pvc-autoresizer expands this PVC up to storage_limit; ignore drift on + # requests.storage. To bump the floor manually: temporarily remove this + # block, apply the new size, re-add the block, apply again. 
+ ignore_changes = [spec[0].resources[0].requests] + } } resource "kubernetes_deployment" "forgejo" { diff --git a/stacks/mailserver/modules/mailserver/main.tf b/stacks/mailserver/modules/mailserver/main.tf index c3c33d26..7744b202 100644 --- a/stacks/mailserver/modules/mailserver/main.tf +++ b/stacks/mailserver/modules/mailserver/main.tf @@ -293,7 +293,7 @@ resource "kubernetes_persistent_volume_claim" "data_encrypted" { annotations = { "resize.topolvm.io/threshold" = "80%" "resize.topolvm.io/increase" = "100%" - "resize.topolvm.io/storage_limit" = "5Gi" + "resize.topolvm.io/storage_limit" = "10Gi" } } spec { @@ -301,10 +301,16 @@ resource "kubernetes_persistent_volume_claim" "data_encrypted" { storage_class_name = "proxmox-lvm-encrypted" resources { requests = { - storage = "2Gi" + storage = "5Gi" } } } + lifecycle { + # pvc-autoresizer expands this PVC up to storage_limit; ignore drift on + # requests.storage. To bump the floor manually: temporarily remove this + # block, apply the new size, re-add the block, apply again. + ignore_changes = [spec[0].resources[0].requests] + } } resource "kubernetes_deployment" "mailserver" { diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 406a19c9..793dccf9 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -380,8 +380,11 @@ serverFiles: regex: 'kubernetes_feature_enabled|kubelet_container_log_filesystem_used_bytes' action: drop # Whitelist: only keep essential kubelet metrics + # kubelet_volume_stats_available_bytes is required by pvc-autoresizer + # (it computes utilization as 1 - available/capacity). Without it the + # autoresizer is silent for every PVC in the cluster. 
- source_labels: [__name__] - regex: 'kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_used_bytes|kubelet_volume_stats_inodes_used|kubelet_running_containers|kubelet_runtime_operations_errors_total|process_cpu_seconds_total|process_resident_memory_bytes|process_start_time_seconds|go_memstats_alloc_bytes|up' + regex: 'kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_used_bytes|kubelet_volume_stats_available_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_free|kubelet_volume_stats_inodes_used|kubelet_running_containers|kubelet_runtime_operations_errors_total|process_cpu_seconds_total|process_resident_memory_bytes|process_start_time_seconds|go_memstats_alloc_bytes|up' action: keep - job_name: kubernetes-nodes-cadvisor scheme: https