[ci skip] Fix Prometheus storage alert and Grafana quota exhaustion

- Enable size-based TSDB retention (45GB) to clean up old blocks (including 2021-era blocks with failed compaction)
- Increase monitoring namespace quota limits from 64 CPU / 128Gi memory to 80 CPU / 160Gi memory to allow Grafana rolling updates
2026-02-21 21:04:08 +00:00 · 2026-02-21 21:04:08 +00:00 · 26ba9ea371
commit 26ba9ea371
parent dcce738641
2 changed files with 3 additions and 3 deletions
--- a/modules/kubernetes/monitoring/main.tf
+++ b/modules/kubernetes/monitoring/main.tf
@@ -194,8 +194,8 @@ resource "kubernetes_resource_quota" "monitoring" {
    hard = {
      "requests.cpu"    = "16"
      "requests.memory" = "16Gi"
-      "limits.cpu"      = "64"
-      "limits.memory"   = "128Gi"
+      "limits.cpu"      = "80"
+      "limits.memory"   = "160Gi"
      pods              = "100"
    }
  }
--- a/modules/kubernetes/monitoring/prometheus_chart_values.tpl
+++ b/modules/kubernetes/monitoring/prometheus_chart_values.tpl
@@ -109,7 +109,7 @@ server:
    # - "web.enable-admin-api"
    - "web.enable-lifecycle"
    - "storage.tsdb.allow-overlapping-blocks"
-    # - "storage.tsdb.retention.size=1GB"
+    - "storage.tsdb.retention.size=45GB"
    - "storage.tsdb.wal-compression"
  persistentVolume:
    # enabled: false