From 194281e527b66f44e8d39b67c17210f852c7cc5d Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 15 Mar 2026 15:30:18 +0000 Subject: [PATCH] right-size cluster memory: reduce overprovisioned, fix under-provisioned services MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 - Quick wins (~4.5 Gi saved): - democratic-csi: add explicit sidecar resources (32-80Mi vs 256Mi LimitRange default) - caretta: 768Mi → 600Mi (VPA upper 485Mi) - immich-ml: 4Gi → 3584Mi (VPA upper 2.95Gi, GPU margin) - onlyoffice: 3Gi → 2304Mi (VPA upper 1.82Gi) Phase 2 - Safety fixes (prevent OOMKills): - frigate: 2Gi/8Gi → 5Gi/10Gi (VPA upper 7.7Gi, was 4% headroom) - openclaw: 1280Mi req → 2Gi req=limit (documented 2Gi requirement) Phase 3 - Additional right-sizing: - authentik workers: 1Gi → 896Mi x3 (VPA upper 722Mi) - shlink: 512Mi/768Mi → 960Mi req=limit (VPA upper 780Mi, safety increase) Phase 4 - Burstable QoS for lower tiers: - tier-3-edge: 128Mi/128Mi → 96Mi req / 192Mi limit - tier-4-aux: 128Mi/128Mi → 64Mi req / 256Mi limit Phase 5 - Monitoring: - Add ClusterMemoryRequestsHigh alert (>85% allocatable, 15m) - Add ContainerNearOOM alert (>85% limit, 30m) - Add PodUnschedulable alert (5m, critical) Cluster: 92.7% → 90.8% memory requests. Stirling-pdf now schedulable. 
--- stacks/frigate/main.tf | 4 +- stacks/immich/main.tf | 4 +- stacks/onlyoffice/main.tf | 4 +- stacks/openclaw/main.tf | 2 +- stacks/platform/modules/authentik/values.yaml | 4 +- stacks/platform/modules/iscsi-csi/main.tf | 44 +++++++++++++++++++ .../modules/kyverno/resource-governance.tf | 12 ++--- stacks/platform/modules/monitoring/caretta.tf | 4 +- .../monitoring/prometheus_chart_values.tpl | 21 +++++++++ stacks/url/main.tf | 4 +- 10 files changed, 84 insertions(+), 19 deletions(-) diff --git a/stacks/frigate/main.tf b/stacks/frigate/main.tf index f142b160..71eb8a75 100644 --- a/stacks/frigate/main.tf +++ b/stacks/frigate/main.tf @@ -86,10 +86,10 @@ resource "kubernetes_deployment" "frigate" { resources { requests = { cpu = "1500m" - memory = "2Gi" + memory = "5Gi" } limits = { - memory = "8Gi" + memory = "10Gi" "nvidia.com/gpu" = "1" } } diff --git a/stacks/immich/main.tf b/stacks/immich/main.tf index 46f09603..8752abe4 100644 --- a/stacks/immich/main.tf +++ b/stacks/immich/main.tf @@ -513,10 +513,10 @@ resource "kubernetes_deployment" "immich-machine-learning" { resources { requests = { cpu = "100m" - memory = "4Gi" + memory = "3584Mi" } limits = { - memory = "4Gi" + memory = "3584Mi" "nvidia.com/gpu" = "1" } } diff --git a/stacks/onlyoffice/main.tf b/stacks/onlyoffice/main.tf index 44f30a9a..86b16bc3 100644 --- a/stacks/onlyoffice/main.tf +++ b/stacks/onlyoffice/main.tf @@ -103,10 +103,10 @@ resource "kubernetes_deployment" "onlyoffice-document-server" { resources { requests = { cpu = "100m" - memory = "3Gi" + memory = "2304Mi" } limits = { - memory = "3Gi" + memory = "2304Mi" } } port { diff --git a/stacks/openclaw/main.tf b/stacks/openclaw/main.tf index e153fede..bffd6511 100644 --- a/stacks/openclaw/main.tf +++ b/stacks/openclaw/main.tf @@ -426,7 +426,7 @@ resource "kubernetes_deployment" "openclaw" { } requests = { cpu = "100m" - memory = "1280Mi" + memory = "2Gi" } } } diff --git a/stacks/platform/modules/authentik/values.yaml 
b/stacks/platform/modules/authentik/values.yaml index ddd191cd..05cc1065 100644 --- a/stacks/platform/modules/authentik/values.yaml +++ b/stacks/platform/modules/authentik/values.yaml @@ -48,9 +48,9 @@ worker: resources: requests: cpu: 100m - memory: 1Gi + memory: 896Mi limits: - memory: 1Gi + memory: 896Mi topologySpreadConstraints: - maxSkew: 1 topologyKey: kubernetes.io/hostname diff --git a/stacks/platform/modules/iscsi-csi/main.tf b/stacks/platform/modules/iscsi-csi/main.tf index 0860b317..4c4a8d59 100644 --- a/stacks/platform/modules/iscsi-csi/main.tf +++ b/stacks/platform/modules/iscsi-csi/main.tf @@ -43,6 +43,38 @@ resource "helm_release" "democratic_csi" { limits = { memory = "192Mi" } } } + externalProvisioner = { + resources = { + requests = { cpu = "5m", memory = "64Mi" } + limits = { memory = "64Mi" } + } + } + externalAttacher = { + resources = { + requests = { cpu = "5m", memory = "64Mi" } + limits = { memory = "64Mi" } + } + } + externalResizer = { + resources = { + requests = { cpu = "5m", memory = "64Mi" } + limits = { memory = "64Mi" } + } + } + externalSnapshotter = { + resources = { + requests = { cpu = "5m", memory = "80Mi" } + limits = { memory = "80Mi" } + } + } + } + + # csiProxy is a top-level chart key, NOT nested under controller/node + csiProxy = { + resources = { + requests = { cpu = "5m", memory = "32Mi" } + limits = { memory = "32Mi" } + } } node = { @@ -52,6 +84,18 @@ resource "helm_release" "democratic_csi" { limits = { memory = "192Mi" } } } + driverRegistrar = { + resources = { + requests = { cpu = "5m", memory = "32Mi" } + limits = { memory = "32Mi" } + } + } + cleanup = { + resources = { + requests = { cpu = "5m", memory = "32Mi" } + limits = { memory = "32Mi" } + } + } hostPID = true hostPath = "/lib/modules" diff --git a/stacks/platform/modules/kyverno/resource-governance.tf b/stacks/platform/modules/kyverno/resource-governance.tf index 5b948b62..86384279 100644 --- a/stacks/platform/modules/kyverno/resource-governance.tf +++ 
b/stacks/platform/modules/kyverno/resource-governance.tf @@ -263,7 +263,7 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" { } } }, - # Tier 3-edge + # Tier 3-edge — Burstable QoS: request < limit to reduce scheduler pressure { name = "limitrange-tier-3-edge" match = { @@ -305,11 +305,11 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" { { type = "Container" default = { - memory = "128Mi" + memory = "192Mi" } defaultRequest = { cpu = "50m" - memory = "128Mi" + memory = "96Mi" } max = { memory = "4Gi" @@ -320,7 +320,7 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" { } } }, - # Tier 4-aux + # Tier 4-aux — Burstable QoS: request < limit to reduce scheduler pressure { name = "limitrange-tier-4-aux" match = { @@ -362,11 +362,11 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" { { type = "Container" default = { - memory = "128Mi" + memory = "256Mi" } defaultRequest = { cpu = "50m" - memory = "128Mi" + memory = "64Mi" } max = { memory = "4Gi" diff --git a/stacks/platform/modules/monitoring/caretta.tf b/stacks/platform/modules/monitoring/caretta.tf index 5f3ca34a..cf8bfcae 100644 --- a/stacks/platform/modules/monitoring/caretta.tf +++ b/stacks/platform/modules/monitoring/caretta.tf @@ -29,10 +29,10 @@ resource "helm_release" "caretta" { resources = { requests = { cpu = "10m" - memory = "768Mi" + memory = "600Mi" } limits = { - memory = "768Mi" + memory = "600Mi" } } })] diff --git a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl index 107a7199..5a1fc65d 100755 --- a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl @@ -396,6 +396,27 @@ serverFiles: severity: warning annotations: summary: "{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container }}: OOM killed" + - alert: ClusterMemoryRequestsHigh + expr: 
sum(kube_pod_container_resource_requests{resource="memory"}) / sum(kube_node_status_allocatable{resource="memory"}) > 0.85 + for: 15m + labels: + severity: warning + annotations: + summary: "Cluster memory requests above 85% of allocatable" + - alert: ContainerNearOOM + expr: container_memory_working_set_bytes{container!="",container!="POD"} / (container_spec_memory_limit_bytes{container!="",container!="POD"} > 0) > 0.85 + for: 30m + labels: + severity: warning + annotations: + summary: "{{ $labels.container }} in {{ $labels.namespace }}/{{ $labels.pod }} using >85% of memory limit" + - alert: PodUnschedulable + expr: kube_pod_status_unschedulable == 1 + for: 5m + labels: + severity: critical + annotations: + summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} unschedulable" - alert: NodeNotReady expr: kube_node_status_condition{condition="Ready",status="true"} == 0 for: 5m diff --git a/stacks/url/main.tf b/stacks/url/main.tf index 1950dfc0..a42c7c86 100644 --- a/stacks/url/main.tf +++ b/stacks/url/main.tf @@ -141,11 +141,11 @@ resource "kubernetes_deployment" "shlink" { # } resources { limits = { - memory = "768Mi" + memory = "960Mi" } requests = { cpu = "25m" - memory = "512Mi" + memory = "960Mi" } } port {