right-size cluster memory: reduce over-provisioned and fix under-provisioned services

Phase 1 - Quick wins (~4.5 Gi saved):
- democratic-csi: add explicit sidecar resources (32-80Mi vs 256Mi LimitRange default)
- caretta: 768Mi → 600Mi (VPA upper 485Mi)
- immich-ml: 4Gi → 3584Mi (VPA upper 2.95Gi, GPU margin)
- onlyoffice: 3Gi → 2304Mi (VPA upper 1.82Gi)

Phase 2 - Safety fixes (prevent OOMKills):
- frigate: 2Gi/8Gi → 5Gi/10Gi (VPA upper 7.7Gi, was 4% headroom)
- openclaw: 1280Mi req → 2Gi req=limit (documented 2Gi requirement)

Phase 3 - Additional right-sizing:
- authentik workers: 1Gi → 896Mi x3 (VPA upper 722Mi)
- shlink: 512Mi/768Mi → 960Mi req=limit (VPA upper 780Mi, safety increase)

Phase 4 - Burstable QoS for lower tiers:
- tier-3-edge: 128Mi/128Mi → 96Mi req / 192Mi limit
- tier-4-aux: 128Mi/128Mi → 64Mi req / 256Mi limit

Phase 5 - Monitoring:
- Add ClusterMemoryRequestsHigh alert (>85% allocatable, 15m)
- Add ContainerNearOOM alert (>85% limit, 30m)
- Add PodUnschedulable alert (5m, critical)

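Cluster: 92.7% → 90.8% memory requests. Stirling-pdf now schedulable.
Note: 90.8% is still above the new ClusterMemoryRequestsHigh threshold (85%), so that alert is expected to fire until requests are reduced further.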
This commit is contained in:
Viktor Barzin 2026-03-15 15:30:18 +00:00
parent 8bac6db48f
commit 194281e527
10 changed files with 84 additions and 19 deletions

View file

@ -86,10 +86,10 @@ resource "kubernetes_deployment" "frigate" {
resources {
requests = {
cpu = "1500m"
memory = "2Gi"
memory = "5Gi"
}
limits = {
memory = "8Gi"
memory = "10Gi"
"nvidia.com/gpu" = "1"
}
}

View file

@ -513,10 +513,10 @@ resource "kubernetes_deployment" "immich-machine-learning" {
resources {
requests = {
cpu = "100m"
memory = "4Gi"
memory = "3584Mi"
}
limits = {
memory = "4Gi"
memory = "3584Mi"
"nvidia.com/gpu" = "1"
}
}

View file

@ -103,10 +103,10 @@ resource "kubernetes_deployment" "onlyoffice-document-server" {
resources {
requests = {
cpu = "100m"
memory = "3Gi"
memory = "2304Mi"
}
limits = {
memory = "3Gi"
memory = "2304Mi"
}
}
port {

View file

@ -426,7 +426,7 @@ resource "kubernetes_deployment" "openclaw" {
}
requests = {
cpu = "100m"
memory = "1280Mi"
memory = "2Gi"
}
}
}

View file

@ -48,9 +48,9 @@ worker:
resources:
requests:
cpu: 100m
memory: 1Gi
memory: 896Mi
limits:
memory: 1Gi
memory: 896Mi
topologySpreadConstraints:
- maxSkew: 1
topologyKey: kubernetes.io/hostname

View file

@ -43,6 +43,38 @@ resource "helm_release" "democratic_csi" {
limits = { memory = "192Mi" }
}
}
externalProvisioner = {
resources = {
requests = { cpu = "5m", memory = "64Mi" }
limits = { memory = "64Mi" }
}
}
externalAttacher = {
resources = {
requests = { cpu = "5m", memory = "64Mi" }
limits = { memory = "64Mi" }
}
}
externalResizer = {
resources = {
requests = { cpu = "5m", memory = "64Mi" }
limits = { memory = "64Mi" }
}
}
externalSnapshotter = {
resources = {
requests = { cpu = "5m", memory = "80Mi" }
limits = { memory = "80Mi" }
}
}
}
# csiProxy is a top-level chart key, NOT nested under controller/node
csiProxy = {
resources = {
requests = { cpu = "5m", memory = "32Mi" }
limits = { memory = "32Mi" }
}
}
node = {
@ -52,6 +84,18 @@ resource "helm_release" "democratic_csi" {
limits = { memory = "192Mi" }
}
}
driverRegistrar = {
resources = {
requests = { cpu = "5m", memory = "32Mi" }
limits = { memory = "32Mi" }
}
}
cleanup = {
resources = {
requests = { cpu = "5m", memory = "32Mi" }
limits = { memory = "32Mi" }
}
}
hostPID = true
hostPath = "/lib/modules"

View file

@ -263,7 +263,7 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
}
}
},
# Tier 3-edge
# Tier 3-edge Burstable QoS: request < limit to reduce scheduler pressure
{
name = "limitrange-tier-3-edge"
match = {
@ -305,11 +305,11 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
{
type = "Container"
default = {
memory = "128Mi"
memory = "192Mi"
}
defaultRequest = {
cpu = "50m"
memory = "128Mi"
memory = "96Mi"
}
max = {
memory = "4Gi"
@ -320,7 +320,7 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
}
}
},
# Tier 4-aux
# Tier 4-aux Burstable QoS: request < limit to reduce scheduler pressure
{
name = "limitrange-tier-4-aux"
match = {
@ -362,11 +362,11 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
{
type = "Container"
default = {
memory = "128Mi"
memory = "256Mi"
}
defaultRequest = {
cpu = "50m"
memory = "128Mi"
memory = "64Mi"
}
max = {
memory = "4Gi"

View file

@ -29,10 +29,10 @@ resource "helm_release" "caretta" {
resources = {
requests = {
cpu = "10m"
memory = "768Mi"
memory = "600Mi"
}
limits = {
memory = "768Mi"
memory = "600Mi"
}
}
})]

View file

@ -396,6 +396,27 @@ serverFiles:
severity: warning
annotations:
summary: "{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container }}: OOM killed"
# Warns when the memory requested by Pending/Running pods exceeds 85% of the
# total node-allocatable memory for 15 minutes.
# The requests are joined against kube_pod_status_phase so pods in a terminal
# phase (Succeeded/Failed, e.g. finished Jobs) are excluded: kube-state-metrics
# still exports their requests, but they no longer reserve node capacity.
- alert: ClusterMemoryRequestsHigh
  expr: |
    sum(
      kube_pod_container_resource_requests{resource="memory"}
      * on(namespace, pod) group_left()
      max by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
    )
    /
    sum(kube_node_status_allocatable{resource="memory"})
    > 0.85
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "Cluster memory requests above 85% of allocatable"
# Warns when a container's working set stays above 85% of its memory limit
# for 30 minutes.
# - container!="" / container!="POD" drop the pod-level cgroup aggregate and
#   the pause container, which would otherwise alert with an empty or
#   meaningless container label.
# - The "> 0" filter on the limit is applied before dividing, so containers
#   without a memory limit (limit reported as 0) never enter the ratio.
- alert: ContainerNearOOM
  expr: |
    (
      container_memory_working_set_bytes{container!="", container!="POD"}
      /
      (container_spec_memory_limit_bytes{container!="", container!="POD"} > 0)
    ) > 0.85
  for: 30m
  labels:
    severity: warning
  annotations:
    summary: "{{ $labels.container }} in {{ $labels.namespace }}/{{ $labels.pod }} using >85% of memory limit"
# Critical alert when a pod's PodScheduled condition has been false for 5 minutes.
# kube-state-metrics exposes this condition as kube_pod_status_scheduled with a
# `condition` label ("true"/"false"/"unknown"); the previously used
# kube_pod_status_conditions series is not a kube-state-metrics metric, so the
# rule could never fire.
# NOTE(review): confirm the metric name against the deployed kube-state-metrics version.
- alert: PodUnschedulable
  expr: kube_pod_status_scheduled{condition="false"} == 1
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} unschedulable"
- alert: NodeNotReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 5m

View file

@ -141,11 +141,11 @@ resource "kubernetes_deployment" "shlink" {
# }
resources {
limits = {
memory = "768Mi"
memory = "960Mi"
}
requests = {
cpu = "25m"
memory = "512Mi"
memory = "960Mi"
}
}
port {