right-size cluster memory: reduce over-provisioned services, fix under-provisioned ones
Phase 1 - Quick wins (~4.5 Gi saved):
- democratic-csi: add explicit sidecar resources (64-80Mi vs 256Mi LimitRange default)
- caretta: 768Mi → 600Mi (VPA upper 485Mi)
- immich-ml: 4Gi → 3584Mi (VPA upper 2.95Gi, GPU margin)
- onlyoffice: 3Gi → 2304Mi (VPA upper 1.82Gi)

Phase 2 - Safety fixes (prevent OOMKills):
- frigate: 2Gi/8Gi → 5Gi/10Gi (VPA upper 7.7Gi, was 4% headroom)
- openclaw: 1280Mi req → 2Gi req=limit (documented 2Gi requirement)

Phase 3 - Additional right-sizing:
- authentik workers: 1Gi → 896Mi x3 (VPA upper 722Mi)
- shlink: 512Mi/768Mi → 960Mi req=limit (VPA upper 780Mi, safety increase)

Phase 4 - Burstable QoS for lower tiers:
- tier-3-edge: 128Mi/128Mi → 96Mi req / 192Mi limit
- tier-4-aux: 128Mi/128Mi → 64Mi req / 256Mi limit

Phase 5 - Monitoring:
- Add ClusterMemoryRequestsHigh alert (>85% allocatable, 15m)
- Add ContainerNearOOM alert (>85% limit, 30m)
- Add PodUnschedulable alert (5m, critical)

Cluster: 92.7% → 90.8% memory requests. Stirling-pdf now schedulable.
This commit is contained in:
parent
8bac6db48f
commit
194281e527
10 changed files with 84 additions and 19 deletions
|
|
@ -29,10 +29,10 @@ resource "helm_release" "caretta" {
|
|||
resources = {
|
||||
requests = {
|
||||
cpu = "10m"
|
||||
memory = "768Mi"
|
||||
memory = "600Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "768Mi"
|
||||
memory = "600Mi"
|
||||
}
|
||||
}
|
||||
})]
|
||||
|
|
|
|||
|
|
@ -396,6 +396,27 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container }}: OOM killed"
|
||||
# Fires when the memory requested by live pods exceeds 85% of the cluster's
# allocatable memory for 15 minutes.
- alert: ClusterMemoryRequestsHigh
  # kube_pod_container_resource_requests is also exported for Succeeded/Failed
  # pods (e.g. finished Job pods), whose requests the scheduler no longer
  # counts. Joining on kube_pod_status_phase keeps only Pending/Running pods so
  # the ratio is not inflated by terminated pods.
  expr: |
    sum(
      kube_pod_container_resource_requests{resource="memory"}
      * on (namespace, pod) group_left ()
      max by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
    )
    /
    sum(kube_node_status_allocatable{resource="memory"})
    > 0.85
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "Cluster memory requests above 85% of allocatable"
|
||||
# Fires when a container's working set stays above 85% of its memory limit
# for 30 minutes.
- alert: ContainerNearOOM
  # container!="" drops the pod-level cgroup series and container!="POD" drops
  # the pause container; without these selectors the same pod can alert twice
  # and the summary renders with an empty container name.
  # The trailing "> 0" guard excludes containers with no memory limit set
  # (limit reported as 0, which would make the ratio +Inf).
  expr: |
    (
      container_memory_working_set_bytes{container!="", container!="POD"}
      /
      container_spec_memory_limit_bytes{container!="", container!="POD"}
      > 0.85
    )
    and
    container_spec_memory_limit_bytes{container!="", container!="POD"} > 0
  for: 30m
  labels:
    severity: warning
  annotations:
    summary: "{{ $labels.container }} in {{ $labels.namespace }}/{{ $labels.pod }} using >85% of memory limit"
|
||||
# Fires when a pod's PodScheduled condition has been false for 5 minutes,
# i.e. the scheduler could not place it on any node.
- alert: PodUnschedulable
  # kube-state-metrics exposes the PodScheduled condition as
  # kube_pod_status_scheduled{condition="true|false|unknown"}; the previously
  # used kube_pod_status_conditions is not a metric it exports, so the old
  # expression would never match a series.
  # NOTE(review): confirm against the kube-state-metrics version deployed here.
  expr: kube_pod_status_scheduled{condition="false"} == 1
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} unschedulable"
|
||||
- alert: NodeNotReady
|
||||
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
|
||||
for: 5m
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue