right-size cluster memory: reduce over-provisioned, fix under-provisioned services
Phase 1 - Quick wins (~4.5Gi saved):
- democratic-csi: add explicit sidecar resources (32-80Mi vs 256Mi LimitRange default)
- caretta: 768Mi → 600Mi (VPA upper 485Mi)
- immich-ml: 4Gi → 3584Mi (VPA upper 2.95Gi, GPU margin)
- onlyoffice: 3Gi → 2304Mi (VPA upper 1.82Gi)

Phase 2 - Safety fixes (prevent OOMKills):
- frigate: 2Gi/8Gi → 5Gi/10Gi (VPA upper 7.7Gi, was 4% headroom)
- openclaw: 1280Mi req → 2Gi req=limit (documented 2Gi requirement)

Phase 3 - Additional right-sizing:
- authentik workers: 1Gi → 896Mi x3 (VPA upper 722Mi)
- shlink: 512Mi/768Mi → 960Mi req=limit (VPA upper 780Mi, safety increase)

Phase 4 - Burstable QoS for lower tiers:
- tier-3-edge: 128Mi/128Mi → 96Mi req / 192Mi limit
- tier-4-aux: 128Mi/128Mi → 64Mi req / 256Mi limit

Phase 5 - Monitoring:
- Add ClusterMemoryRequestsHigh alert (>85% allocatable, 15m)
- Add ContainerNearOOM alert (>85% limit, 30m)
- Add PodUnschedulable alert (5m, critical)

Cluster: 92.7% → 90.8% memory requests. Stirling-pdf is now schedulable.
This commit is contained in:
parent
8bac6db48f
commit
194281e527
10 changed files with 84 additions and 19 deletions
|
|
@ -86,10 +86,10 @@ resource "kubernetes_deployment" "frigate" {
|
|||
resources {
|
||||
requests = {
|
||||
cpu = "1500m"
|
||||
memory = "2Gi"
|
||||
memory = "5Gi"
|
||||
}
|
||||
limits = {
|
||||
memory = "8Gi"
|
||||
memory = "10Gi"
|
||||
"nvidia.com/gpu" = "1"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -513,10 +513,10 @@ resource "kubernetes_deployment" "immich-machine-learning" {
|
|||
resources {
|
||||
requests = {
|
||||
cpu = "100m"
|
||||
memory = "4Gi"
|
||||
memory = "3584Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "4Gi"
|
||||
memory = "3584Mi"
|
||||
"nvidia.com/gpu" = "1"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -103,10 +103,10 @@ resource "kubernetes_deployment" "onlyoffice-document-server" {
|
|||
resources {
|
||||
requests = {
|
||||
cpu = "100m"
|
||||
memory = "3Gi"
|
||||
memory = "2304Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "3Gi"
|
||||
memory = "2304Mi"
|
||||
}
|
||||
}
|
||||
port {
|
||||
|
|
|
|||
|
|
@ -426,7 +426,7 @@ resource "kubernetes_deployment" "openclaw" {
|
|||
}
|
||||
requests = {
|
||||
cpu = "100m"
|
||||
memory = "1280Mi"
|
||||
memory = "2Gi"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,9 +48,9 @@ worker:
|
|||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 1Gi
|
||||
memory: 896Mi
|
||||
limits:
|
||||
memory: 1Gi
|
||||
memory: 896Mi
|
||||
topologySpreadConstraints:
|
||||
- maxSkew: 1
|
||||
topologyKey: kubernetes.io/hostname
|
||||
|
|
|
|||
|
|
@ -43,6 +43,38 @@ resource "helm_release" "democratic_csi" {
|
|||
limits = { memory = "192Mi" }
|
||||
}
|
||||
}
|
||||
externalProvisioner = {
|
||||
resources = {
|
||||
requests = { cpu = "5m", memory = "64Mi" }
|
||||
limits = { memory = "64Mi" }
|
||||
}
|
||||
}
|
||||
externalAttacher = {
|
||||
resources = {
|
||||
requests = { cpu = "5m", memory = "64Mi" }
|
||||
limits = { memory = "64Mi" }
|
||||
}
|
||||
}
|
||||
externalResizer = {
|
||||
resources = {
|
||||
requests = { cpu = "5m", memory = "64Mi" }
|
||||
limits = { memory = "64Mi" }
|
||||
}
|
||||
}
|
||||
externalSnapshotter = {
|
||||
resources = {
|
||||
requests = { cpu = "5m", memory = "80Mi" }
|
||||
limits = { memory = "80Mi" }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# csiProxy is a top-level chart key, NOT nested under controller/node
|
||||
csiProxy = {
|
||||
resources = {
|
||||
requests = { cpu = "5m", memory = "32Mi" }
|
||||
limits = { memory = "32Mi" }
|
||||
}
|
||||
}
|
||||
|
||||
node = {
|
||||
|
|
@ -52,6 +84,18 @@ resource "helm_release" "democratic_csi" {
|
|||
limits = { memory = "192Mi" }
|
||||
}
|
||||
}
|
||||
driverRegistrar = {
|
||||
resources = {
|
||||
requests = { cpu = "5m", memory = "32Mi" }
|
||||
limits = { memory = "32Mi" }
|
||||
}
|
||||
}
|
||||
cleanup = {
|
||||
resources = {
|
||||
requests = { cpu = "5m", memory = "32Mi" }
|
||||
limits = { memory = "32Mi" }
|
||||
}
|
||||
}
|
||||
|
||||
hostPID = true
|
||||
hostPath = "/lib/modules"
|
||||
|
|
|
|||
|
|
@ -263,7 +263,7 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
|
|||
}
|
||||
}
|
||||
},
|
||||
# Tier 3-edge
|
||||
# Tier 3-edge — Burstable QoS: request < limit to reduce scheduler pressure
|
||||
{
|
||||
name = "limitrange-tier-3-edge"
|
||||
match = {
|
||||
|
|
@ -305,11 +305,11 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
|
|||
{
|
||||
type = "Container"
|
||||
default = {
|
||||
memory = "128Mi"
|
||||
memory = "192Mi"
|
||||
}
|
||||
defaultRequest = {
|
||||
cpu = "50m"
|
||||
memory = "128Mi"
|
||||
memory = "96Mi"
|
||||
}
|
||||
max = {
|
||||
memory = "4Gi"
|
||||
|
|
@ -320,7 +320,7 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
|
|||
}
|
||||
}
|
||||
},
|
||||
# Tier 4-aux
|
||||
# Tier 4-aux — Burstable QoS: request < limit to reduce scheduler pressure
|
||||
{
|
||||
name = "limitrange-tier-4-aux"
|
||||
match = {
|
||||
|
|
@ -362,11 +362,11 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
|
|||
{
|
||||
type = "Container"
|
||||
default = {
|
||||
memory = "128Mi"
|
||||
memory = "256Mi"
|
||||
}
|
||||
defaultRequest = {
|
||||
cpu = "50m"
|
||||
memory = "128Mi"
|
||||
memory = "64Mi"
|
||||
}
|
||||
max = {
|
||||
memory = "4Gi"
|
||||
|
|
|
|||
|
|
@ -29,10 +29,10 @@ resource "helm_release" "caretta" {
|
|||
resources = {
|
||||
requests = {
|
||||
cpu = "10m"
|
||||
memory = "768Mi"
|
||||
memory = "600Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "768Mi"
|
||||
memory = "600Mi"
|
||||
}
|
||||
}
|
||||
})]
|
||||
|
|
|
|||
|
|
@ -396,6 +396,27 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container }}: OOM killed"
|
||||
- alert: ClusterMemoryRequestsHigh
|
||||
expr: sum(kube_pod_container_resource_requests{resource="memory"}) / sum(kube_node_status_allocatable{resource="memory"}) > 0.85
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Cluster memory requests above 85% of allocatable"
|
||||
- alert: ContainerNearOOM
|
||||
# Fires when a container's working set stays above 85% of its memory limit.
# container!="" drops the pod-level cgroup series and container!="POD" drops
# the pause container, so each real container alerts once and
# {{ $labels.container }} is never empty. The "> 0" filter on the limit
# excludes containers without a memory limit (limit reported as 0).
expr: container_memory_working_set_bytes{container!="", container!="POD"} / (container_spec_memory_limit_bytes > 0) > 0.85
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "{{ $labels.container }} in {{ $labels.namespace }}/{{ $labels.pod }} using >85% of memory limit"
|
||||
- alert: PodUnschedulable
|
||||
# Fires when a pod's PodScheduled condition has been false for 5 minutes.
# kube-state-metrics exposes this condition as kube_pod_status_scheduled with
# a `condition` label of true/false/unknown; selecting on a metric name that
# is not exported would leave this alert permanently silent.
expr: kube_pod_status_scheduled{condition="false"} == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} unschedulable"
|
||||
- alert: NodeNotReady
|
||||
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
|
||||
for: 5m
|
||||
|
|
|
|||
|
|
@ -141,11 +141,11 @@ resource "kubernetes_deployment" "shlink" {
|
|||
# }
|
||||
resources {
|
||||
limits = {
|
||||
memory = "768Mi"
|
||||
memory = "960Mi"
|
||||
}
|
||||
requests = {
|
||||
cpu = "25m"
|
||||
memory = "512Mi"
|
||||
memory = "960Mi"
|
||||
}
|
||||
}
|
||||
port {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue