From fffc2ed0ab3ed50ab869a0a3cc7830cd334ed8ca Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sun, 8 Mar 2026 10:33:38 +0000
Subject: [PATCH] fix node OOM: reduce memory overcommit ratio and add kubelet
 eviction thresholds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LimitRange defaults had a 4-8x limit/request ratio causing the scheduler
to overpack nodes. When pods burst, nodes OOM-thrashed and became
unresponsive (k8s-node3 and k8s-node4 both went down today).

Changes:
- Increase default memory requests across all tiers (ratio now 2x):
  - core/cluster: 64Mi → 256Mi request (512Mi limit)
  - gpu: 256Mi → 1Gi request (2Gi limit)
  - edge/aux/fallback: 64Mi → 128Mi request (256Mi limit)
- Add kubelet memory reservation and eviction thresholds:
  - systemReserved: 512Mi, kubeReserved: 512Mi
  - evictionHard: 500Mi (was 100Mi), evictionSoft: 1Gi (was unset)
  - Applied to all nodes and future node template
---
NOTE(review): the cleanup sed originally deleted only the bare
"systemReserved:"/"kubeReserved:" key lines, which orphans their
indented "memory:" children on re-provisioning and leaves invalid
kubelet YAML. It now removes those blocks with the same
/KEY:/,/^[^ ]/ range pattern already used for the eviction keys.
Line count of the hunk is unchanged.

 stacks/infra/main.tf                                 | 18 ++++++++++++++
 .../modules/kyverno/resource-governance.tf           | 24 ++++++++++------------
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/stacks/infra/main.tf b/stacks/infra/main.tf
index f5e5aed4..8015c9aa 100644
--- a/stacks/infra/main.tf
+++ b/stacks/infra/main.tf
@@ -86,6 +86,24 @@ module "k8s-node-template" {
     sudo sed -i '/serializeImagePulls:/d' /var/lib/kubelet/config.yaml && \
     sudo sed -i '/maxParallelImagePulls:/d' /var/lib/kubelet/config.yaml && \
     echo -e 'serializeImagePulls: false\nmaxParallelImagePulls: 50' | sudo tee -a /var/lib/kubelet/config.yaml
+
+    # Memory reservation and eviction — prevent node OOM by reserving memory
+    # for OS/kubelet and evicting pods before the node runs out of memory.
+    sudo sed -i '/systemReserved:/,/^[^ ]/{ /systemReserved:/d; /^ /d }; /kubeReserved:/,/^[^ ]/{ /kubeReserved:/d; /^ /d }; /evictionHard:/,/^[^ ]/{ /evictionHard:/d; /^ /d }; /evictionSoft:/,/^[^ ]/{ /evictionSoft:/d; /^ /d }; /evictionSoftGracePeriod:/,/^[^ ]/{ /evictionSoftGracePeriod:/d; /^ /d }' /var/lib/kubelet/config.yaml
+    cat <<'KUBELET_PATCH' | sudo tee -a /var/lib/kubelet/config.yaml
+systemReserved:
+  memory: "512Mi"
+kubeReserved:
+  memory: "512Mi"
+evictionHard:
+  memory.available: "500Mi"
+  nodefs.available: "10%"
+  imagefs.available: "15%"
+evictionSoft:
+  memory.available: "1Gi"
+evictionSoftGracePeriod:
+  memory.available: "30s"
+KUBELET_PATCH
 EOF
   k8s_join_command = var.k8s_join_command
 }
diff --git a/stacks/platform/modules/kyverno/resource-governance.tf b/stacks/platform/modules/kyverno/resource-governance.tf
index 5183dbc0..d671ba5c 100644
--- a/stacks/platform/modules/kyverno/resource-governance.tf
+++ b/stacks/platform/modules/kyverno/resource-governance.tf
@@ -134,8 +134,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
             memory = "512Mi"
           }
           defaultRequest = {
-            cpu    = "50m"
-            memory = "64Mi"
+            cpu    = "100m"
+            memory = "256Mi"
           }
           max = {
             cpu    = "4"
@@ -193,8 +193,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
             memory = "512Mi"
           }
           defaultRequest = {
-            cpu    = "50m"
-            memory = "64Mi"
+            cpu    = "100m"
+            memory = "256Mi"
           }
           max = {
             cpu    = "2"
@@ -252,8 +252,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
             memory = "2Gi"
           }
           defaultRequest = {
-            cpu    = "100m"
-            memory = "256Mi"
+            cpu    = "200m"
+            memory = "1Gi"
           }
           max = {
             cpu    = "8"
@@ -311,8 +311,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
             memory = "256Mi"
           }
           defaultRequest = {
-            cpu    = "25m"
-            memory = "64Mi"
+            cpu    = "50m"
+            memory = "128Mi"
           }
           max = {
             cpu    = "2"
@@ -370,8 +370,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
             memory = "256Mi"
           }
           defaultRequest = {
-            cpu    = "25m"
-            memory = "64Mi"
+            cpu    = "50m"
+            memory = "128Mi"
           }
           max = {
             cpu    = "2"
@@ -432,8 +432,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
             memory = "256Mi"
           }
           defaultRequest = {
-            cpu    = "25m"
-            memory = "64Mi"
+            cpu    = "50m"
+            memory = "128Mi"
           }
           max = {
             cpu    = "1"