fix node OOM: reduce memory overcommit ratio and add kubelet eviction thresholds
LimitRange defaults had a 4-8x limit/request ratio, so the scheduler (which places pods by request) overpacked nodes; when pods burst toward their limits, nodes OOM-thrashed and became unresponsive (k8s-node3 and k8s-node4 both went down today).

Changes:
- Increase default memory requests across all tiers (limit/request ratio now 2x):
  - core/cluster: 64Mi → 256Mi request (512Mi limit)
  - gpu: 256Mi → 1Gi request (2Gi limit)
  - edge/aux/fallback: 64Mi → 128Mi request (256Mi limit)
- Add kubelet memory reservation and eviction thresholds:
  - systemReserved: 512Mi, kubeReserved: 512Mi
  - evictionHard: 500Mi (was 100Mi), evictionSoft: 1Gi (was unset)
- Applied to all nodes and to the future node template
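Verification (illustrative, not part of the change; the node name is just one of the affected nodes and jq is only used for readability):

    kubectl get --raw "/api/v1/nodes/k8s-node3/proxy/configz" \
      | jq '.kubeletconfig | {systemReserved, kubeReserved, evictionHard, evictionSoft}'
    kubectl describe node k8s-node3 | grep -A 6 '^Allocatable:'

Allocatable memory should now equal node capacity minus systemReserved, kubeReserved and the hard eviction threshold.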
parent 193f2e2dc5
commit fffc2ed0ab
2 changed files with 30 additions and 12 deletions
@@ -86,6 +86,24 @@ module "k8s-node-template" {
 sudo sed -i '/serializeImagePulls:/d' /var/lib/kubelet/config.yaml && \
 sudo sed -i '/maxParallelImagePulls:/d' /var/lib/kubelet/config.yaml && \
 echo -e 'serializeImagePulls: false\nmaxParallelImagePulls: 50' | sudo tee -a /var/lib/kubelet/config.yaml
+
+# Memory reservation and eviction — prevent node OOM by reserving memory
+# for OS/kubelet and evicting pods before the node runs out of memory.
+sudo sed -i '/systemReserved:/d; /kubeReserved:/d; /evictionHard:/,/^[^ ]/{ /evictionHard:/d; /^ /d }; /evictionSoft:/,/^[^ ]/{ /evictionSoft:/d; /^ /d }; /evictionSoftGracePeriod:/,/^[^ ]/{ /evictionSoftGracePeriod:/d; /^ /d }' /var/lib/kubelet/config.yaml
+cat <<'KUBELET_PATCH' | sudo tee -a /var/lib/kubelet/config.yaml
+systemReserved:
+  memory: "512Mi"
+kubeReserved:
+  memory: "512Mi"
+evictionHard:
+  memory.available: "500Mi"
+  nodefs.available: "10%"
+  imagefs.available: "15%"
+evictionSoft:
+  memory.available: "1Gi"
+evictionSoftGracePeriod:
+  memory.available: "30s"
+KUBELET_PATCH
 EOF
 k8s_join_command = var.k8s_join_command
 }
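For nodes that are already provisioned, the appended settings only take effect once the kubelet re-reads its config; a minimal manual follow-up, assuming a systemd-managed kubelet (the 16Gi node size in the comment is only an example), would be:

    # Restart kubelet so the appended reservation/eviction settings are picked up.
    # On a 16Gi node, allocatable memory then becomes roughly
    # 16Gi - 512Mi (systemReserved) - 512Mi (kubeReserved) - 500Mi (evictionHard) ≈ 14.5Gi.
    sudo systemctl restart kubelet
    # The eviction manager should log the configured thresholds shortly after startup.
    sudo journalctl -u kubelet --since '5 min ago' | grep -i eviction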
@@ -134,8 +134,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
   memory = "512Mi"
 }
 defaultRequest = {
-  cpu = "50m"
-  memory = "64Mi"
+  cpu = "100m"
+  memory = "256Mi"
 }
 max = {
   cpu = "4"
@@ -193,8 +193,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
   memory = "512Mi"
 }
 defaultRequest = {
-  cpu = "50m"
-  memory = "64Mi"
+  cpu = "100m"
+  memory = "256Mi"
 }
 max = {
   cpu = "2"
@@ -252,8 +252,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
   memory = "2Gi"
 }
 defaultRequest = {
-  cpu = "100m"
-  memory = "256Mi"
+  cpu = "200m"
+  memory = "1Gi"
 }
 max = {
   cpu = "8"
@@ -311,8 +311,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
   memory = "256Mi"
 }
 defaultRequest = {
-  cpu = "25m"
-  memory = "64Mi"
+  cpu = "50m"
+  memory = "128Mi"
 }
 max = {
   cpu = "2"
@@ -370,8 +370,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
   memory = "256Mi"
 }
 defaultRequest = {
-  cpu = "25m"
-  memory = "64Mi"
+  cpu = "50m"
+  memory = "128Mi"
 }
 max = {
   cpu = "2"
@@ -432,8 +432,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
   memory = "256Mi"
 }
 defaultRequest = {
-  cpu = "25m"
-  memory = "64Mi"
+  cpu = "50m"
+  memory = "128Mi"
 }
 max = {
   cpu = "1"
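As a follow-up check, the new defaults can be read back from the live LimitRange objects and from a pod that declares no resources; the namespace below is purely illustrative (whichever namespace this fallback tier maps to):

    # Show the defaults the admission controller will inject for this tier.
    kubectl describe limitrange -n fallback
    # A container created without requests should show the 128Mi default.
    kubectl get pods -n fallback -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[0].resources.requests.memory}{"\n"}{end}'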