From 69c4c0c76e9c2b9ad4a6f16a337e0600bd41cc61 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Thu, 26 Feb 2026 23:15:43 +0000
Subject: [PATCH] [ci skip] VPA: reduce LimitRange defaults, add overcommit
 check, protect tier-0

- Reduce Kyverno LimitRange default limits 2-8x across all tiers to fix
  800-900% memory overcommitment on worker nodes
- Add cluster health check #25: per-node resource overcommitment, comparing
  summed requests and limits against allocatable capacity
- Add Kyverno policy for Goldilocks VPA mode by tier: tier-0 namespaces get
  VPA Off mode (recommend only, no evictions) to prevent downtime on critical
  infra (traefik, cloudflared, authentik, technitium, etc.)
- Non-tier-0 namespaces get VPA Auto mode for active right-sizing
---
 scripts/cluster_healthcheck.sh                 | 127 +++++++++++++++++-
 .../modules/kyverno/resource-governance.tf     |  52 +++----
 stacks/platform/modules/vpa/main.tf            |  90 +++++++++++++
 3 files changed, 242 insertions(+), 27 deletions(-)

diff --git a/scripts/cluster_healthcheck.sh b/scripts/cluster_healthcheck.sh
index f8d6ba34..4f2ccb6a 100755
--- a/scripts/cluster_healthcheck.sh
+++ b/scripts/cluster_healthcheck.sh
@@ -26,7 +26,7 @@ JSON=false
 KUBECONFIG_PATH="$(pwd)/config"
 KUBECTL=""
 JSON_RESULTS=()
-TOTAL_CHECKS=24
+TOTAL_CHECKS=25

 # --- Helpers ---
 info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
@@ -1151,6 +1151,130 @@ check_cloudflare_tunnel() {
   fi
 }

+# --- 25. Resource Overcommitment ---
+check_overcommit() {
+  section 25 "Resource Overcommitment"
+  local detail="" had_issue=false status="PASS"
+
+  local node_file pod_file
+  node_file=$(mktemp)
+  pod_file=$(mktemp)
+  trap "rm -f '$node_file' '$pod_file'" RETURN
+
+  $KUBECTL get nodes -o json >"$node_file" 2>/dev/null || { fail "Cannot get nodes"; json_add "overcommit" "FAIL" "Cannot get nodes"; return 0; }
+  $KUBECTL get pods -A -o json --field-selector=status.phase=Running >"$pod_file" 2>/dev/null || { fail "Cannot get pods"; json_add "overcommit" "FAIL" "Cannot get pods"; return 0; }
+
+  local overcommit_info
+  overcommit_info=$(python3 - "$node_file" "$pod_file" <<'PYEOF'
+import json, sys
+
+def parse_cpu(val):
+    val = str(val)
+    if val.endswith("m"):
+        return float(val[:-1])
+    elif val.endswith("n"):
+        return float(val[:-1]) / 1e6
+    return float(val) * 1000
+
+def parse_mem(val):
+    val = str(val)
+    units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4,
+             "K": 1000, "M": 1000**2, "G": 1000**3, "T": 1000**4}
+    for suffix, mult in sorted(units.items(), key=lambda x: -len(x[0])):
+        if val.endswith(suffix):
+            return float(val[:-len(suffix)]) * mult
+    return float(val)
+
+def fmt_mem(b):
+    return f"{b / (1024**3):.1f}Gi"
+
+def fmt_cpu(m):
+    if m >= 1000:
+        return f"{m/1000:.1f}"
+    return f"{m:.0f}m"
+
+with open(sys.argv[1]) as f:
+    nodes = json.load(f)
+with open(sys.argv[2]) as f:
+    pods = json.load(f)
+
+alloc = {}
+for node in nodes["items"]:
+    name = node["metadata"]["name"]
+    a = node["status"].get("allocatable", {})
+    alloc[name] = {
+        "cpu": parse_cpu(a.get("cpu", "0")),
+        "mem": parse_mem(a.get("memory", "0")),
+    }
+
+node_req = {n: {"cpu": 0, "mem": 0} for n in alloc}
+node_lim = {n: {"cpu": 0, "mem": 0} for n in alloc}
+
+for pod in pods["items"]:
+    node = pod.get("spec", {}).get("nodeName")
+    if not node or node not in alloc:
+        continue
+    for c in pod.get("spec", {}).get("containers", []) + pod.get("spec", {}).get("initContainers", []):
+        res = c.get("resources", {})
+        req = res.get("requests", {})
+        lim = res.get("limits", {})
+        node_req[node]["cpu"] += parse_cpu(req.get("cpu", "0"))
+        node_req[node]["mem"] += parse_mem(req.get("memory", "0"))
+        node_lim[node]["cpu"] += parse_cpu(lim.get("cpu", "0"))
+        node_lim[node]["mem"] += parse_mem(lim.get("memory", "0"))
+
+for name in sorted(alloc):
+    a = alloc[name]
+    r = node_req[name]
+    l = node_lim[name]
+    if a["cpu"] <= 0 or a["mem"] <= 0:
+        continue
+
+    req_cpu_pct = (r["cpu"] / a["cpu"]) * 100
+    req_mem_pct = (r["mem"] / a["mem"]) * 100
+    lim_cpu_pct = (l["cpu"] / a["cpu"]) * 100
+    lim_mem_pct = (l["mem"] / a["mem"]) * 100
+
+    level = "OK"
+    if lim_mem_pct > 500 or lim_cpu_pct > 500:
+        level = "FAIL"
+    elif lim_mem_pct > 200 or lim_cpu_pct > 200:
+        level = "WARN"
+
+    print(f"{level}:{name}:req cpu {fmt_cpu(r['cpu'])}/{fmt_cpu(a['cpu'])} ({req_cpu_pct:.0f}%), mem {fmt_mem(r['mem'])}/{fmt_mem(a['mem'])} ({req_mem_pct:.0f}%) | lim cpu {fmt_cpu(l['cpu'])}/{fmt_cpu(a['cpu'])} ({lim_cpu_pct:.0f}%), mem {fmt_mem(l['mem'])}/{fmt_mem(a['mem'])} ({lim_mem_pct:.0f}%)")
+PYEOF
+) || true
+
+  if [[ -z "$overcommit_info" ]]; then
+    pass "Overcommitment data unavailable (skipped)"
+    json_add "overcommit" "PASS" "No data"
+    return 0
+  fi
+
+  while IFS= read -r line; do
+    local level node_detail
+    level=$(echo "$line" | cut -d: -f1)
+    node_detail=$(echo "$line" | cut -d: -f2-)
+
+    if [[ "$level" == "FAIL" ]]; then
+      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Resource Overcommitment"
+      fail "$node_detail"
+      had_issue=true
+      status="FAIL"
+    elif [[ "$level" == "WARN" ]]; then
+      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Resource Overcommitment"
+      warn "$node_detail"
+      had_issue=true
+      [[ "$status" != "FAIL" ]] && status="WARN"
+    else
+      pass "$node_detail"
+    fi
+    detail+="$node_detail; "
+  done <<< "$overcommit_info"
+
+  json_add "overcommit" "$status" "$detail"
+}
+
 # --- Summary ---
 print_summary() {
   if [[ "$JSON" == true ]]; then
@@ -1227,6 +1351,7 @@ main() {
   check_tls_certs
   check_gpu
   check_cloudflare_tunnel
+  check_overcommit

   print_summary

   # Exit code: 2 for failures, 1 for warnings, 0 for clean
diff --git a/stacks/platform/modules/kyverno/resource-governance.tf b/stacks/platform/modules/kyverno/resource-governance.tf
index 5aaabf17..e6244d80 100644
--- a/stacks/platform/modules/kyverno/resource-governance.tf
+++ b/stacks/platform/modules/kyverno/resource-governance.tf
@@ -117,16 +117,16 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
         {
           type = "Container"
           default = {
-            cpu = "2"
-            memory = "4Gi"
+            cpu = "500m"
+            memory = "512Mi"
           }
           defaultRequest = {
-            cpu = "100m"
-            memory = "128Mi"
+            cpu = "50m"
+            memory = "64Mi"
           }
           max = {
-            cpu = "8"
-            memory = "16Gi"
+            cpu = "4"
+            memory = "8Gi"
           }
         }
       ]
@@ -163,16 +163,16 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
         {
           type = "Container"
           default = {
-            cpu = "2"
-            memory = "4Gi"
+            cpu = "500m"
+            memory = "512Mi"
           }
           defaultRequest = {
-            cpu = "100m"
-            memory = "128Mi"
+            cpu = "50m"
+            memory = "64Mi"
           }
           max = {
-            cpu = "4"
-            memory = "8Gi"
+            cpu = "2"
+            memory = "4Gi"
           }
         }
       ]
@@ -209,8 +209,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
         {
           type = "Container"
           default = {
-            cpu = "4"
-            memory = "8Gi"
+            cpu = "1"
+            memory = "2Gi"
           }
           defaultRequest = {
             cpu = "100m"
@@ -255,16 +255,16 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
         {
           type = "Container"
           default = {
-            cpu = "1"
-            memory = "2Gi"
+            cpu = "250m"
+            memory = "256Mi"
           }
           defaultRequest = {
-            cpu = "50m"
-            memory = "128Mi"
+            cpu = "25m"
+            memory = "64Mi"
           }
           max = {
-            cpu = "4"
-            memory = "8Gi"
+            cpu = "2"
+            memory = "4Gi"
          }
         }
       ]
@@ -301,8 +301,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
         {
           type = "Container"
           default = {
-            cpu = "500m"
-            memory = "1Gi"
+            cpu = "250m"
+            memory = "256Mi"
           }
           defaultRequest = {
             cpu = "25m"
@@ -363,16 +363,16 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
         {
           type = "Container"
           default = {
-            cpu = "500m"
-            memory = "1Gi"
+            cpu = "250m"
+            memory = "256Mi"
           }
           defaultRequest = {
             cpu = "25m"
             memory = "64Mi"
           }
           max = {
-            cpu = "2"
-            memory = "4Gi"
+            cpu = "1"
+            memory = "2Gi"
           }
         }
       ]
diff --git a/stacks/platform/modules/vpa/main.tf b/stacks/platform/modules/vpa/main.tf
index f433f9ec..10c1776c 100644
--- a/stacks/platform/modules/vpa/main.tf
+++ b/stacks/platform/modules/vpa/main.tf
@@ -84,3 +84,93 @@ module "ingress" {

   depends_on = [helm_release.goldilocks]
 }
+
+# -----------------------------------------------------------------------------
+# Kyverno policy — label namespaces for VPA mode by tier
+# -----------------------------------------------------------------------------
+# Goldilocks reads the goldilocks.fairwinds.com/vpa-update-mode label on
+# namespaces to decide the updateMode for VPA objects it creates.
+# Tier 0-core gets "off" (recommend only — these are critical infra where
+# evictions cause downtime). All other namespaces get "auto".
+
+resource "kubernetes_manifest" "vpa_auto_mode_label" {
+  manifest = {
+    apiVersion = "kyverno.io/v1"
+    kind = "ClusterPolicy"
+    metadata = {
+      name = "goldilocks-vpa-auto-mode"
+      annotations = {
+        "policies.kyverno.io/title" = "Goldilocks VPA Mode by Tier"
+        "policies.kyverno.io/description" = "Sets VPA update mode per namespace: Off for tier-0 critical infra (no evictions), Auto for all others."
+      }
+    }
+    spec = {
+      rules = [
+        # Tier 0-core: recommend only, never evict
+        {
+          name = "label-vpa-off-tier-0"
+          match = {
+            any = [
+              {
+                resources = {
+                  kinds = ["Namespace"]
+                  selector = {
+                    matchLabels = {
+                      tier = "0-core"
+                    }
+                  }
+                }
+              }
+            ]
+          }
+          mutate = {
+            patchStrategicMerge = {
+              metadata = {
+                labels = {
+                  "goldilocks.fairwinds.com/vpa-update-mode" = "off"
+                }
+              }
+            }
+          }
+        },
+        # All other namespaces: auto mode
+        {
+          name = "label-vpa-auto-default"
+          match = {
+            any = [
+              {
+                resources = {
+                  kinds = ["Namespace"]
+                }
+              }
+            ]
+          }
+          exclude = {
+            any = [
+              {
+                resources = {
+                  selector = {
+                    matchLabels = {
+                      tier = "0-core"
+                    }
+                  }
+                }
+              }
+            ]
+          }
+          mutate = {
+            patchStrategicMerge = {
+              metadata = {
+                labels = {
+                  "goldilocks.fairwinds.com/vpa-update-mode" = "auto"
+                }
+              }
+            }
+          }
+        },
+      ]
+    }
+  }
+
+  depends_on = [helm_release.goldilocks]
+}
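
--
Review notes (illustrative sketches, not part of the diff and ignored by git am):

Why the old defaults tripped the new check: check #25 sums container limits
per node and flags anything over 200% of allocatable as WARN and over 500% as
FAIL. A minimal standalone sketch of the same math, using a hypothetical 16Gi
worker and a made-up pod count (the parse_mem helper mirrors the one in the
check; none of these numbers come from the cluster):

    #!/usr/bin/env python3
    # Classify memory limit overcommit the way check #25 does (illustrative numbers).

    def parse_mem(val):
        # Kubernetes memory quantity -> bytes ("512Mi" -> 536870912.0).
        units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4,
                 "K": 1000, "M": 1000**2, "G": 1000**3, "T": 1000**4}
        val = str(val)
        for suffix in sorted(units, key=len, reverse=True):
            if val.endswith(suffix):
                return float(val[:-len(suffix)]) * units[suffix]
        return float(val)

    def classify(pct):
        # Same thresholds the check applies to limit percentages.
        return "FAIL" if pct > 500 else "WARN" if pct > 200 else "OK"

    alloc_mem = parse_mem("16Gi")  # hypothetical worker node
    pods = 34                      # hypothetical pods, all on LimitRange defaults

    for default_limit in ("4Gi", "512Mi"):  # old vs new tier-default limit
        pct = pods * parse_mem(default_limit) / alloc_mem * 100
        print(f"{pods} pods x {default_limit} limits: {pct:.0f}% -> {classify(pct)}")

With the old 4Gi default this prints 850% -> FAIL (squarely in the 800-900%
band the commit message cites); with the new 512Mi default it prints
106% -> OK.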
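To eyeball what the reduced tier defaults actually generated after apply, a
sketch that lists every Container-level default from the cluster's
LimitRanges. It assumes python3 and a kubectl with cluster access on PATH;
the namespaces and names in the output are whatever the Kyverno generate
rules created:

    #!/usr/bin/env python3
    # List Container-level defaults from all LimitRanges (read-only).
    import json, subprocess

    out = subprocess.run(
        ["kubectl", "get", "limitrange", "-A", "-o", "json"],
        check=True, capture_output=True, text=True,
    ).stdout

    for lr in json.loads(out)["items"]:
        ns = lr["metadata"]["namespace"]
        name = lr["metadata"]["name"]
        for item in lr["spec"].get("limits", []):
            if item.get("type") != "Container":
                continue
            d = item.get("default", {})
            m = item.get("max", {})
            print(f"{ns}/{name}: default cpu={d.get('cpu', '-')} "
                  f"memory={d.get('memory', '-')} | max cpu={m.get('cpu', '-')} "
                  f"memory={m.get('memory', '-')}")

Per the hunks above, default limits should now top out at 1 CPU / 2Gi, with
most tiers at 250m-500m CPU and 256Mi-512Mi memory.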
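For the Goldilocks policy, note that plain patchStrategicMerge mutations
apply at admission time, so namespaces created before the policy keep their
old labels until their next update (or until the policy is extended to
mutate existing resources), which makes a post-rollout spot check worthwhile.
A sketch, again assuming kubectl access; the expected-mode mapping simply
mirrors the two rules above:

    #!/usr/bin/env python3
    # Report each namespace's tier and Goldilocks VPA mode; flag mismatches.
    import json, subprocess

    MODE_LABEL = "goldilocks.fairwinds.com/vpa-update-mode"

    out = subprocess.run(
        ["kubectl", "get", "namespaces", "-o", "json"],
        check=True, capture_output=True, text=True,
    ).stdout

    for ns in json.loads(out)["items"]:
        name = ns["metadata"]["name"]
        labels = ns["metadata"].get("labels", {})
        tier = labels.get("tier", "")
        mode = labels.get(MODE_LABEL, "")
        expected = "off" if tier == "0-core" else "auto"
        note = "" if mode == expected else "  <-- missing or unexpected"
        print(f"{name}: tier={tier or '-'} vpa-update-mode={mode or '-'}{note}")

Tier-0 namespaces (traefik, cloudflared, authentik, technitium, ...) should
report off; everything else auto.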