From 69c4c0c76e9c2b9ad4a6f16a337e0600bd41cc61 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Thu, 26 Feb 2026 23:15:43 +0000
Subject: [PATCH] [ci skip] VPA: reduce LimitRange defaults, add overcommit
 check, protect tier-0

- Reduce Kyverno LimitRange default limits 2-8x across all tiers to fix
  800-900% memory overcommitment on worker nodes
- Add cluster health check #25: per-node resource overcommitment, comparing
  summed requests and limits against allocatable capacity
- Add Kyverno policy for Goldilocks VPA mode by tier: tier-0 namespaces get
  VPA Off mode (recommend only, no evictions) to prevent downtime on critical
  infra (traefik, cloudflared, authentik, technitium, etc.)
- Non-tier-0 namespaces get VPA Auto mode for active right-sizing
---
 scripts/cluster_healthcheck.sh                 | 127 +++++++++++++++++-
 .../modules/kyverno/resource-governance.tf     |  52 +++----
 stacks/platform/modules/vpa/main.tf            |  90 +++++++++++++
 3 files changed, 242 insertions(+), 27 deletions(-)

diff --git a/scripts/cluster_healthcheck.sh b/scripts/cluster_healthcheck.sh
index f8d6ba34..4f2ccb6a 100755
--- a/scripts/cluster_healthcheck.sh
+++ b/scripts/cluster_healthcheck.sh
@@ -26,7 +26,7 @@ JSON=false
 KUBECONFIG_PATH="$(pwd)/config"
 KUBECTL=""
 JSON_RESULTS=()
-TOTAL_CHECKS=24
+TOTAL_CHECKS=25

 # --- Helpers ---
 info() { [[ "$JSON" == true ]] && return 0; echo -e "${BLUE}[INFO]${NC} $*"; }
@@ -1151,6 +1151,130 @@ check_cloudflare_tunnel() {
   fi
 }

+# --- 25. Resource Overcommitment ---
+check_overcommit() {
+  section 25 "Resource Overcommitment"
+  local detail="" had_issue=false status="PASS"
+
+  local node_file pod_file
+  node_file=$(mktemp)
+  pod_file=$(mktemp)
+  trap "rm -f '$node_file' '$pod_file'" RETURN
+
+  $KUBECTL get nodes -o json >"$node_file" 2>/dev/null || { fail "Cannot get nodes"; json_add "overcommit" "FAIL" "Cannot get nodes"; return 0; }
+  $KUBECTL get pods -A -o json --field-selector=status.phase=Running >"$pod_file" 2>/dev/null || { fail "Cannot get pods"; json_add "overcommit" "FAIL" "Cannot get pods"; return 0; }
+
+  local overcommit_info
+  overcommit_info=$(python3 - "$node_file" "$pod_file" <<'PYEOF'
+import json, sys
+
+def parse_cpu(val):
+    val = str(val)
+    if val.endswith("m"):
+        return float(val[:-1])
+    elif val.endswith("n"):
+        return float(val[:-1]) / 1e6
+    return float(val) * 1000
+
+def parse_mem(val):
+    val = str(val)
+    units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4,
+             "K": 1000, "M": 1000**2, "G": 1000**3, "T": 1000**4}
+    for suffix, mult in sorted(units.items(), key=lambda x: -len(x[0])):
+        if val.endswith(suffix):
+            return float(val[:-len(suffix)]) * mult
+    return float(val)
+
+def fmt_mem(b):
+    return f"{b / (1024**3):.1f}Gi"
+
+def fmt_cpu(m):
+    if m >= 1000:
+        return f"{m/1000:.1f}"
+    return f"{m:.0f}m"
+
+with open(sys.argv[1]) as f:
+    nodes = json.load(f)
+with open(sys.argv[2]) as f:
+    pods = json.load(f)
+
+alloc = {}
+for node in nodes["items"]:
+    name = node["metadata"]["name"]
+    a = node["status"].get("allocatable", {})
+    alloc[name] = {
+        "cpu": parse_cpu(a.get("cpu", "0")),
+        "mem": parse_mem(a.get("memory", "0")),
+    }
+
+node_req = {n: {"cpu": 0, "mem": 0} for n in alloc}
+node_lim = {n: {"cpu": 0, "mem": 0} for n in alloc}
+
+for pod in pods["items"]:
+    node = pod.get("spec", {}).get("nodeName")
+    if not node or node not in alloc:
+        continue
+    for c in pod.get("spec", {}).get("containers", []) + pod.get("spec", {}).get("initContainers", []):
+        res = c.get("resources", {})
+        req = res.get("requests", {})
+        lim = res.get("limits", {})
+        node_req[node]["cpu"] += parse_cpu(req.get("cpu", "0"))
+        node_req[node]["mem"] += parse_mem(req.get("memory", "0"))
+        node_lim[node]["cpu"] += parse_cpu(lim.get("cpu", "0"))
+        node_lim[node]["mem"] += parse_mem(lim.get("memory", "0"))
+
+for name in sorted(alloc):
+    a = alloc[name]
+    r = node_req[name]
+    l = node_lim[name]
+    if a["cpu"] <= 0 or a["mem"] <= 0:
+        continue
+
+    req_cpu_pct = (r["cpu"] / a["cpu"]) * 100
+    req_mem_pct = (r["mem"] / a["mem"]) * 100
+    lim_cpu_pct = (l["cpu"] / a["cpu"]) * 100
+    lim_mem_pct = (l["mem"] / a["mem"]) * 100
+
+    level = "OK"
+    if lim_mem_pct > 500 or lim_cpu_pct > 500:
+        level = "FAIL"
+    elif lim_mem_pct > 200 or lim_cpu_pct > 200:
+        level = "WARN"
+
+    print(f"{level}:{name}:req cpu {fmt_cpu(r['cpu'])}/{fmt_cpu(a['cpu'])} ({req_cpu_pct:.0f}%), mem {fmt_mem(r['mem'])}/{fmt_mem(a['mem'])} ({req_mem_pct:.0f}%) | lim cpu {fmt_cpu(l['cpu'])}/{fmt_cpu(a['cpu'])} ({lim_cpu_pct:.0f}%), mem {fmt_mem(l['mem'])}/{fmt_mem(a['mem'])} ({lim_mem_pct:.0f}%)")
+PYEOF
+) || true
+
+  if [[ -z "$overcommit_info" ]]; then
+    pass "Overcommitment data unavailable (skipped)"
+    json_add "overcommit" "PASS" "No data"
+    return 0
+  fi
+
+  while IFS= read -r line; do
+    local level node_detail
+    level=$(echo "$line" | cut -d: -f1)
+    node_detail=$(echo "$line" | cut -d: -f2-)
+
+    if [[ "$level" == "FAIL" ]]; then
+      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Resource Overcommitment"
+      fail "$node_detail"
+      had_issue=true
+      status="FAIL"
+    elif [[ "$level" == "WARN" ]]; then
+      [[ "$had_issue" == false && "$QUIET" == true ]] && section_always 25 "Resource Overcommitment"
+      warn "$node_detail"
+      had_issue=true
+      [[ "$status" != "FAIL" ]] && status="WARN"
+    else
+      pass "$node_detail"
+    fi
+    detail+="$node_detail; "
+  done <<< "$overcommit_info"
+
+  json_add "overcommit" "$status" "$detail"
+}
+
 # --- Summary ---
 print_summary() {
   if [[ "$JSON" == true ]]; then
@@ -1227,6 +1351,7 @@ main() {
   check_tls_certs
   check_gpu
   check_cloudflare_tunnel
+  check_overcommit

   print_summary

   # Exit code: 2 for failures, 1 for warnings, 0 for clean
diff --git a/stacks/platform/modules/kyverno/resource-governance.tf b/stacks/platform/modules/kyverno/resource-governance.tf
index 5aaabf17..e6244d80 100644
--- a/stacks/platform/modules/kyverno/resource-governance.tf
+++ b/stacks/platform/modules/kyverno/resource-governance.tf
@@ -117,16 +117,16 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
         {
           type = "Container"
           default = {
-            cpu = "2"
-            memory = "4Gi"
+            cpu = "500m"
+            memory = "512Mi"
           }
           defaultRequest = {
-            cpu = "100m"
-            memory = "128Mi"
+            cpu = "50m"
+            memory = "64Mi"
           }
           max = {
-            cpu = "8"
-            memory = "16Gi"
+            cpu = "4"
+            memory = "8Gi"
           }
         }
       ]
@@ -163,16 +163,16 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
         {
           type = "Container"
           default = {
-            cpu = "2"
-            memory = "4Gi"
+            cpu = "500m"
+            memory = "512Mi"
           }
           defaultRequest = {
-            cpu = "100m"
-            memory = "128Mi"
+            cpu = "50m"
+            memory = "64Mi"
           }
           max = {
-            cpu = "4"
-            memory = "8Gi"
+            cpu = "2"
+            memory = "4Gi"
           }
         }
       ]
@@ -209,8 +209,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
         {
           type = "Container"
           default = {
-            cpu = "4"
-            memory = "8Gi"
+            cpu = "1"
+            memory = "2Gi"
           }
           defaultRequest = {
             cpu = "100m"
@@ -255,16 +255,16 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
         {
           type = "Container"
           default = {
-            cpu = "1"
-            memory = "2Gi"
+            cpu = "250m"
+            memory = "256Mi"
           }
           defaultRequest = {
-            cpu = "50m"
-            memory = "128Mi"
+            cpu = "25m"
+            memory = "64Mi"
           }
           max = {
-            cpu = "4"
-            memory = "8Gi"
+            cpu = "2"
+            memory = "4Gi"
          }
         }
       ]
@@ -301,8 +301,8 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
         {
           type = "Container"
           default = {
-            cpu = "500m"
-            memory = "1Gi"
+            cpu = "250m"
+            memory = "256Mi"
           }
           defaultRequest = {
             cpu = "25m"
@@ -363,16 +363,16 @@ resource "kubernetes_manifest" "generate_limitrange_by_tier" {
         {
           type = "Container"
           default = {
-            cpu = "500m"
-            memory = "1Gi"
+            cpu = "250m"
+            memory = "256Mi"
           }
           defaultRequest = {
             cpu = "25m"
             memory = "64Mi"
           }
           max = {
-            cpu = "2"
-            memory = "4Gi"
+            cpu = "1"
+            memory = "2Gi"
           }
         }
       ]
diff --git a/stacks/platform/modules/vpa/main.tf b/stacks/platform/modules/vpa/main.tf
index f433f9ec..10c1776c 100644
--- a/stacks/platform/modules/vpa/main.tf
+++ b/stacks/platform/modules/vpa/main.tf
@@ -84,3 +84,93 @@ module "ingress" {

   depends_on = [helm_release.goldilocks]
 }
+
+# -----------------------------------------------------------------------------
+# Kyverno policy — label namespaces for VPA mode by tier
+# -----------------------------------------------------------------------------
+# Goldilocks reads the goldilocks.fairwinds.com/vpa-update-mode label on
+# namespaces to decide the updateMode for VPA objects it creates.
+# Tier 0-core gets "off" (recommend only — these are critical infra where
+# evictions cause downtime). All other namespaces get "auto".
+
+resource "kubernetes_manifest" "vpa_auto_mode_label" {
+  manifest = {
+    apiVersion = "kyverno.io/v1"
+    kind = "ClusterPolicy"
+    metadata = {
+      name = "goldilocks-vpa-auto-mode"
+      annotations = {
+        "policies.kyverno.io/title" = "Goldilocks VPA Mode by Tier"
+        "policies.kyverno.io/description" = "Sets VPA update mode per namespace: Off for tier-0 critical infra (no evictions), Auto for all others."
+      }
+    }
+    spec = {
+      rules = [
+        # Tier 0-core: recommend only, never evict
+        {
+          name = "label-vpa-off-tier-0"
+          match = {
+            any = [
+              {
+                resources = {
+                  kinds = ["Namespace"]
+                  selector = {
+                    matchLabels = {
+                      tier = "0-core"
+                    }
+                  }
+                }
+              }
+            ]
+          }
+          mutate = {
+            patchStrategicMerge = {
+              metadata = {
+                labels = {
+                  "goldilocks.fairwinds.com/vpa-update-mode" = "off"
+                }
+              }
+            }
+          }
+        },
+        # All other namespaces: auto mode
+        {
+          name = "label-vpa-auto-default"
+          match = {
+            any = [
+              {
+                resources = {
+                  kinds = ["Namespace"]
+                }
+              }
+            ]
+          }
+          exclude = {
+            any = [
+              {
+                resources = {
+                  selector = {
+                    matchLabels = {
+                      tier = "0-core"
+                    }
+                  }
+                }
+              }
+            ]
+          }
+          mutate = {
+            patchStrategicMerge = {
+              metadata = {
+                labels = {
+                  "goldilocks.fairwinds.com/vpa-update-mode" = "auto"
+                }
+              }
+            }
+          }
+        },
+      ]
+    }
+  }
+
+  depends_on = [helm_release.goldilocks]
+}
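
--
Review notes (illustrative sketches, not part of the diff and ignored by git am):

Why the old defaults tripped the new check: check #25 sums container limits
per node and flags anything over 200% of allocatable as WARN and over 500% as
FAIL. A minimal standalone sketch of the same math, using a hypothetical 16Gi
worker and a made-up pod count (the parse_mem helper mirrors the one in the
check; none of these numbers come from the cluster):

    #!/usr/bin/env python3
    # Classify memory limit overcommit the way check #25 does (illustrative numbers).

    def parse_mem(val):
        # Kubernetes memory quantity -> bytes ("512Mi" -> 536870912.0).
        units = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "Ti": 1024**4,
                 "K": 1000, "M": 1000**2, "G": 1000**3, "T": 1000**4}
        val = str(val)
        for suffix in sorted(units, key=len, reverse=True):
            if val.endswith(suffix):
                return float(val[:-len(suffix)]) * units[suffix]
        return float(val)

    def classify(pct):
        # Same thresholds the check applies to limit percentages.
        return "FAIL" if pct > 500 else "WARN" if pct > 200 else "OK"

    alloc_mem = parse_mem("16Gi")  # hypothetical worker node
    pods = 34                      # hypothetical pods, all on LimitRange defaults

    for default_limit in ("4Gi", "512Mi"):  # old vs new tier-default limit
        pct = pods * parse_mem(default_limit) / alloc_mem * 100
        print(f"{pods} pods x {default_limit} limits: {pct:.0f}% -> {classify(pct)}")

With the old 4Gi default this prints 850% -> FAIL (squarely in the 800-900%
band the commit message cites); with the new 512Mi default it prints
106% -> OK.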
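To eyeball what the reduced tier defaults actually generated after apply, a
sketch that lists every Container-level default from the cluster's
LimitRanges. It assumes python3 and a kubectl with cluster access on PATH;
the namespaces and names in the output are whatever the Kyverno generate
rules created:

    #!/usr/bin/env python3
    # List Container-level defaults from all LimitRanges (read-only).
    import json, subprocess

    out = subprocess.run(
        ["kubectl", "get", "limitrange", "-A", "-o", "json"],
        check=True, capture_output=True, text=True,
    ).stdout

    for lr in json.loads(out)["items"]:
        ns = lr["metadata"]["namespace"]
        name = lr["metadata"]["name"]
        for item in lr["spec"].get("limits", []):
            if item.get("type") != "Container":
                continue
            d = item.get("default", {})
            m = item.get("max", {})
            print(f"{ns}/{name}: default cpu={d.get('cpu', '-')} "
                  f"memory={d.get('memory', '-')} | max cpu={m.get('cpu', '-')} "
                  f"memory={m.get('memory', '-')}")

Per the hunks above, default limits should now top out at 1 CPU / 2Gi, with
most tiers at 250m-500m CPU and 256Mi-512Mi memory.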
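For the Goldilocks policy, note that plain patchStrategicMerge mutations
apply at admission time, so namespaces created before the policy keep their
old labels until their next update (or until the policy is extended to
mutate existing resources), which makes a post-rollout spot check worthwhile.
A sketch, again assuming kubectl access; the expected-mode mapping simply
mirrors the two rules above:

    #!/usr/bin/env python3
    # Report each namespace's tier and Goldilocks VPA mode; flag mismatches.
    import json, subprocess

    MODE_LABEL = "goldilocks.fairwinds.com/vpa-update-mode"

    out = subprocess.run(
        ["kubectl", "get", "namespaces", "-o", "json"],
        check=True, capture_output=True, text=True,
    ).stdout

    for ns in json.loads(out)["items"]:
        name = ns["metadata"]["name"]
        labels = ns["metadata"].get("labels", {})
        tier = labels.get("tier", "")
        mode = labels.get(MODE_LABEL, "")
        expected = "off" if tier == "0-core" else "auto"
        note = "" if mode == expected else "  <-- missing or unexpected"
        print(f"{name}: tier={tier or '-'} vpa-update-mode={mode or '-'}{note}")

Tier-0 namespaces (traefik, cloudflared, authentik, technitium, ...) should
report off; everything else auto.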