etcd-load-reduction: remove VPA/Goldilocks, disable kyverno reporting, descheduler hourly

The control-plane flap (etcd lease-renewal timeouts) recurred. Rather than move etcd to SSD (code-oflt, deferred again), the chosen direction is to REDUCE etcd load enough that the leader-election-timeout band-aid (renew 10s->30s) becomes removable. These are the big, clean cuts: (1) Remove VPA/Goldilocks (stacks/vpa emptied). All 349 VPAs ran updateMode=Off (no auto-right-sizing) yet cost ~800 etcd objects + continuous recommender writes + a pod-creation admission webhook, purely to feed a dashboard. krr (Dockerized, on-demand) replaces it. Reverses the re-add after memory 2431. (2) Disable kyverno reporting (admission/aggregate/background). policyReports were already off, so the pipeline generated ephemeralreports + an hourly all-resource etcd re-scan for NO user-facing output. Admission enforcement (deny-* policies) and Keel mutation are unaffected; violations surface via Loki->Slack. (3) descheduler */5 -> hourly (fewer list/evict cycles; rebalancing isn't urgent). Deferred (poor ROI / unsafe as planned): ESO refreshInterval 15m->1h is a ~20-stack sprawl for ~0.1 writes/s; keel background=false is invalid for a mutate-existing policy and its churn is apply-time, not steady-state. Both filed as follow-up beads. Post-apply: delete the chart-orphaned VPA CRDs to cascade-clean leftover CRs. Then measure etcd apply-latency and, if it has recovered, revert the leader-election timeouts. Docs updated (VPA/Goldilocks -> krr). See memory 5402-5407. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-12 19:41:22 +00:00 · 2026-06-12 19:41:22 +00:00 · 0216e993dc
commit 0216e993dc
parent 16adda2c48
5 changed files with 32 additions and 184 deletions
--- a/stacks/descheduler/values.yaml
+++ b/stacks/descheduler/values.yaml
@ -52,7 +52,7 @@ namespaceOverride: ""
 commonLabels: {}

 cronJobApiVersion: "batch/v1"
-schedule: "*/5 * * * *"
+schedule: "0 * * * *"  # hourly (was */5; 2026-06-12 etcd-load-reduction — fewer list/evict cycles, rebalancing isn't time-critical)
 suspend: false
 # startingDeadlineSeconds: 200
 successfulJobsHistoryLimit: 10
--- a/stacks/kyverno/modules/kyverno/main.tf
+++ b/stacks/kyverno/modules/kyverno/main.tf
@ -30,9 +30,24 @@ resource "helm_release" "kyverno" {
      forceFailurePolicyIgnore = {
        enabled = true
      }
+      # Reporting fully disabled (2026-06-12, etcd-load-reduction). policyReports
+      # were already off, so admission/aggregate/background reporting generated
+      # ephemeralreports + an hourly all-resource etcd re-scan for NO user-facing
+      # output. Admission enforcement (deny-* policies) and Keel mutation are
+      # independent of reporting; policy violations surface via Loki->Slack. This
+      # removes a steady-state etcd write/scan load (control-plane flap mitigation).
      policyReports = {
        enabled = false
      }
+      admissionReports = {
+        enabled = false
+      }
+      aggregateReports = {
+        enabled = false
+      }
+      backgroundScan = {
+        enabled = false
+      }
    }

    reportsController = {
--- a/stacks/vpa/main.tf
+++ b/stacks/vpa/main.tf
@ -1,7 +1,15 @@
 variable "tls_secret_name" { type = string }

-module "vpa" {
-  source          = "./modules/vpa"
-  tls_secret_name = var.tls_secret_name
-  tier            = local.tiers.cluster
-}
+# VPA / Goldilocks REMOVED 2026-06-12 (etcd-load-reduction; reverses the re-add
+# after memory 2431, ties to code-oflt). All 349 VPAs ran updateMode=Off (no
+# auto-right-sizing) yet cost ~800 etcd objects, continuous recommender writes,
+# and a pod-creation admission webhook — pure etcd overhead feeding only the
+# dashboard. Right-size on demand with krr (Dockerized, no cluster footprint).
+#
+# The `module "vpa"` block was removed so `scripts/tg apply` DESTROYS the helm
+# releases (vpa, goldilocks), the goldilocks-vpa-auto-mode ClusterPolicy, the
+# dashboard ingress, and the vpa namespace. Helm keeps chart-installed CRDs on
+# uninstall, so after the apply delete the VPA CRDs by hand; that cascades to
+# any leftover VPA/checkpoint CRs:
+#   kubectl delete crd verticalpodautoscalers.autoscaling.k8s.io \
+#                      verticalpodautoscalercheckpoints.autoscaling.k8s.io
--- a/stacks/vpa/modules/vpa/main.tf
+++ b/stacks/vpa/modules/vpa/main.tf
@ -1,175 +0,0 @@
-variable "tls_secret_name" {
-  type      = string
-  sensitive = true
-}
-variable "tier" { type = string }
-
-resource "kubernetes_namespace" "vpa" {
-  metadata {
-    name = "vpa"
-    labels = {
-      tier = var.tier
-      "keel.sh/enrolled" = "true"
-    }
-  }
-}
-
-module "tls_secret" {
-  source          = "../../../../modules/kubernetes/setup_tls_secret"
-  namespace       = kubernetes_namespace.vpa.metadata[0].name
-  tls_secret_name = var.tls_secret_name
-}
-
-# -----------------------------------------------------------------------------
-# VPA — Vertical Pod Autoscaler (Fairwinds Helm chart)
-# -----------------------------------------------------------------------------
-resource "helm_release" "vpa" {
-  namespace        = kubernetes_namespace.vpa.metadata[0].name
-  create_namespace = false
-  name             = "vpa"
-  atomic           = true
-
-  repository = "https://charts.fairwinds.com/stable"
-  chart      = "vpa"
-
-  values = [yamlencode({
-    recommender = {
-      enabled = true
-      resources = {
-        requests = {
-          cpu    = "50m"
-          memory = "200Mi"
-        }
-        limits = {
-          memory = "200Mi"
-        }
-      }
-    }
-    updater = {
-      enabled = true
-      resources = {
-        requests = {
-          cpu    = "50m"
-          memory = "200Mi"
-        }
-        limits = {
-          memory = "200Mi"
-        }
-      }
-    }
-    admissionController = {
-      enabled = true
-      resources = {
-        requests = {
-          cpu    = "50m"
-          memory = "200Mi"
-        }
-        limits = {
-          memory = "200Mi"
-        }
-      }
-    }
-  })]
-}
-
-# -----------------------------------------------------------------------------
-# Goldilocks — VPA dashboard (Fairwinds Helm chart)
-# -----------------------------------------------------------------------------
-resource "helm_release" "goldilocks" {
-  namespace        = kubernetes_namespace.vpa.metadata[0].name
-  create_namespace = false
-  name             = "goldilocks"
-  atomic           = true
-
-  repository = "https://charts.fairwinds.com/stable"
-  chart      = "goldilocks"
-
-  values = [yamlencode({
-    controller = {
-      flags = {
-        on-by-default = "true"
-      }
-    }
-    dashboard = {
-      replicaCount = 1
-      flags = {
-        on-by-default = "true"
-      }
-    }
-  })]
-
-  depends_on = [helm_release.vpa]
-}
-
-# -----------------------------------------------------------------------------
-# Ingress — Goldilocks dashboard at goldilocks.viktorbarzin.me
-# -----------------------------------------------------------------------------
-module "ingress" {
-  source          = "../../../../modules/kubernetes/ingress_factory"
-  dns_type        = "proxied"
-  namespace       = kubernetes_namespace.vpa.metadata[0].name
-  name            = "goldilocks"
-  service_name    = "goldilocks-dashboard"
-  port            = 80
-  tls_secret_name = var.tls_secret_name
-  auth            = "required"
-  extra_annotations = {
-    "gethomepage.dev/enabled"      = "true"
-    "gethomepage.dev/name"         = "Goldilocks"
-    "gethomepage.dev/description"  = "Resource recommendations"
-    "gethomepage.dev/icon"         = "mdi-scale-balance"
-    "gethomepage.dev/group"        = "Core Platform"
-    "gethomepage.dev/pod-selector" = ""
-  }
-
-  depends_on = [helm_release.goldilocks]
-}
-
-# -----------------------------------------------------------------------------
-# Kyverno policy — label namespaces for VPA observe-only mode
-# -----------------------------------------------------------------------------
-# Goldilocks reads the goldilocks.fairwinds.com/vpa-update-mode label on
-# namespaces to decide the updateMode for VPA objects it creates.
-# All namespaces get "off" — Terraform is the authoritative source of truth
-# for container resources. Goldilocks provides recommendations only.
-
-resource "kubernetes_manifest" "vpa_auto_mode_label" {
-  manifest = {
-    apiVersion = "kyverno.io/v1"
-    kind       = "ClusterPolicy"
-    metadata = {
-      name = "goldilocks-vpa-auto-mode"
-      annotations = {
-        "policies.kyverno.io/title"       = "Goldilocks VPA Observe-Only Mode"
-        "policies.kyverno.io/description" = "Sets VPA update mode to off for all namespaces. Terraform owns container resources; Goldilocks provides recommendations only."
-      }
-    }
-    spec = {
-      rules = [
-        {
-          name = "label-vpa-off-all"
-          match = {
-            any = [
-              {
-                resources = {
-                  kinds = ["Namespace"]
-                }
-              }
-            ]
-          }
-          mutate = {
-            patchStrategicMerge = {
-              metadata = {
-                labels = {
-                  "goldilocks.fairwinds.com/vpa-update-mode" = "off"
-                }
-              }
-            }
-          }
-        },
-      ]
-    }
-  }
-
-  depends_on = [helm_release.goldilocks]
-}