etcd-load-reduction: remove VPA/Goldilocks, disable kyverno reporting, descheduler hourly
The control-plane flap (etcd lease-renewal timeouts) recurred. Rather than move etcd to SSD (code-oflt, deferred again), the chosen direction is to REDUCE etcd load enough that the leader-election-timeout band-aid (renew 10s->30s) becomes removable. These are the big, clean cuts:

1. Remove VPA/Goldilocks (stacks/vpa emptied). All 349 VPAs ran updateMode=Off (no auto-right-sizing) yet cost ~800 etcd objects + continuous recommender writes + a pod-creation admission webhook, purely to feed a dashboard. krr (Dockerized, on-demand) replaces it. Reverses the re-add after memory 2431.
2. Disable kyverno reporting (admission/aggregate/background). policyReports were already off, so the pipeline generated ephemeralreports + an hourly all-resource etcd re-scan for NO user-facing output. Admission enforcement (deny-* policies) and Keel mutation are unaffected; violations surface via Loki->Slack.
3. descheduler */5 -> hourly (fewer list/evict cycles; rebalancing isn't urgent).

Deferred (poor ROI / unsafe as planned): ESO refreshInterval 15m->1h is a ~20-stack sprawl for ~0.1 writes/s; keel background=false is invalid for a mutate-existing policy and its churn is apply-time not steady-state. Both filed as follow-up beads.

Post-apply: delete the chart-orphaned VPA CRDs to cascade-clean leftover CRs. Then measure etcd apply-latency and revert the timeouts.

Docs updated (VPA/Goldilocks -> krr). See memory 5402-5407.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
16adda2c48
commit
0216e993dc
5 changed files with 32 additions and 184 deletions
|
|
@ -52,7 +52,7 @@ namespaceOverride: ""
|
|||
commonLabels: {}
|
||||
|
||||
cronJobApiVersion: "batch/v1"
|
||||
schedule: "*/5 * * * *"
|
||||
schedule: "0 * * * *" # hourly (was */5; 2026-06-12 etcd-load-reduction — fewer list/evict cycles, rebalancing isn't time-critical)
|
||||
suspend: false
|
||||
# startingDeadlineSeconds: 200
|
||||
successfulJobsHistoryLimit: 10
|
||||
|
|
|
|||
|
|
@ -30,9 +30,24 @@ resource "helm_release" "kyverno" {
|
|||
forceFailurePolicyIgnore = {
|
||||
enabled = true
|
||||
}
|
||||
# Reporting fully disabled (2026-06-12, etcd-load-reduction). policyReports
|
||||
# were already off, so admission/aggregate/background reporting generated
|
||||
# ephemeralreports + an hourly all-resource etcd re-scan for NO user-facing
|
||||
# output. Admission enforcement (deny-* policies) and Keel mutation are
|
||||
# independent of reporting; policy violations surface via Loki->Slack. This
|
||||
# removes a steady-state etcd write/scan load (control-plane flap mitigation).
|
||||
policyReports = {
|
||||
enabled = false
|
||||
}
|
||||
admissionReports = {
|
||||
enabled = false
|
||||
}
|
||||
aggregateReports = {
|
||||
enabled = false
|
||||
}
|
||||
backgroundScan = {
|
||||
enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
reportsController = {
|
||||
|
|
|
|||
|
|
@ -1,7 +1,15 @@
|
|||
variable "tls_secret_name" { type = string }
|
||||
|
||||
module "vpa" {
|
||||
source = "./modules/vpa"
|
||||
tls_secret_name = var.tls_secret_name
|
||||
tier = local.tiers.cluster
|
||||
}
|
||||
# VPA / Goldilocks REMOVED 2026-06-12 (etcd-load-reduction; reverses the re-add
|
||||
# after memory 2431, ties to code-oflt). All 349 VPAs ran updateMode=Off (no
|
||||
# auto-right-sizing) yet cost ~800 etcd objects, continuous recommender writes,
|
||||
# and a pod-creation admission webhook — pure etcd overhead feeding only the
|
||||
# dashboard. Right-size on demand with krr (Dockerized, no cluster footprint).
|
||||
#
|
||||
# The `module "vpa"` block was removed so `scripts/tg apply` DESTROYS the helm
|
||||
# releases (vpa, goldilocks), the goldilocks-vpa-auto-mode ClusterPolicy, the
|
||||
# dashboard ingress, and the vpa namespace. The chart-installed VPA CRDs (Helm
|
||||
# keeps CRDs on uninstall) and any leftover VPA/checkpoint CRs are removed
|
||||
# post-apply (cascade) via:
|
||||
# kubectl delete crd verticalpodautoscalers.autoscaling.k8s.io \
|
||||
# verticalpodautoscalercheckpoints.autoscaling.k8s.io
|
||||
|
|
|
|||
|
|
@ -1,175 +0,0 @@
|
|||
variable "tls_secret_name" {
|
||||
type = string
|
||||
sensitive = true
|
||||
}
|
||||
variable "tier" { type = string }
|
||||
|
||||
resource "kubernetes_namespace" "vpa" {
|
||||
metadata {
|
||||
name = "vpa"
|
||||
labels = {
|
||||
tier = var.tier
|
||||
"keel.sh/enrolled" = "true"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
module "tls_secret" {
|
||||
source = "../../../../modules/kubernetes/setup_tls_secret"
|
||||
namespace = kubernetes_namespace.vpa.metadata[0].name
|
||||
tls_secret_name = var.tls_secret_name
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# VPA — Vertical Pod Autoscaler (Fairwinds Helm chart)
|
||||
# -----------------------------------------------------------------------------
|
||||
resource "helm_release" "vpa" {
|
||||
namespace = kubernetes_namespace.vpa.metadata[0].name
|
||||
create_namespace = false
|
||||
name = "vpa"
|
||||
atomic = true
|
||||
|
||||
repository = "https://charts.fairwinds.com/stable"
|
||||
chart = "vpa"
|
||||
|
||||
values = [yamlencode({
|
||||
recommender = {
|
||||
enabled = true
|
||||
resources = {
|
||||
requests = {
|
||||
cpu = "50m"
|
||||
memory = "200Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "200Mi"
|
||||
}
|
||||
}
|
||||
}
|
||||
updater = {
|
||||
enabled = true
|
||||
resources = {
|
||||
requests = {
|
||||
cpu = "50m"
|
||||
memory = "200Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "200Mi"
|
||||
}
|
||||
}
|
||||
}
|
||||
admissionController = {
|
||||
enabled = true
|
||||
resources = {
|
||||
requests = {
|
||||
cpu = "50m"
|
||||
memory = "200Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "200Mi"
|
||||
}
|
||||
}
|
||||
}
|
||||
})]
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Goldilocks — VPA dashboard (Fairwinds Helm chart)
|
||||
# -----------------------------------------------------------------------------
|
||||
resource "helm_release" "goldilocks" {
|
||||
namespace = kubernetes_namespace.vpa.metadata[0].name
|
||||
create_namespace = false
|
||||
name = "goldilocks"
|
||||
atomic = true
|
||||
|
||||
repository = "https://charts.fairwinds.com/stable"
|
||||
chart = "goldilocks"
|
||||
|
||||
values = [yamlencode({
|
||||
controller = {
|
||||
flags = {
|
||||
on-by-default = "true"
|
||||
}
|
||||
}
|
||||
dashboard = {
|
||||
replicaCount = 1
|
||||
flags = {
|
||||
on-by-default = "true"
|
||||
}
|
||||
}
|
||||
})]
|
||||
|
||||
depends_on = [helm_release.vpa]
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Ingress — Goldilocks dashboard at goldilocks.viktorbarzin.me
|
||||
# -----------------------------------------------------------------------------
|
||||
module "ingress" {
|
||||
source = "../../../../modules/kubernetes/ingress_factory"
|
||||
dns_type = "proxied"
|
||||
namespace = kubernetes_namespace.vpa.metadata[0].name
|
||||
name = "goldilocks"
|
||||
service_name = "goldilocks-dashboard"
|
||||
port = 80
|
||||
tls_secret_name = var.tls_secret_name
|
||||
auth = "required"
|
||||
extra_annotations = {
|
||||
"gethomepage.dev/enabled" = "true"
|
||||
"gethomepage.dev/name" = "Goldilocks"
|
||||
"gethomepage.dev/description" = "Resource recommendations"
|
||||
"gethomepage.dev/icon" = "mdi-scale-balance"
|
||||
"gethomepage.dev/group" = "Core Platform"
|
||||
"gethomepage.dev/pod-selector" = ""
|
||||
}
|
||||
|
||||
depends_on = [helm_release.goldilocks]
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Kyverno policy — label namespaces for VPA observe-only mode
|
||||
# -----------------------------------------------------------------------------
|
||||
# Goldilocks reads the goldilocks.fairwinds.com/vpa-update-mode label on
|
||||
# namespaces to decide the updateMode for VPA objects it creates.
|
||||
# All namespaces get "off" — Terraform is the authoritative source of truth
|
||||
# for container resources. Goldilocks provides recommendations only.
|
||||
|
||||
resource "kubernetes_manifest" "vpa_auto_mode_label" {
|
||||
manifest = {
|
||||
apiVersion = "kyverno.io/v1"
|
||||
kind = "ClusterPolicy"
|
||||
metadata = {
|
||||
name = "goldilocks-vpa-auto-mode"
|
||||
annotations = {
|
||||
"policies.kyverno.io/title" = "Goldilocks VPA Observe-Only Mode"
|
||||
"policies.kyverno.io/description" = "Sets VPA update mode to off for all namespaces. Terraform owns container resources; Goldilocks provides recommendations only."
|
||||
}
|
||||
}
|
||||
spec = {
|
||||
rules = [
|
||||
{
|
||||
name = "label-vpa-off-all"
|
||||
match = {
|
||||
any = [
|
||||
{
|
||||
resources = {
|
||||
kinds = ["Namespace"]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
mutate = {
|
||||
patchStrategicMerge = {
|
||||
metadata = {
|
||||
labels = {
|
||||
"goldilocks.fairwinds.com/vpa-update-mode" = "off"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
depends_on = [helm_release.goldilocks]
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue