From ccf0b2232fd70294255ff50bd0ae645e53c9c74d Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 1 Mar 2026 19:03:49 +0000 Subject: [PATCH] [ci skip] switch VPA to off mode globally, fix Ollama/MySQL resources - Kyverno policy: VPA mode set to 'off' for all namespaces (was 'initial' for non-core). Terraform is now sole authority for container resources. Goldilocks provides recommendations only. - Ollama: add explicit CPU/memory resources (500m/4Gi req, 4/12Gi limit) alongside GPU allocation. Fixes OOMKill from VPA scaling down resources. - MySQL InnoDB Cluster: bump memory limit from 2Gi to 3Gi. - Remove redundant per-namespace VPA opt-out labels from onlyoffice, openclaw, trading-bot (now handled globally by Kyverno policy). --- .claude/CLAUDE.md | 7 ++-- stacks/ollama/main.tf | 6 +++ stacks/onlyoffice/main.tf | 3 +- stacks/openclaw/main.tf | 3 +- stacks/platform/modules/dbaas/main.tf | 4 +- stacks/platform/modules/vpa/main.tf | 54 +++------------------------ stacks/trading-bot/main.tf | 5 +-- 7 files changed, 21 insertions(+), 61 deletions(-) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index a06d698d..3ea930b3 100755 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -175,9 +175,9 @@ Custom quota namespaces: `authentik` (16 req CPU/16Gi req mem/48 lim CPU/96Gi li **LimitRange opt-out**: label `resource-governance/custom-limitrange=true` — skips Kyverno-generated LimitRange, requires a custom `kubernetes_limit_range` in the stack. Used by: `nextcloud` (max 16 CPU/8Gi), `onlyoffice` (max 8 CPU/8Gi). -**Other mutating policies**: `inject-priority-class-from-tier` (sets priorityClassName, **CREATE only**), `inject-ndots` (ndots:2 on all pods), `sync-tier-label-from-namespace`, `goldilocks-vpa-auto-mode` (sets VPA to `initial` for non-core, `off` for core). 
+**Other mutating policies**: `inject-priority-class-from-tier` (sets priorityClassName, **CREATE only**), `inject-ndots` (ndots:2 on all pods), `sync-tier-label-from-namespace`, `goldilocks-vpa-auto-mode` (despite its name, sets VPA to `off` for ALL namespaces — Terraform owns container resources, Goldilocks is observe-only). -**Goldilocks VPA warning**: VPA in Initial mode overrides explicit container resource limits on pod creation. To disable for a deployment: annotate with `goldilocks.fairwinds.com/enabled=false` and set namespace label `goldilocks.fairwinds.com/vpa-update-mode=off`. +**Goldilocks VPA**: VPA is in `off` mode globally — it only provides resource recommendations (via the Goldilocks dashboard) and never mutates pods. Terraform is the sole authority for container resources. **Security policies** (ALL Audit mode, log-only): `deny-privileged-containers`, `deny-host-namespaces`, `restrict-sys-admin`, `require-trusted-registries`. @@ -187,8 +187,7 @@ Custom quota namespaces: `authentik` (16 req CPU/16Gi req mem/48 lim CPU/96Gi li 3. **Evicted?** → aux-tier pods (priority 200K, Never preempt) are first evicted under pressure. 4. **Unexpected limits?** → LimitRange injects defaults when `resources: {}` or no resources block exists. Always set explicit resources. 5. **Need more?** → Set explicit `resources {}` on container (overrides LimitRange defaults) or add `resource-governance/custom-quota=true` label + `resource-governance/custom-limitrange=true` label with custom resources in the stack. -6. **VPA overriding resources?** → Goldilocks VPA in `initial` mode scales down explicit limits. Annotate deployment with `goldilocks.fairwinds.com/enabled=false`. -7. **Pod patch failing with immutable spec?** → Kyverno `inject-priority-class-from-tier` was fixed to CREATE-only. If similar issues arise, check mutating webhooks with `kubectl get mutatingwebhookconfigurations`. +6. 
**Pod patch failing with immutable spec?** → Kyverno `inject-priority-class-from-tier` was fixed to CREATE-only. If similar issues arise, check mutating webhooks with `kubectl get mutatingwebhookconfigurations`. --- diff --git a/stacks/ollama/main.tf b/stacks/ollama/main.tf index 18dba845..0dda7d80 100644 --- a/stacks/ollama/main.tf +++ b/stacks/ollama/main.tf @@ -121,7 +121,13 @@ resource "kubernetes_deployment" "ollama" { mount_path = "/root/.ollama" } resources { + requests = { + cpu = "500m" + memory = "4Gi" + } limits = { + cpu = "4" + memory = "12Gi" "nvidia.com/gpu" = "1" } } diff --git a/stacks/onlyoffice/main.tf b/stacks/onlyoffice/main.tf index d2885d9d..978e730a 100644 --- a/stacks/onlyoffice/main.tf +++ b/stacks/onlyoffice/main.tf @@ -11,8 +11,7 @@ resource "kubernetes_namespace" "onlyoffice" { name = "onlyoffice" labels = { "istio-injection" : "disabled" - tier = local.tiers.edge - "goldilocks.fairwinds.com/vpa-update-mode" = "off" + tier = local.tiers.edge "resource-governance/custom-limitrange" = "true" "resource-governance/custom-quota" = "true" } diff --git a/stacks/openclaw/main.tf b/stacks/openclaw/main.tf index 7c311e33..ce29e9f3 100644 --- a/stacks/openclaw/main.tf +++ b/stacks/openclaw/main.tf @@ -13,8 +13,7 @@ resource "kubernetes_namespace" "openclaw" { metadata { name = "openclaw" labels = { - tier = local.tiers.aux - "goldilocks.fairwinds.com/vpa-update-mode" = "off" + tier = local.tiers.aux } } } diff --git a/stacks/platform/modules/dbaas/main.tf b/stacks/platform/modules/dbaas/main.tf index dea3a43c..95363737 100644 --- a/stacks/platform/modules/dbaas/main.tf +++ b/stacks/platform/modules/dbaas/main.tf @@ -150,7 +150,7 @@ resource "helm_release" "mysql_cluster" { } limits = { cpu = "2" - memory = "2Gi" + memory = "3Gi" } } @@ -176,7 +176,7 @@ resource "helm_release" "mysql_cluster" { cpu = "250m" } limits = { - memory = "2Gi" + memory = "3Gi" cpu = "2" } } diff --git a/stacks/platform/modules/vpa/main.tf 
b/stacks/platform/modules/vpa/main.tf index dadb0c40..b5854ab7 100644 --- a/stacks/platform/modules/vpa/main.tf +++ b/stacks/platform/modules/vpa/main.tf @@ -86,12 +86,12 @@ module "ingress" { } # ----------------------------------------------------------------------------- -# Kyverno policy — label namespaces for VPA mode by tier +# Kyverno policy — label namespaces for VPA observe-only mode # ----------------------------------------------------------------------------- # Goldilocks reads the goldilocks.fairwinds.com/vpa-update-mode label on # namespaces to decide the updateMode for VPA objects it creates. -# Tier 0-core gets "off" (recommend only — these are critical infra where -# evictions cause downtime). All other namespaces get "auto". +# All namespaces get "off" — Terraform is the authoritative source of truth +# for container resources. Goldilocks provides recommendations only. resource "kubernetes_manifest" "vpa_auto_mode_label" { manifest = { @@ -100,25 +100,19 @@ resource "kubernetes_manifest" "vpa_auto_mode_label" { metadata = { name = "goldilocks-vpa-auto-mode" annotations = { - "policies.kyverno.io/title" = "Goldilocks VPA Mode by Tier" - "policies.kyverno.io/description" = "Sets VPA update mode per namespace: Off for tier-0 critical infra (no evictions), Auto for all others." + "policies.kyverno.io/title" = "Goldilocks VPA Observe-Only Mode" + "policies.kyverno.io/description" = "Sets VPA update mode to off for all namespaces. Terraform owns container resources; Goldilocks provides recommendations only." 
} } spec = { rules = [ - # Tier 0-core: recommend only, never evict { - name = "label-vpa-off-tier-0" + name = "label-vpa-off-all" match = { any = [ { resources = { kinds = ["Namespace"] - selector = { - matchLabels = { - tier = "0-core" - } - } } } ] @@ -133,42 +127,6 @@ resource "kubernetes_manifest" "vpa_auto_mode_label" { } } }, - # All other namespaces: initial mode (compatible with Terraform — - # VPA mutates pods at creation, not the deployment spec) - { - name = "label-vpa-initial-default" - match = { - any = [ - { - resources = { - kinds = ["Namespace"] - } - } - ] - } - exclude = { - any = [ - { - resources = { - selector = { - matchLabels = { - tier = "0-core" - } - } - } - } - ] - } - mutate = { - patchStrategicMerge = { - metadata = { - labels = { - "goldilocks.fairwinds.com/vpa-update-mode" = "initial" - } - } - } - } - }, ] } } diff --git a/stacks/trading-bot/main.tf b/stacks/trading-bot/main.tf index c170bd59..187f5970 100644 --- a/stacks/trading-bot/main.tf +++ b/stacks/trading-bot/main.tf @@ -47,9 +47,8 @@ resource "kubernetes_namespace" "trading-bot" { metadata { name = "trading-bot" labels = { - tier = local.tiers.edge - "resource-governance/custom-quota" = "true" - "goldilocks.fairwinds.com/vpa-update-mode" = "off" + tier = local.tiers.edge + "resource-governance/custom-quota" = "true" } } }