From 25fcf80651adcf6c56f3bf4014c6a19edce88323 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 17 May 2026 00:07:32 +0000 Subject: [PATCH] =?UTF-8?q?keel:=20expand=20critical-namespace=20exclude?= =?UTF-8?q?=20list=20=E2=80=94=20protects=20vault/cnpg/authentik/etc.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2026-05-17 incident: Keel rolled authentik 2026.2.2 → 2026.2.3 around 23:36. The force+match-tag pairing should have constrained Keel to digest-only on the current tag (not switch to a new tag), but a race between Kyverno's mutate (injecting match-tag) and Keel's hourly poll caused the workload to still have the old `force`-only annotation when Keel acted. Result: tag rewrite, pods cycled, pgbouncer connection failures, login broken. Manual rollback: `kubectl rollout undo` on all 5 authentik deployments back to 2026.2.2. Auth restored within ~5 min. Going forward, critical-namespace workloads are excluded at the policy level so this race can't recur. They get upgraded via TF (Helm chart version bumps) on a deliberate cadence, never by Keel. Live state: 36 workloads on policy=never (35 critical + chrome-service pin + 7 CI-driven self-hosted from earlier), 190 on policy=force+match-tag for opt-out-pure auto-update on the remaining stateless apps. This matches user direction (2026-05-17): "upgrading is fine as long as we upgrade correctly and the latest version is healthy" + "keel responsible for the latest version, phased rollout, graceful". 
Co-Authored-By: Claude Opus 4.7 --- .../modules/kyverno/keel-annotations.tf | 57 +++++++++++++++++-- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/stacks/kyverno/modules/kyverno/keel-annotations.tf b/stacks/kyverno/modules/kyverno/keel-annotations.tf index e1f6cba3..4af50641 100644 --- a/stacks/kyverno/modules/kyverno/keel-annotations.tf +++ b/stacks/kyverno/modules/kyverno/keel-annotations.tf @@ -56,12 +56,57 @@ resource "kubernetes_manifest" "policy_inject_keel_annotations" { any = [ { resources = { - # Keel must not auto-update itself (decision #11). - # calico-system: managed by tigera-operator via Installation CR. - # Keel rewriting the calico-node DaemonSet image causes an - # hourly fight loop (Keel → v3.26.5, operator → v3.26.1). - # Calico version is bumped manually via the Installation CR. - namespaces = ["keel", "calico-system"] + # Namespaces that must NEVER be auto-updated by Keel. + # Each has a domain-aware upgrade flow (operator, Helm chart + # version bump, schema migration, etc.) that Keel would fight. 
+ # + # - keel: supervisor self-update (decision #11) + # - calico-system: tigera-operator owns Installation CR + # - authentik: 2026-05-17 incident — patch bump 2026.2.2→2026.2.3 + # broke pgbouncer connections; rolled back manually + # - vault, cnpg-system, dbaas: state-coupled with TF backend + # - monitoring: kube-prometheus-stack multi-component coordination + # - traefik, metallb-system, technitium: networking critical path + # - kyverno, external-secrets, sealed-secrets, reloader, + # descheduler, vpa, kube-system: cluster-level operators + # - proxmox-csi, nfs-csi, nvidia, tigera-operator: hardware/CNI + # coordination + # - cloudflared, headscale, wireguard, xray: VPN/tunnel critical + # - mailserver, crowdsec, redis, reverse-proxy: stateful critical + # - infra-maintenance, metrics-server: cluster utilities + namespaces = [ + "keel", + "calico-system", + "authentik", + "vault", + "cnpg-system", + "dbaas", + "monitoring", + "traefik", + "technitium", + "mailserver", + "kyverno", + "metallb-system", + "external-secrets", + "proxmox-csi", + "nfs-csi", + "nvidia", + "kube-system", + "cloudflared", + "crowdsec", + "reverse-proxy", + "reloader", + "descheduler", + "vpa", + "redis", + "sealed-secrets", + "headscale", + "wireguard", + "xray", + "infra-maintenance", + "metrics-server", + "tigera-operator", + ] } }, {