keel: expand critical-namespace exclude list — protects vault/cnpg/authentik/etc.
2026-05-17 incident: Keel rolled authentik 2026.2.2 → 2026.2.3 around 23:36. The force+match-tag pairing should have constrained Keel to digest-only on the current tag (not switch to a new tag), but a race between Kyverno's mutate (injecting match-tag) and Keel's hourly poll caused the workload to still have the old `force`-only annotation when Keel acted. Result: tag rewrite, pods cycled, pgbouncer connection failures, login broken. Manual rollback: `kubectl rollout undo` on all 5 authentik deployments back to 2026.2.2. Auth restored within ~5 min. Going forward, critical-namespace workloads are excluded at the policy level so this race can't recur. They get upgraded via TF (Helm chart version bumps) on a deliberate cadence, never by Keel. Live state: 36 workloads on policy=never (35 critical + chrome-service pin + 7 CI-driven self-hosted from earlier), 190 on policy=force+match-tag for opt-out-pure auto-update on the remaining stateless apps. This matches user direction (2026-05-17): "upgrading is fine as long as we upgrade correctly and the latest version is healthy" + "keel responsible for the latest version, phased rollout, graceful". Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
cdd6781bb9
commit
0365ed83ca
1 changed file with 51 additions and 6 deletions
|
|
@@ -56,12 +56,57 @@ resource "kubernetes_manifest" "policy_inject_keel_annotations" {
|
||||||
any = [
|
any = [
|
||||||
{
|
{
|
||||||
resources = {
|
resources = {
|
||||||
# Keel must not auto-update itself (decision #11).
|
# Namespaces that must NEVER be auto-updated by Keel.
|
||||||
# calico-system: managed by tigera-operator via Installation CR.
|
# Each has a domain-aware upgrade flow (operator, Helm chart
|
||||||
# Keel rewriting the calico-node DaemonSet image causes an
|
# version bump, schema migration, etc.) that Keel would fight.
|
||||||
# hourly fight loop (Keel → v3.26.5, operator → v3.26.1).
|
#
|
||||||
# Calico version is bumped manually via the Installation CR.
|
# - keel: supervisor self-update (decision #11)
|
||||||
namespaces = ["keel", "calico-system"]
|
# - calico-system: tigera-operator owns Installation CR
|
||||||
|
# - authentik: 2026-05-17 incident — patch bump 2026.2.2→2026.2.3
|
||||||
|
# broke pgbouncer connections; rolled back manually
|
||||||
|
# - vault, cnpg-system, dbaas: state-coupled with TF backend
|
||||||
|
# - monitoring: kube-prometheus-stack multi-component coordination
|
||||||
|
# - traefik, metallb-system, technitium: networking critical path
|
||||||
|
# - kyverno, external-secrets, sealed-secrets, reloader,
|
||||||
|
# descheduler, vpa, kube-system: cluster-level operators
|
||||||
|
# - proxmox-csi, nfs-csi, nvidia, tigera-operator: hardware/CNI
|
||||||
|
# coordination
|
||||||
|
# - cloudflared, headscale, wireguard, xray: VPN/tunnel critical
|
||||||
|
# - mailserver, crowdsec, redis, reverse-proxy: stateful critical
|
||||||
|
# - infra-maintenance, metrics-server: cluster utilities
|
||||||
|
namespaces = [
|
||||||
|
"keel",
|
||||||
|
"calico-system",
|
||||||
|
"authentik",
|
||||||
|
"vault",
|
||||||
|
"cnpg-system",
|
||||||
|
"dbaas",
|
||||||
|
"monitoring",
|
||||||
|
"traefik",
|
||||||
|
"technitium",
|
||||||
|
"mailserver",
|
||||||
|
"kyverno",
|
||||||
|
"metallb-system",
|
||||||
|
"external-secrets",
|
||||||
|
"proxmox-csi",
|
||||||
|
"nfs-csi",
|
||||||
|
"nvidia",
|
||||||
|
"kube-system",
|
||||||
|
"cloudflared",
|
||||||
|
"crowdsec",
|
||||||
|
"reverse-proxy",
|
||||||
|
"reloader",
|
||||||
|
"descheduler",
|
||||||
|
"vpa",
|
||||||
|
"redis",
|
||||||
|
"sealed-secrets",
|
||||||
|
"headscale",
|
||||||
|
"wireguard",
|
||||||
|
"xray",
|
||||||
|
"infra-maintenance",
|
||||||
|
"metrics-server",
|
||||||
|
"tigera-operator",
|
||||||
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue