From f325b949be83448bcdbc112a66c34d14d1ebab3d Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 26 May 2026 19:06:51 +0000 Subject: [PATCH] keel: re-enable with policy=patch (semver-bounded) + fix CI deny-privileged MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-enables Keel after the 2026-05-26 emergency stop, with a safer default. Switch Kyverno-injected default from `force + match-tag=true` (proven unreliable — it rewrote tag strings cluster-wide despite the design intent) to `patch`, which is semver-parser-bounded: - Only patch bumps within current major.minor (1.2.3 → 1.2.4, never 1.3.x or 2.x — the parser does the math, not string compare). - Non-semver tags (`:latest`, `:v4`, `:2`, SHA, `:nightly`) are IGNORED entirely. No tag rewriting under any code path. - 151 stale `force` annotations migrated to `patch` cluster-wide during this apply (anchor `+()` dropped, then re-added). Live state after this commit: 0 workloads on `force`, 209 on `patch`, 22 on `never`. Keel deployment back to 1/1 on `:0.21.1`. Note: 22 workloads with `keel.sh/policy=never` LABEL had their annotation mutated to `patch` during the migration despite Kyverno's matchLabels-based exclude rule — appears to be a quirk of `mutateExistingOnPolicyUpdate` not honoring `selector` excludes. Repatched all 22 back to `annotation=never` via `kubectl annotate --overwrite`, then restored the `+(keel.sh/policy)` anchor in the policy so future Kyverno reconciles preserve them. Also fixes CI build-cli workflow which was blocked by `deny-privileged-containers` since wave 1 enforce flip on 2026-05-18: woodpecker namespace added to the shared security_policy_exclude_namespaces list (CI pipeline pods `wp-*` run privileged docker builds, legitimate use). 
The `default` workflow (terragrunt apply) was already passing — only the parallel `build-cli` workflow (which builds the infra-cli docker image) was failing, but it took the overall pipeline status down with it. Co-Authored-By: Claude Opus 4.7 --- stacks/keel/main.tf | 21 ++++----- .../modules/kyverno/keel-annotations.tf | 43 ++++++++++++++++--- .../modules/kyverno/security-policies.tf | 1 + 3 files changed, 48 insertions(+), 17 deletions(-) diff --git a/stacks/keel/main.tf b/stacks/keel/main.tf index e68b8576..e29fda5a 100644 --- a/stacks/keel/main.tf +++ b/stacks/keel/main.tf @@ -46,16 +46,17 @@ resource "helm_release" "keel" { atomic = true values = [yamlencode({ - # EMERGENCY STOP — scaled to 0 on 2026-05-26 16:42 UTC. Keel was actively - # rewriting tag strings (not just digests) despite the - # `keel.sh/match-tag=true` annotation injected by Kyverno that's supposed - # to constrain it to digest-only watches. Known casualties this round: - # uptime-kuma (2 → 1, 4h CrashLoopBackOff), n8n (1.80.5 → 0.1.2, silent - # degradation), beads-server/dolt-workbench (0.3.73 → 0.1.0), and ~10 - # other deployments with downgrade-flavored change-cause annotations. - # Re-enable only after root-causing why match-tag isn't being enforced, - # OR after migrating each app to a content-addressed (SHA) tag pin. - replicaCount = 0 + # 2026-05-26 17:30: re-enabled after switching the Kyverno-injected + # default from `force + match-tag=true` (proven unreliable — see + # stacks/kyverno/modules/kyverno/keel-annotations.tf) to `patch` which + # is semver-parser-bounded. Under `patch`: + # - Semver-tagged workloads get patch bumps only (1.2.3 → 1.2.4). + # - Float / SHA / non-semver tags are IGNORED — no tag rewriting. + # The 2026-05-26 emergency stop (replicaCount=0) is reverted now + # that the default is safe. Workloads pinned out-of-band (uptime-kuma) + # stay opted out via their keel.sh/policy=never ANNOTATION; the matching + # LABEL only feeds the Kyverno exclude rule (defense-in-depth). 
+ replicaCount = 1 # Prometheus pod-annotation scrape — picks up Keel-specific metrics # (pending_approvals, poll_trigger_tracked_images, registries_scanned_total{image,registry}) # on container port 9300 /metrics. The cluster's `kubernetes-pods` diff --git a/stacks/kyverno/modules/kyverno/keel-annotations.tf b/stacks/kyverno/modules/kyverno/keel-annotations.tf index 3d2970d8..bb0ef70e 100644 --- a/stacks/kyverno/modules/kyverno/keel-annotations.tf +++ b/stacks/kyverno/modules/kyverno/keel-annotations.tf @@ -177,13 +177,42 @@ resource "kubectl_manifest" "policy_inject_keel_annotations" { # to bypass this mutation) # Per-namespace opt-out: # Remove the `keel.sh/enrolled=true` namespace label. - # `+(...)` anchor — only add if not present. This preserves - # per-workload overrides set out-of-band (e.g. `never` for - # phased rollout). Without the anchor, every policy update - # would overwrite existing annotations, breaking the phased - # rollout state. - "+(keel.sh/policy)" = "force" - "+(keel.sh/match-tag)" = "true" + # 2026-05-26: switched default from `force + match-tag=true` + # to `patch` after the 2026-05-26 incident proved match-tag + # does NOT reliably constrain Keel — tag strings got rewritten + # (uptime-kuma :2→:1, n8n :1.80.5→:0.1.2, dolt-workbench + # :0.3.73→:0.1.0, wealthfolio :3.2.1→:2.0→:3.2 truncated). + # + # `patch` is semver-parser-bounded: + # - Only patch bumps within current major.minor + # (e.g. 1.2.3 → 1.2.4; never 1.3.x or 2.x). + # - Non-semver tags (`:latest`, `:v4`, `:2`, SHA, `:nightly`) + # are IGNORED entirely — Keel does nothing for them. + # - No more string-comparison surprises. + # + # `match-tag` annotation dropped — it was only meaningful as + # the (failed) safety net under `force`. Irrelevant under + # semver-bounded policies. + # + # `+(...)` anchor = "add only if missing". 
With the anchor, + # this policy ONLY sets defaults on new workloads — existing + # per-workload overrides (set via TF or kubectl annotate) + # are preserved across policy updates. This was DROPPED for + # one apply on 2026-05-26 to migrate the 151 stale `force` + # annotations to `patch`, then re-added in the same session + # after observing that the label-based exclude rule below + # doesn't reliably filter mutateExistingOnPolicyUpdate scans + # (22 workloads with LABEL keel.sh/policy=never still got + # their ANNOTATION rewritten and had to be repatched). Keep + # the anchor unless you genuinely want a cluster-wide flip. + # + # To override per workload, set the ANNOTATION directly: + # - keel.sh/policy=never (Keel won't touch) + # - keel.sh/policy=minor (wider semver bumps, still bounded) + # - keel.sh/policy=major (any semver bump) + # The corresponding LABEL keel.sh/policy=never is for the + # exclude rule below (defense-in-depth against future mutations). + "+(keel.sh/policy)" = "patch" "+(keel.sh/trigger)" = "poll" "+(keel.sh/pollSchedule)" = "@every 1h" } diff --git a/stacks/kyverno/modules/kyverno/security-policies.tf b/stacks/kyverno/modules/kyverno/security-policies.tf index 163031ec..4eb2b5c4 100644 --- a/stacks/kyverno/modules/kyverno/security-policies.tf +++ b/stacks/kyverno/modules/kyverno/security-policies.tf @@ -26,6 +26,7 @@ locals { "kured", # kured DaemonSet is privileged (manages node reboots) "default", # etcd backup + defrag CronJobs use hostNetwork "changedetection", # uses SYS_ADMIN for chromium sandbox + "woodpecker", # CI pipeline pods (wp-*) run privileged docker builds ] }