From 15c88bc683b6a901a2d141c0771cee955f5ad6ae Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 26 May 2026 21:53:10 +0000 Subject: [PATCH] keel: belt-and-suspenders opt-out for mysql/redis/nvidia-exporter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After re-enabling Keel with `policy: patch` (commit f325b949), 3 of the 60 first-hour bumps broke things and need explicit cluster-wide opt-out so future Kyverno reconciles can't put them back under auto-update: - `dbaas/mysql-standalone`: patch-bumped `mysql:8.4.8 → :8.4.9` and the data-dictionary (DD) upgrade stalled (tracked in beads as `code-963q` — the 8.4.9 jump needs a wipe+reinit, not a rolling upgrade). The StatefulSet already had `annotation=never` from TF but was missing the LABEL — Kyverno's selector exclude reads the LABEL, so a reconcile that dropped the annotation could resume auto-update. Added the LABEL. - `redis/redis-v2`: patch-bumped `redis:8-alpine → :8.0.6-alpine` and the new image rejected the `aof-load-corrupt-tail-max-size` directive from commit 1eee56d0 → redis-v2-2 CrashLoopBackOff. Plus, :8.0.6 is semantically older than :8-alpine (which resolves to :8.6.2) — same Keel tag-picking pathology as the 2026-05-26 morning incident, just in a different shape. LABEL + ANNOTATION both added. - `nvidia/nvidia-exporter`: Keel rewrote `:latest → :4.5.2-4.8.1-ubuntu22.04` and the new dcgm-exporter OOMKilled at the 192Mi memory limit (4 restarts before I caught it). Added LABEL + ANNOTATION for opt-out, AND bumped memory request/limit 192Mi → 256Mi/512Mi so the bumped image doesn't OOM (older versions fit in 192Mi; the bumped one needs ~250Mi steady-state). The 56 other Keel bumps in that 10-minute window (coredns 1.12.1→1.12.4, kyverno 1.16.1→1.16.4, nextcloud 32.0.3→32.0.9, grafana 12.3.1→12.3.6, cnpg, mailserver, csi-nfs, metrics-server, etc.) landed cleanly — the `patch` policy is the right default. Per-workload `never` opt-out is the maintenance cost. 
Co-Authored-By: Claude Opus 4.7 --- stacks/dbaas/modules/dbaas/main.tf | 8 ++++++++ stacks/nvidia/modules/nvidia/main.tf | 15 +++++++++++++-- stacks/redis/modules/redis/main.tf | 10 ++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/stacks/dbaas/modules/dbaas/main.tf b/stacks/dbaas/modules/dbaas/main.tf index 94236930..e3776b51 100644 --- a/stacks/dbaas/modules/dbaas/main.tf +++ b/stacks/dbaas/modules/dbaas/main.tf @@ -130,6 +130,14 @@ resource "kubernetes_stateful_set_v1" "mysql_standalone" { "app.kubernetes.io/name" = "mysql" "app.kubernetes.io/instance" = "mysql-standalone" "app.kubernetes.io/component" = "primary" + # 2026-05-26: defense-in-depth on top of the annotation below. The + # Kyverno `inject-keel-annotations` ClusterPolicy reads this LABEL + # via its `exclude.any[].resources.selector.matchLabels` rule, so + # even if the dbaas namespace exclude were lost the label still + # bypasses the mutation. Without the label, a Kyverno reconcile + # silently overwrote our annotation=never → patch during this incident + # and Keel patch-bumped mysql:8.4.8 → 8.4.9, stalling the DD upgrade. + "keel.sh/policy" = "never" } # Explicit Keel opt-out. The dbaas namespace is already excluded # from the `inject-keel-annotations` Kyverno ClusterPolicy, but the diff --git a/stacks/nvidia/modules/nvidia/main.tf b/stacks/nvidia/modules/nvidia/main.tf index 2c285517..2268baf4 100644 --- a/stacks/nvidia/modules/nvidia/main.tf +++ b/stacks/nvidia/modules/nvidia/main.tf @@ -137,6 +137,14 @@ resource "kubernetes_deployment" "nvidia-exporter" { labels = { app = "nvidia-exporter" tier = var.tier + # 2026-05-26: Keel tag-rewrote :latest → :4.5.2-4.8.1-ubuntu22.04 + # and the new image OOMs at 192Mi. Adding both LABEL + ANNOTATION + # to opt out of Keel cluster-wide auto-update — bump nvidia images + # in a separate planned change once we've sized the memory limit. 
+ "keel.sh/policy" = "never" + } + annotations = { + "keel.sh/policy" = "never" } } spec { @@ -176,10 +184,13 @@ resource "kubernetes_deployment" "nvidia-exporter" { } resources { requests = { - memory = "192Mi" + memory = "256Mi" } limits = { - memory = "192Mi" + # Bumped 192Mi → 512Mi (2026-05-26): dcgm-exporter + # 4.5.2-4.8.1-ubuntu22.04 OOMKills at 192Mi. Older versions + # ran comfortably under 192Mi but post-bump we need headroom. + memory = "512Mi" "nvidia.com/gpu" = "1" } } diff --git a/stacks/redis/modules/redis/main.tf b/stacks/redis/modules/redis/main.tf index ee358b06..898fab34 100644 --- a/stacks/redis/modules/redis/main.tf +++ b/stacks/redis/modules/redis/main.tf @@ -428,6 +428,16 @@ resource "kubernetes_stateful_set_v1" "redis_v2" { namespace = kubernetes_namespace.redis.metadata[0].name labels = { app = "redis-v2" + # 2026-05-26: Keel patch-bumped :8-alpine → :8.0.6-alpine, which + # rejected the `aof-load-corrupt-tail-max-size` config and crashed + # redis-v2-2. The bump is also semantically a downgrade (8-alpine is + # 8.6.2, 8.0.6 is older). Both LABEL + ANNOTATION are required for + # full opt-out: label drives Kyverno's selector exclude, annotation + # drives Keel's own gate. + "keel.sh/policy" = "never" + } + annotations = { + "keel.sh/policy" = "never" } } spec {