From e5250f417e0308c628e164350527907dae1b5f54 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 20 Jun 2026 08:14:50 +0000 Subject: [PATCH] k8s-version-upgrade: compat gate must not false-block patch upgrades MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compat gate compared every addon's matrix ceiling against the target k8s minor unconditionally. That is correct for a minor JUMP, but it also blocked patch upgrades within the minor the cluster is ALREADY running: ESO v0.12's matrix ceiling is 1.31, the cluster runs 1.34.9, so a target of 1.34.10 (a patch) was refused with "external-secrets supports k8s <= 1.31; target 1.34 exceeds it" — even though the running cluster is itself proof ESO 0.12 works on 1.34. That silently defeats autonomous patching (it would have bitten the moment a 1.34.10 was published). Fix: a target at or below the running minor crosses into no new k8s minor, so every installed addon is already empirically proven on it — check_addons now returns no reasons when the running minor is known and target_minor <= running_minor. If the running minor cannot be determined, the addon checks still run in full (fail-safe). Added running_minor() (oldest kubelet across nodes, mirroring the detector; RUNNING_K8S env override for tests) and passed its result into check_addons. Minor jumps are unchanged: 1.34->1.35 still blocks on ESO 0.12 + kyverno 1.16. The removed-API and containerd checks are naturally inert for patches (no API removal / containerd floor inside a minor) and keep running as defence. Added test_compat_gate.py (8 cases) covering both paths. Verified end-to-end against live Prometheus: target 1.34.10 -> EXIT 0 (safe), target 1.35.6 -> EXIT 2 (blocked on ESO+kyverno).
Co-Authored-By: Claude Opus 4.8 --- .../scripts/compat-gate.py | 29 +++++- .../scripts/test_compat_gate.py | 97 +++++++++++++++++++ 2 files changed, 124 insertions(+), 2 deletions(-) create mode 100644 stacks/k8s-version-upgrade/scripts/test_compat_gate.py diff --git a/stacks/k8s-version-upgrade/scripts/compat-gate.py b/stacks/k8s-version-upgrade/scripts/compat-gate.py index 1c8895b2..22c95ac2 100644 --- a/stacks/k8s-version-upgrade/scripts/compat-gate.py +++ b/stacks/k8s-version-upgrade/scripts/compat-gate.py @@ -46,7 +46,31 @@ def kget(args): return "" -def check_addons(matrix, tgt): +def running_minor(): + """Oldest kubelet minor across all nodes, as a (major, minor) tuple. + + Mirrors the detector's "oldest kubelet" choice so a partially-upgraded + cluster is judged by its lowest node, not its newest. RUNNING_K8S overrides + for local testing. None if undeterminable (treated as a minor jump → the + addon checks run in full, fail-safe).""" + env = os.environ.get("RUNNING_K8S") + if env: + return minor(env) + out = kget(["get", "nodes", "-o", + "jsonpath={range .items[*]}{.status.nodeInfo.kubeletVersion}{\"\\n\"}{end}"]) + minors = [minor(line) for line in out.splitlines() if minor(line)] + return min(minors) if minors else None + + +def check_addons(matrix, tgt, running): + # A target at or below the RUNNING minor (a patch, or a same/lower minor) + # crosses into no new k8s minor, so every installed addon is already + # empirically proven on it — addon ceilings only constrain a true minor jump. + # Without this guard an addon whose matrix ceiling sits below the running + # minor (e.g. ESO 0.12 → 1.31 on a cluster already running 1.34) would + # false-block legitimate patch upgrades, defeating autonomous patching. 
+ if running and tgt <= running: + return [] reasons = [] for a in matrix.get("addons", []): img = kget(["-n", a["namespace"], "get", a["kind"], a["resource"], @@ -127,7 +151,8 @@ def main(): print(f"could not parse compat matrix JSON: {e}") sys.exit(3) - reasons = (check_addons(matrix, tgt) + running = running_minor() + reasons = (check_addons(matrix, tgt, running) + check_removed_apis(tgt) + check_containerd(matrix, tgt)) if reasons: diff --git a/stacks/k8s-version-upgrade/scripts/test_compat_gate.py b/stacks/k8s-version-upgrade/scripts/test_compat_gate.py new file mode 100644 index 00000000..3b688f6e --- /dev/null +++ b/stacks/k8s-version-upgrade/scripts/test_compat_gate.py @@ -0,0 +1,97 @@ +"""Unit tests for the k8s-upgrade compat gate (compat-gate.py). + +Run: pytest stacks/k8s-version-upgrade/scripts/test_compat_gate.py + +The module filename has a hyphen so it is loaded via importlib rather than a +plain import. kget() (kubectl) is monkeypatched so the addon checks read a +controlled "running" image without a live cluster. +""" +import importlib.util +import pathlib + +HERE = pathlib.Path(__file__).parent +_spec = importlib.util.spec_from_file_location("compat_gate", HERE / "compat-gate.py") +cg = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(cg) + +# Single-addon matrices keep each test's intent obvious. 
+ESO_MATRIX = { + "addons": [{ + "name": "external-secrets", + "namespace": "external-secrets", + "kind": "deployment", + "resource": "external-secrets", + "image_re": r"external-secrets:v(\d+\.\d+)", + "max_k8s": {"0.12": "1.31", "2.0": "1.35"}, + }] +} +CALICO_MATRIX = { + "addons": [{ + "name": "calico", + "namespace": "calico-system", + "kind": "daemonset", + "resource": "calico-node", + "image_re": r"node:v(\d+\.\d+)", + "max_k8s": {"3.26": "1.28", "3.30": "1.35", "3.32": "1.36"}, + }] +} + + +def _img(monkeypatch, image): + monkeypatch.setattr(cg, "kget", lambda args: image) + + +def test_minor_jump_blocks_when_addon_ceiling_below_target(monkeypatch): + # running 1.34, target 1.35: ESO 0.12 ceiling 1.31 < 1.35 -> block. + _img(monkeypatch, "external-secrets/external-secrets:v0.12.1") + reasons = cg.check_addons(ESO_MATRIX, (1, 35), (1, 34)) + assert any("external-secrets" in r for r in reasons), reasons + + +def test_patch_within_running_minor_not_blocked(monkeypatch): + # running 1.34, target 1.34.x patch: ceiling 1.31 < 1.34, BUT the cluster + # already runs ESO 0.12 on 1.34, so a patch is empirically safe -> no block. + _img(monkeypatch, "external-secrets/external-secrets:v0.12.1") + reasons = cg.check_addons(ESO_MATRIX, (1, 34), (1, 34)) + assert reasons == [], reasons + + +def test_target_below_running_not_blocked(monkeypatch): + # defensive: a target minor below running is never addon-blocked. + _img(monkeypatch, "external-secrets/external-secrets:v0.12.1") + reasons = cg.check_addons(ESO_MATRIX, (1, 33), (1, 34)) + assert reasons == [], reasons + + +def test_same_minor_addon_supports_target(monkeypatch): + # running 1.34, target 1.35, calico 3.30 supports 1.35 -> no block. 
+ _img(monkeypatch, "quay.io/calico/node:v3.30.7") + reasons = cg.check_addons(CALICO_MATRIX, (1, 35), (1, 34)) + assert reasons == [], reasons + + +def test_unreadable_addon_failsafe_on_minor_jump(monkeypatch): + # can't read running version on a real minor jump -> fail safe (block). + _img(monkeypatch, "") + reasons = cg.check_addons(ESO_MATRIX, (1, 35), (1, 34)) + assert any("upgrade blind" in r or "could not read" in r for r in reasons), reasons + + +def test_unreadable_addon_ignored_on_patch(monkeypatch): + # patch within running minor: addon checks are skipped entirely, so an + # unreadable image must NOT fail-safe-block a legitimate patch. + _img(monkeypatch, "") + reasons = cg.check_addons(ESO_MATRIX, (1, 34), (1, 34)) + assert reasons == [], reasons + + +def test_running_minor_env_override(monkeypatch): + monkeypatch.setenv("RUNNING_K8S", "1.34.9") + assert cg.running_minor() == (1, 34) + + +def test_running_minor_from_kubectl(monkeypatch): + monkeypatch.delenv("RUNNING_K8S", raising=False) + # oldest kubelet wins (mirrors the detector): node2 on 1.33 is the floor. + monkeypatch.setattr(cg, "kget", lambda args: "v1.34.9\nv1.33.5\nv1.34.9") + assert cg.running_minor() == (1, 33)