k8s-upgrade: classify compat-gate blocks as actionable vs held; quiet the held case
The nightly upgrade chain detected 1.36, the preflight compat-gate refused it,
and that produced a Failed preflight Job + a K8sUpgradeBlocked alert EVERY
night — even though the block is unactionable (no kyverno/ESO release supports
1.36 yet, and gpu-operator is pinned to its current version because bumping it
needs a newer NVIDIA driver image + Ubuntu/kernel we're not ready for). Viktor
asked to teach the checker to tell 'we can fix this' apart from 'nothing to do
but wait', and stop the nightly Failed-Job + alert noise for the latter.
compat-gate.py now classifies each blocker:
- ACTIONABLE: a newer addon version in addon-compat.json supports the target
  -> exit 2, k8s_upgrade_blocked=1 -> K8sUpgradeBlocked alert (reasons in the
  nightly report).
- WAITING-on-upstream: no released version supports the target yet -> held.
- PINNED: addon marked pinned in the matrix (gpu-operator) -> held.
Held wins on a mix -> exit 4, k8s_upgrade_held=1 (NEW gauge), NO alert.
Tidy the block path (Viktor's scope choice): deliberate gate decisions now make
the preflight Job Complete cleanly (HALT_CHAIN stops chain progression without a
non-zero exit), so they no longer create Failed Jobs. Dropped the now-obsolete
'unless k8s_upgrade_blocked==1' suppression from K8sUpgradeChainJobFailed. Gauge
is pushed definitively once per run (no 1->0->1 flap that re-notifies). The
detector re-spawns a refused-but-Complete preflight nightly (silently) so a
standing hold still re-evaluates, and only announces genuine new/Failed spawns.
nightly-report gains a quiet '⏸️ HELD' headline with reasons grouped by class.
gpu-operator pinned in addon-compat.json (unpin = delete pinned + pin_reason).
Net effect on 1.36: HELD + quiet (waiting on kyverno/ESO, gpu-operator pinned;
Calico the lone actionable piece) — no nightly Failed Job, no alert, just the
morning report's HELD line. Design: docs/plans/2026-06-28-k8s-upgrade-gate-held-classification.md
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
afcd463f39
commit
eebb6c8594
9 changed files with 397 additions and 78 deletions
|
|
@ -95,3 +95,121 @@ def test_running_minor_from_kubectl(monkeypatch):
|
|||
# oldest kubelet wins (mirrors the detector): node2 on 1.33 is the floor.
|
||||
monkeypatch.setattr(cg, "kget", lambda args: "v1.34.9\nv1.33.5\nv1.34.9")
|
||||
assert cg.running_minor() == (1, 33)
|
||||
|
||||
|
||||
# --- block classification: actionable / waiting-upstream / pinned ----------
|
||||
# A block is ACTIONABLE if a newer addon version in the matrix supports the
|
||||
# target (we can upgrade to clear it), WAITING if no released version supports
|
||||
# the target yet (only upstream can clear it), or PINNED if a version exists but
|
||||
# we deliberately hold the addon. Held (waiting|pinned) is quiet; actionable
|
||||
# alerts.
|
||||
# Single-addon fixture: the newest kyverno release listed (1.18) tops out at
# k8s 1.35, so no entry in this matrix supports a 1.36 target.
KYVERNO_MATRIX = {
    "addons": [
        {
            "name": "kyverno",
            "namespace": "kyverno",
            "kind": "deployment",
            "resource": "kyverno-admission-controller",
            "image_re": r"kyverno:v(\d+\.\d+)",
            "max_k8s": {
                "1.16": "1.34",
                "1.18": "1.35",
            },
        },
    ],
}
|
||||
# Single-addon fixture: a newer gpu-operator release (26.3) does list k8s 1.36,
# but the entry carries pinned=True plus a pin_reason, i.e. it is deliberately
# held at its current version.
GPU_MATRIX = {
    "addons": [
        {
            "name": "gpu-operator",
            "namespace": "nvidia",
            "kind": "deployment",
            "resource": "gpu-operator",
            "image_re": r"gpu-operator:v(\d+\.\d+)",
            "max_k8s": {
                "25.10": "1.35",
                "26.3": "1.36",
            },
            "pinned": True,
            "pin_reason": "needs newer NVIDIA driver + Ubuntu release",
        },
    ],
}
|
||||
|
||||
|
||||
def test_actionable_when_higher_version_supports_target(monkeypatch):
    """A blocker that an addon upgrade would clear is tagged ACTIONABLE.

    Scenario: calico 3.30 (ceiling 1.35) against target 1.36, with the matrix
    listing 3.32 -> 1.36. The single reason must carry the ACTIONABLE tag and
    name both the addon and the version to move to.
    """
    _img(monkeypatch, "quay.io/calico/node:v3.30.7")

    blockers = cg.check_addons(CALICO_MATRIX, (1, 36), (1, 35))

    assert len(blockers) == 1, blockers
    (only,) = blockers
    assert only.startswith("[ACTIONABLE]"), blockers
    # Remediation hint: the supporting version, then the addon name.
    assert "3.32" in only
    assert "calico" in only
|
||||
|
||||
|
||||
def test_waiting_when_no_version_supports_target(monkeypatch):
    """A blocker with no supporting release in the matrix is tagged WAITING.

    Scenario: kyverno 1.18 is the highest version in KYVERNO_MATRIX and its
    ceiling is k8s 1.35, so for target 1.36 there is nothing to upgrade to.
    """
    _img(monkeypatch, "kyverno/kyverno:v1.18.1")

    blockers = cg.check_addons(KYVERNO_MATRIX, (1, 36), (1, 35))

    assert len(blockers) == 1, blockers
    (only,) = blockers
    assert only.startswith("[WAITING]"), blockers
    assert "kyverno" in only
|
||||
|
||||
|
||||
def test_pinned_addon_is_held_not_actionable(monkeypatch):
    """A pinned addon is tagged PINNED even when a supporting release exists.

    Scenario: gpu-operator 25.10 against target 1.36. GPU_MATRIX lists 26.3 as
    supporting 1.36, but the entry is marked pinned, so the reason must be
    PINNED (held) rather than ACTIONABLE.
    """
    _img(monkeypatch, "nvcr.io/nvidia/gpu-operator:v25.10.0")

    blockers = cg.check_addons(GPU_MATRIX, (1, 36), (1, 35))

    assert len(blockers) == 1, blockers
    (only,) = blockers
    assert only.startswith("[PINNED]"), blockers
    assert "gpu-operator" in only
|
||||
|
||||
|
||||
def test_unreadable_addon_tagged_actionable(monkeypatch):
    """The fail-safe block on an unreadable image is ACTIONABLE.

    An empty image string stands in for "could not read the running version";
    that case needs a human to look, so it must not be classified as held.
    """
    _img(monkeypatch, "")

    blockers = cg.check_addons(ESO_MATRIX, (1, 35), (1, 34))

    # At least one reason must be reported, and the first one is ACTIONABLE.
    assert blockers, blockers
    assert blockers[0].startswith("[ACTIONABLE]"), blockers
|
||||
|
||||
|
||||
def test_existing_reasons_are_tagged(monkeypatch):
    """The legacy "ceiling below target, newer version exists" case is ACTIONABLE.

    Scenario: external-secrets v0.12.1 checked against target 1.35 (running
    1.34) using ESO_MATRIX. The first reported reason must carry the
    ACTIONABLE tag.
    """
    _img(monkeypatch, "external-secrets/external-secrets:v0.12.1")
    reasons = cg.check_addons(ESO_MATRIX, (1, 35), (1, 34))
    # Guard before indexing so a regression that stops reporting this blocker
    # fails as a readable assertion instead of an IndexError.
    assert reasons, "expected at least one blocker reason, got none"
    assert reasons[0].startswith("[ACTIONABLE]"), reasons
|
||||
|
||||
|
||||
def test_held_reason_classifier():
    """held_reason() is truthy only for WAITING- and PINNED-tagged reasons."""
    for held in ("[WAITING] x", "[PINNED] x"):
        assert cg.held_reason(held)
    # ACTIONABLE and untagged strings are not held.
    for not_held in ("[ACTIONABLE] x", "untagged"):
        assert not cg.held_reason(not_held)
|
||||
|
||||
|
||||
def test_exit_code_mapping():
    """exit_code() maps a reason list to 0 (clear), 2 (actionable) or 4 (held)."""
    expectations = [
        ([], 0),
        (["[ACTIONABLE] x"], 2),
        (["[WAITING] x"], 4),
        (["[PINNED] x"], 4),
        # held wins on a mix: an upstream/pinned wait can't be cleared by
        # acting now
        (["[ACTIONABLE] x", "[WAITING] y"], 4),
    ]
    for reasons, expected in expectations:
        assert cg.exit_code(reasons) == expected
|
||||
|
||||
|
||||
def test_real_matrix_136_is_held(monkeypatch):
    """Regression guard on the SHIPPED addon-compat.json: at today's running
    versions a 1.36 jump must be HELD (exit 4) — calico ACTIONABLE (3.32 in the
    matrix), ESO+kyverno WAITING (no 1.36 release), gpu-operator PINNED. Catches
    a matrix edit that silently turns the quiet held state into a nightly alert."""
    import json as _json

    # JSON is UTF-8; pass the encoding explicitly rather than relying on the
    # platform default. NOTE(review): assumes HERE is a pathlib.Path — confirm.
    matrix = _json.loads((HERE / "addon-compat.json").read_text(encoding="utf-8"))

    # Image reported for each addon, keyed by the namespace it is queried in.
    running_imgs = {
        "calico-system": "quay.io/calico/node:v3.30.7",
        "external-secrets": "ghcr.io/external-secrets/external-secrets:v2.6.0",
        "kyverno": "ghcr.io/kyverno/kyverno:v1.18.1",
        "nvidia": "nvcr.io/nvidia/gpu-operator:v25.10.0",
    }

    def fake_kget(args):
        # Look up the image by the value following "-n"; return "" when there
        # is no "-n" argument or the namespace is not in running_imgs.
        ns = args[args.index("-n") + 1] if "-n" in args else ""
        return running_imgs.get(ns, "")

    monkeypatch.setattr(cg, "kget", fake_kget)
    reasons = cg.check_addons(matrix, (1, 36), (1, 35))

    def pick(name):
        # First reason mentioning the addon; fail with the full reason list
        # (not a bare StopIteration) when the addon produced no blocker.
        match = next((r for r in reasons if name in r), None)
        assert match is not None, f"no blocker reported for {name}: {reasons}"
        return match

    assert pick("calico").startswith("[ACTIONABLE]"), reasons
    assert pick("external-secrets").startswith("[WAITING]"), reasons
    assert pick("kyverno").startswith("[WAITING]"), reasons
    assert pick("gpu-operator").startswith("[PINNED]"), reasons
    assert cg.exit_code(reasons) == 4  # held wins
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue