k8s-version-upgrade: retry failed phases + surface wedged chain (fix 5-day silent stall)

The 1.34.9 patch auto-upgrade sat stuck for 5 days without anyone knowing.
On 2026-06-12 a transient critical alert (the ttyd web-terminal probe on the
devvm) was firing when the daily detection ran; the preflight's "halt on any
critical alert" gate aborted the preflight run, so the preflight Job Failed
(backoffLimit=1).
Two design gaps then turned that blip into a multi-day wedge:

  * the detection guard and spawn_next only checked whether the phase Job
    EXISTED, not whether it succeeded — and the Failed Job lingers 7 days via
    ttlSecondsAfterFinished, so every daily run skipped re-spawning it;
  * the abort happens before the in-flight metric is pushed, so neither
    K8sUpgradeStalled nor upgrade_state.sh could see it — the pipeline reported
    "never ran" while actually being stuck.

Fixes:
  D1 retry-on-failure: detection CronJob (main.tf) and spawn_next
     (upgrade-step.sh) now delete + re-spawn a terminally-Failed phase Job
     instead of skipping it, so a transient gate self-corrects next cycle
     rather than wedging the pipeline for a week.
  D2 WebterminalTtydUnreachable critical -> warning: a devvm developer
     web-terminal is not cluster infrastructure and must not block upgrades.
  D3 observability: new K8sUpgradeChainJobFailed alert
     (kube_job_status_failed in k8s-upgrade ns) and upgrade_state.sh now flags
     a Failed chain Job as "chain failed" — closing the pre-in-flight blind
     spot so a wedge is visible immediately.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-17 13:07:36 +00:00
parent 7e7e41cbef
commit dfa1a12a86
4 changed files with 75 additions and 5 deletions

View file

@ -445,6 +445,17 @@ collect_k8s() {
K8S_NEXT="$(next_daily_noon_utc)" K8S_NEXT="$(next_daily_noon_utc)"
# Failed chain-Job detection. A preflight/phase Job can abort BEFORE pushing
# k8s_upgrade_in_flight=1 (the preflight gates exit pre-metric), so in-flight
# / stalled stay clean while the pipeline is actually wedged: the
# deterministic-name + 7d-TTL Job blocks re-spawn. Surface it directly.
# (Stuck-pipeline fix 2026-06-17: a transient critical alert wedged the 1.34.9
# preflight for 5 days, invisible to every metric-based check.)
local failed_jobs
failed_jobs=$($KUBECTL -n k8s-upgrade get jobs \
-o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Failed")].status}{"\n"}{end}' 2>/dev/null \
| awk -F'\t' '$2=="True" && $1 ~ /^k8s-upgrade-/{print $1}' | paste -sd' ' - || true)
# Status logic. # Status logic.
local stalled=0 local stalled=0
if [[ "${in_flight:-0}" == "1" && "$started_int" -gt 0 ]]; then if [[ "${in_flight:-0}" == "1" && "$started_int" -gt 0 ]]; then
@ -463,6 +474,10 @@ collect_k8s() {
K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="detection stale" K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="detection stale"
K8S_NOTES="last detection >9d ago" K8S_NOTES="last detection >9d ago"
raise_exit 2 raise_exit 2
elif [[ -n "$failed_jobs" ]]; then
K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="chain failed"
K8S_NOTES="failed upgrade Job(s): $failed_jobs — pipeline wedged. Inspect: kubectl -n k8s-upgrade describe job <name> (the retry-on-failure guard re-spawns on the next detection cycle)"
raise_exit 2
elif [[ "${in_flight:-0}" == "1" ]]; then elif [[ "${in_flight:-0}" == "1" ]]; then
K8S_STATUS_ICON="…"; K8S_STATUS_TEXT="in-flight" K8S_STATUS_ICON="…"; K8S_STATUS_TEXT="in-flight"
K8S_NOTES="upgrade chain running" K8S_NOTES="upgrade chain running"

View file

@ -451,9 +451,22 @@ resource "kubernetes_cron_job_v1" "k8s_version_check" {
# Idempotency: deterministic name reconciles via `apply`. # Idempotency: deterministic name reconciles via `apply`.
JOB_NAME="k8s-upgrade-preflight-$${TARGET//./-}" JOB_NAME="k8s-upgrade-preflight-$${TARGET//./-}"
# Retry-on-failure idempotency: skip only if an existing preflight
# Job is Active/Complete. A *Failed* preflight (aborted on a
# transient gate, e.g. a spurious critical alert) is deleted and
# re-spawned — otherwise its deterministic name + 7d TTL wedges
# the entire pipeline until it ages out. (Stuck-pipeline fix
# 2026-06-17: a transient critical alert wedged 1.34.9 for 5 days.)
if /usr/local/bin/kubectl -n k8s-upgrade get job "$JOB_NAME" >/dev/null 2>&1; then if /usr/local/bin/kubectl -n k8s-upgrade get job "$JOB_NAME" >/dev/null 2>&1; then
slack "Preflight Job $JOB_NAME already exists (rerunning detection mid-flight?)" JOB_FAILED=$(/usr/local/bin/kubectl -n k8s-upgrade get job "$JOB_NAME" \
exit 0 -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null || true)
if [ "$JOB_FAILED" = "True" ]; then
slack "Preflight Job $JOB_NAME exists but FAILED — deleting and re-spawning"
/usr/local/bin/kubectl -n k8s-upgrade delete job "$JOB_NAME" --wait=true >/dev/null 2>&1 || true
else
slack "Preflight Job $JOB_NAME already exists (active/complete) — skipping"
exit 0
fi
fi fi
export JOB_NAME PHASE_NEXT=preflight TARGET_NODE_NEXT="" \ export JOB_NAME PHASE_NEXT=preflight TARGET_NODE_NEXT="" \

View file

@ -222,9 +222,23 @@ spawn_next() {
local job_name="k8s-upgrade-${NEXT_PHASE}-${TARGET_VERSION//./-}" local job_name="k8s-upgrade-${NEXT_PHASE}-${TARGET_VERSION//./-}"
[ -n "${NEXT_TARGET_NODE:-}" ] && job_name="${job_name}-${NEXT_TARGET_NODE}" [ -n "${NEXT_TARGET_NODE:-}" ] && job_name="${job_name}-${NEXT_TARGET_NODE}"
# Retry-on-failure idempotency: skip an existing next-Job ONLY if it is
# Active or Complete. A *Failed* Job (a phase that aborted on a transient
# gate) is deleted and re-created — otherwise its deterministic name plus
# ttlSecondsAfterFinished (7d) would block the whole chain from re-running
# that phase until the dead Job aged out. (Stuck-pipeline fix 2026-06-17:
# a transient critical alert wedged the 1.34.9 preflight for 5 days.)
if $KUBECTL -n "$NS" get job "$job_name" >/dev/null 2>&1; then if $KUBECTL -n "$NS" get job "$job_name" >/dev/null 2>&1; then
echo "Next Job $job_name already exists; idempotent skip." local job_failed
return 0 job_failed=$($KUBECTL -n "$NS" get job "$job_name" \
-o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null || true)
if [ "$job_failed" = "True" ]; then
echo "Next Job $job_name exists but FAILED — deleting and re-spawning."
$KUBECTL -n "$NS" delete job "$job_name" --wait=true >/dev/null 2>&1 || true
else
echo "Next Job $job_name already exists (active/complete); idempotent skip."
return 0
fi
fi fi
local scheduling_block="" local scheduling_block=""

View file

@ -2224,6 +2224,29 @@ serverFiles:
severity: critical severity: critical
annotations: annotations:
summary: "K8s upgrade has been in flight for >90 min — chain is stuck. Check: kubectl -n k8s-upgrade get jobs" summary: "K8s upgrade has been in flight for >90 min — chain is stuck. Check: kubectl -n k8s-upgrade get jobs"
# K8sUpgradeChainJobFailed: catches a FAILED phase Job even when it
# aborts BEFORE pushing k8s_upgrade_in_flight=1. The preflight gates
# (nodes-ready, halt-on-alert, settle-window, kubeadm-plan) all exit
# pre-metric, so a failed preflight is invisible to K8sUpgradeStalled
# and EtcdPreUpgradeSnapshotMissing (both need in_flight=1) AND to
# upgrade_state.sh — exactly how a transient critical alert wedged the
# 1.34.9 preflight for 5 days (fixed 2026-06-17). With the retry-on-failure
# idempotency guard the next detection cycle deletes + re-spawns it, so
# this firing for 15m means it re-failed: investigate the root cause.
# NB: keyed on failed-pod count (bare >0, matching the file's other
# job-failure alerts) not the terminal Failed *condition* — so a phase
# whose 1st pod failed but whose retry succeeded keeps this firing until
# the Job's 7d TTL expires. Accepted: warning-only + alert-on-change
# (notifies once) + send_resolved, and upgrade_state.sh uses the precise
# Failed condition. A false-positive here beats missing a real wedge.
- alert: K8sUpgradeChainJobFailed
expr: kube_job_status_failed{namespace="k8s-upgrade", job_name=~"k8s-upgrade-.*"} > 0
for: 15m
labels:
severity: warning
subsystem: k8s-upgrade
annotations:
summary: "K8s upgrade chain Job {{ $labels.job_name }} has failed pods — pipeline likely wedged. kubectl -n k8s-upgrade get jobs ; kubectl -n k8s-upgrade describe job {{ $labels.job_name }}"
- name: "Traefik Ingress" - name: "Traefik Ingress"
rules: rules:
- alert: TraefikDown - alert: TraefikDown
@ -3076,10 +3099,15 @@ serverFiles:
- alert: WebterminalTtydUnreachable - alert: WebterminalTtydUnreachable
# In-cluster probe to ttyd Service. Bypasses Cloudflare/Traefik/ # In-cluster probe to ttyd Service. Bypasses Cloudflare/Traefik/
# Authentik, so non-200 means ttyd itself is down on the DevVM. # Authentik, so non-200 means ttyd itself is down on the DevVM.
# severity=warning (was critical until 2026-06-17): ttyd is a DevVM
# developer-convenience web terminal, not cluster infrastructure.
# As `critical` it tripped the k8s-upgrade preflight's halt-on-alert
# gate and — with the old no-retry idempotency — wedged the 1.34.9
# upgrade for 5 days. It is not upgrade-blocking; warning is correct.
expr: webterminal_probe_ttyd_status{job="webterminal-probe"} != 200 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 expr: webterminal_probe_ttyd_status{job="webterminal-probe"} != 200 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900
for: 10m for: 10m
labels: labels:
severity: critical severity: warning
subsystem: webterminal subsystem: webterminal
annotations: annotations:
summary: "ttyd in-cluster probe got HTTP {{ $value }} (expected 200) — ttyd on the DevVM (10.0.10.10:7681) is down. `systemctl status ttyd` on devvm." summary: "ttyd in-cluster probe got HTTP {{ $value }} (expected 200) — ttyd on the DevVM (10.0.10.10:7681) is down. `systemctl status ttyd` on devvm."