diff --git a/scripts/upgrade_state.sh b/scripts/upgrade_state.sh index 2e6e7faa..51fbf5d8 100755 --- a/scripts/upgrade_state.sh +++ b/scripts/upgrade_state.sh @@ -445,6 +445,17 @@ collect_k8s() { K8S_NEXT="$(next_daily_noon_utc)" + # Failed chain-Job detection. A preflight/phase Job can abort BEFORE pushing + # k8s_upgrade_in_flight=1 (the preflight gates exit pre-metric), so in-flight + # / stalled stay clean while the pipeline is actually wedged: the + # deterministic-name + 7d-TTL Job blocks re-spawn. Surface it directly. + # (2026-06-17: a transient critical alert wedged the 1.34.9 preflight for 5 + # days, invisible to every metric-based check.) + local failed_jobs + failed_jobs=$($KUBECTL -n k8s-upgrade get jobs \ + -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Failed")].status}{"\n"}{end}' 2>/dev/null \ + | awk -F'\t' '$2=="True" && $1 ~ /^k8s-upgrade-/{print $1}' | paste -sd' ' - || true) + # Status logic. local stalled=0 if [[ "${in_flight:-0}" == "1" && "$started_int" -gt 0 ]]; then @@ -463,6 +474,10 @@ collect_k8s() { K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="detection stale" K8S_NOTES="last detection >9d ago" raise_exit 2 + elif [[ -n "$failed_jobs" ]]; then + K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="chain failed" + K8S_NOTES="failed upgrade Job(s): $failed_jobs — pipeline wedged. Inspect: kubectl -n k8s-upgrade describe job (the retry-on-failure guard re-spawns on the next detection cycle)" + raise_exit 2 elif [[ "${in_flight:-0}" == "1" ]]; then K8S_STATUS_ICON="…"; K8S_STATUS_TEXT="in-flight" K8S_NOTES="upgrade chain running" diff --git a/stacks/k8s-version-upgrade/main.tf b/stacks/k8s-version-upgrade/main.tf index 21d11427..58e5715c 100644 --- a/stacks/k8s-version-upgrade/main.tf +++ b/stacks/k8s-version-upgrade/main.tf @@ -451,9 +451,22 @@ resource "kubernetes_cron_job_v1" "k8s_version_check" { # Idempotency: deterministic name reconciles via `apply`. 
JOB_NAME="k8s-upgrade-preflight-$${TARGET//./-}" + # Retry-on-failure idempotency: skip only if an existing preflight + # Job is Active/Complete. A *Failed* preflight (aborted on a + # transient gate, e.g. a spurious critical alert) is deleted and + # re-spawned — otherwise its deterministic name + 7d TTL wedges + # the entire pipeline until it ages out. (Stuck-pipeline fix + # 2026-06-17: a transient critical alert wedged 1.34.9 for 5 days.) if /usr/local/bin/kubectl -n k8s-upgrade get job "$JOB_NAME" >/dev/null 2>&1; then - slack "Preflight Job $JOB_NAME already exists (rerunning detection mid-flight?)" - exit 0 + JOB_FAILED=$(/usr/local/bin/kubectl -n k8s-upgrade get job "$JOB_NAME" \ + -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null || true) + if [ "$JOB_FAILED" = "True" ]; then + slack "Preflight Job $JOB_NAME exists but FAILED — deleting and re-spawning" + /usr/local/bin/kubectl -n k8s-upgrade delete job "$JOB_NAME" --wait=true >/dev/null 2>&1 || true + else + slack "Preflight Job $JOB_NAME already exists (active/complete) — skipping" + exit 0 + fi fi export JOB_NAME PHASE_NEXT=preflight TARGET_NODE_NEXT="" \ diff --git a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh index b10b395d..f46e1af7 100644 --- a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh +++ b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh @@ -222,9 +222,23 @@ spawn_next() { local job_name="k8s-upgrade-${NEXT_PHASE}-${TARGET_VERSION//./-}" [ -n "${NEXT_TARGET_NODE:-}" ] && job_name="${job_name}-${NEXT_TARGET_NODE}" + # Retry-on-failure idempotency: skip an existing next-Job ONLY if it is + # Active or Complete. A *Failed* Job (a phase that aborted on a transient + # gate) is deleted and re-created — otherwise its deterministic name plus + # ttlSecondsAfterFinished (7d) would block the whole chain from re-running + # that phase until the dead Job aged out. 
(Stuck-pipeline fix 2026-06-17: + # a transient critical alert wedged the 1.34.9 preflight for 5 days.) if $KUBECTL -n "$NS" get job "$job_name" >/dev/null 2>&1; then - echo "Next Job $job_name already exists; idempotent skip." - return 0 + local job_failed + job_failed=$($KUBECTL -n "$NS" get job "$job_name" \ + -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null || true) + if [ "$job_failed" = "True" ]; then + echo "Next Job $job_name exists but FAILED — deleting and re-spawning." + $KUBECTL -n "$NS" delete job "$job_name" --wait=true >/dev/null 2>&1 || true + else + echo "Next Job $job_name already exists (active/complete); idempotent skip." + return 0 + fi fi local scheduling_block="" diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 4ca6667c..3895fa04 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -2224,6 +2224,29 @@ serverFiles: severity: critical annotations: summary: "K8s upgrade has been in flight for >90 min — chain is stuck. Check: kubectl -n k8s-upgrade get jobs" + # K8sUpgradeChainJobFailed: catches a FAILED phase Job even when it + # aborts BEFORE pushing k8s_upgrade_in_flight=1. The preflight gates + # (nodes-ready, halt-on-alert, settle-window, kubeadm-plan) all exit + # pre-metric, so a failed preflight is invisible to K8sUpgradeStalled + # and EtcdPreUpgradeSnapshotMissing (both need in_flight=1) AND to + # upgrade_state.sh's metric-based checks — exactly how a transient critical alert wedged the + # 1.34.9 preflight for 5 days (2026-06-17). With the retry-on-failure + # idempotency guard the next detection cycle deletes + re-spawns it, so + # this firing for 15m means it re-failed: investigate the root cause. 
+ # NB: keyed on failed-pod count (bare >0, matching the file's other + # job-failure alerts) not the terminal Failed *condition* — so a phase + # whose 1st pod failed but whose retry succeeded keeps this firing until + # the Job's 7d TTL expires. Accepted: warning-only + alert-on-change + # (notifies once) + send_resolved, and upgrade_state.sh uses the precise + # Failed condition. A false-positive here beats missing a real wedge. + - alert: K8sUpgradeChainJobFailed + expr: kube_job_status_failed{namespace="k8s-upgrade", job_name=~"k8s-upgrade-.*"} > 0 + for: 15m + labels: + severity: warning + subsystem: k8s-upgrade + annotations: + summary: "K8s upgrade chain Job {{ $labels.job_name }} has failed pods — pipeline likely wedged. kubectl -n k8s-upgrade get jobs ; kubectl -n k8s-upgrade describe job {{ $labels.job_name }}" - name: "Traefik Ingress" rules: - alert: TraefikDown @@ -3076,10 +3099,15 @@ serverFiles: - alert: WebterminalTtydUnreachable # In-cluster probe to ttyd Service. Bypasses Cloudflare/Traefik/ # Authentik, so non-200 means ttyd itself is down on the DevVM. + # severity=warning (was critical until 2026-06-17): ttyd is a DevVM + # developer-convenience web terminal, not cluster infrastructure. + # As `critical` it tripped the k8s-upgrade preflight's halt-on-alert + # gate and — with the old no-retry idempotency — wedged the 1.34.9 + # upgrade for 5 days. It is not upgrade-blocking; warning is correct. expr: webterminal_probe_ttyd_status{job="webterminal-probe"} != 200 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 for: 10m labels: - severity: critical + severity: warning subsystem: webterminal annotations: summary: "ttyd in-cluster probe got HTTP {{ $value }} (expected 200) — ttyd on the DevVM (10.0.10.10:7681) is down. `systemctl status ttyd` on devvm."