From dfa1a12a86ef1aa03473249fc923dce072ede4c6 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Wed, 17 Jun 2026 13:07:36 +0000 Subject: [PATCH 1/2] k8s-version-upgrade: retry failed phases + surface wedged chain (fix 5-day silent stall) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 1.34.9 patch auto-upgrade sat stuck for 5 days without anyone knowing. On 2026-06-12 a transient critical alert (the ttyd web-terminal probe on the devvm) was firing when the daily detection ran; the preflight's "halt on any critical alert" gate aborted it, so the preflight Job Failed (backoffLimit=1). Two design gaps then turned that blip into a multi-day wedge: * the detection guard and spawn_next only checked whether the phase Job EXISTED, not whether it succeeded — and the Failed Job lingers 7 days via ttlSecondsAfterFinished, so every daily run skipped re-spawning it; * the abort happens before the in-flight metric is pushed, so neither K8sUpgradeStalled nor upgrade_state.sh could see it — the pipeline reported "never ran" while actually being stuck. Fixes: D1 retry-on-failure: detection CronJob (main.tf) and spawn_next (upgrade-step.sh) now delete + re-spawn a terminally-Failed phase Job instead of skipping it, so a transient gate self-corrects next cycle rather than wedging the pipeline for a week. D2 WebterminalTtydUnreachable critical -> warning: a devvm developer web-terminal is not cluster infrastructure and must not block upgrades. D3 observability: new K8sUpgradeChainJobFailed alert (kube_job_status_failed in k8s-upgrade ns) and upgrade_state.sh now flags a Failed chain Job as "chain failed" — closing the pre-in-flight blind spot so a wedge is visible immediately. 
Co-Authored-By: Claude Opus 4.8 --- scripts/upgrade_state.sh | 15 ++++++++++ stacks/k8s-version-upgrade/main.tf | 17 +++++++++-- .../scripts/upgrade-step.sh | 18 +++++++++-- .../monitoring/prometheus_chart_values.tpl | 30 ++++++++++++++++++- 4 files changed, 75 insertions(+), 5 deletions(-) diff --git a/scripts/upgrade_state.sh b/scripts/upgrade_state.sh index 2e6e7faa..51fbf5d8 100755 --- a/scripts/upgrade_state.sh +++ b/scripts/upgrade_state.sh @@ -445,6 +445,17 @@ collect_k8s() { K8S_NEXT="$(next_daily_noon_utc)" + # Failed chain-Job detection. A preflight/phase Job can abort BEFORE pushing + # k8s_upgrade_in_flight=1 (the preflight gates exit pre-metric), so in-flight + # / stalled stay clean while the pipeline is actually wedged: the + # deterministic-name + 7d-TTL Job blocks re-spawn. Surface it directly. + # (2026-06-17: a transient critical alert wedged the 1.34.9 preflight for 5 + # days, invisible to every metric-based check.) + local failed_jobs + failed_jobs=$($KUBECTL -n k8s-upgrade get jobs \ + -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Failed")].status}{"\n"}{end}' 2>/dev/null \ + | awk -F'\t' '$2=="True" && $1 ~ /^k8s-upgrade-/{print $1}' | paste -sd' ' - || true) + # Status logic. local stalled=0 if [[ "${in_flight:-0}" == "1" && "$started_int" -gt 0 ]]; then @@ -463,6 +474,10 @@ collect_k8s() { K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="detection stale" K8S_NOTES="last detection >9d ago" raise_exit 2 + elif [[ -n "$failed_jobs" ]]; then + K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="chain failed" + K8S_NOTES="failed upgrade Job(s): $failed_jobs — pipeline wedged. 
Inspect: kubectl -n k8s-upgrade describe job (the retry-on-failure guard re-spawns on the next detection cycle)" + raise_exit 2 elif [[ "${in_flight:-0}" == "1" ]]; then K8S_STATUS_ICON="…"; K8S_STATUS_TEXT="in-flight" K8S_NOTES="upgrade chain running" diff --git a/stacks/k8s-version-upgrade/main.tf b/stacks/k8s-version-upgrade/main.tf index 21d11427..58e5715c 100644 --- a/stacks/k8s-version-upgrade/main.tf +++ b/stacks/k8s-version-upgrade/main.tf @@ -451,9 +451,22 @@ resource "kubernetes_cron_job_v1" "k8s_version_check" { # Idempotency: deterministic name reconciles via `apply`. JOB_NAME="k8s-upgrade-preflight-$${TARGET//./-}" + # Retry-on-failure idempotency: skip only if an existing preflight + # Job is Active/Complete. A *Failed* preflight (aborted on a + # transient gate, e.g. a spurious critical alert) is deleted and + # re-spawned — otherwise its deterministic name + 7d TTL wedges + # the entire pipeline until it ages out. (Stuck-pipeline fix + # 2026-06-17: a transient critical alert wedged 1.34.9 for 5 days.) 
if /usr/local/bin/kubectl -n k8s-upgrade get job "$JOB_NAME" >/dev/null 2>&1; then - slack "Preflight Job $JOB_NAME already exists (rerunning detection mid-flight?)" - exit 0 + JOB_FAILED=$(/usr/local/bin/kubectl -n k8s-upgrade get job "$JOB_NAME" \ + -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null || true) + if [ "$JOB_FAILED" = "True" ]; then + slack "Preflight Job $JOB_NAME exists but FAILED — deleting and re-spawning" + /usr/local/bin/kubectl -n k8s-upgrade delete job "$JOB_NAME" --wait=true >/dev/null 2>&1 || true + else + slack "Preflight Job $JOB_NAME already exists (active/complete) — skipping" + exit 0 + fi fi export JOB_NAME PHASE_NEXT=preflight TARGET_NODE_NEXT="" \ diff --git a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh index b10b395d..f46e1af7 100644 --- a/stacks/k8s-version-upgrade/scripts/upgrade-step.sh +++ b/stacks/k8s-version-upgrade/scripts/upgrade-step.sh @@ -222,9 +222,23 @@ spawn_next() { local job_name="k8s-upgrade-${NEXT_PHASE}-${TARGET_VERSION//./-}" [ -n "${NEXT_TARGET_NODE:-}" ] && job_name="${job_name}-${NEXT_TARGET_NODE}" + # Retry-on-failure idempotency: skip an existing next-Job ONLY if it is + # Active or Complete. A *Failed* Job (a phase that aborted on a transient + # gate) is deleted and re-created — otherwise its deterministic name plus + # ttlSecondsAfterFinished (7d) would block the whole chain from re-running + # that phase until the dead Job aged out. (Stuck-pipeline fix 2026-06-17: + # a transient critical alert wedged the 1.34.9 preflight for 5 days.) if $KUBECTL -n "$NS" get job "$job_name" >/dev/null 2>&1; then - echo "Next Job $job_name already exists; idempotent skip." 
- return 0 + local job_failed + job_failed=$($KUBECTL -n "$NS" get job "$job_name" \ + -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null || true) + if [ "$job_failed" = "True" ]; then + echo "Next Job $job_name exists but FAILED — deleting and re-spawning." + $KUBECTL -n "$NS" delete job "$job_name" --wait=true >/dev/null 2>&1 || true + else + echo "Next Job $job_name already exists (active/complete); idempotent skip." + return 0 + fi fi local scheduling_block="" diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 4ca6667c..3895fa04 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -2224,6 +2224,29 @@ serverFiles: severity: critical annotations: summary: "K8s upgrade has been in flight for >90 min — chain is stuck. Check: kubectl -n k8s-upgrade get jobs" + # K8sUpgradeChainJobFailed: catches a FAILED phase Job even when it + # aborts BEFORE pushing k8s_upgrade_in_flight=1. The preflight gates + # (nodes-ready, halt-on-alert, settle-window, kubeadm-plan) all exit + # pre-metric, so a failed preflight is invisible to K8sUpgradeStalled + # and EtcdPreUpgradeSnapshotMissing (both need in_flight=1) AND to + # upgrade_state.sh — exactly how a transient critical alert wedged the + # 1.34.9 preflight for 5 days (2026-06-17). With the retry-on-failure + # idempotency guard the next detection cycle deletes + re-spawns it, so + # this firing for 15m means it re-failed: investigate the root cause. + # NB: keyed on failed-pod count (bare >0, matching the file's other + # job-failure alerts) not the terminal Failed *condition* — so a phase + # whose 1st pod failed but whose retry succeeded keeps this firing until + # the Job's 7d TTL expires. 
Accepted: warning-only + alert-on-change + # (notifies once) + send_resolved, and upgrade_state.sh uses the precise + # Failed condition. A false-positive here beats missing a real wedge. + - alert: K8sUpgradeChainJobFailed + expr: kube_job_status_failed{namespace="k8s-upgrade", job_name=~"k8s-upgrade-.*"} > 0 + for: 15m + labels: + severity: warning + subsystem: k8s-upgrade + annotations: + summary: "K8s upgrade chain Job {{ $labels.job_name }} has failed pods — pipeline likely wedged. kubectl -n k8s-upgrade get jobs ; kubectl -n k8s-upgrade describe job {{ $labels.job_name }}" - name: "Traefik Ingress" rules: - alert: TraefikDown @@ -3076,10 +3099,15 @@ serverFiles: - alert: WebterminalTtydUnreachable # In-cluster probe to ttyd Service. Bypasses Cloudflare/Traefik/ # Authentik, so non-200 means ttyd itself is down on the DevVM. + # severity=warning (was critical until 2026-06-17): ttyd is a DevVM + # developer-convenience web terminal, not cluster infrastructure. + # As `critical` it tripped the k8s-upgrade preflight's halt-on-alert + # gate and — with the old no-retry idempotency — wedged the 1.34.9 + # upgrade for 5 days. It is not upgrade-blocking; warning is correct. expr: webterminal_probe_ttyd_status{job="webterminal-probe"} != 200 and on() (time() - process_start_time_seconds{job="prometheus"}) > 900 for: 10m labels: - severity: critical + severity: warning subsystem: webterminal annotations: summary: "ttyd in-cluster probe got HTTP {{ $value }} (expected 200) — ttyd on the DevVM (10.0.10.10:7681) is down. `systemctl status ttyd` on devvm." 
From fb638cd8ec65f12095be6b933d8354a0f3a67013 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Wed, 17 Jun 2026 13:10:18 +0000 Subject: [PATCH 2/2] k8s-version-upgrade: scope chain-fail alert to terminal reasons + sync docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refines the new K8sUpgradeChainJobFailed alert from a bare failed-pod count to the terminal job-condition reasons (BackoffLimitExceeded|DeadlineExceeded). A phase whose first pod failed but whose retry SUCCEEDED must NOT fire: every firing alert also halts kured, so a bare-count false-positive would block all OS node reboots for the Job's 7-day TTL. Verified against kube-state-metrics: the stuck preflight reports reason="BackoffLimitExceeded"; a Complete job has 0 for the terminal reasons. Docs updated to match the behaviour change (per the same-commit docs rule): - docs/runbooks/k8s-version-upgrade.md — new alert in the gates list; the "kill a stuck Job" recovery now leads with retry-on-failure self-heal. - docs/architecture/automated-upgrades.md — fourth Upgrade Gates alert; retry-on-failure note on the deterministic-naming paragraph. - .claude/skills/upgrade-state/SKILL.md — new "chain failed" status, legend entry, and drill-down (also copied to the active ~/.claude copy). 
Co-Authored-By: Claude Opus 4.8 --- .claude/skills/upgrade-state/SKILL.md | 34 +++++++++++++++++-- docs/architecture/automated-upgrades.md | 12 +++++-- docs/runbooks/k8s-version-upgrade.md | 17 ++++++++-- .../monitoring/prometheus_chart_values.tpl | 34 +++++++++---------- 4 files changed, 71 insertions(+), 26 deletions(-) diff --git a/.claude/skills/upgrade-state/SKILL.md b/.claude/skills/upgrade-state/SKILL.md index a2027a50..f88c228e 100644 --- a/.claude/skills/upgrade-state/SKILL.md +++ b/.claude/skills/upgrade-state/SKILL.md @@ -61,8 +61,11 @@ Pushgateway (`prometheus-prometheus-pushgateway.monitoring:9091`): - `k8s_upgrade_in_flight` — 0/1 - `k8s_upgrade_started_timestamp` — when the current chain started (0 when idle) -`K8sUpgradeStalled` alert fires when `in_flight=1` and the chain has -been running >90 minutes. The script raises `✗` in the same window. +`K8sUpgradeStalled` fires when `in_flight=1` and the chain has been running +>90 minutes. `K8sUpgradeChainJobFailed` fires when a phase Job terminally +failed — including a **preflight that aborted before `in_flight` was set** +(the gates exit pre-metric). The script raises `✗` for either, and reads the +Jobs directly, so it also catches a Failed preflight that left no metric. ## Status-icon legend @@ -72,7 +75,7 @@ been running >90 minutes. The script raises `✗` in the same window. | `→` | Update available, not yet applied (K8s patch/minor) | | `…` | In flight — chain currently running | | `⚠` | Attention: held-with-bumps, recent errors, pending approvals | -| `✗` | Broken: pod down, alert firing, chain stalled | +| `✗` | Broken: pod down, alert firing, chain stalled, or a chain Job failed | ## Drill-down — when a row trips, what to do @@ -177,6 +180,31 @@ kubectl -n monitoring exec deploy/prometheus-server -c prometheus-server -- sh - --header='Content-Type: text/plain'" ``` +### K8s `✗ chain failed` — a phase Job terminally failed + +`K8sUpgradeChainJobFailed` would fire. 
Most often a **preflight** that aborted +on a gate (a critical alert firing, a node not Ready, a kubeadm-plan mismatch) — +these exit before `in_flight` is set, so `K8sUpgradeStalled` never sees them, and +the deterministic name + 7d TTL blocked re-spawn (the 2026-06-12 5-day wedge). + +```bash +kubectl -n k8s-upgrade get jobs +kubectl -n k8s-upgrade describe job <job-name> # check the Failed reason +# Preflight abort reasons post to Slack ONLY (not stdout), so Loki won't have +# them. Replay the gate instead — which critical alerts were firing at the +# failure time? (ALERTS{severity="critical"} in Prometheus, query at that ts.) +``` + +Recovery is now mostly automatic: the detection CronJob and `spawn_next` +re-spawn a terminally-Failed Job on the next cycle (retry-on-failure), so a +transient gate clears within ~24h. To expedite, delete the Failed Job and +trigger detection: + +```bash +kubectl -n k8s-upgrade delete job <job-name> +kubectl -n k8s-upgrade create job --from=cronjob/k8s-version-check manual-detect-$(date +%s) +``` + ### K8s `✗ detection stale` — last detection >9 days ```bash diff --git a/docs/architecture/automated-upgrades.md b/docs/architecture/automated-upgrades.md index 5d8b1c9e..2029c83d 100644 --- a/docs/architecture/automated-upgrades.md +++ b/docs/architecture/automated-upgrades.md @@ -274,8 +274,13 @@ Job 6 — postflight (no pinning) Each Job runs `scripts/upgrade-step.sh`, which dispatches on `$PHASE` and ends by spawning the next Job (`envsubst < /template/job-template.yaml | kubectl apply -f -`). Job names are deterministic (`k8s-upgrade-<phase>-<version>[-<node>]`) -so `apply` reconciles to a single Job per run — re-running a failed Job -won't duplicate downstream Jobs. +so `apply` reconciles to a single Job per run — re-running won't duplicate +downstream Jobs. 
The detection CronJob and `spawn_next` additionally delete + +re-spawn a terminally-**Failed** Job of the same name (rather than skipping it +on existence), so a transient preflight gate self-heals on the next cycle +instead of wedging the pipeline until the dead Job's 7d TTL expires +(retry-on-failure, added 2026-06-17 after a spurious critical alert stalled +1.34.9 for 5 days). ### Self-preemption history (the reason for the Job-chain rewrite) @@ -305,10 +310,11 @@ each Job's pod and its drain target are always different nodes. - **Per-node script**: `infra/scripts/update_k8s.sh`. Caller passes `--role master|worker --release X.Y.Z`. Piped via SSH into each node by upgrade-step.sh. -- **Three Upgrade Gates alerts**: +- **Four Upgrade Gates alerts**: - `K8sVersionSkew` — kubelet/apiserver `gitVersion` count >1 for 30m. Catches a half-done rollout. - `EtcdPreUpgradeSnapshotMissing` — `k8s_upgrade_in_flight==1 && k8s_upgrade_snapshot_taken==0` for 10m. Catches preflight failing silently. - `K8sUpgradeStalled` — `k8s_upgrade_in_flight==1 && time()-k8s_upgrade_started_timestamp > 5400` for 5m. Catches a chain Job dying without spawning its successor. + - `K8sUpgradeChainJobFailed` — `kube_job_status_failed{namespace="k8s-upgrade",job_name=~"k8s-upgrade-.*",reason=~"BackoffLimitExceeded|DeadlineExceeded"} > 0` for 15m (warning). Catches a phase Job that terminally failed **before `in_flight` was set** (the preflight gates exit pre-metric) — invisible to the two `in_flight`-based alerts above; this was the blind spot behind the 5-day 1.34.9 preflight wedge. Reason-scoped so a retry-success doesn't false-positive (and so it doesn't needlessly block kured). 
- **Pushgateway metrics**: - `k8s_upgrade_in_flight` (set in preflight, cleared in postflight) - `k8s_upgrade_snapshot_taken` (set after etcd snapshot Job completes with ≥1 KiB) diff --git a/docs/runbooks/k8s-version-upgrade.md b/docs/runbooks/k8s-version-upgrade.md index 847d2462..00ff78f9 100644 --- a/docs/runbooks/k8s-version-upgrade.md +++ b/docs/runbooks/k8s-version-upgrade.md @@ -115,7 +115,8 @@ Pushed by upgrade-step.sh during phase execution; observed by the - **`K8sVersionSkew`** — distinct kubelet/apiserver `gitVersion` count > 1 for 30m. Catches a half-done rollout. - **`EtcdPreUpgradeSnapshotMissing`** — `k8s_upgrade_in_flight==1 && k8s_upgrade_snapshot_taken==0` for 10m. Catches preflight Stage 2 failing silently. - **`K8sUpgradeStalled`** — `k8s_upgrade_in_flight==1 && time()-k8s_upgrade_started_timestamp > 5400` for 5m. Catches a Job in the chain dying without spawning its successor. -- All three alerts ALSO block kured (same `--prometheus-url` halt-on-alert mechanism) so the OS-reboot pipeline can't run on top of a half-done version upgrade. +- **`K8sUpgradeChainJobFailed`** — `kube_job_status_failed{namespace="k8s-upgrade",job_name=~"k8s-upgrade-.*",reason=~"BackoffLimitExceeded|DeadlineExceeded"} > 0` for 15m (warning). Catches a phase Job that **terminally failed before `k8s_upgrade_in_flight` was set** — the preflight gates exit pre-metric, so the two `in_flight`-based alerts above are blind to a failed preflight (this is what hid the 5-day 1.34.9 wedge on 2026-06-12). Reason-scoped to terminal job conditions so a retry-success doesn't false-positive (a bare failed-pod-count would otherwise also block kured for the Job's 7d TTL). +- All four alerts ALSO block kured (same `--prometheus-url` halt-on-alert mechanism) so the OS-reboot pipeline can't run on top of a half-done version upgrade. ### Vault secrets @@ -202,8 +203,18 @@ EOF ``` ### Kill a stuck Job (chain halted mid-flight) -The chain stalls if any Job dies without spawning its successor. 
`K8sUpgradeStalled` -fires after 90 min. Recovery: +A phase Job that dies without spawning its successor halts the chain. Two alerts +surface it: `K8sUpgradeStalled` (a mid-chain Job that died with `in_flight=1`, +after 90 min) and `K8sUpgradeChainJobFailed` (any phase that terminally failed, +after 15 min — including a **preflight** that aborted before `in_flight` was set, +which `K8sUpgradeStalled` cannot see). + +**Preflight failures now self-heal** (since 2026-06-17): the detection CronJob and +`spawn_next` delete + re-spawn a terminally-Failed Job instead of skipping it on +name-existence (retry-on-failure), so a transient preflight gate — e.g. a spurious +critical alert like the ttyd web-terminal probe that wedged 1.34.9 for 5 days — +clears on the next daily cycle. A mid-chain phase that keeps failing still needs +manual recovery: fix the root cause, then: ```bash # 1. Identify the failed Job diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 3895fa04..65e89278 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -2224,29 +2224,29 @@ serverFiles: severity: critical annotations: summary: "K8s upgrade has been in flight for >90 min — chain is stuck. Check: kubectl -n k8s-upgrade get jobs" - # K8sUpgradeChainJobFailed: catches a FAILED phase Job even when it - # aborts BEFORE pushing k8s_upgrade_in_flight=1. The preflight gates - # (nodes-ready, halt-on-alert, settle-window, kubeadm-plan) all exit - # pre-metric, so a failed preflight is invisible to K8sUpgradeStalled - # and EtcdPreUpgradeSnapshotMissing (both need in_flight=1) AND to - # upgrade_state.sh — exactly how a transient critical alert wedged the - # 1.34.9 preflight for 5 days (2026-06-17). 
With the retry-on-failure - # idempotency guard the next detection cycle deletes + re-spawns it, so - # this firing for 15m means it re-failed: investigate the root cause. - # NB: keyed on failed-pod count (bare >0, matching the file's other - # job-failure alerts) not the terminal Failed *condition* — so a phase - # whose 1st pod failed but whose retry succeeded keeps this firing until - # the Job's 7d TTL expires. Accepted: warning-only + alert-on-change - # (notifies once) + send_resolved, and upgrade_state.sh uses the precise - # Failed condition. A false-positive here beats missing a real wedge. + # K8sUpgradeChainJobFailed: catches a TERMINALLY-failed phase Job even + # when it aborts BEFORE pushing k8s_upgrade_in_flight=1 (the preflight + # gates — nodes-ready, halt-on-alert, settle-window, kubeadm-plan — all + # exit pre-metric). K8sUpgradeStalled and EtcdPreUpgradeSnapshotMissing + # both need in_flight=1, and upgrade_state.sh was metric-blind too, so a + # failed preflight was invisible: exactly how a transient critical alert + # wedged 1.34.9 for 5 days (2026-06-12). Scoped to the terminal + # job-condition reasons (BackoffLimitExceeded/DeadlineExceeded), NOT a + # bare failed-pod count, so a phase whose 1st pod failed but whose retry + # SUCCEEDED does not fire — important because every firing alert also + # halts kured (OS-reboot pipeline), and a bare-count false-positive would + # block all node reboots for the Job's 7d TTL. With the retry-on-failure + # idempotency guard the next detection cycle deletes + re-spawns the + # Failed Job (clearing this within ~24h); a sustained firing means it + # re-failed — investigate the root cause. 
- alert: K8sUpgradeChainJobFailed - expr: kube_job_status_failed{namespace="k8s-upgrade", job_name=~"k8s-upgrade-.*"} > 0 + expr: kube_job_status_failed{namespace="k8s-upgrade", job_name=~"k8s-upgrade-.*", reason=~"BackoffLimitExceeded|DeadlineExceeded"} > 0 for: 15m labels: severity: warning subsystem: k8s-upgrade annotations: - summary: "K8s upgrade chain Job {{ $labels.job_name }} has failed pods — pipeline likely wedged. kubectl -n k8s-upgrade get jobs ; kubectl -n k8s-upgrade describe job {{ $labels.job_name }}" + summary: "K8s upgrade chain Job {{ $labels.job_name }} terminally failed ({{ $labels.reason }}) — pipeline wedged. kubectl -n k8s-upgrade get jobs ; kubectl -n k8s-upgrade describe job {{ $labels.job_name }}" - name: "Traefik Ingress" rules: - alert: TraefikDown