diff --git a/.claude/skills/upgrade-state/SKILL.md b/.claude/skills/upgrade-state/SKILL.md new file mode 100644 index 00000000..6cf23084 --- /dev/null +++ b/.claude/skills/upgrade-state/SKILL.md @@ -0,0 +1,198 @@ +--- +name: upgrade-state +description: | + Audit the three autonomous-upgrade pipelines (apps via Keel, OS via + unattended-upgrades+kured, K8s components via the version-check chain). + Use when: + (1) User asks "/upgrade-state" or "are we current", + (2) User asks "what's pending upgrade" or "what's the upgrade state", + (3) User asks if Keel / kured / k8s-version-check is healthy, + (4) User asks about kept-back / held packages or pending reboots, + (5) Before the Sunday `k8s-version-check` CronJob fires (weekly survey). + Read-only — no `--fix`. Exits 0 healthy / 1 attention / 2 stalled. +author: Claude Code +version: 1.0.0 +date: 2026-05-18 +--- + +# Upgrade-state + +## MANDATORY: Run the script first + +When this skill is invoked, your **first action** must be to run +`upgrade_state.sh` and reason over its output before doing anything +else. Do NOT improvise individual `kubectl` / `ssh` calls — the script +is the authoritative surface. + +```bash +bash /home/wizard/code/infra/scripts/upgrade_state.sh +``` + +For programmatic use: + +```bash +bash /home/wizard/code/infra/scripts/upgrade_state.sh --json | tee /tmp/upgrade-state.json +``` + +Then: + +1. Report the rendered table verbatim — it answers the user's + "are we current" question in three lines. +2. For every `⚠` or `✗` row, surface the relevant drill-down lines + underneath and propose a next action (links in the table below). +3. Only reach for ad-hoc commands when investigating beyond what the + script reported. + +Exit codes: `0` healthy, `1` attention warranted, `2` stalled / broken. 
+ +## What it covers (3 pipelines) + +| Layer | What runs | Cadence | Data sources | +|---|---|---|---| +| **Apps** | Keel polls every watched Deployment's container registry; rolls on new digest | hourly | Prom (`pending_approvals`, `registries_scanned_total`), Keel pod logs | +| **OS** | `unattended-upgrades` in-release patching; `kured` reboots when `/var/run/reboot-required` is set | daily 02:00-06:00 London | SSH fan-out to all 5 nodes | +| **K8s** | `k8s-version-check` CronJob detects new kubeadm patch/minor; spawns the Job-chain that drains+upgrades node-by-node | Sun 12:00 UTC | Pushgateway (`k8s_upgrade_*`), `kubectl get nodes` | + +The K8s pipeline pushes a small set of gauges to the Prometheus +Pushgateway (`prometheus-prometheus-pushgateway.monitoring:9091`): + +- `k8s_upgrade_available{kind="patch"|"minor",target=…}` — 1 if newer release detected +- `k8s_version_check_last_run_timestamp` — when detection last ran +- `k8s_upgrade_in_flight` — 0/1 +- `k8s_upgrade_started_timestamp` — when the current chain started (0 when idle) + +`K8sUpgradeStalled` alert fires when `in_flight=1` and the chain has +been running >90 minutes. The script raises `✗` in the same window. + +## Status-icon legend + +| Icon | Meaning | +|---|---| +| `✓` | Healthy, fully current | +| `→` | Update available, not yet applied (K8s patch/minor) | +| `…` | In flight — chain currently running | +| `⚠` | Attention: held-with-bumps, recent errors, pending approvals | +| `✗` | Broken: pod down, alert firing, chain stalled | + +## Drill-down — when a row trips, what to do + +### Apps `⚠` — pending approvals or errors + +```bash +# Read recent Keel log lines +kubectl -n keel logs deploy/keel --since=24h --tail=200 + +# What is Keel currently tracking? +kubectl -n monitoring exec deploy/prometheus-server -c prometheus-server -- \ + wget -qO- 'http://localhost:9090/api/v1/query?query=count by (image) (registries_scanned_total)' + +# Is the scrape live? 
+kubectl -n monitoring exec deploy/prometheus-server -c prometheus-server -- \ + wget -qO- 'http://localhost:9090/api/v1/query?query=up{job="kubernetes-pods",app="keel"}' +``` + +Common Keel errors: +- `failed to add image watch job` — image annotation mistyped (rare; Kyverno auto-injects) +- `registry authentication required` — bad imagePullSecret on the watched Deployment +- `bad tag pattern` — Keel can't parse the watched image's tag against its policy + +### OS `⚠` — held packages with bumps + +The script flags any package held via `apt-mark hold` that ALSO appears +in `apt list --upgradable` — excluding k8s components (the K8s pipeline +owns those) and the kernel (kured handles the reboot half). + +Typical cause: a major-version bump (e.g. containerd 1.7 → 2.2, +runc 1.1 → 1.4). These are held because they need cluster-wide +coordination, not silent in-release patching. + +```bash +# Inspect the situation on the flagged node +ssh wizard@10.0.20.10X 'apt-mark showhold; apt list --upgradable 2>/dev/null' + +# Unhold + upgrade a specific package +ssh wizard@10.0.20.10X 'sudo apt-mark unhold containerd && sudo apt-get install -y containerd' +``` + +Node IPs: master=`100`, node1=`101`, node2=`102`, node3=`103`, node4=`104`. + +### OS `⚠` — pending reboot + +A node has `/var/run/reboot-required`. Kured will reboot it inside the +next 02:00-06:00 London window (any day of the week). + +```bash +# Force a manual reboot inside the window (rare) +kubectl drain k8s-nodeX --delete-emptydir-data --ignore-daemonsets +ssh wizard@10.0.20.10X sudo systemctl reboot +``` + +### OS `✗` — kured not Running + +```bash +kubectl -n kured get pods +kubectl -n kured logs daemonset/kured --tail=100 +# Verify sentinel gate (kured-sentinel-gate DaemonSet writes /var/run/gated-reboot-required) +kubectl -n kured get pods -l name=kured-sentinel-gate +``` + +### K8s `→` — patch/minor available + +Detection ran, target identified, chain NOT started. 
This is normal +between Sun 12:00 UTC detection and the next Job chain. + +```bash +# Inspect Pushgateway state +kubectl -n monitoring exec deploy/prometheus-server -c prometheus-server -- \ + wget -qO- 'http://prometheus-prometheus-pushgateway:9091/metrics' | grep ^k8s_upgrade + +# Trigger a manual run of the detection CronJob +kubectl -n k8s-upgrade create job --from=cronjob/k8s-version-check manual-detect-$(date +%s) +``` + +### K8s `…` — in flight + +The Job chain is running. Watch its progress: + +```bash +kubectl -n k8s-upgrade get jobs --sort-by=.metadata.creationTimestamp +kubectl -n k8s-upgrade logs -l app=k8s-version-upgrade --tail=200 --prefix +``` + +### K8s `✗ stalled` — `K8sUpgradeStalled` would fire + +Chain in-flight >90m. The Job is most likely stuck on drain or a +pre-flight check. + +```bash +kubectl -n k8s-upgrade get jobs +kubectl -n k8s-upgrade describe job JOB_NAME +kubectl -n k8s-upgrade logs job/JOB_NAME --tail=300 + +# If you need to clear the in-flight flag (after diagnosing): +kubectl -n monitoring exec deploy/prometheus-server -c prometheus-server -- sh -c \ + "printf 'k8s_upgrade_in_flight 0\nk8s_upgrade_started_timestamp 0\n' | \ + wget -qO- --post-file=- 'http://prometheus-prometheus-pushgateway:9091/metrics/job/k8s-version-upgrade' \ + --header='Content-Type: text/plain'" +``` + +### K8s `✗ detection stale` — last detection >9 days + +```bash +kubectl -n k8s-upgrade get cronjob k8s-version-check +kubectl -n k8s-upgrade get jobs --sort-by=.metadata.creationTimestamp | tail -5 +``` + +If the CronJob hasn't fired on time, suspect: +- `suspend=true` on the CronJob (`var.enabled=false` in the + `k8s-version-upgrade` Terraform stack) +- Image-pull failure on the version-check pod +- Pushgateway scrape gone stale + +## Companion command-line flags + +```bash +bash infra/scripts/upgrade_state.sh # rendered table (default) +bash infra/scripts/upgrade_state.sh --json # machine output +bash infra/scripts/upgrade_state.sh --kubeconfig X # override kubeconfig 
+``` diff --git a/scripts/upgrade_state.sh b/scripts/upgrade_state.sh new file mode 100755 index 00000000..5f7fa7ab --- /dev/null +++ b/scripts/upgrade_state.sh @@ -0,0 +1,596 @@ +#!/usr/bin/env bash +# +# upgrade_state.sh — survey the three autonomous-upgrade pipelines. +# +# Companion to cluster_healthcheck.sh, surfaced via the /upgrade-state skill. +# Read-only by design — no --fix. +# +# The three pipelines: +# 1. Apps — Keel polls registries hourly and rolls Deployments tagged +# keel.sh/policy. Metrics on container :9300/metrics. +# 2. OS — unattended-upgrades patches in-release per node; kured +# reboots within a daily 02:00-06:00 London window. +# 3. K8s — k8s-version-check CronJob (Sun 12:00 UTC) detects new +# kubeadm patch/minor releases; Job-chain drains+upgrades +# node-by-node. Pushgateway holds k8s_upgrade_* gauges. +# +# Exit codes: 0 healthy, 1 attention warranted, 2 something stalled. + +set -euo pipefail + +# --- Colors --- +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +BLUE='\033[0;34m' +BOLD='\033[1m' +NC='\033[0m' + +# --- Globals --- +JSON=false +KUBECONFIG_PATH="${KUBECONFIG:-${HOME}/.kube/config}" +[[ -f "$KUBECONFIG_PATH" ]] || KUBECONFIG_PATH="/home/wizard/code/infra/config" +KUBECTL="" +NODES=(k8s-master:10.0.20.100 k8s-node1:10.0.20.101 k8s-node2:10.0.20.102 k8s-node3:10.0.20.103 k8s-node4:10.0.20.104) +SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no) +NOW_EPOCH=$(date -u +%s) +HIGHEST_EXIT=0 # 0 healthy, 1 attention, 2 stalled + +# Results — collectors fill these. 
# Apps (Keel) results.
APPS_STATUS_ICON=""; APPS_STATUS_TEXT=""
APPS_LAST_CHECK=""; APPS_NEXT=""; APPS_NOTES=""
APPS_ENROLLED=0; APPS_PENDING=0; APPS_UPDATES_LINE=""; APPS_ERROR_LINE=""

# OS (apt + kured) results.
OS_STATUS_ICON=""; OS_STATUS_TEXT=""
OS_LAST_CHECK=""; OS_NEXT=""; OS_NOTES=""
OS_DISTRO_SUMMARY=""; OS_KERNEL_SUMMARY=""
OS_PENDING_REBOOT_NODES=""; OS_HELD_DETAIL=""
OS_LAST_UU=""; OS_LAST_KURED=""

# K8s (version-check chain) results.
K8S_STATUS_ICON=""; K8S_STATUS_TEXT=""
K8S_LAST_CHECK=""; K8S_NEXT=""; K8S_NOTES=""
K8S_RUNNING=""; K8S_PATCH=""; K8S_MINOR=""
K8S_LAST_DETECT_LINE=""; K8S_IN_FLIGHT="no"; K8S_LAST_CHAIN=""

# --- Helpers ---

# Print a message (with escape sequences interpreted) unless --json is active.
log() { [[ "$JSON" == true ]] && return 0; echo -e "$*"; }

# Record the worst exit code seen so far: HIGHEST_EXIT only ever increases.
# Always returns 0 so it is safe to call under `set -e`.
raise_exit() {
  local n="$1"
  if [[ "$n" -gt "$HIGHEST_EXIT" ]]; then HIGHEST_EXIT="$n"; fi
  return 0
}

# Print the help text on stdout.
# NOTE(review): the "Usage:" line was garbled in the reviewed text and has
# been reconstructed from the option list below — confirm the wording.
usage() {
  cat <<EOF
Usage: $0 [--json] [--kubeconfig PATH]

Read-only audit of the three autonomous-upgrade pipelines (apps, OS, k8s).

  --json              machine-readable JSON
  --kubeconfig PATH   override kubeconfig

Exit codes: 0 healthy, 1 attention warranted, 2 something stalled.
EOF
}

# Parse command-line flags into JSON / KUBECONFIG_PATH, then build KUBECTL.
# Exits 0 after printing help, 1 on an unknown option or a missing value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --json)
        JSON=true
        shift
        ;;
      --kubeconfig)
        # Without this guard a trailing `--kubeconfig` trips `set -u` on $2
        # and the script dies with an unhelpful "unbound variable" message.
        if [[ -z "${2:-}" ]]; then
          echo "Option --kubeconfig requires a PATH argument" >&2
          exit 1
        fi
        KUBECONFIG_PATH="$2"
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done
  # KUBECTL is expanded unquoted by callers, so the kubeconfig path must not
  # contain whitespace.
  KUBECTL="kubectl --kubeconfig $KUBECONFIG_PATH"
}

# Prometheus query — Prometheus + reload + backup share a network namespace,
# so reaching localhost:9090 works from any of the three sidecars.
# Run an instant PromQL query against the in-cluster Prometheus and print the
# raw JSON response (empty output on failure). The query is percent-encoded
# first so spaces, braces and quotes survive being embedded in the wget URL.
prom_q() {
  local q="$1" encoded
  encoded=$(python3 -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.argv[1], safe=""))' "$q" 2>/dev/null) || encoded="$q"
  $KUBECTL -n monitoring exec deploy/prometheus-server -c prometheus-server -- \
    wget -qO- "http://localhost:9090/api/v1/query?query=${encoded}" 2>/dev/null || true
}

# Dump the Pushgateway's text exposition (empty output on failure).
pg_metrics() {
  $KUBECTL -n monitoring exec deploy/prometheus-server -c prometheus-server -- \
    wget -qO- "http://prometheus-prometheus-pushgateway:9091/metrics" 2>/dev/null || true
}

# Run a command on a node over SSH; stderr is dropped and failures are
# swallowed so an unreachable node yields empty output instead of aborting.
ssh_node() {
  local ip="$1"; shift
  ssh "${SSH_OPTS[@]}" "wizard@$ip" "$@" 2>/dev/null || true
}

# Render a duration in seconds as "<n>s|m|h|d ago". Negative inputs (clock
# skew between this host and the metric source) are clamped to 0.
human_age() {
  local secs="$1"
  if [[ "$secs" -lt 0 ]]; then secs=0; fi
  if [[ "$secs" -lt 60 ]]; then printf '%ds ago' "$secs"
  elif [[ "$secs" -lt 3600 ]]; then printf '%dm ago' $((secs/60))
  elif [[ "$secs" -lt 86400 ]]; then printf '%dh ago' $((secs/3600))
  else printf '%dd ago' $((secs/86400))
  fi
}

# Pushgateway emits floats and scientific notation — coerce to integer
# epoch seconds. Returns 0 if the input is empty / zero / unparseable.
to_epoch_int() {
  local v="${1:-}"
  if [[ -z "$v" || "$v" == "0" ]]; then echo 0; return; fi
  python3 -c "import sys; v=sys.argv[1]; print(int(float(v)))" "$v" 2>/dev/null || echo 0
}

# Read a Prometheus instant-query response on stdin and print the first
# sample's value as an integer (0 when the result is empty or unparseable).
_prom_first_int() {
  python3 -c '
import json, sys
try:
    r = json.load(sys.stdin)["data"]["result"]
    print(int(float(r[0]["value"][1])) if r else 0)
except Exception:
    print(0)
' 2>/dev/null || echo 0
}

# Join stdin lines with "; ". (`paste -sd "; "` does not do this: -d takes a
# LIST of single-character delimiters and cycles through them.)
_join_semicolon() {
  awk 'NR > 1 { printf "; " } { printf "%s", $0 } END { if (NR) print "" }'
}

# --- 1. Apps (Keel) ---
# Fills the APPS_* globals from Deployment annotations, Prometheus and the
# Keel pod logs, and raises the exit level when Keel is down (2) or has
# pending approvals / logged errors (1).
collect_apps() {
  local pending tracked enrolled updates_24h

  # Enrolled: count Deployments with keel.sh/policy != never (Keel itself
  # is policy=never). The annotation is what Keel watches.
  enrolled=$($KUBECTL get deploy -A -o json 2>/dev/null | python3 -c '
import json, sys
data = json.load(sys.stdin)
n = sum(1 for d in data["items"]
        if (d["metadata"].get("annotations") or {}).get("keel.sh/policy", "never") != "never")
print(n)
' 2>/dev/null || echo 0)
  APPS_ENROLLED="$enrolled"

  # Pending approvals (sum across Keel pods).
  pending=$(prom_q 'sum(pending_approvals)' | _prom_first_int)
  APPS_PENDING="$pending"

  # Tracked images — proxy for "is the scrape live?".
  tracked=$(prom_q 'count(count by (image) (registries_scanned_total))' | _prom_first_int)

  # Age of the most recent scrape sample of the Keel target.
  APPS_LAST_CHECK=$(prom_q 'time()-timestamp(up{job="kubernetes-pods",app="keel"})' | python3 -c '
import json, sys
try:
    r = json.load(sys.stdin)["data"]["result"]
    if not r: print("scrape not live")
    else:
        secs = int(float(r[0]["value"][1]))
        if secs < 60: print(f"{secs}s ago")
        elif secs < 3600: print(f"{secs//60}m ago")
        else: print(f"{secs//3600}h ago")
except Exception:
    print("?")
' 2>/dev/null || echo "?")

  # Recent updates: count Keel log lines that report a successful rollout.
  local log_24h
  log_24h=$($KUBECTL -n keel logs deploy/keel --since=24h --tail=2000 2>/dev/null || true)
  updates_24h=$(grep -cE 'update completed|successfully updated|deployment updated' <<<"$log_24h" || true)
  [[ -z "$updates_24h" ]] && updates_24h=0
  APPS_UPDATES_LINE="$updates_24h in last 24h (tracked images: $tracked)"

  # Error lines. Count them explicitly: `echo "" | wc -l` is 1, which used to
  # report "1 recent error(s)" when there were none.
  local error_lines error_count=0 newest_error=""
  error_lines=$(grep -iE '"level":"(error|fatal)"|level=error' <<<"$log_24h" || true)
  if [[ -n "$error_lines" ]]; then
    error_count=$(grep -c '' <<<"$error_lines" || true)
    newest_error=$(tail -n 1 <<<"$error_lines" | cut -c1-120)
    APPS_ERROR_LINE="$error_count error(s); newest: $newest_error"
  else
    APPS_ERROR_LINE="(none in last 24h)"
  fi

  # Keel pod state.
  local pod_status
  pod_status=$($KUBECTL -n keel get pods -l app=keel -o jsonpath='{.items[*].status.phase}' 2>/dev/null || true)

  if [[ "$pod_status" != *"Running"* ]]; then
    APPS_STATUS_ICON="✗"; APPS_STATUS_TEXT="down"
    APPS_NOTES="Keel pod not Running ($pod_status)"
    raise_exit 2
  elif [[ "$pending" -gt 0 || "$error_count" -gt 0 ]]; then
    APPS_STATUS_ICON="⚠"; APPS_STATUS_TEXT="attn"
    APPS_NOTES="$enrolled enrolled; $pending pending; $error_count recent error(s)"
    raise_exit 1
  else
    APPS_STATUS_ICON="✓"; APPS_STATUS_TEXT="healthy"
    APPS_NOTES="$enrolled enrolled, 0 pending, 0 errors"
  fi

  APPS_NEXT="rolling, hourly poll"
}

# --- 2. OS (apt + kured) ---
# Fills the OS_* globals from node info, an SSH fan-out to every entry in
# NODES, and the kured pods; raises the exit level when kured is unhealthy
# (2) or a node has held-with-bumps packages / a pending reboot (1).
#
# NOTE(review): the per-node result files and the aggregation loop were
# garbled in the reviewed text and are reconstructed here from the skill
# documentation ("held via apt-mark hold AND in apt list --upgradable,
# excluding k8s components and the kernel"). Confirm the exclusion list and
# the detail-line format against the original implementation.
collect_os() {
  local distros kernels distro_uniq kernel_uniq
  distros=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.osImage}{"\n"}{end}' 2>/dev/null || true)
  kernels=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.kernelVersion}{"\n"}{end}' 2>/dev/null || true)
  distro_uniq=$(echo "$distros" | sort -u | tr '\n' ',' | sed 's/,$//; s/,/, /g')
  kernel_uniq=$(echo "$kernels" | sort -u | tr '\n' ',' | sed 's/,$//; s/,/, /g')
  OS_DISTRO_SUMMARY="$distro_uniq"
  OS_KERNEL_SUMMARY="$kernel_uniq"

  # SSH fan-out — one background subshell per node, one file per probe.
  local tmpdir; tmpdir=$(mktemp -d)
  local entry name ip
  for entry in "${NODES[@]}"; do
    name="${entry%%:*}"; ip="${entry##*:}"
    (
      ssh_node "$ip" 'test -f /var/run/reboot-required && echo yes || echo no' > "$tmpdir/$name.reboot"
      ssh_node "$ip" 'apt-mark showhold 2>/dev/null' > "$tmpdir/$name.held"
      ssh_node "$ip" 'apt list --upgradable 2>/dev/null | tail -n +2' > "$tmpdir/$name.upgradable"
      ssh_node "$ip" 'tail -1 /var/log/unattended-upgrades/unattended-upgrades.log 2>/dev/null' > "$tmpdir/$name.uu_log"
    ) &
  done
  wait

  # Aggregate.
  local pending_reboots=() unreachable=() held_with_bumps_lines=()
  local newest_uu_ts=0 newest_uu_iso=""
  local reboot uu_log uu_ts epoch pkg candidate
  for entry in "${NODES[@]}"; do
    name="${entry%%:*}"
    [[ -f "$tmpdir/$name.reboot" ]] || continue

    # The probe prints yes/no on success, so anything else means SSH failed.
    reboot=$(cat "$tmpdir/$name.reboot" 2>/dev/null || true)
    case "$reboot" in
      yes) pending_reboots+=("$name") ;;
      no)  ;;
      *)   unreachable+=("$name") ;;
    esac

    # Held packages that also have an upgrade candidate.
    if [[ -f "$tmpdir/$name.held" && -f "$tmpdir/$name.upgradable" ]]; then
      while IFS= read -r pkg; do
        [[ -n "$pkg" ]] || continue
        case "$pkg" in
          # Owned by the K8s pipeline / handled by kured — not flagged here.
          kubeadm|kubelet|kubectl|kubernetes-cni|cri-tools|linux-*) continue ;;
        esac
        # `apt list` lines look like "name/suite version arch [...]".
        candidate=$(awk -F'[/ ]' -v p="$pkg" '!hit && $1 == p { print $3; hit = 1 }' "$tmpdir/$name.upgradable")
        if [[ -n "$candidate" ]]; then
          held_with_bumps_lines+=("$pkg → $candidate")
        fi
      done < "$tmpdir/$name.held"
    fi

    # Newest unattended-upgrades log timestamp across nodes.
    # NOTE(review): the log prints node-local time; it is treated as UTC
    # here, matching the "UTC" label in OS_LAST_UU — confirm node timezone.
    uu_log=$(cat "$tmpdir/$name.uu_log" 2>/dev/null || true)
    if [[ "$uu_log" =~ ^([0-9]{4}-[0-9]{2}-[0-9]{2}\ [0-9]{2}:[0-9]{2}:[0-9]{2}) ]]; then
      uu_ts="${BASH_REMATCH[1]}"
      epoch=$(date -u -d "$uu_ts" +%s 2>/dev/null || echo 0)
      if [[ "$epoch" -gt "$newest_uu_ts" ]]; then
        newest_uu_ts="$epoch"; newest_uu_iso="$uu_ts"
      fi
    fi
  done
  rm -rf "$tmpdir"

  OS_PENDING_REBOOT_NODES="${pending_reboots[*]:-}"
  if [[ ${#held_with_bumps_lines[@]} -gt 0 ]]; then
    OS_HELD_DETAIL=$(printf '%s\n' "${held_with_bumps_lines[@]}" | sort -u | _join_semicolon)
  fi

  if [[ "$newest_uu_ts" -gt 0 ]]; then
    local age=$((NOW_EPOCH - newest_uu_ts))
    OS_LAST_UU="$newest_uu_iso UTC ($(human_age "$age"))"
    OS_LAST_CHECK="$(human_age "$age") (uu daily)"
  else
    OS_LAST_UU="(no uu log accessible)"
    OS_LAST_CHECK="?"
  fi

  # Last kured reboot — approximated by the newest Ready-condition transition
  # across all nodes. lastTransitionTime is UTC ("...Z"), so the parsed value
  # must be made timezone-aware before .timestamp(); a naive datetime would be
  # interpreted in the local timezone of the machine running this script.
  local kured_raw kured_iso kured_node kured_ep kured_age
  kured_raw=$($KUBECTL get nodes -o json 2>/dev/null | python3 -c '
import json, sys
from datetime import datetime, timezone
data = json.load(sys.stdin)
best = (0, "", "")
for n in data["items"]:
    name = n["metadata"]["name"]
    for c in n["status"].get("conditions", []):
        if c["type"] == "Ready":
            dt = datetime.strptime(c["lastTransitionTime"], "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
            ep = int(dt.timestamp())
            if ep > best[0]:
                best = (ep, name, c["lastTransitionTime"])
print(f"{best[0]}|{best[1]}|{best[2]}")
' 2>/dev/null || echo "0||")
  kured_ep="${kured_raw%%|*}"
  kured_node=$(echo "$kured_raw" | cut -d'|' -f2)
  kured_iso=$(echo "$kured_raw" | cut -d'|' -f3)
  if [[ "$kured_ep" -gt 0 ]]; then
    kured_age=$((NOW_EPOCH - kured_ep))
    OS_LAST_KURED="$kured_iso ($kured_node, $(human_age "$kured_age"))"
  else
    OS_LAST_KURED="?"
  fi

  OS_NEXT="daily 02:00-06:00 London"

  # Kured pod health. An empty pod list (kubectl failure or no pods) counts
  # as one non-Running line, i.e. unhealthy.
  local kured_pods kured_unhealthy
  kured_pods=$($KUBECTL -n kured get pods -l app.kubernetes.io/name=kured -o jsonpath='{range .items[*]}{.status.phase}{"\n"}{end}' 2>/dev/null || true)
  kured_unhealthy=$(grep -cv '^Running$' <<<"$kured_pods" || true)

  local notes=()
  [[ -n "$OS_HELD_DETAIL" ]] && notes+=("held with bumps: $OS_HELD_DETAIL")
  [[ -n "$OS_PENDING_REBOOT_NODES" ]] && notes+=("pending reboot: $OS_PENDING_REBOOT_NODES")
  [[ ${#unreachable[@]} -gt 0 ]] && notes+=("ssh unreachable: ${unreachable[*]}")

  if [[ "$kured_unhealthy" -gt 0 ]]; then
    OS_STATUS_ICON="✗"; OS_STATUS_TEXT="kured down"
    OS_NOTES="kured pods not all Running"
    raise_exit 2
  elif [[ ${#notes[@]} -gt 0 ]]; then
    OS_STATUS_ICON="⚠"; OS_STATUS_TEXT="attn"
    OS_NOTES="${notes[*]}"
    raise_exit 1
  else
    OS_STATUS_ICON="✓"; OS_STATUS_TEXT="healthy"
    OS_NOTES="distros uniform; no held bumps; no pending reboots"
  fi
}

# --- 3.
# K8s (kubeadm/kubelet/kubectl) ---

# Read Pushgateway text exposition on stdin and print the `target` label of
# the first k8s_upgrade_available series of kind $1 whose gauge value is
# non-zero (the gauge is 1 when a newer release was detected). Prints nothing
# when no such series exists.
_k8s_available_target() {
  local kind="$1"
  awk -v want="kind=\"${kind}\"" '
    !hit && index($0, "k8s_upgrade_available{") == 1 && index($0, want) && ($NF + 0) > 0 {
      if (match($0, /target="[^"]*"/)) {
        # RSTART points at the "t" of target="; skip those 8 characters and
        # drop the closing quote.
        print substr($0, RSTART + 8, RLENGTH - 9)
        hit = 1
      }
    }'
}

# Read Pushgateway text exposition on stdin and print the value of the first
# sample named $1. Pushgateway emits these with `{instance="",job="..."}`
# labels, so both the bare and the labelled form are matched.
_pg_first_value() {
  awk -v name="$1" '
    !hit && ($1 == name || index($1, name "{") == 1) { print $2; hit = 1 }'
}

# Fills the K8S_* globals from kubelet versions and the k8s_upgrade_* gauges
# in the Pushgateway, and raises the exit level: 2 when the chain is stalled
# or detection is stale, 1 when a chain is running or an upgrade is available.
collect_k8s() {
  local kver_list kver_uniq node_count uniq_count running_ver
  local metrics target_patch target_minor last_run in_flight started

  kver_list=$($KUBECTL get nodes -o jsonpath='{range .items[*]}{.status.nodeInfo.kubeletVersion}{"\n"}{end}' 2>/dev/null || true)
  kver_list=$(sed '/^[[:space:]]*$/d' <<<"$kver_list")
  kver_uniq=$(sort -u <<<"$kver_list")
  node_count=$(grep -c . <<<"$kver_list" || true)
  uniq_count=$(grep -c . <<<"$kver_uniq" || true)
  running_ver=$(head -n 1 <<<"$kver_uniq")

  if [[ "$node_count" -eq 0 ]]; then
    # kubectl failed or returned nothing — say so instead of " across 1/1 nodes".
    K8S_RUNNING="unknown (no node versions returned)"
    running_ver="unknown"
  elif [[ "$uniq_count" -eq 1 ]]; then
    K8S_RUNNING="$kver_uniq across $node_count/$node_count nodes"
  else
    # Join with ", " explicitly: `paste -sd", "` cycles through "," and " "
    # as separate delimiters and would print "a,b c".
    K8S_RUNNING="mixed: $(awk 'NR > 1 { printf ", " } { printf "%s", $0 }' <<<"$kver_uniq")"
  fi

  metrics=$(pg_metrics)
  # All five series may legitimately be absent (cluster never ran the upgrade
  # chain, kind="minor" not detected, etc.); the helpers print nothing then.
  target_patch=$(_k8s_available_target patch <<<"$metrics")
  target_minor=$(_k8s_available_target minor <<<"$metrics")
  last_run=$(_pg_first_value k8s_version_check_last_run_timestamp <<<"$metrics")
  in_flight=$(_pg_first_value k8s_upgrade_in_flight <<<"$metrics")
  started=$(_pg_first_value k8s_upgrade_started_timestamp <<<"$metrics")

  # Pushgateway timestamps come back in scientific notation
  # (e.g. 1.779052159e+09) — convert to plain integer seconds.
  local last_run_int started_int
  last_run_int=$(to_epoch_int "$last_run")
  started_int=$(to_epoch_int "$started")

  # Detection summary. A missing timestamp is treated as "infinitely old" so
  # the staleness check below trips.
  local last_run_age=999999999
  if [[ "$last_run_int" -gt 0 ]]; then
    last_run_age=$((NOW_EPOCH - last_run_int))
    local age_text; age_text=$(human_age "$last_run_age")
    K8S_LAST_CHECK="$age_text (Sun cron)"
    if [[ -n "$target_patch" ]]; then
      K8S_LAST_DETECT_LINE="last run $age_text: available v$target_patch (patch)"
    elif [[ -n "$target_minor" ]]; then
      K8S_LAST_DETECT_LINE="last run $age_text: available v$target_minor (minor)"
    else
      K8S_LAST_DETECT_LINE="last run $age_text: no upgrade available"
    fi
  else
    K8S_LAST_CHECK="(metric missing)"
    K8S_LAST_DETECT_LINE="(no k8s_version_check_last_run_timestamp in Pushgateway)"
  fi
  K8S_PATCH="${target_patch:-none}"
  K8S_MINOR="${target_minor:-none}"

  # In-flight / last chain.
  local chain_age=0
  [[ "$started_int" -gt 0 ]] && chain_age=$((NOW_EPOCH - started_int))
  if [[ "${in_flight:-0}" == "1" ]]; then
    K8S_IN_FLIGHT="yes"
    K8S_LAST_CHAIN="in-flight (started $(human_age "$chain_age"))"
  else
    K8S_IN_FLIGHT="no"
    if [[ "$started_int" -gt 0 ]]; then
      K8S_LAST_CHAIN="$(human_age "$chain_age")"
    else
      K8S_LAST_CHAIN="never (or zeroed)"
    fi
  fi

  K8S_NEXT="$(next_sunday_noon_utc)"

  # Status logic. The script flags a chain as stalled after 5400s (90m)
  # in flight.
  local stalled=0
  if [[ "${in_flight:-0}" == "1" && "$started_int" -gt 0 && "$chain_age" -gt 5400 ]]; then
    stalled=1
  fi

  if [[ "$stalled" == "1" ]]; then
    K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="stalled"
    K8S_NOTES="K8sUpgradeStalled would fire — chain in-flight >90m"
    raise_exit 2
  elif [[ "$last_run_age" -gt $((9*86400)) ]]; then
    K8S_STATUS_ICON="✗"; K8S_STATUS_TEXT="detection stale"
    K8S_NOTES="last detection >9d ago"
    raise_exit 2
  elif [[ "${in_flight:-0}" == "1" ]]; then
    K8S_STATUS_ICON="…"; K8S_STATUS_TEXT="in-flight"
    K8S_NOTES="upgrade chain running"
    raise_exit 1
  elif [[ -n "$target_patch" ]]; then
    K8S_STATUS_ICON="→"; K8S_STATUS_TEXT="$target_patch"
    K8S_NOTES="running $running_ver → v$target_patch (patch) available"
    raise_exit 1
  elif [[ -n "$target_minor" ]]; then
    K8S_STATUS_ICON="→"; K8S_STATUS_TEXT="$target_minor"
    K8S_NOTES="running $running_ver → v$target_minor (minor) available"
    raise_exit 1
  else
    K8S_STATUS_ICON="✓"; K8S_STATUS_TEXT="current"
    K8S_NOTES="running $running_ver, nothing newer"
  fi
}

# Next Sun 12:00 UTC — date math via GNU date, no croniter.
# Optional $1: reference time as epoch seconds (defaults to now). A single
# snapshot is used for every field so the result cannot straddle midnight.
next_sunday_noon_utc() {
  local ref="${1:-$(date -u +%s)}"
  local dow hour days_until
  # date %u: Mon=1..Sun=7.
  dow=$(date -u -d "@$ref" +%u)
  hour=$(date -u -d "@$ref" +%H)
  days_until=$(( (7 - dow) % 7 ))
  # On Sunday, "next" is today until 12:00 UTC and a week later afterwards.
  # 10# forces base 10: a bare "08"/"09" is an invalid octal constant in
  # bash arithmetic and made the old comparison fail on Sunday mornings.
  if [[ "$dow" -eq 7 ]] && (( 10#$hour >= 12 )); then
    days_until=7
  fi
  printf 'Sun %s\n' "$(date -u -d "@$((ref + days_until * 86400))" +'%Y-%m-%d 12:00 UTC')"
}

# --- Renderers ---
# The table uses `column -t` so we don't have to compute visual widths
# manually (the status icons are multi-byte UTF-8 and ANSI escapes don't
# play nice with `printf %-Xs`). Trade-off: no in-cell colour, but the
# icon character already carries the signal.
+render_table() { + echo + printf "${BOLD}Upgrade state — %s${NC}\n" "$(date -u +'%Y-%m-%d %H:%M UTC')" + echo + { + echo "Layer|Status|Last check|Next upgrade|Notes" + echo "-----|------|----------|------------|-----" + printf 'Apps|%s %s|%s|%s|%s\n' "$APPS_STATUS_ICON" "$APPS_STATUS_TEXT" "$APPS_LAST_CHECK" "$APPS_NEXT" "$APPS_NOTES" + printf 'OS |%s %s|%s|%s|%s\n' "$OS_STATUS_ICON" "$OS_STATUS_TEXT" "$OS_LAST_CHECK" "$OS_NEXT" "$OS_NOTES" + printf 'K8s |%s %s|%s|%s|%s\n' "$K8S_STATUS_ICON" "$K8S_STATUS_TEXT" "$K8S_LAST_CHECK" "$K8S_NEXT" "$K8S_NOTES" + } | column -t -s '|' -o ' | ' + + echo + printf "${BOLD}--- Apps (Keel) ---${NC}\n" + echo "Enrolled deployments: $APPS_ENROLLED" + echo "Recent rollouts: $APPS_UPDATES_LINE" + echo "Pending approvals: $APPS_PENDING" + echo "Last Keel error: $APPS_ERROR_LINE" + + echo + printf "${BOLD}--- OS (apt + kured) ---${NC}\n" + echo "Ubuntu per node: $OS_DISTRO_SUMMARY" + echo "Kernel per node: $OS_KERNEL_SUMMARY" + echo "Pending reboot: ${OS_PENDING_REBOOT_NODES:-none}" + echo "Held packages with upstream bumps: ${OS_HELD_DETAIL:-none (excluding k8s components)}" + echo "Last uu run (newest across nodes): $OS_LAST_UU" + echo "Last kured reboot (newest Ready transition): $OS_LAST_KURED" + echo "Next kured window: $OS_NEXT" + + echo + printf "${BOLD}--- K8s (kubeadm/kubelet/kubectl) ---${NC}\n" + echo "Running: $K8S_RUNNING" + echo "Latest patch (apt): ${K8S_PATCH}" + echo "Next minor available: ${K8S_MINOR}" + echo "Detection: $K8S_LAST_DETECT_LINE" + echo "In-flight: $K8S_IN_FLIGHT | Last chain start: $K8S_LAST_CHAIN" + echo "Next detection: $K8S_NEXT" + echo +} + +render_json() { + # Pipe values into Python via env vars so we don't need to worry about + # embedded quotes/backslashes in error lines. 
+ APPS_STATUS_ICON="$APPS_STATUS_ICON" APPS_STATUS_TEXT="$APPS_STATUS_TEXT" \ + APPS_LAST_CHECK="$APPS_LAST_CHECK" APPS_NEXT="$APPS_NEXT" APPS_NOTES="$APPS_NOTES" \ + APPS_ENROLLED="$APPS_ENROLLED" APPS_PENDING="$APPS_PENDING" \ + APPS_UPDATES_LINE="$APPS_UPDATES_LINE" APPS_ERROR_LINE="$APPS_ERROR_LINE" \ + OS_STATUS_ICON="$OS_STATUS_ICON" OS_STATUS_TEXT="$OS_STATUS_TEXT" \ + OS_LAST_CHECK="$OS_LAST_CHECK" OS_NEXT="$OS_NEXT" OS_NOTES="$OS_NOTES" \ + OS_DISTRO_SUMMARY="$OS_DISTRO_SUMMARY" OS_KERNEL_SUMMARY="$OS_KERNEL_SUMMARY" \ + OS_PENDING_REBOOT_NODES="$OS_PENDING_REBOOT_NODES" OS_HELD_DETAIL="$OS_HELD_DETAIL" \ + OS_LAST_UU="$OS_LAST_UU" OS_LAST_KURED="$OS_LAST_KURED" \ + K8S_STATUS_ICON="$K8S_STATUS_ICON" K8S_STATUS_TEXT="$K8S_STATUS_TEXT" \ + K8S_LAST_CHECK="$K8S_LAST_CHECK" K8S_NEXT="$K8S_NEXT" K8S_NOTES="$K8S_NOTES" \ + K8S_RUNNING="$K8S_RUNNING" K8S_PATCH="$K8S_PATCH" K8S_MINOR="$K8S_MINOR" \ + K8S_LAST_DETECT_LINE="$K8S_LAST_DETECT_LINE" K8S_IN_FLIGHT="$K8S_IN_FLIGHT" K8S_LAST_CHAIN="$K8S_LAST_CHAIN" \ + HIGHEST_EXIT="$HIGHEST_EXIT" \ + python3 -c ' +import json, os +from datetime import datetime, timezone +def env(k): return os.environ.get(k, "") +out = { + "as_of_utc": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "highest_exit": int(env("HIGHEST_EXIT")), + "apps": { + "status": env("APPS_STATUS_ICON"), + "status_text": env("APPS_STATUS_TEXT"), + "last_check": env("APPS_LAST_CHECK"), + "next_upgrade": env("APPS_NEXT"), + "notes": env("APPS_NOTES"), + "enrolled": int(env("APPS_ENROLLED") or 0), + "pending_approvals": int(env("APPS_PENDING") or 0), + "updates_line": env("APPS_UPDATES_LINE"), + "errors_line": env("APPS_ERROR_LINE"), + }, + "os": { + "status": env("OS_STATUS_ICON"), + "status_text": env("OS_STATUS_TEXT"), + "last_check": env("OS_LAST_CHECK"), + "next_upgrade": env("OS_NEXT"), + "notes": env("OS_NOTES"), + "distros": env("OS_DISTRO_SUMMARY"), + "kernels": env("OS_KERNEL_SUMMARY"), + "pending_reboot_nodes": 
env("OS_PENDING_REBOOT_NODES"), + "held_with_bumps": env("OS_HELD_DETAIL"), + "last_uu_run": env("OS_LAST_UU"), + "last_kured_reboot": env("OS_LAST_KURED"), + }, + "k8s": { + "status": env("K8S_STATUS_ICON"), + "status_text": env("K8S_STATUS_TEXT"), + "last_check": env("K8S_LAST_CHECK"), + "next_upgrade": env("K8S_NEXT"), + "notes": env("K8S_NOTES"), + "running": env("K8S_RUNNING"), + "patch_target": env("K8S_PATCH"), + "minor_target": env("K8S_MINOR"), + "last_detection_line": env("K8S_LAST_DETECT_LINE"), + "in_flight": env("K8S_IN_FLIGHT"), + "last_chain": env("K8S_LAST_CHAIN"), + }, +} +print(json.dumps(out, indent=2)) +' +} + +main() { + parse_args "$@" + collect_apps + collect_os + collect_k8s + if [[ "$JSON" == true ]]; then + render_json + else + render_table + fi + exit "$HIGHEST_EXIT" +} + +main "$@" diff --git a/stacks/keel/main.tf b/stacks/keel/main.tf index 30a65f7b..7a794d8f 100644 --- a/stacks/keel/main.tf +++ b/stacks/keel/main.tf @@ -46,6 +46,16 @@ resource "helm_release" "keel" { atomic = true values = [yamlencode({ + # Prometheus pod-annotation scrape — picks up Keel-specific metrics + # (pending_approvals, poll_trigger_tracked_images, registries_scanned_total{image,registry}) + # on container port 9300 /metrics. The cluster's `kubernetes-pods` + # Prometheus job keys on these annotations. Used by + # infra/scripts/upgrade_state.sh (the /upgrade-state skill). + podAnnotations = { + "prometheus.io/scrape" = "true" + "prometheus.io/port" = "9300" + "prometheus.io/path" = "/metrics" + } polling = { enabled = true # Default poll cadence for workloads that don't override per-Deployment