From 3d43d96a5e671670451eea293d23aab0723c1cc7 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Mon, 18 May 2026 18:29:08 +0000 Subject: [PATCH] k8s-version-upgrade: switch detection cron from weekly to daily MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Was `0 12 * * 0` (Sun 12:00 UTC) — patch releases waited up to 6 days before the chain picked them up. Now `0 12 * * *` (daily 12:00 UTC, still outside kured's 02:00-06:00 London window). Concurrency is bounded by Forbid + deterministic job-name idempotency (the detection job exits early if a preflight Job for the same target already exists), so back-to-back days can't pile up parallel runs. - stacks/k8s-version-upgrade/main.tf: var.schedule default + rationale comment - scripts/upgrade_state.sh: rename next_sunday_noon_utc -> next_daily_noon_utc (now returns "Tue 2026-05-19 12:00 UTC" form); change "(Sun cron)" label to "(daily cron)" - .claude/skills/upgrade-state/SKILL.md: cadence column + frontmatter Co-Authored-By: Claude Opus 4.7 --- .claude/skills/upgrade-state/SKILL.md | 9 +++++---- scripts/upgrade_state.sh | 27 +++++++++++---------------- stacks/k8s-version-upgrade/main.tf | 7 ++++++- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/.claude/skills/upgrade-state/SKILL.md b/.claude/skills/upgrade-state/SKILL.md index 6cf23084..a2027a50 100644 --- a/.claude/skills/upgrade-state/SKILL.md +++ b/.claude/skills/upgrade-state/SKILL.md @@ -8,7 +8,7 @@ description: | (2) User asks "what's pending upgrade" or "what's the upgrade state", (3) User asks if Keel / kured / k8s-version-check is healthy, (4) User asks about kept-back / held packages or pending reboots, - (5) Before the Sunday `k8s-version-check` CronJob fires (weekly survey). + (5) Periodic survey before the next `k8s-version-check` daily run. Read-only — no `--fix`. Exits 0 healthy / 1 attention / 2 stalled. 
author: Claude Code version: 1.0.0 @@ -51,7 +51,7 @@ Exit codes: `0` healthy, `1` attention warranted, `2` stalled / broken. |---|---|---|---| | **Apps** | Keel polls every watched Deployment's container registry; rolls on new digest | hourly | Prom (`pending_approvals`, `registries_scanned_total`), Keel pod logs | | **OS** | `unattended-upgrades` in-release patching; `kured` reboots when `/var/run/reboot-required` is set | daily 02:00-06:00 London | SSH fan-out to all 5 nodes | -| **K8s** | `k8s-version-check` CronJob detects new kubeadm patch/minor; spawns the Job-chain that drains+upgrades node-by-node | Sun 12:00 UTC | Pushgateway (`k8s_upgrade_*`), `kubectl get nodes` | +| **K8s** | `k8s-version-check` CronJob detects new kubeadm patch/minor; spawns the Job-chain that drains+upgrades node-by-node | daily 12:00 UTC | Pushgateway (`k8s_upgrade_*`), `kubectl get nodes` | The K8s pipeline pushes a small set of gauges to the Prometheus Pushgateway (`prometheus-prometheus-pushgateway.monitoring:9091`): @@ -138,8 +138,9 @@ kubectl -n kured get pods -l name=kured-sentinel-gate ### K8s `→` — patch/minor available -Detection ran, target identified, chain NOT started. This is normal -between Sun 12:00 UTC detection and the next Job chain. +Detection ran, target identified, chain NOT started. The chain spawns +on the same daily detection cycle — typically within ~24h of the +target first being detected. 
```bash # Inspect Pushgateway state diff --git a/scripts/upgrade_state.sh b/scripts/upgrade_state.sh index 5f7fa7ab..003996f0 100755 --- a/scripts/upgrade_state.sh +++ b/scripts/upgrade_state.sh @@ -384,7 +384,7 @@ collect_k8s() { if [[ "$last_run_int" -gt 0 ]]; then local age=$((NOW_EPOCH - last_run_int)) - K8S_LAST_CHECK="$(human_age "$age") (Sun cron)" + K8S_LAST_CHECK="$(human_age "$age") (daily cron)" if [[ -n "$target_patch" ]]; then K8S_LAST_DETECT_LINE="last run $(human_age "$age"): available v$target_patch (patch)" elif [[ -n "$target_minor" ]]; then @@ -415,7 +415,7 @@ collect_k8s() { fi fi - K8S_NEXT="$(next_sunday_noon_utc)" + K8S_NEXT="$(next_daily_noon_utc)" # Status logic. local stalled=0 @@ -453,20 +453,15 @@ collect_k8s() { fi } -# Next Sun 12:00 UTC — pure bash date math, no croniter. -next_sunday_noon_utc() { - local now_iso target_iso - now_iso=$(date -u +%FT%TZ) - # date %u: Mon=1..Sun=7. Sun=7. - local dow; dow=$(date -u +%u) - local days_until=$(( (7 - dow) % 7 )) - # If today is Sunday and it's before 12:00 UTC, "next" is today. - if [[ "$dow" == "7" ]]; then - local hr; hr=$(date -u +%H) - [[ "$hr" -lt 12 ]] && days_until=0 || days_until=7 - fi - target_iso=$(date -u -d "+$days_until days" +"%Y-%m-%d 12:00 UTC") - echo "Sun $target_iso" +# Next daily 12:00 UTC — pure bash date math, no croniter. Schedule was +# weekly Sunday until 2026-05-18; now `0 12 * * *` in the +# k8s-version-upgrade stack. If we're still before today's 12:00 UTC, +# the next run is today; otherwise it's tomorrow. 
next_daily_noon_utc() {
    # Usage: next_daily_noon_utc [now_epoch]
    #
    # Prints the next firing of the daily 12:00 UTC detection cron in the
    # form "Tue 2026-05-19 12:00 UTC".
    #
    # Arguments: $1 - optional reference time in Unix epoch seconds;
    #                 defaults to the current time. Lets callers and
    #                 tests pin the clock.
    # Outputs:   the formatted timestamp on stdout.
    # Returns:   0 on success, 2 on a non-numeric epoch, otherwise the
    #            exit status of `date`.
    # Requires GNU date (`-d @epoch`), as the rest of this function always has.
    local now_epoch secs_into_day target_epoch

    now_epoch=${1:-$(date -u +%s)} || return
    if [[ ! "$now_epoch" =~ ^[0-9]+$ ]]; then
        printf 'next_daily_noon_utc: invalid epoch: %s\n' "$now_epoch" >&2
        return 2
    fi
    # Force base 10 so a zero-padded argument is never parsed as octal.
    now_epoch=$(( 10#$now_epoch ))

    # Work in epoch seconds rather than comparing `date +%H`: the
    # zero-padded hours "08" and "09" are invalid octal constants inside
    # [[ ... -lt ... ]], so that comparison errored out and wrongly
    # reported tomorrow's run between 08:00 and 09:59 UTC. Reading the
    # clock once also prevents two `date` calls from straddling the
    # noon or midnight boundary.
    secs_into_day=$(( now_epoch % 86400 ))
    target_epoch=$(( now_epoch - secs_into_day + 43200 ))

    # At or past today's 12:00 UTC the next run is tomorrow.
    if (( secs_into_day >= 43200 )); then
        target_epoch=$(( target_epoch + 86400 ))
    fi

    date -u -d "@$target_epoch" +"%a %Y-%m-%d 12:00 UTC"
}