k8s-upgrade: nightly Slack report monitor + scope chain-failed alert to phases

Adds a daily visibility layer so every night's autonomous-upgrade outcome is reviewable at a glance during the upgrade-cleanup window (Viktor: "track every night's upgrade for the next 7 days; clean up all bugs and blockers").

Last night (2026-06-20) confirmed BOTH prior fixes work in production: the detector resolved target 1.35.6 (k8s_upgrade_available) and the compat gate correctly REFUSED it (k8s_upgrade_blocked=1 -> K8sUpgradeBlocked) because ESO v0.12 (<=1.31) and kyverno v1.16 (<=1.34) don't support 1.35.

What's here:

- CronJob k8s-upgrade-nightly-report (06:07 UTC) -> one Slack summary/morning: running version, detector freshness, detected target, outcome (no-op / blocked+live reasons / upgraded / in-progress / detector-stale), recent jobs. Read-only: reads Pushgateway gauges + live nodes/jobs, re-runs compat-gate.py for fresh blockers; reuses the chain SA + slack_webhook + scripts ConfigMap. Pure helpers unit-tested (test_nightly_report.py, 8 cases incl. a real v-prefix bug TDD caught). Verified end-to-end in-cluster (posted to Slack).
- K8sUpgradeChainJobFailed regex scoped from `k8s-upgrade-.*` to `k8s-upgrade-(preflight|master|worker|postflight)-.*` so the new report job (or any future helper) can't false-trip the chain-wedged alarm.

Manual state repair (no git artifact): imported the orphaned `alert-digest` CronJob into the monitoring stack state (`tg import module.monitoring.kubernetes_cron_job_v1.alert_digest monitoring/alert-digest`). Root cause: when alert_digest was added (2026-06-12) the apply recorded its ConfigMap + Secret but not the CronJob, so every full monitoring apply since has failed with `cronjobs.batch "alert-digest" already exists` (Woodpecker pipeline 298 today) — surviving only via targeted prometheus applies. Now in state, so monitoring CI applies cleanly again.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-21 16:57:44 +00:00 · 2026-06-21 16:57:44 +00:00 · ead876ec65
commit ead876ec65
parent 7270e2be3b
6 changed files with 431 additions and 3 deletions
--- a/stacks/k8s-version-upgrade/main.tf
+++ b/stacks/k8s-version-upgrade/main.tf
@ -41,6 +41,15 @@ variable "enabled" {
  default = true
 }

+# Cron schedule for the nightly upgrade-report CronJob. Default is 06:07 UTC
+# (07:07 London during BST) — safely after the 23:00 chain has finished (worst
+# case ~02:00) and before the 08:00 London alert-digest, so the morning Slack
+# skim shows last night's upgrade outcome + any live blocker. Once/day; read-only.
+variable "report_schedule" {
+  type    = string
+  default = "7 6 * * *" # cron fields: minute 7, hour 6, every day
+}
+
 # Mirrors `local.image_tag` in stacks/claude-agent-service/main.tf — bump
 # in lockstep with claude-agent-service rebuilds. The image ships kubectl,
 # ssh-client, curl, jq, envsubst — everything the upgrade Jobs need.
@ -301,6 +310,7 @@ resource "kubernetes_config_map" "k8s_upgrade_scripts" {
    "update_k8s.sh"     = file("${path.module}/../../scripts/update_k8s.sh")
    "compat-gate.py"    = file("${path.module}/scripts/compat-gate.py")
    "addon-compat.json" = file("${path.module}/scripts/addon-compat.json")
+    "nightly-report.py" = file("${path.module}/scripts/nightly-report.py")
  }
 }

@ -548,6 +558,98 @@ resource "kubernetes_cron_job_v1" "k8s_version_check" {
  }
 }

+# --- Nightly upgrade report ---
+#
+# Each morning, after the 23:00 chain has finished, posts ONE concise Slack
+# report of last night's upgrade outcome (no-op / blocked+reasons / upgraded /
+# in-progress) so the autonomous upgrader's nightly result — and any live
+# blocker — is visible at a glance. Read-only: reads the chain's Pushgateway
+# gauges + live nodes/jobs and re-runs compat-gate.py for fresh blocker reasons.
+# Reuses the same SA, creds secret (slack_webhook), and scripts ConfigMap as the
+# chain. Logic + unit tests: scripts/nightly-report.py, scripts/test_nightly_report.py.
+resource "kubernetes_cron_job_v1" "k8s_upgrade_nightly_report" {
+  metadata {
+    name      = "k8s-upgrade-nightly-report"
+    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
+    labels    = local.labels
+  }
+  spec {
+    schedule                      = var.report_schedule # cron expression; see variable "report_schedule"
+    concurrency_policy            = "Forbid" # skip a new run while the previous one is still active
+    successful_jobs_history_limit = 3
+    failed_jobs_history_limit     = 3
+    starting_deadline_seconds     = 600 # a missed run may still start up to 10 min late
+    suspend                       = !var.enabled # suspended whenever var.enabled is false
+    job_template {
+      metadata {
+        labels = local.labels
+      }
+      spec {
+        backoff_limit              = 1 # at most one retry before the Job is marked failed
+        ttl_seconds_after_finished = 86400 # finished Jobs are cleaned up after 24 h
+        template {
+          metadata {
+            labels = local.labels
+          }
+          spec {
+            service_account_name = kubernetes_service_account.k8s_upgrade_job.metadata[0].name
+            restart_policy       = "Never" # retries are handled by backoff_limit, not in-place restarts
+            image_pull_secrets {
+              name = "registry-credentials"
+            }
+            volume {
+              name = "creds"
+              secret {
+                secret_name  = "k8s-upgrade-creds"
+                default_mode = "0444" # files are read-only for every user
+              }
+            }
+            volume {
+              name = "scripts"
+              config_map {
+                name         = kubernetes_config_map.k8s_upgrade_scripts.metadata[0].name
+                default_mode = "0755" # scripts are mounted executable
+              }
+            }
+            container {
+              name    = "report"
+              image   = local.image
+              command = ["python3", "/scripts/nightly-report.py"] # script comes from the "scripts" volume below
+              env {
+                name  = "HOME"
+                value = "/tmp" # NOTE(review): presumably gives tools a writable home dir — confirm
+              }
+              volume_mount {
+                name       = "creds"
+                mount_path = "/secrets/k8s-upgrade"
+                read_only  = true
+              }
+              volume_mount {
+                name       = "scripts"
+                mount_path = "/scripts"
+                read_only  = true
+              }
+              resources {
+                requests = {
+                  cpu    = "50m"
+                  memory = "128Mi"
+                }
+                limits = {
+                  memory = "256Mi" # only memory is capped; no CPU limit is set
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
+    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # keep Terraform from reverting that mutation
+  }
+}
+
 # CI retrigger 2026-05-16T13:42:57+00:00 — bulk enrollment apply (pipeline #689 killed)
 # CI retrigger v2 2026-05-16T13:46:35+00:00