k8s-upgrade: nightly Slack report monitor + scope chain-failed alert to phases
All checks were successful
ci/woodpecker/push/default Pipeline was successful
All checks were successful
ci/woodpecker/push/default Pipeline was successful
Adds a daily visibility layer so every night's autonomous-upgrade outcome is reviewable at a glance during the upgrade-cleanup window (Viktor: "track every night's upgrade for the next 7 days; clean up all bugs and blockers").

Last night (2026-06-20) confirmed BOTH prior fixes work in production: the detector resolved target 1.35.6 (k8s_upgrade_available) and the compat gate correctly REFUSED it (k8s_upgrade_blocked=1 -> K8sUpgradeBlocked) because ESO v0.12 (<=1.31) and kyverno v1.16 (<=1.34) don't support 1.35.

What's here:

- CronJob k8s-upgrade-nightly-report (06:07 UTC) -> one Slack summary/morning: running version, detector freshness, detected target, outcome (no-op / blocked+live reasons / upgraded / in-progress / detector-stale), recent jobs. Read-only: reads Pushgateway gauges + live nodes/jobs, re-runs compat-gate.py for fresh blockers; reuses the chain SA + slack_webhook + scripts ConfigMap. Pure helpers unit-tested (test_nightly_report.py, 8 cases incl. a real v-prefix bug TDD caught). Verified end-to-end in-cluster (posted to Slack).
- K8sUpgradeChainJobFailed regex scoped from `k8s-upgrade-.*` to `k8s-upgrade-(preflight|master|worker|postflight)-.*` so the new report job (or any future helper) can't false-trip the chain-wedged alarm.

Manual state repair (no git artifact): imported the orphaned `alert-digest` CronJob into the monitoring stack state (`tg import module.monitoring.kubernetes_cron_job_v1.alert_digest monitoring/alert-digest`). Root cause: when alert_digest was added (2026-06-12) the apply recorded its ConfigMap + Secret but not the CronJob, so every full monitoring apply since has failed with `cronjobs.batch "alert-digest" already exists` (Woodpecker pipeline 298 today) — surviving only via targeted prometheus applies. Now in state, so monitoring CI applies cleanly again.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
7270e2be3b
commit
ead876ec65
6 changed files with 431 additions and 3 deletions
|
|
@ -41,6 +41,15 @@ variable "enabled" {
|
|||
default = true
|
||||
}
|
||||
|
||||
# Nightly upgrade-report CronJob schedule. 06:07 UTC (07:07 London during BST,
# 06:07 during GMT) — safely after the 23:00 chain has finished (worst case
# ~02:00) and before the 08:00 London alert-digest, so the morning Slack skim
# shows last night's upgrade outcome + any live blocker. Posts once/day;
# read-only.
variable "report_schedule" {
  description = "Cron expression used as the schedule of the k8s-upgrade-nightly-report CronJob."
  type        = string
  default     = "7 6 * * *"
}
|
||||
|
||||
# Mirrors `local.image_tag` in stacks/claude-agent-service/main.tf — bump
|
||||
# in lockstep with claude-agent-service rebuilds. The image ships kubectl,
|
||||
# ssh-client, curl, jq, envsubst — everything the upgrade Jobs need.
|
||||
|
|
@ -301,6 +310,7 @@ resource "kubernetes_config_map" "k8s_upgrade_scripts" {
|
|||
"update_k8s.sh" = file("${path.module}/../../scripts/update_k8s.sh")
|
||||
"compat-gate.py" = file("${path.module}/scripts/compat-gate.py")
|
||||
"addon-compat.json" = file("${path.module}/scripts/addon-compat.json")
|
||||
"nightly-report.py" = file("${path.module}/scripts/nightly-report.py")
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -548,6 +558,98 @@ resource "kubernetes_cron_job_v1" "k8s_version_check" {
|
|||
}
|
||||
}
|
||||
|
||||
# --- Nightly upgrade report ---
#
# Each morning, after the 23:00 chain has finished, posts ONE concise Slack
# report of last night's upgrade outcome (no-op / blocked+reasons / upgraded /
# in-progress) so the autonomous upgrader's nightly result — and any live
# blocker — is visible at a glance. Read-only: reads the chain's Pushgateway
# gauges + live nodes/jobs and re-runs compat-gate.py for fresh blocker reasons.
# Reuses the same SA, creds secret (slack_webhook), and scripts ConfigMap as the
# chain. Logic + unit tests: scripts/nightly-report.py, scripts/test_nightly_report.py.
resource "kubernetes_cron_job_v1" "k8s_upgrade_nightly_report" {
  metadata {
    name      = "k8s-upgrade-nightly-report"
    namespace = kubernetes_namespace.k8s_upgrade.metadata[0].name
    labels    = local.labels
  }

  spec {
    schedule = var.report_schedule
    # Disabling the module suspends the CronJob instead of deleting it.
    suspend = !var.enabled

    # Never run two reports at once; a run that cannot start within 10 minutes
    # of its scheduled time is skipped.
    concurrency_policy        = "Forbid"
    starting_deadline_seconds = 600

    successful_jobs_history_limit = 3
    failed_jobs_history_limit     = 3

    job_template {
      metadata {
        labels = local.labels
      }

      spec {
        backoff_limit              = 1
        ttl_seconds_after_finished = 86400

        # Hard upper bound on one report run (including its retry). Without it,
        # a pod that hangs never finishes, and because concurrency_policy is
        # "Forbid" every later scheduled run would be skipped silently.
        active_deadline_seconds = 900

        template {
          metadata {
            labels = local.labels
          }

          spec {
            service_account_name = kubernetes_service_account.k8s_upgrade_job.metadata[0].name
            restart_policy       = "Never"

            image_pull_secrets {
              name = "registry-credentials"
            }

            # Chain credentials secret, mounted read-only for the report script.
            volume {
              name = "creds"
              secret {
                secret_name  = "k8s-upgrade-creds"
                default_mode = "0444"
              }
            }

            # Shared scripts ConfigMap; provides /scripts/nightly-report.py.
            volume {
              name = "scripts"
              config_map {
                name         = kubernetes_config_map.k8s_upgrade_scripts.metadata[0].name
                default_mode = "0755"
              }
            }

            container {
              name    = "report"
              image   = local.image
              command = ["python3", "/scripts/nightly-report.py"]

              # NOTE(review): presumably gives tools a writable home directory — confirm.
              env {
                name  = "HOME"
                value = "/tmp"
              }

              volume_mount {
                name       = "creds"
                mount_path = "/secrets/k8s-upgrade"
                read_only  = true
              }

              volume_mount {
                name       = "scripts"
                mount_path = "/scripts"
                read_only  = true
              }

              resources {
                requests = {
                  cpu    = "50m"
                  memory = "128Mi"
                }
                limits = {
                  memory = "256Mi"
                }
              }
            }
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
|
||||
|
||||
# CI retrigger 2026-05-16T13:42:57+00:00 — bulk enrollment apply (pipeline #689 killed)
|
||||
# CI retrigger v2 2026-05-16T13:46:35+00:00
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue