From a64d2ba2b9a0e2c5a122ca0a6ba636f7dbd3911c Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Thu, 2 Jul 2026 07:16:50 +0000 Subject: [PATCH] upgrades: fix hourly gotenberg error + cap update notifications at weekly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Viktor was getting upgrade-error Slack messages every hour and wants update notifications at most weekly. Root cause of the errors: Keel kept trying to roll gotenberg 8.25->8.25.1 in paperless-ngx but kyverno's require-trusted-registries denied it — gotenberg/* (and apache/*, which tika will hit next) were never allowlisted, and Keel's Slack notifier at info level re-posted the identical failure to #general on every hourly poll since Jun 28. Changes: allowlist gotenberg/* + apache/* so the patch applies cleanly; disable Keel's direct Slack notifier and replace failure visibility with a KeelUpdateFailing Loki-ruler alert (alert-on-change: one notification plus the daily digest, never an hourly drip); remove diun's Slack notifier whose default message @channel-pinged #image-updates for every new upstream tag every 6h (the n8n upgrade-agent webhook feed is untouched). The k8s upgrade report is already weekly (Mon 06:07 UTC). Paperless-ngx itself stays paused (keel policy=never, user-managed) while the ingest runs. 
Co-Authored-By: Claude Fable 5 --- docs/architecture/automated-upgrades.md | 2 +- stacks/diun/main.tf | 18 ++++++---------- stacks/keel/main.tf | 19 ++++++++--------- .../modules/kyverno/security-policies.tf | 4 ++-- stacks/monitoring/modules/monitoring/loki.tf | 21 +++++++++++++++++++ 5 files changed, 39 insertions(+), 25 deletions(-) diff --git a/docs/architecture/automated-upgrades.md b/docs/architecture/automated-upgrades.md index c0200d84..e0decbfb 100644 --- a/docs/architecture/automated-upgrades.md +++ b/docs/architecture/automated-upgrades.md @@ -128,7 +128,7 @@ The agent handles all three version patterns in Terraform: - **Slack**: All upgrade events reported (start, success, failure, rollback) - **Git**: Detailed commit messages with changelog summaries, risk level, backup status -- **DIUN Slack**: Independent Slack channel for raw version detection (separate from upgrade agent) +- **DIUN Slack**: REMOVED 2026-07-02 (per-tag @channel pings in #image-updates; human cadence is the weekly upgrade report). The n8n webhook feed to the upgrade agent is unchanged. ## Bulk Upgrades diff --git a/stacks/diun/main.tf b/stacks/diun/main.tf index 81294806..21aac2c2 100644 --- a/stacks/diun/main.tf +++ b/stacks/diun/main.tf @@ -9,7 +9,7 @@ resource "kubernetes_namespace" "diun" { name = "diun" labels = { "istio-injection" : "disabled" - tier = local.tiers.aux + tier = local.tiers.aux "keel.sh/enrolled" = "true" } } @@ -203,16 +203,10 @@ resource "kubernetes_deployment" "diun" { name = "DIUN_NOTIF_WEBHOOK_HEADERS_CONTENT-TYPE" value = "application/json" } - # Slack notifier (independent notification channel) - env { - name = "DIUN_NOTIF_SLACK_WEBHOOKURL" - value_from { - secret_key_ref { - name = "diun-secrets" - key = "slack_url" - } - } - } + # Slack notifier REMOVED (2026-07-02): diun's default Slack message + # @channel-pinged #image-updates for every new upstream tag (every + # 6h watch cycle). 
The n8n webhook feed above remains the machine + # consumer; humans get the weekly upgrade report instead. env { name = "LOG_LEVEL" value = "debug" @@ -242,7 +236,7 @@ resource "kubernetes_deployment" "diun" { } lifecycle { ignore_changes = [ - spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1 + spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1 spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates metadata[0].annotations["keel.sh/policy"], metadata[0].annotations["keel.sh/trigger"], diff --git a/stacks/keel/main.tf b/stacks/keel/main.tf index e29fda5a..93a6b5ec 100644 --- a/stacks/keel/main.tf +++ b/stacks/keel/main.tf @@ -38,7 +38,7 @@ resource "helm_release" "keel" { chart = "keel" # Latest stable per `helm search repo keel/keel -l` 2026-05-16 # (app version 0.21.1). 1.0.6 doesn't exist — verify before bumping. - version = "1.2.0" + version = "1.2.0" # Atomic mitigates partial-deploy state. Keel itself is exempt from # auto-update (Kyverno mutate excludes the keel namespace), so it only @@ -80,16 +80,15 @@ resource "helm_release" "keel" { persistence = { enabled = false } - # Slack notifications: post every rollout to the configured channel. - # Bot token from Vault (secret/viktor -> slack_bot_token). The Keel - # chart sets SLACK_BOT_TOKEN, SLACK_CHANNELS, etc. on the deployment - # from these values. + # Direct Slack notifications DISABLED (2026-07-02): at notificationLevel + # info Keel posted every rollout event to #general, and a stuck update + # (gotenberg blocked by require-trusted-registries) re-posted the same + # failure EVERY HOURLY POLL for days. Failure visibility now comes from + # the KeelUpdateFailing Loki-ruler alert (stacks/monitoring loki.tf), + # which rides the alert-on-change routing: one Slack notification plus + # the daily digest — never an hourly drip. 
slack = { - enabled = true - botToken = data.vault_kv_secret_v2.viktor.data["slack_bot_token"] - channel = "general" - # No approval flow — opt-out-pure means everything auto-rolls. - # If we ever introduce gated rollouts, set approvalsChannel here. + enabled = false } # Keel uses each watched Deployment's own imagePullSecrets to query # its registry. Forgejo creds (`registry-credentials`) are auto-synced diff --git a/stacks/kyverno/modules/kyverno/security-policies.tf b/stacks/kyverno/modules/kyverno/security-policies.tf index 7ceb79d3..4fc681cc 100644 --- a/stacks/kyverno/modules/kyverno/security-policies.tf +++ b/stacks/kyverno/modules/kyverno/security-policies.tf @@ -345,10 +345,10 @@ resource "kubectl_manifest" "policy_require_trusted_registries" { # 2026-06-05: mauriceboe (TREK group-trip planner trial). "actualbudget/*", "afadil/*", "amruthpillai/*", "athomasson2/*", "binwiederhier/*", "bitnami/*", - "clickhouse/*", "cloudflare/*", "coturn/*", "crowdsecurity/*", + "apache/*", "clickhouse/*", "cloudflare/*", "coturn/*", "crowdsecurity/*", "curlimages/*", "deluan/*", "dgtlmoon/*", "dolthub/*", "dpage/*", "dperson/*", "edoburu/*", "esanchezm/*", - "freikin/*", "freshrss/*", "hackmdio/*", "hashicorp/*", + "freikin/*", "freshrss/*", "gotenberg/*", "hackmdio/*", "hashicorp/*", "headscale/*", "jhonderson/*", "kebe/*", "library/*", "lissy93/*", "louislam/*", "matrixdotorg/*", "mauriceboe/*", "mendhak/*", diff --git a/stacks/monitoring/modules/monitoring/loki.tf b/stacks/monitoring/modules/monitoring/loki.tf index 47ad6ba4..fc237c5b 100644 --- a/stacks/monitoring/modules/monitoring/loki.tf +++ b/stacks/monitoring/modules/monitoring/loki.tf @@ -225,6 +225,27 @@ resource "kubernetes_config_map" "loki_alert_rules" { }, ] }, + { + # App auto-upgrades (Keel). Keel's direct Slack notifier was disabled + # 2026-07-02 after a stuck update (gotenberg vs require-trusted- + # registries) re-posted an identical failure to #general on every + # hourly poll for days. 
This log alert is the replacement failure + # signal: alert-on-change routing notifies ONCE and the daily digest + # carries it while it persists — never an hourly drip. + name = "App auto-upgrades (Keel)" + rules = [ + { + alert = "KeelUpdateFailing" + expr = "sum(count_over_time({namespace=\"keel\"} |= \"level=error\" |= \"got error while updating resource\" [3h])) > 2" + for = "10m" + labels = { severity = "warning" } + annotations = { + summary = "Keel repeatedly failing to roll out an image update" + description = "Keel failed the same resource update >2 times in 3h (its poll is hourly, so this means a persistently stuck rollout, not a blip). kubectl -n keel logs deploy/keel | grep level=error. Common causes: kyverno require-trusted-registries denying the new tag (extend the allowlist in stacks/kyverno/modules/kyverno/security-policies.tf), a ResourceQuota rejecting the surge pod, or a bad imagePullSecret." + } + }, + ] + }, { # t3 session-auth + auto-upgrade health (devvm host scripts → journald → # Loki). Backstops the gated-nightly t3 tracker: the dispatch logs every