upgrades: fix hourly gotenberg error + cap update notifications at weekly
All checks were successful
ci/woodpecker/push/default Pipeline was successful
All checks were successful
ci/woodpecker/push/default Pipeline was successful
Viktor was getting upgrade-error Slack messages every hour and wants update notifications at most weekly. Root cause of the errors: Keel kept trying to roll gotenberg 8.25 -> 8.25.1 in paperless-ngx, but Kyverno's require-trusted-registries policy denied it — gotenberg/* (and apache/*, which tika will hit next) were never allowlisted, and Keel's Slack notifier at info level re-posted the identical failure to #general on every hourly poll since Jun 28. Changes: allowlist gotenberg/* + apache/* so the patch applies cleanly; disable Keel's direct Slack notifier and replace failure visibility with a KeelUpdateFailing Loki-ruler alert (alert-on-change: one notification plus the daily digest, never an hourly drip); remove Diun's Slack notifier, whose default message @channel-pinged #image-updates for every new upstream tag every 6h (the n8n upgrade-agent webhook feed is untouched). The k8s upgrade report is already weekly (Mon 06:07 UTC). Paperless-ngx itself stays paused (Keel policy=never, user-managed) while the ingest runs. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
5d5d9752cb
commit
a64d2ba2b9
5 changed files with 39 additions and 25 deletions
|
|
@ -128,7 +128,7 @@ The agent handles all three version patterns in Terraform:
|
||||||
|
|
||||||
- **Slack**: All upgrade events reported (start, success, failure, rollback)
|
- **Slack**: All upgrade events reported (start, success, failure, rollback)
|
||||||
- **Git**: Detailed commit messages with changelog summaries, risk level, backup status
|
- **Git**: Detailed commit messages with changelog summaries, risk level, backup status
|
||||||
- **DIUN Slack**: Independent Slack channel for raw version detection (separate from upgrade agent)
|
- **DIUN Slack**: REMOVED 2026-07-02 (per-tag @channel pings in #image-updates; human cadence is the weekly upgrade report). The n8n webhook feed to the upgrade agent is unchanged.
|
||||||
|
|
||||||
## Bulk Upgrades
|
## Bulk Upgrades
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ resource "kubernetes_namespace" "diun" {
|
||||||
name = "diun"
|
name = "diun"
|
||||||
labels = {
|
labels = {
|
||||||
"istio-injection" : "disabled"
|
"istio-injection" : "disabled"
|
||||||
tier = local.tiers.aux
|
tier = local.tiers.aux
|
||||||
"keel.sh/enrolled" = "true"
|
"keel.sh/enrolled" = "true"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -203,16 +203,10 @@ resource "kubernetes_deployment" "diun" {
|
||||||
name = "DIUN_NOTIF_WEBHOOK_HEADERS_CONTENT-TYPE"
|
name = "DIUN_NOTIF_WEBHOOK_HEADERS_CONTENT-TYPE"
|
||||||
value = "application/json"
|
value = "application/json"
|
||||||
}
|
}
|
||||||
# Slack notifier (independent notification channel)
|
# Slack notifier REMOVED (2026-07-02): diun's default Slack message
|
||||||
env {
|
# <!channel>-pinged #image-updates for every new upstream tag (every
|
||||||
name = "DIUN_NOTIF_SLACK_WEBHOOKURL"
|
# 6h watch cycle). The n8n webhook feed above remains the machine
|
||||||
value_from {
|
# consumer; humans get the weekly upgrade report instead.
|
||||||
secret_key_ref {
|
|
||||||
name = "diun-secrets"
|
|
||||||
key = "slack_url"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
env {
|
env {
|
||||||
name = "LOG_LEVEL"
|
name = "LOG_LEVEL"
|
||||||
value = "debug"
|
value = "debug"
|
||||||
|
|
@ -242,7 +236,7 @@ resource "kubernetes_deployment" "diun" {
|
||||||
}
|
}
|
||||||
lifecycle {
|
lifecycle {
|
||||||
ignore_changes = [
|
ignore_changes = [
|
||||||
spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
|
spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
|
||||||
spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates
|
spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates
|
||||||
metadata[0].annotations["keel.sh/policy"],
|
metadata[0].annotations["keel.sh/policy"],
|
||||||
metadata[0].annotations["keel.sh/trigger"],
|
metadata[0].annotations["keel.sh/trigger"],
|
||||||
|
|
|
||||||
|
|
@ -38,7 +38,7 @@ resource "helm_release" "keel" {
|
||||||
chart = "keel"
|
chart = "keel"
|
||||||
# Latest stable per `helm search repo keel/keel -l` 2026-05-16
|
# Latest stable per `helm search repo keel/keel -l` 2026-05-16
|
||||||
# (app version 0.21.1). 1.0.6 doesn't exist — verify before bumping.
|
# (app version 0.21.1). 1.0.6 doesn't exist — verify before bumping.
|
||||||
version = "1.2.0"
|
version = "1.2.0"
|
||||||
|
|
||||||
# Atomic mitigates partial-deploy state. Keel itself is exempt from
|
# Atomic mitigates partial-deploy state. Keel itself is exempt from
|
||||||
# auto-update (Kyverno mutate excludes the keel namespace), so it only
|
# auto-update (Kyverno mutate excludes the keel namespace), so it only
|
||||||
|
|
@ -80,16 +80,15 @@ resource "helm_release" "keel" {
|
||||||
persistence = {
|
persistence = {
|
||||||
enabled = false
|
enabled = false
|
||||||
}
|
}
|
||||||
# Slack notifications: post every rollout to the configured channel.
|
# Direct Slack notifications DISABLED (2026-07-02): at notificationLevel
|
||||||
# Bot token from Vault (secret/viktor -> slack_bot_token). The Keel
|
# info Keel posted every rollout event to #general, and a stuck update
|
||||||
# chart sets SLACK_BOT_TOKEN, SLACK_CHANNELS, etc. on the deployment
|
# (gotenberg blocked by require-trusted-registries) re-posted the same
|
||||||
# from these values.
|
# failure EVERY HOURLY POLL for days. Failure visibility now comes from
|
||||||
|
# the KeelUpdateFailing Loki-ruler alert (stacks/monitoring loki.tf),
|
||||||
|
# which rides the alert-on-change routing: one Slack notification plus
|
||||||
|
# the daily digest — never an hourly drip.
|
||||||
slack = {
|
slack = {
|
||||||
enabled = true
|
enabled = false
|
||||||
botToken = data.vault_kv_secret_v2.viktor.data["slack_bot_token"]
|
|
||||||
channel = "general"
|
|
||||||
# No approval flow — opt-out-pure means everything auto-rolls.
|
|
||||||
# If we ever introduce gated rollouts, set approvalsChannel here.
|
|
||||||
}
|
}
|
||||||
# Keel uses each watched Deployment's own imagePullSecrets to query
|
# Keel uses each watched Deployment's own imagePullSecrets to query
|
||||||
# its registry. Forgejo creds (`registry-credentials`) are auto-synced
|
# its registry. Forgejo creds (`registry-credentials`) are auto-synced
|
||||||
|
|
|
||||||
|
|
@ -345,10 +345,10 @@ resource "kubectl_manifest" "policy_require_trusted_registries" {
|
||||||
# 2026-06-05: mauriceboe (TREK group-trip planner trial).
|
# 2026-06-05: mauriceboe (TREK group-trip planner trial).
|
||||||
"actualbudget/*", "afadil/*", "amruthpillai/*", "athomasson2/*",
|
"actualbudget/*", "afadil/*", "amruthpillai/*", "athomasson2/*",
|
||||||
"binwiederhier/*", "bitnami/*",
|
"binwiederhier/*", "bitnami/*",
|
||||||
"clickhouse/*", "cloudflare/*", "coturn/*", "crowdsecurity/*",
|
"apache/*", "clickhouse/*", "cloudflare/*", "coturn/*", "crowdsecurity/*",
|
||||||
"curlimages/*", "deluan/*", "dgtlmoon/*", "dolthub/*",
|
"curlimages/*", "deluan/*", "dgtlmoon/*", "dolthub/*",
|
||||||
"dpage/*", "dperson/*", "edoburu/*", "esanchezm/*",
|
"dpage/*", "dperson/*", "edoburu/*", "esanchezm/*",
|
||||||
"freikin/*", "freshrss/*", "hackmdio/*", "hashicorp/*",
|
"freikin/*", "freshrss/*", "gotenberg/*", "hackmdio/*", "hashicorp/*",
|
||||||
"headscale/*", "jhonderson/*", "kebe/*", "library/*",
|
"headscale/*", "jhonderson/*", "kebe/*", "library/*",
|
||||||
"lissy93/*", "louislam/*", "matrixdotorg/*", "mauriceboe/*",
|
"lissy93/*", "louislam/*", "matrixdotorg/*", "mauriceboe/*",
|
||||||
"mendhak/*",
|
"mendhak/*",
|
||||||
|
|
|
||||||
|
|
@ -225,6 +225,27 @@ resource "kubernetes_config_map" "loki_alert_rules" {
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
# App auto-upgrades (Keel). Keel's direct Slack notifier was disabled
|
||||||
|
# 2026-07-02 after a stuck update (gotenberg vs require-trusted-
|
||||||
|
# registries) re-posted an identical failure to #general on every
|
||||||
|
# hourly poll for days. This log alert is the replacement failure
|
||||||
|
# signal: alert-on-change routing notifies ONCE and the daily digest
|
||||||
|
# carries it while it persists — never an hourly drip.
|
||||||
|
name = "App auto-upgrades (Keel)"
|
||||||
|
rules = [
|
||||||
|
{
|
||||||
|
alert = "KeelUpdateFailing"
|
||||||
|
expr = "sum(count_over_time({namespace=\"keel\"} |= \"level=error\" |= \"got error while updating resource\" [3h])) > 2"
|
||||||
|
for = "10m"
|
||||||
|
labels = { severity = "warning" }
|
||||||
|
annotations = {
|
||||||
|
summary = "Keel repeatedly failing to roll out an image update"
|
||||||
|
description = "Keel failed the same resource update >2 times in 3h (its poll is hourly, so this means a persistently stuck rollout, not a blip). kubectl -n keel logs deploy/keel | grep level=error. Common causes: kyverno require-trusted-registries denying the new tag (extend the allowlist in stacks/kyverno/modules/kyverno/security-policies.tf), a ResourceQuota rejecting the surge pod, or a bad imagePullSecret."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
# t3 session-auth + auto-upgrade health (devvm host scripts → journald →
|
# t3 session-auth + auto-upgrade health (devvm host scripts → journald →
|
||||||
# Loki). Backstops the gated-nightly t3 tracker: the dispatch logs every
|
# Loki). Backstops the gated-nightly t3 tracker: the dispatch logs every
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue