From aa05942fa5a13b133529a482aec2fc0cd0e6d168 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Tue, 19 May 2026 22:06:21 +0000 Subject: [PATCH] upgrade-state: filter transient registry digest-check errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keel polls ~175 image manifests hourly against public registries. Transient i/o timeouts and registry 5xx responses are inherent at that scale and auto-recover on the next poll, but they were tripping the Apps row into ⚠ attn — pure noise. Extend benign_re to cover: - failed to check digest + (i/o timeout | connection refused | connection reset | context deadline exceeded | TLS handshake timeout | no such host | EOF) - failed to check digest + non-successful response (status=5xx) Real actionable digest-check failures (HTTP 401 auth, 404 removed tag) still surface. Persistent registry-side 5xx is owned by the registry's own monitoring (forgejo-integrity-probe + RegistryCatalogInaccessible), not by Keel logs. Tested locally: Apps row flips from ⚠ attn → ✓ healthy after the filter is in place; remaining errors-line drops to "(none in last 24h)". Co-Authored-By: Claude Opus 4.7 --- scripts/upgrade_state.sh | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/scripts/upgrade_state.sh b/scripts/upgrade_state.sh index f5735c07..2e6e7faa 100755 --- a/scripts/upgrade_state.sh +++ b/scripts/upgrade_state.sh @@ -192,7 +192,25 @@ except Exception: # token. We don't want the interactive bot (no approvals; opt-out # auto-update). The Slack NOTIFICATION sender works independently # of the bot, so rollout messages still post to #general. - local benign_re='bot\.Run\(\): can not get configuration for bot \[slack\]|SLACK_APP_TOKEN must have the (previf|prefix)' + # - `failed to check digest` with a transient network error — + # Keel polls ~175 image manifests against public registries + # hourly. 
Occasional `i/o timeout` / `connection refused` / `connection reset` / + # `TLS handshake timeout` / `no such host` / `EOF` / + # `context deadline exceeded` are inherent to public-internet + # polling at that scale and auto-recover on the next poll. + # Actionable digest-check failures surface as HTTP 401/404 + # (auth, removed-tag) — those are NOT filtered. + # - `failed to check digest` with HTTP 5xx — upstream registry + # having a problem (DockerHub maintenance, Forgejo restart, + # etc.). Same recovery pattern as network errors: next hourly + # poll succeeds once upstream is back. Persistent 5xx for >24h + # would indicate a real registry-side issue, but that surfaces + # via the registry's own monitoring (e.g. forgejo-integrity-probe + # + RegistryCatalogInaccessible), not via Keel logs. + local benign_re='bot\.Run\(\): can not get configuration for bot \[slack\]' + benign_re+='|SLACK_APP_TOKEN must have the (previf|prefix)' + benign_re+='|failed to check digest.*(i/o timeout|connection refused|connection reset|context deadline exceeded|TLS handshake timeout|no such host|: EOF)' + benign_re+='|failed to check digest.*non-successful response \(status=5[0-9][0-9]' errors=$(echo "$log_24h" | grep -iE '"level":"(error|fatal)"|level=error' | grep -vE "$benign_re" | tail -3 || true) if [[ -z "$errors" ]]; then APPS_ERROR_LINE="(none in last 24h)"