From f6812fe69f80fe2bc803aeb73c4b0568685bda65 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 22:02:35 +0000 Subject: [PATCH] [uptime-kuma] Support per-ingress probe path annotation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context The `external-monitor-sync` CronJob probed `https://<host>/` for every `*.viktorbarzin.me` ingress. Homepages frequently return 200 (or allow-listed 30x/40x) even when the backend or DB is broken, producing false negatives — the forgejo outage on 2026-04-17 was not caught for this reason: `/` returned a login page while `/api/healthz` returned 503 from the DB probe. Manual monitor edits don't stick: the next sync is create-if-missing only, so a deleted monitor gets recreated pointing at `/` again. ## This change Teaches the sync three things: 1. **Reads a new annotation** `uptime.viktorbarzin.me/external-monitor-path`. The annotation value is appended as the probe path; default `/` preserves today's behaviour for every ingress that hasn't opted in. 2. **Tightens accepted status codes** when an explicit path is set: `['200-299']` (strict — we expect a real healthz). The default `/` path keeps the existing lenient set `['200-299','300-399','400-499']` because homepages routinely 30x redirect or 40x on missing auth. 3. **Updates existing monitors** when the target URL or accepted status codes drift. Previously the loop was create-if-missing only, so annotating an already-monitored ingress had no effect until the monitor was deleted. Now re-running the sync after changing the annotation converges the live monitor. ## What is NOT in this change - No change to the Ingress annotations on any individual stack. Each service that wants a non-`/` probe path opts in separately. - No change to the ConfigMap fallback payload shape — legacy entries still get the lenient status codes. 
- Monitor DB state in Uptime Kuma's SQLite is untouched at plan time; the sync CronJob is what reconciles state on each run. ## Flow ``` ingress annotation CronJob Python ------------------ -------------- (none) --> url = https://host/ codes = lenient external-monitor-path --> url = https://host<path> codes = strict ['200-299'] ^^ e.g. "/api/healthz" https://host/api/healthz codes = ['200-299'] existing monitor + drifted target url --> api.edit_monitor(id, url=..., accepted_statuscodes=...) ``` ## Test Plan ### Automated - `terraform fmt -check -recursive stacks/uptime-kuma` — exit 0. - `scripts/tg plan` on `stacks/uptime-kuma` — `Plan: 0 to add, 1 to change, 0 to destroy`. The single in-place change is the CronJob command (Python heredoc re-rendered). No other resources drift. - Embedded Python compiles: extracted the `PYEOF` block and ran `python3 -m py_compile` — OK. ### Manual Verification 1. Annotate an ingress: `kubectl annotate ingress/<name> -n <namespace> uptime.viktorbarzin.me/external-monitor-path=/api/healthz` 2. Trigger sync early: `kubectl -n uptime-kuma create job --from=cronjob/external-monitor-sync external-monitor-sync-manual` 3. Expected log line: `Updating monitor [External] <name>: https://host/ -> https://host/api/healthz (codes ['200-299','300-399','400-499'] -> ['200-299'])` 4. Inspect monitor in Uptime Kuma UI: URL and accepted status codes reflect the annotation. 5. Final summary line includes updated count: `Sync complete: 0 created, 1 updated, 0 deleted, N unchanged`. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../uptime-kuma/modules/uptime-kuma/main.tf | 41 +++++++++++++++++-- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/stacks/uptime-kuma/modules/uptime-kuma/main.tf b/stacks/uptime-kuma/modules/uptime-kuma/main.tf index d88d3f36..3eaccebf 100644 --- a/stacks/uptime-kuma/modules/uptime-kuma/main.tf +++ b/stacks/uptime-kuma/modules/uptime-kuma/main.tf @@ -339,6 +339,12 @@ FALLBACK_FILE = "/config/targets.json" PREFIX = "[External] " ANNOTATION_ENABLE = "uptime.viktorbarzin.me/external-monitor" ANNOTATION_NAME = "uptime.viktorbarzin.me/external-monitor-name" +ANNOTATION_PATH = "uptime.viktorbarzin.me/external-monitor-path" +DEFAULT_PATH = "/" +# Homepages often serve 200/30x/40x even when backends are degraded. +# When an explicit probe path is set we expect a real healthz: tighten codes. +STATUSCODES_LENIENT = ["200-299", "300-399", "400-499"] +STATUSCODES_STRICT = ["200-299"] SA_DIR = "/var/run/secrets/kubernetes.io/serviceaccount" API_SERVER = f"https://{os.environ.get('KUBERNETES_SERVICE_HOST', 'kubernetes.default.svc.cluster.local')}:{os.environ.get('KUBERNETES_SERVICE_PORT', '443')}" @@ -378,7 +384,11 @@ def load_from_api(): if monitor_name in seen: continue # dedupe by final monitor name, not hostname (fixes duplicate creation) seen.add(monitor_name) - targets.append({"name": label, "url": f"https://{host}"}) + path = anns.get(ANNOTATION_PATH) or DEFAULT_PATH + if not path.startswith("/"): + path = "/" + path + statuscodes = STATUSCODES_STRICT if path != DEFAULT_PATH else STATUSCODES_LENIENT + targets.append({"name": label, "url": f"https://{host}{path}", "statuscodes": statuscodes}) return targets @@ -386,7 +396,7 @@ def load_from_configmap(): """Legacy fallback: read the ConfigMap list.""" with open(FALLBACK_FILE) as f: raw = json.load(f) - return [{"name": t["name"], "url": t["url"]} for t in raw] + return [{"name": t["name"], "url": t["url"], "statuscodes": STATUSCODES_LENIENT} for t in raw] try: 
@@ -413,10 +423,12 @@ for m in monitors: existing_external[m["name"]] = m target_names = set() +targets_by_name = {} created = 0 for t in targets: monitor_name = f"{PREFIX}{t['name']}" target_names.add(monitor_name) + targets_by_name[monitor_name] = t if monitor_name not in existing_external: print(f"Creating monitor: {monitor_name} -> {t['url']}") api.add_monitor( @@ -425,11 +437,31 @@ for t in targets: url=t["url"], interval=300, maxretries=3, - accepted_statuscodes=["200-299", "300-399", "400-499"], + accepted_statuscodes=t["statuscodes"], ) created += 1 time.sleep(0.3) +# Update monitors whose target URL or accepted status codes drifted +# (e.g., new probe-path annotation added on an existing ingress). +updated = 0 +for monitor_name, t in targets_by_name.items(): + existing = existing_external.get(monitor_name) + if not existing: + continue + current_url = existing.get("url") + current_codes = existing.get("accepted_statuscodes") or [] + if current_url == t["url"] and current_codes == t["statuscodes"]: + continue + print(f"Updating monitor {monitor_name}: {current_url} -> {t['url']} (codes {current_codes} -> {t['statuscodes']})") + api.edit_monitor( + existing["id"], + url=t["url"], + accepted_statuscodes=t["statuscodes"], + ) + updated += 1 + time.sleep(0.3) + # Remove monitors for services no longer in the list deleted = 0 for name, m in existing_external.items(): @@ -440,7 +472,8 @@ for name, m in existing_external.items(): time.sleep(0.3) api.disconnect() -print(f"Sync complete: {created} created, {deleted} deleted, {len(target_names) - created} unchanged") +unchanged = len(target_names) - created - updated +print(f"Sync complete: {created} created, {updated} updated, {deleted} deleted, {unchanged} unchanged") PYEOF EOT ]