From 4ed0c5a8347bb2e27a58c62bac4555559beb854e Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Thu, 4 Jun 2026 02:12:22 +0000 Subject: [PATCH] uptime-kuma: codify Traefik LB internal monitor at .203 (was stale .200) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A hand-created (non-TF) uptime-kuma monitor "Traefik LoadBalancer" (id=95) port-checked 10.0.20.200:443 — the shared LB IP Traefik moved OFF on 2026-05-30 when it took its dedicated .203 (ETP=Local). It had been DOWN for ~5 days, surfacing as the cluster-health "uptime_kuma internal down(1)" WARN. Add it to local.internal_monitors as "Traefik LoadBalancer (10.0.20.203)" (port 10.0.20.203:443) so it's managed like the TP-Link/Proxmox direct-IP probes — a direct check of the MetalLB L2 + Traefik bind, complementing the [External] traefik (full CF path) and Traefik Dashboard (in-cluster) monitors. The sync CronJob created it (id=902, reporting UP @1ms); the orphan id=95 was deleted via the uptime-kuma API. --- .../uptime-kuma/modules/uptime-kuma/main.tf | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/stacks/uptime-kuma/modules/uptime-kuma/main.tf b/stacks/uptime-kuma/modules/uptime-kuma/main.tf index 2aee365f..0695081a 100644 --- a/stacks/uptime-kuma/modules/uptime-kuma/main.tf +++ b/stacks/uptime-kuma/modules/uptime-kuma/main.tf @@ -26,7 +26,7 @@ resource "kubernetes_namespace" "uptime-kuma" { metadata { name = "uptime-kuma" labels = { - tier = var.tier + tier = var.tier "keel.sh/enrolled" = "true" } # labels = { @@ -192,12 +192,12 @@ resource "kubernetes_deployment" "uptime-kuma" { # as `never` so a Kyverno reconcile (or manual kubectl) can't flip it # back to `force` and re-enable auto-updates. metadata[0].annotations["keel.sh/trigger"], - metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 - spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates + metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 + spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates metadata[0].annotations["kubernetes.io/change-cause"], metadata[0].annotations["deployment.kubernetes.io/revision"], spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1 - metadata[0].annotations["keel.sh/match-tag"], # injected by Kyverno + metadata[0].annotations["keel.sh/match-tag"], # injected by Kyverno ] } } @@ -699,6 +699,29 @@ locals { retry_interval = 60 max_retries = 2 }, + { + # Direct port probe of the Traefik MetalLB LB IP. Complements the + # `[External] traefik` HTTPS monitor (full DNS→CF→tunnel path) and the + # in-cluster `Traefik Dashboard` monitor: this one checks the dedicated + # LB IP + :443 bind directly, so a MetalLB L2 / Traefik-bind failure is + # distinguishable from a Cloudflare/tunnel outage. The IP is .203 (the + # DEDICATED Traefik LB, ETP=Local) — NOT the shared .200, which Traefik + # moved off on 2026-05-30. Replaces a hand-created monitor that still + # pointed at the dead .200:443. Keep this IP in sync with the Traefik LB + # in `docs/architecture/networking.md`. + name = "Traefik LoadBalancer (10.0.20.203)" + type = "port" + database_connection_string = null + database_password_vault_key = null + hostname = "10.0.20.203" + port = 443 + url = null + accepted_statuscodes = null + ignore_tls = null + interval = 60 + retry_interval = 30 + max_retries = 3 + }, ] }