uptime-kuma: codify Traefik LB internal monitor at .203 (was stale .200)

A hand-created (non-TF) uptime-kuma monitor "Traefik LoadBalancer" (id=95)
port-checked 10.0.20.200:443 — the shared LB IP that Traefik moved off on
2026-05-30 when it took its dedicated .203 (ETP=Local). The monitor had been
DOWN for ~5 days, surfacing as the cluster-health
"uptime_kuma internal down(1)" WARN.

Add it to local.internal_monitors as "Traefik LoadBalancer (10.0.20.203)"
(port 10.0.20.203:443) so it's managed like the TP-Link/Proxmox direct-IP
probes — a direct check of the MetalLB L2 + Traefik bind, complementing the
[External] traefik (full CF path) and Traefik Dashboard (in-cluster)
monitors. The sync CronJob created it (id=902, reporting UP @1ms); the
orphan id=95 was deleted via the uptime-kuma API.
This commit is contained in:
Viktor Barzin 2026-06-04 02:12:22 +00:00
parent 011c63c92d
commit 4ed0c5a834

View file

@ -26,7 +26,7 @@ resource "kubernetes_namespace" "uptime-kuma" {
metadata {
name = "uptime-kuma"
labels = {
tier = var.tier
tier = var.tier
"keel.sh/enrolled" = "true"
}
# labels = {
@ -192,12 +192,12 @@ resource "kubernetes_deployment" "uptime-kuma" {
# as `never` so a Kyverno reconcile (or manual kubectl) can't flip it
# back to `force` and re-enable auto-updates.
metadata[0].annotations["keel.sh/trigger"],
metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE Keel manages tag updates
metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE Keel manages tag updates
metadata[0].annotations["kubernetes.io/change-cause"],
metadata[0].annotations["deployment.kubernetes.io/revision"],
spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
metadata[0].annotations["keel.sh/match-tag"], # injected by Kyverno
metadata[0].annotations["keel.sh/match-tag"], # injected by Kyverno
]
}
}
@ -699,6 +699,29 @@ locals {
retry_interval = 60
max_retries = 2
},
{
# Direct port probe of the Traefik MetalLB LB IP. Complements the
# `[External] traefik` HTTPS monitor (full DNS -> CF -> tunnel path) and the
# in-cluster `Traefik Dashboard` monitor: this one checks the dedicated
# LB IP + :443 bind directly, so a MetalLB L2 / Traefik-bind failure is
# distinguishable from a Cloudflare/tunnel outage. The IP is .203 (the
# DEDICATED Traefik LB, ETP=Local), NOT the shared .200, which Traefik
# moved off on 2026-05-30. Replaces a hand-created monitor that still
# pointed at the dead .200:443. Keep this IP in sync with the Traefik LB
# in `docs/architecture/networking.md`.
# NOTE(review): the IP appears in both `name` and `hostname` below; update
# both together if the LB address ever changes.
name = "Traefik LoadBalancer (10.0.20.203)"
type = "port"
# Database/HTTP-specific attributes are explicitly null for this port check
# (presumably so every list element has the same object shape; confirm
# against the other entries in this list).
database_connection_string = null
database_password_vault_key = null
hostname = "10.0.20.203"
port = 443
url = null
accepted_statuscodes = null
ignore_tls = null
# Timing/retry settings; units are presumably seconds, as in the
# uptime-kuma UI (TODO confirm against the sync job).
interval = 60
retry_interval = 30
max_retries = 3
},
]
}