uptime-kuma: codify Traefik LB internal monitor at .203 (was stale .200)
A hand-created (non-Terraform) uptime-kuma monitor, "Traefik LoadBalancer" (id=95), was port-checking 10.0.20.200:443 — the shared LB IP that Traefik moved off on 2026-05-30 when it took its dedicated 10.0.20.203 (ETP=Local). The monitor had been DOWN for ~5 days, surfacing as the cluster-health "uptime_kuma internal down(1)" WARN. Add it to local.internal_monitors as "Traefik LoadBalancer (10.0.20.203)" (a port check of 10.0.20.203:443) so that it is managed like the TP-Link/Proxmox direct-IP probes: a direct check of the MetalLB L2 + Traefik bind, complementing the "[External] traefik" (full CF path) and "Traefik Dashboard" (in-cluster) monitors. The sync CronJob created the new monitor (id=902, reporting UP @1ms); the orphaned id=95 was deleted via the uptime-kuma API.
This commit is contained in:
parent
011c63c92d
commit
4ed0c5a834
1 changed file with 27 additions and 4 deletions
|
|
@@ -26,7 +26,7 @@ resource "kubernetes_namespace" "uptime-kuma" {
|
|||
metadata {
|
||||
name = "uptime-kuma"
|
||||
labels = {
|
||||
tier = var.tier
|
||||
tier = var.tier
|
||||
"keel.sh/enrolled" = "true"
|
||||
}
|
||||
# labels = {
|
||||
|
|
@@ -192,12 +192,12 @@ resource "kubernetes_deployment" "uptime-kuma" {
|
|||
# as `never` so a Kyverno reconcile (or manual kubectl) can't flip it
|
||||
# back to `force` and re-enable auto-updates.
|
||||
metadata[0].annotations["keel.sh/trigger"],
|
||||
metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
|
||||
spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates
|
||||
metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
|
||||
spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates
|
||||
metadata[0].annotations["kubernetes.io/change-cause"],
|
||||
metadata[0].annotations["deployment.kubernetes.io/revision"],
|
||||
spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
|
||||
metadata[0].annotations["keel.sh/match-tag"], # injected by Kyverno
|
||||
metadata[0].annotations["keel.sh/match-tag"], # injected by Kyverno
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
@@ -699,6 +699,29 @@ locals {
|
|||
retry_interval = 60
|
||||
max_retries = 2
|
||||
},
|
||||
{
|
||||
# Direct port probe of the Traefik MetalLB LB IP. Complements the
|
||||
# `[External] traefik` HTTPS monitor (full DNS→CF→tunnel path) and the
|
||||
# in-cluster `Traefik Dashboard` monitor: this one checks the dedicated
|
||||
# LB IP + :443 bind directly, so a MetalLB L2 / Traefik-bind failure is
|
||||
# distinguishable from a Cloudflare/tunnel outage. The IP is .203 (the
|
||||
# DEDICATED Traefik LB, ETP=Local) — NOT the shared .200, which Traefik
|
||||
# moved off on 2026-05-30. Replaces a hand-created monitor that still
|
||||
# pointed at the dead .200:443. Keep this IP in sync with the Traefik LB
|
||||
# in `docs/architecture/networking.md`.
|
||||
name = "Traefik LoadBalancer (10.0.20.203)"
|
||||
type = "port"
|
||||
database_connection_string = null
|
||||
database_password_vault_key = null
|
||||
hostname = "10.0.20.203"
|
||||
port = 443
|
||||
url = null
|
||||
accepted_statuscodes = null
|
||||
ignore_tls = null
|
||||
interval = 60
|
||||
retry_interval = 30
|
||||
max_retries = 3
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue