fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]

6d224861 came from a --no-checkout worktree whose empty index made the commit drop every file except two. This restores 05b50d2b's full tree and correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the live infra was never applied from the broken commit. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 08:45:33 +00:00 · 2026-06-09 08:45:33 +00:00 · fd0f4a0365
commit fd0f4a0365
parent 6d224861c4
1166 changed files with 358546 additions and 0 deletions
--- a/stacks/uptime-kuma/modules/uptime-kuma/main.tf
+++ b/stacks/uptime-kuma/modules/uptime-kuma/main.tf
@ -0,0 +1,920 @@
+# Name of the TLS secret; passed to the setup_tls_secret module and to both ingress modules.
+variable "tls_secret_name" { type = string }
+# Value of the `tier` label stamped on the namespace and the deployment.
+variable "tier" { type = string }
+# NFS server address. In this part of the file it is only referenced by the
+# commented-out SQLite backup CronJob.
+variable "nfs_server" { type = string }
+# Service names expanded into the ConfigMap fallback list of external monitor targets.
+variable "cloudflare_proxied_names" { type = list(string) }
+
+# Reads the `viktor` entry from the KV v2 mount `secret`. This file takes the
+# Uptime Kuma admin password and the per-monitor database passwords from its
+# `data` map.
+data "vault_kv_secret_v2" "viktor" {
+  mount = "secret"
+  name  = "viktor"
+}
+
+locals {
+  # Names excluded from HTTP monitoring because they don't answer standard
+  # HTTP health checks.
+  non_http_services = toset(["xray-vless", "xray-ws", "xray-grpc"])
+
+  # One {name, hostname, url} object per proxied name. The apex domain is used
+  # verbatim; every other name becomes a subdomain of viktorbarzin.me.
+  external_monitor_targets = [
+    for svc in var.cloudflare_proxied_names : {
+      name     = svc
+      hostname = svc == "viktorbarzin.me" ? svc : "${svc}.viktorbarzin.me"
+      url      = "https://${svc == "viktorbarzin.me" ? svc : "${svc}.viktorbarzin.me"}"
+    }
+    if !contains(local.non_http_services, svc)
+  ]
+}
+
+# Namespace for the Uptime Kuma workload and its monitor-sync jobs.
+resource "kubernetes_namespace" "uptime-kuma" {
+  metadata {
+    name = "uptime-kuma"
+    labels = {
+      tier               = var.tier
+      "keel.sh/enrolled" = "true"
+    }
+    # Disabled alternative label (Istio sidecar injection), kept for reference.
+    # labels = {
+    #   "istio-injection" : "enabled"
+    # }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
+    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
+  }
+}
+
+# Invokes the shared setup_tls_secret module for this namespace.
+# NOTE(review): presumably this provisions the secret named by
+# var.tls_secret_name that the ingresses below reference — confirm in the module.
+module "tls_secret" {
+  source          = "../../../../modules/kubernetes/setup_tls_secret"
+  namespace       = kubernetes_namespace.uptime-kuma.metadata[0].name
+  tls_secret_name = var.tls_secret_name
+}
+
+# Data volume for Uptime Kuma (the deployment mounts it at /app/data), requested
+# from the `proxmox-lvm` storage class with an initial size of 5Gi.
+resource "kubernetes_persistent_volume_claim" "data_proxmox" {
+  # Don't make the apply wait for the claim to reach the Bound phase.
+  wait_until_bound = false
+  metadata {
+    name      = "uptime-kuma-data-proxmox"
+    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
+    # Auto-resize settings. The lifecycle block below explains why Terraform
+    # must not revert the size changes these cause.
+    annotations = {
+      "resize.topolvm.io/threshold"     = "10%"
+      "resize.topolvm.io/increase"      = "50%"
+      "resize.topolvm.io/storage_limit" = "20Gi"
+    }
+  }
+  spec {
+    access_modes       = ["ReadWriteOnce"]
+    storage_class_name = "proxmox-lvm"
+    resources {
+      requests = {
+        storage = "5Gi"
+      }
+    }
+  }
+  lifecycle {
+    # The autoresizer expands requests.storage up to storage_limit and
+    # PVCs can't shrink. Without this, every TF apply tries to revert
+    # to the spec value, K8s rejects the shrink, and the PVC ends up
+    # in Terminating-but-in-use limbo.
+    ignore_changes = [spec[0].resources[0].requests]
+  }
+}
+
+# Single-replica Uptime Kuma deployment. Uses the Recreate strategy, so the old
+# pod is stopped before the new one starts (the data PVC is ReadWriteOnce).
+# Keel auto-updates are disabled for this workload and the image tag is pinned;
+# Terraform owns both the `keel.sh/policy` annotation and the image.
+resource "kubernetes_deployment" "uptime-kuma" {
+  metadata {
+    name      = "uptime-kuma"
+    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
+    labels = {
+      app  = "uptime-kuma"
+      tier = var.tier
+      # Opt out of Kyverno's inject-keel-annotations ClusterPolicy. The Kyverno
+      # rule excludes any workload with this LABEL (see
+      # stacks/kyverno/modules/kyverno/keel-annotations.tf, exclude.any
+      # matchLabels keel.sh/policy=never). Without the label, Kyverno would
+      # silently re-add `keel.sh/policy=force` after every reconcile, undoing
+      # the annotation below.
+      "keel.sh/policy" = "never"
+    }
+    annotations = {
+      "reloader.stakater.com/search" = "true"
+      # Stop Keel polling for this workload. Even with match-tag=true,
+      # Keel auto-downgraded :2 → :1 on 2026-05-26 12:14, which v1 booted
+      # into SQLite mode and couldn't read the existing MariaDB store
+      # (db-config.json) → 4h CrashLoopBackOff. Pinning the image string
+      # alone isn't enough because Keel kept fighting the apply. Combined
+      # with the matching LABEL above, this fully bypasses Keel.
+      "keel.sh/policy" = "never"
+    }
+  }
+  spec {
+    replicas = 1
+    strategy {
+      type = "Recreate"
+    }
+    selector {
+      match_labels = {
+        app = "uptime-kuma"
+      }
+    }
+    template {
+      metadata {
+        annotations = {
+          "diun.enable"       = "true"
+          "diun.include_tags" = "latest"
+        }
+        labels = {
+          app = "uptime-kuma"
+        }
+      }
+      spec {
+        container {
+          # Pinned to 2.3.2 because Keel auto-downgraded :2 → :1 on 2026-05-26
+          # 12:14 UTC despite the Kyverno-injected `keel.sh/match-tag=true` +
+          # `keel.sh/policy=force` annotation pair (which is supposed to gate
+          # digest changes only). The v1 image opens kuma.db (SQLite) at boot
+          # and can't read the v2 db-config.json → 4h CrashLoopBackOff while
+          # the MariaDB store sat intact. Until the keel-match-tag regression
+          # is root-caused, pin minor versions explicitly.
+          image = "louislam/uptime-kuma:2.3.2"
+          name  = "uptime-kuma"
+
+          resources {
+            requests = {
+              cpu    = "100m"
+              memory = "128Mi"
+            }
+            limits = {
+              memory = "512Mi"
+            }
+          }
+
+          port {
+            container_port = 3001
+          }
+          # Both probes hit the web UI root on the container port.
+          liveness_probe {
+            http_get {
+              path = "/"
+              port = 3001
+            }
+            initial_delay_seconds = 15
+            period_seconds        = 30
+            timeout_seconds       = 5
+            failure_threshold     = 5
+          }
+          readiness_probe {
+            http_get {
+              path = "/"
+              port = 3001
+            }
+            initial_delay_seconds = 5
+            period_seconds        = 30
+            timeout_seconds       = 5
+            failure_threshold     = 3
+          }
+          volume_mount {
+            name       = "data"
+            mount_path = "/app/data"
+          }
+        }
+        volume {
+          name = "data"
+          persistent_volume_claim {
+            claim_name = kubernetes_persistent_volume_claim.data_proxmox.metadata[0].name
+          }
+        }
+        dns_config {
+          option {
+            name  = "ndots"
+            value = "2"
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    ignore_changes = [
+      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
+      # `keel.sh/policy` is intentionally NOT ignored — we want TF to own it
+      # as `never` so a Kyverno reconcile (or manual kubectl) can't flip it
+      # back to `force` and re-enable auto-updates.
+      metadata[0].annotations["keel.sh/trigger"],
+      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
+      # The container image is intentionally NOT ignored either. Keel is
+      # disabled for this workload (policy=never above) and the tag is pinned,
+      # so Terraform must own `image`. Ignoring it would make the pin apply
+      # only at creation time: drift on the live object, or a deliberate
+      # version bump in this file, would never be reconciled.
+      metadata[0].annotations["kubernetes.io/change-cause"],
+      metadata[0].annotations["deployment.kubernetes.io/revision"],
+      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
+      metadata[0].annotations["keel.sh/match-tag"],                       # injected by Kyverno
+    ]
+  }
+}
+# Service in front of the Uptime Kuma pods: listens on 80 and forwards to the
+# container's port 3001. Both ingresses and the sync jobs reach the app here.
+resource "kubernetes_service" "uptime-kuma" {
+  metadata {
+    name      = "uptime-kuma"
+    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
+    labels = {
+      app = "uptime-kuma"
+    }
+  }
+
+  spec {
+    selector = {
+      app = "uptime-kuma"
+    }
+    port {
+      port        = 80
+      target_port = "3001"
+    }
+  }
+}
+# Main ingress for the Uptime Kuma UI, with authentication required. The
+# extra annotations register a Homepage dashboard tile and widget.
+module "ingress" {
+  source          = "../../../../modules/kubernetes/ingress_factory"
+  auth            = "required"
+  dns_type        = "proxied"
+  namespace       = kubernetes_namespace.uptime-kuma.metadata[0].name
+  name            = "uptime"
+  tls_secret_name = var.tls_secret_name
+  service_name    = "uptime-kuma"
+  extra_annotations = {
+    "gethomepage.dev/enabled"      = "true"
+    "gethomepage.dev/description"  = "Uptime monitor"
+    "gethomepage.dev/group"        = "Core Platform"
+    "gethomepage.dev/icon"         = "uptime-kuma.png"
+    "gethomepage.dev/name"         = "Uptime Kuma"
+    "gethomepage.dev/pod-selector" = ""
+    "gethomepage.dev/widget.type"  = "uptimekuma"
+    "gethomepage.dev/widget.url"   = "http://uptime-kuma.uptime-kuma.svc.cluster.local"
+    "gethomepage.dev/widget.slug"  = "infra"
+  }
+}
+
+# Path-level carve-out for Uptime Kuma's public-by-design endpoints.
+# The main ingress above gates the ENTIRE site (path "/") behind Authentik
+# forward-auth — which 302-bounces the public status pages, push-monitor
+# ingest, status-page API, badges, and static assets to the SSO login. Status
+# pages are meant for logged-out viewers and push monitors POST from machines;
+# neither can follow the Authentik 302 → OAuth → cookie dance, so all of these
+# were broken (302 instead of 200/JSON). This second ingress points the public
+# paths at the same uptime-kuma Service with NO Authentik middleware. Traefik
+# routes by rule length, so these path-scoped routers out-prioritise the "/"
+# catch-all (same mechanism as the meshcentral agent carve-out, commit
+# 9a15f3f2). The dashboard ("/", "/dashboard", "/manage-*", "/add", "/edit",
+# "/settings", "/setup") stays Authentik-gated via the module above. Uptime
+# Kuma is WebSocket-based; the ingress_factory default middleware chain passes
+# Upgrade/Connection through unchanged, so the realtime status UI still works.
+module "ingress_public" {
+  source       = "../../../../modules/kubernetes/ingress_factory"
+  namespace    = kubernetes_namespace.uptime-kuma.metadata[0].name
+  name         = "uptime-public"
+  service_name = "uptime-kuma"
+  # auth = "none": Uptime Kuma public status pages + push-monitor/badge endpoints - hit logged-out / by machines, cannot do Authentik SSO
+  auth = "none"
+  # Path prefixes served without authentication. Anything not listed here is
+  # not routed by this ingress.
+  ingress_path = [
+    "/status",          # public status pages (/status/<slug>)
+    "/api/status-page", # status-page data + heartbeat API
+    "/api/push",        # push-monitor ingest (/api/push/<key>)
+    "/api/badge",       # status/uptime/ping badges
+    "/assets",          # JS/CSS/font bundles for the status page
+    "/icon.svg",        # favicon / logo
+    "/upload",          # uploaded status-page logos/images
+  ]
+  # NOTE(review): presumably the same hostname the main ingress (name = "uptime")
+  # derives for itself — confirm against ingress_factory.
+  full_host        = "uptime.viktorbarzin.me"
+  dns_type         = "none" # DNS already owned by the main uptime ingress above.
+  tls_secret_name  = var.tls_secret_name
+  anti_ai_scraping = false # Status pages + push ingest are machine/anon-hit; bot-block forwardAuth would break them.
+  homepage_enabled = false # Homepage tile belongs to the main UI ingress.
+  external_monitor = false # The main ingress already carries the external monitor.
+}
+
+# CronJob for daily SQLite backups. Disabled: no longer needed now that Uptime Kuma stores its data in MySQL/MariaDB.
+# resource "kubernetes_cron_job_v1" "sqlite-backup" {
+#   metadata {
+#     name      = "backup"
+#    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
+#   }
+#   spec {
+#     concurrency_policy        = "Replace"
+#     failed_jobs_history_limit = 5
+#     schedule                  = "0 0 * * *"
+#     # schedule                      = "* * * * *"
+#     starting_deadline_seconds     = 10
+#     successful_jobs_history_limit = 3
+#     job_template {
+#       metadata {}
+#       spec {
+#         active_deadline_seconds    = 600 # should finish in 10 minutes
+#         backoff_limit              = 3
+#         ttl_seconds_after_finished = 10
+#         template {
+#           metadata {}
+#           spec {
+#             container {
+#               name  = "backup"
+#               image = "alpine/sqlite:latest"
+#               command = ["/bin/sh", "-c", <<-EOT
+#                 set -e
+#                 export now=$(date +"%Y_%m_%d_%H_%M")
+#                 echo "Backing up SQLite database to /app/data/backup/backup_$now.sqlite"
+#                 sqlite3 /app/data/kuma.db ".backup /app/data/backup/backup_$now.sqlite"
+#                 echo "Backup completed. Deleting old backups..."
+
+#                 # Rotate - delete last log file
+#                 cd /app/data/backup
+#                 find . -name "*.sqlite" -type f -mtime +7 -delete # 7 day retention of backups
+#                 echo "Old backups deleted."
+#               EOT
+#               ]
+#               volume_mount {
+#                 name       = "data"
+#                 mount_path = "/app/data"
+#               }
+#             }
+#             volume {
+#               name = "data"
+#               nfs {
+#                 server = var.nfs_server
+#                 path   = "/mnt/main/uptime-kuma"
+#               }
+#             }
+#           }
+#         }
+#       }
+#     }
+#   }
+# }
+
+# =============================================================================
+# External Monitor Sync
+# Ensures Uptime Kuma has an external HTTPS monitor for every ingress whose host
+# ends in `.viktorbarzin.me`, unless the ingress opts out with the annotation
+# `uptime.viktorbarzin.me/external-monitor=false`. Falls back to a
+# Terraform-generated ConfigMap when API discovery fails or returns no targets.
+#
+# Discovery modes (the script tries them in order):
+#   1. K8s API — list ingresses cluster-wide, skip opted-out and non-public hosts
+#   2. ConfigMap fallback — read /config/targets.json (legacy list from
+#      cloudflare_proxied_names)
+# =============================================================================
+
+# Identity the external-monitor-sync CronJob pods run as. The sync script reads
+# this account's mounted token to call the Kubernetes API.
+resource "kubernetes_service_account_v1" "external_monitor_sync" {
+  metadata {
+    name      = "external-monitor-sync"
+    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
+  }
+}
+
+# Cluster-wide read access to Ingress objects; the sync script lists
+# /apis/networking.k8s.io/v1/ingresses across all namespaces.
+resource "kubernetes_cluster_role_v1" "external_monitor_sync" {
+  metadata {
+    name = "external-monitor-sync"
+  }
+  rule {
+    api_groups = ["networking.k8s.io"]
+    resources  = ["ingresses"]
+    verbs      = ["list", "get"]
+  }
+}
+
+# Grants the external-monitor-sync ClusterRole to the sync job's ServiceAccount.
+resource "kubernetes_cluster_role_binding_v1" "external_monitor_sync" {
+  metadata {
+    name = "external-monitor-sync"
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "ClusterRole"
+    name      = kubernetes_cluster_role_v1.external_monitor_sync.metadata[0].name
+  }
+  subject {
+    kind      = "ServiceAccount"
+    name      = kubernetes_service_account_v1.external_monitor_sync.metadata[0].name
+    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
+  }
+}
+
+# Fallback target list for the external monitor sync. The CronJob mounts it at
+# /config and reads targets.json when ingress discovery is unavailable.
+resource "kubernetes_config_map_v1" "external_monitor_targets" {
+  metadata {
+    name      = "external-monitor-targets"
+    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
+  }
+  data = {
+    # JSON array of {name, hostname, url} objects built in local.external_monitor_targets.
+    "targets.json" = jsonencode(local.external_monitor_targets)
+  }
+}
+
+# Every 10 minutes: reconcile Uptime Kuma's "[External] ..." HTTP monitors with
+# the set of public ingresses. The container installs uptime-kuma-api with pip
+# at start-up and then runs the inline Python script below.
+#
+# The script creates missing monitors, updates ones whose URL or accepted status
+# codes drifted, and deletes orphans. Orphans are deleted only when the target
+# list came from the Kubernetes API: the ConfigMap fallback is a narrower legacy
+# list, so deleting against it could drop monitors (and their history) that the
+# authoritative source still wants.
+resource "kubernetes_cron_job_v1" "external_monitor_sync" {
+  metadata {
+    name      = "external-monitor-sync"
+    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
+  }
+  spec {
+    concurrency_policy            = "Forbid"
+    failed_jobs_history_limit     = 3
+    successful_jobs_history_limit = 3
+    schedule                      = "*/10 * * * *"
+    job_template {
+      metadata {}
+      spec {
+        backoff_limit              = 1
+        ttl_seconds_after_finished = 300
+        template {
+          metadata {}
+          spec {
+            service_account_name = kubernetes_service_account_v1.external_monitor_sync.metadata[0].name
+            container {
+              name  = "sync"
+              image = "docker.io/library/python:3.12-alpine"
+              command = ["/bin/sh", "-c", <<-EOT
+                pip install --quiet --disable-pip-version-check uptime-kuma-api
+                python3 << 'PYEOF'
+import os, json, ssl, time, urllib.request, urllib.error
+from uptime_kuma_api import UptimeKumaApi, MonitorType
+
+UPTIME_KUMA_URL = "http://uptime-kuma.uptime-kuma.svc.cluster.local"
+UPTIME_KUMA_PASS = os.environ["UPTIME_KUMA_PASSWORD"]
+FALLBACK_FILE = "/config/targets.json"
+PREFIX = "[External] "
+ANNOTATION_ENABLE = "uptime.viktorbarzin.me/external-monitor"
+ANNOTATION_NAME = "uptime.viktorbarzin.me/external-monitor-name"
+ANNOTATION_PATH = "uptime.viktorbarzin.me/external-monitor-path"
+DEFAULT_PATH = "/"
+# Homepages often serve 200/30x/40x even when backends are degraded.
+# When an explicit probe path is set we expect a real healthz: tighten codes.
+STATUSCODES_LENIENT = ["200-299", "300-399", "400-499"]
+STATUSCODES_STRICT = ["200-299"]
+SA_DIR = "/var/run/secrets/kubernetes.io/serviceaccount"
+API_SERVER = f"https://{os.environ.get('KUBERNETES_SERVICE_HOST', 'kubernetes.default.svc.cluster.local')}:{os.environ.get('KUBERNETES_SERVICE_PORT', '443')}"
+
+
+def load_from_api():
+    """List ingresses via in-cluster API. Opt-OUT by default:
+    every ingress whose host matches *.viktorbarzin.me gets a monitor,
+    UNLESS its annotation `uptime.viktorbarzin.me/external-monitor` is `"false"`.
+    This covers Helm-managed ingresses (authentik, grafana, vault, forgejo, ntfy)
+    that don't go through ingress_factory.
+
+    Returns a list of {"name", "url", "statuscodes"} dicts, one per unique
+    monitor name. Raises on any HTTP/TLS/file/JSON error; the caller falls
+    back to the ConfigMap."""
+    with open(f"{SA_DIR}/token") as f:
+        token = f.read().strip()
+    ctx = ssl.create_default_context(cafile=f"{SA_DIR}/ca.crt")
+    url = f"{API_SERVER}/apis/networking.k8s.io/v1/ingresses"
+    req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
+    with urllib.request.urlopen(req, context=ctx, timeout=30) as resp:
+        body = json.loads(resp.read())
+
+    targets = []
+    seen = set()
+    for ing in body.get("items", []):
+        anns = (ing.get("metadata") or {}).get("annotations") or {}
+        if anns.get(ANNOTATION_ENABLE, "").lower() == "false":
+            continue  # explicit opt-out
+        # Host selection: first TLS host if present, else the first rule's host.
+        tls = (ing.get("spec") or {}).get("tls") or []
+        host = None
+        if tls and tls[0].get("hosts"):
+            host = tls[0]["hosts"][0]
+        else:
+            rules = (ing.get("spec") or {}).get("rules") or []
+            if rules:
+                host = rules[0].get("host")
+        if not host or not host.endswith(".viktorbarzin.me"):
+            continue  # skip internal-only or non-public hosts
+        label = anns.get(ANNOTATION_NAME) or host.split(".")[0]
+        monitor_name = f"{PREFIX}{label}"
+        if monitor_name in seen:
+            continue  # dedupe by final monitor name, not hostname (fixes duplicate creation)
+        seen.add(monitor_name)
+        path = anns.get(ANNOTATION_PATH, "").strip()
+        if path and not path.startswith("/"):
+            path = "/" + path
+        # Omit trailing slash when no explicit path — matches pre-existing monitor URLs
+        # and avoids every sync re-updating unchanged monitors.
+        url = f"https://{host}{path}" if path else f"https://{host}"
+        statuscodes = STATUSCODES_STRICT if path else STATUSCODES_LENIENT
+        targets.append({"name": label, "url": url, "statuscodes": statuscodes})
+    return targets
+
+
+def load_from_configmap():
+    """Legacy fallback: read the ConfigMap list."""
+    with open(FALLBACK_FILE) as f:
+        raw = json.load(f)
+    return [{"name": t["name"], "url": t["url"], "statuscodes": STATUSCODES_LENIENT} for t in raw]
+
+
+try:
+    targets = load_from_api()
+    source = "k8s-api"
+    if not targets:
+        print("WARN: k8s-api returned 0 targets; falling back to ConfigMap")
+        targets = load_from_configmap()
+        source = "configmap"
+except (urllib.error.URLError, OSError, KeyError, ValueError) as e:
+    print(f"WARN: k8s-api discovery failed ({e!r}); falling back to ConfigMap")
+    targets = load_from_configmap()
+    source = "configmap"
+
+print(f"Loaded {len(targets)} external monitor targets (source={source})")
+
+api = UptimeKumaApi(UPTIME_KUMA_URL, timeout=120, wait_events=0.2)
+# Always disconnect, even when a call below raises. The uptime-kuma-api docs
+# require disconnecting so the program does not block; a hung pod would also
+# stall every later run because the CronJob uses concurrency_policy=Forbid.
+try:
+    api.login("admin", UPTIME_KUMA_PASS)
+
+    monitors = api.get_monitors()
+    existing_external = {}
+    for m in monitors:
+        if m["name"].startswith(PREFIX):
+            existing_external[m["name"]] = m
+
+    target_names = set()
+    targets_by_name = {}
+    created = 0
+    for t in targets:
+        monitor_name = f"{PREFIX}{t['name']}"
+        target_names.add(monitor_name)
+        targets_by_name[monitor_name] = t
+        if monitor_name not in existing_external:
+            print(f"Creating monitor: {monitor_name} -> {t['url']}")
+            api.add_monitor(
+                type=MonitorType.HTTP,
+                name=monitor_name,
+                url=t["url"],
+                interval=300,
+                maxretries=3,
+                accepted_statuscodes=t["statuscodes"],
+            )
+            created += 1
+            time.sleep(0.3)
+
+    # Update monitors whose target URL or accepted status codes drifted
+    # (e.g., new probe-path annotation added on an existing ingress).
+    updated = 0
+    for monitor_name, t in targets_by_name.items():
+        existing = existing_external.get(monitor_name)
+        if not existing:
+            continue
+        current_url = existing.get("url")
+        current_codes = existing.get("accepted_statuscodes") or []
+        if current_url == t["url"] and current_codes == t["statuscodes"]:
+            continue
+        print(f"Updating monitor {monitor_name}: {current_url} -> {t['url']} (codes {current_codes} -> {t['statuscodes']})")
+        api.edit_monitor(
+            existing["id"],
+            url=t["url"],
+            accepted_statuscodes=t["statuscodes"],
+        )
+        updated += 1
+        time.sleep(0.3)
+
+    # Remove monitors for services no longer in the list. Only done when the
+    # list came from the Kubernetes API: the ConfigMap fallback is a legacy,
+    # narrower list, so treating it as authoritative could delete monitors
+    # (and their history) because of a transient API failure.
+    deleted = 0
+    if source == "k8s-api":
+        for name, m in existing_external.items():
+            if name not in target_names:
+                print(f"Deleting orphaned monitor: {name}")
+                api.delete_monitor(m["id"])
+                deleted += 1
+                time.sleep(0.3)
+    else:
+        print("WARN: ConfigMap fallback in use; skipping orphaned-monitor cleanup")
+finally:
+    api.disconnect()
+
+unchanged = len(target_names) - created - updated
+print(f"Sync complete: {created} created, {updated} updated, {deleted} deleted, {unchanged} unchanged")
+PYEOF
+              EOT
+              ]
+              # NOTE(review): the admin password is set as a literal env value, so it
+              # is readable in the CronJob/Pod spec. Consider sourcing it from a
+              # Kubernetes Secret via value_from.secret_key_ref instead.
+              env {
+                name  = "UPTIME_KUMA_PASSWORD"
+                value = data.vault_kv_secret_v2.viktor.data["uptime_kuma_admin_password"]
+              }
+              volume_mount {
+                name       = "config"
+                mount_path = "/config"
+                read_only  = true
+              }
+              resources {
+                requests = {
+                  memory = "128Mi"
+                  cpu    = "10m"
+                }
+                limits = {
+                  memory = "256Mi"
+                }
+              }
+            }
+            volume {
+              name = "config"
+              config_map {
+                name = kubernetes_config_map_v1.external_monitor_targets.metadata[0].name
+              }
+            }
+            dns_config {
+              option {
+                name  = "ndots"
+                value = "2"
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
+  }
+}
+
+# =============================================================================
+# Internal Monitor Sync
+# Declaratively manages monitors for internal services (databases, non-HTTP
+# endpoints) that can't be discovered from ingress annotations. Idempotent:
+# looks up monitors by name, creates if missing, patches if drifted.
+#
+# Why a CronJob and not a one-shot Job:
+# - louislam/uptime-kuma has no Terraform provider (only a CLI tool).
+# - UK v2 stores monitors in MariaDB (`uptimekuma` on mysql.dbaas); if the DB
+#   is wiped/restored we must re-create them.
+# - CronJob self-heals drift (manual UI edits, UK restarts, DB restores).
+#
+# Managed monitors (name -> desired spec) are defined in local.internal_monitors
+# below. Add new internal-service monitors there.
+# =============================================================================
+
+locals {
+  # Desired internal monitors, one object per monitor. Every object carries the
+  # same set of keys; keys that don't apply to a monitor type are set to null.
+  #   name / type                  - monitor name (lookup key in Uptime Kuma) and monitor type
+  #   database_connection_string   - used by types other than "port" and "http" (mysql, redis)
+  #   database_password_vault_key  - key in the Vault `viktor` entry holding the DB
+  #                                  password; null when no password is needed
+  #   hostname / port              - used by "port" monitors
+  #   url / accepted_statuscodes / ignore_tls - used by "http" monitors
+  #   interval / retry_interval / max_retries - check timing, passed for every type
+  internal_monitors = [
+    {
+      name                        = "MySQL Standalone (dbaas)"
+      type                        = "mysql"
+      database_connection_string  = "mysql://uptimekuma@mysql.dbaas.svc.cluster.local:3306"
+      database_password_vault_key = "uptimekuma_db_password"
+      hostname                    = null
+      port                        = null
+      url                         = null
+      accepted_statuscodes        = null
+      ignore_tls                  = null
+      interval                    = 60
+      retry_interval              = 60
+      max_retries                 = 2
+    },
+    {
+      # HAProxy service in redis ns health-checks INFO replication and
+      # only routes to the current Sentinel-elected master, so this
+      # survives failover. Bitnami chart has auth disabled, so no
+      # password_vault_key.
+      name                        = "Redis"
+      type                        = "redis"
+      database_connection_string  = "redis://redis-master.redis.svc.cluster.local:6379"
+      database_password_vault_key = null
+      hostname                    = null
+      port                        = null
+      url                         = null
+      accepted_statuscodes        = null
+      ignore_tls                  = null
+      interval                    = 60
+      retry_interval              = 30
+      max_retries                 = 3
+    },
+    {
+      # TP-Link home router upstream of pfSense. Complements the
+      # `[External] gw` HTTPS monitor: this one checks the router
+      # directly on 443, so we can tell a Cloudflare/tunnel outage
+      # apart from the router itself being unreachable.
+      name                        = "TP-Link Gateway (192.168.1.1)"
+      type                        = "port"
+      database_connection_string  = null
+      database_password_vault_key = null
+      hostname                    = "192.168.1.1"
+      port                        = 443
+      url                         = null
+      accepted_statuscodes        = null
+      ignore_tls                  = null
+      interval                    = 60
+      retry_interval              = 30
+      max_retries                 = 3
+    },
+    {
+      # Proxmox web UI on the PVE host. Probes the IP directly (NOT a
+      # `*.viktorbarzin.lan` name) because in-cluster lookups for those
+      # are vulnerable to CoreDNS pod-level cache skew — pre-fix, this
+      # monitor would intermittently land on a stale `10.0.10.1`
+      # (pfSense gateway, nothing on :8006) and spuriously alert
+      # `ExternalAccessDivergence`. Direct-IP HTTPS eliminates that
+      # variable. Self-signed cert → ignore_tls=true. The 301→HTTPS
+      # redirect from pveproxy lands in the 300-399 band, so we accept
+      # 200-499 to cover redirect + auth-prompt responses.
+      name                        = "Proxmox UI"
+      type                        = "http"
+      database_connection_string  = null
+      database_password_vault_key = null
+      hostname                    = null
+      port                        = null
+      url                         = "https://192.168.1.127:8006/"
+      accepted_statuscodes        = ["200-299", "300-399", "400-499"]
+      ignore_tls                  = true
+      interval                    = 300
+      retry_interval              = 60
+      max_retries                 = 2
+    },
+    {
+      # Direct port probe of the Traefik MetalLB LB IP. Complements the
+      # `[External] traefik` HTTPS monitor (full DNS→CF→tunnel path) and the
+      # in-cluster `Traefik Dashboard` monitor: this one checks the dedicated
+      # LB IP + :443 bind directly, so a MetalLB L2 / Traefik-bind failure is
+      # distinguishable from a Cloudflare/tunnel outage. The IP is .203 (the
+      # DEDICATED Traefik LB, ETP=Local) — NOT the shared .200, which Traefik
+      # moved off on 2026-05-30. Replaces a hand-created monitor that still
+      # pointed at the dead .200:443. Keep this IP in sync with the Traefik LB
+      # in `docs/architecture/networking.md`.
+      name                        = "Traefik LoadBalancer (10.0.20.203)"
+      type                        = "port"
+      database_connection_string  = null
+      database_password_vault_key = null
+      hostname                    = "10.0.20.203"
+      port                        = 443
+      url                         = null
+      accepted_statuscodes        = null
+      ignore_tls                  = null
+      interval                    = 60
+      retry_interval              = 30
+      max_retries                 = 3
+    },
+    {
+      # Internal /healthz probe of the nextcloud-todos service. The `/cb`
+      # ingress carries the `[External]` HTTPS monitor (auto-created by
+      # external-monitor-sync), but those endpoints are HMAC-gated and only
+      # cover the callback path — this checks the app's own liveness inside
+      # the cluster on the ClusterIP svc. Plain HTTP, expects a clean 200.
+      name                        = "nextcloud-todos (/healthz)"
+      type                        = "http"
+      database_connection_string  = null
+      database_password_vault_key = null
+      hostname                    = null
+      port                        = null
+      url                         = "http://nextcloud-todos.nextcloud-todos.svc.cluster.local:8080/healthz"
+      accepted_statuscodes        = ["200-299"]
+      ignore_tls                  = false
+      interval                    = 60
+      retry_interval              = 30
+      max_retries                 = 3
+    },
+  ]
+}
+
+# Credentials for the internal monitor sync: the Uptime Kuma admin password plus
+# one DB_PASSWORD_<NAME> entry for each monitor that sets
+# database_password_vault_key. <NAME> is the monitor name upper-cased with every
+# non-alphanumeric character replaced by "_"; the same expression produces
+# `password_env` in the internal-monitor-targets ConfigMap, so the two must stay
+# in step.
+resource "kubernetes_secret" "internal_monitor_sync" {
+  metadata {
+    name      = "internal-monitor-sync"
+    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
+  }
+  data = merge(
+    { UPTIME_KUMA_PASSWORD = data.vault_kv_secret_v2.viktor.data["uptime_kuma_admin_password"] },
+    {
+      for m in local.internal_monitors :
+      "DB_PASSWORD_${upper(replace(m.name, "/[^A-Za-z0-9]/", "_"))}" =>
+      data.vault_kv_secret_v2.viktor.data[m.database_password_vault_key]
+      if m.database_password_vault_key != null
+    },
+  )
+}
+
+# Non-secret monitor specs for the internal monitor sync, rendered as a JSON
+# array in targets.json. Passwords are not included: each entry carries only the
+# name of the environment variable (`password_env`) that holds its password.
+resource "kubernetes_config_map_v1" "internal_monitor_targets" {
+  metadata {
+    name      = "internal-monitor-targets"
+    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
+  }
+  data = {
+    "targets.json" = jsonencode([
+      for m in local.internal_monitors : {
+        name                       = m.name
+        type                       = m.type
+        database_connection_string = m.database_connection_string
+        hostname                   = m.hostname
+        port                       = m.port
+        url                        = m.url
+        accepted_statuscodes       = m.accepted_statuscodes
+        ignore_tls                 = m.ignore_tls
+        # Must match the key generated in kubernetes_secret.internal_monitor_sync;
+        # null when the monitor has no password.
+        password_env               = m.database_password_vault_key != null ? "DB_PASSWORD_${upper(replace(m.name, "/[^A-Za-z0-9]/", "_"))}" : null
+        interval                   = m.interval
+        retry_interval             = m.retry_interval
+        max_retries                = m.max_retries
+      }
+    ])
+  }
+}
+
+# Every 10 minutes, reconciles Uptime Kuma monitors with the targets listed in
+# /config/targets.json (mounted from the internal-monitor-targets ConfigMap).
+# Monitors are matched by name: missing ones are created, existing ones are
+# edited when a managed field differs. The script never deletes monitors.
+# Credentials (UPTIME_KUMA_PASSWORD and any per-monitor password variables)
+# are injected as environment variables from the internal_monitor_sync secret.
+resource "kubernetes_cron_job_v1" "internal_monitor_sync" {
+  metadata {
+    name      = "internal-monitor-sync"
+    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
+  }
+  spec {
+    # Forbid: a new run is skipped while the previous one is still active.
+    concurrency_policy            = "Forbid"
+    failed_jobs_history_limit     = 3
+    successful_jobs_history_limit = 3
+    schedule                      = "*/10 * * * *"
+    job_template {
+      metadata {}
+      spec {
+        backoff_limit              = 1
+        ttl_seconds_after_finished = 300
+        template {
+          metadata {}
+          spec {
+            container {
+              name  = "sync"
+              image = "docker.io/library/python:3.12-alpine"
+              # The Python program is fed to python3 through a shell heredoc.
+              # Its lines must stay at column 0 so that neither Terraform's
+              # <<- indentation stripping nor the shell alters the Python
+              # indentation, and so the PYEOF terminator is recognised.
+              command = ["/bin/sh", "-c", <<-EOT
+                # Abort immediately if the dependency install fails instead of
+                # surfacing it later as an ImportError.
+                set -e
+                pip install --quiet --disable-pip-version-check uptime-kuma-api
+                python3 << 'PYEOF'
+import json, os, time
+from uptime_kuma_api import UptimeKumaApi, MonitorType
+
+UPTIME_KUMA_URL = "http://uptime-kuma.uptime-kuma.svc.cluster.local"
+UPTIME_KUMA_PASS = os.environ["UPTIME_KUMA_PASSWORD"]
+
+with open("/config/targets.json") as f:
+    targets = json.load(f)
+
+api = UptimeKumaApi(UPTIME_KUMA_URL, timeout=120, wait_events=0.2)
+try:
+    api.login("admin", UPTIME_KUMA_PASS)
+
+    # Index the monitors that already exist by name; name is the join key
+    # between targets.json and Uptime Kuma.
+    existing = {m["name"]: m for m in api.get_monitors()}
+
+    for t in targets:
+        name = t["name"]
+        mtype = MonitorType(t["type"])
+        # MYSQL uses `databaseConnectionString` + `radiusPassword` (UK v2 re-uses
+        # radiusPassword for mysql auth — backwards compat). Redis has auth
+        # disabled on the cluster, so password_env is null. PORT monitors use
+        # hostname + port directly. HTTP monitors use url + accepted_statuscodes
+        # + ignoreTls (camelCase on the API; stored as `ignore_tls` in DB).
+        desired = {
+            "type": mtype,
+            "name": name,
+            "interval": t["interval"],
+            "retryInterval": t["retry_interval"],
+            "maxretries": t["max_retries"],
+        }
+        if mtype == MonitorType.PORT:
+            desired["hostname"] = t["hostname"]
+            desired["port"] = t["port"]
+        elif mtype == MonitorType.HTTP:
+            desired["url"] = t["url"]
+            desired["accepted_statuscodes"] = t["accepted_statuscodes"]
+            desired["ignoreTls"] = bool(t["ignore_tls"])
+        else:
+            # Any other type is treated as a connection-string based monitor.
+            desired["databaseConnectionString"] = t["database_connection_string"]
+            if t.get("password_env"):
+                desired["radiusPassword"] = os.environ[t["password_env"]]
+
+        if name not in existing:
+            print(f"Creating monitor: {name}")
+            api.add_monitor(**desired)
+        else:
+            m = existing[name]
+            # Fields this job manages for the monitor. `type` is included so
+            # that a target whose kind changed under the same name is actually
+            # converted instead of keeping its old type.
+            drift_fields = ["type", "interval", "retryInterval", "maxretries"]
+            if mtype == MonitorType.PORT:
+                drift_fields += ["hostname", "port"]
+            elif mtype == MonitorType.HTTP:
+                drift_fields += ["url", "accepted_statuscodes", "ignoreTls"]
+            else:
+                drift_fields += ["databaseConnectionString"]
+                if "radiusPassword" in desired:
+                    drift_fields += ["radiusPassword"]
+            drifted = any(m.get(f) != desired.get(f) for f in drift_fields)
+            if drifted:
+                print(f"Updating monitor {name} (id={m['id']})")
+                edit_kwargs = {f: desired[f] for f in drift_fields if f in desired}
+                api.edit_monitor(m["id"], **edit_kwargs)
+            else:
+                print(f"Monitor {name} (id={m['id']}) already in desired state")
+        # Pace requests for every target, including freshly created ones.
+        time.sleep(0.3)
+finally:
+    # Always close the session, even when login or a sync step raised.
+    api.disconnect()
+
+print("Internal monitor sync complete")
+PYEOF
+              EOT
+              ]
+              env_from {
+                secret_ref {
+                  name = kubernetes_secret.internal_monitor_sync.metadata[0].name
+                }
+              }
+              volume_mount {
+                name       = "config"
+                mount_path = "/config"
+                read_only  = true
+              }
+              resources {
+                requests = {
+                  memory = "128Mi"
+                  cpu    = "10m"
+                }
+                limits = {
+                  memory = "256Mi"
+                }
+              }
+            }
+            volume {
+              name = "config"
+              config_map {
+                name = kubernetes_config_map_v1.internal_monitor_targets.metadata[0].name
+              }
+            }
+            dns_config {
+              option {
+                name  = "ndots"
+                value = "2"
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
+  }
+}