fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]
6d224861 came from a --no-checkout worktree whose empty index made the
commit drop every file except two. This restores 05b50d2b's full tree and
correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su
entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the
live infra was never applied from the broken commit.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
6d224861c4
commit
fd0f4a0365
1166 changed files with 358546 additions and 0 deletions
920
stacks/uptime-kuma/modules/uptime-kuma/main.tf
Normal file
920
stacks/uptime-kuma/modules/uptime-kuma/main.tf
Normal file
|
|
@ -0,0 +1,920 @@
|
|||
# Name of the TLS secret; forwarded to the tls_secret module and both ingresses.
variable "tls_secret_name" {
}

# Tier label stamped on the namespace and the Deployment.
variable "tier" {
  type = string
}

# NFS server address. In this file it is referenced only by the commented-out
# SQLite backup CronJob.
variable "nfs_server" {
  type = string
}

# Names used to build the legacy ConfigMap fallback list of external monitor
# targets (see local.external_monitor_targets).
variable "cloudflare_proxied_names" {
  type = list(string)
}
|
||||
|
||||
# Vault KV v2 entry `viktor` on the `secret` mount. This module reads the
# Uptime Kuma admin password and per-monitor database passwords from it.
data "vault_kv_secret_v2" "viktor" {
  name  = "viktor"
  mount = "secret"
}
|
||||
|
||||
locals {
  # Proxied names that do not answer standard HTTP health checks; they are
  # excluded from the fallback target list below.
  non_http_services = toset(["xray-vless", "xray-ws", "xray-grpc"])

  # One {name, hostname, url} object per proxied name. The apex entry
  # ("viktorbarzin.me") is used as-is; every other name becomes a subdomain.
  external_monitor_targets = [
    for svc in var.cloudflare_proxied_names : {
      name     = svc
      hostname = svc == "viktorbarzin.me" ? svc : "${svc}.viktorbarzin.me"
      url      = "https://${svc == "viktorbarzin.me" ? svc : "${svc}.viktorbarzin.me"}"
    }
    if !contains(local.non_http_services, svc)
  ]
}
|
||||
|
||||
# Namespace that holds every Uptime Kuma object defined in this module.
resource "kubernetes_namespace" "uptime-kuma" {
  metadata {
    name = "uptime-kuma"

    labels = {
      "keel.sh/enrolled" = "true"
      tier               = var.tier
    }

    # Disabled alternative label set (Istio sidecar injection):
    # labels = {
    #   "istio-injection" : "enabled"
    # }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: the goldilocks-vpa-auto-mode ClusterPolicy stamps
    # this label on every namespace, so Terraform must not fight it.
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
|
||||
|
||||
# Provisions the TLS secret inside the uptime-kuma namespace via the shared
# setup_tls_secret module.
module "tls_secret" {
  source = "../../../../modules/kubernetes/setup_tls_secret"

  tls_secret_name = var.tls_secret_name
  namespace       = kubernetes_namespace.uptime-kuma.metadata[0].name
}
|
||||
|
||||
# Data volume for Uptime Kuma (mounted at /app/data by the Deployment).
resource "kubernetes_persistent_volume_claim" "data_proxmox" {
  # Do not block the apply waiting for the claim to bind.
  wait_until_bound = false

  metadata {
    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
    name      = "uptime-kuma-data-proxmox"

    # Auto-resize settings read by the resize.topolvm.io controller.
    annotations = {
      "resize.topolvm.io/threshold"     = "10%"
      "resize.topolvm.io/increase"      = "50%"
      "resize.topolvm.io/storage_limit" = "20Gi"
    }
  }

  spec {
    storage_class_name = "proxmox-lvm"
    access_modes       = ["ReadWriteOnce"]

    resources {
      requests = {
        storage = "5Gi"
      }
    }
  }

  lifecycle {
    # The autoresizer grows requests.storage (up to storage_limit) and PVCs
    # cannot shrink. If Terraform tracked this field, every apply would try
    # to revert to the value above, Kubernetes would reject the shrink, and
    # the PVC would end up stuck Terminating while still in use.
    ignore_changes = [
      spec[0].resources[0].requests,
    ]
  }
}
|
||||
|
||||
# Uptime Kuma server: one replica, image pinned, data on the proxmox-lvm PVC.
resource "kubernetes_deployment" "uptime-kuma" {
  metadata {
    name      = "uptime-kuma"
    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
    labels = {
      app  = "uptime-kuma"
      tier = var.tier
      # Opt out of Kyverno's inject-keel-annotations ClusterPolicy. The Kyverno
      # rule excludes any workload with this LABEL (see
      # stacks/kyverno/modules/kyverno/keel-annotations.tf, exclude.any
      # matchLabels keel.sh/policy=never). Without the label, Kyverno would
      # silently re-add `keel.sh/policy=force` after every reconcile, undoing
      # the annotation below.
      "keel.sh/policy" = "never"
    }
    annotations = {
      "reloader.stakater.com/search" = "true"
      # Stop Keel polling for this workload. Even with match-tag=true,
      # Keel auto-downgraded :2 → :1 on 2026-05-26 12:14, which v1 booted
      # into SQLite mode and couldn't read the existing MariaDB store
      # (db-config.json) → 4h CrashLoopBackOff. Pinning the image string
      # alone isn't enough because Keel kept fighting the apply. Combined
      # with the matching LABEL above, this fully bypasses Keel.
      "keel.sh/policy" = "never"
    }
  }

  spec {
    replicas = 1

    # Stop the old pod before starting the new one (presumably because the
    # data PVC is ReadWriteOnce — confirm before switching to RollingUpdate).
    strategy {
      type = "Recreate"
    }

    selector {
      match_labels = {
        app = "uptime-kuma"
      }
    }

    template {
      metadata {
        annotations = {
          "diun.enable" = "true"
          # NOTE(review): the image below is pinned to 2.3.2 while diun is
          # told to include the `latest` tag — confirm this is still intended.
          "diun.include_tags" = "latest"
        }
        labels = {
          app = "uptime-kuma"
        }
      }

      spec {
        container {
          # Pinned to 2.3.2 because Keel auto-downgraded :2 → :1 on 2026-05-26
          # 12:14 UTC despite the Kyverno-injected `keel.sh/match-tag=true` +
          # `keel.sh/policy=force` annotation pair (which is supposed to gate
          # digest changes only). The v1 image opens kuma.db (SQLite) at boot
          # and can't read the v2 db-config.json → 4h CrashLoopBackOff while
          # the MariaDB store sat intact. Until the keel-match-tag regression
          # is root-caused, pin minor versions explicitly.
          image = "louislam/uptime-kuma:2.3.2"
          name  = "uptime-kuma"

          resources {
            requests = {
              cpu    = "100m"
              memory = "128Mi"
            }
            limits = {
              memory = "512Mi"
            }
          }

          port {
            container_port = 3001
          }

          liveness_probe {
            http_get {
              path = "/"
              port = 3001
            }
            initial_delay_seconds = 15
            period_seconds        = 30
            timeout_seconds       = 5
            failure_threshold     = 5
          }

          readiness_probe {
            http_get {
              path = "/"
              port = 3001
            }
            initial_delay_seconds = 5
            period_seconds        = 30
            timeout_seconds       = 5
            failure_threshold     = 3
          }

          volume_mount {
            name       = "data"
            mount_path = "/app/data"
          }
        }

        volume {
          name = "data"
          persistent_volume_claim {
            claim_name = kubernetes_persistent_volume_claim.data_proxmox.metadata[0].name
          }
        }

        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
      # `keel.sh/policy` is intentionally NOT ignored — we want TF to own it
      # as `never` so a Kyverno reconcile (or manual kubectl) can't flip it
      # back to `force` and re-enable auto-updates.
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
      # The container image is deliberately NOT ignored either. This workload
      # is opted out of Keel (`keel.sh/policy=never`) and the tag is pinned
      # above; ignoring the image would stop Terraform from rolling out a
      # changed pin and from reverting out-of-band image changes.
      metadata[0].annotations["kubernetes.io/change-cause"],
      metadata[0].annotations["deployment.kubernetes.io/revision"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
      metadata[0].annotations["keel.sh/match-tag"],                       # injected by Kyverno
    ]
  }
}
|
||||
# ClusterIP-style Service in front of the Uptime Kuma pod: port 80 forwards to
# the container's 3001.
resource "kubernetes_service" "uptime-kuma" {
  metadata {
    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
    name      = "uptime-kuma"

    labels = {
      app = "uptime-kuma"
    }
  }

  spec {
    port {
      port        = "80"
      target_port = "3001"
    }

    # Matches the pod template labels of the Deployment.
    selector = {
      app = "uptime-kuma"
    }
  }
}
|
||||
# Main ingress for the Uptime Kuma UI: auth required, proxied DNS, and the
# Homepage dashboard tile/widget annotations.
module "ingress" {
  source = "../../../../modules/kubernetes/ingress_factory"

  name            = "uptime"
  namespace       = kubernetes_namespace.uptime-kuma.metadata[0].name
  service_name    = "uptime-kuma"
  tls_secret_name = var.tls_secret_name

  auth     = "required"
  dns_type = "proxied"

  extra_annotations = {
    "gethomepage.dev/enabled"      = "true"
    "gethomepage.dev/name"         = "Uptime Kuma"
    "gethomepage.dev/description"  = "Uptime monitor"
    "gethomepage.dev/group"        = "Core Platform"
    "gethomepage.dev/icon"         = "uptime-kuma.png"
    "gethomepage.dev/pod-selector" = ""
    "gethomepage.dev/widget.type"  = "uptimekuma"
    "gethomepage.dev/widget.url"   = "http://uptime-kuma.uptime-kuma.svc.cluster.local"
    "gethomepage.dev/widget.slug"  = "infra"
  }
}
|
||||
|
||||
# Path-level carve-out for Uptime Kuma's public-by-design endpoints.
|
||||
# The main ingress above gates the ENTIRE site (path "/") behind Authentik
|
||||
# forward-auth — which 302-bounces the public status pages, push-monitor
|
||||
# ingest, status-page API, badges, and static assets to the SSO login. Status
|
||||
# pages are meant for logged-out viewers and push monitors POST from machines;
|
||||
# neither can follow the Authentik 302 → OAuth → cookie dance, so all of these
|
||||
# were broken (302 instead of 200/JSON). This second ingress points the public
|
||||
# paths at the same uptime-kuma Service with NO Authentik middleware. Traefik
|
||||
# routes by rule length, so these path-scoped routers out-prioritise the "/"
|
||||
# catch-all (same mechanism as the meshcentral agent carve-out, commit
|
||||
# 9a15f3f2). The dashboard ("/", "/dashboard", "/manage-*", "/add", "/edit",
|
||||
# "/settings", "/setup") stays Authentik-gated via the module above. Uptime
|
||||
# Kuma is WebSocket-based; the ingress_factory default middleware chain passes
|
||||
# Upgrade/Connection through unchanged, so the realtime status UI still works.
|
||||
# Second ingress on the same host that exposes only Uptime Kuma's
# public-by-design paths, without authentication.
module "ingress_public" {
  source = "../../../../modules/kubernetes/ingress_factory"

  name            = "uptime-public"
  namespace       = kubernetes_namespace.uptime-kuma.metadata[0].name
  service_name    = "uptime-kuma"
  full_host       = "uptime.viktorbarzin.me"
  tls_secret_name = var.tls_secret_name

  # auth = "none": public status pages and push-monitor/badge endpoints are
  # hit by logged-out viewers and by machines, which cannot do Authentik SSO.
  auth = "none"

  ingress_path = [
    "/status",          # public status pages (/status/<slug>)
    "/api/status-page", # status-page data + heartbeat API
    "/api/push",        # push-monitor ingest (/api/push/<key>)
    "/api/badge",       # status/uptime/ping badges
    "/assets",          # JS/CSS/font bundles for the status page
    "/icon.svg",        # favicon / logo
    "/upload",          # uploaded status-page logos/images
  ]

  dns_type         = "none" # DNS already owned by the main uptime ingress above.
  anti_ai_scraping = false  # Status pages + push ingest are machine/anon-hit; bot-block forwardAuth would break them.
  homepage_enabled = false  # Homepage tile belongs to the main UI ingress.
  external_monitor = false  # The main ingress already carries the external monitor.
}
|
||||
|
||||
# CronJob for daily SQLite backups. No longer needed now that Uptime Kuma uses MySQL/MariaDB; kept commented out for reference.
|
||||
# resource "kubernetes_cron_job_v1" "sqlite-backup" {
|
||||
# metadata {
|
||||
# name = "backup"
|
||||
# namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
|
||||
# }
|
||||
# spec {
|
||||
# concurrency_policy = "Replace"
|
||||
# failed_jobs_history_limit = 5
|
||||
# schedule = "0 0 * * *"
|
||||
# # schedule = "* * * * *"
|
||||
# starting_deadline_seconds = 10
|
||||
# successful_jobs_history_limit = 3
|
||||
# job_template {
|
||||
# metadata {}
|
||||
# spec {
|
||||
# active_deadline_seconds = 600 # should finish in 10 minutes
|
||||
# backoff_limit = 3
|
||||
# ttl_seconds_after_finished = 10
|
||||
# template {
|
||||
# metadata {}
|
||||
# spec {
|
||||
# container {
|
||||
# name = "backup"
|
||||
# image = "alpine/sqlite:latest"
|
||||
# command = ["/bin/sh", "-c", <<-EOT
|
||||
# set -e
|
||||
# export now=$(date +"%Y_%m_%d_%H_%M")
|
||||
# echo "Backing up SQLite database to /app/data/backup/backup_$now.sqlite"
|
||||
# sqlite3 /app/data/kuma.db ".backup /app/data/backup/backup_$now.sqlite"
|
||||
# echo "Backup completed. Deleting old backups..."
|
||||
|
||||
# # Rotate - delete last log file
|
||||
# cd /app/data/backup
|
||||
# find . -name "*.sqlite" -type f -mtime +7 -delete # 7 day retention of backups
|
||||
# echo "Old backups deleted."
|
||||
# EOT
|
||||
# ]
|
||||
# volume_mount {
|
||||
# name = "data"
|
||||
# mount_path = "/app/data"
|
||||
# }
|
||||
# }
|
||||
# volume {
|
||||
# name = "data"
|
||||
# nfs {
|
||||
# server = var.nfs_server
|
||||
# path = "/mnt/main/uptime-kuma"
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
|
||||
# =============================================================================
|
||||
# External Monitor Sync
|
||||
# Ensures Uptime Kuma has an external HTTPS monitor for every ingress whose
# host is under *.viktorbarzin.me. Discovery is opt-OUT: an ingress is skipped
# only when it is annotated `uptime.viktorbarzin.me/external-monitor=false`.
# Falls back to a Terraform-generated ConfigMap when API discovery fails or
# returns no targets.
#
# Discovery modes (the script tries them in order):
#   1. K8s API — list ingresses cluster-wide, dropping opted-out ingresses
#      and hosts outside *.viktorbarzin.me
#   2. ConfigMap fallback — read /config/targets.json (legacy list from
#      cloudflare_proxied_names)
|
||||
# =============================================================================
|
||||
|
||||
# Identity the external-monitor-sync CronJob runs as; it is bound to the
# ingress-reading ClusterRole of the same name.
resource "kubernetes_service_account_v1" "external_monitor_sync" {
  metadata {
    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
    name      = "external-monitor-sync"
  }
}
|
||||
|
||||
# Read-only, cluster-wide access to Ingress objects (list + get only).
resource "kubernetes_cluster_role_v1" "external_monitor_sync" {
  metadata {
    name = "external-monitor-sync"
  }

  rule {
    verbs      = ["list", "get"]
    resources  = ["ingresses"]
    api_groups = ["networking.k8s.io"]
  }
}
|
||||
|
||||
# Grants the external-monitor-sync ServiceAccount the ingress-reading
# ClusterRole.
resource "kubernetes_cluster_role_binding_v1" "external_monitor_sync" {
  metadata {
    name = "external-monitor-sync"
  }

  subject {
    kind      = "ServiceAccount"
    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
    name      = kubernetes_service_account_v1.external_monitor_sync.metadata[0].name
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = kubernetes_cluster_role_v1.external_monitor_sync.metadata[0].name
  }
}
|
||||
|
||||
# Fallback target list for the external monitor sync; the CronJob mounts it
# at /config and reads targets.json when API discovery yields nothing.
resource "kubernetes_config_map_v1" "external_monitor_targets" {
  metadata {
    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
    name      = "external-monitor-targets"
  }

  data = {
    "targets.json" = jsonencode(local.external_monitor_targets)
  }
}
|
||||
|
||||
# Every 10 minutes, reconcile Uptime Kuma's "[External] *" HTTP monitors with
# the public ingresses discovered in the cluster (or, failing that, with the
# ConfigMap fallback list): create missing monitors, update drifted ones and
# delete monitors whose target no longer exists.
resource "kubernetes_cron_job_v1" "external_monitor_sync" {
  metadata {
    name      = "external-monitor-sync"
    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
  }
  spec {
    concurrency_policy            = "Forbid"
    failed_jobs_history_limit     = 3
    successful_jobs_history_limit = 3
    schedule                      = "*/10 * * * *"
    job_template {
      metadata {}
      spec {
        backoff_limit              = 1
        ttl_seconds_after_finished = 300
        template {
          metadata {}
          spec {
            service_account_name = kubernetes_service_account_v1.external_monitor_sync.metadata[0].name
            container {
              name  = "sync"
              image = "docker.io/library/python:3.12-alpine"
              # NOTE: the Python code below must not contain a dollar-brace or
              # percent-brace sequence; Terraform would treat it as a template
              # directive inside this heredoc.
              command = ["/bin/sh", "-c", <<-EOT
                # Abort immediately if the dependency install fails instead of
                # running the script without its library.
                set -e
                pip install --quiet --disable-pip-version-check uptime-kuma-api
                python3 << 'PYEOF'
                import os, json, ssl, time, urllib.request, urllib.error
                from uptime_kuma_api import UptimeKumaApi, MonitorType

                UPTIME_KUMA_URL = "http://uptime-kuma.uptime-kuma.svc.cluster.local"
                UPTIME_KUMA_PASS = os.environ["UPTIME_KUMA_PASSWORD"]
                FALLBACK_FILE = "/config/targets.json"
                PREFIX = "[External] "
                ANNOTATION_ENABLE = "uptime.viktorbarzin.me/external-monitor"
                ANNOTATION_NAME = "uptime.viktorbarzin.me/external-monitor-name"
                ANNOTATION_PATH = "uptime.viktorbarzin.me/external-monitor-path"
                DEFAULT_PATH = "/"
                # Homepages often serve 200/30x/40x even when backends are degraded.
                # When an explicit probe path is set we expect a real healthz: tighten codes.
                STATUSCODES_LENIENT = ["200-299", "300-399", "400-499"]
                STATUSCODES_STRICT = ["200-299"]
                SA_DIR = "/var/run/secrets/kubernetes.io/serviceaccount"
                API_SERVER = f"https://{os.environ.get('KUBERNETES_SERVICE_HOST', 'kubernetes.default.svc.cluster.local')}:{os.environ.get('KUBERNETES_SERVICE_PORT', '443')}"


                def load_from_api():
                    """List ingresses via in-cluster API. Opt-OUT by default:
                    every ingress whose host matches *.viktorbarzin.me gets a monitor,
                    UNLESS its annotation `uptime.viktorbarzin.me/external-monitor` is `"false"`.
                    This covers Helm-managed ingresses (authentik, grafana, vault, forgejo, ntfy)
                    that don't go through ingress_factory.

                    Returns a list of {"name", "url", "statuscodes"} dicts."""
                    with open(f"{SA_DIR}/token") as f:
                        token = f.read().strip()
                    ctx = ssl.create_default_context(cafile=f"{SA_DIR}/ca.crt")
                    url = f"{API_SERVER}/apis/networking.k8s.io/v1/ingresses"
                    req = urllib.request.Request(url, headers={"Authorization": f"Bearer {token}"})
                    with urllib.request.urlopen(req, context=ctx, timeout=30) as resp:
                        body = json.loads(resp.read())

                    targets = []
                    seen = set()
                    for ing in body.get("items", []):
                        anns = (ing.get("metadata") or {}).get("annotations") or {}
                        if anns.get(ANNOTATION_ENABLE, "").lower() == "false":
                            continue  # explicit opt-out
                        # Prefer the first TLS host; otherwise use the first rule's host.
                        tls = (ing.get("spec") or {}).get("tls") or []
                        host = None
                        if tls and tls[0].get("hosts"):
                            host = tls[0]["hosts"][0]
                        else:
                            rules = (ing.get("spec") or {}).get("rules") or []
                            if rules:
                                host = rules[0].get("host")
                        if not host or not host.endswith(".viktorbarzin.me"):
                            continue  # skip internal-only or non-public hosts
                        label = anns.get(ANNOTATION_NAME) or host.split(".")[0]
                        monitor_name = f"{PREFIX}{label}"
                        if monitor_name in seen:
                            continue  # dedupe by final monitor name, not hostname (fixes duplicate creation)
                        seen.add(monitor_name)
                        path = anns.get(ANNOTATION_PATH, "").strip()
                        if path and not path.startswith("/"):
                            path = "/" + path
                        # Omit trailing slash when no explicit path — matches pre-existing monitor URLs
                        # and avoids every sync re-updating unchanged monitors.
                        url = f"https://{host}{path}" if path else f"https://{host}"
                        statuscodes = STATUSCODES_STRICT if path else STATUSCODES_LENIENT
                        targets.append({"name": label, "url": url, "statuscodes": statuscodes})
                    return targets


                def load_from_configmap():
                    """Legacy fallback: read the ConfigMap list."""
                    with open(FALLBACK_FILE) as f:
                        raw = json.load(f)
                    return [{"name": t["name"], "url": t["url"], "statuscodes": STATUSCODES_LENIENT} for t in raw]


                try:
                    targets = load_from_api()
                    source = "k8s-api"
                    if not targets:
                        print("WARN: k8s-api returned 0 targets; falling back to ConfigMap")
                        targets = load_from_configmap()
                        source = "configmap"
                except (urllib.error.URLError, OSError, KeyError, ValueError) as e:
                    print(f"WARN: k8s-api discovery failed ({e!r}); falling back to ConfigMap")
                    targets = load_from_configmap()
                    source = "configmap"

                print(f"Loaded {len(targets)} external monitor targets (source={source})")

                api = UptimeKumaApi(UPTIME_KUMA_URL, timeout=120, wait_events=0.2)
                # try/finally: always close the connection, even when login or a
                # monitor operation raises part-way through the sync.
                try:
                    api.login("admin", UPTIME_KUMA_PASS)

                    # Only monitors carrying the "[External] " prefix are managed here.
                    monitors = api.get_monitors()
                    existing_external = {}
                    for m in monitors:
                        if m["name"].startswith(PREFIX):
                            existing_external[m["name"]] = m

                    target_names = set()
                    targets_by_name = {}
                    created = 0
                    for t in targets:
                        monitor_name = f"{PREFIX}{t['name']}"
                        target_names.add(monitor_name)
                        targets_by_name[monitor_name] = t
                        if monitor_name not in existing_external:
                            print(f"Creating monitor: {monitor_name} -> {t['url']}")
                            api.add_monitor(
                                type=MonitorType.HTTP,
                                name=monitor_name,
                                url=t["url"],
                                interval=300,
                                maxretries=3,
                                accepted_statuscodes=t["statuscodes"],
                            )
                            created += 1
                            time.sleep(0.3)

                    # Update monitors whose target URL or accepted status codes drifted
                    # (e.g., new probe-path annotation added on an existing ingress).
                    updated = 0
                    for monitor_name, t in targets_by_name.items():
                        existing = existing_external.get(monitor_name)
                        if not existing:
                            continue
                        current_url = existing.get("url")
                        current_codes = existing.get("accepted_statuscodes") or []
                        if current_url == t["url"] and current_codes == t["statuscodes"]:
                            continue
                        print(f"Updating monitor {monitor_name}: {current_url} -> {t['url']} (codes {current_codes} -> {t['statuscodes']})")
                        api.edit_monitor(
                            existing["id"],
                            url=t["url"],
                            accepted_statuscodes=t["statuscodes"],
                        )
                        updated += 1
                        time.sleep(0.3)

                    # Remove monitors for services no longer in the list
                    deleted = 0
                    for name, m in existing_external.items():
                        if name not in target_names:
                            print(f"Deleting orphaned monitor: {name}")
                            api.delete_monitor(m["id"])
                            deleted += 1
                            time.sleep(0.3)
                finally:
                    api.disconnect()

                unchanged = len(target_names) - created - updated
                print(f"Sync complete: {created} created, {updated} updated, {deleted} deleted, {unchanged} unchanged")
                PYEOF
              EOT
              ]
              # Read the admin password from the module's existing Secret rather
              # than inlining the Vault value, so it does not appear in plain
              # text in the CronJob/Pod spec.
              env {
                name = "UPTIME_KUMA_PASSWORD"
                value_from {
                  secret_key_ref {
                    name = kubernetes_secret.internal_monitor_sync.metadata[0].name
                    key  = "UPTIME_KUMA_PASSWORD"
                  }
                }
              }
              volume_mount {
                name       = "config"
                mount_path = "/config"
                read_only  = true
              }
              resources {
                requests = {
                  memory = "128Mi"
                  cpu    = "10m"
                }
                limits = {
                  memory = "256Mi"
                }
              }
            }
            volume {
              name = "config"
              config_map {
                name = kubernetes_config_map_v1.external_monitor_targets.metadata[0].name
              }
            }
            dns_config {
              option {
                name  = "ndots"
                value = "2"
              }
            }
          }
        }
      }
    }
  }
  lifecycle {
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
  }
}
|
||||
|
||||
# =============================================================================
|
||||
# Internal Monitor Sync
|
||||
# Declaratively manages monitors for internal services (databases, non-HTTP
|
||||
# endpoints) that can't be discovered from ingress annotations. Idempotent:
|
||||
# looks up monitors by name, creates if missing, patches if drifted.
|
||||
#
|
||||
# Why a CronJob and not a one-shot Job:
|
||||
# - louislam/uptime-kuma has no Terraform provider (only a CLI tool).
|
||||
# - UK v2 stores monitors in MariaDB (`uptimekuma` on mysql.dbaas); if the DB
|
||||
# is wiped/restored we must re-create them.
|
||||
# - CronJob self-heals drift (manual UI edits, UK restarts, DB restores).
|
||||
#
|
||||
# Managed monitors (name -> desired spec) are defined in local.internal_monitors
|
||||
# below. Add new internal-service monitors there.
|
||||
# =============================================================================
|
||||
|
||||
locals {
  # Desired state of the internal monitors. Every object carries the same
  # attribute set (unused ones are null) because the Secret and ConfigMap
  # below read each attribute off every entry.
  internal_monitors = [
    {
      name           = "MySQL Standalone (dbaas)"
      type           = "mysql"
      interval       = 60
      retry_interval = 60
      max_retries    = 2

      database_connection_string  = "mysql://uptimekuma@mysql.dbaas.svc.cluster.local:3306"
      database_password_vault_key = "uptimekuma_db_password"

      hostname             = null
      port                 = null
      url                  = null
      accepted_statuscodes = null
      ignore_tls           = null
    },
    {
      # The HAProxy service in the redis namespace health-checks
      # INFO replication and routes only to the current Sentinel-elected
      # master, so this monitor survives failover. Auth is disabled in the
      # Bitnami chart, hence no password_vault_key.
      name           = "Redis"
      type           = "redis"
      interval       = 60
      retry_interval = 30
      max_retries    = 3

      database_connection_string  = "redis://redis-master.redis.svc.cluster.local:6379"
      database_password_vault_key = null

      hostname             = null
      port                 = null
      url                  = null
      accepted_statuscodes = null
      ignore_tls           = null
    },
    {
      # TP-Link home router upstream of pfSense. Complements the
      # `[External] gw` HTTPS monitor by probing the router directly on 443,
      # which separates a Cloudflare/tunnel outage from the router itself
      # being unreachable.
      name           = "TP-Link Gateway (192.168.1.1)"
      type           = "port"
      interval       = 60
      retry_interval = 30
      max_retries    = 3

      hostname = "192.168.1.1"
      port     = 443

      database_connection_string  = null
      database_password_vault_key = null
      url                         = null
      accepted_statuscodes        = null
      ignore_tls                  = null
    },
    {
      # Proxmox web UI on the PVE host. Probed by IP (NOT a
      # `*.viktorbarzin.lan` name) because in-cluster lookups for those names
      # are vulnerable to CoreDNS pod-level cache skew — before this change
      # the monitor would intermittently land on a stale `10.0.10.1`
      # (pfSense gateway, nothing on :8006) and spuriously fire
      # `ExternalAccessDivergence`. Direct-IP HTTPS removes that variable.
      # The cert is self-signed → ignore_tls=true. pveproxy's 301→HTTPS
      # redirect falls in the 300-399 band, so 200-499 is accepted to cover
      # redirect + auth-prompt responses.
      name           = "Proxmox UI"
      type           = "http"
      interval       = 300
      retry_interval = 60
      max_retries    = 2

      url                  = "https://192.168.1.127:8006/"
      accepted_statuscodes = ["200-299", "300-399", "400-499"]
      ignore_tls           = true

      database_connection_string  = null
      database_password_vault_key = null
      hostname                    = null
      port                        = null
    },
    {
      # Direct port probe of the Traefik MetalLB LB IP. Complements the
      # `[External] traefik` HTTPS monitor (full DNS→CF→tunnel path) and the
      # in-cluster `Traefik Dashboard` monitor: this one checks the dedicated
      # LB IP + :443 bind directly, so a MetalLB L2 / Traefik-bind failure
      # can be told apart from a Cloudflare/tunnel outage. The IP is .203
      # (the DEDICATED Traefik LB, ETP=Local) — NOT the shared .200, which
      # Traefik moved off on 2026-05-30. Replaces a hand-created monitor that
      # still pointed at the dead .200:443. Keep this IP in sync with the
      # Traefik LB in `docs/architecture/networking.md`.
      name           = "Traefik LoadBalancer (10.0.20.203)"
      type           = "port"
      interval       = 60
      retry_interval = 30
      max_retries    = 3

      hostname = "10.0.20.203"
      port     = 443

      database_connection_string  = null
      database_password_vault_key = null
      url                         = null
      accepted_statuscodes        = null
      ignore_tls                  = null
    },
    {
      # Internal /healthz probe of the nextcloud-todos service. The `/cb`
      # ingress already has an `[External]` HTTPS monitor (auto-created by
      # external-monitor-sync), but those endpoints are HMAC-gated and only
      # cover the callback path — this one checks the app's own liveness
      # inside the cluster on the ClusterIP svc. Plain HTTP, expects a
      # clean 200.
      name           = "nextcloud-todos (/healthz)"
      type           = "http"
      interval       = 60
      retry_interval = 30
      max_retries    = 3

      url                  = "http://nextcloud-todos.nextcloud-todos.svc.cluster.local:8080/healthz"
      accepted_statuscodes = ["200-299"]
      ignore_tls           = false

      database_connection_string  = null
      database_password_vault_key = null
      hostname                    = null
      port                        = null
    },
  ]
}
|
||||
|
||||
# Credentials for the internal monitor sync job: the Uptime Kuma admin
# password plus one DB_PASSWORD_<NAME> key for each monitor that declares a
# database_password_vault_key.
resource "kubernetes_secret" "internal_monitor_sync" {
  metadata {
    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
    name      = "internal-monitor-sync"
  }

  data = merge(
    {
      UPTIME_KUMA_PASSWORD = data.vault_kv_secret_v2.viktor.data["uptime_kuma_admin_password"]
    },
    {
      # Key = monitor name upper-cased with every non-alphanumeric character
      # replaced by "_"; the targets ConfigMap derives password_env the same way.
      for mon in local.internal_monitors :
      "DB_PASSWORD_${upper(replace(mon.name, "/[^A-Za-z0-9]/", "_"))}" =>
      data.vault_kv_secret_v2.viktor.data[mon.database_password_vault_key]
      if mon.database_password_vault_key != null
    },
  )
}
|
||||
|
||||
# JSON list of desired internal monitors, mounted into the sync job at
# /config/targets.json. Passwords are not stored here: `password_env` only
# names the environment variable (from the Secret) that holds each one.
resource "kubernetes_config_map_v1" "internal_monitor_targets" {
  metadata {
    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
    name      = "internal-monitor-targets"
  }

  data = {
    "targets.json" = jsonencode([
      for mon in local.internal_monitors : {
        name = mon.name
        type = mon.type

        # Target definition (which fields are set depends on the type).
        database_connection_string = mon.database_connection_string
        hostname                   = mon.hostname
        port                       = mon.port
        url                        = mon.url
        accepted_statuscodes       = mon.accepted_statuscodes
        ignore_tls                 = mon.ignore_tls

        # Same key derivation as in kubernetes_secret.internal_monitor_sync.
        password_env = mon.database_password_vault_key != null ? "DB_PASSWORD_${upper(replace(mon.name, "/[^A-Za-z0-9]/", "_"))}" : null

        # Scheduling.
        interval       = mon.interval
        retry_interval = mon.retry_interval
        max_retries    = mon.max_retries
      }
    ])
  }
}
|
||||
|
||||
# Every 10 minutes, make sure each monitor listed in
# /config/targets.json exists in Uptime Kuma with the desired settings:
# looked up by name, created if missing, edited if any tracked field drifted.
resource "kubernetes_cron_job_v1" "internal_monitor_sync" {
  metadata {
    name      = "internal-monitor-sync"
    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
  }
  spec {
    concurrency_policy            = "Forbid"
    failed_jobs_history_limit     = 3
    successful_jobs_history_limit = 3
    schedule                      = "*/10 * * * *"
    job_template {
      metadata {}
      spec {
        backoff_limit              = 1
        ttl_seconds_after_finished = 300
        template {
          metadata {}
          spec {
            container {
              name  = "sync"
              image = "docker.io/library/python:3.12-alpine"
              # NOTE: the Python code below must not contain a dollar-brace or
              # percent-brace sequence; Terraform would treat it as a template
              # directive inside this heredoc.
              command = ["/bin/sh", "-c", <<-EOT
                # Abort immediately if the dependency install fails instead of
                # running the script without its library.
                set -e
                pip install --quiet --disable-pip-version-check uptime-kuma-api
                python3 << 'PYEOF'
                import json, os, time
                from uptime_kuma_api import UptimeKumaApi, MonitorType

                UPTIME_KUMA_URL = "http://uptime-kuma.uptime-kuma.svc.cluster.local"
                UPTIME_KUMA_PASS = os.environ["UPTIME_KUMA_PASSWORD"]

                with open("/config/targets.json") as f:
                    targets = json.load(f)

                api = UptimeKumaApi(UPTIME_KUMA_URL, timeout=120, wait_events=0.2)
                # try/finally: always close the connection, even when login or a
                # monitor operation raises part-way through the sync.
                try:
                    api.login("admin", UPTIME_KUMA_PASS)

                    existing = {m["name"]: m for m in api.get_monitors()}

                    for t in targets:
                        name = t["name"]
                        mtype = MonitorType(t["type"])
                        # MYSQL uses `databaseConnectionString` + `radiusPassword` (UK v2 re-uses
                        # radiusPassword for mysql auth — backwards compat). Redis has auth
                        # disabled on the cluster, so password_env is null. PORT monitors use
                        # hostname + port directly. HTTP monitors use url + accepted_statuscodes
                        # + ignoreTls (camelCase on the API; stored as `ignore_tls` in DB).
                        desired = {
                            "type": mtype,
                            "name": name,
                            "interval": t["interval"],
                            "retryInterval": t["retry_interval"],
                            "maxretries": t["max_retries"],
                        }
                        # Fields compared against the live monitor to detect drift.
                        drift_fields = ["interval", "retryInterval", "maxretries"]
                        if mtype == MonitorType.PORT:
                            desired["hostname"] = t["hostname"]
                            desired["port"] = t["port"]
                            drift_fields += ["hostname", "port"]
                        elif mtype == MonitorType.HTTP:
                            desired["url"] = t["url"]
                            desired["accepted_statuscodes"] = t["accepted_statuscodes"]
                            desired["ignoreTls"] = bool(t["ignore_tls"])
                            drift_fields += ["url", "accepted_statuscodes", "ignoreTls"]
                        else:
                            desired["databaseConnectionString"] = t["database_connection_string"]
                            drift_fields += ["databaseConnectionString"]
                            if t.get("password_env"):
                                desired["radiusPassword"] = os.environ[t["password_env"]]
                                drift_fields += ["radiusPassword"]

                        if name not in existing:
                            print(f"Creating monitor: {name}")
                            api.add_monitor(**desired)
                        else:
                            m = existing[name]
                            drifted = any(m.get(f) != desired.get(f) for f in drift_fields)
                            if drifted:
                                print(f"Updating monitor {name} (id={m['id']})")
                                edit_kwargs = {f: desired[f] for f in drift_fields if f in desired}
                                api.edit_monitor(m["id"], **edit_kwargs)
                            else:
                                print(f"Monitor {name} (id={m['id']}) already in desired state")
                        # Pace requests to the server after every monitor,
                        # including freshly created ones.
                        time.sleep(0.3)
                finally:
                    api.disconnect()

                print("Internal monitor sync complete")
                PYEOF
              EOT
              ]
              # Exposes UPTIME_KUMA_PASSWORD and the DB_PASSWORD_* keys as
              # environment variables.
              env_from {
                secret_ref {
                  name = kubernetes_secret.internal_monitor_sync.metadata[0].name
                }
              }
              volume_mount {
                name       = "config"
                mount_path = "/config"
                read_only  = true
              }
              resources {
                requests = {
                  memory = "128Mi"
                  cpu    = "10m"
                }
                limits = {
                  memory = "256Mi"
                }
              }
            }
            volume {
              name = "config"
              config_map {
                name = kubernetes_config_map_v1.internal_monitor_targets.metadata[0].name
              }
            }
            dns_config {
              option {
                name  = "ndots"
                value = "2"
              }
            }
          }
        }
      }
    }
  }
  lifecycle {
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
  }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue