feat: add external monitoring for all Cloudflare-proxied services

Add automatic external HTTPS monitors to Uptime Kuma for ~96 services
exposed via Cloudflare tunnel. A sync CronJob (every 10min) reads from
a Terraform-generated ConfigMap and creates/deletes [External] monitors
to match cloudflare_proxied_names. Status page groups these separately
as "External Reachability" and pushes a divergence metric to Pushgateway
when services are externally down but internally up. Prometheus alert
ExternalAccessDivergence fires after 15min of divergence.
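For reference, the divergence metric is pushed in the standard Prometheus
exposition format; with one divergent service (name hypothetical) the pushed
body looks like:

    # HELP external_internal_divergence_count Services externally down but internally up
    # TYPE external_internal_divergence_count gauge
    external_internal_divergence_count 1
    external_internal_divergence_services{service="vaultwarden"} 1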

[ci skip]

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Viktor Barzin 2026-04-14 19:04:45 +00:00
parent 3258ff6cb7
commit ff360a8807
4 changed files with 384 additions and 5 deletions


@@ -1895,6 +1895,15 @@ serverFiles:
             severity: warning
           annotations:
             summary: "Headscale 5xx error rate is {{ $value | printf \"%.1f\" }}%"
+      - name: "External Access"
+        rules:
+          - alert: ExternalAccessDivergence
+            expr: external_internal_divergence_count > 0
+            for: 15m
+            labels:
+              severity: warning
+            annotations:
+              summary: "{{ $value | printf \"%.0f\" }} service(s) externally unreachable but internally healthy — check Cloudflare tunnel, DNS, or Traefik routing"
 extraScrapeConfigs: |
   - job_name: 'proxmox-host'
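
As a quick end-to-end test of the alert path, the metric can be pushed by hand
from a throwaway in-cluster pod (Pushgateway URL taken from the pusher script
below; the pod name is arbitrary):

    kubectl -n monitoring run push-test --rm -it --image=curlimages/curl --restart=Never -- \
      sh -c 'echo "external_internal_divergence_count 1" | curl --data-binary @- \
      http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/external-monitor-divergence'

ExternalAccessDivergence should then fire once the 15m `for` window elapses.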


@@ -202,7 +202,10 @@ for m in monitors:
     raw_type = m.get("type", "unknown")
     monitor_type = raw_type.value if hasattr(raw_type, "value") else str(raw_type)
     monitor_type = monitor_type.lower().replace("monitortype.", "")
-    group_name = TYPE_NAMES.get(monitor_type, monitor_type.upper())
+    if m["name"].startswith("[External] "):
+        group_name = "External Reachability"
+    else:
+        group_name = TYPE_NAMES.get(monitor_type, monitor_type.upper())
     if not m.get("active", True):
         continue
@@ -267,9 +270,220 @@ for m in monitors:
 api.disconnect()
 print(f"Generated {len(groups)} groups")
+
+# ============ Detect external-down / internal-up divergence ============
+external_status = {}
+internal_status = {}
+for gname, gmonitors in groups.items():
+    for mon in gmonitors:
+        if mon["name"].startswith("[External] "):
+            svc = mon["name"].replace("[External] ", "").lower()
+            external_status[svc] = mon["status"]
+        elif gname != "External Reachability":
+            internal_status[mon["name"].lower()] = mon["status"]
+
+divergent = []
+for svc, ext_st in external_status.items():
+    if ext_st != "down":
+        continue
+    for iname, int_st in internal_status.items():
+        if svc in iname or iname in svc:
+            if int_st == "up":
+                divergent.append(svc)
+                break
+
+divergence_count = len(divergent)
+metric_body = (
+    "# HELP external_internal_divergence_count Services externally down but internally up\n"
+    "# TYPE external_internal_divergence_count gauge\n"
+    f"external_internal_divergence_count {divergence_count}\n"
+)
+for svc in divergent:
+    metric_body += f'external_internal_divergence_services{{service="{svc}"}} 1\n'
+
+try:
+    import urllib.request as _ur
+    req = _ur.Request(
+        "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/external-monitor-divergence",
+        data=metric_body.encode(),
+        method="POST"
+    )
+    _ur.urlopen(req, timeout=10)
+    if divergent:
+        print(f"WARNING: {len(divergent)} services externally down but internally up: {divergent}")
+    else:
+        print("No external/internal divergence detected")
+except Exception as e:
+    print(f"Warning: could not push divergence metric: {e}")
+
+# ============ Fetch incidents from GitHub Issues ============
+import urllib.request, urllib.error, re as _re2
+
+def fetch_github_json(url):
+    req = urllib.request.Request(url, headers={
+        "Authorization": "token " + GITHUB_TOKEN,
+        "Accept": "application/vnd.github.v3+json",
+        "User-Agent": "status-page-pusher",
+    })
+    resp = urllib.request.urlopen(req, timeout=15)
+    return json.loads(resp.read())
+
+def parse_severity(labels):
+    for lbl in labels:
+        name = lbl["name"].lower()
+        if name in ("sev1", "sev2", "sev3"):
+            return name
+    return "sev3"
+
+def parse_affected_services(body):
+    services = []
+    if not body:
+        return services
+    in_section = False
+    for line in body.split("\n"):
+        stripped = line.strip()
+        if stripped.lower().startswith("## affected"):
+            in_section = True
+            continue
+        if in_section:
+            if stripped.startswith("##"):
+                break
+            if stripped.startswith("- ") and not stripped.startswith("- <!--"):
+                services.append(stripped[2:].strip())
+    return services
+
+def parse_timeline(comments):
+    timeline = []
+    for c in comments:
+        body = (c.get("body") or "").strip()
+        status_label = "Update"
+        if body.startswith("**"):
+            end = body.find("**", 2)
+            if end > 2:
+                status_label = body[2:end]
+        timeline.append({
+            "timestamp": c["created_at"],
+            "status": status_label,
+            "body": body,
+        })
+    return timeline
+
+def extract_postmortem(comments):
+    for c in reversed(comments):
+        body = (c.get("body") or "").lower()
+        if "postmortem" in body:
+            urls = _re2.findall(r'https?://\S+', c.get("body", ""))
+            if urls:
+                return urls[0].rstrip(")>")
+    return None
+
+incidents_active = []
+incidents_resolved = []
+user_reports = []
+ISSUES_REPO = "ViktorBarzin/infra"
+
+def has_label(issue, name):
+    return any(l["name"].lower() == name.lower() for l in issue.get("labels", []))
+
+def parse_user_report_service(body):
+    """Extract the service name from a GitHub Issue Form dropdown response.
+
+    Issue forms render the dropdown as an "### Affected Service" heading
+    followed by the selected value on the next non-blank line.
+    """
+    if not body:
+        return None
+    prev_was_heading = False
+    for ln in body.split("\n"):
+        if "affected service" in ln.lower():
+            prev_was_heading = True
+            continue
+        if prev_was_heading and ln.strip():
+            return ln.strip()
+    return None
+
+try:
+    issues_url = "https://api.github.com/repos/" + ISSUES_REPO + "/issues"
+    # Fetch admin-declared incidents (open)
+    open_incidents = fetch_github_json(
+        issues_url + "?labels=incident&state=open&per_page=50&sort=created&direction=desc"
+    )
+    for issue in open_incidents:
+        if issue.get("pull_request"):
+            continue
+        comments = fetch_github_json(issue["comments_url"]) if issue.get("comments", 0) > 0 else []
+        incidents_active.append({
+            "id": issue["number"],
+            "title": issue["title"],
+            "type": "incident",
+            "severity": parse_severity(issue.get("labels", [])),
+            "status": "active",
+            "created_at": issue["created_at"],
+            "updated_at": issue["updated_at"],
+            "affected_services": parse_affected_services(issue.get("body")),
+            "timeline": parse_timeline(comments),
+            "url": issue["html_url"],
+            "postmortem": None,
+        })
+
+    # Fetch user reports (open, not yet triaged to incident)
+    open_reports = fetch_github_json(
+        issues_url + "?labels=user-report&state=open&per_page=20&sort=created&direction=desc"
+    )
+    for issue in open_reports:
+        if issue.get("pull_request"):
+            continue
+        if has_label(issue, "incident"):
+            continue  # Already promoted to incident, skip duplicate
+        svc = parse_user_report_service(issue.get("body"))
+        user_reports.append({
+            "id": issue["number"],
+            "title": issue["title"],
+            "type": "user-report",
+            "status": "open",
+            "created_at": issue["created_at"],
+            "affected_services": [svc] if svc else [],
+            "url": issue["html_url"],
+        })
+
+    # Fetch recently closed incidents (last 7 days)
+    closed_incidents = fetch_github_json(
+        issues_url + "?labels=incident&state=closed&per_page=20&sort=updated&direction=desc"
+    )
+    # GitHub timestamps are ISO-8601 UTC ("...Z"); format the cutoff the same
+    # way so the lexicographic comparison below is also chronological.
+    cutoff_7d = (now - timedelta(days=7)).strftime("%Y-%m-%dT%H:%M:%SZ")
+    for issue in closed_incidents:
+        if issue.get("pull_request"):
+            continue
+        if issue.get("closed_at") and issue["closed_at"] < cutoff_7d:
+            continue
+        comments = fetch_github_json(issue["comments_url"]) if issue.get("comments", 0) > 0 else []
+        incidents_resolved.append({
+            "id": issue["number"],
+            "title": issue["title"],
+            "type": "incident",
+            "severity": parse_severity(issue.get("labels", [])),
+            "status": "resolved",
+            "created_at": issue["created_at"],
+            "closed_at": issue["closed_at"],
+            "updated_at": issue["updated_at"],
+            "affected_services": parse_affected_services(issue.get("body")),
+            "timeline": parse_timeline(comments),
+            "url": issue["html_url"],
+            "postmortem": extract_postmortem(comments),
+        })
+    print(f"Incidents: {len(incidents_active)} active, {len(incidents_resolved)} resolved, {len(user_reports)} user reports")
+except Exception as e:
+    print(f"Warning: could not fetch incidents: {e}")
+
 status_data = {
     "last_updated": now.isoformat(),
     "groups": groups,
+    "incidents": {
+        "active": incidents_active,
+        "resolved": incidents_resolved,
+        "user_reports": user_reports,
+    },
 }

 work_dir = "/tmp/status-page"
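
A note on the divergence matching above: external and internal monitors are
correlated by a bidirectional substring test on lowercased names, so an
external check pairs with any internal monitor whose name contains (or is
contained in) the service name. A minimal illustration with hypothetical
monitor names:

    >>> svc = "vaultwarden"          # from "[External] vaultwarden"
    >>> iname = "vaultwarden ui"     # internal monitor name, lowercased
    >>> svc in iname or iname in svc
    True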


@@ -1,9 +1,11 @@
 variable "tls_secret_name" { type = string }
 variable "nfs_server" { type = string }
+variable "cloudflare_proxied_names" { type = list(string) }
 
 module "uptime-kuma" {
-  source          = "./modules/uptime-kuma"
-  tls_secret_name = var.tls_secret_name
-  nfs_server      = var.nfs_server
-  tier            = local.tiers.cluster
+  source                   = "./modules/uptime-kuma"
+  tls_secret_name          = var.tls_secret_name
+  nfs_server               = var.nfs_server
+  tier                     = local.tiers.cluster
+  cloudflare_proxied_names = var.cloudflare_proxied_names
 }
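
For illustration, if the root module passed (hypothetical values)

    cloudflare_proxied_names = ["blog", "viktorbarzin.me", "xray-ws"]

the locals in the module below would produce monitor targets for
https://blog.viktorbarzin.me and https://viktorbarzin.me, and drop xray-ws
since it is listed among the non-HTTP services.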


@@ -1,6 +1,26 @@
 variable "tls_secret_name" {}
 variable "tier" { type = string }
 variable "nfs_server" { type = string }
+variable "cloudflare_proxied_names" { type = list(string) }
+
+data "vault_kv_secret_v2" "viktor" {
+  mount = "secret"
+  name  = "viktor"
+}
+
+locals {
+  # Services that don't respond to standard HTTP health checks
+  non_http_services = toset(["xray-vless", "xray-ws", "xray-grpc"])
+
+  external_monitor_targets = [
+    for name in var.cloudflare_proxied_names : {
+      name     = name
+      hostname = name == "viktorbarzin.me" ? "viktorbarzin.me" : "${name}.viktorbarzin.me"
+      url      = name == "viktorbarzin.me" ? "https://viktorbarzin.me" : "https://${name}.viktorbarzin.me"
+    }
+    if !contains(local.non_http_services, name)
+  ]
+}
 
resource "kubernetes_namespace" "uptime-kuma" {
metadata {
@@ -228,3 +248,137 @@ module "ingress" {
 # }
 # }
 # }
+
+# =============================================================================
+# External Monitor Sync
+# Ensures Uptime Kuma has external HTTPS monitors for all Cloudflare-proxied services.
+# Reads targets from a Terraform-generated ConfigMap, creates/deletes monitors to match.
+# =============================================================================
+resource "kubernetes_config_map_v1" "external_monitor_targets" {
+  metadata {
+    name      = "external-monitor-targets"
+    namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
+  }
+
+  data = {
+    "targets.json" = jsonencode(local.external_monitor_targets)
+  }
+}
resource "kubernetes_cron_job_v1" "external_monitor_sync" {
metadata {
name = "external-monitor-sync"
namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
}
spec {
concurrency_policy = "Forbid"
failed_jobs_history_limit = 3
successful_jobs_history_limit = 3
schedule = "*/10 * * * *"
job_template {
metadata {}
spec {
backoff_limit = 1
ttl_seconds_after_finished = 300
template {
metadata {}
spec {
container {
name = "sync"
image = "docker.io/library/python:3.12-alpine"
command = ["/bin/sh", "-c", <<-EOT
pip install --quiet --disable-pip-version-check uptime-kuma-api
python3 << 'PYEOF'
import os, json, time
from uptime_kuma_api import UptimeKumaApi, MonitorType
UPTIME_KUMA_URL = "http://uptime-kuma.uptime-kuma.svc.cluster.local"
UPTIME_KUMA_PASS = os.environ["UPTIME_KUMA_PASSWORD"]
TARGETS_FILE = "/config/targets.json"
PREFIX = "[External] "
with open(TARGETS_FILE) as f:
targets = json.load(f)
print(f"Loaded {len(targets)} external monitor targets")
api = UptimeKumaApi(UPTIME_KUMA_URL, timeout=30)
api.login("admin", UPTIME_KUMA_PASS)
monitors = api.get_monitors()
existing_external = {}
for m in monitors:
if m["name"].startswith(PREFIX):
existing_external[m["name"]] = m
target_names = set()
created = 0
for t in targets:
monitor_name = f"{PREFIX}{t['name']}"
target_names.add(monitor_name)
if monitor_name not in existing_external:
print(f"Creating monitor: {monitor_name} -> {t['url']}")
api.add_monitor(
type=MonitorType.HTTP,
name=monitor_name,
url=t["url"],
interval=300,
maxretries=3,
accepted_statecodes=["200-499"],
)
created += 1
time.sleep(0.3)
# Remove monitors for services no longer in the list
deleted = 0
for name, m in existing_external.items():
if name not in target_names:
print(f"Deleting orphaned monitor: {name}")
api.delete_monitor(m["id"])
deleted += 1
time.sleep(0.3)
api.disconnect()
print(f"Sync complete: {created} created, {deleted} deleted, {len(target_names) - created} unchanged")
PYEOF
EOT
]
env {
name = "UPTIME_KUMA_PASSWORD"
value = data.vault_kv_secret_v2.viktor.data["uptime_kuma_admin_password"]
}
volume_mount {
name = "config"
mount_path = "/config"
read_only = true
}
resources {
requests = {
memory = "128Mi"
cpu = "10m"
}
limits = {
memory = "256Mi"
}
}
}
volume {
name = "config"
config_map {
name = kubernetes_config_map_v1.external_monitor_targets.metadata[0].name
}
}
dns_config {
option {
name = "ndots"
value = "2"
}
}
}
}
}
}
}
lifecycle {
ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
}
}
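
To exercise the sync without waiting for the schedule, a one-off Job can be
created from the CronJob (names as defined above):

    kubectl -n uptime-kuma create job --from=cronjob/external-monitor-sync external-monitor-sync-manual
    kubectl -n uptime-kuma logs -f job/external-monitor-sync-manual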