feat: add external monitoring for all Cloudflare-proxied services
Add automatic external HTTPS monitors to Uptime Kuma for ~96 services exposed via Cloudflare tunnel. A sync CronJob (every 10 minutes) reads from a Terraform-generated ConfigMap and creates/deletes [External] monitors to match cloudflare_proxied_names. The status page groups these separately as "External Reachability" and pushes a divergence metric to Pushgateway when services are externally down but internally up. A Prometheus alert, ExternalAccessDivergence, fires after 15 minutes of divergence.

[ci skip]

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 3258ff6cb7
commit ff360a8807
4 changed files with 384 additions and 5 deletions
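For orientation, a minimal sketch of the flow described above, using hypothetical data: the grafana service name and statuses are illustrative, the targets.json shape and the "[External] " prefix come from the Terraform and Python changes below, and the real script matches monitor names by substring rather than exact lookup.

    # Entries shaped like the Terraform-generated targets.json (examples only).
    targets = [
        {"name": "viktorbarzin.me", "hostname": "viktorbarzin.me", "url": "https://viktorbarzin.me"},
        {"name": "grafana", "hostname": "grafana.viktorbarzin.me", "url": "https://grafana.viktorbarzin.me"},
    ]
    # The sync CronJob creates one HTTP monitor per target, named "[External] <name>".
    monitor_names = ["[External] " + t["name"] for t in targets]

    # The status-page script then compares external vs internal monitor status.
    external = {"grafana": "down"}   # [External] grafana failing from the internet
    internal = {"grafana": "up"}     # in-cluster monitor for grafana is healthy
    # Simplified divergence rule (the real script matches names by substring):
    divergent = [svc for svc, st in external.items() if st == "down" and internal.get(svc) == "up"]
    print(len(divergent))  # 1 -> pushed as external_internal_divergence_count; alert fires after 15m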
@@ -1895,6 +1895,15 @@ serverFiles:
              severity: warning
            annotations:
              summary: "Headscale 5xx error rate is {{ $value | printf \"%.1f\" }}%"
      - name: "External Access"
        rules:
          - alert: ExternalAccessDivergence
            expr: external_internal_divergence_count > 0
            for: 15m
            labels:
              severity: warning
            annotations:
              summary: "{{ $value | printf \"%.0f\" }} service(s) externally unreachable but internally healthy — check Cloudflare tunnel, DNS, or Traefik routing"

extraScrapeConfigs: |
  - job_name: 'proxmox-host'

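As a hypothetical illustration of what drives this alert: if the status-page job below finds one service, say grafana, externally down but internally up, the Pushgateway ends up exposing roughly these series (format copied from the push logic further down), and ExternalAccessDivergence fires once the count stays above zero for 15 minutes:

    external_internal_divergence_count 1
    external_internal_divergence_services{service="grafana"} 1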
@@ -202,7 +202,10 @@ for m in monitors:
    raw_type = m.get("type", "unknown")
    monitor_type = raw_type.value if hasattr(raw_type, "value") else str(raw_type)
    monitor_type = monitor_type.lower().replace("monitortype.", "")
    if m["name"].startswith("[External] "):
        group_name = "External Reachability"
    else:
        group_name = TYPE_NAMES.get(monitor_type, monitor_type.upper())

    if not m.get("active", True):
        continue

@@ -267,9 +270,220 @@ for m in monitors:
api.disconnect()
print(f"Generated {len(groups)} groups")

# ============ Detect external-down / internal-up divergence ============
external_status = {}
internal_status = {}
for gname, gmonitors in groups.items():
    for mon in gmonitors:
        if mon["name"].startswith("[External] "):
            svc = mon["name"].replace("[External] ", "").lower()
            external_status[svc] = mon["status"]
        elif gname != "External Reachability":
            internal_status[mon["name"].lower()] = mon["status"]

divergent = []
for svc, ext_st in external_status.items():
    if ext_st != "down":
        continue
    for iname, int_st in internal_status.items():
        if svc in iname or iname in svc:
            if int_st == "up":
                divergent.append(svc)
            break

divergence_count = len(divergent)
metric_body = (
    "# HELP external_internal_divergence_count Services externally down but internally up\n"
    "# TYPE external_internal_divergence_count gauge\n"
    f"external_internal_divergence_count {divergence_count}\n"
)
for svc in divergent:
    metric_body += f'external_internal_divergence_services{{service="{svc}"}} 1\n'

try:
    import urllib.request as _ur
    req = _ur.Request(
        "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/external-monitor-divergence",
        data=metric_body.encode(),
        method="POST"
    )
    _ur.urlopen(req, timeout=10)
    if divergent:
        print(f"WARNING: {len(divergent)} services externally down but internally up: {divergent}")
    else:
        print("No external/internal divergence detected")
except Exception as e:
    print(f"Warning: could not push divergence metric: {e}")

# ============ Fetch incidents from GitHub Issues ============
import urllib.request, urllib.error, re as _re2

def fetch_github_json(url):
    req = urllib.request.Request(url, headers={
        "Authorization": "token " + GITHUB_TOKEN,
        "Accept": "application/vnd.github.v3+json",
        "User-Agent": "status-page-pusher",
    })
    resp = urllib.request.urlopen(req, timeout=15)
    return json.loads(resp.read())

def parse_severity(labels):
    for lbl in labels:
        name = lbl["name"].lower()
        if name in ("sev1", "sev2", "sev3"):
            return name
    return "sev3"

def parse_affected_services(body):
    services = []
    if not body:
        return services
    in_section = False
    for line in body.split("\n"):
        stripped = line.strip()
        if stripped.lower().startswith("## affected"):
            in_section = True
            continue
        if in_section:
            if stripped.startswith("##"):
                break
            if stripped.startswith("- ") and not stripped.startswith("- <!--"):
                services.append(stripped[2:].strip())
    return services

def parse_timeline(comments):
    timeline = []
    for c in comments:
        body = (c.get("body") or "").strip()
        status_label = "Update"
        if body.startswith("**"):
            end = body.find("**", 2)
            if end > 2:
                status_label = body[2:end]
        timeline.append({
            "timestamp": c["created_at"],
            "status": status_label,
            "body": body,
        })
    return timeline

def extract_postmortem(comments):
    for c in reversed(comments):
        body = (c.get("body") or "").lower()
        if "postmortem" in body:
            urls = _re2.findall(r'https?://\S+', c.get("body", ""))
            if urls:
                return urls[0].rstrip(")>")
    return None

incidents_active = []
incidents_resolved = []
user_reports = []

ISSUES_REPO = "ViktorBarzin/infra"

def has_label(issue, name):
    return any(l["name"].lower() == name.lower() for l in issue.get("labels", []))

def parse_user_report_service(body):
    """Extract service from GitHub Issue Form dropdown response."""
    if not body:
        return None
    for line in body.split("\n"):
        stripped = line.strip()
        if stripped and not stripped.startswith("#") and not stripped.startswith("_") and not stripped.startswith("<!"):
            prev_was_heading = False
            for i, ln in enumerate(body.split("\n")):
                if "affected service" in ln.lower():
                    prev_was_heading = True
                    continue
                if prev_was_heading and ln.strip():
                    return ln.strip()
    return None

try:
    issues_url = "https://api.github.com/repos/" + ISSUES_REPO + "/issues"

    # Fetch admin-declared incidents (open)
    open_incidents = fetch_github_json(
        issues_url + "?labels=incident&state=open&per_page=50&sort=created&direction=desc"
    )
    for issue in open_incidents:
        if issue.get("pull_request"):
            continue
        comments = fetch_github_json(issue["comments_url"]) if issue.get("comments", 0) > 0 else []
        incidents_active.append({
            "id": issue["number"],
            "title": issue["title"],
            "type": "incident",
            "severity": parse_severity(issue.get("labels", [])),
            "status": "active",
            "created_at": issue["created_at"],
            "updated_at": issue["updated_at"],
            "affected_services": parse_affected_services(issue.get("body")),
            "timeline": parse_timeline(comments),
            "url": issue["html_url"],
            "postmortem": None,
        })

    # Fetch user reports (open, not yet triaged to incident)
    open_reports = fetch_github_json(
        issues_url + "?labels=user-report&state=open&per_page=20&sort=created&direction=desc"
    )
    for issue in open_reports:
        if issue.get("pull_request"):
            continue
        if has_label(issue, "incident"):
            continue  # Already promoted to incident, skip duplicate
        svc = parse_user_report_service(issue.get("body"))
        user_reports.append({
            "id": issue["number"],
            "title": issue["title"],
            "type": "user-report",
            "status": "open",
            "created_at": issue["created_at"],
            "affected_services": [svc] if svc else [],
            "url": issue["html_url"],
        })

    # Fetch recently closed incidents (last 7 days)
    closed_incidents = fetch_github_json(
        issues_url + "?labels=incident&state=closed&per_page=20&sort=updated&direction=desc"
    )
    cutoff_7d = (now - timedelta(days=7)).isoformat()
    for issue in closed_incidents:
        if issue.get("pull_request"):
            continue
        if issue.get("closed_at") and issue["closed_at"] < cutoff_7d:
            continue
        comments = fetch_github_json(issue["comments_url"]) if issue.get("comments", 0) > 0 else []
        incidents_resolved.append({
            "id": issue["number"],
            "title": issue["title"],
            "type": "incident",
            "severity": parse_severity(issue.get("labels", [])),
            "status": "resolved",
            "created_at": issue["created_at"],
            "closed_at": issue["closed_at"],
            "updated_at": issue["updated_at"],
            "affected_services": parse_affected_services(issue.get("body")),
            "timeline": parse_timeline(comments),
            "url": issue["html_url"],
            "postmortem": extract_postmortem(comments),
        })

    print(f"Incidents: {len(incidents_active)} active, {len(incidents_resolved)} resolved, {len(user_reports)} user reports")
except Exception as e:
    print(f"Warning: could not fetch incidents: {e}")

status_data = {
    "last_updated": now.isoformat(),
    "groups": groups,
    "incidents": {
        "active": incidents_active,
        "resolved": incidents_resolved,
        "user_reports": user_reports,
    },
}

work_dir = "/tmp/status-page"

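To spot-check the pushed gauge from inside the cluster, a minimal sketch (assuming the same in-cluster Pushgateway address used in the push above; a Pushgateway re-exposes pushed series on its /metrics endpoint):

    import urllib.request

    # Read back what the status-page job pushed and keep only the divergence series.
    url = "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics"
    with urllib.request.urlopen(url, timeout=10) as resp:
        for line in resp.read().decode().splitlines():
            if line.startswith("external_internal_divergence"):
                print(line)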
@@ -1,9 +1,11 @@
variable "tls_secret_name" { type = string }
variable "nfs_server" { type = string }
variable "cloudflare_proxied_names" { type = list(string) }

module "uptime-kuma" {
  source                   = "./modules/uptime-kuma"
  tls_secret_name          = var.tls_secret_name
  nfs_server               = var.nfs_server
  tier                     = local.tiers.cluster
  cloudflare_proxied_names = var.cloudflare_proxied_names
}

@@ -1,6 +1,26 @@
variable "tls_secret_name" {}
variable "tier" { type = string }
variable "nfs_server" { type = string }
variable "cloudflare_proxied_names" { type = list(string) }

data "vault_kv_secret_v2" "viktor" {
  mount = "secret"
  name  = "viktor"
}

locals {
  # Services that don't respond to standard HTTP health checks
  non_http_services = toset(["xray-vless", "xray-ws", "xray-grpc"])

  external_monitor_targets = [
    for name in var.cloudflare_proxied_names : {
      name     = name
      hostname = name == "viktorbarzin.me" ? "viktorbarzin.me" : "${name}.viktorbarzin.me"
      url      = name == "viktorbarzin.me" ? "https://viktorbarzin.me" : "https://${name}.viktorbarzin.me"
    }
    if !contains(local.non_http_services, name)
  ]
}

resource "kubernetes_namespace" "uptime-kuma" {
  metadata {

@ -228,3 +248,137 @@ module "ingress" {
|
|||
# }
|
||||
# }
|
||||
# }
|
||||
|
||||
# =============================================================================
|
||||
# External Monitor Sync
|
||||
# Ensures Uptime Kuma has external HTTPS monitors for all Cloudflare-proxied services.
|
||||
# Reads targets from a Terraform-generated ConfigMap, creates/deletes monitors to match.
|
||||
# =============================================================================
|
||||
resource "kubernetes_config_map_v1" "external_monitor_targets" {
|
||||
metadata {
|
||||
name = "external-monitor-targets"
|
||||
namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
|
||||
}
|
||||
data = {
|
||||
"targets.json" = jsonencode(local.external_monitor_targets)
|
||||
}
|
||||
}
|
||||
|
||||
resource "kubernetes_cron_job_v1" "external_monitor_sync" {
|
||||
metadata {
|
||||
name = "external-monitor-sync"
|
||||
namespace = kubernetes_namespace.uptime-kuma.metadata[0].name
|
||||
}
|
||||
spec {
|
||||
concurrency_policy = "Forbid"
|
||||
failed_jobs_history_limit = 3
|
||||
successful_jobs_history_limit = 3
|
||||
schedule = "*/10 * * * *"
|
||||
job_template {
|
||||
metadata {}
|
||||
spec {
|
||||
backoff_limit = 1
|
||||
ttl_seconds_after_finished = 300
|
||||
template {
|
||||
metadata {}
|
||||
spec {
|
||||
container {
|
||||
name = "sync"
|
||||
image = "docker.io/library/python:3.12-alpine"
|
||||
command = ["/bin/sh", "-c", <<-EOT
|
||||
pip install --quiet --disable-pip-version-check uptime-kuma-api
|
||||
python3 << 'PYEOF'
|
||||
import os, json, time
|
||||
from uptime_kuma_api import UptimeKumaApi, MonitorType
|
||||
|
||||
UPTIME_KUMA_URL = "http://uptime-kuma.uptime-kuma.svc.cluster.local"
|
||||
UPTIME_KUMA_PASS = os.environ["UPTIME_KUMA_PASSWORD"]
|
||||
TARGETS_FILE = "/config/targets.json"
|
||||
PREFIX = "[External] "
|
||||
|
||||
with open(TARGETS_FILE) as f:
|
||||
targets = json.load(f)
|
||||
|
||||
print(f"Loaded {len(targets)} external monitor targets")
|
||||
|
||||
api = UptimeKumaApi(UPTIME_KUMA_URL, timeout=30)
|
||||
api.login("admin", UPTIME_KUMA_PASS)
|
||||
|
||||
monitors = api.get_monitors()
|
||||
existing_external = {}
|
||||
for m in monitors:
|
||||
if m["name"].startswith(PREFIX):
|
||||
existing_external[m["name"]] = m
|
||||
|
||||
target_names = set()
|
||||
created = 0
|
||||
for t in targets:
|
||||
monitor_name = f"{PREFIX}{t['name']}"
|
||||
target_names.add(monitor_name)
|
||||
if monitor_name not in existing_external:
|
||||
print(f"Creating monitor: {monitor_name} -> {t['url']}")
|
||||
api.add_monitor(
|
||||
type=MonitorType.HTTP,
|
||||
name=monitor_name,
|
||||
url=t["url"],
|
||||
interval=300,
|
||||
maxretries=3,
|
||||
accepted_statecodes=["200-499"],
|
||||
)
|
||||
created += 1
|
||||
time.sleep(0.3)
|
||||
|
||||
# Remove monitors for services no longer in the list
|
||||
deleted = 0
|
||||
for name, m in existing_external.items():
|
||||
if name not in target_names:
|
||||
print(f"Deleting orphaned monitor: {name}")
|
||||
api.delete_monitor(m["id"])
|
||||
deleted += 1
|
||||
time.sleep(0.3)
|
||||
|
||||
api.disconnect()
|
||||
print(f"Sync complete: {created} created, {deleted} deleted, {len(target_names) - created} unchanged")
|
||||
PYEOF
|
||||
EOT
|
||||
]
|
||||
env {
|
||||
name = "UPTIME_KUMA_PASSWORD"
|
||||
value = data.vault_kv_secret_v2.viktor.data["uptime_kuma_admin_password"]
|
||||
}
|
||||
volume_mount {
|
||||
name = "config"
|
||||
mount_path = "/config"
|
||||
read_only = true
|
||||
}
|
||||
resources {
|
||||
requests = {
|
||||
memory = "128Mi"
|
||||
cpu = "10m"
|
||||
}
|
||||
limits = {
|
||||
memory = "256Mi"
|
||||
}
|
||||
}
|
||||
}
|
||||
volume {
|
||||
name = "config"
|
||||
config_map {
|
||||
name = kubernetes_config_map_v1.external_monitor_targets.metadata[0].name
|
||||
}
|
||||
}
|
||||
dns_config {
|
||||
option {
|
||||
name = "ndots"
|
||||
value = "2"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
lifecycle {
|
||||
ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
|
||||
}
|
||||
}
|
||||
|
|
|
|||