diff --git a/stacks/status-page/main.tf b/stacks/status-page/main.tf index 6c943d6c..8fae372c 100644 --- a/stacks/status-page/main.tf +++ b/stacks/status-page/main.tf @@ -68,549 +68,554 @@ resource "kubernetes_cluster_role_binding_v1" "ingress_reader" { } # ============================================================================= -# Status Page Pusher -# Reads Uptime Kuma monitors, generates status.json, pushes to GitHub Pages +# Status Page Pusher ── DISABLED 2026-05-26 +# Reads Uptime Kuma monitors, generates status.json, pushes to GitHub Pages. +# +# Disabled because per-invocation `apk add git` + `pip install uptime-kuma-api` +# was hammering the Proxmox sdc thin pool (~3.2 MB/s of the ~8 MB/s sustained +# host-side, ~804 GB written over 18 h). Re-enable with a custom image that +# bakes git + uptime-kuma-api so cold-install is gone. # ============================================================================= -resource "kubernetes_cron_job_v1" "status_page_pusher" { - metadata { - name = "status-page-pusher" - namespace = kubernetes_namespace_v1.status_page.metadata[0].name - } - spec { - concurrency_policy = "Forbid" - failed_jobs_history_limit = 3 - successful_jobs_history_limit = 3 - schedule = "*/5 * * * *" - job_template { - metadata {} - spec { - backoff_limit = 1 - ttl_seconds_after_finished = 300 - template { - metadata {} - spec { - service_account_name = kubernetes_service_account_v1.status_page.metadata[0].name - container { - name = "status-pusher" - image = "docker.io/library/python:3.12-alpine" - command = ["/bin/sh", "-c", <<-EOT - apk add --no-cache git >/dev/null 2>&1 - pip install --quiet --disable-pip-version-check uptime-kuma-api - python3 << 'PYEOF' -import os, sys, json, time, subprocess -from datetime import datetime, timezone, timedelta -from uptime_kuma_api import UptimeKumaApi - -UPTIME_KUMA_URL = "http://uptime-kuma.uptime-kuma.svc.cluster.local" -UPTIME_KUMA_PASS = os.environ["UPTIME_KUMA_PASSWORD"] -GITHUB_TOKEN = os.environ["GITHUB_TOKEN"] -REPO = "ViktorBarzin/status-page" -REPO_URL = "https://" + GITHUB_TOKEN + "@github.com/" + REPO + ".git" - -TYPE_NAMES = { - "http": "HTTP", - "port": "TCP Port", - "ping": "Ping", - "keyword": "HTTP Keyword", - "grpc-keyword": "gRPC", - "dns": "DNS", - "docker": "Docker", - "push": "Push", - "steam": "Steam", - "gamedig": "GameDig", - "mqtt": "MQTT", - "sqlserver": "SQL Server", - "postgres": "PostgreSQL", - "mysql": "MySQL", - "mongodb": "MongoDB", - "radius": "RADIUS", - "redis": "Redis", - "tailscale-ping": "Tailscale Ping", - "real-browser": "Real Browser", - "group": "Group", - "snmp": "SNMP", - "json-query": "JSON Query", -} - -def beat_status_is_up(status_val): - """Handle both enum and int status values.""" - if hasattr(status_val, "value"): - return status_val.value == 1 - return status_val == 1 - -# Build namespace -> external URL map from K8s ingresses -ingress_map = {} -try: - import ssl, urllib.request - token_path = "/var/run/secrets/kubernetes.io/serviceaccount/token" - ca_path = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" - if os.path.exists(token_path): - with open(token_path) as f: - token = f.read().strip() - ctx = ssl.create_default_context(cafile=ca_path) - k8s_host = os.environ.get("KUBERNETES_SERVICE_HOST", "kubernetes.default.svc") - k8s_port = os.environ.get("KUBERNETES_SERVICE_PORT", "443") - req = urllib.request.Request( - "https://" + k8s_host + ":" + k8s_port + "/apis/networking.k8s.io/v1/ingresses", - headers={"Authorization": "Bearer " + token} - ) - resp = urllib.request.urlopen(req, context=ctx, timeout=10) - ing_data = json.loads(resp.read()) - for item in ing_data.get("items", []): - ns = item["metadata"]["namespace"] - rules = item.get("spec", {}).get("rules", []) - if rules and rules[0].get("host"): - host = rules[0]["host"] - if ns not in ingress_map: - ingress_map[ns] = "https://" + host - print(f"Built ingress map: {len(ingress_map)} namespaces") -except Exception as e: - print(f"Warning: could not build ingress map: {e}") - -print("Connecting to Uptime Kuma...") -api = UptimeKumaApi(UPTIME_KUMA_URL, timeout=30) -api.login("admin", UPTIME_KUMA_PASS) - -monitors = api.get_monitors() -print(f"Fetched {len(monitors)} monitors") - -# Get current heartbeats for live status -heartbeats = api.get_heartbeats() - -now = datetime.now(timezone.utc) - -def calc_uptime(beat_list, hours): - cutoff = now - timedelta(hours=hours) - relevant = [] - for b in beat_list: - t = str(b["time"]) - try: - bt = datetime.fromisoformat(t.replace("Z", "+00:00")) - except (ValueError, TypeError): - continue - if bt.tzinfo is None: - bt = bt.replace(tzinfo=timezone.utc) - if bt > cutoff: - relevant.append(b) - if not relevant: - return None - up_count = sum(1 for b in relevant if beat_status_is_up(b.get("status", 0))) - return round(up_count / len(relevant) * 100, 1) - -groups = {} -for m in monitors: - raw_type = m.get("type", "unknown") - monitor_type = raw_type.value if hasattr(raw_type, "value") else str(raw_type) - monitor_type = monitor_type.lower().replace("monitortype.", "") - if m["name"].startswith("[External] "): - group_name = "External Reachability" - else: - group_name = TYPE_NAMES.get(monitor_type, monitor_type.upper()) - - if not m.get("active", True): - continue - else: - # Get latest heartbeat for current status - mid = m["id"] - mon_beats = heartbeats.get(mid, heartbeats.get(str(mid), [])) - if mon_beats: - # Flatten nested lists (API format varies by version) - flat = [] - for item in mon_beats: - if isinstance(item, list): - flat.extend(item) - elif isinstance(item, dict): - flat.append(item) - mon_beats = flat if flat else mon_beats - latest = mon_beats[-1] if mon_beats else None - if latest and isinstance(latest, dict) and beat_status_is_up(latest.get("status", 0)): - status = "up" - else: - status = "down" - else: - status = "pending" - - uptime_24h = None - uptime_7d = None - uptime_30d = None - try: - beats = api.get_monitor_beats(m["id"], 720) - if beats: - uptime_24h = calc_uptime(beats, 24) - uptime_7d = calc_uptime(beats, 168) - uptime_30d = calc_uptime(beats, 720) - except Exception as e: - print(f" Warning: could not get beats for {m['name']}: {e}") - - if group_name not in groups: - groups[group_name] = [] - - # Extract external URL for HTTP monitors - monitor_url = None - raw_url = m.get("url", "") or "" - if monitor_type == "http" and raw_url: - if ".svc.cluster.local" not in raw_url and raw_url.startswith("http"): - monitor_url = raw_url.rstrip("/") - else: - # Internal URL — derive external from namespace - import re as _re - ns_match = _re.search(r"//[^.]+\.([^.]+)\.svc\.cluster\.local", raw_url) - if ns_match: - ns = ns_match.group(1) - if ns in ingress_map: - monitor_url = ingress_map[ns] - - entry = { - "name": m["name"], - "status": status, - "uptime_24h": uptime_24h, - "uptime_7d": uptime_7d, - "uptime_30d": uptime_30d, - } - if monitor_url: - entry["url"] = monitor_url - - groups[group_name].append(entry) - -api.disconnect() -print(f"Generated {len(groups)} groups") - -# ============ Detect external-down / internal-up divergence ============ -external_status = {} -internal_status = {} -for gname, gmonitors in groups.items(): - for mon in gmonitors: - if mon["name"].startswith("[External] "): - svc = mon["name"].replace("[External] ", "").lower() - external_status[svc] = mon["status"] - elif gname != "External Reachability": - internal_status[mon["name"].lower()] = mon["status"] - -divergent = [] -for svc, ext_st in external_status.items(): - if ext_st != "down": - continue - for iname, int_st in internal_status.items(): - if svc in iname or iname in svc: - if int_st == "up": - divergent.append(svc) - break - -divergence_count = len(divergent) -metric_body = ( - "# HELP external_internal_divergence_count Services externally down but internally up\n" - "# TYPE external_internal_divergence_count gauge\n" - f"external_internal_divergence_count {divergence_count}\n" -) -for svc in divergent: - metric_body += f'external_internal_divergence_services{{service="{svc}"}} 1\n' - -try: - import urllib.request as _ur - req = _ur.Request( - "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/external-monitor-divergence", - data=metric_body.encode(), - method="POST" - ) - _ur.urlopen(req, timeout=10) - if divergent: - print(f"WARNING: {len(divergent)} services externally down but internally up: {divergent}") - else: - print("No external/internal divergence detected") -except Exception as e: - print(f"Warning: could not push divergence metric: {e}") - -# ============ Fetch incidents from GitHub Issues ============ -import urllib.request, urllib.error, re as _re2 - -def fetch_github_json(url): - req = urllib.request.Request(url, headers={ - "Authorization": "token " + GITHUB_TOKEN, - "Accept": "application/vnd.github.v3+json", - "User-Agent": "status-page-pusher", - }) - resp = urllib.request.urlopen(req, timeout=15) - return json.loads(resp.read()) - -def parse_severity(labels): - for lbl in labels: - name = lbl["name"].lower() - if name in ("sev1", "sev2", "sev3"): - return name - return "sev3" - -def parse_affected_services(body): - services = [] - if not body: - return services - in_section = False - for line in body.split("\n"): - stripped = line.strip() - if stripped.lower().startswith("## affected"): - in_section = True - continue - if in_section: - if stripped.startswith("##"): - break - if stripped.startswith("- ") and not stripped.startswith("-