diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index a5f9e5ab..7630a821 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -1895,6 +1895,15 @@ serverFiles: severity: warning annotations: summary: "Headscale 5xx error rate is {{ $value | printf \"%.1f\" }}%" + - name: "External Access" + rules: + - alert: ExternalAccessDivergence + expr: external_internal_divergence_count > 0 + for: 15m + labels: + severity: warning + annotations: + summary: "{{ $value | printf \"%.0f\" }} service(s) externally unreachable but internally healthy — check Cloudflare tunnel, DNS, or Traefik routing" extraScrapeConfigs: | - job_name: 'proxmox-host' diff --git a/stacks/status-page/main.tf b/stacks/status-page/main.tf index 33b6f7fc..246db52a 100644 --- a/stacks/status-page/main.tf +++ b/stacks/status-page/main.tf @@ -202,7 +202,10 @@ for m in monitors: raw_type = m.get("type", "unknown") monitor_type = raw_type.value if hasattr(raw_type, "value") else str(raw_type) monitor_type = monitor_type.lower().replace("monitortype.", "") - group_name = TYPE_NAMES.get(monitor_type, monitor_type.upper()) + if m["name"].startswith("[External] "): + group_name = "External Reachability" + else: + group_name = TYPE_NAMES.get(monitor_type, monitor_type.upper()) if not m.get("active", True): continue @@ -267,9 +270,220 @@ for m in monitors: api.disconnect() print(f"Generated {len(groups)} groups") +# ============ Detect external-down / internal-up divergence ============ +external_status = {} +internal_status = {} +for gname, gmonitors in groups.items(): + for mon in gmonitors: + if mon["name"].startswith("[External] "): + svc = mon["name"].replace("[External] ", "").lower() + external_status[svc] = mon["status"] + elif gname != "External Reachability": + internal_status[mon["name"].lower()] = mon["status"] + 
+divergent = []  # external service names that are "down" outside while a name-matching internal monitor is "up"
+for svc, ext_st in external_status.items():
+    if ext_st != "down":
+        continue
+    for iname, int_st in internal_status.items():
+        if svc in iname or iname in svc:  # NOTE(review): two-way substring match; short names may pair unrelated monitors - confirm naming scheme
+            if int_st == "up":
+                divergent.append(svc)
+            break  # the first name-matching internal monitor decides; later matches are ignored
+
+divergence_count = len(divergent)
+metric_body = (  # Prometheus text exposition body sent to the Pushgateway
+    "# HELP external_internal_divergence_count Services externally down but internally up\n"
+    "# TYPE external_internal_divergence_count gauge\n"
+    f"external_internal_divergence_count {divergence_count}\n"
+)
+for svc in divergent:
+    metric_body += f'external_internal_divergence_services{{service="{svc}"}} 1\n'  # one series per divergent service
+
+try:
+    import urllib.request as _ur
+    req = _ur.Request(
+        "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/external-monitor-divergence",
+        data=metric_body.encode(),
+        method="PUT"  # PUT replaces the whole group, so per-service series from an earlier run disappear once the list is empty; POST only replaces metric names present in this push
+    )
+    _ur.urlopen(req, timeout=10).close()  # close the response so the connection is released
+    if divergent:
+        print(f"WARNING: {len(divergent)} services externally down but internally up: {divergent}")
+    else:
+        print("No external/internal divergence detected")
+except Exception as e:  # best-effort: a failed push is only logged and the script continues
+    print(f"Warning: could not push divergence metric: {e}")
+
+# ============ Fetch incidents from GitHub Issues ============
+import urllib.request, urllib.error, re as _re2
+
+def fetch_github_json(url):  # GET url with the GitHub token header and return the decoded JSON payload
+    req = urllib.request.Request(url, headers={
+        "Authorization": "token " + GITHUB_TOKEN,
+        "Accept": "application/vnd.github.v3+json",
+        "User-Agent": "status-page-pusher",
+    })
+    with urllib.request.urlopen(req, timeout=15) as resp:  # closes the HTTP response even if read/parse raises
+        return json.loads(resp.read())
+
+def parse_severity(labels):  # first label named sev1/sev2/sev3 (case-insensitive) wins; falls back to "sev3"
+    for lbl in labels:
+        name = lbl["name"].lower()
+        if name in ("sev1", "sev2", "sev3"):
+            return name
+    return "sev3"
+
+def parse_affected_services(body):
+    services = []
+    if not body:
+        return services
+    in_section = False
+    for line in body.split("\n"):
+        stripped = line.strip()
+        if stripped.lower().startswith("## affected"):
+            in_section = True
+            continue
+        if in_section:
+            if stripped.startswith("##"):
+                break
+            if stripped.startswith("- ") and not
stripped.startswith("-