From c5e4b1ea71536bdaee2195a355459678323f5870 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Mon, 1 Jun 2026 19:38:07 +0000
Subject: [PATCH] kms: add /diag anonymous telemetry collector behind Anubis carve-out
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PowerShell activation scripts POST small JSON diagnostics to /diag so
script execution errors are captured. The collector (python:3.12-alpine,
ConfigMap-mounted) prints each event to stdout as a KMSDIAG line; the
cluster's Loki scrapes pod stdout, making events searchable in Grafana
(Loki only — no Slack, no Prometheus).

The collector serves one thread per connection with a 10s socket timeout,
so a stalled client cannot block /healthz, which the deployment uses for
its readiness and liveness probes.

Like /scripts, /diag needs a second ingress_factory carve-out with
full_host="kms.viktorbarzin.me" so it bypasses the Anubis PoW challenge
that PowerShell/curl can't solve. Without full_host the factory would
derive kms-diag.viktorbarzin.me and the carve-out would never match.

Co-Authored-By: Claude Opus 4.7
---
 stacks/kms/files/diag-collector.py |  76 ++++++++++++++
 stacks/kms/main.tf                 | 136 +++++++++++++++++++++++++
 2 files changed, 212 insertions(+)
 create mode 100644 stacks/kms/files/diag-collector.py

diff --git a/stacks/kms/files/diag-collector.py b/stacks/kms/files/diag-collector.py
new file mode 100644
index 00000000..657b8ed3
--- /dev/null
+++ b/stacks/kms/files/diag-collector.py
@@ -0,0 +1,76 @@
+"""Anonymous diagnostics collector for the KMS activation scripts.
+
+Accepts small JSON POSTs on /diag and prints each one to stdout as a single
+line: "KMSDIAG " followed by compact JSON, or "KMSDIAG_ERR " followed by the
+exception repr when the request cannot be processed. stdout is the only sink.
+"""
+import json
+import threading
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from urllib.parse import urlsplit
+
+MAX = 16384  # cap on accepted body size (bytes) and on the logged JSON (chars)
+SOCKET_TIMEOUT = 10  # seconds a connection may stall before it is dropped
+
+_LOG_LOCK = threading.Lock()
+
+
+def _emit(line):
+    """Print one log line; the lock keeps handler threads from interleaving."""
+    with _LOG_LOCK:
+        print(line, flush=True)
+
+
+class Handler(BaseHTTPRequestHandler):
+    """POST /diag logs one event; GET /healthz or /diag answers 200."""
+
+    # Applied to every accepted socket by StreamRequestHandler.setup(). Without
+    # it a client that declares more bytes than it sends blocks rfile.read().
+    timeout = SOCKET_TIMEOUT
+
+    def _route(self):
+        """Return the request path without query string or trailing slashes."""
+        return urlsplit(self.path).path.rstrip('/')
+
+    def _respond(self, code):
+        """Send a response with status *code* and an empty body."""
+        self.send_response(code)
+        self.send_header('Content-Length', '0')
+        self.end_headers()
+
+    def do_POST(self):
+        """Log the posted JSON as one KMSDIAG line; /diag always answers 204."""
+        if self._route() != '/diag':
+            self._respond(404)
+            return
+        try:
+            n = int(self.headers.get('Content-Length') or 0)
+            if n > MAX:
+                # Reading just the first MAX bytes would only surface as a
+                # misleading JSON parse error; report the real cause instead.
+                raise ValueError(f'body too large: {n} > {MAX} bytes')
+            raw = self.rfile.read(n).decode('utf-8', 'replace') if n > 0 else ''
+            obj = json.loads(raw) if raw.strip() else {}
+            if not isinstance(obj, dict):
+                obj = {'_raw': str(obj)[:1000]}
+            # First X-Forwarded-For hop, else the socket peer.
+            # NOTE(review): the header is client-supplied unless the ingress
+            # overwrites it; confirm before treating _ip as trustworthy.
+            fwd = self.headers.get('X-Forwarded-For', self.client_address[0])
+            obj['_ip'] = fwd.split(',')[0].strip()
+            _emit('KMSDIAG ' + json.dumps(obj, separators=(',', ':'))[:MAX])
+        except Exception as e:  # best effort: never fail the caller
+            _emit('KMSDIAG_ERR ' + repr(e)[:500])
+        self._respond(204)
+
+    def do_GET(self):
+        """Answer 200 on /healthz and /diag, 404 on anything else."""
+        self._respond(200 if self._route() in ('/healthz', '/diag') else 404)
+
+    def log_message(self, *a):
+        """Drop http.server's per-request log lines."""
+
+
+if __name__ == '__main__':
+    # One thread per connection, so a slow client cannot starve /healthz.
+    ThreadingHTTPServer(('0.0.0.0', 9102), Handler).serve_forever()
diff --git a/stacks/kms/main.tf b/stacks/kms/main.tf
index 978649b1..fd2db35f 100644
--- a/stacks/kms/main.tf
+++ b/stacks/kms/main.tf
@@ -156,6 +156,142 @@ module "ingress_scripts" {
   anti_ai_scraping = false # Static scripts + key list; nothing for scrapers to mine.
 }
 
+# Anonymous diagnostics collector for the PowerShell activation scripts. The
+# activators POST a tiny JSON blob (action/outcome/error) to /diag so script
+# failures are captured. The collector prints each event to stdout, which Loki
+# scrapes — making them searchable in Grafana. Loki only: no Slack, no
+# Prometheus. Like /scripts, /diag must bypass Anubis: PowerShell/curl can't
+# solve the PoW challenge, so the carve-out below points at the bare collector.
+resource "kubernetes_config_map" "kms_diag_collector" {
+  metadata {
+    name      = "kms-diag-collector"
+    namespace = kubernetes_namespace.kms.metadata[0].name
+  }
+  data = {
+    "diag-collector.py" = file("${path.module}/files/diag-collector.py")
+  }
+}
+
+resource "kubernetes_deployment" "kms_diag" {
+  metadata {
+    name      = "kms-diag"
+    namespace = kubernetes_namespace.kms.metadata[0].name
+    labels = {
+      app  = "kms-diag"
+      tier = local.tiers.aux
+    }
+  }
+  spec {
+    replicas = 1
+    selector {
+      match_labels = {
+        app = "kms-diag"
+      }
+    }
+    template {
+      metadata {
+        labels = {
+          app = "kms-diag"
+        }
+        annotations = {
+          # Reload pods when the collector script changes
+          "checksum/collector" = sha1(file("${path.module}/files/diag-collector.py"))
+        }
+      }
+      spec {
+        volume {
+          name = "diag-collector-script"
+          config_map {
+            name = kubernetes_config_map.kms_diag_collector.metadata[0].name
+          }
+        }
+        container {
+          image   = "python:3.12-alpine"
+          name    = "diag-collector"
+          command = ["python3", "/app/diag-collector.py"]
+          resources {
+            limits = {
+              memory = "64Mi"
+            }
+            requests = {
+              cpu    = "5m"
+              memory = "48Mi"
+            }
+          }
+          port {
+            container_port = 9102
+          }
+          # The collector answers GET /healthz with 200; probe it so a wedged
+          # pod is taken out of rotation and restarted.
+          readiness_probe {
+            http_get {
+              path = "/healthz"
+              port = 9102
+            }
+            period_seconds = 10
+          }
+          liveness_probe {
+            http_get {
+              path = "/healthz"
+              port = 9102
+            }
+            initial_delay_seconds = 5
+            period_seconds        = 30
+          }
+          volume_mount {
+            name       = "diag-collector-script"
+            mount_path = "/app"
+            read_only  = true
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    ignore_changes = [spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
+  }
+}
+
+resource "kubernetes_service" "kms_diag" {
+  metadata {
+    name      = "kms-diag"
+    namespace = kubernetes_namespace.kms.metadata[0].name
+    labels = {
+      app = "kms-diag"
+    }
+  }
+
+  spec {
+    selector = {
+      app = "kms-diag"
+    }
+    port {
+      port     = 9102
+      protocol = "TCP"
+    }
+  }
+}
+
+# Carve-out for /diag — the anonymous telemetry endpoint. Same rationale as
+# /scripts: PowerShell/curl POSTs can't solve Anubis' PoW challenge, so this
+# points at the bare kms-diag collector service. full_host MUST match the main
+# ingress host; without it the factory derives kms-diag.viktorbarzin.me and the
+# carve-out never matches (this exact bug hit the /scripts carve-out).
+module "ingress_diag" {
+  source = "../../modules/kubernetes/ingress_factory"
+  # auth = "none": public telemetry collector, no login/PoW
+  auth             = "none"
+  namespace        = kubernetes_namespace.kms.metadata[0].name
+  name             = "kms-diag"
+  service_name     = kubernetes_service.kms_diag.metadata[0].name
+  port             = "9102"
+  ingress_path     = ["/diag"]
+  full_host        = "kms.viktorbarzin.me"
+  dns_type         = "none"
+  tls_secret_name  = var.tls_secret_name
+  anti_ai_scraping = false
+}
+
 # Dedicated KMS endpoint hostname. kms.viktorbarzin.me is the *website* (Traefik
 # 10.0.20.203 internally / :443 externally) and cannot also serve raw KMS on
 # :1688, so clients pointed at kms.viktorbarzin.me:1688 from the LAN hit Traefik