kms: add /diag anonymous telemetry collector behind Anubis carve-out

The PowerShell activation scripts POST small JSON diagnostics to
/diag so script execution errors are captured. The collector
(python:3.12-alpine, ConfigMap-mounted) prints each event to stdout
as a KMSDIAG line; the cluster's Loki scrapes pod stdout, making
events searchable in Grafana (Loki only — no Slack, no Prometheus).

Like /scripts, /diag needs a second ingress_factory carve-out with
full_host="kms.viktorbarzin.me" so it bypasses the Anubis PoW
challenge that PowerShell/curl can't solve. Without full_host the
factory would derive kms-diag.viktorbarzin.me and the carve-out
would never match.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-01 19:38:07 +00:00
parent 3fa9e2409c
commit c5e4b1ea71
2 changed files with 152 additions and 0 deletions

View file

@ -0,0 +1,33 @@
import json
from http.server import BaseHTTPRequestHandler, HTTPServer
# Cap applied twice: to the number of request-body bytes read, and to the
# number of characters of serialized JSON written per KMSDIAG log line.
MAX = 16384


class Handler(BaseHTTPRequestHandler):
    """HTTP handler for the anonymous /diag telemetry collector.

    POST /diag
        Parses the body as JSON, stamps the caller's IP into ``_ip`` and
        prints one ``KMSDIAG <json>`` line to stdout. Any failure while
        reading or parsing is printed as ``KMSDIAG_ERR <repr>`` instead.
        The response is always 204 so a reporting client never sees an
        error from the collector itself.
    GET /healthz, GET /diag
        Answers 200 with an empty body; every other path answers 404.
    """

    # Per-connection socket timeout in seconds, applied by
    # StreamRequestHandler.setup(). Without it, a client that announces a
    # Content-Length and then never sends the body would block rfile.read()
    # indefinitely; on a single-threaded server that stalls every other
    # request, health checks included.
    timeout = 10

    def _respond(self, code):
        """Send a body-less response with status ``code``."""
        self.send_response(code)
        self.send_header('Content-Length', '0')
        self.end_headers()

    def do_POST(self):
        """Log one diagnostics event posted to /diag; always reply 204."""
        if self.path.rstrip('/') != '/diag':
            self._respond(404)
            return
        try:
            # A missing, empty or non-positive Content-Length means "no body".
            n = int(self.headers.get('Content-Length', 0) or 0)
            n = min(n, MAX) if n > 0 else 0
            raw = self.rfile.read(n).decode('utf-8', 'replace') if n else ''
            obj = json.loads(raw) if raw.strip() else {}
            if not isinstance(obj, dict):
                # Valid JSON that is not an object: keep a bounded rendering
                # so the event is still visible in the logs.
                obj = {'_raw': str(obj)[:1000]}
            # First X-Forwarded-For entry if the header is present, else the
            # TCP peer address.
            # NOTE(review): the header value is taken as-is; confirm the
            # fronting proxy sets/overwrites it if _ip is ever relied upon.
            ip = self.headers.get('X-Forwarded-For', self.client_address[0]).split(',')[0].strip()
            # Assigned after parsing so a client-supplied "_ip" is overwritten.
            obj['_ip'] = ip
            print('KMSDIAG ' + json.dumps(obj, separators=(',', ':'))[:MAX], flush=True)
        except Exception as e:
            # Best-effort collector: record the failure (socket timeouts
            # included) and still acknowledge the request below.
            print('KMSDIAG_ERR ' + repr(e)[:500], flush=True)
        self._respond(204)

    def do_GET(self):
        """Reply 200 for /healthz and /diag, 404 for anything else."""
        self._respond(200 if self.path.rstrip('/') in ('/healthz', '/diag') else 404)

    def log_message(self, *a):
        """Suppress the default per-request access log written to stderr."""
        pass
if __name__ == '__main__':
    # Listen on every interface, port 9102, and serve requests until the
    # process is terminated.
    listen_address = ('0.0.0.0', 9102)
    server = HTTPServer(listen_address, Handler)
    server.serve_forever()

View file

@ -156,6 +156,125 @@ module "ingress_scripts" {
anti_ai_scraping = false # Static scripts + key list; nothing for scrapers to mine.
}
# Anonymous diagnostics collector for the PowerShell activation scripts. The
# activators POST a tiny JSON blob (action/outcome/error) to /diag so script
# failures are captured. The collector prints each event to stdout, which Loki
# scrapes, making the events searchable in Grafana. Loki only: no Slack, no
# Prometheus. Like /scripts, /diag must bypass Anubis: PowerShell/curl can't
# solve the PoW challenge, so the carve-out below points at the bare collector.
# Carries the collector script into the cluster. The single key below is the
# file name the script appears under when this ConfigMap is mounted as a
# volume.
resource "kubernetes_config_map" "kms_diag_collector" {
  metadata {
    namespace = kubernetes_namespace.kms.metadata[0].name
    name      = "kms-diag-collector"
  }

  data = {
    "diag-collector.py" = file("${path.module}/files/diag-collector.py")
  }
}
# Runs the /diag collector: a stock python:3.12-alpine container executing the
# ConfigMap-mounted script on port 9102. One replica; the collector only
# prints events to stdout.
resource "kubernetes_deployment" "kms_diag" {
  metadata {
    name      = "kms-diag"
    namespace = kubernetes_namespace.kms.metadata[0].name
    labels = {
      app  = "kms-diag"
      tier = local.tiers.aux
    }
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "kms-diag"
      }
    }

    template {
      metadata {
        labels = {
          app = "kms-diag"
        }
        annotations = {
          # Reload pods when the collector script changes
          "checksum/collector" = sha1(file("${path.module}/files/diag-collector.py"))
        }
      }

      spec {
        volume {
          name = "diag-collector-script"
          config_map {
            name = kubernetes_config_map.kms_diag_collector.metadata[0].name
          }
        }

        container {
          image   = "python:3.12-alpine"
          name    = "diag-collector"
          command = ["python3", "/app/diag-collector.py"]

          resources {
            limits = {
              memory = "64Mi"
            }
            requests = {
              cpu    = "5m"
              memory = "48Mi"
            }
          }

          port {
            container_port = 9102
          }

          # The collector answers GET /healthz with 200. Gate traffic on it so
          # requests are not routed to a pod whose server is not up yet.
          readiness_probe {
            http_get {
              path = "/healthz"
              port = 9102
            }
            period_seconds  = 10
            timeout_seconds = 3
          }

          # Restart the container if the HTTP server stops answering.
          liveness_probe {
            http_get {
              path = "/healthz"
              port = 9102
            }
            initial_delay_seconds = 10
            period_seconds        = 30
            timeout_seconds       = 5
            failure_threshold     = 3
          }

          volume_mount {
            name       = "diag-collector-script"
            mount_path = "/app"
            read_only  = true
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
  }
}
# Service in front of the kms-diag pods, exposing the collector on TCP 9102.
resource "kubernetes_service" "kms_diag" {
  metadata {
    namespace = kubernetes_namespace.kms.metadata[0].name
    name      = "kms-diag"
    labels = {
      app = "kms-diag"
    }
  }

  spec {
    # Selects the pods created by the kms-diag Deployment template.
    selector = {
      app = "kms-diag"
    }

    port {
      protocol = "TCP"
      port     = "9102"
    }
  }
}
# Carve-out for /diag, the anonymous telemetry endpoint. Same rationale as
# /scripts: PowerShell/curl POSTs can't solve Anubis' PoW challenge, so this
# points at the bare kms-diag collector service. full_host MUST match the main
# ingress host; without it the factory derives kms-diag.viktorbarzin.me and the
# carve-out never matches (this exact bug hit the /scripts carve-out).
module "ingress_diag" {
  source = "../../modules/kubernetes/ingress_factory"

  # Identity and backend: route to the kms-diag Service on 9102.
  name         = "kms-diag"
  namespace    = kubernetes_namespace.kms.metadata[0].name
  service_name = kubernetes_service.kms_diag.metadata[0].name
  port         = "9102"

  # Routing: only /diag, on the main site's host. full_host is set explicitly
  # so the rule shares the host of the main ingress.
  full_host    = "kms.viktorbarzin.me"
  ingress_path = ["/diag"]
  dns_type     = "none"

  # Public telemetry collector: no login and no PoW challenge in front of it.
  auth             = "none"
  anti_ai_scraping = false

  tls_secret_name = var.tls_secret_name
}
# Dedicated KMS endpoint hostname. kms.viktorbarzin.me is the *website* (Traefik
# 10.0.20.203 internally / :443 externally) and cannot also serve raw KMS on
# :1688, so clients pointed at kms.viktorbarzin.me:1688 from the LAN hit Traefik