kms: add /diag anonymous telemetry collector behind Anubis carve-out
The PowerShell activation scripts POST small JSON diagnostics to /diag so script execution errors are captured. The collector (python:3.12-alpine, ConfigMap-mounted) prints each event to stdout as a KMSDIAG line; the cluster's Loki scrapes pod stdout, making events searchable in Grafana (Loki only — no Slack, no Prometheus). Like /scripts, /diag needs a second ingress_factory carve-out with full_host="kms.viktorbarzin.me" so it bypasses the Anubis PoW challenge that PowerShell/curl can't solve. Without full_host the factory would derive kms-diag.viktorbarzin.me and the carve-out would never match. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
3fa9e2409c
commit
c5e4b1ea71
2 changed files with 152 additions and 0 deletions
33
stacks/kms/files/diag-collector.py
Normal file
33
stacks/kms/files/diag-collector.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
import json
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
|
||||
# Upper bound, in bytes/characters, on both the request body that is read and
# the serialized KMSDIAG line that is printed.
MAX = 16384


class Handler(BaseHTTPRequestHandler):
    """HTTP handler for the anonymous /diag telemetry collector.

    ``POST /diag`` accepts a small JSON body and prints it to stdout as one
    ``KMSDIAG {...}`` line. ``GET /healthz`` and ``GET /diag`` answer 200.
    Every response carries an empty body.
    """

    # Per-connection socket timeout in seconds; StreamRequestHandler.setup()
    # applies it to the accepted socket. HTTPServer serves one request at a
    # time, so without a timeout a client that declares a Content-Length and
    # then stalls would block every later request indefinitely.
    timeout = 10

    def _respond(self, code):
        """Send a body-less response with HTTP status *code*."""
        self.send_response(code)
        self.send_header('Content-Length', '0')
        self.end_headers()

    def _client_ip(self):
        """Return the first X-Forwarded-For hop, else the socket peer address.

        A present-but-empty header (or an empty first hop) falls back to the
        peer address instead of yielding an empty string.
        """
        forwarded = self.headers.get('X-Forwarded-For') or ''
        first_hop = forwarded.split(',')[0].strip()
        return first_hop or self.client_address[0]

    def do_POST(self):
        """Log one diagnostics event; always answer 204 on the /diag path.

        Any failure while reading or parsing (bad Content-Length, invalid
        JSON, socket timeout) is printed as a ``KMSDIAG_ERR`` line rather than
        surfaced to the client, so telemetry never breaks the caller.
        """
        if self.path.rstrip('/') != '/diag':
            self._respond(404)
            return
        try:
            n = int(self.headers.get('Content-Length', 0) or 0)
            # Never read more than MAX bytes, and treat negatives as "no body".
            n = min(n, MAX) if n > 0 else 0
            raw = self.rfile.read(n).decode('utf-8', 'replace') if n else ''
            obj = json.loads(raw) if raw.strip() else {}
            if not isinstance(obj, dict):
                # Keep non-object payloads, but bounded and under a fixed key.
                obj = {'_raw': str(obj)[:1000]}
            obj['_ip'] = self._client_ip()
            print('KMSDIAG ' + json.dumps(obj, separators=(',', ':'))[:MAX], flush=True)
        except Exception as e:
            # Deliberate best-effort boundary: record the failure and carry on.
            print('KMSDIAG_ERR ' + repr(e)[:500], flush=True)
        self._respond(204)

    def do_GET(self):
        """Answer 200 for /healthz and /diag (trailing slash ignored), else 404."""
        self._respond(200 if self.path.rstrip('/') in ('/healthz', '/diag') else 404)

    def log_message(self, *a):
        """Suppress the default per-request access log."""
        pass
|
||||
|
||||
if __name__ == '__main__':
    # Bind on every interface, port 9102, and serve until the process is killed.
    listen_address = ('0.0.0.0', 9102)
    server = HTTPServer(listen_address, Handler)
    server.serve_forever()
|
||||
|
|
@ -156,6 +156,125 @@ module "ingress_scripts" {
|
|||
anti_ai_scraping = false # Static scripts + key list; nothing for scrapers to mine.
|
||||
}
|
||||
|
||||
# Anonymous diagnostics collector for the PowerShell activation scripts. The
|
||||
# activators POST a tiny JSON blob (action/outcome/error) to /diag so script
|
||||
# failures are captured. The collector prints each event to stdout, which Loki
|
||||
# scrapes — making them searchable in Grafana. Loki only: no Slack, no
|
||||
# Prometheus. Like /scripts, /diag must bypass Anubis: PowerShell/curl can't
|
||||
# solve the PoW challenge, so the carve-out below points at the bare collector.
|
||||
resource "kubernetes_config_map" "kms_diag_collector" {
  # Carries the collector script into the cluster as a single-key ConfigMap;
  # the key name is the file name the script gets when the map is mounted.
  metadata {
    namespace = kubernetes_namespace.kms.metadata[0].name
    name      = "kms-diag-collector"
  }

  data = {
    "diag-collector.py" = file("${path.module}/files/diag-collector.py")
  }
}
|
||||
|
||||
# Single-replica Deployment running the diagnostics collector script on a stock
# python:3.12-alpine image. The script is mounted read-only from the
# kms-diag-collector ConfigMap at /app.
resource "kubernetes_deployment" "kms_diag" {
  metadata {
    name      = "kms-diag"
    namespace = kubernetes_namespace.kms.metadata[0].name
    labels = {
      app  = "kms-diag"
      tier = local.tiers.aux
    }
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "kms-diag"
      }
    }

    template {
      metadata {
        labels = {
          app = "kms-diag"
        }
        annotations = {
          # Reload pods when the collector script changes
          "checksum/collector" = sha1(file("${path.module}/files/diag-collector.py"))
        }
      }

      spec {
        volume {
          name = "diag-collector-script"
          config_map {
            name = kubernetes_config_map.kms_diag_collector.metadata[0].name
          }
        }

        container {
          image   = "python:3.12-alpine"
          name    = "diag-collector"
          command = ["python3", "/app/diag-collector.py"]

          resources {
            limits = {
              memory = "64Mi"
            }
            requests = {
              cpu    = "5m"
              memory = "48Mi"
            }
          }

          port {
            container_port = 9102
          }

          # The collector answers GET /healthz with 200. Keep the pod out of
          # the Service until it is listening, and restart it if the
          # single-threaded server stops answering.
          readiness_probe {
            http_get {
              path = "/healthz"
              port = 9102
            }
            period_seconds  = 10
            timeout_seconds = 3
          }

          liveness_probe {
            http_get {
              path = "/healthz"
              port = 9102
            }
            initial_delay_seconds = 5
            period_seconds        = 30
            timeout_seconds       = 5
            failure_threshold     = 3
          }

          volume_mount {
            name       = "diag-collector-script"
            mount_path = "/app"
            read_only  = true
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
  }
}
|
||||
|
||||
# Service in front of the pods labelled app=kms-diag, exposing TCP port 9102.
resource "kubernetes_service" "kms_diag" {
  metadata {
    namespace = kubernetes_namespace.kms.metadata[0].name
    name      = "kms-diag"
    labels = {
      app = "kms-diag"
    }
  }

  spec {
    port {
      protocol = "TCP"
      port     = "9102"
    }

    selector = {
      app = "kms-diag"
    }
  }
}
|
||||
|
||||
# Carve-out for /diag — the anonymous telemetry endpoint. Same rationale as
|
||||
# /scripts: PowerShell/curl POSTs can't solve Anubis' PoW challenge, so this
|
||||
# points at the bare kms-diag collector service. full_host MUST match the main
|
||||
# ingress host; without it the factory derives kms-diag.viktorbarzin.me and the
|
||||
# carve-out never matches (this exact bug hit the /scripts carve-out).
|
||||
module "ingress_diag" {
  source = "../../modules/kubernetes/ingress_factory"

  # Identity of the ingress and the backend it routes to.
  name         = "kms-diag"
  namespace    = kubernetes_namespace.kms.metadata[0].name
  service_name = kubernetes_service.kms_diag.metadata[0].name
  port         = "9102"

  # Routing: only the /diag path, on the explicitly pinned host name.
  full_host       = "kms.viktorbarzin.me"
  ingress_path    = ["/diag"]
  dns_type        = "none"
  tls_secret_name = var.tls_secret_name

  # auth = "none": public telemetry collector, no login/PoW
  auth             = "none"
  anti_ai_scraping = false
}
|
||||
|
||||
# Dedicated KMS endpoint hostname. kms.viktorbarzin.me is the *website* (Traefik
|
||||
# 10.0.20.203 internally / :443 externally) and cannot also serve raw KMS on
|
||||
# :1688, so clients pointed at kms.viktorbarzin.me:1688 from the LAN hit Traefik
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue