kms: deploy slack-notifier sidecar with Prometheus metrics + document public exposure

The Slack notifier now also exposes /metrics on :9101 using the stdlib HTTP server: it counts activations and dedup-skips by product and exposes a gauge for the last-activation timestamp. The pod template gets the standard prometheus.io/scrape annotations so the cluster-wide kubernetes-pods job scrapes it via the pod IP. The memory request is bumped to 48Mi to cover the counter dicts and the HTTPServer. Docs: networking.md gains a footnote on the windows-kms row noting its public WAN exposure behind the rate-limited pfSense filter rule (max-src-conn 50, max-src-conn-rate 10/60, overload <virusprot> flush), and a new runbook covers log locations, rate-limit tuning, and how to revoke the WAN forward. The matching pfSense rule was tightened in place (TCP-only + rate limits) via SSH, because pfSense isn't Terraform-managed.
2026-05-09 22:12:46 +00:00 · 2026-05-09 22:12:46 +00:00 · 08edd92b22
commit 08edd92b22
parent efadeb531d
4 changed files with 456 additions and 3 deletions
--- a/stacks/kms/files/slack-notifier.py
+++ b/stacks/kms/files/slack-notifier.py
@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+"""
+Tail vlmcsd verbose log; post a Slack message per activation, and expose
+Prometheus metrics on /metrics for activation counts.
+
+vlmcsd verbose output emits a multi-line block per request:
+  <ts>: IPv4 connection accepted: <ip>:<port>.
+  <ts>: <<< Incoming KMS request
+  <ts>: Application ID    : <uuid> (<name>)
+  <ts>: Activation ID (Product): <uuid> (<product>)
+  <ts>: Workstation name  : <hostname>
+  ...
+  <ts>: IPv4 connection closed: <ip>:<port>.
+
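+We accumulate state for the connection currently being logged and emit on
+close. Only one connection's state is held at a time, so interleaved log
+blocks from concurrent connections are not told apart. Dedupes by
+(client_ip, product) within DEDUP_WINDOW_SECONDS to avoid spam from
+Windows' default 7-day re-activation cycle hitting us repeatedly.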
+
+Prometheus metrics (text format, no client_ip label — cardinality):
+  kms_activations_total{product, status}        counter
+  kms_activations_dedup_skipped_total{product}  counter
+  kms_last_activation_timestamp_seconds         gauge
+  kms_slack_notifier_up                         gauge (heartbeat)
+"""
+import json
+import os
+import re
+import sys
+import threading
+import time
+import urllib.error
+import urllib.request
+from collections import OrderedDict
+from http.server import BaseHTTPRequestHandler, HTTPServer
+
+# --- Runtime configuration, read once from the environment at import time ---
+# Path of the vlmcsd verbose log to tail.
+LOG_PATH = os.environ.get("VLMCSD_LOG", "/var/log/vlmcsd/vlmcsd.log")
+# Required: indexing os.environ raises KeyError at startup when it is unset.
+WEBHOOK = os.environ["SLACK_WEBHOOK_URL"]
+CHANNEL = os.environ.get("SLACK_CHANNEL", "#alerts")
+# Seconds during which a repeated dedup key is suppressed (see DedupCache).
+DEDUP_WINDOW = int(os.environ.get("DEDUP_WINDOW_SECONDS", "3600"))
+# Maximum number of keys DedupCache keeps before evicting its oldest entry.
+DEDUP_MAX = 4096
+METRICS_PORT = int(os.environ.get("METRICS_PORT", "9101"))
+
+# --- Matchers for individual vlmcsd log lines (all used with .search()) ---
+# OPEN_RE / CLOSE_RE: group 1 is the client address; the trailing :port is dropped.
+OPEN_RE = re.compile(r":\s*IPv[46] connection accepted:\s*([0-9a-f.:\[\]]+):\d+")
+CLOSE_RE = re.compile(r":\s*IPv[46] connection closed:\s*([0-9a-f.:\[\]]+):\d+")
+# APP_RE / PROD_RE: group 1 is the parenthesised name that follows the UUID.
+APP_RE = re.compile(r":\s*Application ID\s*:\s*[0-9a-f-]+\s*\(([^)]+)\)")
+PROD_RE = re.compile(r":\s*Activation ID \(Product\)\s*:\s*[0-9a-f-]+\s*\(([^)]+)\)")
+# HOST_RE: group 1 is the workstation name with surrounding whitespace trimmed.
+HOST_RE = re.compile(r":\s*Workstation name\s*:\s*(.+?)\s*$")
+# STATUS_RE: group 1 is the parenthesised text after the numeric licensing status.
+STATUS_RE = re.compile(r":\s*Licensing status\s*:\s*\d+\s*\((.+?)\)\s*$")
+
+# --- Metrics state, shared between the tailing loop and the HTTP handler ---
+# Every read or write of the three values below happens under _metrics_lock.
+_metrics_lock = threading.Lock()
+# (product, status) -> number of record_activation() calls.
+_activations: dict = {}
+# product -> number of record_dedup_skip() calls.
+_dedup_skipped: dict = {}
+# Unix time of the most recent record_activation() call; 0.0 until the first one.
+_last_activation_ts: float = 0.0
+
+
+def _esc(value: str) -> str:
+    return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
+
+
+def record_activation(product: str, status: str) -> None:
+    global _last_activation_ts
+    with _metrics_lock:
+        key = (product, status)
+        _activations[key] = _activations.get(key, 0) + 1
+        _last_activation_ts = time.time()
+
+
+def record_dedup_skip(product: str) -> None:
+    with _metrics_lock:
+        _dedup_skipped[product] = _dedup_skipped.get(product, 0) + 1
+
+
+def render_metrics() -> bytes:
+    out = []
+    with _metrics_lock:
+        activations = dict(_activations)
+        dedup_skipped = dict(_dedup_skipped)
+        last_ts = _last_activation_ts
+
+    out.append("# HELP kms_activations_total KMS activation events that resulted in a Slack post.")
+    out.append("# TYPE kms_activations_total counter")
+    for (product, status), count in sorted(activations.items()):
+        out.append(
+            f'kms_activations_total{{product="{_esc(product)}",status="{_esc(status)}"}} {count}'
+        )
+
+    out.append("# HELP kms_activations_dedup_skipped_total KMS activation events suppressed by dedup window.")
+    out.append("# TYPE kms_activations_dedup_skipped_total counter")
+    for product, count in sorted(dedup_skipped.items()):
+        out.append(f'kms_activations_dedup_skipped_total{{product="{_esc(product)}"}} {count}')
+
+    out.append("# HELP kms_last_activation_timestamp_seconds Unix ts of the last non-deduped activation.")
+    out.append("# TYPE kms_last_activation_timestamp_seconds gauge")
+    out.append(f"kms_last_activation_timestamp_seconds {last_ts}")
+
+    out.append("# HELP kms_slack_notifier_up 1 while the notifier process is running.")
+    out.append("# TYPE kms_slack_notifier_up gauge")
+    out.append("kms_slack_notifier_up 1")
+
+    return ("\n".join(out) + "\n").encode("utf-8")
+
+
+class MetricsHandler(BaseHTTPRequestHandler):
+    def do_GET(self):
+        if self.path == "/healthz":
+            self.send_response(200)
+            self.send_header("Content-Type", "text/plain")
+            self.end_headers()
+            self.wfile.write(b"ok\n")
+            return
+        if self.path != "/metrics":
+            self.send_response(404)
+            self.end_headers()
+            return
+        body = render_metrics()
+        self.send_response(200)
+        self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def log_message(self, *args, **kwargs):
+        pass
+
+
+def start_metrics_server() -> None:
+    server = HTTPServer(("0.0.0.0", METRICS_PORT), MetricsHandler)
+    print(f"[slack-notifier] metrics on :{METRICS_PORT}/metrics", flush=True)
+    server.serve_forever()
+
+
+def slack_post(text: str) -> None:
+    payload = json.dumps({"channel": CHANNEL, "text": text, "username": "kms.viktorbarzin.me", "icon_emoji": ":computer:"}).encode("utf-8")
+    req = urllib.request.Request(WEBHOOK, data=payload, headers={"Content-Type": "application/json"})
+    try:
+        urllib.request.urlopen(req, timeout=10).read()
+    except urllib.error.URLError as exc:
+        print(f"[slack] post failed: {exc}", file=sys.stderr)
+
+
+class DedupCache(OrderedDict):
+    def should_send(self, key: str) -> bool:
+        now = time.time()
+        while self and (now - next(iter(self.values()))) > DEDUP_WINDOW:
+            self.popitem(last=False)
+        if key in self and (now - self[key]) < DEDUP_WINDOW:
+            return False
+        if len(self) >= DEDUP_MAX:
+            self.popitem(last=False)
+        self[key] = now
+        self.move_to_end(key)
+        return True
+
+
+def follow(path: str):
+    while not os.path.exists(path):
+        time.sleep(1)
+    fh = open(path, "r", encoding="utf-8", errors="replace")
+    fh.seek(0, 2)
+    inode = os.fstat(fh.fileno()).st_ino
+    while True:
+        line = fh.readline()
+        if line:
+            yield line.rstrip("\n")
+            continue
+        time.sleep(0.5)
+        try:
+            new_inode = os.stat(path).st_ino
+            if new_inode != inode:
+                fh.close()
+                fh = open(path, "r", encoding="utf-8", errors="replace")
+                inode = new_inode
+        except FileNotFoundError:
+            time.sleep(1)
+
+
+def main() -> None:
+    threading.Thread(target=start_metrics_server, daemon=True).start()
+
+    dedup = DedupCache()
+    print(f"[slack-notifier] tailing {LOG_PATH}, posting to {CHANNEL} via Slack", flush=True)
+    state: dict = {}
+
+    for line in follow(LOG_PATH):
+        if (m := OPEN_RE.search(line)):
+            state = {"ip": m.group(1)}
+            continue
+        if not state:
+            continue
+        if (m := APP_RE.search(line)):
+            state["app"] = m.group(1)
+        elif (m := PROD_RE.search(line)):
+            state["product"] = m.group(1)
+        elif (m := HOST_RE.search(line)):
+            state["host"] = m.group(1)
+        elif (m := STATUS_RE.search(line)):
+            state["status"] = m.group(1)
+        elif CLOSE_RE.search(line):
+            ip = state.get("ip", "?")
+            product = state.get("product", state.get("app", "unknown"))
+            host = state.get("host", "?")
+            status = state.get("status", "unknown")
+            key = f"{ip}|{product}"
+            if dedup.should_send(key):
+                text = (
+                    f":computer: KMS activation\n"
+                    f"• *Client*: `{ip}`\n"
+                    f"• *Workstation*: `{host}`\n"
+                    f"• *Product*: `{product}`\n"
+                    f"• *Status before*: {status}"
+                )
+                slack_post(text)
+                record_activation(product, status)
+                print(f"[slack-notifier] sent: ip={ip} product={product!r} host={host!r}", flush=True)
+            else:
+                record_dedup_skip(product)
+                print(f"[slack-notifier] dedup-skip: ip={ip} product={product!r}", flush=True)
+            state = {}
+
+
+if __name__ == "__main__":
+    main()
--- a/stacks/kms/main.tf
+++ b/stacks/kms/main.tf
@ -119,6 +119,46 @@ module "ingress" {
  }
 }

+# ConfigMap holding the Slack notifier script. The kms deployment mounts it at
+# /scripts and runs /scripts/notifier.py in the slack-notifier container.
+resource "kubernetes_config_map" "kms_slack_notifier" {
+  metadata {
+    name      = "kms-slack-notifier"
+    namespace = kubernetes_namespace.kms.metadata[0].name
+  }
+  data = {
+    # Key is the file name inside the mount; content is read from the repo at plan time.
+    "notifier.py" = file("${path.module}/files/slack-notifier.py")
+  }
+}
+
+# ExternalSecret that produces the Kubernetes Secret "kms-slack-webhook"
+# (key "url") from property "slack_webhook_url" of key "kms" in the "vault-kv"
+# ClusterSecretStore. The slack-notifier container reads it as SLACK_WEBHOOK_URL.
+resource "kubernetes_manifest" "kms_slack_external_secret" {
+  manifest = {
+    apiVersion = "external-secrets.io/v1beta1"
+    kind       = "ExternalSecret"
+    metadata = {
+      name      = "kms-slack-webhook"
+      namespace = kubernetes_namespace.kms.metadata[0].name
+    }
+    spec = {
+      # How often the value is re-read from the store.
+      refreshInterval = "1h"
+      secretStoreRef = {
+        name = "vault-kv"
+        kind = "ClusterSecretStore"
+      }
+      # Name of the Secret to create; must match the deployment's secret_key_ref.
+      target = {
+        name           = "kms-slack-webhook"
+        creationPolicy = "Owner"
+      }
+      data = [{
+        # secretKey is the key inside the generated Secret.
+        secretKey = "url"
+        remoteRef = {
+          key      = "kms"
+          property = "slack_webhook_url"
+        }
+      }]
+    }
+  }
+  depends_on = [kubernetes_namespace.kms]
+}
+
 resource "kubernetes_deployment" "windows_kms" {
  metadata {
    name      = "kms"
@ -140,11 +180,31 @@ resource "kubernetes_deployment" "windows_kms" {
        labels = {
          app = "kms-service"
        }
+        annotations = {
+          # Reload pods when the notifier script changes
+          "checksum/notifier" = sha1(file("${path.module}/files/slack-notifier.py"))
+          # Prometheus scrape — kubernetes-pods job picks up via pod IP
+          "prometheus.io/scrape" = "true"
+          "prometheus.io/port"   = "9101"
+          "prometheus.io/path"   = "/metrics"
+        }
      }
      spec {
+        volume {
+          name = "vlmcsd-log"
+          empty_dir {}
+        }
+        volume {
+          name = "slack-notifier-script"
+          config_map {
+            name = kubernetes_config_map.kms_slack_notifier.metadata[0].name
+          }
+        }
        container {
-          image = "kebe/vlmcsd:latest"
-          name  = "windows-kms"
+          image   = "kebe/vlmcsd:latest"
+          name    = "windows-kms"
+          command = ["/usr/bin/vlmcsd"]
+          args    = ["-D", "-v", "-l", "/var/log/vlmcsd/vlmcsd.log"]
          resources {
            limits = {
              memory = "64Mi"
@ -157,6 +217,59 @@ resource "kubernetes_deployment" "windows_kms" {
          port {
            container_port = 1688
          }
+          volume_mount {
+            name       = "vlmcsd-log"
+            mount_path = "/var/log/vlmcsd"
+          }
+        }
+        container {
+          image   = "python:3.12-alpine"
+          name    = "slack-notifier"
+          command = ["python3", "-u", "/scripts/notifier.py"]
+          env {
+            name  = "VLMCSD_LOG"
+            value = "/var/log/vlmcsd/vlmcsd.log"
+          }
+          env {
+            name  = "SLACK_CHANNEL"
+            value = "#alerts"
+          }
+          env {
+            name  = "DEDUP_WINDOW_SECONDS"
+            value = "3600"
+          }
+          env {
+            name = "SLACK_WEBHOOK_URL"
+            value_from {
+              secret_key_ref {
+                name = "kms-slack-webhook"
+                key  = "url"
+              }
+            }
+          }
+          port {
+            container_port = 9101
+            name           = "metrics"
+          }
+          resources {
+            limits = {
+              memory = "64Mi"
+            }
+            requests = {
+              cpu    = "5m"
+              memory = "48Mi"
+            }
+          }
+          volume_mount {
+            name       = "vlmcsd-log"
+            mount_path = "/var/log/vlmcsd"
+            read_only  = true
+          }
+          volume_mount {
+            name       = "slack-notifier-script"
+            mount_path = "/scripts"
+            read_only  = true
+          }
        }
      }
    }
@ -165,6 +278,7 @@ resource "kubernetes_deployment" "windows_kms" {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].template[0].spec[0].dns_config]
  }
+  depends_on = [kubernetes_manifest.kms_slack_external_secret]
 }

 resource "kubernetes_service" "windows_kms" {