diff --git a/docs/architecture/networking.md b/docs/architecture/networking.md index e7959589..68834017 100644 --- a/docs/architecture/networking.md +++ b/docs/architecture/networking.md @@ -261,7 +261,7 @@ MetalLB v0.15.3 allocates IPs from the range 10.0.20.200-10.0.20.220 in **Layer | traefik | traefik | 10.0.20.200 (shared) | 80, 443, 443/UDP (HTTP/3), 10200, 10300, 11434/TCP | | coturn | coturn | 10.0.20.200 (shared) | 3478/UDP (STUN/TURN), 49152-49252/UDP (relay) | | headscale | headscale | 10.0.20.200 (shared) | 41641/UDP, 3479/UDP | -| windows-kms | kms | 10.0.20.200 (shared) | 1688/TCP | +| windows-kms¹ | kms | 10.0.20.200 (shared) | 1688/TCP | | qbittorrent | servarr | 10.0.20.200 (shared) | 50000/TCP+UDP | | shadowsocks | shadowsocks | 10.0.20.200 (shared) | 8388/TCP+UDP | | torrserver-bt | tor-proxy | 10.0.20.200 (shared) | 5665/TCP | @@ -272,6 +272,8 @@ MetalLB v0.15.3 allocates IPs from the range 10.0.20.200-10.0.20.220 in **Layer pfSense aliases reference these IPs: `k8s_shared_lb` (10.0.20.200), `technitium_dns` (10.0.20.201). NAT rules use aliases for maintainability. +¹ **windows-kms is publicly WAN-exposed.** pfSense forwards WAN TCP/1688 → `k8s_shared_lb:1688` so any internet host can activate. The matching filter rule applies a per-source rate limit (`max-src-conn 50`, `max-src-conn-rate 10/60`) with `overload <virusprot>` flush — offenders are auto-added to pfSense's stock `virusprot` pf table for follow-on blocks. Operations (rate-limit tuning, log locations, revocation) are documented in `docs/runbooks/kms-public-exposure.md`. 
+ Critical services are scaled to **3 replicas**: - Traefik (PDB: minAvailable=2) - Authentik (PDB: minAvailable=2) diff --git a/docs/runbooks/kms-public-exposure.md b/docs/runbooks/kms-public-exposure.md new file mode 100644 index 00000000..9a6a4a6f --- /dev/null +++ b/docs/runbooks/kms-public-exposure.md @@ -0,0 +1,115 @@ +# Runbook: KMS public exposure (kms.viktorbarzin.me:1688) + +`kms.viktorbarzin.me:1688/TCP` is intentionally open to the internet so any +visitor can activate Volume License Microsoft products. The webpage at +`https://kms.viktorbarzin.me/` documents how to use it. + +This runbook covers operations on the public exposure: where to find logs, +how to tune the rate limit, how to revoke if abused. + +## Architecture + +- **K8s service**: `windows-kms` in namespace `kms`, MetalLB shared LB IP + `10.0.20.200:1688`. ETP=Cluster, so client IPs in vlmcsd logs are SNAT'd + k8s node IPs (not real-world client IPs). Trade-off accepted — + preserving real client IPs would require a dedicated MetalLB IP with + ETP=Local or a PROXY-protocol bounce; vlmcsd doesn't speak PROXY-v2. +- **pfSense WAN forward**: `WAN TCP/1688 → k8s_shared_lb:1688` + (alias = `10.0.20.200`). Description: `KMS public — kms.viktorbarzin.me`. +- **Filter rule** on the WAN interface, TCP/1688, with state-table + per-source caps: + - `max-src-conn 50` — concurrent connections per source IP + - `max-src-conn-rate 10/60` — 10 new connections per 60 seconds per + source + - `overload <virusprot>` flush — sources that exceed either cap get added + to pfSense's stock `virusprot` pf table and have their existing states + flushed. (`virusprot` is the only table pfSense's filter generator + targets for `overload`; see `/etc/inc/filter.inc`. Don't try to point + it at a custom table — the schema doesn't expose that knob.) 
+ +## Where the logs are + +### vlmcsd (kms namespace, k8s) + +```bash +# Live tail +kubectl logs -n kms -l app=kms-service -c windows-kms --tail=50 -f + +# All activations in the running pod +kubectl logs -n kms -l app=kms-service -c windows-kms | grep "Incoming KMS request" +``` + +Source IPs in this log are the SNAT'd node IPs because the LB Service uses +ETP=Cluster on a shared MetalLB IP. Don't expect real WAN client IPs here. + +### Slack notifier (kms namespace, k8s) + +```bash +kubectl logs -n kms -l app=kms-service -c slack-notifier --tail=50 -f +``` + +Posts to `#alerts`, dedup window 1h per (source-IP, product). Activations +also increment the Prometheus counter `kms_activations_total{product,status}` +exposed on the same pod at `:9101/metrics` (scraped by the cluster-wide +`kubernetes-pods` job; query via Prometheus or Grafana directly). + +### pfSense — virusprot table and filter hits + +```bash +# SSH to 10.0.20.1 as root +pfctl -t virusprot -T show # who's currently in the virusprot table +pfctl -t virusprot -T expire 86400 # boot anyone added more than 24h ago +pfctl -t virusprot -T flush # nuke the entire table + +# Filter rule hit counts (find the KMS public rule, look at Evaluations / States) +pfctl -sr -v | grep -A 4 1688 + +# State table — current TCP/1688 connections, per source +pfctl -ss | grep ':1688 ' +``` + +## Tightening or loosening the rate limit + +The filter rule is configured via the pfSense web UI +(`Firewall → Rules → WAN`, look for the `KMS public — kms.viktorbarzin.me` +rule) under **Advanced Options → "Maximum new connections per source per +seconds"** and **"Maximum state entries per source"**. + +- **Default**: `max-src-conn 50`, `max-src-conn-rate 10/60` +- To **tighten** (suspected abuse): drop to `max-src-conn 10`, + `max-src-conn-rate 3/60`. Flush state and existing virusprot afterwards + (`pfctl -k 0.0.0.0/0 -K 0.0.0.0/0` is overkill — just save+apply the + rule, pfSense reloads pf and existing virusprot stay blocked). 
+- To **loosen** (legitimate users blocked): bump to + `max-src-conn-rate 30/60`. The `virusprot` table flush still applies on + overload; reduce its lifetime via + `Firewall → Advanced → State Timeouts` if entries linger. + +The `overload` table entry survives pf reloads. Running +`pfctl -t virusprot -T flush` after a tuning change clears the slate. + +## Revoking the public exposure + +If the activation surface needs to come down (abuse, legal, audit): + +1. **pfSense web UI** → `Firewall → NAT → Port Forward` → find + `WAN TCP/1688 → k8s_shared_lb` → **delete** (or disable). Apply. +2. **pfSense web UI** → `Firewall → Rules → WAN` → find + `KMS public — kms.viktorbarzin.me` → **delete** (or disable). Apply. +3. Verify externally: from a phone tether, `nc -zw3 kms.viktorbarzin.me 1688` + should now fail. + +The k8s service stays reachable on the LAN +(`10.0.20.200:1688` and the internal `kms.viktorbarzin.lan` ingress for +the webpage) — only the WAN port-forward is removed. + +To put it back, recreate the NAT rule (target alias `k8s_shared_lb`, +port `1688`) and the filter rule with the same per-source caps. + +## Related + +- Stack: `stacks/kms/` (Terraform; deployment, MetalLB Service, ingress, + ExternalSecret for the Slack webhook) +- Webpage source: `kms-website/` repo (Hugo + nginx, deployed via Drone CI) +- Networking architecture footnote: + `docs/architecture/networking.md` § "MetalLB & Load Balancing" diff --git a/stacks/kms/files/slack-notifier.py b/stacks/kms/files/slack-notifier.py new file mode 100644 index 00000000..e6f20df2 --- /dev/null +++ b/stacks/kms/files/slack-notifier.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python3 +""" +Tail vlmcsd verbose log; post a Slack message per activation, and expose +Prometheus metrics on /metrics for activation counts. + +vlmcsd verbose output emits a multi-line block per request: + <timestamp>: IPv4 connection accepted: <ip>:<port>. 
+ <timestamp>: <<< Incoming KMS request + <timestamp>: Application ID : <guid> (<name>) + <timestamp>: Activation ID (Product): <guid> (<name>) + <timestamp>: Workstation name : <hostname> + ... + <timestamp>: IPv4 connection closed: <ip>:<port>. + +We accumulate per-connection state and emit on close. Dedupes by +(client_ip, product) within DEDUP_WINDOW_SECONDS to avoid spam from +Windows' default 7-day re-activation cycle hitting us repeatedly. + +Prometheus metrics (text format, no client_ip label — cardinality): + kms_activations_total{product, status} counter + kms_activations_dedup_skipped_total{product} counter + kms_last_activation_timestamp_seconds gauge + kms_slack_notifier_up gauge (heartbeat) +""" +import json +import os +import re +import sys +import threading +import time +import urllib.error +import urllib.request +from collections import OrderedDict +from http.server import BaseHTTPRequestHandler, HTTPServer + +LOG_PATH = os.environ.get("VLMCSD_LOG", "/var/log/vlmcsd/vlmcsd.log") +WEBHOOK = os.environ["SLACK_WEBHOOK_URL"] +CHANNEL = os.environ.get("SLACK_CHANNEL", "#alerts") +DEDUP_WINDOW = int(os.environ.get("DEDUP_WINDOW_SECONDS", "3600")) +DEDUP_MAX = 4096 +METRICS_PORT = int(os.environ.get("METRICS_PORT", "9101")) + +OPEN_RE = re.compile(r":\s*IPv[46] connection accepted:\s*([0-9a-f.:\[\]]+):\d+") +CLOSE_RE = re.compile(r":\s*IPv[46] connection closed:\s*([0-9a-f.:\[\]]+):\d+") +APP_RE = re.compile(r":\s*Application ID\s*:\s*[0-9a-f-]+\s*\(([^)]+)\)") +PROD_RE = re.compile(r":\s*Activation ID \(Product\)\s*:\s*[0-9a-f-]+\s*\(([^)]+)\)") +HOST_RE = re.compile(r":\s*Workstation name\s*:\s*(.+?)\s*$") +STATUS_RE = re.compile(r":\s*Licensing status\s*:\s*\d+\s*\((.+?)\)\s*$") + +_metrics_lock = threading.Lock() +_activations: dict = {} +_dedup_skipped: dict = {} +_last_activation_ts: float = 0.0 + + +def _esc(value: str) -> str: + return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n") + + +def record_activation(product: str, status: str) -> None: + global _last_activation_ts + with _metrics_lock: + key = (product, status) + 
_activations[key] = _activations.get(key, 0) + 1 + _last_activation_ts = time.time() + + +def record_dedup_skip(product: str) -> None: + with _metrics_lock: + _dedup_skipped[product] = _dedup_skipped.get(product, 0) + 1 + + +def render_metrics() -> bytes: + out = [] + with _metrics_lock: + activations = dict(_activations) + dedup_skipped = dict(_dedup_skipped) + last_ts = _last_activation_ts + + out.append("# HELP kms_activations_total KMS activation events that resulted in a Slack post.") + out.append("# TYPE kms_activations_total counter") + for (product, status), count in sorted(activations.items()): + out.append( + f'kms_activations_total{{product="{_esc(product)}",status="{_esc(status)}"}} {count}' + ) + + out.append("# HELP kms_activations_dedup_skipped_total KMS activation events suppressed by dedup window.") + out.append("# TYPE kms_activations_dedup_skipped_total counter") + for product, count in sorted(dedup_skipped.items()): + out.append(f'kms_activations_dedup_skipped_total{{product="{_esc(product)}"}} {count}') + + out.append("# HELP kms_last_activation_timestamp_seconds Unix ts of the last non-deduped activation.") + out.append("# TYPE kms_last_activation_timestamp_seconds gauge") + out.append(f"kms_last_activation_timestamp_seconds {last_ts}") + + out.append("# HELP kms_slack_notifier_up 1 while the notifier process is running.") + out.append("# TYPE kms_slack_notifier_up gauge") + out.append("kms_slack_notifier_up 1") + + return ("\n".join(out) + "\n").encode("utf-8") + + +class MetricsHandler(BaseHTTPRequestHandler): + def do_GET(self): + if self.path == "/healthz": + self.send_response(200) + self.send_header("Content-Type", "text/plain") + self.end_headers() + self.wfile.write(b"ok\n") + return + if self.path != "/metrics": + self.send_response(404) + self.end_headers() + return + body = render_metrics() + self.send_response(200) + self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8") + self.send_header("Content-Length", 
str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, *args, **kwargs): + pass + + +def start_metrics_server() -> None: + server = HTTPServer(("0.0.0.0", METRICS_PORT), MetricsHandler) + print(f"[slack-notifier] metrics on :{METRICS_PORT}/metrics", flush=True) + server.serve_forever() + + +def slack_post(text: str) -> None: + payload = json.dumps({"channel": CHANNEL, "text": text, "username": "kms.viktorbarzin.me", "icon_emoji": ":computer:"}).encode("utf-8") + req = urllib.request.Request(WEBHOOK, data=payload, headers={"Content-Type": "application/json"}) + try: + urllib.request.urlopen(req, timeout=10).read() + except OSError as exc:  # URLError subclasses OSError; also catches read timeouts / connection resets so a Slack outage cannot kill the tail loop + print(f"[slack] post failed: {exc}", file=sys.stderr) + + +class DedupCache(OrderedDict): + def should_send(self, key: str) -> bool: + now = time.time() + while self and (now - next(iter(self.values()))) > DEDUP_WINDOW: + self.popitem(last=False) + if key in self and (now - self[key]) < DEDUP_WINDOW: + return False + if len(self) >= DEDUP_MAX: + self.popitem(last=False) + self[key] = now + self.move_to_end(key) + return True + + +def follow(path: str): + while not os.path.exists(path): + time.sleep(1) + fh = open(path, "r", encoding="utf-8", errors="replace") + fh.seek(0, 2) + inode = os.fstat(fh.fileno()).st_ino + while True: + line = fh.readline() + if line: + yield line.rstrip("\n") + continue + time.sleep(0.5) + try: + new_inode = os.stat(path).st_ino + if new_inode != inode: + fh.close() + fh = open(path, "r", encoding="utf-8", errors="replace") + inode = new_inode + except FileNotFoundError: + time.sleep(1) + + +def main() -> None: + threading.Thread(target=start_metrics_server, daemon=True).start() + + dedup = DedupCache() + print(f"[slack-notifier] tailing {LOG_PATH}, posting to {CHANNEL} via Slack", flush=True) + state: dict = {} + + for line in follow(LOG_PATH): + if (m := OPEN_RE.search(line)): + state = {"ip": m.group(1)} + continue + if not state: + continue + if (m := 
APP_RE.search(line)): + state["app"] = m.group(1) + elif (m := PROD_RE.search(line)): + state["product"] = m.group(1) + elif (m := HOST_RE.search(line)): + state["host"] = m.group(1) + elif (m := STATUS_RE.search(line)): + state["status"] = m.group(1) + elif CLOSE_RE.search(line): + ip = state.get("ip", "?") + product = state.get("product", state.get("app", "unknown")) + host = state.get("host", "?") + status = state.get("status", "unknown") + key = f"{ip}|{product}" + if dedup.should_send(key): + text = ( + f":computer: KMS activation\n" + f"• *Client*: `{ip}`\n" + f"• *Workstation*: `{host.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')}`\n"  # client-supplied name: escape Slack control chars so it cannot inject <!channel> mentions or links + f"• *Product*: `{product}`\n" + f"• *Status before*: {status}" + ) + slack_post(text) + record_activation(product, status) + print(f"[slack-notifier] sent: ip={ip} product={product!r} host={host!r}", flush=True) + else: + record_dedup_skip(product) + print(f"[slack-notifier] dedup-skip: ip={ip} product={product!r}", flush=True) + state = {} + + +if __name__ == "__main__": + main() diff --git a/stacks/kms/main.tf b/stacks/kms/main.tf index e628ecfc..9f8c2094 100644 --- a/stacks/kms/main.tf +++ b/stacks/kms/main.tf @@ -119,6 +119,46 @@ module "ingress" { } } +resource "kubernetes_config_map" "kms_slack_notifier" { + metadata { + name = "kms-slack-notifier" + namespace = kubernetes_namespace.kms.metadata[0].name + } + data = { + "notifier.py" = file("${path.module}/files/slack-notifier.py") + } +} + +resource "kubernetes_manifest" "kms_slack_external_secret" { + manifest = { + apiVersion = "external-secrets.io/v1beta1" + kind = "ExternalSecret" + metadata = { + name = "kms-slack-webhook" + namespace = kubernetes_namespace.kms.metadata[0].name + } + spec = { + refreshInterval = "1h" + secretStoreRef = { + name = "vault-kv" + kind = "ClusterSecretStore" + } + target = { + name = "kms-slack-webhook" + creationPolicy = "Owner" + } + data = [{ + secretKey = "url" + remoteRef = { + key = "kms" + property = "slack_webhook_url" + } + }] + } + } + depends_on = 
[kubernetes_namespace.kms] +} + resource "kubernetes_deployment" "windows_kms" { metadata { name = "kms" @@ -140,11 +180,31 @@ resource "kubernetes_deployment" "windows_kms" { labels = { app = "kms-service" } + annotations = { + # Reload pods when the notifier script changes + "checksum/notifier" = sha1(file("${path.module}/files/slack-notifier.py")) + # Prometheus scrape — kubernetes-pods job picks up via pod IP + "prometheus.io/scrape" = "true" + "prometheus.io/port" = "9101" + "prometheus.io/path" = "/metrics" + } } spec { + volume { + name = "vlmcsd-log" + empty_dir {} + } + volume { + name = "slack-notifier-script" + config_map { + name = kubernetes_config_map.kms_slack_notifier.metadata[0].name + } + } container { - image = "kebe/vlmcsd:latest" - name = "windows-kms" + image = "kebe/vlmcsd:latest" + name = "windows-kms" + command = ["/usr/bin/vlmcsd"] + args = ["-D", "-v", "-l", "/var/log/vlmcsd/vlmcsd.log"] resources { limits = { memory = "64Mi" @@ -157,6 +217,59 @@ resource "kubernetes_deployment" "windows_kms" { port { container_port = 1688 } + volume_mount { + name = "vlmcsd-log" + mount_path = "/var/log/vlmcsd" + } + } + container { + image = "python:3.12-alpine" + name = "slack-notifier" + command = ["python3", "-u", "/scripts/notifier.py"] + env { + name = "VLMCSD_LOG" + value = "/var/log/vlmcsd/vlmcsd.log" + } + env { + name = "SLACK_CHANNEL" + value = "#alerts" + } + env { + name = "DEDUP_WINDOW_SECONDS" + value = "3600" + } + env { + name = "SLACK_WEBHOOK_URL" + value_from { + secret_key_ref { + name = "kms-slack-webhook" + key = "url" + } + } + } + port { + container_port = 9101 + name = "metrics" + } + resources { + limits = { + memory = "64Mi" + } + requests = { + cpu = "5m" + memory = "48Mi" + } + } + volume_mount { + name = "vlmcsd-log" + mount_path = "/var/log/vlmcsd" + read_only = true + } + volume_mount { + name = "slack-notifier-script" + mount_path = "/scripts" + read_only = true + } } } } @@ -165,6 +278,7 @@ resource 
"kubernetes_deployment" "windows_kms" { # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2 ignore_changes = [spec[0].template[0].spec[0].dns_config] } + depends_on = [kubernetes_manifest.kms_slack_external_secret] } resource "kubernetes_service" "windows_kms" {