kms: deploy slack-notifier sidecar with Prometheus metrics + document public exposure
Slack notifier now also exposes /metrics on :9101 with stdlib HTTP — counts activations and dedup-skips by product, gauges last-activation timestamp. Pod template gets the standard prometheus.io/scrape annotations so the cluster-wide kubernetes-pods job picks it up via pod IP. Memory request bumped to 48Mi to cover counter dicts + HTTPServer. Plus docs: networking.md footnotes the windows-kms row noting public WAN exposure with the rate-limited (max-src-conn 50, max-src-conn-rate 10/60, overload <virusprot> flush) pfSense filter rule, and a new runbook covers log locations, rate-limit tuning, and how to revoke the WAN forward. The matching pfSense rule was tightened in place (TCP-only + rate limits) via SSH; pfSense isn't Terraform-managed.
This commit is contained in:
parent
efadeb531d
commit
08edd92b22
4 changed files with 456 additions and 3 deletions
222
stacks/kms/files/slack-notifier.py
Normal file
222
stacks/kms/files/slack-notifier.py
Normal file
|
|
@ -0,0 +1,222 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tail vlmcsd verbose log; post a Slack message per activation, and expose
|
||||
Prometheus metrics on /metrics for activation counts.
|
||||
|
||||
vlmcsd verbose output emits a multi-line block per request:
|
||||
<ts>: IPv4 connection accepted: <ip>:<port>.
|
||||
<ts>: <<< Incoming KMS request
|
||||
<ts>: Application ID : <uuid> (<name>)
|
||||
<ts>: Activation ID (Product): <uuid> (<product>)
|
||||
<ts>: Workstation name : <hostname>
|
||||
...
|
||||
<ts>: IPv4 connection closed: <ip>:<port>.
|
||||
|
||||
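We accumulate state for the connection block currently being logged and emit
when its "connection closed" line arrives. Only one in-flight block is tracked,
so interleaved output from concurrent connections can be mis-attributed.
Dedupes by (client_ip, product) within DEDUP_WINDOW_SECONDS to avoid spam from
Windows' default 7-day re-activation cycle hitting us repeatedly.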
|
||||
|
||||
Prometheus metrics (text format, no client_ip label — cardinality):
|
||||
kms_activations_total{product, status} counter
|
||||
kms_activations_dedup_skipped_total{product} counter
|
||||
kms_last_activation_timestamp_seconds gauge
|
||||
kms_slack_notifier_up gauge (heartbeat)
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from collections import OrderedDict
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
|
||||
# --- Runtime configuration, read once from the environment at import ---------

# Log file written by vlmcsd that this process tails.
LOG_PATH = os.getenv("VLMCSD_LOG", "/var/log/vlmcsd/vlmcsd.log")

# Slack incoming-webhook URL. Mandatory: a missing variable raises KeyError
# here, before anything else starts.
WEBHOOK = os.environ["SLACK_WEBHOOK_URL"]
CHANNEL = os.getenv("SLACK_CHANNEL", "#alerts")

# Seconds during which a repeated dedup key is suppressed.
DEDUP_WINDOW = int(os.getenv("DEDUP_WINDOW_SECONDS", "3600"))
# Upper bound on the number of keys the dedup cache retains.
DEDUP_MAX = 4096

# TCP port for the /metrics and /healthz HTTP endpoint.
METRICS_PORT = int(os.getenv("METRICS_PORT", "9101"))
|
||||
|
||||
# Patterns for the vlmcsd verbose-log lines the notifier reacts to. Each one
# anchors on the ':' that ends the timestamp prefix of the line.

# Connection open/close lines differ only in the verb. Group 1 is the peer
# address; the trailing ":<port>" is matched but not captured.
_CONN_PREFIX = r":\s*IPv[46] connection "
_CONN_PEER = r":\s*([0-9a-f.:\[\]]+):\d+"
OPEN_RE = re.compile(_CONN_PREFIX + "accepted" + _CONN_PEER)
CLOSE_RE = re.compile(_CONN_PREFIX + "closed" + _CONN_PEER)

# "<label> : <uuid> (<name>)" lines. Group 1 is the name inside the parentheses.
_UUID_AND_NAME = r"\s*:\s*[0-9a-f-]+\s*\(([^)]+)\)"
APP_RE = re.compile(r":\s*Application ID" + _UUID_AND_NAME)
PROD_RE = re.compile(r":\s*Activation ID \(Product\)" + _UUID_AND_NAME)

# Group 1 is the workstation name with surrounding whitespace trimmed.
HOST_RE = re.compile(r":\s*Workstation name\s*:\s*(.+?)\s*$")
# "<code> (<text>)"; group 1 is the text inside the parentheses.
STATUS_RE = re.compile(r":\s*Licensing status\s*:\s*\d+\s*\((.+?)\)\s*$")
|
||||
|
||||
# Metric state shared between the log-tailing loop and the HTTP handler thread.
# _metrics_lock is the lock intended to guard the three values below.
_metrics_lock = threading.Lock()

# (product, status) -> number of activations counted
_activations: dict[tuple[str, str], int] = {}
# product -> number of activations suppressed by the dedup window
_dedup_skipped: dict[str, int] = {}
# Unix time of the most recent counted activation; 0.0 until the first one
_last_activation_ts: float = 0.0
|
||||
|
||||
|
||||
# Characters that must be backslash-escaped inside a quoted label value.
_LABEL_ESCAPES = str.maketrans({"\\": "\\\\", '"': '\\"', "\n": "\\n"})


def _esc(value: str) -> str:
    """Return *value* escaped for use inside a double-quoted metric label.

    The argument is coerced with ``str()``; backslash, double quote and
    newline are each replaced by their backslash-escaped form in one pass.
    """
    return str(value).translate(_LABEL_ESCAPES)
|
||||
|
||||
|
||||
def record_activation(product: str, status: str) -> None:
    """Count one activation for ``(product, status)`` and stamp the current time.

    Both the counter bump and the timestamp write happen while holding
    ``_metrics_lock``.
    """
    global _last_activation_ts

    labels = (product, status)
    with _metrics_lock:
        seen_so_far = _activations.get(labels, 0)
        _activations[labels] = seen_so_far + 1
        _last_activation_ts = time.time()
|
||||
|
||||
|
||||
def record_dedup_skip(product: str) -> None:
    """Count one activation for *product* that the dedup window suppressed."""
    with _metrics_lock:
        suppressed_so_far = _dedup_skipped.get(product, 0)
        _dedup_skipped[product] = suppressed_so_far + 1
|
||||
|
||||
|
||||
def render_metrics() -> bytes:
    """Render the current metric state as UTF-8 text, one sample per line.

    Counter state is snapshotted while holding ``_metrics_lock``; the text is
    assembled afterwards. Series are emitted in sorted label order and the
    payload always ends with a newline.
    """
    with _metrics_lock:
        posted = sorted(_activations.items())
        suppressed = sorted(_dedup_skipped.items())
        last_seen = _last_activation_ts

    lines = [
        "# HELP kms_activations_total KMS activation events that resulted in a Slack post.",
        "# TYPE kms_activations_total counter",
    ]
    lines.extend(
        f'kms_activations_total{{product="{_esc(prod)}",status="{_esc(state)}"}} {total}'
        for (prod, state), total in posted
    )

    lines += [
        "# HELP kms_activations_dedup_skipped_total KMS activation events suppressed by dedup window.",
        "# TYPE kms_activations_dedup_skipped_total counter",
    ]
    lines.extend(
        f'kms_activations_dedup_skipped_total{{product="{_esc(prod)}"}} {total}'
        for prod, total in suppressed
    )

    lines += [
        "# HELP kms_last_activation_timestamp_seconds Unix ts of the last non-deduped activation.",
        "# TYPE kms_last_activation_timestamp_seconds gauge",
        f"kms_last_activation_timestamp_seconds {last_seen}",
        "# HELP kms_slack_notifier_up 1 while the notifier process is running.",
        "# TYPE kms_slack_notifier_up gauge",
        "kms_slack_notifier_up 1",
    ]

    return "".join(f"{line}\n" for line in lines).encode("utf-8")
|
||||
|
||||
|
||||
class MetricsHandler(BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
if self.path == "/healthz":
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "text/plain")
|
||||
self.end_headers()
|
||||
self.wfile.write(b"ok\n")
|
||||
return
|
||||
if self.path != "/metrics":
|
||||
self.send_response(404)
|
||||
self.end_headers()
|
||||
return
|
||||
body = render_metrics()
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
||||
self.send_header("Content-Length", str(len(body)))
|
||||
self.end_headers()
|
||||
self.wfile.write(body)
|
||||
|
||||
def log_message(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
|
||||
def start_metrics_server() -> None:
    """Bind the metrics HTTP server on all interfaces and serve requests.

    Blocks in ``serve_forever()``, so it is meant to run on its own thread.
    """
    listen_on = ("0.0.0.0", METRICS_PORT)
    httpd = HTTPServer(listen_on, MetricsHandler)
    print(f"[slack-notifier] metrics on :{METRICS_PORT}/metrics", flush=True)
    httpd.serve_forever()
|
||||
|
||||
|
||||
def slack_post(text: str) -> None:
    """Send *text* to the Slack webhook, best-effort.

    The message goes to ``CHANNEL`` through ``WEBHOOK`` as a JSON POST with a
    10-second timeout. Delivery failures are reported on stderr and swallowed
    so that a Slack outage cannot stop the log-tailing loop.

    Args:
        text: Message body, passed as the payload's ``text`` field.
    """
    # Local import: the module header does not import http.client.
    import http.client

    payload = json.dumps(
        {
            "channel": CHANNEL,
            "text": text,
            "username": "kms.viktorbarzin.me",
            "icon_emoji": ":computer:",
        }
    ).encode("utf-8")
    req = urllib.request.Request(
        WEBHOOK, data=payload, headers={"Content-Type": "application/json"}
    )
    try:
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(req, timeout=10) as resp:
            resp.read()
    except (OSError, http.client.HTTPException) as exc:
        # URLError/HTTPError are OSError subclasses. A timeout while reading
        # the body is a bare TimeoutError (also OSError), and protocol-level
        # failures are HTTPException; neither is wrapped in URLError.
        print(f"[slack] post failed: {exc}", file=sys.stderr)
|
||||
|
||||
|
||||
class DedupCache(OrderedDict):
    """Ordered map of dedup key -> time the key was last allowed through.

    Entries are kept oldest-first, so expiry and size-cap eviction both pop
    from the front.
    """

    def should_send(self, key: str) -> bool:
        """Return True and remember *key* unless it was seen within the window.

        Entries older than ``DEDUP_WINDOW`` seconds are dropped first. A key
        seen less than ``DEDUP_WINDOW`` seconds ago returns False and keeps
        its original timestamp. Otherwise the key is stored with the current
        time as the newest entry, evicting the oldest entry first when the
        cache already holds ``DEDUP_MAX`` keys.
        """
        now = time.time()

        # Expire from the front until the oldest entry is inside the window.
        while self:
            oldest_seen = next(iter(self.values()))
            if now - oldest_seen <= DEDUP_WINDOW:
                break
            self.popitem(last=False)

        last_seen = self.get(key)
        if last_seen is not None and now - last_seen < DEDUP_WINDOW:
            return False

        if len(self) >= DEDUP_MAX:
            self.popitem(last=False)
        self[key] = now
        self.move_to_end(key)
        return True
|
||||
|
||||
|
||||
def follow(path: str, poll_interval: float = 0.5):
    """Yield lines appended to *path*, in the manner of ``tail -F``.

    Waits for the file to exist, starts at its current end (existing content
    is skipped), then yields each complete line without its trailing newline.
    When the path starts pointing at a different inode, the new file is read
    from its beginning.

    Args:
        path: File to follow.
        poll_interval: Seconds to sleep when no new data is available.

    Yields:
        One ``str`` per line. Undecodable bytes are replaced rather than
        raising.
    """
    while not os.path.exists(path):
        time.sleep(1)

    fh = open(path, "r", encoding="utf-8", errors="replace")
    try:
        fh.seek(0, os.SEEK_END)
        inode = os.fstat(fh.fileno()).st_ino
        partial = ""  # text of a line whose newline has not been written yet
        while True:
            chunk = fh.readline()
            if chunk:
                partial += chunk
                # At EOF readline() can return a half-written line; hold it
                # back until the rest arrives instead of yielding two pieces.
                if partial.endswith("\n"):
                    yield partial.rstrip("\n")
                    partial = ""
                continue

            time.sleep(poll_interval)
            try:
                if os.stat(path).st_ino == inode:
                    continue
                # Open the replacement before closing the old handle, so a
                # failed open never leaves the loop holding a closed file.
                replacement = open(path, "r", encoding="utf-8", errors="replace")
            except FileNotFoundError:
                time.sleep(1)
                continue

            fh.close()
            fh = replacement
            # Take the inode from the handle actually opened, not from the
            # earlier stat, in case the path changed again in between.
            inode = os.fstat(fh.fileno()).st_ino
            if partial:
                # The old file ended without a newline; flush what it had.
                leftover, partial = partial, ""
                yield leftover
    finally:
        # Runs when the generator is closed or garbage-collected.
        fh.close()
|
||||
|
||||
|
||||
def _slack_safe(value: str) -> str:
    """Neutralise Slack control characters in log-derived text.

    ``&``, ``<`` and ``>`` become HTML entities, so the text cannot form
    ``<!channel>``-style mentions or links. Backticks become apostrophes, so
    the text cannot close the code span it is placed in.
    """
    escaped = str(value).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
    return escaped.replace("`", "'")


def _format_activation(ip: str, host: str, product: str, status: str) -> str:
    """Build the Slack message for one activation, escaping every field."""
    return (
        f":computer: KMS activation\n"
        f"• *Client*: `{_slack_safe(ip)}`\n"
        f"• *Workstation*: `{_slack_safe(host)}`\n"
        f"• *Product*: `{_slack_safe(product)}`\n"
        f"• *Status before*: {_slack_safe(status)}"
    )


def _report_connection(state: dict, dedup) -> None:
    """Post to Slack or count a dedup skip for one finished connection block."""
    ip = state.get("ip", "?")
    # Fall back to the application name when no product line was seen.
    product = state.get("product", state.get("app", "unknown"))
    host = state.get("host", "?")
    status = state.get("status", "unknown")

    if dedup.should_send(f"{ip}|{product}"):
        slack_post(_format_activation(ip, host, product, status))
        record_activation(product, status)
        print(f"[slack-notifier] sent: ip={ip} product={product!r} host={host!r}", flush=True)
    else:
        record_dedup_skip(product)
        print(f"[slack-notifier] dedup-skip: ip={ip} product={product!r}", flush=True)


def main() -> None:
    """Tail the vlmcsd log and turn each connection block into a notification.

    Starts the metrics HTTP server on a daemon thread, then loops over
    ``follow(LOG_PATH)`` indefinitely. A "connection accepted" line starts a
    fresh state dict; application, product, workstation and status lines fill
    it in; a "connection closed" line reports it and clears the state.

    Only one state dict is tracked, so log lines interleaved from concurrent
    connections are attributed to whichever connection opened most recently.
    Workstation names and other log-derived fields originate from the remote
    client and are escaped before being placed in the Slack message.
    """
    threading.Thread(target=start_metrics_server, daemon=True).start()

    dedup = DedupCache()
    print(f"[slack-notifier] tailing {LOG_PATH}, posting to {CHANNEL} via Slack", flush=True)
    state: dict = {}

    for line in follow(LOG_PATH):
        if (m := OPEN_RE.search(line)):
            state = {"ip": m.group(1)}
            continue
        if not state:
            # Nothing is recorded until an "accepted" line has been seen.
            continue
        if (m := APP_RE.search(line)):
            state["app"] = m.group(1)
        elif (m := PROD_RE.search(line)):
            state["product"] = m.group(1)
        elif (m := HOST_RE.search(line)):
            state["host"] = m.group(1)
        elif (m := STATUS_RE.search(line)):
            state["status"] = m.group(1)
        elif CLOSE_RE.search(line):
            _report_connection(state, dedup)
            state = {}


if __name__ == "__main__":
    main()
|
||||
|
|
@ -119,6 +119,46 @@ module "ingress" {
|
|||
}
|
||||
}
|
||||
|
||||
# Publishes files/slack-notifier.py as the "notifier.py" key of a ConfigMap in
# the kms namespace.
resource "kubernetes_config_map" "kms_slack_notifier" {
  metadata {
    name      = "kms-slack-notifier"
    namespace = kubernetes_namespace.kms.metadata[0].name
  }

  data = {
    "notifier.py" = file("${path.module}/files/slack-notifier.py")
  }
}
|
||||
|
||||
# ExternalSecret that reads property "slack_webhook_url" of remote key "kms"
# through the "vault-kv" ClusterSecretStore and writes it to key "url" of the
# target Secret "kms-slack-webhook", refreshed hourly.
resource "kubernetes_manifest" "kms_slack_external_secret" {
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      name      = "kms-slack-webhook"
      namespace = kubernetes_namespace.kms.metadata[0].name
    }

    spec = {
      refreshInterval = "1h"

      secretStoreRef = {
        name = "vault-kv"
        kind = "ClusterSecretStore"
      }

      target = {
        name           = "kms-slack-webhook"
        creationPolicy = "Owner"
      }

      data = [{
        secretKey = "url"
        remoteRef = {
          key      = "kms"
          property = "slack_webhook_url"
        }
      }]
    }
  }

  depends_on = [kubernetes_namespace.kms]
}
|
||||
|
||||
resource "kubernetes_deployment" "windows_kms" {
|
||||
metadata {
|
||||
name = "kms"
|
||||
|
|
@ -140,11 +180,31 @@ resource "kubernetes_deployment" "windows_kms" {
|
|||
labels = {
|
||||
app = "kms-service"
|
||||
}
|
||||
annotations = {
|
||||
# Reload pods when the notifier script changes
|
||||
"checksum/notifier" = sha1(file("${path.module}/files/slack-notifier.py"))
|
||||
# Prometheus scrape — kubernetes-pods job picks up via pod IP
|
||||
"prometheus.io/scrape" = "true"
|
||||
"prometheus.io/port" = "9101"
|
||||
"prometheus.io/path" = "/metrics"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
volume {
|
||||
name = "vlmcsd-log"
|
||||
empty_dir {}
|
||||
}
|
||||
volume {
|
||||
name = "slack-notifier-script"
|
||||
config_map {
|
||||
name = kubernetes_config_map.kms_slack_notifier.metadata[0].name
|
||||
}
|
||||
}
|
||||
container {
|
||||
image = "kebe/vlmcsd:latest"
|
||||
name = "windows-kms"
|
||||
image = "kebe/vlmcsd:latest"
|
||||
name = "windows-kms"
|
||||
command = ["/usr/bin/vlmcsd"]
|
||||
args = ["-D", "-v", "-l", "/var/log/vlmcsd/vlmcsd.log"]
|
||||
resources {
|
||||
limits = {
|
||||
memory = "64Mi"
|
||||
|
|
@ -157,6 +217,59 @@ resource "kubernetes_deployment" "windows_kms" {
|
|||
port {
|
||||
container_port = 1688
|
||||
}
|
||||
volume_mount {
|
||||
name = "vlmcsd-log"
|
||||
mount_path = "/var/log/vlmcsd"
|
||||
}
|
||||
}
|
||||
container {
|
||||
image = "python:3.12-alpine"
|
||||
name = "slack-notifier"
|
||||
command = ["python3", "-u", "/scripts/notifier.py"]
|
||||
env {
|
||||
name = "VLMCSD_LOG"
|
||||
value = "/var/log/vlmcsd/vlmcsd.log"
|
||||
}
|
||||
env {
|
||||
name = "SLACK_CHANNEL"
|
||||
value = "#alerts"
|
||||
}
|
||||
env {
|
||||
name = "DEDUP_WINDOW_SECONDS"
|
||||
value = "3600"
|
||||
}
|
||||
env {
|
||||
name = "SLACK_WEBHOOK_URL"
|
||||
value_from {
|
||||
secret_key_ref {
|
||||
name = "kms-slack-webhook"
|
||||
key = "url"
|
||||
}
|
||||
}
|
||||
}
|
||||
port {
|
||||
container_port = 9101
|
||||
name = "metrics"
|
||||
}
|
||||
resources {
|
||||
limits = {
|
||||
memory = "64Mi"
|
||||
}
|
||||
requests = {
|
||||
cpu = "5m"
|
||||
memory = "48Mi"
|
||||
}
|
||||
}
|
||||
volume_mount {
|
||||
name = "vlmcsd-log"
|
||||
mount_path = "/var/log/vlmcsd"
|
||||
read_only = true
|
||||
}
|
||||
volume_mount {
|
||||
name = "slack-notifier-script"
|
||||
mount_path = "/scripts"
|
||||
read_only = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -165,6 +278,7 @@ resource "kubernetes_deployment" "windows_kms" {
|
|||
# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
|
||||
ignore_changes = [spec[0].template[0].spec[0].dns_config]
|
||||
}
|
||||
depends_on = [kubernetes_manifest.kms_slack_external_secret]
|
||||
}
|
||||
|
||||
resource "kubernetes_service" "windows_kms" {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue