infra/stacks/kms/files/test_slack_notifier.py
Viktor Barzin d85b54d89d kms: per-connection state in notifier (vlmcsd is multi-threaded)
Bug found via E2E test against the Windows VM (VMID 300). The single
shared `state` dict in slack-notifier.py worked when vlmcsd processed
one connection at a time, but real Windows KMS activations hold the
connection open ~30 seconds (handshake + keep-alive). During that
window vlmcsd accepts other concurrent connections — most relevantly
the new kubelet TCP readiness probe every 5s — and each new OPEN line
reset the shared state, wiping the in-flight activation's
app/product/host before its CLOSE arrived. Result: real activations
were misclassified as probes (no Slack post, no metric increment).

Fix: state is now a dict keyed by `ip:port` with one sub-dict per
in-flight connection. A `__current` pointer tracks the most recent
OPEN so unkeyed detail lines (Application ID, Workstation name, etc.)
can be attributed correctly — vlmcsd writes detail lines immediately
after the OPEN and before any subsequent OPEN, so the heuristic holds.
Orphan CLOSEs (notifier started mid-conn) are now silently dropped
instead of emitting an empty probe event.

Two new regression tests:
- test_kubelet_probe_during_long_activation: 5s probe interleaved into
  a 31s activation block — exact production failure mode.
- test_orphan_close_no_event: bare CLOSE without prior OPEN.

Verified live: triggered slmgr /upk + /ipk + /skms 10.0.20.202 + /ato
on WIN10Pro-DS32. vlmcsd logged the full activation block, notifier
posted to Slack with ip=192.168.1.230 source=external
product='Windows 10 Professional' host='WIN10Pro-DS32.viktorbarzin.lan'
and kms_activations_total{product=Windows 10 Professional,
status=Licensed} 1 — real WAN client IP preserved through the
ETP=Local + dedicated MetalLB IP chain end to end.
2026-05-22 14:16:40 +00:00

151 lines
6.5 KiB
Python

"""Unit tests for slack_notifier classification + state machine.
Run with: cd infra/stacks/kms/files && python3 -m unittest test_slack_notifier
"""
import importlib.util
import os
import unittest
from pathlib import Path
# Load the notifier module from the dashed filename without executing main().
os.environ.setdefault("SLACK_WEBHOOK_URL", "http://example.invalid/webhook")
_spec = importlib.util.spec_from_file_location(
"slack_notifier", Path(__file__).parent / "slack-notifier.py"
)
nm = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(nm)
class ClassifySourceTests(unittest.TestCase):
def test_pod_cidr_is_internal_pod(self):
self.assertEqual(nm.classify_source("10.10.107.224"), "internal_pod")
self.assertEqual(nm.classify_source("10.10.0.1"), "internal_pod")
def test_cluster_lan_is_cluster_node(self):
self.assertEqual(nm.classify_source("10.0.20.103"), "cluster_node")
self.assertEqual(nm.classify_source("10.0.20.200"), "cluster_node")
def test_unknown_source_is_external(self):
self.assertEqual(nm.classify_source("8.8.8.8"), "external")
self.assertEqual(nm.classify_source("203.0.113.42"), "external")
def test_ipv6_external_default(self):
self.assertEqual(nm.classify_source("[2001:db8::1]"), "external")
class IsProbeTests(unittest.TestCase):
def test_open_close_only_is_probe(self):
self.assertTrue(nm.is_probe({"ip": "10.10.107.224"}))
def test_application_id_only_is_not_probe(self):
self.assertFalse(nm.is_probe({"ip": "10.0.20.103", "app": "Windows"}))
def test_product_only_is_not_probe(self):
self.assertFalse(nm.is_probe({"ip": "10.0.20.103", "product": "Office 2021"}))
def test_full_activation_is_not_probe(self):
state = {
"ip": "10.0.20.103",
"app": "Windows",
"product": "Windows 11 Pro",
"host": "DESKTOP-X",
"status": "Notification",
}
self.assertFalse(nm.is_probe(state))
class StateMachineTests(unittest.TestCase):
"""Drive the regex parser through real-shaped vlmcsd log blocks."""
PROBE_BLOCK = [
"2026-05-10 11:00:00: IPv4 connection accepted: 10.10.107.224:54321.",
"2026-05-10 11:00:00: IPv4 connection closed: 10.10.107.224:54321.",
]
ACTIVATION_BLOCK = [
"2026-05-10 11:00:01: IPv4 connection accepted: 10.0.20.103:50001.",
"2026-05-10 11:00:01: <<< Incoming KMS request",
"2026-05-10 11:00:01: Application ID : 55c92734-d682-4d71-983e-d6ec3f16059f (Windows)",
"2026-05-10 11:00:01: Activation ID (Product): 73111121-5638-40f6-bc11-f1d7b0d64300 (Windows 11 Pro)",
"2026-05-10 11:00:01: Workstation name : DESKTOP-MO2323B",
"2026-05-10 11:00:01: Licensing status : 2 (Notification)",
"2026-05-10 11:00:01: IPv4 connection closed: 10.0.20.103:50001.",
]
def _drive(self, lines):
events = []
state = {}
for line in lines:
state, event = nm.process_line(line, state)
if event is not None:
events.append(event)
return events, state
def test_probe_block_emits_probe_event(self):
events, state = self._drive(self.PROBE_BLOCK)
self.assertEqual(len(events), 1)
ev = events[0]
self.assertEqual(ev.kind, "probe")
self.assertEqual(ev.ip, "10.10.107.224")
self.assertEqual(state, {})
def test_activation_block_emits_activation_event(self):
events, state = self._drive(self.ACTIVATION_BLOCK)
self.assertEqual(len(events), 1)
ev = events[0]
self.assertEqual(ev.kind, "activation")
self.assertEqual(ev.ip, "10.0.20.103")
self.assertEqual(ev.product, "Windows 11 Pro")
self.assertEqual(ev.host, "DESKTOP-MO2323B")
self.assertEqual(ev.status, "Notification")
self.assertEqual(state, {})
def test_interleaved_probe_then_activation(self):
events, _ = self._drive(self.PROBE_BLOCK + self.ACTIVATION_BLOCK)
kinds = [e.kind for e in events]
self.assertEqual(kinds, ["probe", "activation"])
def test_kubelet_probe_during_long_activation(self):
"""vlmcsd is multi-threaded. While a real KMS RPC's connection
sits open (Windows holds it ~30s), kubelet's TCP readiness probe
every 5s opens+closes its own connection. The notifier MUST NOT
let the probe's OPEN/CLOSE wipe the in-flight activation's state.
Reproduces the production bug seen on 2026-05-10.
"""
interleaved = [
"2026-05-10 13:12:17: IPv4 connection accepted: 192.168.1.230:53140.",
"2026-05-10 13:12:17: <<< Incoming KMS request",
"2026-05-10 13:12:17: Licensing status : 1 (Licensed)",
"2026-05-10 13:12:17: Application ID : 55c92734-d682-4d71-983e-d6ec3f16059f (Windows)",
"2026-05-10 13:12:17: Activation ID (Product) : 2de67392-b7a7-462a-b1ca-108dd189f588 (Windows 10 Professional)",
"2026-05-10 13:12:17: Workstation name : WIN10Pro-DS32.viktorbarzin.lan",
# ── kubelet probe arrives mid-flight, MUST NOT clobber 53140's state ──
"2026-05-10 13:12:19: IPv4 connection accepted: 10.0.20.102:46498.",
"2026-05-10 13:12:19: IPv4 connection closed: 10.0.20.102:46498.",
"2026-05-10 13:12:24: IPv4 connection accepted: 10.0.20.102:54454.",
"2026-05-10 13:12:24: IPv4 connection closed: 10.0.20.102:54454.",
# ── activation closes 31s after open ──
"2026-05-10 13:12:48: IPv4 connection closed: 192.168.1.230:53140.",
]
events, _ = self._drive(interleaved)
kinds = [e.kind for e in events]
self.assertEqual(kinds, ["probe", "probe", "activation"])
activation = events[-1]
self.assertEqual(activation.ip, "192.168.1.230")
self.assertEqual(activation.product, "Windows 10 Professional")
self.assertEqual(activation.host, "WIN10Pro-DS32.viktorbarzin.lan")
self.assertEqual(activation.status, "Licensed")
def test_orphan_close_no_event(self):
"""If the notifier starts mid-conn, the open was missed but the
close still fires. We MUST NOT emit an event for that — it would
show up with empty fields and look like a probe."""
orphan = [
"2026-05-10 13:00:00: IPv4 connection closed: 192.168.1.230:55555.",
]
events, _ = self._drive(orphan)
self.assertEqual(events, [])
if __name__ == "__main__":
unittest.main()