kms: per-connection state in notifier (vlmcsd is multi-threaded)
Bug found via E2E test against the Windows VM (VMID 300). The single
shared `state` dict in slack-notifier.py worked when vlmcsd processed
one connection at a time, but real Windows KMS activations hold the
connection open ~30 seconds (handshake + keep-alive). During that
window vlmcsd accepts other concurrent connections — most relevantly
the new kubelet TCP readiness probe every 5s — and each new OPEN line
reset the shared state, wiping the in-flight activation's
app/product/host before its CLOSE arrived. Result: real activations
were misclassified as probes (no Slack post, no metric increment).
Fix: state is now a dict keyed by `ip:port` with one sub-dict per
in-flight connection. A `__current` pointer tracks the most recent
OPEN so unkeyed detail lines (Application ID, Workstation name, etc.)
can be attributed correctly — vlmcsd writes detail lines immediately
after the OPEN and before any subsequent OPEN, so the heuristic holds.
Orphan CLOSEs (notifier started mid-conn) are now silently dropped
instead of emitting an empty probe event.
Two new regression tests:
- test_kubelet_probe_during_long_activation: 5s probe interleaved into
a 31s activation block — exact production failure mode.
- test_orphan_close_no_event: bare CLOSE without prior OPEN.
Verified live: triggered slmgr /upk + /ipk + /skms 10.0.20.202 + /ato
on WIN10Pro-DS32. vlmcsd logged the full activation block, notifier
posted to Slack with ip=192.168.1.230 source=external
product='Windows 10 Professional' host='WIN10Pro-DS32.viktorbarzin.lan'
and kms_activations_total{product=Windows 10 Professional,
status=Licensed} 1 — real WAN client IP preserved through the
ETP=Local + dedicated MetalLB IP chain end to end.
This commit is contained in:
parent
9163909fad
commit
d67c3027bc
2 changed files with 81 additions and 27 deletions
|
|
@ -45,8 +45,8 @@ DEDUP_WINDOW = int(os.environ.get("DEDUP_WINDOW_SECONDS", "3600"))
|
|||
DEDUP_MAX = 4096
|
||||
METRICS_PORT = int(os.environ.get("METRICS_PORT", "9101"))
|
||||
|
||||
OPEN_RE = re.compile(r":\s*IPv[46] connection accepted:\s*([0-9a-f.:\[\]]+):\d+")
|
||||
CLOSE_RE = re.compile(r":\s*IPv[46] connection closed:\s*([0-9a-f.:\[\]]+):\d+")
|
||||
OPEN_RE = re.compile(r":\s*IPv[46] connection accepted:\s*([0-9a-f.:\[\]]+):(\d+)")
|
||||
CLOSE_RE = re.compile(r":\s*IPv[46] connection closed:\s*([0-9a-f.:\[\]]+):(\d+)")
|
||||
APP_RE = re.compile(r":\s*Application ID\s*:\s*[0-9a-f-]+\s*\(([^)]+)\)")
|
||||
PROD_RE = re.compile(r":\s*Activation ID \(Product\)\s*:\s*[0-9a-f-]+\s*\(([^)]+)\)")
|
||||
HOST_RE = re.compile(r":\s*Workstation name\s*:\s*(.+?)\s*$")
|
||||
|
|
@ -106,40 +106,53 @@ class Event:
|
|||
def process_line(line: str, state: dict):
|
||||
"""Drive the parser one line at a time. Returns (new_state, event_or_None).
|
||||
|
||||
On `connection accepted`, a new state dict is started.
|
||||
On `connection closed`, an Event is emitted and state is reset.
|
||||
Other lines accumulate fields into the current state.
|
||||
`state` tracks per-connection state keyed by `ip:port` (vlmcsd is
|
||||
multi-threaded — concurrent connections interleave in the log, so a
|
||||
single shared dict gets clobbered). The special key `__current` points
|
||||
at the most recent OPEN's key so detail lines (which lack ip:port
|
||||
info) can be attributed to the right connection. Detail lines arrive
|
||||
before the next OPEN under vlmcsd's processing model.
|
||||
"""
|
||||
if (m := OPEN_RE.search(line)):
|
||||
return {"ip": m.group(1)}, None
|
||||
if not state:
|
||||
ip = m.group(1)
|
||||
key = f"{ip}:{m.group(2)}"
|
||||
state[key] = {"ip": ip}
|
||||
state["__current"] = key
|
||||
return state, None
|
||||
if (m := APP_RE.search(line)):
|
||||
state["app"] = m.group(1)
|
||||
return state, None
|
||||
if (m := PROD_RE.search(line)):
|
||||
state["product"] = m.group(1)
|
||||
return state, None
|
||||
if (m := HOST_RE.search(line)):
|
||||
state["host"] = m.group(1)
|
||||
return state, None
|
||||
if (m := STATUS_RE.search(line)):
|
||||
state["status"] = m.group(1)
|
||||
return state, None
|
||||
if CLOSE_RE.search(line):
|
||||
ip = state.get("ip", "?")
|
||||
|
||||
if (m := CLOSE_RE.search(line)):
|
||||
key = f"{m.group(1)}:{m.group(2)}"
|
||||
conn = state.pop(key, None)
|
||||
if state.get("__current") == key:
|
||||
state.pop("__current", None)
|
||||
if conn is None:
|
||||
return state, None # orphan close (e.g., notifier started mid-conn)
|
||||
ip = conn.get("ip", "?")
|
||||
source = classify_source(ip)
|
||||
if is_probe(state):
|
||||
if is_probe(conn):
|
||||
event = Event("probe", ip, source)
|
||||
else:
|
||||
event = Event(
|
||||
"activation", ip, source,
|
||||
app=state.get("app", ""),
|
||||
product=state.get("product", state.get("app", "unknown")),
|
||||
host=state.get("host", "?"),
|
||||
status=state.get("status", "unknown"),
|
||||
app=conn.get("app", ""),
|
||||
product=conn.get("product", conn.get("app", "unknown")),
|
||||
host=conn.get("host", "?"),
|
||||
status=conn.get("status", "unknown"),
|
||||
)
|
||||
return {}, event
|
||||
return state, event
|
||||
|
||||
current = state.get("__current")
|
||||
if not current or current not in state:
|
||||
return state, None
|
||||
conn = state[current]
|
||||
if (m := APP_RE.search(line)):
|
||||
conn["app"] = m.group(1)
|
||||
elif (m := PROD_RE.search(line)):
|
||||
conn["product"] = m.group(1)
|
||||
elif (m := HOST_RE.search(line)):
|
||||
conn["host"] = m.group(1)
|
||||
elif (m := STATUS_RE.search(line)):
|
||||
conn["status"] = m.group(1)
|
||||
return state, None
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue