kms: dedicate MetalLB IP 10.0.20.202 + filter probe noise

Two coupled fixes for the hourly Slack noise + missing client IPs:

1. Move windows-kms off shared 10.0.20.200 to a dedicated MetalLB IP
   10.0.20.202 with externalTrafficPolicy=Local, so vlmcsd sees real
   WAN client IPs (pfSense WAN forwards do DNAT-only; ETP=Local skips
   kube-proxy SNAT). Same pattern mailserver used pre-2026-04-19.
   Sharing 10.0.20.200 is blocked because all 10 services there are
   ETP=Cluster and MetalLB requires consistent ETP per shared IP.

2. Slack notifier now suppresses Slack posts for bare TCP open/close
   pairs (no Application/Activation block) — these are Uptime Kuma's
   port monitor and the new kubelet readiness/liveness probes. Probe
   counts go to a new metric kms_connection_probes_total{source} where
   source classifies the IP as internal_pod / cluster_node / external.
   Real activations are unaffected.

Pod fluidity: added TCP readiness/liveness probes on 1688 to gate Pod
Ready on the listener actually being up — required for ETP=Local so
MetalLB only advertises 10.0.20.202 from a node where vlmcsd is serving.

pfSense side (applied separately, not codified):
- New alias k8s_kms_lb = 10.0.20.202 (KMS-only)
- WAN:1688 NAT + filter rule retargeted from k8s_shared_lb to k8s_kms_lb
- All other forwards on k8s_shared_lb (WireGuard, HTTPS, shadowsocks,
  smtps, etc.) untouched

Runbook updated. Tests added for classify_source / is_probe / process_line.
This commit is contained in:
Viktor Barzin 2026-05-10 13:02:58 +00:00
parent 295cfd776c
commit 9d4e2fc4a0
5 changed files with 317 additions and 56 deletions

View file

@ -12,16 +12,21 @@ vlmcsd verbose output emits a multi-line block per request:
...
<ts>: IPv4 connection closed: <ip>:<port>.
We accumulate per-connection state and emit on close. Dedupes by
(client_ip, product) within DEDUP_WINDOW_SECONDS to avoid spam from
Windows' default 7-day re-activation cycle hitting us repeatedly.
A bare TCP open/close pair (no Application/Activation lines) is a probe
typically Uptime Kuma's port-type monitor on windows-kms.kms:1688. Probes
are counted in `kms_connection_probes_total` but never posted to Slack.
Real activations dedupe by (client_ip, product) within DEDUP_WINDOW_SECONDS
to avoid spam from Windows' default 7-day re-activation cycle.
Prometheus metrics (text format, no client_ip label cardinality):
kms_activations_total{product, status} counter
kms_activations_dedup_skipped_total{product} counter
kms_connection_probes_total{source} counter (probe-only conns)
kms_last_activation_timestamp_seconds gauge
kms_slack_notifier_up gauge (heartbeat)
"""
import ipaddress
import json
import os
import re
@ -47,9 +52,16 @@ PROD_RE = re.compile(r":\s*Activation ID \(Product\)\s*:\s*[0-9a-f-]+\s*\(([^)]+
HOST_RE = re.compile(r":\s*Workstation name\s*:\s*(.+?)\s*$")
STATUS_RE = re.compile(r":\s*Licensing status\s*:\s*\d+\s*\((.+?)\)\s*$")
# Pod CIDR (Calico, kube-proxy SNAT-free intra-cluster traffic) and cluster
# LAN (kube-proxy SNATs ETP=Cluster external traffic to a node IP). Anything
# else is a real client IP that arrived via ETP=Local or pod-to-svc routing.
POD_CIDR = ipaddress.ip_network("10.10.0.0/16")
CLUSTER_LAN_CIDR = ipaddress.ip_network("10.0.20.0/24")
_metrics_lock = threading.Lock()
_activations: dict = {}
_dedup_skipped: dict = {}
_probes: dict = {}
_last_activation_ts: float = 0.0
@ -57,6 +69,80 @@ def _esc(value: str) -> str:
return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
def classify_source(ip: str) -> str:
"""Return 'internal_pod', 'cluster_node', or 'external' for a vlmcsd-logged IP."""
raw = ip.strip().strip("[]")
try:
addr = ipaddress.ip_address(raw)
except ValueError:
return "external"
if isinstance(addr, ipaddress.IPv4Address):
if addr in POD_CIDR:
return "internal_pod"
if addr in CLUSTER_LAN_CIDR:
return "cluster_node"
return "external"
def is_probe(state: dict) -> bool:
"""A connection that closed without any Application/Activation block."""
return "app" not in state and "product" not in state
class Event:
__slots__ = ("kind", "ip", "source", "app", "product", "host", "status")
def __init__(self, kind: str, ip: str, source: str, *, app: str = "",
product: str = "", host: str = "", status: str = "") -> None:
self.kind = kind
self.ip = ip
self.source = source
self.app = app
self.product = product
self.host = host
self.status = status
def process_line(line: str, state: dict):
"""Drive the parser one line at a time. Returns (new_state, event_or_None).
On `connection accepted`, a new state dict is started.
On `connection closed`, an Event is emitted and state is reset.
Other lines accumulate fields into the current state.
"""
if (m := OPEN_RE.search(line)):
return {"ip": m.group(1)}, None
if not state:
return state, None
if (m := APP_RE.search(line)):
state["app"] = m.group(1)
return state, None
if (m := PROD_RE.search(line)):
state["product"] = m.group(1)
return state, None
if (m := HOST_RE.search(line)):
state["host"] = m.group(1)
return state, None
if (m := STATUS_RE.search(line)):
state["status"] = m.group(1)
return state, None
if CLOSE_RE.search(line):
ip = state.get("ip", "?")
source = classify_source(ip)
if is_probe(state):
event = Event("probe", ip, source)
else:
event = Event(
"activation", ip, source,
app=state.get("app", ""),
product=state.get("product", state.get("app", "unknown")),
host=state.get("host", "?"),
status=state.get("status", "unknown"),
)
return {}, event
return state, None
def record_activation(product: str, status: str) -> None:
global _last_activation_ts
with _metrics_lock:
@ -70,11 +156,17 @@ def record_dedup_skip(product: str) -> None:
_dedup_skipped[product] = _dedup_skipped.get(product, 0) + 1
def record_probe(source: str) -> None:
with _metrics_lock:
_probes[source] = _probes.get(source, 0) + 1
def render_metrics() -> bytes:
out = []
with _metrics_lock:
activations = dict(_activations)
dedup_skipped = dict(_dedup_skipped)
probes = dict(_probes)
last_ts = _last_activation_ts
out.append("# HELP kms_activations_total KMS activation events that resulted in a Slack post.")
@ -89,6 +181,11 @@ def render_metrics() -> bytes:
for product, count in sorted(dedup_skipped.items()):
out.append(f'kms_activations_dedup_skipped_total{{product="{_esc(product)}"}} {count}')
out.append("# HELP kms_connection_probes_total Probe-only TCP connections (open+close, no KMS RPC).")
out.append("# TYPE kms_connection_probes_total counter")
for source, count in sorted(probes.items()):
out.append(f'kms_connection_probes_total{{source="{_esc(source)}"}} {count}')
out.append("# HELP kms_last_activation_timestamp_seconds Unix ts of the last non-deduped activation.")
out.append("# TYPE kms_last_activation_timestamp_seconds gauge")
out.append(f"kms_last_activation_timestamp_seconds {last_ts}")
@ -174,6 +271,40 @@ def follow(path: str):
time.sleep(1)
def handle_event(event: "Event", dedup: "DedupCache") -> None:
if event.kind == "probe":
record_probe(event.source)
print(
f"[slack-notifier] probe: ip={event.ip} source={event.source}",
flush=True,
)
return
key = f"{event.ip}|{event.product}"
if not dedup.should_send(key):
record_dedup_skip(event.product)
print(
f"[slack-notifier] dedup-skip: ip={event.ip} product={event.product!r}",
flush=True,
)
return
text = (
f":computer: KMS activation\n"
f"• *Client*: `{event.ip}` ({event.source})\n"
f"• *Workstation*: `{event.host}`\n"
f"• *Product*: `{event.product}`\n"
f"• *Status before*: {event.status}"
)
slack_post(text)
record_activation(event.product, event.status)
print(
f"[slack-notifier] sent: ip={event.ip} source={event.source} "
f"product={event.product!r} host={event.host!r}",
flush=True,
)
def main() -> None:
threading.Thread(target=start_metrics_server, daemon=True).start()
@ -182,40 +313,9 @@ def main() -> None:
state: dict = {}
for line in follow(LOG_PATH):
if (m := OPEN_RE.search(line)):
state = {"ip": m.group(1)}
continue
if not state:
continue
if (m := APP_RE.search(line)):
state["app"] = m.group(1)
elif (m := PROD_RE.search(line)):
state["product"] = m.group(1)
elif (m := HOST_RE.search(line)):
state["host"] = m.group(1)
elif (m := STATUS_RE.search(line)):
state["status"] = m.group(1)
elif CLOSE_RE.search(line):
ip = state.get("ip", "?")
product = state.get("product", state.get("app", "unknown"))
host = state.get("host", "?")
status = state.get("status", "unknown")
key = f"{ip}|{product}"
if dedup.should_send(key):
text = (
f":computer: KMS activation\n"
f"• *Client*: `{ip}`\n"
f"• *Workstation*: `{host}`\n"
f"• *Product*: `{product}`\n"
f"• *Status before*: {status}"
)
slack_post(text)
record_activation(product, status)
print(f"[slack-notifier] sent: ip={ip} product={product!r} host={host!r}", flush=True)
else:
record_dedup_skip(product)
print(f"[slack-notifier] dedup-skip: ip={ip} product={product!r}", flush=True)
state = {}
state, event = process_line(line, state)
if event is not None:
handle_event(event, dedup)
if __name__ == "__main__":