fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]
6d224861 came from a --no-checkout worktree whose empty index made the
commit drop every file except two. This restores 05b50d2b's full tree and
correctly adds stacks/stem95su/gdrive-sync.tf + the service-catalog stem95su
entry. Forward-only (parent=6d224861, no force-push); [ci skip] since the
live infra was never applied from the broken commit.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
6d224861c4
commit
fd0f4a0365
1166 changed files with 358546 additions and 0 deletions
152
stacks/kms/.terraform.lock.hcl
generated
Normal file
152
stacks/kms/.terraform.lock.hcl
generated
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
# This file is maintained automatically by "terraform init".
|
||||
# Manual edits may be lost in future updates.
|
||||
|
||||
provider "registry.terraform.io/cloudflare/cloudflare" {
|
||||
version = "4.52.7"
|
||||
constraints = "~> 4.0"
|
||||
hashes = [
|
||||
"h1:pPItIWii5oymR+geZB219ROSPuSODPLTlM4S/u8xLvM=",
|
||||
"zh:0c904ce31a4c6c4a5b3bf7ff1560e77c0cc7e2450c8553ded8e8c90398e1418b",
|
||||
"zh:36183d310c36373fe4cb936b83c595c6fd3b0a94bc7827f28e5789ccbf59752e",
|
||||
"zh:556a568a6f0235e8f41647de9e4d3a1e7b1d6502df8b19b54ec441f1c653ea10",
|
||||
"zh:633ebbd5b0245e75e500ef9be4d9e62288f97e8da3baaa51323892a786d90285",
|
||||
"zh:6acfe60cf52a65ba8f044f748548d2119e7f4fd7f8ebcb14698960d87c68f529",
|
||||
"zh:890df766e9b839623b1f0437355032a3c006226a6c200cd911e15ee1a9014e9f",
|
||||
"zh:904acc31ebb9d6ef68c792074b30532ee61bf515f19e0a3c75b46f126cca1f13",
|
||||
"zh:a1d0a81246afc8750286d3f6fe7a8fbe6460dd2662407b28dbfbabb612e5fa9d",
|
||||
"zh:a41a36fe253fc365fe2b7ffc749624688b2693b4634862fda161179ab100029f",
|
||||
"zh:a7ef269e77ffa8715c8945a2c14322c7ff159ea44c15f62505f3cbb2cae3b32d",
|
||||
"zh:b01aa3bed30610633b762df64332b26f8844a68c3960cebcb30f04918efc67fe",
|
||||
"zh:b069cc2cd18cae10757df3ae030508eac8d55de7e49eda7a5e3e11f2f7fe6455",
|
||||
"zh:b2d2c6313729ebb7465dceece374049e2d08bda34473901be9ff46a8836d42b2",
|
||||
"zh:db0e114edaf4bc2f3d4769958807c83022bfbc619a00bdf4c4bd17faa4ab2d8b",
|
||||
"zh:ecc0aa8b9044f664fd2aaf8fa992d976578f78478980555b4b8f6148e8d1a5fe",
|
||||
]
|
||||
}
|
||||
|
||||
provider "registry.terraform.io/gavinbunney/kubectl" {
|
||||
version = "1.19.0"
|
||||
constraints = "~> 1.14"
|
||||
hashes = [
|
||||
"h1:9QkxPjp0x5FZFfJbE+B7hBOoads9gmdfj9aYu5N4Sfc=",
|
||||
"zh:1dec8766336ac5b00b3d8f62e3fff6390f5f60699c9299920fc9861a76f00c71",
|
||||
"zh:43f101b56b58d7fead6a511728b4e09f7c41dc2e3963f59cf1c146c4767c6cb7",
|
||||
"zh:4c4fbaa44f60e722f25cc05ee11dfaec282893c5c0ffa27bc88c382dbfbaa35c",
|
||||
"zh:51dd23238b7b677b8a1abbfcc7deec53ffa5ec79e58e3b54d6be334d3d01bc0e",
|
||||
"zh:5afc2ebc75b9d708730dbabdc8f94dd559d7f2fc5a31c5101358bd8d016916ba",
|
||||
"zh:6be6e72d4663776390a82a37e34f7359f726d0120df622f4a2b46619338a168e",
|
||||
"zh:72642d5fcf1e3febb6e5d4ae7b592bb9ff3cb220af041dbda893588e4bf30c0c",
|
||||
"zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425",
|
||||
"zh:a1da03e3239867b35812ee031a1060fed6e8d8e458e2eaca48b5dd51b35f56f7",
|
||||
"zh:b98b6a6728fe277fcd133bdfa7237bd733eae233f09653523f14460f608f8ba2",
|
||||
"zh:bb8b071d0437f4767695c6158a3cb70df9f52e377c67019971d888b99147511f",
|
||||
"zh:dc89ce4b63bfef708ec29c17e85ad0232a1794336dc54dd88c3ba0b77e764f71",
|
||||
"zh:dd7dd18f1f8218c6cd19592288fde32dccc743cde05b9feeb2883f37c2ff4b4e",
|
||||
"zh:ec4bd5ab3872dedb39fe528319b4bba609306e12ee90971495f109e142d66310",
|
||||
"zh:f610ead42f724c82f5463e0e71fa735a11ffb6101880665d93f48b4a67b9ad82",
|
||||
]
|
||||
}
|
||||
|
||||
provider "registry.terraform.io/goauthentik/authentik" {
|
||||
version = "2024.12.1"
|
||||
constraints = "~> 2024.10"
|
||||
hashes = [
|
||||
"h1:roBMd+gi+TGgikH/bMzEI8JfvJiMAQWt+8FmokCrQIs=",
|
||||
"zh:090260dc7889ea822ec1d899344e1ee23eba5290461989c0796149c9511f2316",
|
||||
"zh:13c2655ff824b0dc4b9bb832b5ca6d41dba97cb280330258c5fef4115e236209",
|
||||
"zh:166a73c3a810c9c895d68a8ff968158f339f8a2c1c03e20ec9fc5ed99cc64e20",
|
||||
"zh:203777eae1cdc711233315499643180604cff2324411b186b7cf07fdbe16f655",
|
||||
"zh:3b2f18c9a8d28dac74dc6bbf168c946855ab9c68f053578d4630c50d5eaf30a0",
|
||||
"zh:4822275985f6b74b6196c47112316a4252db22cf4ceaef7c9ab4c66d488abf2f",
|
||||
"zh:53ea97562666c8a5a2f6d63d418a302a7f8ee4b7bb7da35dedaa89aa5708b7f0",
|
||||
"zh:56b8a230901e3550c92a1d3f58ee9dafe9853f30fe4315af3ab28ae63262e15d",
|
||||
"zh:6293ab7b1fd8206a0c853591f50186aca4a1eff117b2a773e10760a23a2c83e9",
|
||||
"zh:9433970f79fb92d8aae3ee436db5630ab312c78b6dc9df9c1db3273a18f8aaa1",
|
||||
"zh:95df406214f79b3b98222d7c7fe8fc319a3d90b7a9d53e1d5abbda5dfb8b9436",
|
||||
"zh:a85880da0552a42c8f449390fbd7d8b03541d1a13e04bba9f1404fa658754260",
|
||||
"zh:a95f6e9bd62c67e70eba1b1a14728856b9a6a28cd1e5e3be54a7718882c87e7f",
|
||||
"zh:dd599b51c5beb34a4c6feece244fde07d2558d69929449ab1fd39a5ebe738781",
|
||||
]
|
||||
}
|
||||
|
||||
provider "registry.terraform.io/hashicorp/helm" {
|
||||
version = "3.1.1"
|
||||
hashes = [
|
||||
"h1:47CqNwkxctJtL/N/JuEj+8QMg8mRNI/NWeKO5/ydfZU=",
|
||||
"h1:5b2ojWKT0noujHiweCds37ZreRFRQLNaErdJLusJN88=",
|
||||
"zh:1a6d5ce931708aec29d1f3d9e360c2a0c35ba5a54d03eeaff0ce3ca597cd0275",
|
||||
"zh:3411919ba2a5941801e677f0fea08bdd0ae22ba3c9ce3309f55554699e06524a",
|
||||
"zh:81b36138b8f2320dc7f877b50f9e38f4bc614affe68de885d322629dd0d16a29",
|
||||
"zh:95a2a0a497a6082ee06f95b38bd0f0d6924a65722892a856cfd914c0d117f104",
|
||||
"zh:9d3e78c2d1bb46508b972210ad706dd8c8b106f8b206ecf096cd211c54f46990",
|
||||
"zh:a79139abf687387a6efdbbb04289a0a8e7eaca2bd91cdc0ce68ea4f3286c2c34",
|
||||
"zh:aaa8784be125fbd50c48d84d6e171d3fb6ef84a221dbc5165c067ce05faab4c8",
|
||||
"zh:afecd301f469975c9d8f350cc482fe656e082b6ab0f677d1a816c3c615837cc1",
|
||||
"zh:c54c22b18d48ff9053d899d178d9ffef7d9d19785d9bf310a07d648b7aac075b",
|
||||
"zh:db2eefd55aea48e73384a555c72bac3f7d428e24147bedb64e1a039398e5b903",
|
||||
"zh:ee61666a233533fd2be971091cecc01650561f1585783c381b6f6e8a390198a4",
|
||||
"zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c",
|
||||
]
|
||||
}
|
||||
|
||||
provider "registry.terraform.io/hashicorp/kubernetes" {
|
||||
version = "3.1.0"
|
||||
hashes = [
|
||||
"h1:oodIAuFMikXNmEtil5MQgP4dfSctUBYQiGJfjbsF3NY=",
|
||||
"zh:0215c5c60be62028c09a2f22458e89cda3ef5830a632299f1d401eb3538874b0",
|
||||
"zh:09ebb9f442431e278a310a9423f32caf467cb4b3cad3fe59573ca71fa7b14e20",
|
||||
"zh:0c4e5912f83bb35846ae0a9ae54fc320706ee61894cd21cc6b4181b1c5a2fa5c",
|
||||
"zh:1678c982853ad461e65ccb5e79d585e13ed109dd47dab2a66d3a7a304faeef65",
|
||||
"zh:1c050a5c15e330457a9c18caacf61a923c59d663e13f2962e4b32f04fef523a0",
|
||||
"zh:2c55bcec83be58ec132c7cb0a1ac644758b800d794fdc636d53a0eada0358a3a",
|
||||
"zh:a062bb0aa316c08d8460c66a5d68da71da40de5d3bc3b31abcf3a1a9a19650f1",
|
||||
"zh:a26fdea0afaa9b247c73c0b42843ca51ba7db0ac2571f9d3d50dcabd20ca1b98",
|
||||
"zh:c872c9385a78d502bf5823d61cd3bb0f9a0585030e025eb12585c83451beeaa1",
|
||||
"zh:f180879af931182beee4c8c0d9dab62b81d86f17ddcbe3786ef4c7cec9163a4e",
|
||||
"zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c",
|
||||
"zh:f70f5789264069e0eef06f9b5d5fde955ef7206f7d446d1ce51a4c37a3f3e02f",
|
||||
]
|
||||
}
|
||||
|
||||
provider "registry.terraform.io/hashicorp/vault" {
|
||||
version = "4.8.0"
|
||||
constraints = "~> 4.0"
|
||||
hashes = [
|
||||
"h1:GPfhH6dr1LY0foPBDYv9bEGifx7eSwYqFcEAOWOUxLk=",
|
||||
"h1:aHqgWQhDBMeZO9iUKwJYMlh4q+xNMUlMIcjRbF4d02Y=",
|
||||
"zh:269ab13433f67684012ae7e15876532b0312f5d0d2002a9cf9febb1279ce5ea6",
|
||||
"zh:4babc95bf0c40eb85005db1dc2ca403c46be4a71dd3e409db3711a56f7a5ca0e",
|
||||
"zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
|
||||
"zh:86e27c1c625ecc24446a11eeffc3ac319b36c2b4e51251db8579256a0dbcf136",
|
||||
"zh:a32f31da94824009e26b077374440b52098aecb93c92ff55dc3d31dd37c4ea25",
|
||||
"zh:be0a18c6c0425518bab4fbffd82078b82036a88503b5d76064de551c9f646cbf",
|
||||
"zh:be5a77fdfd36863ebeec79cd12b1d13322ffad6821d157a0b279789fa06b5937",
|
||||
"zh:be8317d142a3caad74c7d936039ae27076a1b2b8312ef5208e2871a5f525977c",
|
||||
"zh:c94a84895a3d9954b80e983eed4603330a5cdbbd8eef5b3c99278c2d1402ef3c",
|
||||
"zh:de1fb712784dd8415f011ca5346a34f87fab6046c730557615247e511dbc7d98",
|
||||
"zh:e3eafae7da550f86cae395d6660b2a0e93ec8d2b0e0e5ef982ec762e961fc952",
|
||||
"zh:ff35fb1ab6add288f0f368981e56f780b50405accd1937131cba1137999c8d83",
|
||||
]
|
||||
}
|
||||
|
||||
provider "registry.terraform.io/telmate/proxmox" {
|
||||
version = "3.0.2-rc07"
|
||||
constraints = "3.0.2-rc07"
|
||||
hashes = [
|
||||
"h1:zp5hpQJQ4t4zROSLqdltVpBO+Riy9VugtfFbpyTw1aM=",
|
||||
"zh:2ee860cd0a368b3eaa53f4a9ea46f16dab8a97929e813ea6ef55183f8112c2ca",
|
||||
"zh:415965fd915bae2040d7f79e45f64d6e3ae61149c10114efeac1b34687d7296c",
|
||||
"zh:6584b2055df0e32062561c615e3b6b2c291ca8c959440adda09ef3ec1e1436bd",
|
||||
"zh:65dcfad71928e0a8dd9befc22524ed686be5020b0024dc5cca5184c7420eeb6b",
|
||||
"zh:7253dc29bd265d33f2791ac4f779c5413f16720bb717de8e6c5fcb2c858648ea",
|
||||
"zh:7ec8993da10a47606670f9f67cfd10719a7580641d11c7aa761121c4a2bd66fb",
|
||||
"zh:999a3f7a9dcf517967fc537e6ec930a8172203642fb01b8e1f78f908373db210",
|
||||
"zh:a50e6df7280eb6584a5fd2456e3f5b6df13b2ec8a7fa4605511e438e1863be42",
|
||||
"zh:b25b329a1e42681c509d027fee0365414f0cc5062b65690cfc3386aab16132ae",
|
||||
"zh:c028877fdb438ece48f7bc02b65bbae9ca7b7befbd260e519ccab6c0cbb39f26",
|
||||
"zh:cf0eaa3ea9fcc6d62793637947f1b8d7c885b6ad74695ab47e134e4ff132190f",
|
||||
"zh:d5ade3fae031cc629b7c512a7b60e46570f4c41665e88a595d7efd943dde5ab2",
|
||||
"zh:f388c15ad1ecfc09e7361e3b98bae9b627a3a85f7b908c9f40650969c949901c",
|
||||
"zh:f415cc6f735a3971faae6ac24034afdb9ee83373ef8de19a9631c187d5adc7db",
|
||||
]
|
||||
}
|
||||
2
stacks/kms/files/.gitignore
vendored
Normal file
2
stacks/kms/files/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
__pycache__/
|
||||
*.pyc
|
||||
33
stacks/kms/files/diag-collector.py
Normal file
33
stacks/kms/files/diag-collector.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
import json
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
|
||||
# Upper bound on request-body bytes read and on characters emitted per log line.
MAX = 16384


class Handler(BaseHTTPRequestHandler):
    """Tiny diagnostics sink.

    POST /diag  -> body is parsed as JSON and printed to stdout as one
                   ``KMSDIAG <json>`` line; always answered with 204.
    GET /healthz, GET /diag -> 200 with an empty body.
    Any other path -> 404.
    """

    def _respond(self, code):
        """Send a body-less response carrying only the status ``code``."""
        self.send_response(code)
        self.send_header('Content-Length', '0')
        self.end_headers()

    def do_POST(self):
        """Log one diagnostics payload; never fail the caller (always 204 on /diag)."""
        if self.path.rstrip('/') != '/diag':
            self._respond(404)
            return
        try:
            n = int(self.headers.get('Content-Length', 0) or 0)
            # Clamp to [0, MAX] so a huge or negative Content-Length cannot
            # make us block on / buffer an arbitrary amount of data.
            n = min(n, MAX) if n > 0 else 0
            raw = self.rfile.read(n).decode('utf-8', 'replace') if n else ''
            obj = json.loads(raw) if raw.strip() else {}
            if not isinstance(obj, dict):
                # Non-object JSON (list, string, number...) is wrapped so the
                # log line is always a JSON object.
                obj = {'_raw': str(obj)[:1000]}
            # First hop of X-Forwarded-For when present; an absent, empty or
            # blank header falls back to the socket peer address instead of
            # logging an empty string.
            forwarded = (self.headers.get('X-Forwarded-For') or '').split(',')[0].strip()
            obj['_ip'] = forwarded or self.client_address[0]
            print('KMSDIAG ' + json.dumps(obj, separators=(',', ':'))[:MAX], flush=True)
        except Exception as e:
            # Best effort by design: a malformed payload is logged, not rejected.
            print('KMSDIAG_ERR ' + repr(e)[:500], flush=True)
        self._respond(204)

    def do_GET(self):
        """Health endpoint: 200 for /healthz and /diag, 404 otherwise."""
        self._respond(200 if self.path.rstrip('/') in ('/healthz', '/diag') else 404)

    def log_message(self, *a):
        """Suppress the default per-request access log."""
        pass
|
||||
|
||||
if __name__ == '__main__':
|
||||
HTTPServer(('0.0.0.0', 9102), Handler).serve_forever()
|
||||
335
stacks/kms/files/slack-notifier.py
Normal file
335
stacks/kms/files/slack-notifier.py
Normal file
|
|
@ -0,0 +1,335 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tail vlmcsd verbose log; post a Slack message per activation, and expose
|
||||
Prometheus metrics on /metrics for activation counts.
|
||||
|
||||
vlmcsd verbose output emits a multi-line block per request:
|
||||
<ts>: IPv4 connection accepted: <ip>:<port>.
|
||||
<ts>: <<< Incoming KMS request
|
||||
<ts>: Application ID : <uuid> (<name>)
|
||||
<ts>: Activation ID (Product): <uuid> (<product>)
|
||||
<ts>: Workstation name : <hostname>
|
||||
...
|
||||
<ts>: IPv4 connection closed: <ip>:<port>.
|
||||
|
||||
A bare TCP open/close pair (no Application/Activation lines) is a probe —
|
||||
typically Uptime Kuma's port-type monitor on windows-kms.kms:1688. Probes
|
||||
are counted in `kms_connection_probes_total` but never posted to Slack.
|
||||
|
||||
Real activations dedupe by (client_ip, product) within DEDUP_WINDOW_SECONDS
|
||||
to avoid spam from Windows' default 7-day re-activation cycle.
|
||||
|
||||
Prometheus metrics (text format, no client_ip label — cardinality):
|
||||
kms_activations_total{product, status} counter
|
||||
kms_activations_dedup_skipped_total{product} counter
|
||||
kms_connection_probes_total{source} counter (probe-only conns)
|
||||
kms_last_activation_timestamp_seconds gauge
|
||||
kms_slack_notifier_up gauge (heartbeat)
|
||||
"""
|
||||
import ipaddress
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from collections import OrderedDict
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
|
||||
LOG_PATH = os.environ.get("VLMCSD_LOG", "/var/log/vlmcsd/vlmcsd.log")
|
||||
WEBHOOK = os.environ["SLACK_WEBHOOK_URL"]
|
||||
CHANNEL = os.environ.get("SLACK_CHANNEL", "#alerts")
|
||||
DEDUP_WINDOW = int(os.environ.get("DEDUP_WINDOW_SECONDS", "3600"))
|
||||
DEDUP_MAX = 4096
|
||||
METRICS_PORT = int(os.environ.get("METRICS_PORT", "9101"))
|
||||
|
||||
OPEN_RE = re.compile(r":\s*IPv[46] connection accepted:\s*([0-9a-f.:\[\]]+):(\d+)")
|
||||
CLOSE_RE = re.compile(r":\s*IPv[46] connection closed:\s*([0-9a-f.:\[\]]+):(\d+)")
|
||||
APP_RE = re.compile(r":\s*Application ID\s*:\s*[0-9a-f-]+\s*\(([^)]+)\)")
|
||||
PROD_RE = re.compile(r":\s*Activation ID \(Product\)\s*:\s*[0-9a-f-]+\s*\(([^)]+)\)")
|
||||
HOST_RE = re.compile(r":\s*Workstation name\s*:\s*(.+?)\s*$")
|
||||
STATUS_RE = re.compile(r":\s*Licensing status\s*:\s*\d+\s*\((.+?)\)\s*$")
|
||||
|
||||
# Pod CIDR (Calico, kube-proxy SNAT-free intra-cluster traffic) and cluster
|
||||
# LAN (kube-proxy SNATs ETP=Cluster external traffic to a node IP). Anything
|
||||
# else is a real client IP that arrived via ETP=Local or pod-to-svc routing.
|
||||
POD_CIDR = ipaddress.ip_network("10.10.0.0/16")
|
||||
CLUSTER_LAN_CIDR = ipaddress.ip_network("10.0.20.0/24")
|
||||
|
||||
_metrics_lock = threading.Lock()
|
||||
_activations: dict = {}
|
||||
_dedup_skipped: dict = {}
|
||||
_probes: dict = {}
|
||||
_last_activation_ts: float = 0.0
|
||||
|
||||
|
||||
def _esc(value: str) -> str:
|
||||
return str(value).replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
|
||||
|
||||
|
||||
def classify_source(ip: str) -> str:
    """Return 'internal_pod', 'cluster_node', or 'external' for a vlmcsd-logged IP.

    Accepts bare IPv4/IPv6 text as well as bracketed IPv6 (``[::1]``).
    Unparseable input is treated as 'external'. An IPv4-mapped IPv6 address
    (``::ffff:a.b.c.d``) is classified by its embedded IPv4 address, so the
    same peer gets the same label whether it was logged as IPv4 or IPv6.
    """
    raw = ip.strip().strip("[]")
    try:
        addr = ipaddress.ip_address(raw)
    except ValueError:
        return "external"
    if isinstance(addr, ipaddress.IPv6Address) and addr.ipv4_mapped is not None:
        # Unwrap ::ffff:a.b.c.d so the IPv4 CIDR checks below still apply.
        addr = addr.ipv4_mapped
    if isinstance(addr, ipaddress.IPv4Address):
        if addr in POD_CIDR:
            return "internal_pod"
        if addr in CLUSTER_LAN_CIDR:
            return "cluster_node"
    return "external"
|
||||
|
||||
|
||||
def is_probe(state: dict) -> bool:
    """True when the connection record carries neither an 'app' nor a 'product' key."""
    return not any(field in state for field in ("app", "product"))
|
||||
|
||||
|
||||
class Event:
    """One parsed connection outcome produced by the log state machine.

    ``kind``, ``ip`` and ``source`` are positional; the activation details
    (``app``, ``product``, ``host``, ``status``) are keyword-only and default
    to the empty string.
    """

    __slots__ = ("kind", "ip", "source", "app", "product", "host", "status")

    def __init__(self, kind: str, ip: str, source: str, *, app: str = "",
                 product: str = "", host: str = "", status: str = "") -> None:
        self.kind, self.ip, self.source = kind, ip, source
        self.app, self.product = app, product
        self.host, self.status = host, status
|
||||
|
||||
|
||||
def process_line(line: str, state: dict):
    """Feed one log line to the parser; returns ``(state, event_or_None)``.

    ``state`` maps ``"ip:port"`` to a per-connection dict, because
    connections interleave in the log. The reserved key ``"__current"``
    names the connection opened most recently; detail lines carry no
    ip:port of their own and are attributed to that connection.

    * "connection accepted" creates the record and marks it current.
    * "connection closed" removes the record and emits a probe or
      activation event; a close with no matching open emits nothing.
    * Any other line may fill one of app / product / host / status on
      the current connection.
    """
    opened = OPEN_RE.search(line)
    if opened:
        ip, port = opened.group(1), opened.group(2)
        key = f"{ip}:{port}"
        state[key] = {"ip": ip}
        state["__current"] = key
        return state, None

    closed = CLOSE_RE.search(line)
    if closed:
        key = f"{closed.group(1)}:{closed.group(2)}"
        conn = state.pop(key, None)
        # Only forget the "current" pointer if it refers to this connection.
        if state.get("__current") == key:
            state.pop("__current", None)
        if conn is None:
            # Orphan close (e.g., notifier started mid-conn): nothing to report.
            return state, None
        ip = conn.get("ip", "?")
        source = classify_source(ip)
        if is_probe(conn):
            return state, Event("probe", ip, source)
        details = {
            "app": conn.get("app", ""),
            # Fall back to the application name when no product line was seen.
            "product": conn.get("product", conn.get("app", "unknown")),
            "host": conn.get("host", "?"),
            "status": conn.get("status", "unknown"),
        }
        return state, Event("activation", ip, source, **details)

    current = state.get("__current")
    if not current or current not in state:
        return state, None
    conn = state[current]
    # First matching pattern wins, in the same order as the original chain.
    for field, pattern in (
        ("app", APP_RE),
        ("product", PROD_RE),
        ("host", HOST_RE),
        ("status", STATUS_RE),
    ):
        found = pattern.search(line)
        if found:
            conn[field] = found.group(1)
            break
    return state, None
|
||||
|
||||
|
||||
def record_activation(product: str, status: str) -> None:
    """Increment the (product, status) activation counter and stamp the last-activation time."""
    global _last_activation_ts
    label_pair = (product, status)
    with _metrics_lock:
        previous = _activations.get(label_pair, 0)
        _activations[label_pair] = previous + 1
        _last_activation_ts = time.time()
|
||||
|
||||
|
||||
def record_dedup_skip(product: str) -> None:
    """Increment the per-product counter of activations suppressed by dedup."""
    with _metrics_lock:
        previous = _dedup_skipped.get(product, 0)
        _dedup_skipped[product] = previous + 1
|
||||
|
||||
|
||||
def record_probe(source: str) -> None:
    """Increment the per-source counter of probe-only connections."""
    with _metrics_lock:
        previous = _probes.get(source, 0)
        _probes[source] = previous + 1
|
||||
|
||||
|
||||
def render_metrics() -> bytes:
    """Render all counters and gauges as a UTF-8 Prometheus text exposition.

    The shared dicts are copied while holding the metrics lock; formatting
    happens afterwards on the copies.
    """
    with _metrics_lock:
        activations = dict(_activations)
        dedup_skipped = dict(_dedup_skipped)
        probes = dict(_probes)
        last_ts = _last_activation_ts

    lines = [
        "# HELP kms_activations_total KMS activation events that resulted in a Slack post.",
        "# TYPE kms_activations_total counter",
    ]
    lines.extend(
        f'kms_activations_total{{product="{_esc(product)}",status="{_esc(status)}"}} {count}'
        for (product, status), count in sorted(activations.items())
    )

    lines += [
        "# HELP kms_activations_dedup_skipped_total KMS activation events suppressed by dedup window.",
        "# TYPE kms_activations_dedup_skipped_total counter",
    ]
    lines.extend(
        f'kms_activations_dedup_skipped_total{{product="{_esc(product)}"}} {count}'
        for product, count in sorted(dedup_skipped.items())
    )

    lines += [
        "# HELP kms_connection_probes_total Probe-only TCP connections (open+close, no KMS RPC).",
        "# TYPE kms_connection_probes_total counter",
    ]
    lines.extend(
        f'kms_connection_probes_total{{source="{_esc(source)}"}} {count}'
        for source, count in sorted(probes.items())
    )

    lines += [
        "# HELP kms_last_activation_timestamp_seconds Unix ts of the last non-deduped activation.",
        "# TYPE kms_last_activation_timestamp_seconds gauge",
        f"kms_last_activation_timestamp_seconds {last_ts}",
        "# HELP kms_slack_notifier_up 1 while the notifier process is running.",
        "# TYPE kms_slack_notifier_up gauge",
        "kms_slack_notifier_up 1",
    ]

    # Every line, including the last, is newline-terminated.
    return "".join(f"{entry}\n" for entry in lines).encode("utf-8")
|
||||
|
||||
|
||||
class MetricsHandler(BaseHTTPRequestHandler):
    """HTTP handler serving /healthz (liveness) and /metrics (Prometheus text)."""

    def do_GET(self):
        """Route GET requests: /healthz -> "ok", /metrics -> exposition, else 404."""
        route = self.path
        if route == "/healthz":
            self.send_response(200)
            self.send_header("Content-Type", "text/plain")
            self.end_headers()
            self.wfile.write(b"ok\n")
        elif route == "/metrics":
            payload = render_metrics()
            self.send_response(200)
            self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
            self.send_header("Content-Length", str(len(payload)))
            self.end_headers()
            self.wfile.write(payload)
        else:
            self.send_response(404)
            self.end_headers()

    def log_message(self, *args, **kwargs):
        """Silence the default per-request access log."""
        return None
|
||||
|
||||
|
||||
def start_metrics_server() -> None:
    """Bind the metrics HTTP server on all interfaces and serve until the process exits."""
    bind_address = ("0.0.0.0", METRICS_PORT)
    httpd = HTTPServer(bind_address, MetricsHandler)
    print(f"[slack-notifier] metrics on :{METRICS_PORT}/metrics", flush=True)
    httpd.serve_forever()
|
||||
|
||||
|
||||
def slack_post(text: str) -> None:
    """POST ``text`` to the configured Slack webhook (best effort).

    Delivery failures are reported on stderr and swallowed so that a Slack
    or network outage never stops the log-tailing loop.
    """
    # Local import: only needed for the exception type caught below.
    import http.client

    payload = json.dumps({"channel": CHANNEL, "text": text, "username": "kms.viktorbarzin.me", "icon_emoji": ":computer:"}).encode("utf-8")
    req = urllib.request.Request(WEBHOOK, data=payload, headers={"Content-Type": "application/json"})
    try:
        # Context manager closes the response/socket even if read() fails.
        with urllib.request.urlopen(req, timeout=10) as resp:
            resp.read()
    except (OSError, http.client.HTTPException) as exc:
        # OSError covers URLError/HTTPError plus read timeouts and connection
        # resets raised while reading the body; HTTPException covers
        # malformed or truncated responses.
        print(f"[slack] post failed: {exc}", file=sys.stderr)
|
||||
|
||||
|
||||
class DedupCache(OrderedDict):
    """Insertion-ordered map of dedup key -> time it was last allowed through."""

    def should_send(self, key: str) -> bool:
        """Return True (and record ``key``) unless it was sent within DEDUP_WINDOW.

        Expired entries are dropped from the front first; when the cache is
        at DEDUP_MAX the oldest entry is evicted to make room.
        """
        now = time.time()

        # Entries are kept oldest-first, so stop at the first one still fresh.
        while self:
            oldest_stamp = next(iter(self.values()))
            if not (now - oldest_stamp) > DEDUP_WINDOW:
                break
            self.popitem(last=False)

        if key in self:
            age = now - self[key]
            if age < DEDUP_WINDOW:
                return False

        if len(self) >= DEDUP_MAX:
            self.popitem(last=False)

        self[key] = now
        # Re-assigning an existing key keeps its old slot; move it to the back.
        self.move_to_end(key)
        return True
|
||||
|
||||
|
||||
def follow(path: str):
    """Yield complete lines appended to ``path``, like ``tail -F``.

    Waits for the file to exist, starts at its current end, and then yields
    each newline-terminated line (without the trailing newline) forever.

    * A line still being written (no trailing newline yet) is buffered
      until its newline arrives, so it is never yielded as two fragments.
    * Rotation (the path now refers to a different inode): whatever is left
      in the old file is drained and yielded, then reading continues from
      the start of the new file. The new file is opened before the old
      handle is closed, so a failed open leaves the old handle usable.
    * In-place truncation (same inode, size shrank): reading restarts from
      offset 0.
    * If the path temporarily does not exist, polling simply continues.
    """
    while not os.path.exists(path):
        time.sleep(1)
    fh = open(path, "r", encoding="utf-8", errors="replace")
    try:
        fh.seek(0, 2)
        inode = os.fstat(fh.fileno()).st_ino
        pending = ""  # text of a line whose newline has not been written yet
        while True:
            chunk = fh.readline()
            if chunk:
                pending += chunk
                if pending.endswith("\n"):
                    complete, pending = pending, ""
                    yield complete.rstrip("\n")
                continue

            # At EOF: remember how large the file was when we ran dry so a
            # later, smaller size can be recognised as a truncation.
            size_at_eof = os.fstat(fh.fileno()).st_size
            time.sleep(0.5)

            backlog = []
            try:
                st = os.stat(path)
                if st.st_ino != inode:
                    replacement = open(path, "r", encoding="utf-8", errors="replace")
                    # Drain what was appended to the old file before rotation.
                    leftover = pending + fh.read()
                    pending = ""
                    fh.close()
                    fh = replacement
                    inode = os.fstat(fh.fileno()).st_ino
                    backlog = leftover.split("\n")
                elif st.st_size < size_at_eof:
                    # Truncated in place: our offset is past the new EOF.
                    fh.seek(0)
                    backlog = pending.split("\n")
                    pending = ""
            except FileNotFoundError:
                time.sleep(1)

            # split() leaves a trailing "" for newline-terminated text.
            if backlog and backlog[-1] == "":
                backlog.pop()
            for stale_line in backlog:
                yield stale_line
    finally:
        # Runs when the generator is closed or garbage-collected.
        fh.close()
|
||||
|
||||
|
||||
def handle_event(event: "Event", dedup: "DedupCache") -> None:
    """Account for one parsed event and, for fresh activations, notify Slack.

    * Probe events only bump the probe counter and are logged.
    * Activations are deduplicated on ``ip|product``; a suppressed one bumps
      the dedup-skip counter and is logged.
    * Otherwise a Slack message is posted, the activation counter is bumped
      and a "sent" line is logged.

    Field values come from log lines that echo client-supplied data (e.g.
    the workstation name), so ``&``, ``<`` and ``>`` are escaped before
    being placed in the Slack message; unescaped they would be interpreted
    as Slack control sequences such as ``<!channel>`` or links.
    """
    if event.kind == "probe":
        record_probe(event.source)
        print(
            f"[slack-notifier] probe: ip={event.ip} source={event.source}",
            flush=True,
        )
        return

    key = f"{event.ip}|{event.product}"
    if not dedup.should_send(key):
        record_dedup_skip(event.product)
        print(
            f"[slack-notifier] dedup-skip: ip={event.ip} product={event.product!r}",
            flush=True,
        )
        return

    def _slack_safe(value) -> str:
        # '&' first so the entities produced below are not double-escaped.
        return str(value).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

    text = (
        f":computer: KMS activation\n"
        f"• *Client*: `{_slack_safe(event.ip)}` ({_slack_safe(event.source)})\n"
        f"• *Workstation*: `{_slack_safe(event.host)}`\n"
        f"• *Product*: `{_slack_safe(event.product)}`\n"
        f"• *Status before*: {_slack_safe(event.status)}"
    )
    slack_post(text)
    record_activation(event.product, event.status)
    print(
        f"[slack-notifier] sent: ip={event.ip} source={event.source} "
        f"product={event.product!r} host={event.host!r}",
        flush=True,
    )
|
||||
|
||||
|
||||
def main() -> None:
    """Start the metrics server in a daemon thread, then tail the log forever."""
    metrics_thread = threading.Thread(target=start_metrics_server, daemon=True)
    metrics_thread.start()

    dedup = DedupCache()
    print(f"[slack-notifier] tailing {LOG_PATH}, posting to {CHANNEL} via Slack", flush=True)

    state: dict = {}
    for line in follow(LOG_PATH):
        state, event = process_line(line, state)
        if event is None:
            continue
        handle_event(event, dedup)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
151
stacks/kms/files/test_slack_notifier.py
Normal file
151
stacks/kms/files/test_slack_notifier.py
Normal file
|
|
@ -0,0 +1,151 @@
|
|||
"""Unit tests for slack_notifier classification + state machine.
|
||||
|
||||
Run with: cd infra/stacks/kms/files && python3 -m unittest test_slack_notifier
|
||||
"""
|
||||
import importlib.util
|
||||
import os
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
# Load the notifier module from the dashed filename without executing main().
|
||||
os.environ.setdefault("SLACK_WEBHOOK_URL", "http://example.invalid/webhook")
|
||||
_spec = importlib.util.spec_from_file_location(
|
||||
"slack_notifier", Path(__file__).parent / "slack-notifier.py"
|
||||
)
|
||||
nm = importlib.util.module_from_spec(_spec)
|
||||
_spec.loader.exec_module(nm)
|
||||
|
||||
|
||||
class ClassifySourceTests(unittest.TestCase):
    """classify_source() maps an address to internal_pod / cluster_node / external."""

    def _assert_all(self, addresses, expected):
        # Same assertions as writing them out one by one, in the same order.
        for address in addresses:
            self.assertEqual(nm.classify_source(address), expected)

    def test_pod_cidr_is_internal_pod(self):
        self._assert_all(("10.10.107.224", "10.10.0.1"), "internal_pod")

    def test_cluster_lan_is_cluster_node(self):
        self._assert_all(("10.0.20.103", "10.0.20.200"), "cluster_node")

    def test_unknown_source_is_external(self):
        self._assert_all(("8.8.8.8", "203.0.113.42"), "external")

    def test_ipv6_external_default(self):
        self._assert_all(("[2001:db8::1]",), "external")
|
||||
|
||||
|
||||
class IsProbeTests(unittest.TestCase):
    """is_probe() is True only when neither 'app' nor 'product' was recorded."""

    def test_open_close_only_is_probe(self):
        conn = {"ip": "10.10.107.224"}
        self.assertTrue(nm.is_probe(conn))

    def test_application_id_only_is_not_probe(self):
        conn = {"ip": "10.0.20.103", "app": "Windows"}
        self.assertFalse(nm.is_probe(conn))

    def test_product_only_is_not_probe(self):
        conn = {"ip": "10.0.20.103", "product": "Office 2021"}
        self.assertFalse(nm.is_probe(conn))

    def test_full_activation_is_not_probe(self):
        conn = dict(
            ip="10.0.20.103",
            app="Windows",
            product="Windows 11 Pro",
            host="DESKTOP-X",
            status="Notification",
        )
        self.assertFalse(nm.is_probe(conn))
|
||||
|
||||
|
||||
class StateMachineTests(unittest.TestCase):
    """Drive the regex parser through real-shaped vlmcsd log blocks."""

    PROBE_BLOCK = [
        "2026-05-10 11:00:00: IPv4 connection accepted: 10.10.107.224:54321.",
        "2026-05-10 11:00:00: IPv4 connection closed: 10.10.107.224:54321.",
    ]

    ACTIVATION_BLOCK = [
        "2026-05-10 11:00:01: IPv4 connection accepted: 10.0.20.103:50001.",
        "2026-05-10 11:00:01: <<< Incoming KMS request",
        "2026-05-10 11:00:01: Application ID : 55c92734-d682-4d71-983e-d6ec3f16059f (Windows)",
        "2026-05-10 11:00:01: Activation ID (Product): 73111121-5638-40f6-bc11-f1d7b0d64300 (Windows 11 Pro)",
        "2026-05-10 11:00:01: Workstation name : DESKTOP-MO2323B",
        "2026-05-10 11:00:01: Licensing status : 2 (Notification)",
        "2026-05-10 11:00:01: IPv4 connection closed: 10.0.20.103:50001.",
    ]

    def _drive(self, lines):
        """Feed ``lines`` through process_line; return (emitted events, final state)."""
        emitted = []
        parser_state = {}
        for entry in lines:
            parser_state, outcome = nm.process_line(entry, parser_state)
            if outcome is None:
                continue
            emitted.append(outcome)
        return emitted, parser_state

    def test_probe_block_emits_probe_event(self):
        events, state = self._drive(self.PROBE_BLOCK)
        self.assertEqual(len(events), 1)
        probe = events[0]
        self.assertEqual(probe.kind, "probe")
        self.assertEqual(probe.ip, "10.10.107.224")
        self.assertEqual(state, {})

    def test_activation_block_emits_activation_event(self):
        events, state = self._drive(self.ACTIVATION_BLOCK)
        self.assertEqual(len(events), 1)
        activation = events[0]
        self.assertEqual(activation.kind, "activation")
        self.assertEqual(activation.ip, "10.0.20.103")
        self.assertEqual(activation.product, "Windows 11 Pro")
        self.assertEqual(activation.host, "DESKTOP-MO2323B")
        self.assertEqual(activation.status, "Notification")
        self.assertEqual(state, {})

    def test_interleaved_probe_then_activation(self):
        events, _ = self._drive(self.PROBE_BLOCK + self.ACTIVATION_BLOCK)
        self.assertEqual([e.kind for e in events], ["probe", "activation"])

    def test_kubelet_probe_during_long_activation(self):
        """vlmcsd is multi-threaded. While a real KMS RPC's connection
        sits open (Windows holds it ~30s), kubelet's TCP readiness probe
        every 5s opens+closes its own connection. The notifier MUST NOT
        let the probe's OPEN/CLOSE wipe the in-flight activation's state.
        Reproduces the production bug seen on 2026-05-10.
        """
        activation_open = [
            "2026-05-10 13:12:17: IPv4 connection accepted: 192.168.1.230:53140.",
            "2026-05-10 13:12:17: <<< Incoming KMS request",
            "2026-05-10 13:12:17: Licensing status : 1 (Licensed)",
            "2026-05-10 13:12:17: Application ID : 55c92734-d682-4d71-983e-d6ec3f16059f (Windows)",
            "2026-05-10 13:12:17: Activation ID (Product) : 2de67392-b7a7-462a-b1ca-108dd189f588 (Windows 10 Professional)",
            "2026-05-10 13:12:17: Workstation name : WIN10Pro-DS32.viktorbarzin.lan",
        ]
        # kubelet probes arrive mid-flight and MUST NOT clobber 53140's state
        kubelet_probes = [
            "2026-05-10 13:12:19: IPv4 connection accepted: 10.0.20.102:46498.",
            "2026-05-10 13:12:19: IPv4 connection closed: 10.0.20.102:46498.",
            "2026-05-10 13:12:24: IPv4 connection accepted: 10.0.20.102:54454.",
            "2026-05-10 13:12:24: IPv4 connection closed: 10.0.20.102:54454.",
        ]
        # the activation closes 31s after it opened
        activation_close = [
            "2026-05-10 13:12:48: IPv4 connection closed: 192.168.1.230:53140.",
        ]
        events, _ = self._drive(activation_open + kubelet_probes + activation_close)
        self.assertEqual([e.kind for e in events], ["probe", "probe", "activation"])
        activation = events[-1]
        self.assertEqual(activation.ip, "192.168.1.230")
        self.assertEqual(activation.product, "Windows 10 Professional")
        self.assertEqual(activation.host, "WIN10Pro-DS32.viktorbarzin.lan")
        self.assertEqual(activation.status, "Licensed")

    def test_orphan_close_no_event(self):
        """If the notifier starts mid-conn, the open was missed but the
        close still fires. We MUST NOT emit an event for that — it would
        show up with empty fields and look like a probe."""
        orphan_close = "2026-05-10 13:00:00: IPv4 connection closed: 192.168.1.230:55555."
        events, _ = self._drive([orphan_close])
        self.assertEqual(events, [])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
522
stacks/kms/main.tf
Normal file
522
stacks/kms/main.tf
Normal file
|
|
@ -0,0 +1,522 @@
|
|||
# Name of the TLS secret; passed through to the tls_secret module in this stack.
variable "tls_secret_name" {
  type      = string
  sensitive = true
}
|
||||
|
||||
|
||||
# Namespace that holds every resource of the kms stack.
resource "kubernetes_namespace" "kms" {
  metadata {
    name = "kms"
    labels = {
      # `=` instead of `:` so every entry in this map uses the same separator.
      "istio-injection"  = "disabled"
      tier               = local.tiers.aux
      "keel.sh/enrolled" = "true"
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
  }
}
|
||||
|
||||
# Instantiates the shared setup_tls_secret module for the kms namespace,
# using the secret name supplied via var.tls_secret_name.
module "tls_secret" {
  source          = "../../modules/kubernetes/setup_tls_secret"
  namespace       = kubernetes_namespace.kms.metadata[0].name
  tls_secret_name = var.tls_secret_name
}
|
||||
|
||||
# Single-replica Deployment for the kms-web-page container (listens on TCP 80).
resource "kubernetes_deployment" "kms-web-page" {
  metadata {
    name      = "kms-web-page"
    namespace = kubernetes_namespace.kms.metadata[0].name
    labels = {
      "app"                           = "kms-web-page"
      "kubernetes.io/cluster-service" = "true"
      tier                            = local.tiers.aux
    }
  }
  spec {
    replicas = 1
    # Must match the pod template labels below.
    selector {
      match_labels = {
        "app" = "kms-web-page"
      }
    }
    template {
      metadata {
        labels = {
          "app"                           = "kms-web-page"
          "kubernetes.io/cluster-service" = "true"
        }
      }
      spec {
        image_pull_secrets {
          name = "registry-credentials"
        }
        container {
          # Initial tag only: later changes to the image are ignored (see lifecycle below).
          image             = "forgejo.viktorbarzin.me/viktor/kms-website:${var.image_tag}"
          name              = "kms-web-page"
          image_pull_policy = "IfNotPresent"
          # Memory request equals the memory limit; no CPU limit is set.
          resources {
            limits = {
              memory = "64Mi"
            }
            requests = {
              cpu    = "10m"
              memory = "64Mi"
            }
          }
          port {
            container_port = 80
            protocol       = "TCP"
          }
        }
      }
    }
  }
  lifecycle {
    ignore_changes = [
      # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
      spec[0].template[0].spec[0].dns_config,
      # CI (Woodpecker) manages the live image tag via `kubectl set image`
      spec[0].template[0].spec[0].container[0].image,
    ]
  }
}
|
||||
|
||||
# Service named "kms" that selects the kms-web-page pods on TCP port 80.
resource "kubernetes_service" "kms-web-page" {
  metadata {
    name      = "kms"
    namespace = kubernetes_namespace.kms.metadata[0].name

    labels = {
      app = "kms-web-page"
    }
  }

  spec {
    selector = {
      app = "kms-web-page"
    }

    port {
      protocol = "TCP"
      port     = "80"
    }
  }
}
|
||||
|
||||
# Anubis instance for the website; its target is the kms-web-page Service,
# addressed by its in-cluster DNS name.
module "anubis" {
  source = "../../modules/kubernetes/anubis_instance"

  name      = "kms"
  namespace = kubernetes_namespace.kms.metadata[0].name

  target_url       = "http://${kubernetes_service.kms-web-page.metadata[0].name}.${kubernetes_namespace.kms.metadata[0].name}.svc.cluster.local"
  shared_store_url = "redis://redis-master.redis.svc.cluster.local:6379/8"
}
|
||||
|
||||
# Main ingress for the website. Traffic is routed to the Anubis service
# exposed by module.anubis rather than straight to kms-web-page.
module "ingress" {
  source = "../../modules/kubernetes/ingress_factory"

  name      = "kms"
  namespace = kubernetes_namespace.kms.metadata[0].name

  auth     = "none" # Anubis-fronted; PoW challenge gates bots, no Authentik
  dns_type = "non-proxied"

  service_name = module.anubis.service_name
  port         = module.anubis.service_port

  tls_secret_name   = var.tls_secret_name
  extra_middlewares = ["traefik-x402@kubernetescrd"]
  anti_ai_scraping  = false

  extra_annotations = {
    "gethomepage.dev/enabled"      = "true"
    "gethomepage.dev/name"         = "KMS"
    "gethomepage.dev/description"  = "License activation server"
    "gethomepage.dev/icon"         = "microsoft.png"
    "gethomepage.dev/group"        = "Other"
    "gethomepage.dev/pod-selector" = ""
  }
}
|
||||
|
||||
# Carve-out for /scripts/* and /keys.json.
#
# /scripts holds the PowerShell activators (kms-bootstrap.ps1, setup-kms.ps1)
# that visitors fetch with `iwr ... | iex`; /keys.json is the published GVLK
# list those scripts fetch to auto-select a key. Anubis cannot gate these
# paths: PowerShell/curl are non-JS clients that can't solve the PoW
# challenge, so they would receive the challenge HTML and the script (or
# ConvertFrom-Json) would choke on it. This ingress therefore points at the
# bare kms-web-page nginx service, bypassing the Anubis proxy. Traefik
# prioritises the longer /scripts and /keys.json prefixes over the main "/"
# router.
module "ingress_scripts" {
  source = "../../modules/kubernetes/ingress_factory"

  name      = "kms-scripts"
  namespace = kubernetes_namespace.kms.metadata[0].name

  # auth = "none": public read-only static scripts + key list (iwr|iex). No login, no PoW.
  auth = "none"

  service_name = kubernetes_service.kms-web-page.metadata[0].name
  port         = "80"
  ingress_path = ["/scripts", "/keys.json"]

  full_host = "kms.viktorbarzin.me" # MUST match the main ingress host; without this the factory derives kms-scripts.viktorbarzin.me and the carve-out never matches.
  dns_type  = "none"                # DNS already owned by the main kms ingress.

  tls_secret_name  = var.tls_secret_name
  anti_ai_scraping = false # Static scripts + key list; nothing for scrapers to mine.
}
|
||||
|
||||
# Anonymous diagnostics collector for the PowerShell activation scripts.
#
# The activators POST a tiny JSON blob (action/outcome/error) to /diag so
# script failures are captured. The collector prints each event to stdout,
# which Loki scrapes, making them searchable in Grafana. Loki only: no Slack,
# no Prometheus. Like /scripts, /diag must bypass Anubis: PowerShell/curl
# can't solve the PoW challenge, so the carve-out below points at the bare
# collector.
#
# This ConfigMap carries the collector script itself, read from files/.
resource "kubernetes_config_map" "kms_diag_collector" {
  metadata {
    namespace = kubernetes_namespace.kms.metadata[0].name
    name      = "kms-diag-collector"
  }

  data = {
    "diag-collector.py" = file("${path.module}/files/diag-collector.py")
  }
}
|
||||
|
||||
# Single-replica Deployment running the diagnostics collector script from the
# kms-diag-collector ConfigMap (mounted read-only at /app) on python:3.12-alpine.
resource "kubernetes_deployment" "kms_diag" {
  metadata {
    name      = "kms-diag"
    namespace = kubernetes_namespace.kms.metadata[0].name
    labels = {
      app  = "kms-diag"
      tier = local.tiers.aux
    }
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "kms-diag"
      }
    }

    template {
      metadata {
        labels = {
          app = "kms-diag"
        }
        annotations = {
          # Reload pods when the collector script changes
          "checksum/collector" = sha1(file("${path.module}/files/diag-collector.py"))
        }
      }

      spec {
        volume {
          name = "diag-collector-script"
          config_map {
            name = kubernetes_config_map.kms_diag_collector.metadata[0].name
          }
        }

        container {
          image = "python:3.12-alpine"
          name  = "diag-collector"

          # -u forces unbuffered stdout/stderr. The collector's output is its
          # stdout, so a buffered stream would delay events (and could lose
          # the tail if the pod is killed). The slack-notifier container in
          # this stack is launched with `python3 -u` for the same reason.
          command = ["python3", "-u", "/app/diag-collector.py"]

          resources {
            limits = {
              memory = "64Mi"
            }
            requests = {
              cpu    = "5m"
              memory = "48Mi"
            }
          }

          port {
            container_port = 9102
          }

          volume_mount {
            name       = "diag-collector-script"
            mount_path = "/app"
            read_only  = true
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [spec[0].template[0].spec[0].dns_config] # KYVERNO_LIFECYCLE_V1
  }
}
|
||||
|
||||
# Service exposing the kms-diag pods on TCP port 9102.
resource "kubernetes_service" "kms_diag" {
  metadata {
    name      = "kms-diag"
    namespace = kubernetes_namespace.kms.metadata[0].name

    labels = {
      app = "kms-diag"
    }
  }

  spec {
    selector = {
      app = "kms-diag"
    }

    port {
      protocol = "TCP"
      port     = "9102"
    }
  }
}
|
||||
|
||||
# Carve-out for /diag, the anonymous telemetry endpoint.
#
# Same rationale as /scripts: PowerShell/curl POSTs can't solve Anubis' PoW
# challenge, so this points at the bare kms-diag collector service.
# full_host MUST match the main ingress host; without it the factory derives
# kms-diag.viktorbarzin.me and the carve-out never matches (this exact bug
# hit the /scripts carve-out).
module "ingress_diag" {
  source = "../../modules/kubernetes/ingress_factory"

  name      = "kms-diag"
  namespace = kubernetes_namespace.kms.metadata[0].name

  # auth = "none": public telemetry collector, no login/PoW
  auth = "none"

  service_name = kubernetes_service.kms_diag.metadata[0].name
  port         = "9102"
  ingress_path = ["/diag"]

  full_host = "kms.viktorbarzin.me"
  dns_type  = "none"

  tls_secret_name  = var.tls_secret_name
  anti_ai_scraping = false
}
|
||||
|
||||
# Dedicated hostname for the raw KMS endpoint.
#
# kms.viktorbarzin.me is the *website* (Traefik 10.0.20.203 internally / :443
# externally) and cannot also serve raw KMS on :1688, so clients pointed at
# kms.viktorbarzin.me:1688 from the LAN hit Traefik (no 1688 listener) and
# fail with "KMS server cannot be reached". vlmcs.* is A-only (NO AAAA: the
# IPv6 tunnel doesn't forward 1688) and resolves to the vlmcsd MetalLB IP
# both ways:
#   external: vlmcs.viktorbarzin.me -> 176.12.22.76 -> pfSense WAN NAT :1688 -> 10.0.20.202
#   internal: vlmcs.viktorbarzin.me -> 10.0.20.202 (Technitium split-horizon, set via API)
resource "cloudflare_record" "vlmcs" {
  zone_id = "fd2c5dd4efe8fe38958944e74d0ced6d" # cloudflare_zone_id

  type    = "A"
  name    = "vlmcs"
  content = "176.12.22.76" # public_ip (mirrors config.tfvars / ingress_factory default)
  ttl     = 1
  proxied = false # raw TCP 1688 — Cloudflare proxy is HTTP-only

  allow_overwrite = true
}
|
||||
|
||||
# ConfigMap publishing files/slack-notifier.py under the key "notifier.py";
# the windows_kms Deployment mounts it into the slack-notifier container.
resource "kubernetes_config_map" "kms_slack_notifier" {
  metadata {
    namespace = kubernetes_namespace.kms.metadata[0].name
    name      = "kms-slack-notifier"
  }

  data = {
    "notifier.py" = file("${path.module}/files/slack-notifier.py")
  }
}
|
||||
|
||||
# ExternalSecret that creates the "kms-slack-webhook" Secret: its "url" key is
# filled from property "slack_webhook_url" of remote key "kms" in the
# vault-kv ClusterSecretStore, refreshed every hour.
resource "kubernetes_manifest" "kms_slack_external_secret" {
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      name      = "kms-slack-webhook"
      namespace = kubernetes_namespace.kms.metadata[0].name
    }

    spec = {
      refreshInterval = "1h"

      secretStoreRef = {
        kind = "ClusterSecretStore"
        name = "vault-kv"
      }

      target = {
        name           = "kms-slack-webhook"
        creationPolicy = "Owner"
      }

      data = [
        {
          secretKey = "url"
          remoteRef = {
            key      = "kms"
            property = "slack_webhook_url"
          }
        },
      ]
    }
  }

  depends_on = [kubernetes_namespace.kms]
}
|
||||
|
||||
# Deployment "kms": one pod with two containers sharing an emptyDir log volume.
#   - windows-kms:    vlmcsd, listening on 1688 and logging to /var/log/vlmcsd/vlmcsd.log
#   - slack-notifier: Python script from the kms-slack-notifier ConfigMap, given the
#                     same log path (read-only mount) and exposing metrics on 9101
# Container order matters: the lifecycle block below addresses them by index.
resource "kubernetes_deployment" "windows_kms" {
  metadata {
    name      = "kms"
    namespace = kubernetes_namespace.kms.metadata[0].name

    labels = {
      app  = "kms-service"
      tier = local.tiers.aux
    }
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "kms-service"
      }
    }

    template {
      metadata {
        labels = {
          app = "kms-service"
        }

        annotations = {
          # Reload pods when the notifier script changes
          "checksum/notifier" = sha1(file("${path.module}/files/slack-notifier.py"))

          # Prometheus scrape — kubernetes-pods job picks up via pod IP
          "prometheus.io/scrape" = "true"
          "prometheus.io/port"   = "9101"
          "prometheus.io/path"   = "/metrics"
        }
      }

      spec {
        # Shared between both containers: vlmcsd writes, the notifier reads.
        volume {
          name = "vlmcsd-log"
          empty_dir {}
        }

        volume {
          name = "slack-notifier-script"
          config_map {
            name = kubernetes_config_map.kms_slack_notifier.metadata[0].name
          }
        }

        # container[0]: the KMS server itself.
        container {
          name    = "windows-kms"
          image   = "kebe/vlmcsd:latest"
          command = ["/usr/bin/vlmcsd"]
          args    = ["-D", "-v", "-l", "/var/log/vlmcsd/vlmcsd.log"]

          port {
            container_port = 1688
          }

          resources {
            requests = {
              cpu    = "10m"
              memory = "64Mi"
            }
            limits = {
              memory = "64Mi"
            }
          }

          # Gate Pod Ready on the listener actually being up. Required for
          # ETP=Local: MetalLB only advertises 10.0.20.202 from a node where
          # the backing pod is Ready, so without this the pod is "Ready"
          # before vlmcsd has bound 1688 and ARP can briefly point at a node
          # that drops connections during pod start.
          readiness_probe {
            initial_delay_seconds = 1
            period_seconds        = 5
            failure_threshold     = 3

            tcp_socket {
              port = 1688
            }
          }

          liveness_probe {
            initial_delay_seconds = 5
            period_seconds        = 30
            failure_threshold     = 3

            tcp_socket {
              port = 1688
            }
          }

          volume_mount {
            name       = "vlmcsd-log"
            mount_path = "/var/log/vlmcsd"
          }
        }

        # container[1]: sidecar running the notifier script.
        container {
          name    = "slack-notifier"
          image   = "python:3.12-alpine"
          command = ["python3", "-u", "/scripts/notifier.py"]

          env {
            name  = "VLMCSD_LOG"
            value = "/var/log/vlmcsd/vlmcsd.log"
          }

          env {
            name  = "SLACK_CHANNEL"
            value = "#alerts"
          }

          env {
            name  = "DEDUP_WINDOW_SECONDS"
            value = "3600"
          }

          # Webhook URL comes from the Secret the ExternalSecret materialises.
          env {
            name = "SLACK_WEBHOOK_URL"
            value_from {
              secret_key_ref {
                name = "kms-slack-webhook"
                key  = "url"
              }
            }
          }

          port {
            name           = "metrics"
            container_port = 9101
          }

          resources {
            requests = {
              cpu    = "5m"
              memory = "48Mi"
            }
            limits = {
              memory = "64Mi"
            }
          }

          volume_mount {
            name       = "vlmcsd-log"
            mount_path = "/var/log/vlmcsd"
            read_only  = true
          }

          volume_mount {
            name       = "slack-notifier-script"
            mount_path = "/scripts"
            read_only  = true
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
      metadata[0].annotations["keel.sh/match-tag"],
      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates
      spec[0].template[0].spec[0].container[1].image,
      metadata[0].annotations["kubernetes.io/change-cause"],
      metadata[0].annotations["deployment.kubernetes.io/revision"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
    ]
  }

  depends_on = [kubernetes_manifest.kms_slack_external_secret]
}
|
||||
|
||||
# LoadBalancer Service for the kms-service pods on port 1688, pinned to its
# own MetalLB address with externalTrafficPolicy=Local.
resource "kubernetes_service" "windows_kms" {
  metadata {
    name      = "windows-kms"
    namespace = kubernetes_namespace.kms.metadata[0].name

    labels = {
      app = "kms-service"
    }

    annotations = {
      # Dedicated MetalLB IP (not shared) so ETP=Local can preserve real
      # client IPs in the vlmcsd log. Sharing 10.0.20.200 isn't an option:
      # all 10 services there are ETP=Cluster and MetalLB requires a single
      # ETP per shared IP.
      "metallb.io/loadBalancerIPs" = "10.0.20.202"
    }
  }

  spec {
    type                    = "LoadBalancer"
    external_traffic_policy = "Local"

    selector = {
      app = "kms-service"
    }

    port {
      port = "1688"
    }
  }
}
|
||||
|
||||
# CI retrigger 2026-05-16T13:42:57+00:00 — bulk enrollment apply (pipeline #689 killed)
|
||||
# CI retrigger v2 2026-05-16T13:46:35+00:00
|
||||
|
||||
# CI retrigger v3 2026-05-16T14:06:39Z
|
||||
|
||||
# CI retrigger v4 2026-05-16T14:13:59Z
|
||||
|
||||
# CI retrigger v5 2026-05-16T23:10:38Z
|
||||
|
||||
# CI retrigger v6 2026-05-16T23:18:58Z
|
||||
53
stacks/kms/providers.tf
Normal file
53
stacks/kms/providers.tf
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
# NOTE(review): the signature line above marks this file as generated, so
# hand edits here are presumably overwritten; confirm before relying on them.
terraform {
  required_providers {
    vault = {
      version = "~> 4.0"
      source  = "hashicorp/vault"
    }

    cloudflare = {
      version = "~> 4"
      source  = "cloudflare/cloudflare"
    }

    authentik = {
      version = "~> 2024.10"
      source  = "goauthentik/authentik"
    }

    # kubectl (gavinbunney) — workaround for hashicorp/kubernetes
    # `kubernetes_manifest` panics on Kyverno CRDs. See beads code-e2dp.
    # Declared for all stacks but only used where opted-in.
    kubectl = {
      version = "~> 1.14"
      source  = "gavinbunney/kubectl"
    }

    proxmox = {
      version = "3.0.2-rc07"
      source  = "telmate/proxmox"
    }
  }
}
|
||||
|
||||
# Kubeconfig location handed to the kubernetes, helm and kubectl providers.
variable "kube_config_path" {
  default = "~/.kube/config"
  type    = string
}
|
||||
|
||||
# Kubernetes provider, authenticated from the kubeconfig at var.kube_config_path.
provider "kubernetes" {
  config_path = var.kube_config_path
}
|
||||
|
||||
# Helm provider; its nested kubernetes settings reuse the same kubeconfig path.
provider "helm" {
  kubernetes = {
    config_path = var.kube_config_path
  }
}
|
||||
|
||||
# Vault provider pointed at vault.viktorbarzin.me, with skip_child_token enabled.
provider "vault" {
  skip_child_token = true
  address          = "https://vault.viktorbarzin.me"
}
|
||||
|
||||
# kubectl provider, loading its configuration from the same kubeconfig file.
provider "kubectl" {
  load_config_file = true
  config_path      = var.kube_config_path
}
|
||||
1
stacks/kms/secrets
Symbolic link
1
stacks/kms/secrets
Symbolic link
|
|
@ -0,0 +1 @@
|
|||
../../secrets
|
||||
8
stacks/kms/terragrunt.hcl
Normal file
8
stacks/kms/terragrunt.hcl
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
# Pull in the Terragrunt configuration found in a parent folder.
include "root" {
  path = find_in_parent_folders()
}
|
||||
|
||||
# Declares a dependency on the sibling "platform" stack; its outputs are
# skipped, so nothing is read from it here.
dependency "platform" {
  skip_outputs = true
  config_path  = "../platform"
}
|
||||
5
stacks/kms/variables.tf
Normal file
5
stacks/kms/variables.tf
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
# Tag interpolated into the kms-website image reference of the kms-web-page
# Deployment.
variable "image_tag" {
  description = "kms-website image tag pushed to forgejo.viktorbarzin.me/viktor/kms-website. Use 8-char git SHA in CI."
  type        = string
  default     = "latest"
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue