diff --git a/.claude/reference/service-catalog.md b/.claude/reference/service-catalog.md index 08a13a0b..7ef62e1c 100644 --- a/.claude/reference/service-catalog.md +++ b/.claude/reference/service-catalog.md @@ -62,7 +62,7 @@ | blog | Personal blog | blog | | descheduler | Pod descheduler | descheduler | | hackmd | Collaborative markdown | hackmd | -| kms | Key management | kms | +| kms | Windows/Office volume-license activation (vlmcsd); site kms.viktorbarzin.me, endpoint vlmcs.viktorbarzin.me:1688 | kms | | privatebin | Encrypted pastebin | privatebin | | vault | HashiCorp Vault | vault | | reloader | ConfigMap/Secret reloader | reloader | diff --git a/docs/runbooks/kms-public-exposure.md b/docs/runbooks/kms-public-exposure.md index 2e727003..88d02ddd 100644 --- a/docs/runbooks/kms-public-exposure.md +++ b/docs/runbooks/kms-public-exposure.md @@ -1,9 +1,24 @@ -# Runbook: KMS public exposure (kms.viktorbarzin.me:1688) +# Runbook: KMS public exposure (vlmcs.viktorbarzin.me:1688) -`kms.viktorbarzin.me:1688/TCP` is intentionally open to the internet so any +`vlmcs.viktorbarzin.me:1688/TCP` is intentionally open to the internet so any visitor can activate Volume License Microsoft products. The webpage at `https://kms.viktorbarzin.me/` documents how to use it. +**Two hostnames, on purpose** (do not merge them): + +- `kms.viktorbarzin.me` — the **website** (Traefik). Serves the docs and the + `/scripts/*.ps1` activators. Internally resolves to the Traefik LB + (`10.0.20.203`), which has **no** `:1688` listener. +- `vlmcs.viktorbarzin.me` — the **KMS endpoint** (vlmcsd). A-only (no AAAA — + the IPv6 tunnel doesn't forward 1688). Resolves to `10.0.20.202` on the LAN + (Technitium split-horizon, set via API — `cloudflare_record.vlmcs` in + `stacks/kms` owns the public A) and to `176.12.22.76` on the internet + (Cloudflare → pfSense WAN NAT :1688). Every `slmgr` / `ospp` command on the + page points here. 
+ +Pointing a client at `kms.viktorbarzin.me:1688` fails from the LAN with "KMS +server cannot be reached" — that name is the website, not the KMS server. + This runbook covers operations on the public exposure: where to find logs, how to tune the rate limit, how to revoke if abused. @@ -25,9 +40,10 @@ how to tune the rate limit, how to revoke if abused. - `kms.viktorbarzin.lan` A `10.0.20.200` (Traefik — for the user-facing website at `https://kms.viktorbarzin.lan/`; **not** the KMS server) Manual override (e.g., for clients without the suffix or for clients - on the public internet): `slmgr /skms kms.viktorbarzin.me:1688` (WAN - path via pfSense forward) or `slmgr /skms 10.0.20.202:1688` (direct). - To revert a manually-overridden client back to auto-discovery: + on the public internet): `slmgr /skms vlmcs.viktorbarzin.me:1688` (works + LAN + WAN) or `slmgr /skms 10.0.20.202:1688` (LAN, direct). Do **not** use + `kms.viktorbarzin.me:1688` — that name is the website (Traefik), not the + KMS server. To revert a manually-overridden client back to auto-discovery: `slmgr /ckms`. - **Pod fluidity**: deployment has `replicas=1` (notifier dedup state is per-pod) with no node affinity. TCP readiness/liveness probes on 1688 @@ -54,6 +70,23 @@ how to tune the rate limit, how to revoke if abused. `kms_connection_probes_total{source}` (`source` ∈ `internal_pod`, `cluster_node`, `external`) and log to stdout, but never post to Slack. Real activations still post. +- **Website `/scripts` + `/keys.json` carve-out**: the website is Anubis-fronted + (PoW challenge). `/scripts/*` and `/keys.json` are carved out to the bare + nginx backend (`module.ingress_scripts` in `stacks/kms`, `ingress_path`) + because PowerShell `iwr | iex` / `ConvertFrom-Json` are non-JS clients that + can't solve the PoW — without the carve-out they'd download the Anubis + challenge HTML and choke. Everything else stays behind Anubis. 
Verify: + `curl -A curl https://kms.viktorbarzin.me/scripts/setup-kms.ps1` and + `.../keys.json` both return real content (not "Making sure you're not a bot!"). +- **Auto-key selection**: the scripts no longer require the user to pick a GVLK. + `/keys.json` is `data/products.yaml` rendered to JSON (Hugo KEYS output format). + When no Volume License key is installed, `setup-kms.ps1` / `kms-bootstrap.ps1` + detect the edition — Windows via registry `EditionID` (+ `CurrentBuildNumber` + for LTSC/Server, which share an EditionID across releases), Office via the + Click-to-Run `ProductReleaseIds` — fetch `/keys.json`, and `slmgr /ipk` / + `ospp /inpkey` the matching key before activating. Only fires when not already + licensed (never clobbers a working retail key). Azure-Edition server SKUs are + intentionally unmapped (they collide with Datacenter and KMS may fail there). ## Where the logs are @@ -153,6 +186,7 @@ itself is independent of any forward and persists across delete/restore. - Stack: `stacks/kms/` (Terraform; deployment, MetalLB Service, ingress, ExternalSecret for the Slack webhook) -- Webpage source: `kms-website/` repo (Hugo + nginx, deployed via Drone CI) +- Webpage source: `kms-website/` repo (Hugo + nginx; Woodpecker builds + + pushes to forgejo, then `kubectl set image deployment/kms-web-page`) - Networking architecture footnote: `docs/architecture/networking.md` § "MetalLB & Load Balancing" diff --git a/secrets/fullchain.pem b/secrets/fullchain.pem index 6a978c8f..de4af81f 100644 Binary files a/secrets/fullchain.pem and b/secrets/fullchain.pem differ diff --git a/secrets/privkey.pem b/secrets/privkey.pem index 6436c083..b28796fa 100644 Binary files a/secrets/privkey.pem and b/secrets/privkey.pem differ diff --git a/stacks/kms/.terraform.lock.hcl b/stacks/kms/.terraform.lock.hcl index 9fbd2e13..05f8a359 100644 --- a/stacks/kms/.terraform.lock.hcl +++ b/stacks/kms/.terraform.lock.hcl @@ -24,6 +24,29 @@ provider 
"registry.terraform.io/cloudflare/cloudflare" { ] } +provider "registry.terraform.io/gavinbunney/kubectl" { + version = "1.19.0" + constraints = "~> 1.14" + hashes = [ + "h1:9QkxPjp0x5FZFfJbE+B7hBOoads9gmdfj9aYu5N4Sfc=", + "zh:1dec8766336ac5b00b3d8f62e3fff6390f5f60699c9299920fc9861a76f00c71", + "zh:43f101b56b58d7fead6a511728b4e09f7c41dc2e3963f59cf1c146c4767c6cb7", + "zh:4c4fbaa44f60e722f25cc05ee11dfaec282893c5c0ffa27bc88c382dbfbaa35c", + "zh:51dd23238b7b677b8a1abbfcc7deec53ffa5ec79e58e3b54d6be334d3d01bc0e", + "zh:5afc2ebc75b9d708730dbabdc8f94dd559d7f2fc5a31c5101358bd8d016916ba", + "zh:6be6e72d4663776390a82a37e34f7359f726d0120df622f4a2b46619338a168e", + "zh:72642d5fcf1e3febb6e5d4ae7b592bb9ff3cb220af041dbda893588e4bf30c0c", + "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:a1da03e3239867b35812ee031a1060fed6e8d8e458e2eaca48b5dd51b35f56f7", + "zh:b98b6a6728fe277fcd133bdfa7237bd733eae233f09653523f14460f608f8ba2", + "zh:bb8b071d0437f4767695c6158a3cb70df9f52e377c67019971d888b99147511f", + "zh:dc89ce4b63bfef708ec29c17e85ad0232a1794336dc54dd88c3ba0b77e764f71", + "zh:dd7dd18f1f8218c6cd19592288fde32dccc743cde05b9feeb2883f37c2ff4b4e", + "zh:ec4bd5ab3872dedb39fe528319b4bba609306e12ee90971495f109e142d66310", + "zh:f610ead42f724c82f5463e0e71fa735a11ffb6101880665d93f48b4a67b9ad82", + ] +} + provider "registry.terraform.io/goauthentik/authentik" { version = "2024.12.1" constraints = "~> 2024.10" @@ -105,3 +128,25 @@ provider "registry.terraform.io/hashicorp/vault" { "zh:ff35fb1ab6add288f0f368981e56f780b50405accd1937131cba1137999c8d83", ] } + +provider "registry.terraform.io/telmate/proxmox" { + version = "3.0.2-rc07" + constraints = "3.0.2-rc07" + hashes = [ + "h1:zp5hpQJQ4t4zROSLqdltVpBO+Riy9VugtfFbpyTw1aM=", + "zh:2ee860cd0a368b3eaa53f4a9ea46f16dab8a97929e813ea6ef55183f8112c2ca", + "zh:415965fd915bae2040d7f79e45f64d6e3ae61149c10114efeac1b34687d7296c", + "zh:6584b2055df0e32062561c615e3b6b2c291ca8c959440adda09ef3ec1e1436bd", + 
"zh:65dcfad71928e0a8dd9befc22524ed686be5020b0024dc5cca5184c7420eeb6b", + "zh:7253dc29bd265d33f2791ac4f779c5413f16720bb717de8e6c5fcb2c858648ea", + "zh:7ec8993da10a47606670f9f67cfd10719a7580641d11c7aa761121c4a2bd66fb", + "zh:999a3f7a9dcf517967fc537e6ec930a8172203642fb01b8e1f78f908373db210", + "zh:a50e6df7280eb6584a5fd2456e3f5b6df13b2ec8a7fa4605511e438e1863be42", + "zh:b25b329a1e42681c509d027fee0365414f0cc5062b65690cfc3386aab16132ae", + "zh:c028877fdb438ece48f7bc02b65bbae9ca7b7befbd260e519ccab6c0cbb39f26", + "zh:cf0eaa3ea9fcc6d62793637947f1b8d7c885b6ad74695ab47e134e4ff132190f", + "zh:d5ade3fae031cc629b7c512a7b60e46570f4c41665e88a595d7efd943dde5ab2", + "zh:f388c15ad1ecfc09e7361e3b98bae9b627a3a85f7b908c9f40650969c949901c", + "zh:f415cc6f735a3971faae6ac24034afdb9ee83373ef8de19a9631c187d5adc7db", + ] +} diff --git a/stacks/kms/backend.tf b/stacks/kms/backend.tf index ef601d70..1f8dd7d1 100644 --- a/stacks/kms/backend.tf +++ b/stacks/kms/backend.tf @@ -1,7 +1,7 @@ # Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa terraform { backend "pg" { - conn_str = "postgres://terraform_state:ts7DGcKmTTY-5ujz4mhh@10.0.20.200:5432/terraform_state?sslmode=disable" + conn_str = "postgres://terraform_state:WR2rnNyiLIb-gUcIxOeF@10.0.20.200:5432/terraform_state?sslmode=disable" schema_name = "kms" } } diff --git a/stacks/kms/main.tf b/stacks/kms/main.tf index 83c9dd7f..978649b1 100644 --- a/stacks/kms/main.tf +++ b/stacks/kms/main.tf @@ -9,7 +9,7 @@ resource "kubernetes_namespace" "kms" { name = "kms" labels = { "istio-injection" : "disabled" - tier = local.tiers.aux + tier = local.tiers.aux "keel.sh/enrolled" = "true" } } @@ -133,6 +133,47 @@ module "ingress" { } } +# Carve-out for /scripts/* and /keys.json — the PowerShell activators +# (kms-bootstrap.ps1, setup-kms.ps1) that visitors fetch with `iwr ... | iex`, +# plus /keys.json (the published GVLK list the scripts fetch to auto-select a +# key). 
Anubis cannot gate these paths: PowerShell/curl are non-JS clients and +# can't solve the PoW challenge, so they'd receive the challenge HTML and the +# script (or ConvertFrom-Json) would choke on it. Points at the bare +# kms-web-page nginx service, bypassing the Anubis proxy. Traefik prioritises +# the longer /scripts and /keys.json prefixes over the main "/" router. +module "ingress_scripts" { + source = "../../modules/kubernetes/ingress_factory" + # auth = "none": public read-only static scripts + key list (iwr|iex). No login, no PoW. + auth = "none" + namespace = kubernetes_namespace.kms.metadata[0].name + name = "kms-scripts" + service_name = kubernetes_service.kms-web-page.metadata[0].name + port = "80" + ingress_path = ["/scripts", "/keys.json"] + full_host = "kms.viktorbarzin.me" # MUST match the main ingress host; without this the factory derives kms-scripts.viktorbarzin.me and the carve-out never matches. + dns_type = "none" # DNS already owned by the main kms ingress. + tls_secret_name = var.tls_secret_name + anti_ai_scraping = false # Static scripts + key list; nothing for scrapers to mine. +} + +# Dedicated KMS endpoint hostname. kms.viktorbarzin.me is the *website* (Traefik +# 10.0.20.203 internally / :443 externally) and cannot also serve raw KMS on +# :1688, so clients pointed at kms.viktorbarzin.me:1688 from the LAN hit Traefik +# (no 1688 listener) and fail with "KMS server cannot be reached". 
vlmcs.* is +# A-only (NO AAAA — the IPv6 tunnel doesn't forward 1688) and resolves to the +# vlmcsd MetalLB IP both ways: +# external: vlmcs.viktorbarzin.me -> 176.12.22.76 -> pfSense WAN NAT :1688 -> 10.0.20.202 +# internal: vlmcs.viktorbarzin.me -> 10.0.20.202 (Technitium split-horizon, set via API) +resource "cloudflare_record" "vlmcs" { + name = "vlmcs" + content = "176.12.22.76" # public_ip (mirrors config.tfvars / ingress_factory default) + proxied = false # raw TCP 1688 — Cloudflare proxy is HTTP-only + ttl = 1 + type = "A" + zone_id = "fd2c5dd4efe8fe38958944e74d0ced6d" # cloudflare_zone_id + allow_overwrite = true +} + resource "kubernetes_config_map" "kms_slack_notifier" { metadata { name = "kms-slack-notifier" diff --git a/stacks/kms/providers.tf b/stacks/kms/providers.tf index 012af700..3d0bc2c6 100644 --- a/stacks/kms/providers.tf +++ b/stacks/kms/providers.tf @@ -13,6 +13,17 @@ terraform { source = "goauthentik/authentik" version = "~> 2024.10" } + # kubectl (gavinbunney) — workaround for hashicorp/kubernetes + # `kubernetes_manifest` panics on Kyverno CRDs. See beads code-e2dp. + # Declared for all stacks but only used where opted-in. + kubectl = { + source = "gavinbunney/kubectl" + version = "~> 1.14" + } + proxmox = { + source = "telmate/proxmox" + version = "3.0.2-rc07" + } } } @@ -35,3 +46,8 @@ provider "vault" { address = "https://vault.viktorbarzin.me" skip_child_token = true } + +provider "kubectl" { + config_path = var.kube_config_path + load_config_file = true +} diff --git a/stacks/traefik/modules/traefik/main.tf b/stacks/traefik/modules/traefik/main.tf index 1ed2ac41..8ab0e4e6 100644 --- a/stacks/traefik/modules/traefik/main.tf +++ b/stacks/traefik/modules/traefik/main.tf @@ -351,6 +351,16 @@ resource "kubernetes_config_map" "bot_block_proxy_config" { } server { listen 8080; + + # Browsers accumulate one authentik_proxy_ cookie per Authentik + # Proxy Provider on the parent domain. 
With 30+ services under + # viktorbarzin.me the combined Cookie header line outgrows a single 8k buffer of + # nginx's default 4 x 8k large_client_header_buffers (one header line must fit in one buffer), so the + # ai-bot-block forward-auth rejects it with 400 (and error-pages then shows "Too big request + # header" 431). Match auth-proxy-config: 8 x 64k buffers accept the pile. + client_header_buffer_size 8k; + large_client_header_buffers 8 64k; + location /auth { access_by_lua_block { ngx.req.clear_header("If-Match")