From efd28ccce54ee216e1fe56e2664402c0e0277973 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 10 May 2026 00:50:30 +0000 Subject: [PATCH] anubis: fix 500 on multi-replica + roll out to 6 more public sites MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Browser visits to viktorbarzin.me started returning HTTP 500 with `store: key not found: "challenge:..."` in pod logs. Root cause: each Anubis pod stores in-flight challenges in process memory; with 2 replicas behind a ClusterIP, the PoW-solved request can be routed to a different pod than the one that issued the challenge. Anubis upstream documents a related multi-instance caveat ("when running multiple instances on the same base domain, the key must be the same across all instances") — that covers the ed25519 signing key, but the challenge store is still pod-local without a shared backend. Drop module default replicas: 2 → 1. Trade-off: no HA; worst case is a ~1s cold-start gap on pod restart. Real fix (Redis-backed challenge store) noted as a follow-up in CLAUDE.md. Roll Anubis out to: f1-stream, cyberchef (cc), jsoncrack (json), privatebin (pb), homepage (home), real-estate-crawler (wrongmove UI only — `/api` ingress stays direct via path-based ingress carve-out so XHRs from the SPA bypass the challenge). End-state: 10 public hosts now Anubis-fronted (blog, www, kms, travel, f1, cc, json, pb, home, wrongmove). All return the challenge HTML to bare curl/browser; verified-IP search engines and /robots.txt + /.well-known still skip via the strict-policy allowlist. 
--- .claude/CLAUDE.md | 2 +- modules/kubernetes/anubis_instance/main.tf | 4 ++-- stacks/cyberchef/main.tf | 20 ++++++++++++++----- stacks/f1-stream/main.tf | 10 ++++++++++ stacks/homepage/main.tf | 23 +++++++++++++++------- stacks/jsoncrack/main.tf | 20 ++++++++++++++----- stacks/privatebin/main.tf | 10 ++++++++++ stacks/real-estate-crawler/main.tf | 23 ++++++++++++++++------ 8 files changed, 86 insertions(+), 26 deletions(-) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 3275cd4b..8d281743 100755 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -29,7 +29,7 @@ Violations cause state drift, which causes future applies to break or silently r - **New services need CI/CD** and **monitoring** (Prometheus/Uptime Kuma) - **New service**: Use `setup-project` skill for full workflow - **Ingress**: `ingress_factory` module. Auth: `protected = true`. Anti-AI: on by default. **DNS**: `dns_type = "proxied"` (Cloudflare CDN) or `"non-proxied"` (direct A/AAAA). DNS records are auto-created — no need to edit `config.tfvars`. -- **Anubis PoW challenge** (`modules/kubernetes/anubis_instance/`): per-site reverse proxy that issues a 30-day JWT cookie after a tiny PoW solve. Use for **public, content-bearing sites without app-level auth** (blog, docs, wikis, static landing pages). Pattern: declare `module "anubis" { source = "../../modules/kubernetes/anubis_instance"; name = "X"; namespace = ...; target_url = "http://..svc.cluster.local" }`, then in `ingress_factory` set `service_name = module.anubis.service_name`, `port = module.anubis.service_port`, `anti_ai_scraping = false`. Shared ed25519 key in Vault `secret/viktor` -> `anubis_ed25519_key`; cookie scoped to `viktorbarzin.me` so one solve covers all Anubis-fronted subdomains. **DO NOT put Anubis in front of Git/API/WebDAV/CLI endpoints** — clients without JS can't solve PoW. Active on: blog, kms, travel-blog. See `.claude/reference/patterns.md` "Anti-AI Scraping" for full layering. 
+- **Anubis PoW challenge** (`modules/kubernetes/anubis_instance/`): per-site reverse proxy that issues a 30-day JWT cookie after a tiny PoW solve. Use for **public, content-bearing sites without app-level auth** (blog, docs, wikis, static landing pages). Pattern: declare `module "anubis" { source = "../../modules/kubernetes/anubis_instance"; name = "X"; namespace = ...; target_url = "http://..svc.cluster.local" }`, then in `ingress_factory` set `service_name = module.anubis.service_name`, `port = module.anubis.service_port`, `anti_ai_scraping = false`. Shared ed25519 key in Vault `secret/viktor` -> `anubis_ed25519_key`; cookie scoped to `viktorbarzin.me` so one solve covers all Anubis-fronted subdomains. **DO NOT put Anubis in front of Git/API/WebDAV/CLI endpoints** — clients without JS can't solve PoW. **Replicas default to 1** because Anubis stores in-flight challenges in process memory; a challenge issued by pod A and solved against pod B errors with `store: key not found` (HTTP 500). Bumping replicas requires wiring a shared Redis store (TODO). For path-level carve-outs (e.g. wrongmove has `/` behind Anubis but `/api` direct), declare a second `ingress_factory` with `ingress_path = ["/api"]` pointing at the bare backend service. Active on: blog, www, kms, travel, f1, cc, json, pb (privatebin), home (homepage), wrongmove (UI only). See `.claude/reference/patterns.md` "Anti-AI Scraping" for full layering. - **Docker images**: Always build for `linux/amd64`. Use 8-char git SHA tags — `:latest` causes stale pull-through cache. - **Private registry**: `forgejo.viktorbarzin.me/viktor/` (Forgejo packages, OAuth-style PAT auth). Use `image: forgejo.viktorbarzin.me/viktor/:` + `imagePullSecrets: [{name: registry-credentials}]`. Kyverno auto-syncs the Secret to all namespaces. Containerd `hosts.toml` on every node redirects to in-cluster Traefik LB `10.0.20.200` to avoid hairpin NAT. 
Push-side: viktor PAT in Vault `secret/ci/global/forgejo_push_token` (Forgejo container packages are scoped per-user; only the package owner can push, ci-pusher cannot write to viktor/*). Pull-side: cluster-puller PAT in Vault `secret/viktor/forgejo_pull_token`. Retention CronJob (`forgejo-cleanup` in `forgejo` ns, daily 04:00) keeps newest 10 versions + always `:latest`; integrity probed every 15min by `forgejo-integrity-probe` in `monitoring` ns (catalog walk + manifest HEAD on every blob). See `docs/plans/2026-05-07-forgejo-registry-consolidation-{design,plan}.md` for the migration history. Pull-through caches for upstream registries (DockerHub, GHCR, Quay, k8s.gcr, Kyverno) stay on the registry VM at `10.0.20.10` ports 5000/5010/5020/5030/5040 — the old port-5050 R/W private registry was decommissioned 2026-05-07. - **LinuxServer.io containers**: `DOCKER_MODS` runs apt-get on every start — bake slow mods into a custom image (`RUN /docker-mods || true` then `ENV DOCKER_MODS=`). Set `NO_CHOWN=true` to skip recursive chown that hangs on NFS mounts. diff --git a/modules/kubernetes/anubis_instance/main.tf b/modules/kubernetes/anubis_instance/main.tf index c316921f..55129bbf 100644 --- a/modules/kubernetes/anubis_instance/main.tf +++ b/modules/kubernetes/anubis_instance/main.tf @@ -56,8 +56,8 @@ variable "image_tag" { variable "replicas" { type = number - default = 2 - description = "Replica count. 2 + matching ed25519 key = HA without sticky sessions." + default = 1 + description = "Replica count. Default 1 because Anubis stores in-flight challenges in process memory — with N>1 a challenge issued by pod A and solved against pod B fails with `store: key not found` (HTTP 500). For HA, configure a shared store (Redis) and bump this. Per-pod 128Mi @ idle is cheap, single-pod restart is sub-second, so 1 is fine for content sites." 
} variable "memory" { diff --git a/stacks/cyberchef/main.tf b/stacks/cyberchef/main.tf index 916b513b..7cdd49ad 100644 --- a/stacks/cyberchef/main.tf +++ b/stacks/cyberchef/main.tf @@ -104,12 +104,22 @@ resource "kubernetes_service" "cyberchef" { } +module "anubis" { + source = "../../modules/kubernetes/anubis_instance" + name = "cc" + namespace = kubernetes_namespace.cyberchef.metadata[0].name + target_url = "http://${kubernetes_service.cyberchef.metadata[0].name}.${kubernetes_namespace.cyberchef.metadata[0].name}.svc.cluster.local" +} + module "ingress" { - source = "../../modules/kubernetes/ingress_factory" - dns_type = "proxied" - namespace = kubernetes_namespace.cyberchef.metadata[0].name - name = "cc" - tls_secret_name = var.tls_secret_name + source = "../../modules/kubernetes/ingress_factory" + dns_type = "proxied" + namespace = kubernetes_namespace.cyberchef.metadata[0].name + name = "cc" + service_name = module.anubis.service_name + port = module.anubis.service_port + tls_secret_name = var.tls_secret_name + anti_ai_scraping = false extra_annotations = { "gethomepage.dev/enabled" = "true" "gethomepage.dev/name" = "CyberChef" diff --git a/stacks/f1-stream/main.tf b/stacks/f1-stream/main.tf index cfa1cd60..c2f5edaf 100644 --- a/stacks/f1-stream/main.tf +++ b/stacks/f1-stream/main.tf @@ -228,13 +228,23 @@ module "tls_secret" { } +module "anubis" { + source = "../../modules/kubernetes/anubis_instance" + name = "f1" + namespace = kubernetes_namespace.f1-stream.metadata[0].name + target_url = "http://${kubernetes_service.f1-stream.metadata[0].name}.${kubernetes_namespace.f1-stream.metadata[0].name}.svc.cluster.local" +} + module "ingress" { source = "../../modules/kubernetes/ingress_factory" dns_type = "non-proxied" namespace = kubernetes_namespace.f1-stream.metadata[0].name name = "f1" + service_name = module.anubis.service_name + port = module.anubis.service_port tls_secret_name = var.tls_secret_name exclude_crowdsec = true + anti_ai_scraping = false 
extra_annotations = { "gethomepage.dev/enabled" = "true" "gethomepage.dev/name" = "F1 Stream" diff --git a/stacks/homepage/main.tf b/stacks/homepage/main.tf index 007f7533..8a6bf959 100644 --- a/stacks/homepage/main.tf +++ b/stacks/homepage/main.tf @@ -137,14 +137,23 @@ resource "kubernetes_service" "cache_proxy" { } } +module "anubis" { + source = "../../modules/kubernetes/anubis_instance" + name = "homepage" + namespace = kubernetes_namespace.homepage.metadata[0].name + target_url = "http://${kubernetes_service.cache_proxy.metadata[0].name}.${kubernetes_namespace.homepage.metadata[0].name}.svc.cluster.local" +} + module "ingress" { - source = "../../modules/kubernetes/ingress_factory" - namespace = kubernetes_namespace.homepage.metadata[0].name - name = "homepage" - host = "home" - dns_type = "proxied" - service_name = kubernetes_service.cache_proxy.metadata[0].name - tls_secret_name = var.tls_secret_name + source = "../../modules/kubernetes/ingress_factory" + namespace = kubernetes_namespace.homepage.metadata[0].name + name = "homepage" + host = "home" + dns_type = "proxied" + service_name = module.anubis.service_name + port = module.anubis.service_port + tls_secret_name = var.tls_secret_name + anti_ai_scraping = false extra_annotations = { "gethomepage.dev/enabled" = "true" "gethomepage.dev/name" = "Homepage" diff --git a/stacks/jsoncrack/main.tf b/stacks/jsoncrack/main.tf index 1e1a1de8..d3c26b91 100644 --- a/stacks/jsoncrack/main.tf +++ b/stacks/jsoncrack/main.tf @@ -84,12 +84,22 @@ resource "kubernetes_service" "jsoncrack" { } } +module "anubis" { + source = "../../modules/kubernetes/anubis_instance" + name = "json" + namespace = kubernetes_namespace.jsoncrack.metadata[0].name + target_url = "http://${kubernetes_service.jsoncrack.metadata[0].name}.${kubernetes_namespace.jsoncrack.metadata[0].name}.svc.cluster.local" +} + module "ingress" { - source = "../../modules/kubernetes/ingress_factory" - dns_type = "proxied" - namespace = 
kubernetes_namespace.jsoncrack.metadata[0].name - name = "json" - tls_secret_name = var.tls_secret_name + source = "../../modules/kubernetes/ingress_factory" + dns_type = "proxied" + namespace = kubernetes_namespace.jsoncrack.metadata[0].name + name = "json" + service_name = module.anubis.service_name + port = module.anubis.service_port + tls_secret_name = var.tls_secret_name + anti_ai_scraping = false extra_annotations = { "gethomepage.dev/enabled" = "true" "gethomepage.dev/name" = "JSON Crack" diff --git a/stacks/privatebin/main.tf b/stacks/privatebin/main.tf index dc3a94b6..d7358964 100644 --- a/stacks/privatebin/main.tf +++ b/stacks/privatebin/main.tf @@ -131,12 +131,22 @@ resource "kubernetes_service" "privatebin" { } } +module "anubis" { + source = "../../modules/kubernetes/anubis_instance" + name = "privatebin" + namespace = kubernetes_namespace.privatebin.metadata[0].name + target_url = "http://${kubernetes_service.privatebin.metadata[0].name}.${kubernetes_namespace.privatebin.metadata[0].name}.svc.cluster.local" +} + module "ingress" { source = "../../modules/kubernetes/ingress_factory" namespace = kubernetes_namespace.privatebin.metadata[0].name name = "privatebin" host = "pb" dns_type = "proxied" + service_name = module.anubis.service_name + port = module.anubis.service_port + anti_ai_scraping = false tls_secret_name = var.tls_secret_name custom_content_security_policy = "script-src 'self' 'unsafe-inline' 'unsafe-eval' 'wasm-unsafe-eval'" extra_annotations = { diff --git a/stacks/real-estate-crawler/main.tf b/stacks/real-estate-crawler/main.tf index 3a1d274e..2dc037c7 100644 --- a/stacks/real-estate-crawler/main.tf +++ b/stacks/real-estate-crawler/main.tf @@ -330,13 +330,24 @@ resource "kubernetes_service" "realestate-crawler-api" { } } +# Anubis fronts the UI ingress only; the /api ingress (`module "ingress-api"`) +# stays direct so XHRs from the UI bypass the challenge. 
+module "anubis" { + source = "../../modules/kubernetes/anubis_instance" + name = "wrongmove" + namespace = kubernetes_namespace.realestate-crawler.metadata[0].name + target_url = "http://realestate-crawler-ui.${kubernetes_namespace.realestate-crawler.metadata[0].name}.svc.cluster.local" +} + module "ingress" { - source = "../../modules/kubernetes/ingress_factory" - dns_type = "proxied" - namespace = kubernetes_namespace.realestate-crawler.metadata[0].name - name = "wrongmove" - service_name = "realestate-crawler-ui" - tls_secret_name = var.tls_secret_name + source = "../../modules/kubernetes/ingress_factory" + dns_type = "proxied" + namespace = kubernetes_namespace.realestate-crawler.metadata[0].name + name = "wrongmove" + service_name = module.anubis.service_name + port = module.anubis.service_port + anti_ai_scraping = false + tls_secret_name = var.tls_secret_name extra_annotations = { "gethomepage.dev/enabled" = "true" "gethomepage.dev/name" = "Wrongmove"