diff --git a/modules/kubernetes/anubis_instance/main.tf b/modules/kubernetes/anubis_instance/main.tf index a2572f2e..517025d3 100644 --- a/modules/kubernetes/anubis_instance/main.tf +++ b/modules/kubernetes/anubis_instance/main.tf @@ -56,8 +56,24 @@ variable "image_tag" { variable "replicas" { type = number - default = 1 - description = "Replica count. Default 1 because Anubis stores in-flight challenges in process memory — with N>1 a challenge issued by pod A and solved against pod B fails with `store: key not found` (HTTP 500). For HA, configure a shared store (Redis) and bump this. Per-pod 128Mi @ idle is cheap, single-pod restart is sub-second, so 1 is fine for content sites." + default = null + description = "Optional replica count override. When null, defaults to 1 if shared_store_url is null and 2 otherwise. Capped at 2 — Redis can handle more but anti-affinity assumes ≤2 replicas per Anubis instance on a 5-node cluster. Do not set 2 without shared_store_url: challenges issued by one pod and solved against the other fail." + + validation { + condition = var.replicas == null ? true : contains([1, 2], var.replicas) + error_message = "replicas must be 1 or 2 (or null to auto-pick from shared_store_url presence)." + } +} + +variable "shared_store_url" { + type = string + default = null + description = "If set, Anubis stores in-flight challenge state in this Valkey/Redis-protocol URL instead of in-process memory, enabling HA across replicas. Format: redis://host:port/DB_INDEX (e.g. redis://redis-master.redis.svc.cluster.local:6379/5). The DB index MUST be unique per Anubis instance (this module assumes 16 DBs available, common in standalone Redis). Cluster Redis is redis-master.redis.svc.cluster.local:6379 with HA via Sentinel + haproxy. Without this, replicas>1 causes ~50% PoW failures (challenge issued by pod A, solved against pod B → 500)." + + validation { + condition = var.shared_store_url == null || can(regex("^redis://[a-zA-Z0-9_.-]+:[0-9]+/[0-9]+$", var.shared_store_url)) + error_message = "shared_store_url must look like redis://host:port/DB_INDEX (explicit DB index required)."
+ } } variable "memory" { @@ -88,6 +104,21 @@ locals { "app.kubernetes.io/managed-by" = "terraform" } + # Effective replicas: caller-override > shared-store-aware default. + effective_replicas = coalesce(var.replicas, var.shared_store_url == null ? 1 : 2) + + # Anubis store config. With backend=valkey, multiple Anubis pods can share + # in-flight PoW state and a challenge issued by pod A is verifiable by pod + # B. Default backend is in-process memory which only works at replicas=1. + store_yaml_block = var.shared_store_url == null ? "" : <<-EOT + + + store: + backend: valkey + parameters: + url: "${var.shared_store_url}" + EOT + # Strict bot policy. Default Anubis policy only WEIGHs Mozilla|Opera UAs # and lets unmatched UAs (curl, wget, Python-requests, scrapy, headless # CLI scrapers) fall through to ALLOW. We import the same upstream @@ -125,6 +156,12 @@ locals { path_regex: .* action: CHALLENGE EOT + + # Final policy YAML: defaults (or caller override) plus an optional store + # block when shared_store_url is set. Store block is module-managed and + # appended universally — callers passing a custom policy_yaml shouldn't + # include their own `store:` block (they would collide). + rendered_policy_yaml = "${coalesce(var.policy_yaml, local.default_policy_yaml)}${local.store_yaml_block}" } # Bot policy ConfigMap. Mounted into the pod and referenced by POLICY_FNAME.
@@ -135,7 +172,7 @@ resource "kubernetes_config_map" "policy" { labels = local.labels } data = { - "botPolicies.yaml" = coalesce(var.policy_yaml, local.default_policy_yaml) + "botPolicies.yaml" = local.rendered_policy_yaml } } @@ -179,7 +216,7 @@ resource "kubernetes_deployment" "anubis" { } spec { - replicas = var.replicas + replicas = local.effective_replicas selector { match_labels = { app = local.full_name } @@ -200,16 +237,22 @@ resource "kubernetes_deployment" "anubis" { # Roll the deployment whenever the policy YAML changes — Anubis # reads the policy at startup, so a ConfigMap update alone # doesn't take effect until pods restart. - "checksum/policy" = sha256(coalesce(var.policy_yaml, local.default_policy_yaml)) + "checksum/policy" = sha256(local.rendered_policy_yaml) } } spec { # Spread replicas across nodes to survive a single node failure. + # DoNotSchedule (not ScheduleAnyway) so 2 replicas are forced onto + # different hosts — otherwise the scheduler may pile them on the + # same node and a single node reboot takes the whole Anubis instance + # down despite replicas=2. On a 5-node cluster the spread is always + # satisfiable; the worst case (4 nodes unavailable) leaves one + # replica Pending, but the other keeps serving. topology_spread_constraint { max_skew = 1 topology_key = "kubernetes.io/hostname" - when_unsatisfiable = "ScheduleAnyway" + when_unsatisfiable = "DoNotSchedule" label_selector { match_labels = { app = local.full_name } } @@ -405,7 +448,15 @@ resource "kubernetes_pod_disruption_budget_v1" "anubis" { namespace = var.namespace } spec { - min_available = "1" + # max_unavailable=1 means: at most one pod can be voluntarily disrupted + # at a time. With replicas=2 this allows clean rolling drains (one pod + # goes down → other serves traffic → first recreates elsewhere). With + # replicas=1 (no shared store) this is functionally equivalent to no + # PDB — drain proceeds, brief outage, new pod schedules elsewhere.
+ # Was min_available=1 before 2026-05-16, which deadlocked drains on + # single-replica instances (eviction API can never satisfy the + # constraint at replicas=1). See PM-2026-05-11. + max_unavailable = "1" selector { match_labels = { app = local.full_name } } diff --git a/stacks/blog/main.tf b/stacks/blog/main.tf index daa646a0..30519e9d 100644 --- a/stacks/blog/main.tf +++ b/stacks/blog/main.tf @@ -116,10 +116,11 @@ resource "kubernetes_service" "blog" { # tiny PoW (~250ms desktop), get a 30-day cookie, and pass through. Replaces # the global ai-bot-block forwardAuth for this site. module "anubis" { - source = "../../modules/kubernetes/anubis_instance" - name = "blog" - namespace = kubernetes_namespace.website.metadata[0].name - target_url = "http://${kubernetes_service.blog.metadata[0].name}.${kubernetes_namespace.website.metadata[0].name}.svc.cluster.local" + source = "../../modules/kubernetes/anubis_instance" + name = "blog" + namespace = kubernetes_namespace.website.metadata[0].name + target_url = "http://${kubernetes_service.blog.metadata[0].name}.${kubernetes_namespace.website.metadata[0].name}.svc.cluster.local" + shared_store_url = "redis://redis-master.redis.svc.cluster.local:6379/10" } module "ingress" { diff --git a/stacks/cyberchef/main.tf b/stacks/cyberchef/main.tf index c15a5307..926d9928 100644 --- a/stacks/cyberchef/main.tf +++ b/stacks/cyberchef/main.tf @@ -105,10 +105,11 @@ resource "kubernetes_service" "cyberchef" { module "anubis" { - source = "../../modules/kubernetes/anubis_instance" - name = "cc" - namespace = kubernetes_namespace.cyberchef.metadata[0].name - target_url = "http://${kubernetes_service.cyberchef.metadata[0].name}.${kubernetes_namespace.cyberchef.metadata[0].name}.svc.cluster.local" + source = "../../modules/kubernetes/anubis_instance" + name = "cc" + namespace = kubernetes_namespace.cyberchef.metadata[0].name + target_url =
"http://${kubernetes_service.cyberchef.metadata[0].name}.${kubernetes_namespace.cyberchef.metadata[0].name}.svc.cluster.local" + shared_store_url = "redis://redis-master.redis.svc.cluster.local:6379/5" } module "ingress" { diff --git a/stacks/f1-stream/main.tf b/stacks/f1-stream/main.tf index be954403..ca17daed 100644 --- a/stacks/f1-stream/main.tf +++ b/stacks/f1-stream/main.tf @@ -244,11 +244,12 @@ module "tls_secret" { # (which load before any user has a chance to solve PoW), CHALLENGE # everything else — the HTML pages. module "anubis" { - source = "../../modules/kubernetes/anubis_instance" - name = "f1" - namespace = kubernetes_namespace.f1-stream.metadata[0].name - target_url = "http://${kubernetes_service.f1-stream.metadata[0].name}.${kubernetes_namespace.f1-stream.metadata[0].name}.svc.cluster.local" - policy_yaml = <<-EOT + source = "../../modules/kubernetes/anubis_instance" + name = "f1" + namespace = kubernetes_namespace.f1-stream.metadata[0].name + target_url = "http://${kubernetes_service.f1-stream.metadata[0].name}.${kubernetes_namespace.f1-stream.metadata[0].name}.svc.cluster.local" + shared_store_url = "redis://redis-master.redis.svc.cluster.local:6379/6" + policy_yaml = <<-EOT bots: - import: (data)/bots/_deny-pathological.yaml - import: (data)/bots/aggressive-brazilian-scrapers.yaml diff --git a/stacks/homepage/main.tf b/stacks/homepage/main.tf index 2dfe07f7..94236382 100644 --- a/stacks/homepage/main.tf +++ b/stacks/homepage/main.tf @@ -138,10 +138,11 @@ resource "kubernetes_service" "cache_proxy" { } module "anubis" { - source = "../../modules/kubernetes/anubis_instance" - name = "homepage" - namespace = kubernetes_namespace.homepage.metadata[0].name - target_url = "http://${kubernetes_service.cache_proxy.metadata[0].name}.${kubernetes_namespace.homepage.metadata[0].name}.svc.cluster.local" + source = "../../modules/kubernetes/anubis_instance" + name = "homepage" + namespace = kubernetes_namespace.homepage.metadata[0].name + target_url =
"http://${kubernetes_service.cache_proxy.metadata[0].name}.${kubernetes_namespace.homepage.metadata[0].name}.svc.cluster.local" + shared_store_url = "redis://redis-master.redis.svc.cluster.local:6379/9" } module "ingress" { diff --git a/stacks/jsoncrack/main.tf b/stacks/jsoncrack/main.tf index d31d89bf..7828e7aa 100644 --- a/stacks/jsoncrack/main.tf +++ b/stacks/jsoncrack/main.tf @@ -85,10 +85,11 @@ resource "kubernetes_service" "jsoncrack" { } module "anubis" { - source = "../../modules/kubernetes/anubis_instance" - name = "json" - namespace = kubernetes_namespace.jsoncrack.metadata[0].name - target_url = "http://${kubernetes_service.jsoncrack.metadata[0].name}.${kubernetes_namespace.jsoncrack.metadata[0].name}.svc.cluster.local" + source = "../../modules/kubernetes/anubis_instance" + name = "json" + namespace = kubernetes_namespace.jsoncrack.metadata[0].name + target_url = "http://${kubernetes_service.jsoncrack.metadata[0].name}.${kubernetes_namespace.jsoncrack.metadata[0].name}.svc.cluster.local" + shared_store_url = "redis://redis-master.redis.svc.cluster.local:6379/7" } module "ingress" { diff --git a/stacks/kms/main.tf b/stacks/kms/main.tf index 8e6594b6..5cc6e3fd 100644 --- a/stacks/kms/main.tf +++ b/stacks/kms/main.tf @@ -104,10 +104,11 @@ resource "kubernetes_service" "kms-web-page" { } module "anubis" { - source = "../../modules/kubernetes/anubis_instance" - name = "kms" - namespace = kubernetes_namespace.kms.metadata[0].name - target_url = "http://${kubernetes_service.kms-web-page.metadata[0].name}.${kubernetes_namespace.kms.metadata[0].name}.svc.cluster.local" + source = "../../modules/kubernetes/anubis_instance" + name = "kms" + namespace = kubernetes_namespace.kms.metadata[0].name + target_url = "http://${kubernetes_service.kms-web-page.metadata[0].name}.${kubernetes_namespace.kms.metadata[0].name}.svc.cluster.local" + shared_store_url = "redis://redis-master.redis.svc.cluster.local:6379/8" } module "ingress" { diff --git
a/stacks/real-estate-crawler/main.tf b/stacks/real-estate-crawler/main.tf index f2e6a9da..a6fcf28a 100644 --- a/stacks/real-estate-crawler/main.tf +++ b/stacks/real-estate-crawler/main.tf @@ -364,10 +364,11 @@ resource "kubernetes_service" "realestate-crawler-api" { # Anubis fronts the UI ingress only; the /api ingress (`module "ingress-api"`) # stays direct so XHRs from the UI bypass the challenge. module "anubis" { - source = "../../modules/kubernetes/anubis_instance" - name = "wrongmove" - namespace = kubernetes_namespace.realestate-crawler.metadata[0].name - target_url = "http://realestate-crawler-ui.${kubernetes_namespace.realestate-crawler.metadata[0].name}.svc.cluster.local" + source = "../../modules/kubernetes/anubis_instance" + name = "wrongmove" + namespace = kubernetes_namespace.realestate-crawler.metadata[0].name + target_url = "http://realestate-crawler-ui.${kubernetes_namespace.realestate-crawler.metadata[0].name}.svc.cluster.local" + shared_store_url = "redis://redis-master.redis.svc.cluster.local:6379/12" } module "ingress" { @@ -453,13 +454,15 @@ resource "kubernetes_deployment" "realestate-crawler-celery" { image = "viktorbarzin/realestatecrawler:latest" image_pull_policy = "Always" command = ["python", "-m", "celery", "-A", "celery_app", "worker", "--loglevel=info", "--pool=threads"] + # 512Mi OOMed during full London RENT 1-2 bed scrape (~76k existing IDs + # + 10k fetched into memory at concurrency=8 threads). Bumped to 1Gi.
resources { requests = { cpu = "15m" - memory = "512Mi" + memory = "1Gi" } limits = { - memory = "512Mi" + memory = "1Gi" } } port { diff --git a/stacks/travel_blog/main.tf b/stacks/travel_blog/main.tf index b427ad9b..75fc00d0 100644 --- a/stacks/travel_blog/main.tf +++ b/stacks/travel_blog/main.tf @@ -103,10 +103,11 @@ resource "kubernetes_service" "travel-blog" { } module "anubis" { - source = "../../modules/kubernetes/anubis_instance" - name = "travel" - namespace = kubernetes_namespace.travel-blog.metadata[0].name - target_url = "http://${kubernetes_service.travel-blog.metadata[0].name}.${kubernetes_namespace.travel-blog.metadata[0].name}.svc.cluster.local" + source = "../../modules/kubernetes/anubis_instance" + name = "travel" + namespace = kubernetes_namespace.travel-blog.metadata[0].name + target_url = "http://${kubernetes_service.travel-blog.metadata[0].name}.${kubernetes_namespace.travel-blog.metadata[0].name}.svc.cluster.local" + shared_store_url = "redis://redis-master.redis.svc.cluster.local:6379/11" } module "ingress" {