Before 2026-05-16, Anubis ran at replicas=1 because in-flight PoW challenge
state lived in process memory — a challenge issued by pod A could not be
verified by pod B (HTTP 500 "store: key not found"). The PDB at
`minAvailable=1` made this worse: with replicas=1 the eviction API can
NEVER satisfy the constraint, so every drain on a node hosting an Anubis
pod looped forever. This is what stalled the manual K8s upgrade on
2026-05-11 (had to delete pods directly to bypass eviction) and was
about to block kured on Monday 2026-05-18 once the kured sentinel fix
landed.
Anubis upstream has first-class support for a Valkey/Redis-protocol
shared store (documented as the "Kubernetes worker pool" pattern).
Wire it up:
- modules/kubernetes/anubis_instance: add `shared_store_url` variable.
When set, appends a `store: { backend: valkey, parameters: { url } }`
block to the rendered policy YAML and defaults replicas to 2 (capped
at 2). PDB switched from `minAvailable=1` to `maxUnavailable=1` so
drains can take down one pod at a time. topologySpreadConstraint
tightened to `DoNotSchedule` so the two replicas land on different
nodes — a single node loss never takes a whole Anubis instance down.
- All 8 call sites (cyberchef, jsoncrack, kms, homepage, blog,
travel_blog, real-estate-crawler, f1-stream) opted in. Each picks a
unique Redis DB index (5–12) on `redis-master.redis:6379`. Cluster
Redis already runs HA via Sentinel + haproxy, no new infra needed.
Verified: every Anubis Deployment now 2/2 Ready with pods on different
nodes; PDBs allow 1 disruption; Redis DBs 5,7,8,10 already populated
by live traffic post-apply; Palo Alto Networks scanner hit blog right
after apply and the challenge log shows the new state path.
Drain on any worker now succeeds without a `predrain_unstick` workaround
— eviction API is satisfied because at most one pod is unavailable at a
time, and the other replica keeps serving. Monday's kured reboot wave
should roll through cleanly.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
474 lines
15 KiB
HCL
474 lines
15 KiB
HCL
# Provider requirements for this module: only the HashiCorp Kubernetes
# provider is declared here.
terraform {
  required_providers {
    kubernetes = { source = "hashicorp/kubernetes" }
  }
}
|
|
|
|
# Per-site Anubis reverse proxy.
|
|
# Sits between Traefik and the real backend. On first visit, serves a
|
|
# proof-of-work challenge; on success, drops a long-lived JWT cookie and
|
|
# proxies the request through to `target_url`.
|
|
#
|
|
# Sharing a single ed25519 signing key across instances + COOKIE_DOMAIN at
|
|
# the registrable domain means a token solved on one viktorbarzin.me subdomain
|
|
# is honoured by every other Anubis-fronted site.
|
|
|
|
# Logical site name. The module prefixes it with "anubis-" to build the
# names of the resources it creates.
variable "name" {
  description = "Short logical name (e.g. \"blog\"). Used to derive Service / Deployment / Secret names as anubis-<name>."
  type        = string
}
|
|
|
|
# Namespace that every resource created by this module is placed in.
variable "namespace" {
  description = "Namespace to deploy into — typically the same as the protected backend service."
  type        = string
}
|
|
|
|
# Upstream address; handed to the container as the TARGET environment variable.
variable "target_url" {
  description = "Backend URL Anubis forwards passing requests to (e.g. http://blog.website.svc.cluster.local)."
  type        = string
}
|
|
|
|
# Value of the COOKIE_DOMAIN environment variable on the Anubis container.
variable "cookie_domain" {
  description = "Cookie domain — set to the registrable domain so a single PoW solve covers every Anubis-fronted subdomain."
  type        = string
  default     = "viktorbarzin.me"
}
|
|
|
|
# Proof-of-work difficulty; stringified and passed as the DIFFICULTY
# environment variable.
variable "difficulty" {
  description = "PoW difficulty (leading-zero hex chars). 2 = ~250ms desktop / ~700ms mobile. Bump for stronger filtering."
  type        = number
  default     = 2
}
|
|
|
|
# Cookie lifetime; rendered as "<hours>h" into COOKIE_EXPIRATION_TIME.
variable "cookie_expiration_hours" {
  description = "Lifetime of the issued JWT cookie in hours."
  type        = number
  default     = 720 # 30 days
}
|
|
|
|
# Tag appended to the ghcr.io/techarohq/anubis image reference.
variable "image_tag" {
  description = "ghcr.io/techarohq/anubis tag — pin to a release, never :latest."
  type        = string
  default     = "v1.25.0"
}
|
|
|
|
# Optional replica override. When left null, the module picks the count
# itself from whether shared_store_url is set (see local.effective_replicas).
variable "replicas" {
  type        = number
  default     = null
  description = "Optional replica count override. When null, defaults to 1 if shared_store_url is null and 2 otherwise. Capped at 2 — Redis can handle more but anti-affinity assumes ≤2 replicas per Anubis instance on a 5-node cluster."

  validation {
    # Two deliberate choices here:
    #  * The null case is handled by a conditional rather than `||`, so the
    #    membership test is never the deciding factor for a null input and
    #    the rule does not depend on logical-operator short-circuiting.
    #  * `contains([1, 2], ...)` accepts exactly the integers 1 and 2. A
    #    range check (>= 1 && <= 2) would also admit fractional values such
    #    as 1.5, which the error message below says are invalid.
    condition     = var.replicas == null ? true : contains([1, 2], var.replicas)
    error_message = "replicas must be 1 or 2 (or null to auto-pick from shared_store_url presence)."
  }
}
|
|
|
|
# Optional Valkey/Redis URL. When set, the module appends a `store:` block to
# the rendered policy YAML and defaults the replica count to 2.
variable "shared_store_url" {
  description = "If set, Anubis stores in-flight challenge state in this Valkey/Redis-protocol URL instead of in-process memory, enabling HA across replicas. Format: redis://host:port/<db-index>. The DB index MUST be unique per Anubis instance (this module assumes 16 DBs available, common in standalone Redis). Cluster Redis is redis-master.redis.svc.cluster.local:6379 with HA via Sentinel + haproxy. Without this, replicas>1 causes ~50% PoW failures (challenge issued by pod A, solved against pod B → 500)."
  type        = string
  default     = null

  validation {
    # null means "no shared store" and is always accepted; any other value
    # must match redis://<host>:<port>/<db-index> exactly.
    condition = (
      var.shared_store_url == null
      ? true
      : can(regex("^redis://[a-zA-Z0-9_.-]+:[0-9]+/[0-9]+$", var.shared_store_url))
    )
    error_message = "shared_store_url must look like redis://host:port/<db-index> (explicit DB index required)."
  }
}
|
|
|
|
# Memory quantity applied to both the container's request and its limit.
variable "memory" {
  description = "requests==limits memory. Anubis docs suggest 128Mi handles many concurrent clients."
  type        = string
  default     = "128Mi"
}
|
|
|
|
# Optional replacement for the module's built-in bot policy. The module still
# appends its own `store:` block after it when shared_store_url is set.
variable "policy_yaml" {
  description = "Override the strict default bot-policy YAML. Leave null to use the catch-all CHALLENGE policy."
  type        = string
  default     = null
}
|
|
|
|
# CPU request for the Anubis container. The module sets no CPU limit.
variable "cpu_request" {
  description = "CPU request. PoW verification is server-cheap (just hash check)."
  type        = string
  default     = "20m"
}
|
|
|
|
locals {
  # Common name for every object this module creates.
  full_name = "anubis-${var.name}"

  # Label set stamped on the module's resources and on the pod template.
  labels = {
    "app"                          = local.full_name
    "app.kubernetes.io/name"       = "anubis"
    "app.kubernetes.io/instance"   = local.full_name
    "app.kubernetes.io/component"  = "ai-bot-challenge"
    "app.kubernetes.io/managed-by" = "terraform"
  }

  # True when the caller wired up a shared Valkey/Redis store.
  has_shared_store = var.shared_store_url != null

  # Replica count actually deployed: an explicit var.replicas wins; otherwise
  # 2 when a shared store is configured and 1 when it is not.
  effective_replicas = coalesce(var.replicas, local.has_shared_store ? 2 : 1)

  # Store section of the Anubis config. The valkey backend lets several pods
  # share in-flight PoW state, so a challenge handed out by one pod can be
  # verified by another. Without it Anubis keeps that state in process
  # memory, which only works with a single replica. The leading blank lines
  # keep the block separated from whatever policy text precedes it.
  store_yaml_block = !local.has_shared_store ? "" : <<-EOT


    store:
      backend: valkey
      parameters:
        url: "${var.shared_store_url}"
  EOT

  # Strict built-in policy. Upstream's default only WEIGHs Mozilla|Opera user
  # agents and lets everything unmatched (curl, wget, Python-requests,
  # scrapy, headless CLI scrapers) fall through to ALLOW. This version
  # imports the same upstream snippets and finishes with a catch-all
  # CHALLENGE so clients without JS + PoW capability are filtered out.
  default_policy_yaml = <<-EOT
    bots:
      # Hard-deny known-bad bots first — runs before the method bypass so
      # a declared bad bot can't sneak through by sending a POST.
      - import: (data)/bots/_deny-pathological.yaml
      - import: (data)/bots/aggressive-brazilian-scrapers.yaml
      # Hard-deny declared AI/LLM crawlers (ClaudeBot, GPTBot, Bytespider, …).
      - import: (data)/meta/ai-block-aggressive.yaml
      # Whitelist legitimate search-engine crawlers (Googlebot, Bingbot, …).
      - import: (data)/crawlers/_allow-good.yaml
      # Challenge Firefox AI previews specifically.
      - import: (data)/clients/x-firefox-ai.yaml
      # Allow /.well-known, /robots.txt, /favicon.*, /sitemap.xml — keeps
      # the internet working for benign crawlers and discovery clients.
      - import: (data)/common/keep-internet-working.yaml
      # Allow every non-GET request through. Rationale: AI scrapers steal
      # the body of GETs (page content) — they don't POST. State-mutating
      # methods come from app XHRs (PrivateBin paste creation, Komga
      # uploads, SPA actions) and CORS preflight (OPTIONS). Challenging
      # those breaks the app, because the JS expects JSON and gets the
      # Anubis HTML challenge page. CrowdSec + rate-limit + per-app auth
      # already cover abuse on these methods.
      - name: allow-non-get-methods
        action: ALLOW
        expression: method != "GET"
      # Catch-all: every remaining (GET) request must solve the challenge.
      # This closes the "unmatched UA falls through to ALLOW" gap that
      # lets curl/wget/Python-requests scrape non-CDN-fronted hosts.
      - name: catchall-challenge
        path_regex: .*
        action: CHALLENGE
  EOT

  # Policy body before the store section: the caller's override when given,
  # otherwise the strict default above.
  base_policy_yaml = coalesce(var.policy_yaml, local.default_policy_yaml)

  # What ends up in the ConfigMap: the policy body followed by the
  # module-managed store block (empty when no shared store is configured).
  # The store block is always appended by the module, so a custom
  # policy_yaml should not carry its own `store:` section — the two would
  # collide.
  rendered_policy_yaml = "${local.base_policy_yaml}${local.store_yaml_block}"
}
|
|
|
|
# ConfigMap carrying the rendered bot policy. The Deployment mounts it at
# /config and points POLICY_FNAME at the botPolicies.yaml key.
resource "kubernetes_config_map" "policy" {
  metadata {
    namespace = var.namespace
    name      = "${local.full_name}-policy"
    labels    = local.labels
  }

  data = {
    "botPolicies.yaml" = local.rendered_policy_yaml
  }
}
|
|
|
|
# ExternalSecret that materialises the ED25519 signing key as the Secret
# "<full_name>-key" (data key "key"). The value is read from the `vault-kv`
# ClusterSecretStore: remote key `viktor`, property `anubis_ed25519_key`.
# Every instance reads the same remote property, so all of them sign with
# one key and JWTs validate across Anubis-fronted subdomains.
#
# NOTE(review): newer External Secrets Operator releases promote this API to
# external-secrets.io/v1 — confirm which versions the installed CRDs serve
# before upgrading the operator.
resource "kubernetes_manifest" "ed25519_secret" {
  manifest = {
    apiVersion = "external-secrets.io/v1beta1"
    kind       = "ExternalSecret"

    metadata = {
      namespace = var.namespace
      name      = "${local.full_name}-key"
    }

    spec = {
      refreshInterval = "1h"

      secretStoreRef = {
        kind = "ClusterSecretStore"
        name = "vault-kv"
      }

      target = {
        name           = "${local.full_name}-key"
        creationPolicy = "Owner"
      }

      data = [
        {
          secretKey = "key"
          remoteRef = {
            key      = "viktor"
            property = "anubis_ed25519_key"
          }
        },
      ]
    }
  }
}
|
|
|
|
# The Anubis reverse-proxy Deployment: one container, configured entirely via
# environment variables plus two read-only mounts (signing key and policy).
resource "kubernetes_deployment" "anubis" {
  # The ExternalSecret must exist before pods try to mount the key Secret.
  depends_on = [kubernetes_manifest.ed25519_secret]

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].template[0].spec[0].dns_config]
  }

  metadata {
    namespace = var.namespace
    name      = local.full_name
    labels    = local.labels
  }

  spec {
    replicas = local.effective_replicas

    selector {
      match_labels = { app = local.full_name }
    }

    # Surge-first rollout: a replacement pod is added before an old one is
    # removed, so the ready count never drops during an update.
    strategy {
      type = "RollingUpdate"
      rolling_update {
        max_unavailable = 0
        max_surge       = 1
      }
    }

    template {
      metadata {
        labels = local.labels
        annotations = {
          # Anubis only reads its policy at startup, so updating the
          # ConfigMap alone changes nothing. Hashing the rendered YAML into
          # the pod template forces a rollout whenever the policy changes.
          "checksum/policy" = sha256(local.rendered_policy_yaml)
        }
      }

      spec {
        # Hard spread across nodes. DoNotSchedule (rather than
        # ScheduleAnyway) forces two replicas onto different hosts;
        # otherwise the scheduler could stack both on one node and a single
        # reboot would take the whole instance down despite replicas=2. On a
        # 5-node cluster the spread is always satisfiable; in the worst case
        # (4 nodes unavailable) one replica stays Pending while the other
        # keeps serving.
        topology_spread_constraint {
          topology_key       = "kubernetes.io/hostname"
          max_skew           = 1
          when_unsatisfiable = "DoNotSchedule"
          label_selector {
            match_labels = { app = local.full_name }
          }
        }

        container {
          name  = "anubis"
          image = "ghcr.io/techarohq/anubis:${var.image_tag}"

          port {
            name           = "http"
            container_port = 8923
          }
          port {
            name           = "metrics"
            container_port = 9090
          }

          # Environment, emitted in list order.
          dynamic "env" {
            for_each = [
              { name = "BIND", value = ":8923" },
              { name = "METRICS_BIND", value = ":9090" },
              { name = "TARGET", value = var.target_url },
              { name = "DIFFICULTY", value = tostring(var.difficulty) },
              { name = "COOKIE_EXPIRATION_TIME", value = "${var.cookie_expiration_hours}h" },
              # Cross-subdomain SSO: the cookie is scoped to the registrable
              # domain so a JWT earned on one Anubis-fronted subdomain is
              # accepted on the others. (COOKIE_DOMAIN and
              # COOKIE_DYNAMIC_DOMAIN are mutually exclusive — the explicit
              # form is used here.)
              { name = "COOKIE_DOMAIN", value = var.cookie_domain },
              { name = "COOKIE_SECURE", value = "true" },
              { name = "COOKIE_SAME_SITE", value = "Lax" },
              # Built-in robots.txt that disallows known AI scrapers, so
              # well-behaved bots are turned away without paying the PoW cost.
              { name = "SERVE_ROBOTS_TXT", value = "true" },
              # Strip cluster-internal addresses from X-Forwarded-For so
              # Anubis sees the real client IP.
              { name = "XFF_STRIP_PRIVATE", value = "true" },
              { name = "SLOG_LEVEL", value = "INFO" },
              # Path inside the "ed25519-key" volume (ESO-managed Secret).
              { name = "ED25519_PRIVATE_KEY_HEX_FILE", value = "/keys/key" },
              { name = "POLICY_FNAME", value = "/config/botPolicies.yaml" },
            ]
            content {
              name  = env.value.name
              value = env.value.value
            }
          }

          volume_mount {
            name       = "ed25519-key"
            mount_path = "/keys"
            read_only  = true
          }
          volume_mount {
            name       = "policy"
            mount_path = "/config"
            read_only  = true
          }

          # Memory request equals the limit; CPU has a request only.
          resources {
            limits = {
              memory = var.memory
            }
            requests = {
              cpu    = var.cpu_request
              memory = var.memory
            }
          }

          # Both probes hit the metrics endpoint (zero auth, always 200).
          readiness_probe {
            http_get {
              port = "metrics"
              path = "/metrics"
            }
            initial_delay_seconds = 2
            period_seconds        = 5
            failure_threshold     = 2
          }
          liveness_probe {
            http_get {
              port = "metrics"
              path = "/metrics"
            }
            initial_delay_seconds = 10
            period_seconds        = 30
            failure_threshold     = 3
          }

          # Unprivileged UID/GID 1000, no privilege escalation, read-only
          # root filesystem, all capabilities dropped.
          security_context {
            run_as_user                = 1000
            run_as_group               = 1000
            run_as_non_root            = true
            read_only_root_filesystem  = true
            allow_privilege_escalation = false
            capabilities {
              drop = ["ALL"]
            }
          }
        }

        # Signing key: only the "key" entry of the Secret is projected.
        volume {
          name = "ed25519-key"
          secret {
            secret_name = "${local.full_name}-key"
            items {
              key  = "key"
              path = "key"
            }
          }
        }

        # Rendered bot policy from the module's ConfigMap.
        volume {
          name = "policy"
          config_map {
            name = kubernetes_config_map.policy.metadata[0].name
          }
        }
      }
    }
  }
}
|
|
|
|
# Service in front of the Anubis pods. Port 8080 forwards to the container's
# 8923 listener; 9090 exposes metrics and is advertised through the
# prometheus.io/* annotations.
resource "kubernetes_service" "anubis" {
  metadata {
    namespace = var.namespace
    name      = local.full_name
    labels    = local.labels

    annotations = {
      "prometheus.io/scrape" = "true"
      "prometheus.io/port"   = "9090"
      "prometheus.io/path"   = "/metrics"
    }
  }

  spec {
    selector = {
      app = local.full_name
    }

    port {
      name        = "http"
      protocol    = "TCP"
      port        = 8080
      target_port = 8923
    }

    port {
      name        = "metrics"
      protocol    = "TCP"
      port        = 9090
      target_port = 9090
    }
  }
}
|
|
|
|
# PodDisruptionBudget for the Anubis pods of this instance.
resource "kubernetes_pod_disruption_budget_v1" "anubis" {
  metadata {
    name      = local.full_name
    namespace = var.namespace
    # Same label set as the ConfigMap, Deployment and Service, so a label
    # query for this instance also returns its PDB.
    labels = local.labels
  }

  spec {
    # max_unavailable=1 means: at most one pod can be voluntarily disrupted
    # at a time. With replicas=2 this allows clean rolling drains (one pod
    # goes down → other serves traffic → first recreates elsewhere). With
    # replicas=1 (no shared store) this is functionally equivalent to no
    # PDB — drain proceeds, brief outage, new pod schedules elsewhere.
    # Was min_available=1 before 2026-05-16 which deadlocked drains on
    # single-replica instances (eviction API can never satisfy the
    # constraint at replicas=1). See PM-2026-05-11.
    max_unavailable = "1"

    selector {
      match_labels = { app = local.full_name }
    }
  }
}
|
|
|
|
# Name of the Service created by this module, read back from the resource.
output "service_name" {
  description = "ClusterIP service name. Pass this to ingress_factory's `service_name` so Traefik routes through Anubis."
  value       = kubernetes_service.anubis.metadata[0].name
}
|
|
|
|
# Port callers should route to. Read from the Service's first port entry (the
# one named "http") instead of repeating the literal, so the output cannot
# drift from the Service definition.
output "service_port" {
  value       = kubernetes_service.anubis.spec[0].port[0].port
  description = "Service port. Anubis listens on 8923 inside; the Service exposes 8080."
}
|