anubis: strict bot policy — catch-all CHALLENGE for unmatched UAs

The default upstream policy only WEIGHs Mozilla|Opera UAs and lets
everything else (curl, wget, python-requests, scrapy, headless CLI
scrapers) fall through to the implicit ALLOW. On non-CDN-fronted
hosts (kms, anything dns_type=non-proxied) this meant a plain
`curl https://kms.viktorbarzin.me/` returned the real backend
content with no challenge — defeating the whole point of the
"avoid casual scrapers" intent.

Now the module ships a custom POLICY_FNAME mounted via ConfigMap:
- Imports the upstream deny-pathological / ai-block-aggressive /
  allow-good-crawlers / keep-internet-working snippets unchanged
- Adds a final `path_regex: .*` → action: CHALLENGE catch-all

Result: only IP-verified search engines (Googlebot from Google IPs,
Bingbot, etc.) and well-known paths (robots.txt, .well-known,
favicon, sitemap) skip the challenge. Everything else — including
spoofed-Googlebot-UA-from-random-IP — solves PoW or gets nothing.

Verified post-apply: curl default UA on viktorbarzin.me + kms +
travel returns the Anubis challenge HTML; /robots.txt still 200s
straight through.
This commit is contained in:
Viktor Barzin 2026-05-10 00:21:56 +00:00
parent c73cd26a73
commit 12fbc404ec

View file

@ -66,6 +66,12 @@ variable "memory" {
description = "requests==limits memory. Anubis docs suggest 128Mi handles many concurrent clients."
}
variable "policy_yaml" {
type = string
default = null
description = "Override the strict default bot-policy YAML. Leave null to use the catch-all CHALLENGE policy."
}
variable "cpu_request" {
type = string
default = "20m"
@ -81,6 +87,45 @@ locals {
"app.kubernetes.io/component" = "ai-bot-challenge"
"app.kubernetes.io/managed-by" = "terraform"
}
# Strict bot policy. Default Anubis policy only WEIGHs Mozilla|Opera UAs
# and lets unmatched UAs (curl, wget, Python-requests, scrapy, headless
# CLI scrapers) fall through to ALLOW. We import the same upstream
# snippets and append a catch-all CHALLENGE so anyone without JS+PoW
# capability is filtered.
default_policy_yaml = <<-EOT
bots:
# Hard-deny known-bad bots first.
- import: (data)/bots/_deny-pathological.yaml
- import: (data)/bots/aggressive-brazilian-scrapers.yaml
# Hard-deny declared AI/LLM crawlers (ClaudeBot, GPTBot, Bytespider, ).
- import: (data)/meta/ai-block-aggressive.yaml
# Whitelist legitimate search-engine crawlers (Googlebot, Bingbot, ).
- import: (data)/crawlers/_allow-good.yaml
# Challenge Firefox AI previews specifically.
- import: (data)/clients/x-firefox-ai.yaml
# Allow /.well-known, /robots.txt, /favicon.*, /sitemap.xml keeps
# the internet working for benign crawlers and discovery clients.
- import: (data)/common/keep-internet-working.yaml
# Catch-all: every remaining request must solve the challenge. This
# closes the "unmatched UA falls through to ALLOW" gap that lets
# curl/wget/Python-requests scrape non-CDN-fronted hosts.
- name: catchall-challenge
path_regex: .*
action: CHALLENGE
EOT
}
# Bot policy ConfigMap. Mounted into the pod and referenced by POLICY_FNAME.
resource "kubernetes_config_map" "policy" {
metadata {
name = "${local.full_name}-policy"
namespace = var.namespace
labels = local.labels
}
data = {
"botPolicies.yaml" = coalesce(var.policy_yaml, local.default_policy_yaml)
}
}
# ED25519 signing key pulled from Vault `secret/viktor` -> field
@ -222,12 +267,21 @@ resource "kubernetes_deployment" "anubis" {
# Mounted from the ESO-managed Secret below.
value = "/keys/key"
}
env {
name = "POLICY_FNAME"
value = "/config/botPolicies.yaml"
}
volume_mount {
name = "ed25519-key"
mount_path = "/keys"
read_only = true
}
volume_mount {
name = "policy"
mount_path = "/config"
read_only = true
}
resources {
requests = {
@ -281,6 +335,12 @@ resource "kubernetes_deployment" "anubis" {
}
}
}
volume {
name = "policy"
config_map {
name = kubernetes_config_map.policy.metadata[0].name
}
}
}
}
}