From 3da01e6e1e6ef0c1921b0f48cd841f32139cf852 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 10 May 2026 14:55:50 +0000 Subject: [PATCH] anubis: only challenge GET requests; allow everything else MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PrivateBin's XHR `POST /` (paste creation) was the trigger — Anubis's catch-all CHALLENGE rule served an HTML challenge page where the JS expected JSON, breaking paste creation entirely. The same failure will hit any SPA XHR or CORS preflight on the other 8 Anubis-fronted sites (homepage actions, kms upload-then-poll, wrongmove search refresh, jsoncrack share, etc.) the moment such a request is exercised. Add an `ALLOW` rule keyed on `method != "GET"` between the AI/UA-block imports and the catch-all CHALLENGE. Rationale: * AI scrapers consume GET response bodies — they don't POST. * State-mutating XHRs and OPTIONS preflight need to bypass the challenge or the app breaks. * CrowdSec + per-route rate-limit + app-level auth already cover abuse on mutating methods, so this gives up nothing. * Hard-deny rules for known-bad bots run first, so a declared bad bot can't sneak through by sending a POST. Also add a `checksum/policy` annotation on the Anubis pod template, sourced from `sha256(coalesce(var.policy_yaml, local.default_policy_yaml))`, so future policy changes auto-roll the deployment instead of needing a manual `kubectl rollout restart`. f1-stream has its own policy override (path carve-outs for SvelteKit asset hashes and JSON data routes); the new rule is mirrored there too. Applied to all 8 Anubis-fronted stacks: blog, kms, f1-stream, travel_blog, real-estate-crawler, homepage, cyberchef, jsoncrack. Verified per stack: GET / returns the Anubis challenge page; POST, PUT, DELETE, OPTIONS pass through to the backend (HTTP 301/405/502 from the upstream app, never the Anubis "not a bot" HTML). 
--- modules/kubernetes/anubis_instance/main.tf | 25 ++++++++++++++++++---- stacks/f1-stream/main.tf | 5 +++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/modules/kubernetes/anubis_instance/main.tf b/modules/kubernetes/anubis_instance/main.tf index 55129bbf..a2572f2e 100644 --- a/modules/kubernetes/anubis_instance/main.tf +++ b/modules/kubernetes/anubis_instance/main.tf @@ -95,7 +95,8 @@ locals { # capability is filtered. default_policy_yaml = <<-EOT bots: - # Hard-deny known-bad bots first. + # Hard-deny known-bad bots first — runs before the method bypass so + # a declared bad bot can't sneak through by sending a POST. - import: (data)/bots/_deny-pathological.yaml - import: (data)/bots/aggressive-brazilian-scrapers.yaml # Hard-deny declared AI/LLM crawlers (ClaudeBot, GPTBot, Bytespider, …). @@ -107,9 +108,19 @@ locals { # Allow /.well-known, /robots.txt, /favicon.*, /sitemap.xml — keeps # the internet working for benign crawlers and discovery clients. - import: (data)/common/keep-internet-working.yaml - # Catch-all: every remaining request must solve the challenge. This - # closes the "unmatched UA falls through to ALLOW" gap that lets - # curl/wget/Python-requests scrape non-CDN-fronted hosts. + # Allow every non-GET request through. Rationale: AI scrapers steal + # the body of GETs (page content) — they don't POST. State-mutating + # methods come from app XHRs (PrivateBin paste creation, Komga + # uploads, SPA actions) and CORS preflight (OPTIONS). Challenging + # those breaks the app, because the JS expects JSON and gets the + # Anubis HTML challenge page. CrowdSec + rate-limit + per-app auth + # already cover abuse on these methods. + - name: allow-non-get-methods + action: ALLOW + expression: method != "GET" + # Catch-all: every remaining (GET) request must solve the challenge. + # This closes the "unmatched UA falls through to ALLOW" gap that + # lets curl/wget/Python-requests scrape non-CDN-fronted hosts. 
- name: catchall-challenge path_regex: .* action: CHALLENGE @@ -185,6 +196,12 @@ resource "kubernetes_deployment" "anubis" { template { metadata { labels = local.labels + annotations = { + # Roll the deployment whenever the policy YAML changes — Anubis + # reads the policy at startup, so a ConfigMap update alone + # doesn't take effect until pods restart. + "checksum/policy" = sha256(coalesce(var.policy_yaml, local.default_policy_yaml)) + } } spec { diff --git a/stacks/f1-stream/main.tf b/stacks/f1-stream/main.tf index f219bd79..42063108 100644 --- a/stacks/f1-stream/main.tf +++ b/stacks/f1-stream/main.tf @@ -262,6 +262,11 @@ module "anubis" { - name: f1-data-routes path_regex: ^/(embed|embed-asset|extract|extractors|health|proxy|relay|schedule|streams)(/|\?|$) action: ALLOW + # Allow non-GET methods unconditionally — AI scrapers GET the body, + # they don't POST. Mutating XHRs and CORS preflight need to bypass. + - name: allow-non-get-methods + action: ALLOW + expression: method != "GET" - name: catchall-challenge path_regex: .* action: CHALLENGE