From 3da01e6e1e6ef0c1921b0f48cd841f32139cf852 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Sun, 10 May 2026 14:55:50 +0000
Subject: [PATCH] anubis: only challenge GET requests; allow everything else
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PrivateBin's XHR `POST /` (paste creation) was the trigger — Anubis's
catch-all CHALLENGE rule served an HTML challenge page where the JS
expected JSON, breaking paste creation entirely. The same failure will
hit any SPA XHR or CORS preflight on the other 8 Anubis-fronted sites
(homepage actions, kms upload-then-poll, wrongmove search refresh,
jsoncrack share, etc.) the moment one of those requests is exercised.

Add an `ALLOW` rule keyed on `method != "GET"` between the AI/UA-block
imports and the catch-all CHALLENGE. Rationale:

  * AI scrapers consume GET response bodies — they don't POST.
  * State-mutating XHRs and OPTIONS preflight need to bypass the
    challenge or the app breaks.
  * CrowdSec + per-route rate-limit + app-level auth already cover
    abuse on mutating methods, so this gives up nothing.
  * Hard-deny rules for known-bad bots run first, so a declared bad
    bot can't sneak through by sending a POST.

Also add a `checksum/policy` annotation on the Anubis pod template, set
to `sha256(coalesce(var.policy_yaml, local.default_policy_yaml))`, so
future policy changes auto-roll the deployment instead of needing a
manual `kubectl rollout restart`.

f1-stream has its own policy override (path carve-outs for SvelteKit
asset hashes and JSON data routes), so mirror the new rule there too.

Applied to all 8 Anubis-fronted stacks: blog, kms, f1-stream,
travel_blog, real-estate-crawler, homepage, cyberchef, jsoncrack.
Verified per stack: GET / returns the Anubis challenge page; POST,
PUT, DELETE, OPTIONS pass through to the backend (HTTP 301/405/502
from the upstream app, never the Anubis "not a bot" HTML).
---
 modules/kubernetes/anubis_instance/main.tf | 25 ++++++++++++++++++----
 stacks/f1-stream/main.tf                   |  5 +++++
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/modules/kubernetes/anubis_instance/main.tf b/modules/kubernetes/anubis_instance/main.tf
index 55129bbf..a2572f2e 100644
--- a/modules/kubernetes/anubis_instance/main.tf
+++ b/modules/kubernetes/anubis_instance/main.tf
@@ -95,7 +95,8 @@ locals {
   # capability is filtered.
   default_policy_yaml = <<-EOT
     bots:
-      # Hard-deny known-bad bots first.
+      # Hard-deny known-bad bots first — runs before the method bypass so
+      # a declared bad bot can't sneak through by sending a POST.
       - import: (data)/bots/_deny-pathological.yaml
       - import: (data)/bots/aggressive-brazilian-scrapers.yaml
       # Hard-deny declared AI/LLM crawlers (ClaudeBot, GPTBot, Bytespider, …).
@@ -107,9 +108,19 @@ locals {
       # Allow /.well-known, /robots.txt, /favicon.*, /sitemap.xml — keeps
       # the internet working for benign crawlers and discovery clients.
       - import: (data)/common/keep-internet-working.yaml
-      # Catch-all: every remaining request must solve the challenge. This
-      # closes the "unmatched UA falls through to ALLOW" gap that lets
-      # curl/wget/Python-requests scrape non-CDN-fronted hosts.
+      # Allow every non-GET request through. Rationale: AI scrapers steal
+      # the body of GETs (page content) — they don't POST. State-mutating
+      # methods come from app XHRs (PrivateBin paste creation, Komga
+      # uploads, SPA actions) and CORS preflight (OPTIONS). Challenging
+      # those breaks the app, because the JS expects JSON and gets the
+      # Anubis HTML challenge page. CrowdSec + rate-limit + per-app auth
+      # already cover abuse on these methods.
+      - name: allow-non-get-methods
+        action: ALLOW
+        expression: method != "GET"
+      # Catch-all: every remaining (GET) request must solve the challenge.
+      # This closes the "unmatched UA falls through to ALLOW" gap that
+      # lets curl/wget/Python-requests scrape non-CDN-fronted hosts.
       - name: catchall-challenge
         path_regex: .*
         action: CHALLENGE
@@ -185,6 +196,12 @@ resource "kubernetes_deployment" "anubis" {
     template {
       metadata {
         labels = local.labels
+        annotations = {
+          # Roll the deployment whenever the policy YAML changes — Anubis
+          # reads the policy at startup, so a ConfigMap update alone
+          # doesn't take effect until pods restart.
+          "checksum/policy" = sha256(coalesce(var.policy_yaml, local.default_policy_yaml))
+        }
       }
 
       spec {
diff --git a/stacks/f1-stream/main.tf b/stacks/f1-stream/main.tf
index f219bd79..42063108 100644
--- a/stacks/f1-stream/main.tf
+++ b/stacks/f1-stream/main.tf
@@ -262,6 +262,11 @@ module "anubis" {
       - name: f1-data-routes
         path_regex: ^/(embed|embed-asset|extract|extractors|health|proxy|relay|schedule|streams)(/|\?|$)
         action: ALLOW
+      # Allow non-GET methods unconditionally — AI scrapers GET the body,
+      # they don't POST. Mutating XHRs and CORS preflight need to bypass.
+      - name: allow-non-get-methods
+        action: ALLOW
+        expression: method != "GET"
       - name: catchall-challenge
         path_regex: .*
         action: CHALLENGE