anubis: re-protect f1 with a per-host policy that allows JSON routes

Earlier f1 revert left the host fully unprotected (no Anubis, exclude_crowdsec=true on the ingress already). Re-add Anubis with a custom policy_yaml that: - ALLOWs /_app/* (SvelteKit immutable JS/CSS chunks loaded before any cookie exists), /openapi.json, /docs, /api/* (FastAPI meta). - ALLOWs the 9 known JSON/proxy routes (schedule, streams, embed, embed-asset, extract, extractors, health, proxy, relay) so the SvelteKit SPA's XHRs return JSON instead of the challenge HTML. - Catch-all CHALLENGE for everything else — the SPA HTML pages (which fall through to FastAPI's `/{path}` catch-all) get the PoW gate. The ALLOWed JSON routes are technically scrapeable by a determined bot, but the user's stated goal is "avoid accidental scrapes" — the HTML/SPA is the AI-training target, and that stays gated. Verified: / → Anubis challenge HTML; /schedule, /streams → JSON; /_app/.../app.js → text/javascript; ClaudeBot UA → Anubis deny page.
2026-05-10 01:24:50 +00:00 · 2026-05-10 01:24:50 +00:00 · 04cb22fd3b
commit 04cb22fd3b
parent a89d4a7d2a
1 changed files with 43 additions and 5 deletions
--- a/stacks/f1-stream/main.tf
+++ b/stacks/f1-stream/main.tf
@@ -228,18 +228,56 @@ module "tls_secret" {
 }


-# NOTE: f1-stream serves its SPA + JSON data endpoints (/schedule, /embed,
-# /embed-asset, …) all on the same path tree, so putting Anubis in front
-# breaks XHR data fetches with "Unexpected token '<', '<!doctype '" — the
-# challenge HTML lands where JSON is expected. Anubis is removed for f1
-# until/unless we add a /api carve-out the way wrongmove does.
+# f1-stream serves its SvelteKit SPA via the FastAPI `/{path}` catch-all
+# and exposes 9 JSON/proxy routes at root (/schedule, /streams, /embed,
+# /embed-asset, /relay, /proxy, /extract, /extractors, /health). A flat
+# Anubis catch-all CHALLENGE breaks the SPA's XHRs with "Unexpected token
+# '<', '<!doctype '" because the schedule fetch lands on the challenge HTML.
+# Custom policy: ALLOW the known JSON routes + SvelteKit `_app/` assets
+# (which load before any user has a chance to solve PoW), CHALLENGE
+# everything else — the HTML pages.
+module "anubis" {
+  source     = "../../modules/kubernetes/anubis_instance"
+  name       = "f1"
+  namespace  = kubernetes_namespace.f1-stream.metadata[0].name
+  target_url = "http://${kubernetes_service.f1-stream.metadata[0].name}.${kubernetes_namespace.f1-stream.metadata[0].name}.svc.cluster.local"
+  policy_yaml = <<-EOT
+    bots:
+      - import: (data)/bots/_deny-pathological.yaml
+      - import: (data)/bots/aggressive-brazilian-scrapers.yaml
+      - import: (data)/meta/ai-block-aggressive.yaml
+      - import: (data)/crawlers/_allow-good.yaml
+      - import: (data)/clients/x-firefox-ai.yaml
+      - import: (data)/common/keep-internet-working.yaml
+      # SvelteKit immutable assets under /_app/ and FastAPI meta routes (/openapi.json, /docs, /api/*) —
+      # served pre-cookie, must pass unchallenged. openapi.json/docs are end-anchored so /docsfoo stays gated.
+      - name: f1-svelte-assets-and-meta
+        path_regex: ^/(_app/|api/|(openapi\.json|docs)(/|$))
+        action: ALLOW
+      # Application JSON routes — XHR'd by the SPA after the user has solved
+      # the PoW for `/`. We allow them unconditionally because the alternative
+      # (carve-out per route via separate Ingress objects) is brittle and
+      # because the data they expose (stream URLs, schedule metadata) is not
+      # the AI-scraping target — the HTML/SPA is.
+      - name: f1-data-routes
+        path_regex: ^/(embed|embed-asset|extract|extractors|health|proxy|relay|schedule|streams)(/|\?|$)
+        action: ALLOW
+      - name: catchall-challenge
+        path_regex: .*
+        action: CHALLENGE
+  EOT
+}
+
 module "ingress" {
  source           = "../../modules/kubernetes/ingress_factory"
  dns_type         = "non-proxied"
  namespace        = kubernetes_namespace.f1-stream.metadata[0].name
  name             = "f1"
+  service_name     = module.anubis.service_name
+  port             = module.anubis.service_port
  tls_secret_name  = var.tls_secret_name
  exclude_crowdsec = true
+  anti_ai_scraping = false
  extra_annotations = {
    "gethomepage.dev/enabled"      = "true"
    "gethomepage.dev/name"         = "F1 Stream"