anubis: re-protect f1 with a per-host policy that allows JSON routes
Earlier f1 revert left the host fully unprotected (no Anubis,
exclude_crowdsec=true on the ingress already). Re-add Anubis with
a custom policy_yaml that:
- ALLOWs /_app/* (SvelteKit immutable JS/CSS chunks loaded before
any cookie exists), /openapi.json, /docs, /api/* (FastAPI meta).
- ALLOWs the 9 known JSON/proxy routes (schedule, streams,
embed, embed-asset, extract, extractors, health, proxy, relay)
so the SvelteKit SPA's XHRs return JSON instead of the challenge
HTML.
- Catch-all CHALLENGE for everything else — the SPA HTML pages
(which fall through to FastAPI's `/{path}` catch-all) get the
PoW gate.
The ALLOWed JSON routes are technically scrapeable by a determined
bot, but the user's stated goal is "avoid accidental scrapes" — the
HTML/SPA is the AI-training target, and that stays gated.
Verified: / → Anubis challenge HTML; /schedule, /streams → JSON;
/_app/.../app.js → text/javascript; ClaudeBot UA → Anubis deny page.
This commit is contained in:
parent
a89d4a7d2a
commit
04cb22fd3b
1 changed file with 43 additions and 5 deletions
|
|
@ -228,18 +228,56 @@ module "tls_secret" {
|
|||
}
|
||||
|
||||
|
||||
# NOTE: f1-stream serves its SPA + JSON data endpoints (/schedule, /embed,
|
||||
# /embed-asset, …) all on the same path tree, so putting Anubis in front
|
||||
# breaks XHR data fetches with "Unexpected token '<', '<!doctype '" — the
|
||||
# challenge HTML lands where JSON is expected. Anubis is removed for f1
|
||||
# until/unless we add a /api carve-out the way wrongmove does.
|
||||
# f1-stream serves its SvelteKit SPA via the FastAPI `/{path}` catch-all
# and exposes 9 JSON/proxy routes at root (/schedule, /streams, /embed,
# /embed-asset, /relay, /proxy, /extract, /extractors, /health). A flat
# Anubis catch-all CHALLENGE breaks the SPA's XHRs with "Unexpected token
# '<', '<!doctype '" because the schedule fetch lands on the challenge HTML.
# Custom policy: ALLOW the known JSON routes + SvelteKit `_app/` assets
# (which load before any user has a chance to solve PoW), CHALLENGE
# everything else — the HTML pages.
module "anubis" {
  source = "../../modules/kubernetes/anubis_instance"

  name      = "f1"
  namespace = kubernetes_namespace.f1-stream.metadata[0].name

  # Upstream the challenge proxy forwards verified traffic to (in-cluster
  # service DNS for the f1-stream backend).
  target_url = "http://${kubernetes_service.f1-stream.metadata[0].name}.${kubernetes_namespace.f1-stream.metadata[0].name}.svc.cluster.local"

  # Rules are evaluated top-to-bottom; first match wins, so the explicit
  # ALLOW rules must precede the catch-all CHALLENGE.
  policy_yaml = <<-EOT
    bots:
      - import: (data)/bots/_deny-pathological.yaml
      - import: (data)/bots/aggressive-brazilian-scrapers.yaml
      - import: (data)/meta/ai-block-aggressive.yaml
      - import: (data)/crawlers/_allow-good.yaml
      - import: (data)/clients/x-firefox-ai.yaml
      - import: (data)/common/keep-internet-working.yaml
      # SvelteKit immutable assets (CSS/JS chunks) and OpenAPI/health routes —
      # served pre-cookie, must pass without challenge.
      - name: f1-svelte-assets-and-meta
        path_regex: ^/(_app/|openapi\.json|docs|api/)
        action: ALLOW
      # Application JSON routes — XHR'd by the SPA after the user has solved
      # the PoW for `/`. We allow them unconditionally because the alternative
      # (carve-out per route via separate Ingress objects) is brittle and
      # because the data they expose (stream URLs, schedule metadata) is not
      # the AI-scraping target — the HTML/SPA is.
      - name: f1-data-routes
        path_regex: ^/(embed|embed-asset|extract|extractors|health|proxy|relay|schedule|streams)(/|\?|$)
        action: ALLOW
      # Everything else — the SPA HTML pages served by the FastAPI
      # `/{path}` catch-all — gets the proof-of-work gate.
      - name: catchall-challenge
        path_regex: .*
        action: CHALLENGE
  EOT
}
|
||||
|
||||
module "ingress" {
|
||||
source = "../../modules/kubernetes/ingress_factory"
|
||||
dns_type = "non-proxied"
|
||||
namespace = kubernetes_namespace.f1-stream.metadata[0].name
|
||||
name = "f1"
|
||||
service_name = module.anubis.service_name
|
||||
port = module.anubis.service_port
|
||||
tls_secret_name = var.tls_secret_name
|
||||
exclude_crowdsec = true
|
||||
anti_ai_scraping = false
|
||||
extra_annotations = {
|
||||
"gethomepage.dev/enabled" = "true"
|
||||
"gethomepage.dev/name" = "F1 Stream"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue