anubis: strict bot policy — catch-all CHALLENGE for unmatched UAs
The default upstream policy only WEIGHs Mozilla|Opera UAs and lets everything else (curl, wget, python-requests, scrapy, headless CLI scrapers) fall through to the implicit ALLOW. On non-CDN-fronted hosts (kms, or anything with dns_type=non-proxied) this meant a plain `curl https://kms.viktorbarzin.me/` returned the real backend content with no challenge, defeating the whole point of the "avoid casual scrapers" intent.

Now the module ships a custom policy file, referenced by POLICY_FNAME and mounted via a ConfigMap, that:
- Imports the upstream deny-pathological / ai-block-aggressive / allow-good-crawlers / keep-internet-working snippets unchanged
- Adds a final `path_regex: .*` → `action: CHALLENGE` catch-all

Result: only IP-verified search engines (Googlebot from Google IPs, Bingbot, etc.) and well-known paths (robots.txt, .well-known, favicon, sitemap) skip the challenge. Everything else, including a spoofed Googlebot UA from a random IP, must solve the PoW or gets nothing.

Verified post-apply: curl with its default UA against viktorbarzin.me, kms, and travel returns the Anubis challenge HTML; /robots.txt still returns 200 straight through.
This commit is contained in:
parent
c73cd26a73
commit
12fbc404ec
1 changed files with 60 additions and 0 deletions
|
|
@ -66,6 +66,12 @@ variable "memory" {
|
|||
description = "requests==limits memory. Anubis docs suggest 128Mi handles many concurrent clients."
|
||||
}
|
||||
|
||||
# Optional full replacement for the bot-policy YAML that this module writes
# into the policy ConfigMap. When left null, the module falls back to its
# built-in strict policy (local.default_policy_yaml).
variable "policy_yaml" {
|
||||
type = string
|
||||
default = null
|
||||
description = "Override the strict default bot-policy YAML. Leave null to use the catch-all CHALLENGE policy."
|
||||
}
|
||||
|
||||
variable "cpu_request" {
|
||||
type = string
|
||||
default = "20m"
|
||||
|
|
@ -81,6 +87,45 @@ locals {
|
|||
"app.kubernetes.io/component" = "ai-bot-challenge"
|
||||
"app.kubernetes.io/managed-by" = "terraform"
|
||||
}
|
||||
|
||||
# Strict bot policy. Default Anubis policy only WEIGHs Mozilla|Opera UAs
|
||||
# and lets unmatched UAs (curl, wget, Python-requests, scrapy, headless
|
||||
# CLI scrapers) fall through to ALLOW. We import the same upstream
|
||||
# snippets and append a catch-all CHALLENGE so anyone without JS+PoW
|
||||
# capability is filtered.
|
||||
default_policy_yaml = <<-EOT
|
||||
bots:
|
||||
# Hard-deny known-bad bots first.
|
||||
- import: (data)/bots/_deny-pathological.yaml
|
||||
- import: (data)/bots/aggressive-brazilian-scrapers.yaml
|
||||
# Hard-deny declared AI/LLM crawlers (ClaudeBot, GPTBot, Bytespider, …).
|
||||
- import: (data)/meta/ai-block-aggressive.yaml
|
||||
# Whitelist legitimate search-engine crawlers (Googlebot, Bingbot, …).
|
||||
- import: (data)/crawlers/_allow-good.yaml
|
||||
# Challenge Firefox AI previews specifically.
|
||||
- import: (data)/clients/x-firefox-ai.yaml
|
||||
# Allow /.well-known, /robots.txt, /favicon.*, /sitemap.xml — keeps
|
||||
# the internet working for benign crawlers and discovery clients.
|
||||
- import: (data)/common/keep-internet-working.yaml
|
||||
# Catch-all: every remaining request must solve the challenge. This
|
||||
# closes the "unmatched UA falls through to ALLOW" gap that lets
|
||||
# curl/wget/Python-requests scrape non-CDN-fronted hosts.
|
||||
- name: catchall-challenge
|
||||
path_regex: .*
|
||||
action: CHALLENGE
|
||||
EOT
|
||||
}
|
||||
|
||||
# Bot policy ConfigMap. Mounted into the pod and referenced by POLICY_FNAME.
# The single key "botPolicies.yaml" becomes the file /config/botPolicies.yaml
# once the ConfigMap volume is mounted at /config in the anubis deployment,
# which is the path POLICY_FNAME is set to.
|
||||
resource "kubernetes_config_map" "policy" {
|
||||
metadata {
|
||||
name = "${local.full_name}-policy"
|
||||
namespace = var.namespace
|
||||
labels = local.labels
|
||||
}
|
||||
data = {
|
||||
# Caller-supplied policy wins; coalesce() skips a null (or empty-string)
# var.policy_yaml and falls back to the module's strict default policy.
"botPolicies.yaml" = coalesce(var.policy_yaml, local.default_policy_yaml)
|
||||
}
|
||||
}
|
||||
|
||||
# ED25519 signing key — pulled from Vault `secret/viktor` -> field
|
||||
|
|
@ -222,12 +267,21 @@ resource "kubernetes_deployment" "anubis" {
|
|||
# Mounted from the ESO-managed Secret below.
|
||||
value = "/keys/key"
|
||||
}
|
||||
env {
|
||||
name = "POLICY_FNAME"
|
||||
value = "/config/botPolicies.yaml"
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
name = "ed25519-key"
|
||||
mount_path = "/keys"
|
||||
read_only = true
|
||||
}
|
||||
volume_mount {
|
||||
name = "policy"
|
||||
mount_path = "/config"
|
||||
read_only = true
|
||||
}
|
||||
|
||||
resources {
|
||||
requests = {
|
||||
|
|
@ -281,6 +335,12 @@ resource "kubernetes_deployment" "anubis" {
|
|||
}
|
||||
}
|
||||
}
|
||||
volume {
|
||||
name = "policy"
|
||||
config_map {
|
||||
name = kubernetes_config_map.policy.metadata[0].name
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue