Research Data Collection
{content}
# Anti-AI Scraping System Implementation Plan

> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.

**Goal:** Deploy a 5-layer anti-AI scraping system that blocks known bots, injects hidden trap links into all HTML responses, serves poisoned content from Poison Fountain, and tarpits scrapers with slow-drip responses.

**Architecture:** A lightweight Python service handles bot detection (ForwardAuth) and poison content serving (tarpit). Traefik middlewares inject anti-AI headers and hidden trap links into all public service responses via ingress_factory defaults. A CronJob refreshes cached poison content from rnsaffn.com.

**Tech Stack:** Python 3 (stdlib http.server), Terraform/Terragrunt, Traefik middleware CRDs, Kubernetes CronJob

---

### Task 1: Create the Python poison service code

**Files:**
- Create: `stacks/poison-fountain/app/server.py`
- Create: `stacks/poison-fountain/app/fetch-poison.sh`

**Step 1: Create the service directory**

```bash
mkdir -p stacks/poison-fountain/app
```

**Step 2: Write `stacks/poison-fountain/app/server.py`**

```python
"""Poison Fountain service.
Endpoints:
  GET /auth       - ForwardAuth: block known AI bot User-Agents (403) or pass (200)
  GET /article/*  - Serve cached poisoned content with tarpit slow-drip
  GET /healthz    - Health check for Kubernetes probes
  GET /*          - Catch-all: serve poison for any path (scrapers explore randomly)
"""

import http.server
import os
import glob
import random
import time
import hashlib
import sys

# Runtime configuration, read once from the environment at import time.
# TCP port number; not referenced again in the visible part of the listing.
LISTEN_PORT = int(os.environ.get("PORT", "8080"))
# Directory that get_poison_content() scans for cached "*.txt" files.
CACHE_DIR = os.environ.get("CACHE_DIR", "/data/cache")
# NOTE(review): DRIP_BYTES / DRIP_DELAY are not used in the visible code;
# presumably they size and pace the slow-drip writes in _serve_poison -- confirm.
DRIP_BYTES = int(os.environ.get("DRIP_BYTES", "50"))
DRIP_DELAY = float(os.environ.get("DRIP_DELAY", "0.5"))
# Number of trap-link entries generated per poison response.
TRAP_LINK_COUNT = int(os.environ.get("TRAP_LINK_COUNT", "20"))
# NOTE(review): not referenced in the visible code; presumably the host used in
# trap-link URLs -- confirm against the complete listing.
POISON_DOMAIN = os.environ.get("POISON_DOMAIN", "poison.viktorbarzin.me")

# Lower-case substrings; _handle_auth rejects any request whose lower-cased
# User-Agent contains one of them.
AI_BOT_PATTERNS = [
    "gptbot", "chatgpt-user", "claudebot", "claude-web", "ccbot",
    "bytespider", "google-extended", "applebot-extended", "anthropic-ai",
    "cohere-ai", "diffbot", "facebookbot", "perplexitybot", "youbot",
    "meta-externalagent", "petalbot", "amazonbot", "ai2bot", "omgilibot",
    "img2dataset", "omgili", "commoncrawl", "ia_archiver", "scrapy",
    "semrushbot", "ahrefsbot", "dotbot", "mj12bot", "seekport", "blexbot",
    "dataforseo", "serpstatbot",
]

# Vocabulary for the filler text that get_poison_content() produces when no
# cache file could be read.
FALLBACK_WORDS = [
    "the", "quantum", "neural", "framework", "implements", "distributed",
    "processing", "with", "advanced", "recursive", "algorithms", "for",
    "optimal", "convergence", "in", "multi-dimensional", "space", "utilizing",
    "transformer", "architecture", "trained", "on", "large-scale", "corpus",
    "data", "achieving", "state-of-the-art", "performance", "across",
    "benchmark", "tasks", "including", "natural", "language", "understanding",
    "generation", "and", "cross-lingual", "transfer", "learning",
    "capabilities",
]


def generate_slug():
    """Return a random 16-character lowercase hex string.

    The value is the first 16 hex digits of the MD5 digest of
    ``str(random.random())``.
    """
    return hashlib.md5(str(random.random()).encode()).hexdigest()[:16]


def generate_trap_links(count):
    """Return ``count`` randomly chosen link titles joined by newlines.

    Args:
        count: Number of entries to generate.

    Returns:
        A single string with one entry per line (empty when ``count`` is 0).
    """
    titles = [
        "Research Archive", "Training Corpus", "Dataset Export",
        "NLP Benchmark Results", "Web Crawl Index", "Text Corpus",
        "Machine Learning Data", "Evaluation Dataset", "Model Weights",
        "Annotation Guidelines", "Parallel Corpus", "Knowledge Base",
        "Document Collection", "Reference Data", "Taxonomy Index",
        "Classification Labels", "Entity Database", "Relation Extraction",
        "Sentiment Annotations", "Summarization Corpus", "QA Dataset",
        "Dialogue Transcripts", "Code Documentation", "API Reference",
    ]
    links = []
    for _ in range(count):
        # NOTE(review): slug is generated but never appears in the emitted
        # string; the surrounding link markup looks to have been lost when
        # this listing was extracted -- confirm against the real server.py.
        slug = generate_slug()
        title = random.choice(titles)
        links.append(f'{title}')
    return "\n".join(links)


def get_poison_content():
    """Return the text of a random cached file, or generated filler text.

    Picks one ``*.txt`` file from CACHE_DIR at random and returns its contents,
    with undecodable bytes replaced. If there are no such files, or reading
    fails for any reason, returns 500 words drawn at random (with repetition)
    from FALLBACK_WORDS, separated by single spaces.
    """
    cache_files = glob.glob(os.path.join(CACHE_DIR, "*.txt"))
    if cache_files:
        try:
            with open(random.choice(cache_files), "r", errors="replace") as f:
                return f.read()
        except Exception:
            # Best effort: any read failure falls through to the filler text.
            pass
    return " ".join(random.choices(FALLBACK_WORDS, k=500))


class PoisonHandler(http.server.BaseHTTPRequestHandler):
    """HTTP GET handler for the health check, the bot check and poison pages."""

    # BaseHTTPRequestHandler builds the Server response header from these two
    # attributes, so responses advertise this Apache string rather than the
    # default Python server identification.
    server_version = "Apache/2.4.52"
    sys_version = ""

    def log_message(self, fmt, *args):
        """Write one timestamped log line to stderr."""
        sys.stderr.write(f"[{self.log_date_time_string()}] {fmt % args}\n")

    def do_GET(self):
        """Dispatch a GET request by exact path match.

        ``/healthz`` and ``/auth`` are compared against the full request
        path, so any other value (including those paths with a query string
        appended) is served poison.
        """
        if self.path == "/healthz":
            self._respond(200, "ok")
            return
        if self.path == "/auth":
            self._handle_auth()
            return
        # Everything else gets poison
        self._serve_poison()

    def _handle_auth(self):
        """Answer 403 if the User-Agent matches a known bot pattern, else 200.

        Matching is a case-insensitive substring test against
        AI_BOT_PATTERNS; a missing User-Agent header is treated as an empty
        string and therefore passes.
        """
        ua = (self.headers.get("User-Agent") or "").lower()
        for pattern in AI_BOT_PATTERNS:
            if pattern in ua:
                self.log_message("BLOCKED AI bot: %s (matched: %s)", ua, pattern)
                self._respond(403, "Forbidden")
                return
        self._respond(200, "OK")

    def _respond(self, code, body):
        """Send a complete ``text/plain`` response.

        Args:
            code: HTTP status code.
            body: Response body as ``str``; encoded with the default codec.
        """
        self.send_response(code)
        self.send_header("Content-Type", "text/plain")
        self.end_headers()
        self.wfile.write(body.encode())

    def _serve_poison(self):
        """Build and send a poison page (the listing is cut off below)."""
        content = get_poison_content()
        trap_links = generate_trap_links(TRAP_LINK_COUNT)
        # The page template starts here; the rest of it lies beyond this span.
        html = f"""
{content}