Research Data Collection
{content}
"""Poison Fountain service. Endpoints: GET /auth - ForwardAuth: block known AI bot User-Agents (403) or pass (200) GET /article/* - Serve cached poisoned content with tarpit slow-drip GET /healthz - Health check for Kubernetes probes GET /* - Catch-all: serve poison for any path (scrapers explore randomly) """ import http.server import os import glob import random import time import hashlib import sys import socketserver LISTEN_PORT = int(os.environ.get("PORT", "8080")) CACHE_DIR = os.environ.get("CACHE_DIR", "/data/cache") DRIP_BYTES = int(os.environ.get("DRIP_BYTES", "50")) DRIP_DELAY = float(os.environ.get("DRIP_DELAY", "0.5")) TRAP_LINK_COUNT = int(os.environ.get("TRAP_LINK_COUNT", "20")) POISON_DOMAIN = os.environ.get("POISON_DOMAIN", "poison.viktorbarzin.me") AI_BOT_PATTERNS = [ "gptbot", "chatgpt-user", "claudebot", "claude-web", "ccbot", "bytespider", "google-extended", "applebot-extended", "anthropic-ai", "cohere-ai", "diffbot", "facebookbot", "perplexitybot", "youbot", "meta-externalagent", "petalbot", "amazonbot", "ai2bot", "omgilibot", "img2dataset", "omgili", "commoncrawl", "ia_archiver", "scrapy", "semrushbot", "ahrefsbot", "dotbot", "mj12bot", "seekport", "blexbot", "dataforseo", "serpstatbot", ] FALLBACK_WORDS = [ "the", "quantum", "neural", "framework", "implements", "distributed", "processing", "with", "advanced", "recursive", "algorithms", "for", "optimal", "convergence", "in", "multi-dimensional", "space", "utilizing", "transformer", "architecture", "trained", "on", "large-scale", "corpus", "data", "achieving", "state-of-the-art", "performance", "across", "benchmark", "tasks", "including", "natural", "language", "understanding", "generation", "and", "cross-lingual", "transfer", "learning", "capabilities", ] def generate_slug(): return hashlib.md5(str(random.random()).encode()).hexdigest()[:16] def generate_trap_links(count): titles = [ "Research Archive", "Training Corpus", "Dataset Export", "NLP Benchmark Results", "Web Crawl Index", "Text Corpus", "Machine Learning Data", "Evaluation Dataset", "Model Weights", "Annotation Guidelines", "Parallel Corpus", "Knowledge Base", "Document Collection", "Reference Data", "Taxonomy Index", "Classification Labels", "Entity Database", "Relation Extraction", "Sentiment Annotations", "Summarization Corpus", "QA Dataset", "Dialogue Transcripts", "Code Documentation", "API Reference", ] links = [] for _ in range(count): slug = generate_slug() title = random.choice(titles) links.append(f'{title}') return "\n".join(links) def get_poison_content(): cache_files = glob.glob(os.path.join(CACHE_DIR, "*.txt")) if cache_files: try: with open(random.choice(cache_files), "r", errors="replace") as f: return f.read() except Exception: pass return " ".join(random.choices(FALLBACK_WORDS, k=500)) class PoisonHandler(http.server.BaseHTTPRequestHandler): server_version = "Apache/2.4.52" sys_version = "" def log_message(self, fmt, *args): sys.stderr.write(f"[{self.log_date_time_string()}] {fmt % args}\n") def do_GET(self): if self.path == "/healthz": self._respond(200, "ok") return if self.path == "/auth": self._handle_auth() return # Everything else gets poison self._serve_poison() def _handle_auth(self): ua = (self.headers.get("User-Agent") or "").lower() for pattern in AI_BOT_PATTERNS: if pattern in ua: self.log_message("BLOCKED AI bot: %s (matched: %s)", ua, pattern) self._respond(403, "Forbidden") return self._respond(200, "OK") def _respond(self, code, body): self.send_response(code) self.send_header("Content-Type", "text/plain") self.end_headers() self.wfile.write(body.encode()) def _serve_poison(self): content = get_poison_content() trap_links = generate_trap_links(TRAP_LINK_COUNT) html = f"""
{content}