From 3da24fdf7a2fe7b3b7bf690691fbd112b5a5f26f Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 22:48:04 +0000 Subject: [PATCH] extractor: preextract PDF text with pdftotext before calling Claude MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this, each extraction took 5-10 minutes because the base64'd PDF expanded to ~300KB of prompt tokens. poppler-utils ships pdftotext which turns a 200KB PDF into ~3KB of plain text in milliseconds. Claude (Haiku) then processes the text in seconds. - Dockerfile installs poppler-utils in the runtime stage (one-liner). - _build_prompt() tries pdftotext -layout first; falls back to base64 if pdftotext is missing (local dev) or the PDF is unreadable (scanned image). - Agent file documents the PAYSLIP_TEXT fast path — still handles PDF_BASE64 for fallback. Co-Authored-By: Claude Opus 4.7 (1M context) --- Dockerfile | 4 ++++ payslip_ingest/extractor.py | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index bd6aa0b..2244b2a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,6 +20,10 @@ FROM python:3.12-slim WORKDIR /app +RUN apt-get update \ + && apt-get install -y --no-install-recommends poppler-utils \ + && rm -rf /var/lib/apt/lists/* + RUN useradd --system --uid 10001 --home /app --shell /usr/sbin/nologin payslip COPY --from=builder --chown=payslip:payslip /app /app diff --git a/payslip_ingest/extractor.py b/payslip_ingest/extractor.py index 3c36d32..2275a49 100644 --- a/payslip_ingest/extractor.py +++ b/payslip_ingest/extractor.py @@ -1,6 +1,7 @@ import asyncio import base64 import json +import shutil from typing import Any import httpx @@ -10,6 +11,8 @@ from payslip_ingest.schema import ExtractedPayslip AGENT_PATH = ".claude/agents/payslip-extractor" +PDFTOTEXT_PATH = shutil.which("pdftotext") + EXTRACTION_PROMPT = ( "You are extracting fields from a UK payslip PDF. "
"Return ONLY a single JSON object " "matching this exact schema — no prose, no markdown fences.\n" @@ -53,6 +56,33 @@ class ExtractorError(RuntimeError): pass +def _build_prompt(pdf_bytes: bytes) -> str: + """Shrink the prompt: prefer pdftotext output over raw base64. + + Base64 of a 200KB PDF expands to ~270KB of tokens, which makes even Haiku + take 5-10 minutes per extraction. pdftotext normally yields 2-5KB of clean + text that Claude processes in seconds. We ship the PDF bytes as a fallback + only when pdftotext isn't available or fails (scanned-image PDFs, etc.). + """ + if PDFTOTEXT_PATH: + try: + import subprocess + proc = subprocess.run( + [PDFTOTEXT_PATH, "-layout", "-enc", "UTF-8", "-", "-"], + input=pdf_bytes, + capture_output=True, + timeout=30, + check=False, + ) + text = proc.stdout.decode("utf-8", errors="replace").strip() + if text: + return f"{EXTRACTION_PROMPT}\n\nPAYSLIP_TEXT:\n{text}\n" + except (subprocess.SubprocessError, OSError): + pass + encoded = base64.b64encode(pdf_bytes).decode("ascii") + return f"{EXTRACTION_PROMPT}\n\nPDF_BASE64:\n{encoded}\n" + + class ClaudeExtractor: """Calls claude-agent-service to extract structured fields from a payslip PDF. @@ -91,8 +121,7 @@ class ClaudeExtractor: raise ExtractorError(f"Extracted payload failed schema validation: {exc}") from exc async def _submit_job(self, pdf_bytes: bytes, doc_metadata: dict[str, Any]) -> str: - encoded = base64.b64encode(pdf_bytes).decode("ascii") - prompt = f"{EXTRACTION_PROMPT}\n\nPDF_BASE64:\n{encoded}\n" + prompt = _build_prompt(pdf_bytes) body = { "prompt": prompt, "agent": AGENT_PATH,