From 3da24fdf7a2fe7b3b7bf690691fbd112b5a5f26f Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 22:48:04 +0000 Subject: [PATCH] extractor: preextract PDF text with pdftotext before calling Claude MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this, each extraction took 5-10 minutes because the base64'd PDF expanded to ~300KB of prompt tokens. poppler-utils ships pdftotext which turns a 200KB PDF into ~3KB of plain text in milliseconds. Claude (Haiku) then processes the text in seconds. - Dockerfile installs poppler-utils in the runtime stage (one-liner). - _build_prompt() tries pdftotext -layout first; falls back to base64 if pdftotext is missing (local dev) or the PDF is unreadable (scanned image). - Agent file documents the PAYSLIP_TEXT fast path — still handles PDF_BASE64 for fallback. Co-Authored-By: Claude Opus 4.7 (1M context) --- Dockerfile | 4 ++++ payslip_ingest/extractor.py | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index bd6aa0b..2244b2a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,6 +20,10 @@ FROM python:3.12-slim WORKDIR /app +RUN apt-get update \ + && apt-get install -y --no-install-recommends poppler-utils \ + && rm -rf /var/lib/apt/lists/* + RUN useradd --system --uid 10001 --home /app --shell /usr/sbin/nologin payslip COPY --from=builder --chown=payslip:payslip /app /app diff --git a/payslip_ingest/extractor.py b/payslip_ingest/extractor.py index 3c36d32..2275a49 100644 --- a/payslip_ingest/extractor.py +++ b/payslip_ingest/extractor.py @@ -1,6 +1,7 @@ import asyncio import base64 import json +import shutil from typing import Any import httpx @@ -10,6 +11,8 @@ from payslip_ingest.schema import ExtractedPayslip AGENT_PATH = ".claude/agents/payslip-extractor" +PDFTOTEXT_PATH = shutil.which("pdftotext") + EXTRACTION_PROMPT = ( "You are extracting fields from a UK payslip PDF. "
"Return ONLY a single JSON object " "matching this exact schema — no prose, no markdown fences.\n" @@ -53,6 +56,33 @@ class ExtractorError(RuntimeError): pass +def _build_prompt(pdf_bytes: bytes) -> str: + """Shrink the prompt: prefer pdftotext output over raw base64. + + Base64 of a 200KB PDF expands to ~270KB of tokens, which makes even Haiku + take 5-10 minutes per extraction. pdftotext normally yields 2-5KB of clean + text that Claude processes in seconds. We ship the PDF bytes as a fallback + only when pdftotext isn't available or fails (scanned-image PDFs, etc.). + """ + if PDFTOTEXT_PATH: + try: + import subprocess + proc = subprocess.run( + [PDFTOTEXT_PATH, "-layout", "-enc", "UTF-8", "-", "-"], + input=pdf_bytes, + capture_output=True, + timeout=30, + check=False, + ) + text = proc.stdout.decode("utf-8", errors="replace").strip() + if text: + return f"{EXTRACTION_PROMPT}\n\nPAYSLIP_TEXT:\n{text}\n" + except (subprocess.SubprocessError, OSError): + pass + encoded = base64.b64encode(pdf_bytes).decode("ascii") + return f"{EXTRACTION_PROMPT}\n\nPDF_BASE64:\n{encoded}\n" + + class ClaudeExtractor: """Calls claude-agent-service to extract structured fields from a payslip PDF. @@ -91,8 +121,7 @@ class ClaudeExtractor: raise ExtractorError(f"Extracted payload failed schema validation: {exc}") from exc async def _submit_job(self, pdf_bytes: bytes, doc_metadata: dict[str, Any]) -> str: - encoded = base64.b64encode(pdf_bytes).decode("ascii") - prompt = f"{EXTRACTION_PROMPT}\n\nPDF_BASE64:\n{encoded}\n" + prompt = _build_prompt(pdf_bytes) body = { "prompt": prompt, "agent": AGENT_PATH,