extractor: preextract PDF text with pdftotext before calling Claude
Without this, each extraction took 5-10 minutes because the base64-encoded PDF expanded to ~300KB of prompt text. poppler-utils ships pdftotext, which turns a 200KB PDF into ~3KB of plain text in milliseconds. Claude (Haiku) then processes the text in seconds.
- Dockerfile installs poppler-utils in the runtime stage (one-liner).
- _build_prompt() tries pdftotext -layout first; falls back to base64 if pdftotext is missing (local dev) or the PDF is unreadable (scanned image).
- Agent file documents the PAYSLIP_TEXT fast path — still handles PDF_BASE64 for fallback.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
693ec4a5d4
commit
3da24fdf7a
2 changed files with 35 additions and 2 deletions
|
|
@ -20,6 +20,10 @@ FROM python:3.12-slim
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y --no-install-recommends poppler-utils \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN useradd --system --uid 10001 --home /app --shell /usr/sbin/nologin payslip
|
RUN useradd --system --uid 10001 --home /app --shell /usr/sbin/nologin payslip
|
||||||
|
|
||||||
COPY --from=builder --chown=payslip:payslip /app /app
|
COPY --from=builder --chown=payslip:payslip /app /app
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import base64
|
import base64
|
||||||
import json
|
import json
|
||||||
|
import shutil
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
@ -10,6 +11,8 @@ from payslip_ingest.schema import ExtractedPayslip
|
||||||
|
|
||||||
AGENT_PATH = ".claude/agents/payslip-extractor"
|
AGENT_PATH = ".claude/agents/payslip-extractor"
|
||||||
|
|
||||||
|
PDFTOTEXT_PATH = shutil.which("pdftotext")
|
||||||
|
|
||||||
EXTRACTION_PROMPT = (
|
EXTRACTION_PROMPT = (
|
||||||
"You are extracting fields from a UK payslip PDF. Return ONLY a single JSON object "
|
"You are extracting fields from a UK payslip PDF. Return ONLY a single JSON object "
|
||||||
"matching this exact schema — no prose, no markdown fences.\n"
|
"matching this exact schema — no prose, no markdown fences.\n"
|
||||||
|
|
@ -53,6 +56,33 @@ class ExtractorError(RuntimeError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _run_pdftotext(pdf_bytes: bytes, pdftotext_path: str) -> str | None:
    """Return the text layer of *pdf_bytes* via pdftotext, or None if unusable.

    None is returned when the binary cannot be run, times out, exits with a
    non-zero status, or produces only whitespace (e.g. a scanned-image PDF
    with no text layer). Failures are logged, never raised, so the caller can
    fall back to another prompt format.
    """
    # Function-scope imports: only this helper needs them, and the module's
    # top-level import block is left untouched.
    import logging
    import subprocess

    log = logging.getLogger(__name__)
    try:
        # "-" "-" makes pdftotext read the PDF from stdin and write to stdout.
        proc = subprocess.run(
            [pdftotext_path, "-layout", "-enc", "UTF-8", "-", "-"],
            input=pdf_bytes,
            capture_output=True,
            timeout=30,
            check=False,
        )
    except (subprocess.SubprocessError, OSError) as exc:
        log.warning("pdftotext could not be run (%s); falling back to base64", exc)
        return None

    if proc.returncode != 0:
        # A failed run may still have written partial output; do not trust it.
        stderr = proc.stderr.decode("utf-8", errors="replace").strip()
        log.warning(
            "pdftotext exited with status %d (%s); falling back to base64",
            proc.returncode,
            stderr[:200],
        )
        return None

    # strip() also removes the form-feed page separators pdftotext emits, so
    # a PDF without a text layer collapses to an empty string here.
    text = proc.stdout.decode("utf-8", errors="replace").strip()
    return text or None


def _build_prompt(pdf_bytes: bytes) -> str:
    """Build the extraction prompt, preferring pdftotext output over base64.

    Base64 of a 200KB PDF expands to ~270KB of prompt text, which makes even
    Haiku take 5-10 minutes per extraction. pdftotext normally yields 2-5KB of
    clean text that Claude processes in seconds. The PDF bytes are shipped as
    base64 only when pdftotext isn't available or fails (scanned-image PDFs,
    etc.).

    Args:
        pdf_bytes: Raw bytes of the payslip PDF.

    Returns:
        EXTRACTION_PROMPT followed by either a ``PAYSLIP_TEXT:`` section
        (fast path) or a ``PDF_BASE64:`` section (fallback).
    """
    if PDFTOTEXT_PATH:
        text = _run_pdftotext(pdf_bytes, PDFTOTEXT_PATH)
        if text:
            return f"{EXTRACTION_PROMPT}\n\nPAYSLIP_TEXT:\n{text}\n"
    encoded = base64.b64encode(pdf_bytes).decode("ascii")
    return f"{EXTRACTION_PROMPT}\n\nPDF_BASE64:\n{encoded}\n"
|
||||||
|
|
||||||
|
|
||||||
class ClaudeExtractor:
|
class ClaudeExtractor:
|
||||||
"""Calls claude-agent-service to extract structured fields from a payslip PDF.
|
"""Calls claude-agent-service to extract structured fields from a payslip PDF.
|
||||||
|
|
||||||
|
|
@ -91,8 +121,7 @@ class ClaudeExtractor:
|
||||||
raise ExtractorError(f"Extracted payload failed schema validation: {exc}") from exc
|
raise ExtractorError(f"Extracted payload failed schema validation: {exc}") from exc
|
||||||
|
|
||||||
async def _submit_job(self, pdf_bytes: bytes, doc_metadata: dict[str, Any]) -> str:
|
async def _submit_job(self, pdf_bytes: bytes, doc_metadata: dict[str, Any]) -> str:
|
||||||
encoded = base64.b64encode(pdf_bytes).decode("ascii")
|
prompt = _build_prompt(pdf_bytes)
|
||||||
prompt = f"{EXTRACTION_PROMPT}\n\nPDF_BASE64:\n{encoded}\n"
|
|
||||||
body = {
|
body = {
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
"agent": AGENT_PATH,
|
"agent": AGENT_PATH,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue