processor: skip non-payslip docs by title pattern

The Paperless 'payslip' tag has been applied over the years to P60 annual summaries, performance/year-end letters, Compensation_EMEA/PSC letters, comp-review letters, and RSU grant agreements. These are legitimate financial docs but not monthly payslips, and including them pollutes the dashboards (a P60 amount is ~12x a single month). Filter by title regex before hitting Claude so we skip cheaply and don't burn extraction credit on them. Status returned is 'skipped_non_payslip' to distinguish from the 'already-ingested' skip. Covers: P60*, *performance*(letter|year-end)*, compensation_emea, *psc*, comp-letter, rsu grant*. New parameterized tests cover both the exclude list and representative real payslip titles. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 23:32:17 +00:00 · 2026-04-18 23:32:17 +00:00 · 86cac65572
commit 86cac65572
parent c696bf32f0
2 changed files with 56 additions and 0 deletions
--- a/payslip_ingest/processor.py
+++ b/payslip_ingest/processor.py
@ -1,5 +1,6 @@
 import json
 import logging
+import re
 from dataclasses import dataclass
 from decimal import Decimal
 from typing import Any, Protocol
@ -15,6 +16,20 @@ from payslip_ingest.tax_year import derive_tax_year

 log = logging.getLogger(__name__)

+# Paperless's `payslip` tag has drifted over the years — it is also applied to
+# annual summaries (P60), performance/year-end letters, comp-review letters and
+# RSU grants. Those are legitimate financial docs but not monthly payslips, and
+# a P60 would chart as one payslip ~12x normal size, so we skip them by title
+# before calling Claude. Lookarounds are used instead of `\b` because `\b`
+# treats `_` as a word char: "Comp_PSC_2024" must match, "Payslip 60123" must not.
+NON_PAYSLIP_TITLE_RE = re.compile(
+    r"(?<![a-z0-9])p[\s._-]?60(?![0-9])"
+    r"|performance.*(letter|year.end)|year.end.*letter"
+    r"|compensation[_ ]emea|(?<![a-z0-9])psc(?![a-z0-9])|comp[-_ ]?letter"
+    r"|rsu[\s._-]*grant",
+    re.IGNORECASE,
+)
+

 class _SessionFactory(Protocol):

@ -44,6 +59,10 @@ async def process_document(
            return ProcessResult(doc_id=doc_id, status="skipped")

    metadata = await paperless.get_document(doc_id)
+    title = (metadata.get("title") or "").strip()
+    if NON_PAYSLIP_TITLE_RE.search(title):
+        log.info("skipping doc_id=%s — title %r matches non-payslip pattern", doc_id, title)
+        return ProcessResult(doc_id=doc_id, status="skipped_non_payslip")
    pdf_bytes = await paperless.download_document(doc_id)
    extracted = await extractor.extract(pdf_bytes, metadata)