processor: skip non-payslip docs by title pattern

The Paperless 'payslip' tag has been applied over the years to P60 annual summaries, performance/year-end letters, Compensation_EMEA/PSC letters, comp-review letters, and RSU grant agreements. These are legitimate financial docs but not monthly payslips, and including them pollutes the dashboards (a P60 amount is ~12x a single month). Filter by title regex before hitting Claude so we skip cheaply and don't burn extraction credit on them. Status returned is 'skipped_non_payslip' to distinguish from the 'already-ingested' skip. Covers: P60*, *performance*(letter|year-end)*, compensation_emea, *psc*, comp-letter, rsu grant*. New parameterized tests cover both the exclude list and representative real payslip titles. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 23:32:17 +00:00 · 2026-04-18 23:32:17 +00:00 · 86cac65572
commit 86cac65572
parent c696bf32f0
2 changed files with 56 additions and 0 deletions
--- a/payslip_ingest/processor.py
+++ b/payslip_ingest/processor.py
@ -1,5 +1,6 @@
 import json
 import logging
+import re
 from dataclasses import dataclass
 from decimal import Decimal
 from typing import Any, Protocol
@ -15,6 +16,20 @@ from payslip_ingest.tax_year import derive_tax_year

 log = logging.getLogger(__name__)

+# Paperless's `payslip` tag has drifted onto P60 annual summaries, performance /
+# year-end letters, comp letters and RSU grants. They are legitimate financial
+# docs but not monthly payslips (a P60 reads like one payslip at ~12x size), so
+# we skip them by title before calling Claude and spending extraction budget.
+# P60/PSC use alnum lookarounds, not \b: `_` is a word char so \b misses
+# "YE_PSC", while a bare p60 would wrongly hit "Payslip_601234".
+NON_PAYSLIP_TITLE_RE = re.compile(
+    r"(?<![a-z0-9])p[\s._-]?60(?![0-9])"
+    r"|performance.*(?:letter|year.end)|year.end.*letter"
+    r"|compensation[\s._-]?emea|(?<![a-z0-9])psc(?![a-z0-9])|comp[\s._-]?letter"
+    r"|rsu[\s._-]*grant",
+    re.IGNORECASE,
+)
+

 class _SessionFactory(Protocol):

@ -44,6 +59,10 @@ async def process_document(
            return ProcessResult(doc_id=doc_id, status="skipped")

    metadata = await paperless.get_document(doc_id)
+    title = (metadata.get("title") or "").strip()
+    if NON_PAYSLIP_TITLE_RE.search(title):
+        log.info("skipping doc_id=%s — title %r matches non-payslip pattern", doc_id, title)
+        return ProcessResult(doc_id=doc_id, status="skipped_non_payslip")
    pdf_bytes = await paperless.download_document(doc_id)
    extracted = await extractor.extract(pdf_bytes, metadata)

--- a/tests/test_processor.py
+++ b/tests/test_processor.py
@ -111,6 +111,43 @@ async def test_process_document_skips_existing(paperless: AsyncMock, extractor:
    extractor.extract.assert_not_called()


+@pytest.mark.parametrize("title", [
+    "p60-meta-2025",
+    "20001_Tax_254680_P60_2021_To_2022",
+    "2024_Performance@_Year-end Letter_Viktor Barzin_1",
+    "254680_Viktor_Barzin_18 Compensation_EMEA_20230311_2022 YE PSC",
+    "2024-comp-letter",
+    "RSU Grant Agreement 2024",
+])
+async def test_process_document_skips_non_payslip_by_title(
+        paperless: AsyncMock, extractor: AsyncMock, title: str) -> None:
+    """Excluded titles are skipped before any download or extraction happens."""
+    session_factory = _SessionFactory([_FakeSession(existing_ids=[])])
+    paperless.get_document.return_value = {"id": 42, "title": title}
+
+    outcome = await process_document(42, session_factory, paperless, extractor)
+
+    assert outcome.status == "skipped_non_payslip"
+    paperless.download_document.assert_not_called()
+    extractor.extract.assert_not_called()
+
+
+@pytest.mark.parametrize("title", [
+    "Payslip_2026-02-27",
+    "20001_PY_254680_Jan_2022",
+    "UKPY_254680_31_Jul_2019",
+])
+async def test_process_document_keeps_real_payslips(
+        paperless: AsyncMock, extractor: AsyncMock, title: str) -> None:
+    """Genuine payslip titles pass the title filter and reach extraction."""
+    sessions = [_FakeSession(existing_ids=[]), _FakeSession(existing_ids=[])]
+    paperless.get_document.return_value = {"id": 42, "title": title}
+
+    outcome = await process_document(42, _SessionFactory(sessions), paperless, extractor)
+    assert outcome.status == "inserted"
+    extractor.extract.assert_awaited_once()
+
+
 async def test_process_document_flags_validation_failure(paperless: AsyncMock,
                                                         extractor: AsyncMock) -> None:
    bad = _sample_extraction()