processor: skip non-payslip docs by title pattern

The Paperless 'payslip' tag has been applied over the years to P60 annual summaries, performance/year-end letters, Compensation_EMEA/PSC letters, comp-review letters, and RSU grant agreements. These are legitimate financial docs but not monthly payslips, and including them pollutes the dashboards (a P60 amount is ~12x a single month). Filter by title regex before hitting Claude so we skip cheaply and don't burn extraction credit on them. Status returned is 'skipped_non_payslip' to distinguish from the 'already-ingested' skip. Covers: P60*, *performance*(letter|year-end)*, compensation_emea, *psc*, comp-letter, rsu grant*. New parameterized tests cover both the exclude list and representative real payslip titles. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 23:32:17 +00:00 · 2026-04-18 23:32:17 +00:00 · 86cac65572
commit 86cac65572
parent c696bf32f0
2 changed files with 56 additions and 0 deletions
--- a/payslip_ingest/processor.py
+++ b/payslip_ingest/processor.py
@ -1,5 +1,6 @@
 import json
 import logging
 import re
 from dataclasses import dataclass
 from decimal import Decimal
 from typing import Any, Protocol
@ -15,6 +16,20 @@ from payslip_ingest.tax_year import derive_tax_year
 log = logging.getLogger(__name__)
 # Paperless's `payslip` tag has drifted over the years — it gets sprinkled on
# annual summaries (P60), performance/bonus letters, RSU grants, comp-review
# letters. Those are legitimate financial docs but they aren't monthly payslips
# and including them would skew every chart (a P60 looks like a single payslip
# 12x normal size). We skip by title pattern before hitting Claude so we don't
# burn extraction budget on them either.
#
# The pattern is applied with `.search()`, so every alternative can match
# anywhere in the title. Short tokens therefore carry explicit boundaries:
#   * `\b` is not used, because `_` counts as a word character and most titles
#     are underscore-delimited ("..._P60_2021", "2022_YE_PSC").
#   * "P60" must not be glued to a preceding letter or a following digit, so a
#     genuine payslip titled "Payslip 601234" or "Payslip_60" is not skipped.
NON_PAYSLIP_TITLE_RE = re.compile(
    # P60 annual summary: "P60", "p-60", "P 60", "P.60", "P_60".
    r"(?<![a-z])p[\s._-]?60(?![0-9])"
    # Performance / year-end letters, in either word order.
    r"|performance.*(letter|year.end)|year.end.*letter"
    # Compensation letters; PSC only as a standalone token.
    r"|compensation[_ ]emea|(?<![a-z0-9])psc(?![a-z0-9])|comp[-_ ]?letter"
    # RSU grant agreements.
    r"|rsu\s*grant",
    re.IGNORECASE,
)
 class _SessionFactory(Protocol):
@ -44,6 +59,10 @@ async def process_document(
            return ProcessResult(doc_id=doc_id, status="skipped")
    metadata = await paperless.get_document(doc_id)
    title = (metadata.get("title") or "").strip()
    if NON_PAYSLIP_TITLE_RE.search(title):
        log.info("skipping doc_id=%s — title %r matches non-payslip pattern", doc_id, title)
        return ProcessResult(doc_id=doc_id, status="skipped_non_payslip")
    pdf_bytes = await paperless.download_document(doc_id)
    extracted = await extractor.extract(pdf_bytes, metadata)
--- a/tests/test_processor.py
+++ b/tests/test_processor.py
@ -111,6 +111,43 @@ async def test_process_document_skips_existing(paperless: AsyncMock, extractor:
    extractor.extract.assert_not_called()
@pytest.mark.parametrize("title", [
    # Titles taken from real mis-tagged documents.
    "p60-meta-2025",
    "20001_Tax_254680_P60_2021_To_2022",
    "2024_Performance@_Year-end Letter_Viktor Barzin_1",
    "254680_Viktor_Barzin_18 Compensation_EMEA_20230311_2022 YE PSC",
    "2024-comp-letter",
    "RSU Grant Agreement 2024",
    # One representative per remaining regex alternative, so that dropping or
    # breaking any single branch of the pattern fails a test.
    "P 60 2023-24",
    "Performance Letter 2023",
    "Year-End Bonus Letter 2023",
    "comp_letter_2024",
    "rsu grant",
])
async def test_process_document_skips_non_payslip_by_title(paperless: AsyncMock,
                                                           extractor: AsyncMock,
                                                           title: str) -> None:
    """A title matching the non-payslip pattern is skipped before download/extraction."""
    paperless.get_document.return_value = {"id": 42, "title": title}
    # A single session with no existing ids: presumably consumed by the
    # already-ingested check that runs before the title filter.
    factory = _SessionFactory([_FakeSession(existing_ids=[])])

    result = await process_document(42, factory, paperless, extractor)

    assert result.status == "skipped_non_payslip"
    # The point of the filter is to avoid the expensive steps entirely.
    paperless.download_document.assert_not_called()
    extractor.extract.assert_not_called()
@pytest.mark.parametrize(
    "title",
    [
        "Payslip_2026-02-27",
        "20001_PY_254680_Jan_2022",
        "UKPY_254680_31_Jul_2019",
    ],
)
async def test_process_document_keeps_real_payslips(
    paperless: AsyncMock,
    extractor: AsyncMock,
    title: str,
) -> None:
    """Representative genuine payslip titles pass the title filter and get inserted."""
    paperless.get_document.return_value = {"id": 42, "title": title}
    # Two independent fake sessions, neither reporting the document as already
    # present (presumably one per session the processor opens — confirm
    # against process_document).
    sessions = [_FakeSession(existing_ids=[]) for _ in range(2)]

    outcome = await process_document(42, _SessionFactory(sessions), paperless, extractor)

    assert outcome.status == "inserted"
    extractor.extract.assert_awaited_once()
 async def test_process_document_flags_validation_failure(paperless: AsyncMock,
                                                         extractor: AsyncMock) -> None:
    bad = _sample_extraction()