processor: skip non-payslip docs by title pattern

The Paperless 'payslip' tag has been applied over the years to P60 annual
summaries, performance/year-end letters, Compensation_EMEA/PSC letters,
comp-review letters, and RSU grant agreements. These are legitimate
financial docs but not monthly payslips, and including them pollutes
the dashboards (a P60 amount is ~12x a single month).

Filter by title regex before hitting Claude so these documents are
skipped cheaply and don't burn extraction credit. The status returned is
'skipped_non_payslip' to distinguish it from the 'already-ingested' skip.

Covers: P60*, *performance*(letter|year-end)*, compensation_emea,
*psc*, comp-letter, rsu grant*. New parameterized tests cover both
the exclude list and representative real payslip titles.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-04-18 23:32:17 +00:00
parent c696bf32f0
commit 86cac65572
2 changed files with 56 additions and 0 deletions

View file

@ -111,6 +111,43 @@ async def test_process_document_skips_existing(paperless: AsyncMock, extractor:
extractor.extract.assert_not_called()
@pytest.mark.parametrize(
    "title",
    [
        "p60-meta-2025",
        "20001_Tax_254680_P60_2021_To_2022",
        "2024_Performance@_Year-end Letter_Viktor Barzin_1",
        "254680_Viktor_Barzin_18 Compensation_EMEA_20230311_2022 YE PSC",
        "2024-comp-letter",
        "RSU Grant Agreement 2024",
    ],
)
async def test_process_document_skips_non_payslip_by_title(
    paperless: AsyncMock,
    extractor: AsyncMock,
    title: str,
) -> None:
    """Documents with a non-payslip title are skipped before any download or extraction.

    Each parametrized title is served as the Paperless document title; the
    result must carry the ``skipped_non_payslip`` status, and neither the
    document download nor the extractor may have been invoked.
    """
    doc_id = 42
    paperless.get_document.return_value = {"id": doc_id, "title": title}
    # A single fake session with no already-ingested ids, so the
    # "existing document" skip path cannot be the reason for skipping.
    session_factory = _SessionFactory([_FakeSession(existing_ids=[])])

    outcome = await process_document(doc_id, session_factory, paperless, extractor)

    assert outcome.status == "skipped_non_payslip"
    paperless.download_document.assert_not_called()
    extractor.extract.assert_not_called()
@pytest.mark.parametrize(
    "title",
    [
        "Payslip_2026-02-27",
        "20001_PY_254680_Jan_2022",
        "UKPY_254680_31_Jul_2019",
    ],
)
async def test_process_document_keeps_real_payslips(
    paperless: AsyncMock,
    extractor: AsyncMock,
    title: str,
) -> None:
    """Representative real payslip titles are not filtered out by the title check.

    For each parametrized title the document must be processed through to
    the ``inserted`` status, with the extractor awaited exactly once.
    """
    doc_id = 42
    paperless.get_document.return_value = {"id": doc_id, "title": title}
    # Two independent fake sessions, each with no pre-existing ids.
    # NOTE(review): presumably the insert path opens a second session — confirm
    # against process_document.
    fake_sessions = [_FakeSession(existing_ids=[]) for _ in range(2)]
    session_factory = _SessionFactory(fake_sessions)

    outcome = await process_document(doc_id, session_factory, paperless, extractor)

    assert outcome.status == "inserted"
    extractor.extract.assert_awaited_once()
async def test_process_document_flags_validation_failure(paperless: AsyncMock,
extractor: AsyncMock) -> None:
bad = _sample_extraction()