diff --git a/payslip_ingest/processor.py b/payslip_ingest/processor.py
index 78f6531..9eb8c8d 100644
--- a/payslip_ingest/processor.py
+++ b/payslip_ingest/processor.py
@@ -1,5 +1,6 @@
 import json
 import logging
+import re
 from dataclasses import dataclass
 from decimal import Decimal
 from typing import Any, Protocol
@@ -15,6 +16,23 @@ from payslip_ingest.tax_year import derive_tax_year
 
 log = logging.getLogger(__name__)
 
+# Paperless's `payslip` tag has drifted over the years — it gets sprinkled on
+# annual summaries (P60), performance/bonus letters, RSU grants, comp-review
+# letters. Those are legitimate financial docs but they aren't monthly payslips
+# and including them would skew every chart (a P60 looks like a single payslip
+# 12x normal size). We skip by title pattern before hitting Claude so we don't
+# burn extraction budget on them either.
+# The P60 alternative is anchored on both sides: `\b` is no use here because
+# `_` is a word character, and an unanchored match would also swallow real
+# payslips such as "Payslip_601234" (the "p_60" straddling the employee id).
+NON_PAYSLIP_TITLE_RE = re.compile(
+    r"(?<![a-z0-9])p[\s._-]?60(?!\d)"
+    r"|performance.*(letter|year.end)|year.end.*letter"
+    r"|compensation[_ ]emea|\bpsc\b|comp[-_ ]?letter"
+    r"|rsu\s*grant",
+    re.IGNORECASE,
+)
+
 
 class _SessionFactory(Protocol):
 
@@ -44,6 +62,10 @@ async def process_document(
         return ProcessResult(doc_id=doc_id, status="skipped")
 
     metadata = await paperless.get_document(doc_id)
+    title = (metadata.get("title") or "").strip()
+    if NON_PAYSLIP_TITLE_RE.search(title):
+        log.info("skipping doc_id=%s — title %r matches non-payslip pattern", doc_id, title)
+        return ProcessResult(doc_id=doc_id, status="skipped_non_payslip")
     pdf_bytes = await paperless.download_document(doc_id)
     extracted = await extractor.extract(pdf_bytes, metadata)
 
diff --git a/tests/test_processor.py b/tests/test_processor.py
index 6792af4..6348a0e 100644
--- a/tests/test_processor.py
+++ b/tests/test_processor.py
@@ -111,6 +111,44 @@ async def test_process_document_skips_existing(paperless: AsyncMock, extractor:
     extractor.extract.assert_not_called()
 
 
+@pytest.mark.parametrize("title", [
+    "p60-meta-2025",
+    "20001_Tax_254680_P60_2021_To_2022",
+    "2024_Performance@_Year-end Letter_Viktor Barzin_1",
+    "254680_Viktor_Barzin_18 Compensation_EMEA_20230311_2022 YE PSC",
+    "2024-comp-letter",
+    "RSU Grant Agreement 2024",
+])
+async def test_process_document_skips_non_payslip_by_title(paperless: AsyncMock,
+                                                           extractor: AsyncMock,
+                                                           title: str) -> None:
+    paperless.get_document.return_value = {"id": 42, "title": title}
+    factory = _SessionFactory([_FakeSession(existing_ids=[])])
+
+    result = await process_document(42, factory, paperless, extractor)
+
+    assert result.status == "skipped_non_payslip"
+    paperless.download_document.assert_not_called()
+    extractor.extract.assert_not_called()
+
+
+@pytest.mark.parametrize("title", [
+    "Payslip_2026-02-27",
+    "20001_PY_254680_Jan_2022",
+    "UKPY_254680_31_Jul_2019",
+    "Payslip_601234_2026-02",
+])
+async def test_process_document_keeps_real_payslips(paperless: AsyncMock, extractor: AsyncMock,
+                                                    title: str) -> None:
+    paperless.get_document.return_value = {"id": 42, "title": title}
+    factory = _SessionFactory([_FakeSession(existing_ids=[]), _FakeSession(existing_ids=[])])
+
+    result = await process_document(42, factory, paperless, extractor)
+
+    assert result.status == "inserted"
+    extractor.extract.assert_awaited_once()
+
+
 async def test_process_document_flags_validation_failure(paperless: AsyncMock,
                                                          extractor: AsyncMock) -> None:
     bad = _sample_extraction()