processor: skip non-payslip docs by title pattern
The Paperless 'payslip' tag has been applied over the years to P60 annual summaries, performance/year-end letters, Compensation_EMEA/PSC letters, comp-review letters, and RSU grant agreements. These are legitimate financial docs but not monthly payslips, and including them pollutes the dashboards (a P60 amount is ~12x a single month). Filter by title regex before hitting Claude so we skip cheaply and don't burn extraction credit on them. Status returned is 'skipped_non_payslip' to distinguish from the 'already-ingested' skip. Covers: P60*, *performance*(letter|year-end)*, compensation_emea, *psc*, comp-letter, rsu grant*. New parameterized tests cover both the exclude list and representative real payslip titles. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c696bf32f0
commit
86cac65572
2 changed files with 56 additions and 0 deletions
|
|
@ -1,5 +1,6 @@
|
|||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from decimal import Decimal
|
||||
from typing import Any, Protocol
|
||||
|
|
@ -15,6 +16,20 @@ from payslip_ingest.tax_year import derive_tax_year
|
|||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Paperless's `payslip` tag has drifted over the years — it gets sprinkled on
# annual summaries (P60), performance/bonus letters, RSU grants, comp-review
# letters. Those are legitimate financial docs but they aren't monthly payslips
# and including them would skew every chart (a P60 looks like a single payslip
# 12x normal size). We skip by title pattern before hitting Claude so we don't
# burn extraction budget on them either.
#
# Matching notes:
# - The short tokens `p60` and `psc` are fenced with alphanumeric lookarounds
#   instead of `\b`. `_` counts as a word character, so `\b` never fires inside
#   underscore-separated titles (`..._PSC_2022`), and an unfenced `p 60` would
#   also hit the tail of a genuine title such as "Payslip 60123".
# - `p60` must not be followed by another digit, so longer numbers that merely
#   start with 60 are not mistaken for the form name.
# - Word separators accept whitespace, `.`, `_` and `-` so that "RSU_Grant" and
#   "Compensation-EMEA" are treated the same as their space-separated forms.
# - The pattern is meant to be used with `.search()`, i.e. matched anywhere in
#   the title, case-insensitively.
NON_PAYSLIP_TITLE_RE = re.compile(
    r"(?<![a-z0-9])p[\s._-]?60(?![0-9])"
    r"|performance.*(letter|year.end)|year.end.*letter"
    r"|compensation[\s._-]?emea|(?<![a-z0-9])psc(?![a-z0-9])|comp[-_ ]?letter"
    r"|rsu[\s._-]*grant",
    re.IGNORECASE,
)
|
||||
|
||||
|
||||
class _SessionFactory(Protocol):
|
||||
|
||||
|
|
@ -44,6 +59,10 @@ async def process_document(
|
|||
return ProcessResult(doc_id=doc_id, status="skipped")
|
||||
|
||||
metadata = await paperless.get_document(doc_id)
|
||||
title = (metadata.get("title") or "").strip()
|
||||
if NON_PAYSLIP_TITLE_RE.search(title):
|
||||
log.info("skipping doc_id=%s — title %r matches non-payslip pattern", doc_id, title)
|
||||
return ProcessResult(doc_id=doc_id, status="skipped_non_payslip")
|
||||
pdf_bytes = await paperless.download_document(doc_id)
|
||||
extracted = await extractor.extract(pdf_bytes, metadata)
|
||||
|
||||
|
|
|
|||
|
|
@ -111,6 +111,43 @@ async def test_process_document_skips_existing(paperless: AsyncMock, extractor:
|
|||
extractor.extract.assert_not_called()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "title",
    [
        "p60-meta-2025",
        "20001_Tax_254680_P60_2021_To_2022",
        "2024_Performance@_Year-end Letter_Viktor Barzin_1",
        "254680_Viktor_Barzin_18 Compensation_EMEA_20230311_2022 YE PSC",
        "2024-comp-letter",
        "RSU Grant Agreement 2024",
    ],
)
async def test_process_document_skips_non_payslip_by_title(
    paperless: AsyncMock,
    extractor: AsyncMock,
    title: str,
) -> None:
    """A title on the non-payslip exclude list yields ``skipped_non_payslip``.

    The document must be rejected from its metadata alone: neither the PDF
    download nor the extractor may be invoked.
    """
    paperless.get_document.return_value = {"id": 42, "title": title}
    session_factory = _SessionFactory([_FakeSession(existing_ids=[])])

    outcome = await process_document(42, session_factory, paperless, extractor)

    assert outcome.status == "skipped_non_payslip"
    # The skip has to happen before any expensive work is started.
    paperless.download_document.assert_not_called()
    extractor.extract.assert_not_called()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "title",
    [
        "Payslip_2026-02-27",
        "20001_PY_254680_Jan_2022",
        "UKPY_254680_31_Jul_2019",
    ],
)
async def test_process_document_keeps_real_payslips(
    paperless: AsyncMock,
    extractor: AsyncMock,
    title: str,
) -> None:
    """Representative real payslip titles pass the title filter and are ingested.

    Each title must reach the extractor exactly once and end with status
    ``inserted``.
    """
    paperless.get_document.return_value = {"id": 42, "title": title}
    # Two fake sessions are queued — presumably one for the "already ingested"
    # lookup and one for the insert; confirm against process_document.
    fake_sessions = [_FakeSession(existing_ids=[]), _FakeSession(existing_ids=[])]
    session_factory = _SessionFactory(fake_sessions)

    outcome = await process_document(42, session_factory, paperless, extractor)

    assert outcome.status == "inserted"
    extractor.extract.assert_awaited_once()
|
||||
|
||||
|
||||
async def test_process_document_flags_validation_failure(paperless: AsyncMock,
|
||||
extractor: AsyncMock) -> None:
|
||||
bad = _sample_extraction()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue