processor: dedup bonus within tax year — zero out repeats

Meta UK pays one performance bonus per tax year (2 historically, 1
since the comp cycle change). If the same bonus amount appears on a
second payslip in the same tax_year — typically from the extractor
misreading a YTD figure as a current-period row — summing the column
on the dashboard exaggerates total comp by 2x.

`_dedup_bonus` keeps the first occurrence per (tax_year, amount) and
stores subsequent matches with bonus=0. Original figure is preserved
in raw_extraction for auditability; a warning is logged each time.

Fixes the 2021 tax year inflation flagged by the user. Existing
duplicates need a one-shot SQL cleanup (backfill task code-7z0).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-04-19 15:33:07 +00:00
parent 26e43b1055
commit 92e4ecaf78
2 changed files with 85 additions and 3 deletions

View file

@ -36,7 +36,7 @@ def _sample_extraction() -> ExtractedPayslip:
class _FakeSession:
"""Minimal AsyncSession stand-in that records flushes and execute calls."""
def __init__(self, existing_ids: list[int]):
def __init__(self, existing_ids: list[int | None]):
self._existing_ids = existing_ids
self.added: list[Any] = []
self.begin_calls = 0
@ -288,3 +288,59 @@ async def test_p60_tag_absent_follows_payslip_path(paperless: AsyncMock, extract
assert result.status == "inserted"
assert result.extractor == "meta_uk_regex"
assert result.p60_id is None
async def test_bonus_dedup_zeros_repeat_within_tax_year(paperless: AsyncMock,
extractor: AsyncMock) -> None:
"""Same bonus amount already seen in tax_year → insert as 0.
Meta pays one perf bonus per tax year (2 historically). A duplicate
amount usually means the extractor misread the YTD figure as current
period we keep the first occurrence and suppress subsequent matches.
"""
sample = _sample_extraction().model_dump()
sample["bonus"] = Decimal("25000.00")
extractor.extract.return_value = ExtractedPayslip.model_validate(sample)
# Insert session has 2 execute calls: paperless_doc_id dedup, bonus dedup.
# existing_ids=[None, 1] → paperless_doc_id not found; bonus already seen (id=1).
factory = _SessionFactory([
_FakeSession(existing_ids=[]),
_FakeSession(existing_ids=[None, 1]),
])
result = await process_document(42, factory, paperless, extractor)
assert result.status == "inserted"
assert factory.used[1].added[0].bonus == Decimal("0")
async def test_bonus_dedup_keeps_first_occurrence(paperless: AsyncMock,
extractor: AsyncMock) -> None:
"""First occurrence of a bonus in a tax_year is preserved."""
sample = _sample_extraction().model_dump()
sample["bonus"] = Decimal("7777.77")
extractor.extract.return_value = ExtractedPayslip.model_validate(sample)
factory = _SessionFactory([
_FakeSession(existing_ids=[]),
_FakeSession(existing_ids=[None, None]),
])
result = await process_document(42, factory, paperless, extractor)
assert result.status == "inserted"
assert factory.used[1].added[0].bonus == Decimal("7777.77")
async def test_bonus_dedup_skips_when_bonus_zero(paperless: AsyncMock,
extractor: AsyncMock) -> None:
"""Bonus=0 rows skip the dedup query — avoids a pointless DB hit."""
sample = _sample_extraction().model_dump()
sample["bonus"] = Decimal("0")
extractor.extract.return_value = ExtractedPayslip.model_validate(sample)
# existing_ids only contains 1 entry — if dedup ran we'd exhaust it.
factory = _SessionFactory([
_FakeSession(existing_ids=[]),
_FakeSession(existing_ids=[None]),
])
result = await process_document(42, factory, paperless, extractor)
assert result.status == "inserted"
assert factory.used[1].added[0].bonus == Decimal("0")