From 92e4ecaf7870543d49cbadfcedb21edbbba43d58 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 19 Apr 2026 15:33:07 +0000 Subject: [PATCH] =?UTF-8?q?processor:=20dedup=20bonus=20within=20tax=20yea?= =?UTF-8?q?r=20=E2=80=94=20zero=20out=20repeats?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Meta UK pays one performance bonus per tax year (2 historically, 1 since the comp cycle change). If the same bonus amount appears on a second payslip in the same tax_year — typically from the extractor misreading a YTD figure as a current-period row — summing the column on the dashboard exaggerates total comp by 2x. `_dedup_bonus` keeps the first occurrence per (tax_year, amount) and stores subsequent matches with bonus=0. Original figure is preserved in raw_extraction for auditability; a warning is logged each time. Fixes the 2021 tax year inflation flagged by the user. Existing duplicates need a one-shot SQL cleanup (backfill task code-7z0). Co-Authored-By: Claude Opus 4.7 (1M context) --- payslip_ingest/processor.py | 30 +++++++++++++++++-- tests/test_processor.py | 58 ++++++++++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 3 deletions(-) diff --git a/payslip_ingest/processor.py b/payslip_ingest/processor.py index c10f955..ec7a0fb 100644 --- a/payslip_ingest/processor.py +++ b/payslip_ingest/processor.py @@ -190,6 +190,9 @@ async def _insert_payslip( if existing_id is not None: return None + tax_year = derive_tax_year(extracted.pay_date) + bonus = await _dedup_bonus(session, tax_year, extracted.bonus, doc_id) + row = Payslip( paperless_doc_id=doc_id, pay_date=extracted.pay_date, @@ -206,7 +209,7 @@ async def _insert_payslip( rsu_vest=extracted.rsu_vest, rsu_offset=extracted.rsu_offset, salary=extracted.salary, - bonus=extracted.bonus, + bonus=bonus, pension_sacrifice=extracted.pension_sacrifice, taxable_pay=extracted.taxable_pay, ytd_tax_paid=extracted.ytd_tax_paid, @@ -217,7 +220,7 @@ async def _insert_payslip( ytd_rsu_excs_refund=extracted.ytd_rsu_excs_refund, other_deductions=_decimals_to_float(extracted.other_deductions), net_pay=extracted.net_pay, - tax_year=derive_tax_year(extracted.pay_date), + tax_year=tax_year, raw_extraction=raw, validated=validated, ) @@ -230,6 +233,29 @@ def _decimals_to_float(mapping: dict[str, Decimal]) -> dict[str, float]: return {k: float(v) for k, v in mapping.items()} +async def _dedup_bonus(session: Any, tax_year: str, bonus: Decimal, doc_id: int) -> Decimal: + """Zero out a repeated bonus within the same tax year. + + Meta pays one performance bonus per year (capped at 2 historically; 1 + since the comp cycle change). If the parser or Claude extractor picks + up the same amount twice — usually from a payslip that reprints the + YTD bonus figure as a current-period row — charting sums the amount + multiple times and exaggerates total comp. Rather than surface + duplicates, we keep the first occurrence (earliest pay_date in the + tax year) and ingest subsequent matches with bonus=0. The original + figure stays in `raw_extraction` for auditability. + """ + if bonus <= 0: + return bonus + existing = await session.execute( + select(Payslip.id).where(Payslip.tax_year == tax_year, Payslip.bonus == bonus)) + if existing.scalar() is not None: + log.warning("duplicate bonus suppressed: doc_id=%s tax_year=%s bonus=%s (kept first)", + doc_id, tax_year, bonus) + return Decimal("0") + return bonus + + async def _handle_p60( doc_id: int, pdf_bytes: bytes, diff --git a/tests/test_processor.py b/tests/test_processor.py index f095d8e..55f0c7a 100644 --- a/tests/test_processor.py +++ b/tests/test_processor.py @@ -36,7 +36,7 @@ def _sample_extraction() -> ExtractedPayslip: class _FakeSession: """Minimal AsyncSession stand-in that records flushes and execute calls.""" - def __init__(self, existing_ids: list[int]): + def __init__(self, existing_ids: list[int | None]): self._existing_ids = existing_ids self.added: list[Any] = [] self.begin_calls = 0 @@ -288,3 +288,59 @@ async def test_p60_tag_absent_follows_payslip_path(paperless: AsyncMock, extract assert result.status == "inserted" assert result.extractor == "meta_uk_regex" assert result.p60_id is None + + +async def test_bonus_dedup_zeros_repeat_within_tax_year(paperless: AsyncMock, + extractor: AsyncMock) -> None: + """Same bonus amount already seen in tax_year → insert as 0. + + Meta pays one perf bonus per tax year (2 historically). A duplicate + amount usually means the extractor misread the YTD figure as current + period — we keep the first occurrence and suppress subsequent matches. + """ + sample = _sample_extraction().model_dump() + sample["bonus"] = Decimal("25000.00") + extractor.extract.return_value = ExtractedPayslip.model_validate(sample) + + # Insert session has 2 execute calls: paperless_doc_id dedup, bonus dedup. + # existing_ids=[None, 1] → paperless_doc_id not found; bonus already seen (id=1). + factory = _SessionFactory([ + _FakeSession(existing_ids=[]), + _FakeSession(existing_ids=[None, 1]), + ]) + result = await process_document(42, factory, paperless, extractor) + assert result.status == "inserted" + assert factory.used[1].added[0].bonus == Decimal("0") + + +async def test_bonus_dedup_keeps_first_occurrence(paperless: AsyncMock, + extractor: AsyncMock) -> None: + """First occurrence of a bonus in a tax_year is preserved.""" + sample = _sample_extraction().model_dump() + sample["bonus"] = Decimal("7777.77") + extractor.extract.return_value = ExtractedPayslip.model_validate(sample) + + factory = _SessionFactory([ + _FakeSession(existing_ids=[]), + _FakeSession(existing_ids=[None, None]), + ]) + result = await process_document(42, factory, paperless, extractor) + assert result.status == "inserted" + assert factory.used[1].added[0].bonus == Decimal("7777.77") + + +async def test_bonus_dedup_skips_when_bonus_zero(paperless: AsyncMock, + extractor: AsyncMock) -> None: + """Bonus=0 rows skip the dedup query — avoids a pointless DB hit.""" + sample = _sample_extraction().model_dump() + sample["bonus"] = Decimal("0") + extractor.extract.return_value = ExtractedPayslip.model_validate(sample) + + # existing_ids only contains 1 entry — if dedup ran we'd exhaust it. + factory = _SessionFactory([ + _FakeSession(existing_ids=[]), + _FakeSession(existing_ids=[None]), + ]) + result = await process_document(42, factory, paperless, extractor) + assert result.status == "inserted" + assert factory.used[1].added[0].bonus == Decimal("0")