processor: dedup bonus within tax year — zero out repeats

Meta UK pays one performance bonus per tax year (2 historically, 1
since the comp cycle change). If the same bonus amount appears on a
second payslip in the same tax_year — typically from the extractor
misreading a YTD figure as a current-period row — summing the column
on the dashboard exaggerates total comp by 2x.

`_dedup_bonus` keeps the first occurrence per (tax_year, amount) and
stores subsequent matches with bonus=0. Original figure is preserved
in raw_extraction for auditability; a warning is logged each time.

Fixes the 2021 tax year inflation flagged by the user. Existing
duplicates need a one-shot SQL cleanup (backfill task code-7z0).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-04-19 15:33:07 +00:00
parent 26e43b1055
commit 92e4ecaf78
2 changed files with 85 additions and 3 deletions

View file

@ -190,6 +190,9 @@ async def _insert_payslip(
if existing_id is not None:
return None
tax_year = derive_tax_year(extracted.pay_date)
bonus = await _dedup_bonus(session, tax_year, extracted.bonus, doc_id)
row = Payslip(
paperless_doc_id=doc_id,
pay_date=extracted.pay_date,
@ -206,7 +209,7 @@ async def _insert_payslip(
rsu_vest=extracted.rsu_vest,
rsu_offset=extracted.rsu_offset,
salary=extracted.salary,
bonus=extracted.bonus,
bonus=bonus,
pension_sacrifice=extracted.pension_sacrifice,
taxable_pay=extracted.taxable_pay,
ytd_tax_paid=extracted.ytd_tax_paid,
@ -217,7 +220,7 @@ async def _insert_payslip(
ytd_rsu_excs_refund=extracted.ytd_rsu_excs_refund,
other_deductions=_decimals_to_float(extracted.other_deductions),
net_pay=extracted.net_pay,
tax_year=derive_tax_year(extracted.pay_date),
tax_year=tax_year,
raw_extraction=raw,
validated=validated,
)
@ -230,6 +233,29 @@ def _decimals_to_float(mapping: dict[str, Decimal]) -> dict[str, float]:
return {k: float(v) for k, v in mapping.items()}
async def _dedup_bonus(session: Any, tax_year: str, bonus: Decimal, doc_id: int) -> Decimal:
"""Zero out a repeated bonus within the same tax year.
Meta pays one performance bonus per year (capped at 2 historically; 1
since the comp cycle change). If the parser or Claude extractor picks
up the same amount twice usually from a payslip that reprints the
YTD bonus figure as a current-period row charting sums the amount
multiple times and exaggerates total comp. Rather than surface
duplicates, we keep the first occurrence (earliest pay_date in the
tax year) and ingest subsequent matches with bonus=0. The original
figure stays in `raw_extraction` for auditability.
"""
if bonus <= 0:
return bonus
existing = await session.execute(
select(Payslip.id).where(Payslip.tax_year == tax_year, Payslip.bonus == bonus))
if existing.scalar() is not None:
log.warning("duplicate bonus suppressed: doc_id=%s tax_year=%s bonus=%s (kept first)",
doc_id, tax_year, bonus)
return Decimal("0")
return bonus
async def _handle_p60(
doc_id: int,
pdf_bytes: bytes,