processor: dedup bonus within tax year — zero out repeats
Meta UK pays one performance bonus per tax year (2 historically, 1 since the comp cycle change). If the same bonus amount appears on a second payslip in the same tax_year — typically from the extractor misreading a YTD figure as a current-period row — summing the column on the dashboard exaggerates total comp by 2x. `_dedup_bonus` keeps the first occurrence per (tax_year, amount) and stores subsequent matches with bonus=0. Original figure is preserved in raw_extraction for auditability; a warning is logged each time. Fixes the 2021 tax year inflation flagged by the user. Existing duplicates need a one-shot SQL cleanup (backfill task code-7z0). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
26e43b1055
commit
92e4ecaf78
2 changed files with 85 additions and 3 deletions
|
|
@ -190,6 +190,9 @@ async def _insert_payslip(
|
|||
if existing_id is not None:
|
||||
return None
|
||||
|
||||
tax_year = derive_tax_year(extracted.pay_date)
|
||||
bonus = await _dedup_bonus(session, tax_year, extracted.bonus, doc_id)
|
||||
|
||||
row = Payslip(
|
||||
paperless_doc_id=doc_id,
|
||||
pay_date=extracted.pay_date,
|
||||
|
|
@ -206,7 +209,7 @@ async def _insert_payslip(
|
|||
rsu_vest=extracted.rsu_vest,
|
||||
rsu_offset=extracted.rsu_offset,
|
||||
salary=extracted.salary,
|
||||
bonus=extracted.bonus,
|
||||
bonus=bonus,
|
||||
pension_sacrifice=extracted.pension_sacrifice,
|
||||
taxable_pay=extracted.taxable_pay,
|
||||
ytd_tax_paid=extracted.ytd_tax_paid,
|
||||
|
|
@ -217,7 +220,7 @@ async def _insert_payslip(
|
|||
ytd_rsu_excs_refund=extracted.ytd_rsu_excs_refund,
|
||||
other_deductions=_decimals_to_float(extracted.other_deductions),
|
||||
net_pay=extracted.net_pay,
|
||||
tax_year=derive_tax_year(extracted.pay_date),
|
||||
tax_year=tax_year,
|
||||
raw_extraction=raw,
|
||||
validated=validated,
|
||||
)
|
||||
|
|
@ -230,6 +233,29 @@ def _decimals_to_float(mapping: dict[str, Decimal]) -> dict[str, float]:
|
|||
return {k: float(v) for k, v in mapping.items()}
|
||||
|
||||
|
||||
async def _dedup_bonus(session: Any, tax_year: str, bonus: Decimal, doc_id: int) -> Decimal:
|
||||
"""Zero out a repeated bonus within the same tax year.
|
||||
|
||||
Meta pays one performance bonus per year (capped at 2 historically; 1
|
||||
since the comp cycle change). If the parser or Claude extractor picks
|
||||
up the same amount twice — usually from a payslip that reprints the
|
||||
YTD bonus figure as a current-period row — charting sums the amount
|
||||
multiple times and exaggerates total comp. Rather than surface
|
||||
duplicates, we keep the first occurrence (earliest pay_date in the
|
||||
tax year) and ingest subsequent matches with bonus=0. The original
|
||||
figure stays in `raw_extraction` for auditability.
|
||||
"""
|
||||
if bonus <= 0:
|
||||
return bonus
|
||||
existing = await session.execute(
|
||||
select(Payslip.id).where(Payslip.tax_year == tax_year, Payslip.bonus == bonus))
|
||||
if existing.scalar() is not None:
|
||||
log.warning("duplicate bonus suppressed: doc_id=%s tax_year=%s bonus=%s (kept first)",
|
||||
doc_id, tax_year, bonus)
|
||||
return Decimal("0")
|
||||
return bonus
|
||||
|
||||
|
||||
async def _handle_p60(
|
||||
doc_id: int,
|
||||
pdf_bytes: bytes,
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ def _sample_extraction() -> ExtractedPayslip:
|
|||
class _FakeSession:
|
||||
"""Minimal AsyncSession stand-in that records flushes and execute calls."""
|
||||
|
||||
def __init__(self, existing_ids: list[int]):
|
||||
def __init__(self, existing_ids: list[int | None]):
|
||||
self._existing_ids = existing_ids
|
||||
self.added: list[Any] = []
|
||||
self.begin_calls = 0
|
||||
|
|
@ -288,3 +288,59 @@ async def test_p60_tag_absent_follows_payslip_path(paperless: AsyncMock, extract
|
|||
assert result.status == "inserted"
|
||||
assert result.extractor == "meta_uk_regex"
|
||||
assert result.p60_id is None
|
||||
|
||||
|
||||
async def test_bonus_dedup_zeros_repeat_within_tax_year(paperless: AsyncMock,
|
||||
extractor: AsyncMock) -> None:
|
||||
"""Same bonus amount already seen in tax_year → insert as 0.
|
||||
|
||||
Meta pays one perf bonus per tax year (2 historically). A duplicate
|
||||
amount usually means the extractor misread the YTD figure as current
|
||||
period — we keep the first occurrence and suppress subsequent matches.
|
||||
"""
|
||||
sample = _sample_extraction().model_dump()
|
||||
sample["bonus"] = Decimal("25000.00")
|
||||
extractor.extract.return_value = ExtractedPayslip.model_validate(sample)
|
||||
|
||||
# Insert session has 2 execute calls: paperless_doc_id dedup, bonus dedup.
|
||||
# existing_ids=[None, 1] → paperless_doc_id not found; bonus already seen (id=1).
|
||||
factory = _SessionFactory([
|
||||
_FakeSession(existing_ids=[]),
|
||||
_FakeSession(existing_ids=[None, 1]),
|
||||
])
|
||||
result = await process_document(42, factory, paperless, extractor)
|
||||
assert result.status == "inserted"
|
||||
assert factory.used[1].added[0].bonus == Decimal("0")
|
||||
|
||||
|
||||
async def test_bonus_dedup_keeps_first_occurrence(paperless: AsyncMock,
|
||||
extractor: AsyncMock) -> None:
|
||||
"""First occurrence of a bonus in a tax_year is preserved."""
|
||||
sample = _sample_extraction().model_dump()
|
||||
sample["bonus"] = Decimal("7777.77")
|
||||
extractor.extract.return_value = ExtractedPayslip.model_validate(sample)
|
||||
|
||||
factory = _SessionFactory([
|
||||
_FakeSession(existing_ids=[]),
|
||||
_FakeSession(existing_ids=[None, None]),
|
||||
])
|
||||
result = await process_document(42, factory, paperless, extractor)
|
||||
assert result.status == "inserted"
|
||||
assert factory.used[1].added[0].bonus == Decimal("7777.77")
|
||||
|
||||
|
||||
async def test_bonus_dedup_skips_when_bonus_zero(paperless: AsyncMock,
|
||||
extractor: AsyncMock) -> None:
|
||||
"""Bonus=0 rows skip the dedup query — avoids a pointless DB hit."""
|
||||
sample = _sample_extraction().model_dump()
|
||||
sample["bonus"] = Decimal("0")
|
||||
extractor.extract.return_value = ExtractedPayslip.model_validate(sample)
|
||||
|
||||
# existing_ids only contains 1 entry — if dedup ran we'd exhaust it.
|
||||
factory = _SessionFactory([
|
||||
_FakeSession(existing_ids=[]),
|
||||
_FakeSession(existing_ids=[None]),
|
||||
])
|
||||
result = await process_document(42, factory, paperless, extractor)
|
||||
assert result.status == "inserted"
|
||||
assert factory.used[1].added[0].bonus == Decimal("0")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue