parser + P60 ingest: split income_tax cash/RSU, add P60 ground-truth

Meta variant-B payslips gross up Taxable Pay for RSU and compute PAYE on
the grossed-up figure, so `income_tax` on the slip is the total PAYE
(cash + RSU-attributed). Dashboards that stacked the raw figure made
vest-month tax look ~2x higher than "cash tax paid". Introduce
`cash_income_tax = income_tax * (gross_pay - pension_sacrifice) /
taxable_pay` as a derived column alongside the raw figure. Dashboards
can now stack cash vs RSU-attributed tax as separate segments.

Also capture YTD column values of `RSU Tax Offset` and `RSU Excs Refund`
from the Payments grid — needed for reconciliation against HMRC annual
figures.

P60 ingest: new parser under `parsers/p60.py` anchoring on statutory
HMRC line labels (`Tax year to 5 April YYYY`, `Employer PAYE reference`,
`In this employment` pay/tax row, NI letter bands). Processor routes
documents carrying the `p60` Paperless tag to `_handle_p60` which
writes to the new `payslip_ingest.p60_reference` table (one row per
tax_year+employer). App lifespan resolves the tag id at startup; missing
tag disables dispatch without breaking payslip ingest. Paperless tag
creation + webhook config are manual follow-ups.

Migrations:
- 0004 — cash_income_tax + ytd_rsu_tax_offset + ytd_rsu_excs_refund on
  payslip, all nullable.
- 0005 — p60_reference table with (tax_year, employer) unique +
  paperless_doc_id unique for idempotent re-uploads.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-04-19 15:23:05 +00:00
parent d91f34ddb4
commit 26e43b1055
14 changed files with 644 additions and 15 deletions

51
tests/fixtures/meta_uk_p60_2024_25.txt vendored Normal file
View file

@ -0,0 +1,51 @@
P60 End of Year Certificate
Tax year to 5 April 2025
Employee's details
Surname BARZIN
First two forenames VIKTOR
National Insurance number AA 12 34 56 A
Works/Payroll number 254680
Pay and Income Tax details
Pay Tax deducted
In previous employment(s) £0.00 £0.00
In this employment £232,630.34 £95,820.11
Total for year £232,630.34 £95,820.11
Final tax code 1257L
National Insurance contributions in this employment
NI table letter Earnings at Earnings above Total of employee's
LEL LEL up to UEL contributions in
this employment
A £6,396.00 £47,268.00 £5,172.40
Statutory payments included in the pay 'In this employment' figure
Statutory Maternity Pay £0.00
Statutory Paternity Pay £0.00
Student Loan deductions in this employment £0.00
Other details
Your employer's full name and address
Facebook UK Limited
10 Brock Street
London
NW1 3FG
Employer PAYE reference 120/FA12345
This form shows your total pay for Income Tax purposes in this employment
for the year. Any overtime, bonus, commission etc, Statutory Sick Pay,
Statutory Maternity Pay, Statutory Paternity Pay or Shared Parental Pay,
Statutory Parental Bereavement Pay is included.

View file

@ -38,6 +38,15 @@ def test_parses_variant_b_modern() -> None:
assert result.ytd_taxable_pay == Decimal("373601.64")
assert result.ytd_gross == Decimal("232630.34")
# Derived cash-only PAYE: income_tax * (gross - pension_sacrifice) / taxable_pay
# = 31311.90 * 39282.69 / 72096.92 = 17060.59 (vs 31311.90 total PAYE)
assert result.cash_income_tax is not None
assert abs(result.cash_income_tax - Decimal("17060.59")) <= Decimal("0.02")
# YTD column of RSU lines in the Payments grid
assert result.ytd_rsu_tax_offset == Decimal("124674.27")
assert result.ytd_rsu_excs_refund == Decimal("3221.32")
def test_parses_variant_b_with_bonus() -> None:
"""March 2025 — variant B, bonus + RSU + multiple other deductions."""
@ -145,6 +154,28 @@ def test_parses_variant_a_2021_08() -> None:
assert result.taxable_pay == Decimal("15323.16")
def test_cash_income_tax_falls_back_when_taxable_pay_missing() -> None:
    """Without a usable taxable_pay, cash_income_tax equals income_tax.

    Both a missing (``None``) and a zero ``taxable_pay`` are exercised; in
    either case the helper is expected to hand back the full income tax.
    """
    from payslip_ingest.parsers.meta_uk import _cash_income_tax

    income_tax = Decimal("1000")
    # NOTE(review): the two middle positional arguments are presumably
    # gross pay and pension sacrifice — the helper's signature is not
    # visible from this test file; confirm against the parser module.
    for taxable_pay in (None, Decimal("0")):
        cash_tax = _cash_income_tax(income_tax, Decimal("5000"), Decimal("100"), taxable_pay)
        assert cash_tax == Decimal("1000")
def test_variant_a_cash_income_tax_pro_rata() -> None:
    """Variant A fixture with taxable_pay yields a pro-rata cash_income_tax.

    The 2021-06 fixture has taxable_pay=5095.86 (= gross_pay),
    pension_sacrifice=152.90 and income_tax=1410.07, so the expected value is
    1410.07 * 4942.96 / 5095.86 = 1367.76.
    """
    expected_cash_tax = Decimal("1367.76")
    tolerance = Decimal("0.02")  # allow for rounding in the derived figure

    payslip = parse_meta_uk(_load("meta_uk_2021_06_variant_a_bik.txt"))

    assert payslip.taxable_pay == Decimal("5095.86")
    assert payslip.cash_income_tax is not None
    assert abs(payslip.cash_income_tax - expected_cash_tax) <= tolerance
def test_raises_on_non_meta_payslip() -> None:
    """Text that is not a Meta payslip makes parse_meta_uk raise ParserError."""
    unrelated_text = "This is not a Meta payslip\nRandom text\n"
    with pytest.raises(ParserError):
        parse_meta_uk(unrelated_text)

74
tests/test_p60_parser.py Normal file
View file

@ -0,0 +1,74 @@
from decimal import Decimal
from pathlib import Path
import pytest
from payslip_ingest.parsers.p60 import P60ParserError, parse_p60
# Directory holding the text fixtures, located next to this test module.
FIXTURES = Path(__file__).parent.joinpath("fixtures")


def _load(name: str) -> str:
    """Return the UTF-8 decoded contents of the fixture file called *name*."""
    fixture_path = FIXTURES.joinpath(name)
    return fixture_path.read_text(encoding="utf-8")
def test_parses_meta_uk_p60_2024_25() -> None:
    """Parse the 2024/25 P60 fixture and check each extracted field."""
    p60 = parse_p60(_load("meta_uk_p60_2024_25.txt"))

    # Certificate identity.
    assert p60.tax_year == "2024/25"
    assert p60.employer == "Facebook UK Limited"
    assert p60.employer_paye_ref == "120/FA12345"

    # Monetary figures.
    assert p60.gross_pay == Decimal("232630.34")
    assert p60.income_tax == Decimal("95820.11")
    assert p60.national_insurance == Decimal("5172.40")
    assert p60.student_loan == Decimal("0.00")

    assert p60.tax_code == "1257L"
def test_parse_p60_raises_on_non_p60_text() -> None:
    """Payslip-like text is rejected with a "does not look like a P60" error."""
    payslip_text = "Payslip for March 2025\nGross: £1000\n"
    with pytest.raises(P60ParserError, match="does not look like a P60"):
        parse_p60(payslip_text)
def test_parse_p60_raises_on_empty_text() -> None:
    """An empty document makes parse_p60 raise P60ParserError."""
    empty_text = ""
    with pytest.raises(P60ParserError):
        parse_p60(empty_text)
def test_parse_p60_raises_without_tax_year_anchor() -> None:
    """Text mentioning P60 but lacking the tax-year line raises a "Tax year" error."""
    text_without_anchor = "P60\nSome other content without the required anchor\n"
    with pytest.raises(P60ParserError, match="Tax year"):
        parse_p60(text_without_anchor)
def test_parse_p60_handles_old_facebook_uk_ltd_spelling() -> None:
    """Pre-2022 P60s list the employer as `Facebook UK Ltd` (no `Limited`)."""
    current_text = _load("meta_uk_p60_2024_25.txt")
    # Simulate the older certificate by swapping the employer name in the fixture.
    legacy_text = current_text.replace("Facebook UK Limited", "Facebook UK Ltd")

    assert parse_p60(legacy_text).employer == "Facebook UK Ltd"
def test_parse_p60_student_loan_missing_is_none() -> None:
    """P60s for years without student-loan deductions omit that line entirely."""
    fixture_lines = _load("meta_uk_p60_2024_25.txt").splitlines()
    # Drop the Student Loan line (simulating a year pre-loan).
    kept_lines = [line for line in fixture_lines if "Student Loan" not in line]

    p60 = parse_p60("\n".join(kept_lines))

    assert p60.student_loan is None
def test_parse_p60_tax_code_missing_is_none() -> None:
    """Some historical P60s may not print a `Final tax code` line."""
    fixture_text = _load("meta_uk_p60_2024_25.txt")
    # Overwrite the label so the tax-code line no longer carries it.
    without_label = fixture_text.replace("Final tax code", "XXX")

    assert parse_p60(without_label).tax_code is None
def test_parse_p60_sums_ni_across_letter_bands() -> None:
    """Employees who cross NI letter bands mid-year get one row per letter."""
    base_text = _load("meta_uk_p60_2024_25.txt")
    # A second NI letter row, shaped like the letter-A row already in the fixture.
    letter_c_row = "C £6,396.00 £47,268.00 £1,000.00\n"
    two_band_text = f"{base_text}\n{letter_c_row}"

    p60 = parse_p60(two_band_text)

    # 5172.40 (letter A, in fixture) + 1000.00 (letter C, appended)
    assert p60.national_insurance == Decimal("6172.40")

View file

@ -246,3 +246,45 @@ async def test_rejects_zero_gross_zero_net(paperless: AsyncMock, extractor: Asyn
factory = _SessionFactory([_FakeSession(existing_ids=[])])
with pytest.raises(ValueError, match="zero gross and net"):
await process_document(42, factory, paperless, extractor)
async def test_p60_tag_routes_to_p60_handler(paperless: AsyncMock, extractor: AsyncMock,
                                             monkeypatch: pytest.MonkeyPatch) -> None:
    """A doc carrying the P60 tag id goes to _handle_p60 (not the payslip path)."""
    fixture_path = FIXTURES / "meta_uk_p60_2024_25.txt"
    p60_text = fixture_path.read_text(encoding="utf-8")

    def _fake_pdftotext(_):
        # Stand-in for the real text extraction: always yields the fixture text.
        return p60_text

    monkeypatch.setattr(processor, "_pdftotext", _fake_pdftotext)
    paperless.get_document.return_value = {"id": 42, "title": "P60 2024-25", "tags": [7]}

    # Two sessions: one for combined dedup, one for the P60 insert.
    sessions = [_FakeSession(existing_ids=[]) for _ in range(2)]
    factory = _SessionFactory(sessions)

    outcome = await process_document(42, factory, paperless, extractor, p60_tag_id=7)

    assert outcome.status == "inserted"
    assert outcome.extractor == "p60_regex"
    assert outcome.p60_id == 1
    # Extractor (Claude) must not be called for a P60.
    extractor.extract.assert_not_called()

    # The row is read back from the second session handed out by the factory.
    insert_session = factory.used[1]
    inserted_row = insert_session.added[0]
    assert inserted_row.tax_year == "2024/25"
    assert inserted_row.gross_pay == Decimal("232630.34")
    assert inserted_row.income_tax == Decimal("95820.11")
async def test_p60_tag_absent_follows_payslip_path(paperless: AsyncMock, extractor: AsyncMock,
                                                   monkeypatch: pytest.MonkeyPatch) -> None:
    """A regular payslip (no P60 tag) still goes through the payslip path."""
    fixture_path = FIXTURES / "meta_uk_2026_02.txt"
    meta_text = fixture_path.read_text(encoding="utf-8")

    def _fake_pdftotext(_):
        # Stand-in for the real text extraction: always yields the fixture text.
        return meta_text

    monkeypatch.setattr(processor, "_pdftotext", _fake_pdftotext)
    # The document's only tag (3) differs from the p60_tag_id (7) passed below.
    paperless.get_document.return_value = {"id": 42, "title": "Payslip", "tags": [3]}

    sessions = [_FakeSession(existing_ids=[]) for _ in range(2)]
    factory = _SessionFactory(sessions)

    outcome = await process_document(42, factory, paperless, extractor, p60_tag_id=7)

    assert outcome.status == "inserted"
    assert outcome.extractor == "meta_uk_regex"
    assert outcome.p60_id is None