payslip-ingest/payslip_ingest/parsers/p60.py
Viktor Barzin 26e43b1055 parser + P60 ingest: split income_tax cash/RSU, add P60 ground-truth
Meta variant-B payslips gross up Taxable Pay for RSU and compute PAYE on
the grossed-up figure, so `income_tax` on the slip is the total PAYE
(cash + RSU-attributed). Dashboards that stacked the raw figure made
vest-month tax look ~2x higher than "cash tax paid". Introduce
`cash_income_tax = income_tax * (gross_pay - pension_sacrifice) /
taxable_pay` as a derived column alongside the raw figure. Dashboards
can now stack cash vs RSU-attributed tax as separate segments.

Also capture YTD column values of `RSU Tax Offset` and `RSU Excs Refund`
from the Payments grid — needed for reconciliation against HMRC annual
figures.

P60 ingest: new parser under `parsers/p60.py` anchoring on statutory
HMRC line labels (`Tax year to 5 April YYYY`, `Employer PAYE reference`,
`In this employment` pay/tax row, NI letter bands). Processor routes
documents carrying the `p60` Paperless tag to `_handle_p60` which
writes to the new `payslip_ingest.p60_reference` table (one row per
tax_year+employer). App lifespan resolves the tag id at startup; missing
tag disables dispatch without breaking payslip ingest. Paperless tag
creation + webhook config are manual follow-ups.

Migrations:
- 0004 — cash_income_tax + ytd_rsu_tax_offset + ytd_rsu_excs_refund on
  payslip, all nullable.
- 0005 — p60_reference table with (tax_year, employer) unique +
  paperless_doc_id unique for idempotent re-uploads.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:23:05 +00:00

152 lines
5.7 KiB
Python

"""Regex-based parser for HMRC P60 End of Year Certificates.
UK P60 format is statutory — every employer's P60 has the same line anchors:
`Tax year to 5 April YYYY`, `Employer PAYE reference`, `In this employment`
(gross pay), `Tax deducted`, etc. We lean on those anchors rather than
column layout because pdftotext output varies between employers.
Handles two employer spellings that Meta has used on P60s over the years:
`Facebook UK Ltd` (pre-2022) and `Facebook UK Limited` (2022+). Returns an
`ExtractedP60` dataclass; on structural miss raises `P60ParserError`.
"""
import re
from dataclasses import dataclass
from decimal import Decimal
from payslip_ingest.parsers.meta_uk import AMOUNT_RE, EMPLOYER_RE, _to_decimal
class P60ParserError(ValueError):
    """Signals that the supplied text does not fit the expected P60 template.

    Derives from ``ValueError``, so handlers written for either type catch it.
    """
# Anchors on the `Tax year to 5 April YYYY` heading; group 1 is the four-digit
# year in which the tax year ends.
TAX_YEAR_RE = re.compile(r"Tax year to 5 April\s+(\d{4})")
# HMRC PAYE references are `NNN/XXXXXXX` — 3 digits + slash + alphanumeric.
# Whitespace on either side of the slash is accepted; group 1 is the reference
# exactly as matched, including any such whitespace.
PAYE_REF_RE = re.compile(r"Employer PAYE reference\s+(\d{3}\s*/\s*[A-Z0-9]+)")
# Wrap AMOUNT_RE.pattern in a non-capturing group so its top-level `|`
# stays scoped inside — otherwise, once embedded in a larger regex, the
# alternation would apply to the whole enclosing pattern.
AMOUNT_FRAG = "(?:" + AMOUNT_RE.pattern + ")"
# The canonical P60 has a row `In this employment £<pay> £<tax>`. We
# capture both amounts: group 1 = pay, group 2 = tax deducted.
# The `[^\n\d]+` separators can neither cross a newline nor consume a digit.
IN_EMPLOYMENT_RE = re.compile(r"In this employment[^\n\d]+£?\s*(" + AMOUNT_FRAG +
r")[^\n\d]+£?\s*(" + AMOUNT_FRAG + r")")
# Fallback: some P60 layouts (e.g. older printouts) put pay and tax on
# separate lines — a `Total for year` row has both, same shape
# (group 1 = pay, group 2 = tax deducted).
TOTAL_FOR_YEAR_RE = re.compile(r"Total for year[^\n\d]+£?\s*(" + AMOUNT_FRAG + r")[^\n\d]+£?\s*(" +
AMOUNT_FRAG + r")")
# NI totals are split by letter (A/B/C/H). Anchor on lines that start with
# a single letter followed by three whitespace-separated amounts; take the
# 3rd amount as the employee's contribution for that letter band (group 1).
# `re.MULTILINE` lets `^` match at the start of every line.
# NOTE(review): the `\s+` separators also match newlines, so the three
# amounts may be picked up across line breaks — confirm this is intended.
NI_LETTER_LINE_RE = re.compile(
r"^[A-Z]\s+£?\s*" + AMOUNT_FRAG + r"\s+£?\s*" + AMOUNT_FRAG + r"\s+£?\s*(" + AMOUNT_FRAG + r")",
re.MULTILINE)
# Student loan is optional — not every P60 has one. Zero is still "has one".
# Group 1 = the amount following the `Student Loan repayments|deductions` label.
STUDENT_LOAN_RE = re.compile(r"Student Loan (?:repayments|deductions)[^\n\d]*£?\s*(" + AMOUNT_FRAG +
r")")
# Group 1 = the code printed after `Final tax code`.
# HMRC codes do not all end in a letter: K codes (`K500`) and the flat-rate
# `D0`/`D1`/`D2` codes end in a digit, optionally behind a Scottish/Welsh
# `S`/`C` prefix. Those shapes are tried first so they are captured whole
# (a trailing-letter-only pattern misses `K500`/`D0` and truncates `SK500`
# to `SK`). Everything else (`1257L`, `BR`, `NT`, `0T`, `S1257L`, ...) falls
# through to the generic "alphanumerics ending in a letter" alternative.
TAX_CODE_RE = re.compile(r"Final tax code\s+([SC]?K\d+|[SC]?D\d|[0-9A-Z]+[A-Z])")
@dataclass
class ExtractedP60:
tax_year: str # "2024/25"
employer: str
employer_paye_ref: str | None
gross_pay: Decimal
income_tax: Decimal
national_insurance: Decimal
student_loan: Decimal | None
tax_code: str | None
def to_raw(self) -> dict[str, str | None]:
"""Snapshot for `raw_extraction` JSON column."""
return {
"tax_year": self.tax_year,
"employer": self.employer,
"employer_paye_ref": self.employer_paye_ref,
"gross_pay": str(self.gross_pay),
"income_tax": str(self.income_tax),
"national_insurance": str(self.national_insurance),
"student_loan": str(self.student_loan) if self.student_loan is not None else None,
"tax_code": self.tax_code,
}
def parse_p60(text: str) -> ExtractedP60:
    """Turn the extracted text of a P60 into an `ExtractedP60`.

    Raises `P60ParserError` for blank input, for text lacking the literal
    `P60` marker, or when one of the mandatory anchors (tax year, employer,
    pay/tax row) cannot be located by the helper responsible for it.
    """
    if not text.strip():
        raise P60ParserError("empty text")
    if "P60" not in text:
        raise P60ParserError("does not look like a P60 (missing 'P60' marker)")

    # Helpers run in a fixed order (tax year, employer, PAYE ref, pay/tax) so
    # the first missing anchor determines which error is reported.
    year = _parse_tax_year(text)
    company = _parse_employer(text)
    reference = _parse_paye_ref(text)
    pay, tax = _parse_pay_and_tax(text)

    # Keyword arguments are evaluated left to right, so NI, student loan and
    # tax code are extracted in that order after the mandatory fields.
    return ExtractedP60(
        tax_year=year,
        employer=company,
        employer_paye_ref=reference,
        gross_pay=pay,
        income_tax=tax,
        national_insurance=_sum_ni(text),
        student_loan=_optional_amount(text, STUDENT_LOAN_RE),
        tax_code=_match_group(text, TAX_CODE_RE),
    )
def _parse_tax_year(text: str) -> str:
    """Return the tax year as `YYYY/YY`, read from the `Tax year to 5 April YYYY` line.

    Raises `P60ParserError` when that anchor is absent.
    """
    found = TAX_YEAR_RE.search(text)
    if found is None:
        raise P60ParserError("`Tax year to 5 April YYYY` anchor not found")
    end_year = int(found.group(1))
    start_year = end_year - 1
    suffix = str(end_year)[-2:]
    # "to 5 April 2025" → the tax year is 2024/25.
    return f"{start_year}/{suffix}"
def _parse_employer(text: str) -> str:
    """Return the employer name exactly as `EMPLOYER_RE` matched it.

    Raises `P60ParserError` when the pattern finds nothing in `text`.
    """
    hit = EMPLOYER_RE.search(text)
    if hit:
        return hit.group(0)
    raise P60ParserError("employer name not found (expected Facebook UK Ltd/Limited)")
def _parse_paye_ref(text: str) -> str | None:
    """Return the employer PAYE reference with all whitespace removed.

    Returns ``None`` when the `Employer PAYE reference` line is not found.
    """
    found = PAYE_REF_RE.search(text)
    # The pattern tolerates spaces around the slash; strip them from the result.
    return re.sub(r"\s+", "", found.group(1)) if found else None
def _parse_pay_and_tax(text: str) -> tuple[Decimal, Decimal]:
    """Return (gross_pay, income_tax) from the `In this employment` row.

    When that row is missing, the `Total for year` row is used instead —
    some older / reformatted P60s only print the totals line. Raises
    `P60ParserError` if neither row can be found.
    """
    row = IN_EMPLOYMENT_RE.search(text)
    if not row:
        row = TOTAL_FOR_YEAR_RE.search(text)
    if not row:
        raise P60ParserError("Neither `In this employment` nor `Total for year` pay/tax row found")
    pay_text, tax_text = row.group(1, 2)
    return _to_decimal(pay_text), _to_decimal(tax_text)
def _optional_amount(text: str, pattern: re.Pattern[str]) -> Decimal | None:
m = pattern.search(text)
return _to_decimal(m.group(1)) if m else None
def _sum_ni(text: str) -> Decimal:
    """Sum contributions across all NI letter rows (A/B/C/H ...).

    Yields ``Decimal("0")`` when no row matches `NI_LETTER_LINE_RE`.
    """
    contributions = (
        _to_decimal(row.group(1)) for row in NI_LETTER_LINE_RE.finditer(text)
    )
    return sum(contributions, Decimal("0"))
def _match_group(text: str, pattern: re.Pattern[str]) -> str | None:
m = pattern.search(text)
return m.group(1).strip() if m else None