Meta variant-B payslips gross up Taxable Pay for RSU and compute PAYE on the grossed-up figure, so `income_tax` on the slip is the total PAYE (cash + RSU-attributed). Dashboards that stacked the raw figure made vest-month tax look ~2x higher than "cash tax paid". Introduce `cash_income_tax = income_tax * (gross_pay - pension_sacrifice) / taxable_pay` as a derived column alongside the raw figure. Dashboards can now stack cash vs RSU-attributed tax as separate segments. Also capture YTD column values of `RSU Tax Offset` and `RSU Excs Refund` from the Payments grid — needed for reconciliation against HMRC annual figures. P60 ingest: new parser under `parsers/p60.py` anchoring on statutory HMRC line labels (`Tax year to 5 April YYYY`, `Employer PAYE reference`, `In this employment` pay/tax row, NI letter bands). Processor routes documents carrying the `p60` Paperless tag to `_handle_p60` which writes to the new `payslip_ingest.p60_reference` table (one row per tax_year+employer). App lifespan resolves the tag id at startup; missing tag disables dispatch without breaking payslip ingest. Paperless tag creation + webhook config are manual follow-ups. Migrations: - 0004 — cash_income_tax + ytd_rsu_tax_offset + ytd_rsu_excs_refund on payslip, all nullable. - 0005 — p60_reference table with (tax_year, employer) unique + paperless_doc_id unique for idempotent re-uploads. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
152 lines
5.7 KiB
Python
152 lines
5.7 KiB
Python
"""Regex-based parser for HMRC P60 End of Year Certificates.
|
|
|
|
UK P60 format is statutory — every employer's P60 has the same line anchors:
|
|
`Tax year to 5 April YYYY`, `Employer PAYE reference`, `In this employment`
|
|
(gross pay), `Tax deducted`, etc. We lean on those anchors rather than
|
|
column layout because pdftotext output varies between employers.
|
|
|
|
Handles two employer spellings that Meta has used on P60s over the years:
|
|
`Facebook UK Ltd` (pre-2022) and `Facebook UK Limited` (2022+). Returns an
|
|
`ExtractedP60` dataclass; on structural miss raises `P60ParserError`.
|
|
"""
|
|
import re
|
|
from dataclasses import dataclass
|
|
from decimal import Decimal
|
|
|
|
from payslip_ingest.parsers.meta_uk import AMOUNT_RE, EMPLOYER_RE, _to_decimal
|
|
|
|
|
|
class P60ParserError(ValueError):
    """Signal that the input text does not match the expected P60 template.

    Subclasses `ValueError`, so callers that already catch `ValueError`
    will also catch this.
    """
|
|
|
|
|
|
# Statutory heading line; group 1 is the four-digit year in which the tax
# year ends (the "YYYY" of `Tax year to 5 April YYYY`).
TAX_YEAR_RE = re.compile(r"Tax year to 5 April\s+(\d{4})")

# HMRC PAYE references look like `NNN/XXXXXXX`: a three-digit part, a slash,
# then an upper-case alphanumeric part. Whitespace around the slash is
# tolerated here.
PAYE_REF_RE = re.compile(r"Employer PAYE reference\s+(\d{3}\s*/\s*[A-Z0-9]+)")
|
|
# AMOUNT_RE's pattern is wrapped in a non-capturing group before being
# embedded in larger expressions, so that any top-level `|` inside it stays
# scoped to the amount and cannot split the surrounding pattern.
AMOUNT_FRAG = f"(?:{AMOUNT_RE.pattern})"

# Primary pay/tax row: `In this employment £<pay> £<tax>`.
# Group 1 = pay, group 2 = tax deducted.
IN_EMPLOYMENT_RE = re.compile(
    rf"In this employment[^\n\d]+£?\s*({AMOUNT_FRAG})[^\n\d]+£?\s*({AMOUNT_FRAG})"
)

# Fallback for layouts (e.g. older printouts) that do not print the
# `In this employment` row: a `Total for year` row with the same two-amount
# shape. Group 1 = pay, group 2 = tax deducted.
TOTAL_FOR_YEAR_RE = re.compile(
    rf"Total for year[^\n\d]+£?\s*({AMOUNT_FRAG})[^\n\d]+£?\s*({AMOUNT_FRAG})"
)

# NI rows are keyed by category letter (A/B/C/H ...). A row is recognised as
# a line starting with one upper-case letter followed by three
# whitespace-separated amounts; only the third amount is captured (group 1)
# and is treated as the employee's contribution for that letter.
# NOTE(review): the standard HMRC P60 NI table appears to list three
# earnings-band columns before the employee-contribution column — confirm
# that the third amount really is the contribution in the pdftotext output
# this was built against. Also note `\s+` can match newlines, so a "row"
# may span more than one physical line.
NI_LETTER_LINE_RE = re.compile(
    rf"^[A-Z]\s+£?\s*{AMOUNT_FRAG}\s+£?\s*{AMOUNT_FRAG}\s+£?\s*({AMOUNT_FRAG})",
    re.MULTILINE,
)

# Student loan deductions are optional on a P60. When the row is present its
# amount is captured in group 1 — a printed zero still counts as present.
STUDENT_LOAN_RE = re.compile(
    rf"Student Loan (?:repayments|deductions)[^\n\d]*£?\s*({AMOUNT_FRAG})"
)
|
|
# Group 1 is the final tax code. HMRC tax codes do not always end in a
# letter: K-prefix codes (e.g. `K475`) and the flat-rate codes `D0` / `D1`
# end in a digit. Requiring a trailing letter made those P60s come back
# with no tax code at all, so accept any run of upper-case letters/digits.
TAX_CODE_RE = re.compile(r"Final tax code\s+([0-9A-Z]+)")
|
|
|
|
|
|
@dataclass
|
|
class ExtractedP60:
|
|
tax_year: str # "2024/25"
|
|
employer: str
|
|
employer_paye_ref: str | None
|
|
gross_pay: Decimal
|
|
income_tax: Decimal
|
|
national_insurance: Decimal
|
|
student_loan: Decimal | None
|
|
tax_code: str | None
|
|
|
|
def to_raw(self) -> dict[str, str | None]:
|
|
"""Snapshot for `raw_extraction` JSON column."""
|
|
return {
|
|
"tax_year": self.tax_year,
|
|
"employer": self.employer,
|
|
"employer_paye_ref": self.employer_paye_ref,
|
|
"gross_pay": str(self.gross_pay),
|
|
"income_tax": str(self.income_tax),
|
|
"national_insurance": str(self.national_insurance),
|
|
"student_loan": str(self.student_loan) if self.student_loan is not None else None,
|
|
"tax_code": self.tax_code,
|
|
}
|
|
|
|
|
|
def parse_p60(text: str) -> ExtractedP60:
    """Parse the plain-text rendering of a P60 into an `ExtractedP60`.

    Args:
        text: Text extracted from the P60 document.

    Returns:
        The extracted tax year, employer, PAYE reference, pay, tax,
        National Insurance total, student loan amount and tax code.

    Raises:
        P60ParserError: If the text is blank, lacks the literal `P60`
            marker, or one of the required anchors (tax year, employer,
            pay/tax row) cannot be found.
    """
    # Cheap sanity checks before any regex work.
    if not text.strip():
        raise P60ParserError("empty text")
    if "P60" not in text:
        raise P60ParserError("does not look like a P60 (missing 'P60' marker)")

    # Required header fields first, so a structural miss is reported in a
    # stable order: tax year, then employer, then the pay/tax row.
    year_label = _parse_tax_year(text)
    employer_name = _parse_employer(text)
    paye_reference = _parse_paye_ref(text)

    pay, tax = _parse_pay_and_tax(text)
    ni_total = _sum_ni(text)

    # Optional fields: absent rows yield None rather than an error.
    loan = _optional_amount(text, STUDENT_LOAN_RE)
    final_code = _match_group(text, TAX_CODE_RE)

    return ExtractedP60(
        tax_year=year_label,
        employer=employer_name,
        employer_paye_ref=paye_reference,
        gross_pay=pay,
        income_tax=tax,
        national_insurance=ni_total,
        student_loan=loan,
        tax_code=final_code,
    )
|
|
|
|
|
|
def _parse_tax_year(text: str) -> str:
    """Return the tax year as `YYYY/YY`, derived from the statutory heading.

    The heading names the year the tax year ends in, so
    "to 5 April 2025" yields "2024/25".

    Raises:
        P60ParserError: If the `Tax year to 5 April YYYY` anchor is absent.
    """
    match = TAX_YEAR_RE.search(text)
    if match is None:
        raise P60ParserError("`Tax year to 5 April YYYY` anchor not found")

    end_year = int(match[1])
    start_year = end_year - 1
    end_suffix = str(end_year)[-2:]
    return f"{start_year}/{end_suffix}"
|
|
|
|
|
|
def _parse_employer(text: str) -> str:
    """Return the employer name exactly as matched by `EMPLOYER_RE`.

    Raises:
        P60ParserError: If `EMPLOYER_RE` does not match anywhere in `text`.
    """
    if (match := EMPLOYER_RE.search(text)) is None:
        raise P60ParserError("employer name not found (expected Facebook UK Ltd/Limited)")
    # The whole match (not a sub-group) is the employer name.
    return match[0]
|
|
|
|
|
|
def _parse_paye_ref(text: str) -> str | None:
    """Return the employer PAYE reference with all whitespace removed.

    The reference is optional: returns `None` when the
    `Employer PAYE reference` anchor is not found.
    """
    match = PAYE_REF_RE.search(text)
    # The pattern tolerates spaces around the slash; strip them so the
    # stored reference has no embedded whitespace.
    return re.sub(r"\s+", "", match.group(1)) if match is not None else None
|
|
|
|
|
|
def _parse_pay_and_tax(text: str) -> tuple[Decimal, Decimal]:
    """Return `(gross_pay, income_tax)` from the pay/tax row.

    The `In this employment` row is preferred; if it is missing, the
    `Total for year` row is used instead, since some older or reformatted
    P60s only print the totals line.

    Raises:
        P60ParserError: If neither row can be found.
    """
    row = IN_EMPLOYMENT_RE.search(text)
    if row is None:
        row = TOTAL_FOR_YEAR_RE.search(text)
    if row is None:
        raise P60ParserError("Neither `In this employment` nor `Total for year` pay/tax row found")

    # Both patterns capture pay in group 1 and tax deducted in group 2.
    pay_text, tax_text = row.group(1, 2)
    return _to_decimal(pay_text), _to_decimal(tax_text)
|
|
|
|
|
|
def _optional_amount(text: str, pattern: re.Pattern[str]) -> Decimal | None:
    """Return group 1 of `pattern` converted via `_to_decimal`, or `None`.

    `None` means the pattern did not match anywhere in `text`.
    """
    found = pattern.search(text)
    if found is None:
        return None
    return _to_decimal(found.group(1))
|
|
|
|
|
|
def _sum_ni(text: str) -> Decimal:
    """Add up the captured amount of every NI letter row (A/B/C/H ...).

    Returns `Decimal("0")` when no row matches `NI_LETTER_LINE_RE`.
    """
    contributions = (
        _to_decimal(row.group(1)) for row in NI_LETTER_LINE_RE.finditer(text)
    )
    # Start from a Decimal zero so the result is a Decimal even with no rows.
    return sum(contributions, Decimal("0"))
|
|
|
|
|
|
def _match_group(text: str, pattern: re.Pattern[str]) -> str | None:
|
|
m = pattern.search(text)
|
|
return m.group(1).strip() if m else None
|