payslip-ingest/payslip_ingest/parsers/p60.py

"""Regex-based parser for HMRC P60 End of Year Certificates.

UK P60 format is statutory — every employer's P60 has the same line anchors:
`Tax year to 5 April YYYY`, `Employer PAYE reference`, `In this employment`
(gross pay), `Tax deducted`, etc. We lean on those anchors rather than
column layout because pdftotext output varies between employers.

Handles two employer spellings that Meta has used on P60s over the years:
`Facebook UK Ltd` (pre-2022) and `Facebook UK Limited` (2022+). Returns an
`ExtractedP60` dataclass; on structural miss raises `P60ParserError`.
"""
import re
from dataclasses import dataclass
from decimal import Decimal

from payslip_ingest.parsers.meta_uk import AMOUNT_RE, EMPLOYER_RE, _to_decimal


class P60ParserError(ValueError):
    """Signal that the input text does not fit the expected P60 template.

    Derives from `ValueError`, so handlers written for `ValueError` also
    catch it.
    """


# Year-end anchor; the captured four-digit year is the year the tax year
# ends in.
TAX_YEAR_RE = re.compile(r"Tax year to 5 April\s+(\d{4})")
# Employer PAYE reference: three digits, a slash (optionally padded with
# whitespace), then an upper-case alphanumeric run.
PAYE_REF_RE = re.compile(r"Employer PAYE reference\s+(\d{3}\s*/\s*[A-Z0-9]+)")
# AMOUNT_RE's pattern goes inside a non-capturing group so that a top-level
# `|` in it stays scoped to the amount when the fragment is embedded in the
# larger patterns below.
AMOUNT_FRAG = f"(?:{AMOUNT_RE.pattern})"
# Primary pay/tax row, `In this employment  £<pay>  £<tax>`:
# group 1 = pay, group 2 = tax deducted.
IN_EMPLOYMENT_RE = re.compile(
    rf"In this employment[^\n\d]+£?\s*({AMOUNT_FRAG})[^\n\d]+£?\s*({AMOUNT_FRAG})")
# Fallback pay/tax row for layouts (e.g. older printouts) that only carry a
# `Total for year` line; same two-amount shape and group numbering.
TOTAL_FOR_YEAR_RE = re.compile(
    rf"Total for year[^\n\d]+£?\s*({AMOUNT_FRAG})[^\n\d]+£?\s*({AMOUNT_FRAG})")
# One NI row per category letter (A/B/C/H): a line starting with a single
# capital letter followed by three amounts. Only the third amount is
# captured; it is treated as the employee's contribution for that letter.
# NOTE(review): `\s` also matches newlines, so a row may span lines —
# confirm that is intended.
NI_LETTER_LINE_RE = re.compile(
    rf"^[A-Z]\s+£?\s*{AMOUNT_FRAG}\s+£?\s*{AMOUNT_FRAG}\s+£?\s*({AMOUNT_FRAG})",
    re.MULTILINE)
# Student loan line is optional (not every P60 carries one); an explicit
# zero amount still counts as present.
STUDENT_LOAN_RE = re.compile(
    rf"Student Loan (?:repayments|deductions)[^\n\d]*£?\s*({AMOUNT_FRAG})")
# Final tax code, captured as group 1. Two shapes are accepted:
#   * an upper-case alphanumeric run ending in a letter (1257L, S1257L,
#     BR, NT, 0T), and
#   * one or two prefix letters followed only by digits (K475, SK475, D0,
#     D1, SD1), which a letter-terminated pattern would miss or truncate.
# The trailing negative lookahead forces the capture to cover the whole
# upper-case alphanumeric token, so `SD1` can never be reported as `SD`.
TAX_CODE_RE = re.compile(
    r"Final tax code\s+((?:[0-9A-Z]+[A-Z]|[A-Z]{1,2}[0-9]+)(?![0-9A-Z]))")


@dataclass
class ExtractedP60:
    tax_year: str  # "2024/25"
    employer: str
    employer_paye_ref: str | None
    gross_pay: Decimal
    income_tax: Decimal
    national_insurance: Decimal
    student_loan: Decimal | None
    tax_code: str | None

    def to_raw(self) -> dict[str, str | None]:
        """Snapshot for `raw_extraction` JSON column."""
        return {
            "tax_year": self.tax_year,
            "employer": self.employer,
            "employer_paye_ref": self.employer_paye_ref,
            "gross_pay": str(self.gross_pay),
            "income_tax": str(self.income_tax),
            "national_insurance": str(self.national_insurance),
            "student_loan": str(self.student_loan) if self.student_loan is not None else None,
            "tax_code": self.tax_code,
        }


def parse_p60(text: str) -> ExtractedP60:
    """Parse the text of a P60 into an `ExtractedP60`.

    Args:
        text: Plain text of the certificate.

    Returns:
        The extracted fields. PAYE reference, student loan and tax code are
        `None` when their anchors are missing.

    Raises:
        P60ParserError: if the text is empty or whitespace-only, lacks the
            `P60` marker, or a required field (tax year, employer, pay/tax
            row) cannot be found.
    """
    # Cheap sanity checks before running any of the field regexes.
    if text.strip() == "":
        raise P60ParserError("empty text")
    if "P60" not in text:
        raise P60ParserError("does not look like a P60 (missing 'P60' marker)")

    # Helpers run in a fixed order; the first required field that is missing
    # determines which error is raised.
    year_label = _parse_tax_year(text)
    employer_name = _parse_employer(text)
    paye_reference = _parse_paye_ref(text)
    pay, tax = _parse_pay_and_tax(text)
    ni_total = _sum_ni(text)
    loan = _optional_amount(text, STUDENT_LOAN_RE)
    final_code = _match_group(text, TAX_CODE_RE)

    return ExtractedP60(
        tax_year=year_label,
        employer=employer_name,
        employer_paye_ref=paye_reference,
        gross_pay=pay,
        income_tax=tax,
        national_insurance=ni_total,
        student_loan=loan,
        tax_code=final_code,
    )


def _parse_tax_year(text: str) -> str:
    """Return the tax-year label (e.g. "2024/25") derived from the year-end anchor.

    Raises:
        P60ParserError: if the `Tax year to 5 April YYYY` anchor is absent.
    """
    found = TAX_YEAR_RE.search(text)
    if found is None:
        raise P60ParserError("`Tax year to 5 April YYYY` anchor not found")
    # The anchor names the year the tax year ends in, so "to 5 April 2025"
    # becomes 2024/25: previous year, slash, last two digits of the end year.
    end_year = int(found.group(1))
    start_year = end_year - 1
    end_suffix = str(end_year)[-2:]
    return f"{start_year}/{end_suffix}"


def _parse_employer(text: str) -> str:
    """Return the employer name exactly as matched by `EMPLOYER_RE`.

    Raises:
        P60ParserError: if no employer name matches.
    """
    if (hit := EMPLOYER_RE.search(text)) is not None:
        return hit.group(0)
    raise P60ParserError("employer name not found (expected Facebook UK Ltd/Limited)")


def _parse_paye_ref(text: str) -> str | None:
    """Return the employer PAYE reference with all whitespace removed.

    Returns `None` when the `Employer PAYE reference` anchor is not found.
    """
    found = PAYE_REF_RE.search(text)
    if found is not None:
        # The pattern tolerates whitespace around the slash; normalise it away.
        return re.sub(r"\s+", "", found.group(1))
    return None


def _parse_pay_and_tax(text: str) -> tuple[Decimal, Decimal]:
    """Extract `(gross_pay, income_tax)` from the pay/tax row.

    The `In this employment` row is tried first; if it is absent the
    `Total for year` row is used instead.

    Raises:
        P60ParserError: if neither row is found.
    """
    row = IN_EMPLOYMENT_RE.search(text)
    if row is None:
        row = TOTAL_FOR_YEAR_RE.search(text)
    if row is None:
        raise P60ParserError("Neither `In this employment` nor `Total for year` pay/tax row found")
    # Both patterns share the same numbering: group 1 = pay, group 2 = tax.
    pay_text, tax_text = row.group(1, 2)
    return _to_decimal(pay_text), _to_decimal(tax_text)


def _optional_amount(text: str, pattern: re.Pattern[str]) -> Decimal | None:
    """Return group 1 of `pattern` converted by `_to_decimal`, or `None` if no match."""
    found = pattern.search(text)
    if found is None:
        return None
    return _to_decimal(found.group(1))


def _sum_ni(text: str) -> Decimal:
    """Add up the captured contribution of every NI letter row (A/B/C/H ...).

    Returns `Decimal("0")` when no row matches.
    """
    contributions = (
        _to_decimal(row.group(1)) for row in NI_LETTER_LINE_RE.finditer(text)
    )
    # Start from a Decimal zero so the result is a Decimal even with no rows.
    return sum(contributions, Decimal("0"))


def _match_group(text: str, pattern: re.Pattern[str]) -> str | None:
    m = pattern.search(text)
    return m.group(1).strip() if m else None