# payslip-ingest/payslip_ingest/parsers/meta_uk.py

"""Regex-based Meta UK payslip parser.

Meta UK payslips use a stable template that splits into two layout variants
with a hard boundary at the 2022-01-31 template change:

- Variant A (pre-2022): single-column "Description / This Period / This Year"
  layout. No RSU lines (Viktor's pre-vest tenure). AE Pension EE lists as a
  positive deduction against a pre-sacrifice gross.

- Variant B (post-2022): side-by-side "Payments | Deductions | Year to Date"
  three-column layout. AE Pension EE sits in the Payments column as a
  negative line — i.e. salary sacrifice reduces Total Payment before it hits
  PAYE. RSU vest arrives as two lines in Payments: "RSU Tax Offset" (the
  notional RSU value) and "RSU Excs Refund" (any over-withheld amount
  returned). Their sum is what we attribute as `rsu_vest`.

Parser returns `ExtractedPayslip`. On any structural miss (header not found,
Pay Date missing, totals row malformed) it raises `ParserError` — the caller
falls back to ClaudeExtractor so we never silently drop a payslip.
"""
import re
from datetime import date, datetime
from decimal import Decimal

from payslip_ingest.schema import ExtractedPayslip


class ParserError(ValueError):
    """Signal that the input text does not fit the Meta UK payslip template.

    Derives from ``ValueError``, so code that already handles ``ValueError``
    also handles this exception.
    """


# A money amount: optional leading minus, integer part, exactly two decimals.
# The integer part is either comma-grouped ("1,234.56") or a plain run of
# digits ("1234.56"). Accepting the plain form stops an ungrouped figure from
# being matched from its last three integer digits only ("234.56"), which
# would silently yield a wrong amount and a label with a stray digit.
AMOUNT_RE = re.compile(r"-?(?:\d{1,3}(?:,\d{3})+|\d+)\.\d{2}")
# Header fields; group 1 captures the date text in DD/MM/YYYY form.
PAY_DATE_RE = re.compile(r"Pay Date:\s*(\d{2}/\d{2}/\d{4})")
PERIOD_START_RE = re.compile(r"Period Start:\s*(\d{2}/\d{2}/\d{4})")
PERIOD_END_RE = re.compile(r"Period End:\s*(\d{2}/\d{2}/\d{4})")

# Employer name written into every payslip this module returns.
EMPLOYER = "Facebook UK Limited"


def parse_meta_uk(text: str) -> ExtractedPayslip:
    """Parse the text of a Meta UK payslip, dispatching on its layout.

    Args:
        text: Full text of one payslip.

    Returns:
        The payslip produced by the variant B or variant A parser.

    Raises:
        ParserError: if ``text`` is blank, mentions neither employer name,
            or contains neither variant's header line.
    """
    if text.strip() == "":
        raise ParserError("empty text")

    mentions_employer = "Facebook UK Limited" in text or "Meta Platforms" in text
    if not mentions_employer:
        raise ParserError("does not look like a Meta UK payslip")

    lines = text.splitlines()
    # Variant B is probed first; variant A is only considered when B's header
    # is absent.
    dispatch = (
        (_is_variant_b, _parse_variant_b),
        (_is_variant_a, _parse_variant_a),
    )
    for has_header, parse in dispatch:
        if has_header(lines):
            return parse(text, lines)
    raise ParserError("neither variant A nor variant B header found")


def _is_variant_b(lines: list[str]) -> bool:
    return any("Payments" in line and "Deductions" in line and "Year to Date" in line
               for line in lines)


def _is_variant_a(lines: list[str]) -> bool:
    return any("Description" in line and "This Period" in line and "This Year" in line
               for line in lines)


def _to_decimal(s: str) -> Decimal:
    return Decimal(s.replace(",", ""))


def _parse_uk_date(s: str) -> date:
    return datetime.strptime(s, "%d/%m/%Y").date()


def _find_field(text: str, pattern: re.Pattern[str]) -> str | None:
    m = pattern.search(text)
    return m.group(1) if m else None


def _last_amount(segment: str) -> tuple[str, Decimal | None]:
    """Split one cell into its label and its rightmost amount.

    The cell is expected to read "label ... amount". The last ``AMOUNT_RE``
    match in the cell is taken as the amount, and the text before that match,
    stripped, is the label.

    Returns:
        ``(label, amount)``. When the cell holds no amount the result is
        ``(segment.strip(), None)``.
    """
    rightmost = None
    for rightmost in AMOUNT_RE.finditer(segment):
        pass  # exhaust the iterator; only the final match is wanted
    if rightmost is None:
        return segment.strip(), None
    text_before = segment[:rightmost.start()]
    return text_before.strip(), _to_decimal(rightmost.group())


def _parse_dates(text: str) -> tuple[date, date | None, date | None]:
    """Extract the pay date and the optional period start/end dates.

    Returns:
        ``(pay_date, period_start, period_end)``; either period date is
        ``None`` when its field is not present in ``text``.

    Raises:
        ParserError: if the Pay Date field is missing.
    """
    raw_pay_date = _find_field(text, PAY_DATE_RE)
    if raw_pay_date is None:
        raise ParserError("Pay Date not found")

    def _optional_date(pattern: re.Pattern[str]) -> date | None:
        raw = _find_field(text, pattern)
        return _parse_uk_date(raw) if raw else None

    pay_date = _parse_uk_date(raw_pay_date)
    starts_on = _optional_date(PERIOD_START_RE)
    ends_on = _optional_date(PERIOD_END_RE)
    return pay_date, starts_on, ends_on


def _parse_variant_b(text: str, lines: list[str]) -> ExtractedPayslip:
    """Build an ``ExtractedPayslip`` from the three-column (variant B) layout.

    Args:
        text: Full payslip text; used for the date fields.
        lines: ``text`` split into lines; used for the column table.

    Raises:
        ParserError: propagated from the helpers when the header, the totals
            row or the Pay Date cannot be found.
    """
    zero = Decimal("0")

    def _first_present(table: dict[str, Decimal], *labels: str) -> Decimal:
        # The first label that exists in the table wins; zero when none does.
        for label in labels:
            if label in table:
                return table[label]
        return zero

    header_idx, d_col, y_col = _find_variant_b_header(lines)
    payments, payments_order, deductions = _collect_b_rows(lines, header_idx, d_col, y_col)
    gross_pay, net_pay = _parse_b_totals_row(lines, header_idx, d_col, y_col)
    summary = _parse_summary_block(lines)

    # A negative "AE Pension EE" in the Payments column is treated as salary
    # sacrifice; its magnitude is reported. Anything else counts as none.
    ae_pension = _first_present(payments, "AE Pension EE")
    if ae_pension < 0:
        pension_sacrifice = abs(ae_pension)
    else:
        pension_sacrifice = zero

    # The RSU vest is attributed as the sum of the two RSU payment lines.
    rsu_tax_offset = _first_present(payments, "RSU Tax Offset")
    rsu_excess_refund = _first_present(payments, "RSU Excs Refund")
    rsu_vest = rsu_tax_offset + rsu_excess_refund

    income_tax = _first_present(deductions, "Tax paid", "Tax")
    nic = _first_present(deductions, "Employee NIC", "National Insurance")
    student_loan = _first_present(deductions, "Student Loans", "Student Loan")
    salary = _first_present(payments, "Salary")
    bonus = _first_present(payments, "Perform Bonus", "Bonus")

    other_deductions = _build_other_deductions_b(deductions, payments_order)

    pay_date, period_start, period_end = _parse_dates(text)

    return ExtractedPayslip(
        pay_date=pay_date,
        pay_period_start=period_start,
        pay_period_end=period_end,
        employer=EMPLOYER,
        currency="GBP",
        gross_pay=gross_pay,
        income_tax=income_tax,
        national_insurance=nic,
        pension_employee=zero,
        pension_employer=zero,
        student_loan=student_loan,
        rsu_vest=rsu_vest,
        rsu_offset=zero,
        salary=salary,
        bonus=bonus,
        pension_sacrifice=pension_sacrifice,
        taxable_pay=summary.get("taxable_pay"),
        ytd_tax_paid=summary.get("ytd_tax_paid"),
        ytd_taxable_pay=summary.get("ytd_taxable_pay"),
        ytd_gross=summary.get("ytd_gross"),
        other_deductions=other_deductions,
        net_pay=net_pay,
    )


def _find_variant_b_header(lines: list[str]) -> tuple[int, int, int]:
    for i, line in enumerate(lines):
        if "Payments" in line and "Deductions" in line and "Year to Date" in line:
            return i, line.index("Deductions"), line.index("Year to Date")
    raise ParserError("variant B header not found")


def _collect_b_rows(
    lines: list[str],
    header_idx: int,
    d_col: int,
    y_col: int,
) -> tuple[dict[str, Decimal], list[tuple[str, Decimal]], dict[str, Decimal]]:
    """Read the Payments and Deductions cells between the header and the totals row.

    Args:
        lines: All payslip lines.
        header_idx: Index of the header line; scanning starts on the next line.
        d_col: Character offset where the Deductions column starts.
        y_col: Character offset where the Year to Date column starts.

    Returns:
        ``(payments, order, deductions)``. ``payments`` and ``deductions``
        map label to amount, so a label that occurs on several rows keeps
        only its last amount. ``order`` lists every payment row as
        ``(label, amount)`` in the order encountered.
    """
    payments: dict[str, Decimal] = {}
    order: list[tuple[str, Decimal]] = []
    deductions: dict[str, Decimal] = {}

    for raw in lines[header_idx + 1:]:
        row = raw.rstrip()
        if "Total Payment" in row:
            break  # the totals row closes the table
        if not row.strip():
            continue

        # Slicing already copes with rows shorter than the column offsets:
        # a short row is entirely a payments cell with an empty deductions cell.
        pay_label, pay_amount = _last_amount(row[:d_col])
        if pay_label and pay_amount is not None:
            payments[pay_label] = pay_amount
            order.append((pay_label, pay_amount))

        ded_label, ded_amount = _last_amount(row[d_col:y_col])
        if ded_label and ded_amount is not None:
            deductions[ded_label] = ded_amount

    return payments, order, deductions


def _parse_b_totals_row(
    lines: list[str],
    header_idx: int,
    d_col: int,
    y_col: int,
) -> tuple[Decimal, Decimal]:
    """Read gross pay and net pay from the first "Total Payment" row after the header.

    Gross pay is the rightmost amount left of the Deductions column. Net pay
    is the rightmost amount from the Year to Date column onwards, and is only
    accepted when that part of the row contains "Net Pay".

    Returns:
        ``(gross_pay, net_pay)``.

    Raises:
        ParserError: if no totals row exists, or either amount is missing.
    """
    totals = next(
        (candidate for candidate in lines[header_idx + 1:] if "Total Payment" in candidate),
        None,
    )
    if totals is None:
        raise ParserError("totals row not found")

    gross_pay = _last_amount(totals[:d_col])[1]
    right_part = totals[y_col:]
    net_pay = _last_amount(right_part)[1] if "Net Pay" in right_part else None

    if gross_pay is None:
        raise ParserError("Total Payment amount missing")
    if net_pay is None:
        raise ParserError("Net Pay amount missing from totals row")
    return gross_pay, net_pay


def _parse_summary_block(lines: list[str]) -> dict[str, Decimal]:
    """Pull Taxable Pay (period + YTD), Tax Paid (YTD) and Total Gross (YTD).

    A summary line is recognised by its leading label. The first amount on
    the line is read as the period value and the second as the year-to-date
    value. Only "Taxable Pay:" contributes its period value; the other two
    labels contribute the year-to-date value only, and so need two amounts.

    Returns:
        A dict with any of the keys ``taxable_pay``, ``ytd_taxable_pay``,
        ``ytd_gross`` and ``ytd_tax_paid``; a key is absent when its value
        was not found.
    """
    # label prefix -> (key for 1st amount, key for 2nd amount); None = ignore
    wanted: dict[str, tuple[str | None, str | None]] = {
        "Taxable Pay:": ("taxable_pay", "ytd_taxable_pay"),
        "Total Gross:": (None, "ytd_gross"),
        "Tax Paid:": (None, "ytd_tax_paid"),
    }
    found: dict[str, Decimal] = {}
    for line in lines:
        body = line.lstrip()
        for prefix, keys in wanted.items():
            if not body.startswith(prefix):
                continue
            # zip stops at the shorter side, so a missing amount leaves its
            # key unset and amounts beyond the second are ignored.
            for key, raw in zip(keys, AMOUNT_RE.findall(line)):
                if key is not None:
                    found[key] = _to_decimal(raw)
            break
    return found


# Payments-column labels that _parse_variant_b looks up by name.
PAYMENTS_KNOWN = {
    # regular pay
    "Salary", "Bonus", "Perform Bonus",
    # pension line
    "AE Pension EE",
    # RSU lines
    "RSU Tax Offset", "RSU Excs Refund",
}
# Deductions-column labels that map to dedicated payslip fields; anything not
# listed here is reported by _build_other_deductions_b as an "other" deduction.
DEDUCTIONS_KNOWN = {
    # income tax
    "Tax", "Tax paid",
    # national insurance
    "National Insurance", "Employee NIC",
    # student loan
    "Student Loan", "Student Loans",
}


def _build_other_deductions_b(
    deductions: dict[str, Decimal],
    payments_order: list[tuple[str, Decimal]],
) -> dict[str, Decimal]:
    """Return the deductions whose labels are not in ``DEDUCTIONS_KNOWN``.

    ``payments_order`` is accepted but deliberately unused: negative payments
    (Cycle To Work, Share Save, AE Pension EE) are already subtracted from
    Total Payment, so adding them here would double-count in the validation
    formula. They remain visible in raw_extraction for historical reference.
    """
    del payments_order
    extras: dict[str, Decimal] = {}
    for label, amount in deductions.items():
        if label in DEDUCTIONS_KNOWN:
            continue
        extras[label] = amount
    return extras


def _parse_variant_a(text: str, lines: list[str]) -> ExtractedPayslip:
    """Build an ``ExtractedPayslip`` from the single-column (variant A) layout.

    Args:
        text: Full payslip text; used for the date fields.
        lines: ``text`` split into lines; used for the item table and totals.

    Raises:
        ParserError: propagated from the helpers when the header, Gross Pay,
            Net Pay or Pay Date cannot be found.
    """
    zero = Decimal("0")

    header_idx = _find_variant_a_header(lines)
    items = _collect_a_rows(lines, header_idx)
    gross_pay, net_pay = _parse_a_gross_net(lines)

    # "Student Loans" takes precedence over "Student Loan" when both exist.
    if "Student Loans" in items:
        student_loan = items["Student Loans"]
    else:
        student_loan = items.get("Student Loan", zero)

    # Every collected row without a dedicated field is reported as "other".
    dedicated = {
        "Salary",
        "Bonus",
        "Taxable Pay",
        "Tax",
        "National Insurance",
        "Student Loans",
        "Student Loan",
        "AE Pension EE",
    }
    other_deductions: dict[str, Decimal] = {}
    for label, amount in items.items():
        if label not in dedicated:
            other_deductions[label] = amount

    pay_date, period_start, period_end = _parse_dates(text)

    return ExtractedPayslip(
        pay_date=pay_date,
        pay_period_start=period_start,
        pay_period_end=period_end,
        employer=EMPLOYER,
        currency="GBP",
        gross_pay=gross_pay,
        income_tax=items.get("Tax", zero),
        national_insurance=items.get("National Insurance", zero),
        pension_employee=items.get("AE Pension EE", zero),
        pension_employer=zero,
        student_loan=student_loan,
        rsu_vest=zero,
        rsu_offset=zero,
        salary=items.get("Salary", zero),
        bonus=items.get("Bonus", zero),
        pension_sacrifice=zero,
        taxable_pay=items.get("Taxable Pay"),
        ytd_tax_paid=None,
        ytd_taxable_pay=None,
        ytd_gross=None,
        other_deductions=other_deductions,
        net_pay=net_pay,
    )


def _find_variant_a_header(lines: list[str]) -> int:
    for i, line in enumerate(lines):
        if "Description" in line and "This Period" in line and "This Year" in line:
            return i
    raise ParserError("variant A header not found")


def _collect_a_rows(lines: list[str], header_idx: int) -> dict[str, Decimal]:
    """Collect ``label -> first amount`` for the rows below the variant A header.

    Blank lines and lines whose first non-space character is "-" are skipped.
    Scanning stops at the first remaining line that mentions "Gross Pay" or
    "Net Pay". Rows without an amount, or without text before the first
    amount, are ignored.
    """
    items: dict[str, Decimal] = {}
    for raw in lines[header_idx + 1:]:
        row = raw.rstrip()
        body = row.lstrip()
        if not body or body.startswith("-"):
            continue
        if any(marker in row for marker in ("Gross Pay", "Net Pay")):
            break
        first = AMOUNT_RE.search(row)
        if first is None:
            continue
        label = row[:first.start()].strip()
        if label:
            items[label] = _to_decimal(first.group())
    return items


def _parse_a_gross_net(lines: list[str]) -> tuple[Decimal, Decimal]:
    """Return ``(gross_pay, net_pay)`` for a variant A payslip.

    For each label, the value is the first amount on the first line that both
    mentions the label and contains an amount.

    Raises:
        ParserError: if no amount is found for "Gross Pay" or for "Net Pay".
    """

    def _first_amount_for(label: str) -> Decimal | None:
        for candidate in lines:
            if label not in candidate:
                continue
            hit = AMOUNT_RE.search(candidate)
            if hit is not None:
                return _to_decimal(hit.group())
        return None

    gross_pay = _first_amount_for("Gross Pay")
    net_pay = _first_amount_for("Net Pay")
    if gross_pay is None:
        raise ParserError("Gross Pay not found")
    if net_pay is None:
        raise ParserError("Net Pay not found")
    return gross_pay, net_pay