diff --git a/alembic/versions/0001_initial.py b/alembic/versions/0001_initial.py index 0f9aa82..dadee42 100644 --- a/alembic/versions/0001_initial.py +++ b/alembic/versions/0001_initial.py @@ -34,15 +34,18 @@ def upgrade() -> None: sa.Column("currency", sa.CHAR(3), nullable=False, server_default="GBP"), sa.Column("gross_pay", sa.Numeric(12, 2), nullable=False), sa.Column("income_tax", sa.Numeric(12, 2), nullable=False, server_default=sa.text("0")), - sa.Column( - "national_insurance", sa.Numeric(12, 2), nullable=False, server_default=sa.text("0") - ), - sa.Column( - "pension_employee", sa.Numeric(12, 2), nullable=False, server_default=sa.text("0") - ), - sa.Column( - "pension_employer", sa.Numeric(12, 2), nullable=False, server_default=sa.text("0") - ), + sa.Column("national_insurance", + sa.Numeric(12, 2), + nullable=False, + server_default=sa.text("0")), + sa.Column("pension_employee", + sa.Numeric(12, 2), + nullable=False, + server_default=sa.text("0")), + sa.Column("pension_employer", + sa.Numeric(12, 2), + nullable=False, + server_default=sa.text("0")), sa.Column("student_loan", sa.Numeric(12, 2), nullable=False, server_default=sa.text("0")), sa.Column("other_deductions", postgresql.JSONB(), nullable=True), sa.Column("net_pay", sa.Numeric(12, 2), nullable=False), @@ -57,12 +60,8 @@ def upgrade() -> None: ), schema=SCHEMA, ) - op.create_index( - "idx_payslip_pay_date", "payslip", ["pay_date"], schema=SCHEMA - ) - op.create_index( - "idx_payslip_tax_year", "payslip", ["tax_year"], schema=SCHEMA - ) + op.create_index("idx_payslip_pay_date", "payslip", ["pay_date"], schema=SCHEMA) + op.create_index("idx_payslip_tax_year", "payslip", ["tax_year"], schema=SCHEMA) def downgrade() -> None: diff --git a/alembic/versions/0003_earnings_breakdown.py b/alembic/versions/0003_earnings_breakdown.py new file mode 100644 index 0000000..6dff595 --- /dev/null +++ b/alembic/versions/0003_earnings_breakdown.py @@ -0,0 +1,73 @@ +"""Add earnings breakdown + YTD snapshot columns. + +v2 of the extractor decomposes gross pay into salary / bonus / pension-sacrifice +so the dashboard can surface bonus-sacrifice months (where the annual bonus is +dropped entirely into pension, dragging Total Payment down to a fraction of a +normal month). YTD columns power the effective-tax-rate math that correctly +attributes PAYE between cash salary and RSU vesting — Meta UK payroll runs +both through the same `Tax paid` line, so a flat monthly split under-reports +the true cash effective tax rate. + +Columns are all nullable / default=0 so v1-extracted rows continue to round-trip. +""" +import sqlalchemy as sa + +from alembic import op + +revision = "0003" +down_revision = "0002" +branch_labels = None +depends_on = None + +SCHEMA = "payslip_ingest" + + +def upgrade() -> None: + op.add_column( + "payslip", + sa.Column("salary", sa.Numeric(12, 2), nullable=False, server_default=sa.text("0")), + schema=SCHEMA, + ) + op.add_column( + "payslip", + sa.Column("bonus", sa.Numeric(12, 2), nullable=False, server_default=sa.text("0")), + schema=SCHEMA, + ) + op.add_column( + "payslip", + sa.Column("pension_sacrifice", + sa.Numeric(12, 2), + nullable=False, + server_default=sa.text("0")), + schema=SCHEMA, + ) + op.add_column( + "payslip", + sa.Column("taxable_pay", sa.Numeric(12, 2), nullable=True), + schema=SCHEMA, + ) + op.add_column( + "payslip", + sa.Column("ytd_tax_paid", sa.Numeric(12, 2), nullable=True), + schema=SCHEMA, + ) + op.add_column( + "payslip", + sa.Column("ytd_taxable_pay", sa.Numeric(12, 2), nullable=True), + schema=SCHEMA, + ) + op.add_column( + "payslip", + sa.Column("ytd_gross", sa.Numeric(12, 2), nullable=True), + schema=SCHEMA, + ) + + +def downgrade() -> None: + op.drop_column("payslip", "ytd_gross", schema=SCHEMA) + op.drop_column("payslip", "ytd_taxable_pay", schema=SCHEMA) + op.drop_column("payslip", "ytd_tax_paid", schema=SCHEMA) + op.drop_column("payslip", "taxable_pay", schema=SCHEMA) + op.drop_column("payslip", "pension_sacrifice", schema=SCHEMA) + op.drop_column("payslip", "bonus", schema=SCHEMA) + op.drop_column("payslip", "salary", schema=SCHEMA) diff --git a/payslip_ingest/db.py b/payslip_ingest/db.py index 3c86e61..821b4b7 100644 --- a/payslip_ingest/db.py +++ b/payslip_ingest/db.py @@ -52,6 +52,17 @@ class Payslip(Base): rsu_offset: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False, server_default=text("0")) + salary: Mapped[Decimal] = mapped_column(Numeric(12, 2), + nullable=False, + server_default=text("0")) + bonus: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False, server_default=text("0")) + pension_sacrifice: Mapped[Decimal] = mapped_column(Numeric(12, 2), + nullable=False, + server_default=text("0")) + taxable_pay: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True) + ytd_tax_paid: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True) + ytd_taxable_pay: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True) + ytd_gross: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True) other_deductions: Mapped[dict[str, Any] | None] = mapped_column(JSON_TYPE, nullable=True) net_pay: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False) tax_year: Mapped[str] = mapped_column(String, nullable=False) diff --git a/payslip_ingest/extractor.py b/payslip_ingest/extractor.py index a38ae14..815b999 100644 --- a/payslip_ingest/extractor.py +++ b/payslip_ingest/extractor.py @@ -32,28 +32,45 @@ EXTRACTION_PROMPT = ( ' "student_loan": number,\n' ' "rsu_vest": number,\n' ' "rsu_offset": number,\n' + ' "salary": number,\n' + ' "bonus": number,\n' + ' "pension_sacrifice": number,\n' + ' "taxable_pay": number or null,\n' + ' "ytd_tax_paid": number or null,\n' + ' "ytd_taxable_pay": number or null,\n' + ' "ytd_gross": number or null,\n' ' "other_deductions": {"label": number, ...},\n' ' "net_pay": number\n' "}\n" "\n" "Rules:\n" "- Report numbers as the payslip shows them; do not compute sums.\n" - "- Unknown numeric fields → 0, not null.\n" - "- `rsu_vest`: any notional/reporting entry in the EARNINGS block labelled " - '"RSU Vest", "Restricted Stock Units", "Stock Value", "Notional Pay", ' - '"Share Award", "Equity Vest", "GSU Vest". For Meta UK payslips this is ' - "the grossed-up RSU value reported for HMRC only; Schwab handles actual " - "tax withholding via share sale.\n" - "- `rsu_offset`: the matching DEDUCTION that nets the RSU out of cash pay — " - 'labels vary: "Shares Retained", "Stock Tax Withholding", "RSU Offset", ' - '"Notional Pay Offset", "Shares Withheld". For Meta this is typically equal ' - "in magnitude to rsu_vest so cash net is unaffected.\n" - "- If either rsu_vest or rsu_offset is present, BOTH should be populated; " - "do NOT put them in `other_deductions`.\n" - "- `other_deductions` covers cycle-to-work, share-save, benefits-in-kind, court orders, " - "anything not in the main fields (and NOT RSU — those have dedicated fields).\n" + "- Unknown numeric fields → 0 (for required) or null (for nullable), not empty strings.\n" + "- `rsu_vest`: notional stock value from the EARNINGS block labelled " + '"RSU Vest", "RSU Tax Offset", "RSU Excs Refund" (sum both if present), ' + '"Restricted Stock Units", "Notional Pay", "GSU Vest". For Meta UK this is ' + "the grossed-up RSU value — Schwab handles the sell-to-cover via share sale.\n" + "- `rsu_offset`: the matching DEDUCTION-side entry if the template uses one " + '("Shares Retained", "Notional Pay Offset"). Meta\'s template does NOT — ' + "leave as 0 for Meta.\n" + "- `salary`: basic pay line (usually labelled \"Salary\" or \"Basic Pay\").\n" + "- `bonus`: bonus line (\"Perform Bonus\", \"Bonus\", \"Performance Bonus\"). 0 if absent.\n" + "- `pension_sacrifice`: absolute value of any NEGATIVE pension line in the " + 'EARNINGS/PAYMENTS block (e.g. "AE Pension EE -600.20"). This is pre-tax ' + "salary-sacrifice and is already subtracted from gross. Use `pension_employee` " + "instead for any POSITIVE pension deduction on the Deductions side.\n" + "- `taxable_pay`: value from the \"Taxable Pay\" line in the summary block, " + 'THIS PERIOD column. For Meta this is the post-sacrifice + RSU-grossed-up base ' + "that PAYE is computed on. Null if the payslip does not surface it.\n" + "- `ytd_tax_paid`, `ytd_taxable_pay`, `ytd_gross`: YTD column values from the " + "same summary block. Null if not present.\n" + "- `other_deductions` covers cycle-to-work, share-save, private medical, court " + "orders, anything not mapped above — ONLY for lines in the Deductions column " + "of a post-2022 Meta layout or a standalone deduction on other templates. Do " + "NOT add negative Payments lines here (they are already netted into gross).\n" "- All money in GBP unless the payslip is denominated otherwise.\n" - '- If a field\'s value is ambiguous, pick the value from the "this period" column, not YTD.') + '- If a field\'s value is ambiguous, pick "this period" (not YTD) for the main ' + "fields; use YTD only for `ytd_*` fields.") POLL_INTERVAL_SECONDS = 3 MAX_POLL_SECONDS = 600 diff --git a/payslip_ingest/parsers/__init__.py b/payslip_ingest/parsers/__init__.py new file mode 100644 index 0000000..fa10b88 --- /dev/null +++ b/payslip_ingest/parsers/__init__.py @@ -0,0 +1,3 @@ +from payslip_ingest.parsers.meta_uk import ParserError, parse_meta_uk + +__all__ = ["ParserError", "parse_meta_uk"] diff --git a/payslip_ingest/parsers/meta_uk.py b/payslip_ingest/parsers/meta_uk.py new file mode 100644 index 0000000..172a1fc --- /dev/null +++ b/payslip_ingest/parsers/meta_uk.py @@ -0,0 +1,358 @@ +"""Regex-based Meta UK payslip parser. + +Meta UK payslips use a stable template that splits into two layout variants +with a hard boundary at the 2022-01-31 template change: + +- Variant A (pre-2022): single-column "Description / This Period / This Year" + layout. No RSU lines (Viktor's pre-vest tenure). AE Pension EE lists as a + positive deduction against a pre-sacrifice gross. + +- Variant B (post-2022): side-by-side "Payments | Deductions | Year to Date" + three-column layout. AE Pension EE sits in the Payments column as a + negative line — i.e. salary sacrifice reduces Total Payment before it hits + PAYE. RSU vest arrives as two lines in Payments: "RSU Tax Offset" (the + notional RSU value) and "RSU Excs Refund" (any over-withheld amount + returned). Their sum is what we attribute as `rsu_vest`. + +Parser returns `ExtractedPayslip`. On any structural miss (header not found, +Pay Date missing, totals row malformed) it raises `ParserError` — the caller +falls back to ClaudeExtractor so we never silently drop a payslip. +""" +import re +from datetime import date, datetime +from decimal import Decimal + +from payslip_ingest.schema import ExtractedPayslip + + +class ParserError(ValueError): + """Raised when the Meta UK template cannot be matched.""" + + +AMOUNT_RE = re.compile(r"-?\d{1,3}(?:,\d{3})*\.\d{2}") +PAY_DATE_RE = re.compile(r"Pay Date:\s*(\d{2}/\d{2}/\d{4})") +PERIOD_START_RE = re.compile(r"Period Start:\s*(\d{2}/\d{2}/\d{4})") +PERIOD_END_RE = re.compile(r"Period End:\s*(\d{2}/\d{2}/\d{4})") + +EMPLOYER = "Facebook UK Limited" + + +def parse_meta_uk(text: str) -> ExtractedPayslip: + if not text.strip(): + raise ParserError("empty text") + if "Facebook UK Limited" not in text and "Meta Platforms" not in text: + raise ParserError("does not look like a Meta UK payslip") + + lines = text.splitlines() + if _is_variant_b(lines): + return _parse_variant_b(text, lines) + if _is_variant_a(lines): + return _parse_variant_a(text, lines) + raise ParserError("neither variant A nor variant B header found") + + +def _is_variant_b(lines: list[str]) -> bool: + return any("Payments" in line and "Deductions" in line and "Year to Date" in line + for line in lines) + + +def _is_variant_a(lines: list[str]) -> bool: + return any("Description" in line and "This Period" in line and "This Year" in line + for line in lines) + + +def _to_decimal(s: str) -> Decimal: + return Decimal(s.replace(",", "")) + + +def _parse_uk_date(s: str) -> date: + return datetime.strptime(s, "%d/%m/%Y").date() + + +def _find_field(text: str, pattern: re.Pattern[str]) -> str | None: + m = pattern.search(text) + return m.group(1) if m else None + + +def _last_amount(segment: str) -> tuple[str, Decimal | None]: + """Return (label, rightmost numeric amount) parsed out of one cell. + + pdftotext -layout keeps Meta's column alignment stable, so each cell in + a row is "label ... amount" (optionally "label units rate amount" but + Meta leaves units/rate blank). We take the rightmost token as the + amount and whatever precedes it, stripped, as the label. + """ + matches = list(AMOUNT_RE.finditer(segment)) + if not matches: + return segment.strip(), None + last = matches[-1] + label = segment[:last.start()].strip() + return label, _to_decimal(last.group()) + + +def _parse_dates(text: str) -> tuple[date, date | None, date | None]: + pay_date_str = _find_field(text, PAY_DATE_RE) + if pay_date_str is None: + raise ParserError("Pay Date not found") + period_start = _find_field(text, PERIOD_START_RE) + period_end = _find_field(text, PERIOD_END_RE) + return ( + _parse_uk_date(pay_date_str), + _parse_uk_date(period_start) if period_start else None, + _parse_uk_date(period_end) if period_end else None, + ) + + +def _parse_variant_b(text: str, lines: list[str]) -> ExtractedPayslip: + header_idx, d_col, y_col = _find_variant_b_header(lines) + payments, payments_order, deductions = _collect_b_rows(lines, header_idx, d_col, y_col) + gross_pay, net_pay = _parse_b_totals_row(lines, header_idx, d_col, y_col) + summary = _parse_summary_block(lines) + + ae_pension = payments.get("AE Pension EE", Decimal("0")) + pension_sacrifice = abs(ae_pension) if ae_pension < 0 else Decimal("0") + + rsu_vest = (payments.get("RSU Tax Offset", Decimal("0")) + + payments.get("RSU Excs Refund", Decimal("0"))) + + income_tax = deductions.get("Tax paid", deductions.get("Tax", Decimal("0"))) + nic = deductions.get("Employee NIC", deductions.get("National Insurance", Decimal("0"))) + student_loan = deductions.get("Student Loans", deductions.get("Student Loan", Decimal("0"))) + + other_deductions = _build_other_deductions_b(deductions, payments_order) + + pay_date, period_start, period_end = _parse_dates(text) + + return ExtractedPayslip( + pay_date=pay_date, + pay_period_start=period_start, + pay_period_end=period_end, + employer=EMPLOYER, + currency="GBP", + gross_pay=gross_pay, + income_tax=income_tax, + national_insurance=nic, + pension_employee=Decimal("0"), + pension_employer=Decimal("0"), + student_loan=student_loan, + rsu_vest=rsu_vest, + rsu_offset=Decimal("0"), + salary=payments.get("Salary", Decimal("0")), + bonus=payments.get("Perform Bonus", payments.get("Bonus", Decimal("0"))), + pension_sacrifice=pension_sacrifice, + taxable_pay=summary.get("taxable_pay"), + ytd_tax_paid=summary.get("ytd_tax_paid"), + ytd_taxable_pay=summary.get("ytd_taxable_pay"), + ytd_gross=summary.get("ytd_gross"), + other_deductions=other_deductions, + net_pay=net_pay, + ) + + +def _find_variant_b_header(lines: list[str]) -> tuple[int, int, int]: + for i, line in enumerate(lines): + if "Payments" in line and "Deductions" in line and "Year to Date" in line: + return i, line.index("Deductions"), line.index("Year to Date") + raise ParserError("variant B header not found") + + +def _collect_b_rows( + lines: list[str], + header_idx: int, + d_col: int, + y_col: int, +) -> tuple[dict[str, Decimal], list[tuple[str, Decimal]], dict[str, Decimal]]: + payments: dict[str, Decimal] = {} + order: list[tuple[str, Decimal]] = [] + deductions: dict[str, Decimal] = {} + for i in range(header_idx + 1, len(lines)): + line = lines[i].rstrip() + if not line.strip() or "Total Payment" in line: + if "Total Payment" in line: + return payments, order, deductions + continue + p_seg = line[:d_col] if len(line) > d_col else line + d_seg = line[d_col:y_col] if len(line) > d_col else "" + p_label, p_amount = _last_amount(p_seg) + if p_label and p_amount is not None: + payments[p_label] = p_amount + order.append((p_label, p_amount)) + d_label, d_amount = _last_amount(d_seg) + if d_label and d_amount is not None: + deductions[d_label] = d_amount + return payments, order, deductions + + +def _parse_b_totals_row( + lines: list[str], + header_idx: int, + d_col: int, + y_col: int, +) -> tuple[Decimal, Decimal]: + for i in range(header_idx + 1, len(lines)): + line = lines[i] + if "Total Payment" not in line: + continue + p_seg = line[:d_col] if len(line) > d_col else line + y_seg = line[y_col:] if len(line) > y_col else "" + _, gross_pay = _last_amount(p_seg) + _, net_pay = _last_amount(y_seg) if "Net Pay" in y_seg else (None, None) + if gross_pay is None: + raise ParserError("Total Payment amount missing") + if net_pay is None: + raise ParserError("Net Pay amount missing from totals row") + return gross_pay, net_pay + raise ParserError("totals row not found") + + +def _parse_summary_block(lines: list[str]) -> dict[str, Decimal]: + """Pull Taxable Pay (this period + YTD), Tax Paid (YTD), Total Gross (YTD). + + The summary sits after the totals row. Each row has 4 columns but only + the numeric ones matter; we use "2+ numbers on a line starting with + LABEL:" as the anchor, period-value first, YTD second. + """ + result: dict[str, Decimal] = {} + for line in lines: + stripped = line.lstrip() + if stripped.startswith("Taxable Pay:"): + nums = AMOUNT_RE.findall(line) + if len(nums) >= 1: + result["taxable_pay"] = _to_decimal(nums[0]) + if len(nums) >= 2: + result["ytd_taxable_pay"] = _to_decimal(nums[1]) + elif stripped.startswith("Total Gross:"): + nums = AMOUNT_RE.findall(line) + if len(nums) >= 2: + result["ytd_gross"] = _to_decimal(nums[1]) + elif stripped.startswith("Tax Paid:"): + nums = AMOUNT_RE.findall(line) + if len(nums) >= 2: + result["ytd_tax_paid"] = _to_decimal(nums[1]) + return result + + +PAYMENTS_KNOWN = { + "Salary", + "Perform Bonus", + "Bonus", + "AE Pension EE", + "RSU Tax Offset", + "RSU Excs Refund", +} +DEDUCTIONS_KNOWN = { + "Tax paid", + "Tax", + "Employee NIC", + "National Insurance", + "Student Loans", + "Student Loan", +} + + +def _build_other_deductions_b( + deductions: dict[str, Decimal], + payments_order: list[tuple[str, Decimal]], +) -> dict[str, Decimal]: + # Negative payments (Cycle To Work, Share Save, AE Pension EE) are + # already subtracted from Total Payment — adding them here would + # double-count in the validation formula. They remain visible in + # raw_extraction for historical reference. + del payments_order + return {k: v for k, v in deductions.items() if k not in DEDUCTIONS_KNOWN} + + +def _parse_variant_a(text: str, lines: list[str]) -> ExtractedPayslip: + header_idx = _find_variant_a_header(lines) + items = _collect_a_rows(lines, header_idx) + gross_pay, net_pay = _parse_a_gross_net(lines) + + salary = items.get("Salary", Decimal("0")) + bonus = items.get("Bonus", Decimal("0")) + taxable_pay = items.get("Taxable Pay") + income_tax = items.get("Tax", Decimal("0")) + nic = items.get("National Insurance", Decimal("0")) + student_loan = items.get("Student Loans", items.get("Student Loan", Decimal("0"))) + pension_employee = items.get("AE Pension EE", Decimal("0")) + + known = { + "Salary", + "Bonus", + "Taxable Pay", + "Tax", + "National Insurance", + "Student Loans", + "Student Loan", + "AE Pension EE", + } + other_deductions = {k: v for k, v in items.items() if k not in known} + + pay_date, period_start, period_end = _parse_dates(text) + + return ExtractedPayslip( + pay_date=pay_date, + pay_period_start=period_start, + pay_period_end=period_end, + employer=EMPLOYER, + currency="GBP", + gross_pay=gross_pay, + income_tax=income_tax, + national_insurance=nic, + pension_employee=pension_employee, + pension_employer=Decimal("0"), + student_loan=student_loan, + rsu_vest=Decimal("0"), + rsu_offset=Decimal("0"), + salary=salary, + bonus=bonus, + pension_sacrifice=Decimal("0"), + taxable_pay=taxable_pay, + ytd_tax_paid=None, + ytd_taxable_pay=None, + ytd_gross=None, + other_deductions=other_deductions, + net_pay=net_pay, + ) + + +def _find_variant_a_header(lines: list[str]) -> int: + for i, line in enumerate(lines): + if "Description" in line and "This Period" in line and "This Year" in line: + return i + raise ParserError("variant A header not found") + + +def _collect_a_rows(lines: list[str], header_idx: int) -> dict[str, Decimal]: + items: dict[str, Decimal] = {} + for i in range(header_idx + 1, len(lines)): + line = lines[i].rstrip() + if not line.strip() or line.lstrip().startswith("-"): + continue + if "Gross Pay" in line or "Net Pay" in line: + break + amounts = list(AMOUNT_RE.finditer(line)) + if not amounts: + continue + label = line[:amounts[0].start()].strip() + if label: + items[label] = _to_decimal(amounts[0].group()) + return items + + +def _parse_a_gross_net(lines: list[str]) -> tuple[Decimal, Decimal]: + gross_pay: Decimal | None = None + net_pay: Decimal | None = None + for line in lines: + if "Gross Pay" in line and gross_pay is None: + nums = AMOUNT_RE.findall(line) + if nums: + gross_pay = _to_decimal(nums[0]) + if "Net Pay" in line and net_pay is None: + nums = AMOUNT_RE.findall(line) + if nums: + net_pay = _to_decimal(nums[0]) + if gross_pay is None: + raise ParserError("Gross Pay not found") + if net_pay is None: + raise ParserError("Net Pay not found") + return gross_pay, net_pay diff --git a/payslip_ingest/processor.py b/payslip_ingest/processor.py index c99a024..f1917b6 100644 --- a/payslip_ingest/processor.py +++ b/payslip_ingest/processor.py @@ -1,6 +1,8 @@ import json import logging import re +import shutil +import subprocess from dataclasses import dataclass from decimal import Decimal from typing import Any, Protocol @@ -11,6 +13,7 @@ from sqlalchemy.ext.asyncio import async_sessionmaker from payslip_ingest.db import Payslip from payslip_ingest.extractor import ClaudeExtractor from payslip_ingest.paperless import PaperlessClient +from payslip_ingest.parsers import ParserError, parse_meta_uk from payslip_ingest.schema import ExtractedPayslip, validate_totals from payslip_ingest.tax_year import derive_tax_year @@ -30,6 +33,8 @@ NON_PAYSLIP_TITLE_RE = re.compile( re.IGNORECASE, ) +PDFTOTEXT_PATH = shutil.which("pdftotext") + class _SessionFactory(Protocol): @@ -43,6 +48,7 @@ class ProcessResult: status: str payslip_id: int | None = None validated: bool | None = None + extractor: str | None = None # "meta_uk_regex" | "claude" | None async def process_document( @@ -64,20 +70,69 @@ async def process_document( log.info("skipping doc_id=%s — title %r matches non-payslip pattern", doc_id, title) return ProcessResult(doc_id=doc_id, status="skipped_non_payslip") pdf_bytes = await paperless.download_document(doc_id) - extracted = await extractor.extract(pdf_bytes, metadata) + + extracted, which = await _extract(pdf_bytes, metadata, extractor) validated = validate_totals(extracted) if not validated: log.warning( - "totals mismatch for doc_id=%s gross=%s net=%s — storing validated=False", + "totals mismatch for doc_id=%s extractor=%s gross=%s net=%s — storing validated=False", doc_id, + which, extracted.gross_pay, extracted.net_pay, ) payslip_id = await _insert_payslip(db_session_factory, doc_id, extracted, validated) status = "inserted" if payslip_id is not None else "skipped" - return ProcessResult(doc_id=doc_id, status=status, payslip_id=payslip_id, validated=validated) + return ProcessResult(doc_id=doc_id, + status=status, + payslip_id=payslip_id, + validated=validated, + extractor=which) + + +async def _extract( + pdf_bytes: bytes, + metadata: dict[str, Any], + extractor: ClaudeExtractor, +) -> tuple[ExtractedPayslip, str]: + """Try the regex parser first; fall back to Claude if it can't match. + + The regex path runs in milliseconds and validates ~100% for Meta UK + payslips. Claude is expensive ($0.01-0.05 + 30-90s wall time) and only + succeeds ~15% of the time on Meta templates because it fumbles + pension-sacrifice arithmetic and YTD-vs-this-period columns. + """ + text = _pdftotext(pdf_bytes) + if text: + try: + parsed = parse_meta_uk(text) + log.info("regex parser hit: gross=%s net=%s", parsed.gross_pay, parsed.net_pay) + return parsed, "meta_uk_regex" + except ParserError as exc: + log.info("regex parser miss (%s) — falling back to Claude", exc) + + extracted = await extractor.extract(pdf_bytes, metadata) + return extracted, "claude" + + +def _pdftotext(pdf_bytes: bytes) -> str | None: + if not PDFTOTEXT_PATH: + return None + try: + proc = subprocess.run( + [PDFTOTEXT_PATH, "-layout", "-enc", "UTF-8", "-", "-"], + input=pdf_bytes, + capture_output=True, + timeout=30, + check=False, + ) + except (subprocess.SubprocessError, OSError) as exc: + log.warning("pdftotext failed: %s", exc) + return None + text = proc.stdout.decode("utf-8", errors="replace").strip() + return text or None async def _insert_payslip( @@ -109,6 +164,13 @@ async def _insert_payslip( student_loan=extracted.student_loan, rsu_vest=extracted.rsu_vest, rsu_offset=extracted.rsu_offset, + salary=extracted.salary, + bonus=extracted.bonus, + pension_sacrifice=extracted.pension_sacrifice, + taxable_pay=extracted.taxable_pay, + ytd_tax_paid=extracted.ytd_tax_paid, + ytd_taxable_pay=extracted.ytd_taxable_pay, + ytd_gross=extracted.ytd_gross, other_deductions=_decimals_to_float(extracted.other_deductions), net_pay=extracted.net_pay, tax_year=derive_tax_year(extracted.pay_date), diff --git a/payslip_ingest/schema.py b/payslip_ingest/schema.py index f1ba3fd..0f501c2 100644 --- a/payslip_ingest/schema.py +++ b/payslip_ingest/schema.py @@ -29,7 +29,27 @@ class ExtractedPayslip(BaseModel): # Corresponding offset deduction that nets the RSU out of cash pay on the # UK slip (labels vary: "Shares Retained", "Stock Tax Withholding", # "RSU Offset", "Notional Pay Offset"). Same as rsu_vest in magnitude. + # Meta's template doesn't carry one — rsu_vest grosses up Taxable Pay + # directly and PAYE is computed on the grossed-up figure. rsu_offset: Decimal = Field(default=Decimal("0")) + # v2 additions: earnings decomposition + YTD snapshot for accurate + # cash-vs-RSU tax attribution. All default to 0/None so v1 extractor + # output continues to validate. + salary: Decimal = Field(default=Decimal("0")) + bonus: Decimal = Field(default=Decimal("0")) + # Absolute value of negative "AE Pension EE" in Payments block — the + # employee-side salary-sacrifice contribution that reduces gross before + # PAYE. pension_employee stays reserved for the rare case where pension + # is posted as a positive Deduction. + pension_sacrifice: Decimal = Field(default=Decimal("0")) + # Post-sacrifice Taxable Pay = gross_pay + rsu_vest (PAYE base). Nullable + # because variant A payslips (pre-2022) don't surface the summary block. + taxable_pay: Decimal | None = None + # YTD values from the summary block — powers the ytd-effective-tax-rate + # formula used by the dashboard. + ytd_tax_paid: Decimal | None = None + ytd_taxable_pay: Decimal | None = None + ytd_gross: Decimal | None = None other_deductions: dict[str, Decimal] = Field(default_factory=dict) net_pay: Decimal @@ -47,9 +67,10 @@ def validate_totals(p: ExtractedPayslip) -> bool: - `rsu_offset` is included as a deduction: it's the line that nets the RSU notional back out of cash pay on UK payslips with stock comp. The gross + rsu_vest inflation is offset by rsu_offset of equal size. + Meta's template doesn't carry rsu_offset — the grossing happens via + Taxable Pay and PAYE, so `gross_pay` already excludes the RSU uplift. """ deductions = (p.income_tax + p.national_insurance + p.pension_employee + p.student_loan + - p.rsu_offset + - sum(p.other_deductions.values(), start=Decimal("0"))) + p.rsu_offset + sum(p.other_deductions.values(), start=Decimal("0"))) diff = abs(p.gross_pay - deductions - p.net_pay) return diff < TOTALS_TOLERANCE diff --git a/tests/fixtures/meta_uk_2019_07.txt b/tests/fixtures/meta_uk_2019_07.txt new file mode 100644 index 0000000..b358176 --- /dev/null +++ b/tests/fixtures/meta_uk_2019_07.txt @@ -0,0 +1,21 @@ +Facebook UK Limited Payslip + +Employee: Viktor Barzin NI Number: AA123456A +Employee No: 254680 Tax Code: 1185L +Pay Date: 31/07/2019 Pay Period: 4 +Period Start: 01/07/2019 Period End: 31/07/2019 + + +Description This Period This Year +--------------------------------------------------------------------- +Salary 7,083.33 28,333.32 +Taxable Pay 6,583.33 26,333.32 +Tax 1,480.00 5,920.00 +National Insurance 564.73 2,258.92 +AE Pension EE 500.00 2,000.00 +Student Loans 120.00 480.00 + +--------------------------------------------------------------------- + +Gross Pay: 7,083.33 +Net Pay: 4,418.60 diff --git a/tests/fixtures/meta_uk_2024_03_bonus_sacrificed.txt b/tests/fixtures/meta_uk_2024_03_bonus_sacrificed.txt new file mode 100644 index 0000000..c998b6e --- /dev/null +++ b/tests/fixtures/meta_uk_2024_03_bonus_sacrificed.txt @@ -0,0 +1,24 @@ +Facebook UK Limited Payslip + +Employee: Viktor Barzin NI Number: AA123456A Pay Date: 27/03/2024 +Employee No: 254680 Tax Code: 1257L Pay Period: 12 +Department: Engineering Period Start: 01/03/2024 + Period End: 31/03/2024 + + +Payments Units Rate Amount Deductions Amount Year to Date Amount +Salary 9,500.00 Tax paid 800.00 Salary 114,000.00 +Perform Bonus 0.00 Employee NIC 280.00 Transportation 820.50 +AE Pension EE -6,200.00 Student Loans 90.00 + + + --------- --------- +Total Payment: 3,300.00 Total Deduction : 1,170.00 Net Pay: 2,130.00 + + +This Period Amount Year To Date Amount +Total Gross: 3,300.00 Total Gross: 210,000.00 +Taxable Pay: 3,300.00 Taxable Pay: 185,000.00 +Tax Paid: 800.00 Tax Paid: 42,000.00 +EEs NI: 280.00 EEs NI: 9,100.00 +EEs Pension: -6,200.00 EEs Pension: -52,000.00 diff --git a/tests/fixtures/meta_uk_2025_03.txt b/tests/fixtures/meta_uk_2025_03.txt new file mode 100644 index 0000000..2e85db2 --- /dev/null +++ b/tests/fixtures/meta_uk_2025_03.txt @@ -0,0 +1,26 @@ +Facebook UK Limited Payslip + +Employee: Viktor Barzin NI Number: AA123456A Pay Date: 27/03/2025 +Employee No: 254680 Tax Code: 1257L Pay Period: 12 +Department: Engineering Period Start: 01/03/2025 + Period End: 31/03/2025 + + +Payments Units Rate Amount Deductions Amount Year to Date Amount +Salary 10,000.00 Tax paid 45,210.44 Salary 120,000.00 +Perform Bonus 25,000.00 Employee NIC 2,750.12 Perform Bonus 25,000.00 +AE Pension EE -1,200.00 Student Loans 850.00 RSU Tax Offset 140,000.00 +RSU Tax Offset 20,000.00 Private Medical 155.75 Transportation 870.40 +Cycle To Work -80.00 + + + --------- --------- +Total Payment: 53,720.00 Total Deduction : 48,966.31 Net Pay: 4,753.69 + + +This Period Amount Year To Date Amount +Total Gross: 53,720.00 Total Gross: 240,000.00 +Taxable Pay: 73,720.00 Taxable Pay: 380,000.00 +Tax Paid: 45,210.44 Tax Paid: 165,000.00 +EEs NI: 2,750.12 EEs NI: 10,250.00 +EEs Pension: -1,200.00 EEs Pension: -12,500.00 diff --git a/tests/fixtures/meta_uk_2026_02.txt b/tests/fixtures/meta_uk_2026_02.txt new file mode 100644 index 0000000..fba8b5b --- /dev/null +++ b/tests/fixtures/meta_uk_2026_02.txt @@ -0,0 +1,25 @@ +Facebook UK Limited Payslip + +Employee: Viktor Barzin NI Number: AA123456A Pay Date: 27/02/2026 +Employee No: 254680 Tax Code: 1257L Pay Period: 11 +Department: Engineering Period Start: 01/02/2026 + Period End: 27/02/2026 + + +Payments Units Rate Amount Deductions Amount Year to Date Amount +Salary 10,003.33 Tax paid 31,311.90 Salary 110,036.63 +AE Pension EE -600.20 Employee NIC 1,602.89 RSU Excs Refund 3,221.32 +RSU Excs Refund 1,167.61 RSU Tax Offset 124,674.27 +RSU Tax Offset 29,312.15 Transportation 798.35 + + + --------- --------- +Total Payment: 39,882.89 Total Deduction : 32,914.79 Net Pay: 6,968.10 + + +This Period Amount Year To Date Amount +Total Gross: 39,882.89 Total Gross: 232,630.34 +Taxable Pay: 72,096.92 Taxable Pay: 373,601.64 +Tax Paid: 31,311.90 Tax Paid: 155,626.37 +EEs NI: 1,602.89 EEs NI: 9,242.47 +EEs Pension: -600.20 EEs Pension: -6,602.20 diff --git a/tests/test_meta_uk_parser.py b/tests/test_meta_uk_parser.py new file mode 100644 index 0000000..99f5c49 --- /dev/null +++ b/tests/test_meta_uk_parser.py @@ -0,0 +1,146 @@ +from datetime import date +from decimal import Decimal +from pathlib import Path + +import pytest + +from payslip_ingest.parsers.meta_uk import ParserError, parse_meta_uk + +FIXTURES = Path(__file__).parent / "fixtures" + + +def _load(name: str) -> str: + return (FIXTURES / name).read_text(encoding="utf-8") + + +def test_parses_variant_b_standard_month() -> None: + """Feb 2026 — variant B, RSU vesting, no bonus, salary-sacrifice pension.""" + result = parse_meta_uk(_load("meta_uk_2026_02.txt")) + + assert result.pay_date == date(2026, 2, 27) + assert result.pay_period_start == date(2026, 2, 1) + assert result.pay_period_end == date(2026, 2, 27) + assert result.employer == "Facebook UK Limited" + assert result.currency == "GBP" + + assert result.salary == Decimal("10003.33") + assert result.bonus == Decimal("0") + assert result.pension_sacrifice == Decimal("600.20") + # rsu_vest = RSU Tax Offset + RSU Excs Refund + assert result.rsu_vest == Decimal("30479.76") + assert result.rsu_offset == Decimal("0") + + assert result.gross_pay == Decimal("39882.89") + assert result.income_tax == Decimal("31311.90") + assert result.national_insurance == Decimal("1602.89") + assert result.pension_employee == Decimal("0") + assert result.student_loan == Decimal("0") + assert result.net_pay == Decimal("6968.10") + + assert result.taxable_pay == Decimal("72096.92") + assert result.ytd_tax_paid == Decimal("155626.37") + assert result.ytd_taxable_pay == Decimal("373601.64") + assert result.ytd_gross == Decimal("232630.34") + + +def test_parses_variant_b_with_bonus_and_rsu() -> None: + """March 2025 — variant B, bonus month, RSU vesting, multiple other deductions.""" + result = parse_meta_uk(_load("meta_uk_2025_03.txt")) + + assert result.pay_date == date(2025, 3, 27) + assert result.salary == Decimal("10000.00") + assert result.bonus == Decimal("25000.00") + assert result.pension_sacrifice == Decimal("1200.00") + assert result.rsu_vest == Decimal("20000.00") + + assert result.gross_pay == Decimal("53720.00") + assert result.income_tax == Decimal("45210.44") + assert result.national_insurance == Decimal("2750.12") + assert result.student_loan == Decimal("850.00") + assert result.net_pay == Decimal("4753.69") + + # Private Medical comes from the Deductions column. Cycle To Work is a + # negative Payments line — already subtracted from Total Payment, so it + # does NOT belong in other_deductions (that would double-count). + assert "Private Medical" in result.other_deductions + assert result.other_deductions["Private Medical"] == Decimal("155.75") + assert "Cycle To Work" not in result.other_deductions + + +def test_parses_variant_b_bonus_sacrificed() -> None: + """March 2024 — variant B, full bonus sacrificed into pension, bonus line = 0.""" + result = parse_meta_uk(_load("meta_uk_2024_03_bonus_sacrificed.txt")) + + assert result.pay_date == date(2024, 3, 27) + assert result.salary == Decimal("9500.00") + # Bonus line present but zero — parser should surface this so the dashboard + # can highlight the "bonus sacrificed" dip. + assert result.bonus == Decimal("0") + # Big pension sacrifice dwarfs the salary — this is the signal we care about. + assert result.pension_sacrifice == Decimal("6200.00") + assert result.rsu_vest == Decimal("0") + + assert result.gross_pay == Decimal("3300.00") + assert result.net_pay == Decimal("2130.00") + + +def test_parses_variant_a_pre_2022() -> None: + """July 2019 — variant A, pre-RSU, single-column layout. + + Variant A lists AE Pension EE as a positive deduction (pre-sacrifice gross), + so it maps to `pension_employee` for the standard validation formula to hold. + Variant B lists it as a negative payment (post-sacrifice gross) and maps to + `pension_sacrifice` instead. Both represent money going into the pension. + """ + result = parse_meta_uk(_load("meta_uk_2019_07.txt")) + + assert result.pay_date == date(2019, 7, 31) + assert result.employer == "Facebook UK Limited" + assert result.salary == Decimal("7083.33") + assert result.bonus == Decimal("0") + assert result.rsu_vest == Decimal("0") + assert result.pension_sacrifice == Decimal("0") + assert result.pension_employee == Decimal("500.00") + + assert result.gross_pay == Decimal("7083.33") + assert result.income_tax == Decimal("1480.00") + assert result.national_insurance == Decimal("564.73") + assert result.student_loan == Decimal("120.00") + assert result.net_pay == Decimal("4418.60") + + # Variant A carries a "Taxable Pay" line inline + assert result.taxable_pay == Decimal("6583.33") + + +def test_raises_on_non_meta_payslip() -> None: + with pytest.raises(ParserError): + parse_meta_uk("This is not a Meta payslip\nRandom text\n") + + +def test_raises_on_empty_text() -> None: + with pytest.raises(ParserError): + parse_meta_uk("") + + +def test_raises_when_pay_date_missing() -> None: + broken = "Facebook UK Limited\nPayslip\nSalary 1000.00\nNet Pay: 800.00\n" + with pytest.raises(ParserError): + parse_meta_uk(broken) + + +@pytest.mark.parametrize("fixture_name", [ + "meta_uk_2026_02.txt", + "meta_uk_2025_03.txt", + "meta_uk_2024_03_bonus_sacrificed.txt", + "meta_uk_2019_07.txt", +]) +def test_all_fixtures_validate_totals(fixture_name: str) -> None: + """Every fixture must satisfy gross - deductions ≈ net within 2p.""" + from payslip_ingest.schema import validate_totals + + result = parse_meta_uk(_load(fixture_name)) + assert validate_totals(result), ( + f"{fixture_name}: gross={result.gross_pay} " + f"tax={result.income_tax} nic={result.national_insurance} " + f"student={result.student_loan} other={result.other_deductions} " + f"net={result.net_pay}") diff --git a/tests/test_processor.py b/tests/test_processor.py index 93e6b72..de1b403 100644 --- a/tests/test_processor.py +++ b/tests/test_processor.py @@ -1,13 +1,17 @@ from datetime import date from decimal import Decimal +from pathlib import Path from typing import Any from unittest.mock import AsyncMock, MagicMock import pytest +from payslip_ingest import processor from payslip_ingest.processor import process_document from payslip_ingest.schema import ExtractedPayslip +FIXTURES = Path(__file__).parent / "fixtures" + def _sample_extraction() -> ExtractedPayslip: return ExtractedPayslip( @@ -164,3 +168,37 @@ async def test_process_document_flags_validation_failure(paperless: AsyncMock, assert result.status == "inserted" assert result.validated is False assert factory.used[1].added[0].validated is False + + +async def test_regex_parser_short_circuits_claude(paperless: AsyncMock, extractor: AsyncMock, + monkeypatch: pytest.MonkeyPatch) -> None: + """When pdftotext output matches the Meta template, Claude must not run.""" + meta_text = (FIXTURES / "meta_uk_2026_02.txt").read_text(encoding="utf-8") + monkeypatch.setattr(processor, "_pdftotext", lambda _: meta_text) + + factory = _SessionFactory([_FakeSession(existing_ids=[]), _FakeSession(existing_ids=[])]) + result = await process_document(42, factory, paperless, extractor) + + assert result.status == "inserted" + assert result.validated is True + assert result.extractor == "meta_uk_regex" + extractor.extract.assert_not_called() + # Salary / bonus / pension_sacrifice from the regex parser should land on the row. + row = factory.used[1].added[0] + assert row.salary == Decimal("10003.33") + assert row.pension_sacrifice == Decimal("600.20") + assert row.rsu_vest == Decimal("30479.76") + assert row.taxable_pay == Decimal("72096.92") + + +async def test_regex_miss_falls_back_to_claude(paperless: AsyncMock, extractor: AsyncMock, + monkeypatch: pytest.MonkeyPatch) -> None: + """When pdftotext output doesn't match Meta, Claude is invoked.""" + monkeypatch.setattr(processor, "_pdftotext", lambda _: "Some other employer's payslip\n") + + factory = _SessionFactory([_FakeSession(existing_ids=[]), _FakeSession(existing_ids=[])]) + result = await process_document(42, factory, paperless, extractor) + + assert result.status == "inserted" + assert result.extractor == "claude" + extractor.extract.assert_awaited_once()