payslip-ingest/payslip_ingest/db.py

163 lines
9.2 KiB
Python
Raw Normal View History

import os
from datetime import date, datetime
from decimal import Decimal
from typing import Any
from sqlalchemy import JSON, TIMESTAMP, Boolean, Date, Integer, Numeric, String, text
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.ext.asyncio import AsyncEngine, async_sessionmaker, create_async_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
# Database schema that every model in this module places its table in
# (each model passes it as ``{"schema": SCHEMA_NAME}`` in ``__table_args__``).
SCHEMA_NAME = "payslip_ingest"
class Base(DeclarativeBase):
    """Declarative base shared by every ORM model defined in this module."""
# Column type for free-form JSON payloads. JSONB is the default (prod runs on
# Postgres); on the "sqlite" dialect it is swapped for the generic JSON type
# (stored as text), because the tests run against SQLite.
JSON_TYPE = JSONB().with_variant(JSON(), "sqlite")
class Payslip(Base):
    """One extracted payslip, stored in the ``payslip`` table.

    ``paperless_doc_id`` is unique, so a given Paperless document maps to at
    most one row. Monetary amounts are ``Numeric(12, 2)``. Per-period
    deduction/earnings columns are NOT NULL with a server default of 0; the
    taxable-pay, YTD and cash-tax columns are nullable.
    """

    __tablename__ = "payslip"
    __table_args__ = {"schema": SCHEMA_NAME}  # noqa: RUF012

    # --- identity -----------------------------------------------------------
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    paperless_doc_id: Mapped[int] = mapped_column(Integer, unique=True, nullable=False)

    # --- pay period / employer ----------------------------------------------
    pay_date: Mapped[date] = mapped_column(Date, nullable=False)
    pay_period_start: Mapped[date | None] = mapped_column(Date, nullable=True)
    pay_period_end: Mapped[date | None] = mapped_column(Date, nullable=True)
    employer: Mapped[str | None] = mapped_column(String, nullable=True)
    currency: Mapped[str] = mapped_column(String(3), nullable=False, server_default="GBP")

    # --- this-period totals and deductions ----------------------------------
    gross_pay: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False)
    income_tax: Mapped[Decimal] = mapped_column(
        Numeric(12, 2), nullable=False, server_default=text("0")
    )
    national_insurance: Mapped[Decimal] = mapped_column(
        Numeric(12, 2), nullable=False, server_default=text("0")
    )
    pension_employee: Mapped[Decimal] = mapped_column(
        Numeric(12, 2), nullable=False, server_default=text("0")
    )
    pension_employer: Mapped[Decimal] = mapped_column(
        Numeric(12, 2), nullable=False, server_default=text("0")
    )
    student_loan: Mapped[Decimal] = mapped_column(
        Numeric(12, 2), nullable=False, server_default=text("0")
    )
    rsu_vest: Mapped[Decimal] = mapped_column(
        Numeric(12, 2), nullable=False, server_default=text("0")
    )
    rsu_offset: Mapped[Decimal] = mapped_column(
        Numeric(12, 2), nullable=False, server_default=text("0")
    )

    # --- earnings decomposition (added with the v2 regex parser) ------------
    # Feeds the dashboard's earnings-breakdown chart and bonus-sacrifice
    # visibility. Defaults to 0 so rows written before these columns existed
    # still round-trip.
    salary: Mapped[Decimal] = mapped_column(
        Numeric(12, 2), nullable=False, server_default=text("0")
    )
    bonus: Mapped[Decimal] = mapped_column(
        Numeric(12, 2), nullable=False, server_default=text("0")
    )
    pension_sacrifice: Mapped[Decimal] = mapped_column(
        Numeric(12, 2), nullable=False, server_default=text("0")
    )

    # --- taxable pay and year-to-date figures -------------------------------
    # Inputs to the YTD-effective-rate method of attributing PAYE between cash
    # salary and RSU vests. Nullable: older rows do not carry them.
    taxable_pay: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    ytd_tax_paid: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    ytd_taxable_pay: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    ytd_gross: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)

    # --- cash vs RSU-attributed income tax ----------------------------------
    # `income_tax` on the slip is total PAYE (cash + RSU-attributed).
    # `cash_income_tax` is a derived figure stored beside it; it was introduced
    # as income_tax * (gross_pay - pension_sacrifice) / taxable_pay.
    # NOTE(review): `cash_income_tax_source` presumably records which method
    # produced the derived value — confirm against the code that writes it.
    cash_income_tax: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    cash_income_tax_source: Mapped[str | None] = mapped_column(String(16), nullable=True)

    # YTD column values of `RSU Tax Offset` / `RSU Excs Refund` from the
    # payslip's Payments grid, kept for reconciliation against HMRC annual
    # figures.
    ytd_rsu_tax_offset: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    ytd_rsu_excs_refund: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)

    # --- remainder ------------------------------------------------------------
    other_deductions: Mapped[dict[str, Any] | None] = mapped_column(JSON_TYPE, nullable=True)
    net_pay: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False)
    tax_year: Mapped[str] = mapped_column(String, nullable=False)
    raw_extraction: Mapped[dict[str, Any]] = mapped_column(JSON_TYPE, nullable=False)
    # NOTE(review): presumably the outcome of the totals validation run at
    # ingest time — confirm against the processor. Server default is true.
    validated: Mapped[bool] = mapped_column(
        Boolean, nullable=False, server_default=text("true")
    )
    created_at: Mapped[datetime] = mapped_column(
        TIMESTAMP(timezone=True), nullable=False, server_default=text("now()")
    )
class ExternalMetaDeposit(Base):
    """A Meta payroll deposit as ActualBudget recorded it.

    Serves as ground truth against `payslip.net_pay`. Rows are synced daily by
    a CronJob that reads from the jhonderson/actual-http-api sidecar.

    Idempotent on `actualbudget_tx_id`: the same transaction id from AB means
    the same deposit, so re-runs are no-ops. Deletions in AB are not
    propagated.
    """

    __tablename__ = "external_meta_deposits"
    __table_args__ = {"schema": SCHEMA_NAME}  # noqa: RUF012

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # Unique per ActualBudget transaction; this is the idempotency key.
    actualbudget_tx_id: Mapped[str] = mapped_column(String, unique=True, nullable=False)
    deposit_date: Mapped[date] = mapped_column(Date, nullable=False)
    amount: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False)
    payee: Mapped[str | None] = mapped_column(String, nullable=True)
    memo: Mapped[str | None] = mapped_column(String, nullable=True)
    synced_at: Mapped[datetime] = mapped_column(
        TIMESTAMP(timezone=True), nullable=False, server_default=text("now()")
    )
class RsuVestEvent(Base):
    """A Schwab RSU vest event, one row per vest.

    Serves as ground truth against `payslip.rsu_vest`. `external_id` is stable
    across IMAP re-runs and has the form
    `schwab:{date}:{ticker}:VEST:{shares_vested}`. USD→GBP conversion happens
    at write time using the daily ECB rate.
    """

    __tablename__ = "rsu_vest_events"
    __table_args__ = {"schema": SCHEMA_NAME}  # noqa: RUF012

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # Unique, so re-ingesting the same vest cannot create a second row.
    external_id: Mapped[str] = mapped_column(String, unique=True, nullable=False)
    vest_date: Mapped[date] = mapped_column(Date, nullable=False)
    ticker: Mapped[str] = mapped_column(String, nullable=False)

    # Share counts keep four decimal places.
    shares_vested: Mapped[Decimal] = mapped_column(Numeric(14, 4), nullable=False)
    shares_sold_to_cover: Mapped[Decimal | None] = mapped_column(
        Numeric(14, 4), nullable=True
    )

    # USD figures as reported for the vest.
    fmv_at_vest_usd: Mapped[Decimal] = mapped_column(Numeric(12, 4), nullable=False)
    tax_withheld_usd: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)

    # FX rate and the GBP amounts; all nullable.
    fx_rate_gbp: Mapped[Decimal | None] = mapped_column(Numeric(10, 6), nullable=True)
    gross_value_gbp: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    tax_withheld_gbp: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)

    source: Mapped[str] = mapped_column(String(32), nullable=False)
    raw_extraction: Mapped[dict[str, Any] | None] = mapped_column(JSON_TYPE, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        TIMESTAMP(timezone=True), nullable=False, server_default=text("now()")
    )
# parser + P60 ingest: split income_tax cash/RSU, add P60 ground-truth
#
# Meta variant-B payslips gross up Taxable Pay for RSU and compute PAYE on the
# grossed-up figure, so `income_tax` on the slip is the total PAYE (cash +
# RSU-attributed). Dashboards that stacked the raw figure made vest-month tax
# look ~2x higher than "cash tax paid".
#
# Introduce `cash_income_tax = income_tax * (gross_pay - pension_sacrifice) / taxable_pay`
# as a derived column alongside the raw figure. Dashboards can now stack cash
# vs RSU-attributed tax as separate segments.
#
# Also capture YTD column values of `RSU Tax Offset` and `RSU Excs Refund` from
# the Payments grid — needed for reconciliation against HMRC annual figures.
#
# P60 ingest: new parser under `parsers/p60.py` anchoring on statutory HMRC
# line labels (`Tax year to 5 April YYYY`, `Employer PAYE reference`,
# `In this employment` pay/tax row, NI letter bands). Processor routes
# documents carrying the `p60` Paperless tag to `_handle_p60`, which writes to
# the new `payslip_ingest.p60_reference` table (one row per tax_year+employer).
# App lifespan resolves the tag id at startup; missing tag disables dispatch
# without breaking payslip ingest. Paperless tag creation + webhook config are
# manual follow-ups.
#
# Migrations:
# - 0004 — cash_income_tax + ytd_rsu_tax_offset + ytd_rsu_excs_refund on
#   payslip, all nullable.
# - 0005 — p60_reference table with (tax_year, employer) unique +
#   paperless_doc_id unique for idempotent re-uploads.
#
# Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
# 2026-04-19 15:23:05 +00:00
class P60Reference(Base):
    """An HMRC-issued annual P60, one row per (tax_year, employer).

    This is the source of truth for annual PAYE/NI: it lets the dashboard
    reconcile `SUM(payslip_ingest.payslip)` against the figures HMRC actually
    has on file, catching both missing-month gaps and parser drift.

    NOTE(review): the one-row-per-(tax_year, employer) rule is not declared on
    this model — only `paperless_doc_id` is unique here. Presumably it is
    enforced by a constraint created in a migration; confirm.
    """

    __tablename__ = "p60_reference"
    __table_args__ = {"schema": SCHEMA_NAME}  # noqa: RUF012

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    tax_year: Mapped[str] = mapped_column(String, nullable=False, index=True)
    employer: Mapped[str] = mapped_column(String, nullable=False)
    employer_paye_ref: Mapped[str | None] = mapped_column(String, nullable=True)

    # Annual totals taken from the P60.
    gross_pay: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False)
    income_tax: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False)
    national_insurance: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False)
    student_loan: Mapped[Decimal | None] = mapped_column(Numeric(12, 2), nullable=True)
    tax_code: Mapped[str | None] = mapped_column(String, nullable=True)

    # Unique, so a given Paperless document maps to at most one row.
    paperless_doc_id: Mapped[int] = mapped_column(Integer, unique=True, nullable=False)
    raw_extraction: Mapped[dict[str, Any]] = mapped_column(JSON_TYPE, nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        TIMESTAMP(timezone=True), nullable=False, server_default=text("now()")
    )
def create_engine_from_env() -> AsyncEngine:
    """Create the async engine from the ``DB_CONNECTION_STRING`` env variable.

    The engine is created with ``pool_pre_ping=True``.

    Raises:
        KeyError: if ``DB_CONNECTION_STRING`` is not set in the environment.
    """
    return create_async_engine(os.environ["DB_CONNECTION_STRING"], pool_pre_ping=True)
def make_session_factory(engine: AsyncEngine) -> async_sessionmaker[Any]:
    """Return an ``async_sessionmaker`` bound to *engine*.

    The factory is configured with ``expire_on_commit=False``.
    """
    session_factory: async_sessionmaker[Any] = async_sessionmaker(
        engine,
        expire_on_commit=False,
    )
    return session_factory