parser + P60 ingest: split income_tax cash/RSU, add P60 ground-truth

Meta variant-B payslips gross up Taxable Pay for RSU and compute PAYE on the grossed-up figure, so `income_tax` on the slip is the total PAYE (cash + RSU-attributed). Dashboards that stacked the raw figure made vest-month tax look ~2x higher than "cash tax paid". Introduce `cash_income_tax = income_tax * (gross_pay - pension_sacrifice) / taxable_pay` as a derived column alongside the raw figure. Dashboards can now stack cash vs RSU-attributed tax as separate segments. Also capture YTD column values of `RSU Tax Offset` and `RSU Excs Refund` from the Payments grid — needed for reconciliation against HMRC annual figures. P60 ingest: new parser under `parsers/p60.py` anchoring on statutory HMRC line labels (`Tax year to 5 April YYYY`, `Employer PAYE reference`, `In this employment` pay/tax row, NI letter bands). Processor routes documents carrying the `p60` Paperless tag to `_handle_p60` which writes to the new `payslip_ingest.p60_reference` table (one row per tax_year+employer). App lifespan resolves the tag id at startup; missing tag disables dispatch without breaking payslip ingest. Paperless tag creation + webhook config are manual follow-ups. Migrations: - 0004 — cash_income_tax + ytd_rsu_tax_offset + ytd_rsu_excs_refund on payslip, all nullable. - 0005 — p60_reference table with (tax_year, employer) unique + paperless_doc_id unique for idempotent re-uploads. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 15:23:05 +00:00 · 2026-04-19 15:23:05 +00:00 · 26e43b1055
commit 26e43b1055
parent d91f34ddb4
14 changed files with 644 additions and 15 deletions
--- a/alembic/versions/0004_cash_income_tax.py
+++ b/alembic/versions/0004_cash_income_tax.py
@ -0,0 +1,53 @@
+"""Add cash_income_tax + YTD RSU offset/refund columns.
+
+Meta variant-B payslips gross up Taxable Pay for RSU vest; PAYE on the slip
+(`Tax paid`) is the total including the RSU-attributed portion. Storing the
+verbatim figure is correct for HMRC accounting but makes vest-month tax look
+~2x higher on dashboards that stack it against cash pay.
+
+`cash_income_tax` is the derived pro-rata share of PAYE that the cash portion
+of gross (gross - pension_sacrifice) bears, computed as
+`income_tax * (gross_pay - pension_sacrifice) / taxable_pay`. Dashboards can
+stack the derived column and show the remainder as the RSU-attributed slice.
+
+`ytd_rsu_tax_offset` and `ytd_rsu_excs_refund` capture the Year-to-Date
+column of the RSU lines in the Payments block — useful for reconciliation
+against HMRC's annual figures once the P60 / HMRC API pipelines land.
+
+All three columns are nullable; existing rows get NULL until a one-shot
+backfill runs.
+"""
+import sqlalchemy as sa
+
+from alembic import op
+
+revision = "0004"
+down_revision = "0003"
+branch_labels = None
+depends_on = None
+
+SCHEMA = "payslip_ingest"
+
+
+def upgrade() -> None:
+    """Add three nullable ``Numeric(12, 2)`` columns to ``payslip``.
+
+    The columns are created in the order listed below. No default or backfill
+    is applied here, so pre-existing rows hold NULL in each of them.
+    """
+    new_columns = ("cash_income_tax", "ytd_rsu_tax_offset", "ytd_rsu_excs_refund")
+    for column_name in new_columns:
+        # Build a fresh Column/Numeric pair for every add_column call.
+        column = sa.Column(column_name, sa.Numeric(12, 2), nullable=True)
+        op.add_column("payslip", column, schema=SCHEMA)
+
+
+def downgrade() -> None:
+    """Drop the three columns added by ``upgrade``, last-added first."""
+    for column_name in ("ytd_rsu_excs_refund", "ytd_rsu_tax_offset", "cash_income_tax"):
+        op.drop_column("payslip", column_name, schema=SCHEMA)
--- a/alembic/versions/0005_p60_reference.py
+++ b/alembic/versions/0005_p60_reference.py
@ -0,0 +1,63 @@
+"""Add p60_reference table for HMRC annual ground-truth reconciliation.
+
+P60 is the authoritative end-of-year certificate the employer issues to the
+employee; its figures match what HMRC has on file. Storing one row per
+(tax_year, employer) lets the dashboard compare `SUM(payslip)` against the
+P60 totals and surface missing-month gaps or parser drift.
+
+Columns mirror what the P60 explicitly prints; everything derived (effective
+rate, deltas) stays in the dashboard SQL. `paperless_doc_id` is unique so
+re-uploading the same PDF is idempotent. `raw_extraction` keeps the full
+parsed dict for debugging parser regressions.
+"""
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+from alembic import op
+
+revision = "0005"
+down_revision = "0004"
+branch_labels = None
+depends_on = None
+
+SCHEMA = "payslip_ingest"
+
+
+def upgrade() -> None:
+    """Create the ``p60_reference`` table and its ``tax_year`` index.
+
+    Uniqueness is enforced twice: on ``paperless_doc_id`` (at most one row per
+    Paperless document) and on the ``(tax_year, employer)`` pair.
+
+    ``created_at`` defaults through ``sa.func.now()``, SQLAlchemy's
+    dialect-aware current-timestamp construct, instead of the raw text
+    ``now()``. ``raw_extraction`` already declares a SQLite variant, so the
+    default should not be tied to PostgreSQL's spelling either.
+    """
+    # JSONB on PostgreSQL, plain JSON when the dialect is SQLite.
+    raw_json = postgresql.JSONB().with_variant(sa.JSON(), "sqlite")
+    op.create_table(
+        "p60_reference",
+        sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
+        sa.Column("tax_year", sa.String(), nullable=False),
+        sa.Column("employer", sa.String(), nullable=False),
+        sa.Column("employer_paye_ref", sa.String(), nullable=True),
+        sa.Column("gross_pay", sa.Numeric(12, 2), nullable=False),
+        sa.Column("income_tax", sa.Numeric(12, 2), nullable=False),
+        sa.Column("national_insurance", sa.Numeric(12, 2), nullable=False),
+        sa.Column("student_loan", sa.Numeric(12, 2), nullable=True),
+        sa.Column("tax_code", sa.String(), nullable=True),
+        sa.Column("paperless_doc_id", sa.Integer(), nullable=False, unique=True),
+        sa.Column("raw_extraction", raw_json, nullable=False),
+        sa.Column(
+            "created_at",
+            sa.TIMESTAMP(timezone=True),
+            nullable=False,
+            server_default=sa.func.now(),
+        ),
+        sa.UniqueConstraint("tax_year", "employer", name="uq_p60_tax_year_employer"),
+        schema=SCHEMA,
+    )
+    # NOTE(review): uq_p60_tax_year_employer already leads with tax_year, so this
+    # separate index may be redundant on PostgreSQL; confirm before removing it.
+    op.create_index("ix_p60_reference_tax_year", "p60_reference", ["tax_year"], schema=SCHEMA)
+
+
+def downgrade() -> None:
+    """Reverse ``upgrade``: remove the ``tax_year`` index, then the table."""
+    table = "p60_reference"
+    op.drop_index("ix_p60_reference_tax_year", table_name=table, schema=SCHEMA)
+    op.drop_table(table, schema=SCHEMA)