payslip-ingest/alembic/versions/0001_initial.py

"""initial schema

Revision ID: 0001
Revises:
Create Date: 2026-04-18 00:00:00.000000

"""
from collections.abc import Sequence

import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

from alembic import op

# Revision identifiers read by Alembic to place this migration in the history.
revision: str = "0001"
# No parent revision: this is the first migration in the chain.
down_revision: str | None = None
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None

# PostgreSQL schema that upgrade() creates and downgrade() drops; the payslip
# table and its indexes are created inside it.
SCHEMA = "payslip_ingest"


def upgrade() -> None:
    """Create the schema, the ``payslip`` table and its two lookup indexes.

    The schema is created only if it does not already exist. The table holds
    one row per Paperless document (``paperless_doc_id`` is unique), and the
    indexes cover ``pay_date`` and ``tax_year``.
    """
    op.execute(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")

    # These amounts share one definition: NOT NULL with a server-side default
    # of zero. Tuple order is the column order in the created table.
    zero_defaulted_amounts = [
        sa.Column(
            column_name,
            sa.Numeric(12, 2),
            nullable=False,
            server_default=sa.text("0"),
        )
        for column_name in (
            "income_tax",
            "national_insurance",
            "pension_employee",
            "pension_employer",
            "student_loan",
        )
    ]

    op.create_table(
        "payslip",
        sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("paperless_doc_id", sa.Integer(), nullable=False, unique=True),
        sa.Column("pay_date", sa.Date(), nullable=False),
        sa.Column("pay_period_start", sa.Date(), nullable=True),
        sa.Column("pay_period_end", sa.Date(), nullable=True),
        sa.Column("employer", sa.Text(), nullable=True),
        sa.Column("currency", sa.CHAR(3), nullable=False, server_default="GBP"),
        # gross_pay and net_pay have no default: they must always be supplied.
        sa.Column("gross_pay", sa.Numeric(12, 2), nullable=False),
        *zero_defaulted_amounts,
        sa.Column("other_deductions", postgresql.JSONB(), nullable=True),
        sa.Column("net_pay", sa.Numeric(12, 2), nullable=False),
        sa.Column("tax_year", sa.Text(), nullable=False),
        sa.Column("raw_extraction", postgresql.JSONB(), nullable=False),
        sa.Column(
            "validated",
            sa.Boolean(),
            nullable=False,
            server_default=sa.text("true"),
        ),
        sa.Column(
            "created_at",
            sa.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        schema=SCHEMA,
    )

    indexes = (
        ("idx_payslip_pay_date", "pay_date"),
        ("idx_payslip_tax_year", "tax_year"),
    )
    for index_name, indexed_column in indexes:
        op.create_index(index_name, "payslip", [indexed_column], schema=SCHEMA)


def downgrade() -> None:
    """Reverse ``upgrade()``: drop the indexes, the table, then the schema.

    The schema drop uses ``IF EXISTS`` and no ``CASCADE``.
    """
    table = "payslip"

    # Indexes are removed in the reverse of their creation order.
    for index_name in ("idx_payslip_tax_year", "idx_payslip_pay_date"):
        op.drop_index(index_name, table_name=table, schema=SCHEMA)

    op.drop_table(table, schema=SCHEMA)
    op.execute(f"DROP SCHEMA IF EXISTS {SCHEMA}")
Initial commit: event-driven UK payslip ingest service Extracted from /home/wizard/code monorepo into its own repo so Woodpecker CI can watch it. Identical content to /home/wizard/code commit e426028. See README.md for overview, env vars, and Paperless workflow config. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-18 22:10:23 +00:00			`"""initial schema`

			`Revision ID: 0001`
			`Revises:`
			`Create Date: 2026-04-18 00:00:00.000000`

			`"""`
			`from collections.abc import Sequence`

			`import sqlalchemy as sa`
			`from sqlalchemy.dialects import postgresql`

			`from alembic import op`

			`revision: str = "0001"`
			`down_revision: str | None = None`
			`branch_labels: str | Sequence[str] | None = None`
			`depends_on: str | Sequence[str] | None = None`

			`SCHEMA = "payslip_ingest"`


			`def upgrade() -> None:`
			`op.execute(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")`

			`op.create_table(`
			`"payslip",`
			`sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),`
			`sa.Column("paperless_doc_id", sa.Integer(), nullable=False, unique=True),`
			`sa.Column("pay_date", sa.Date(), nullable=False),`
			`sa.Column("pay_period_start", sa.Date(), nullable=True),`
			`sa.Column("pay_period_end", sa.Date(), nullable=True),`
			`sa.Column("employer", sa.Text(), nullable=True),`
			`sa.Column("currency", sa.CHAR(3), nullable=False, server_default="GBP"),`
			`sa.Column("gross_pay", sa.Numeric(12, 2), nullable=False),`
			`sa.Column("income_tax", sa.Numeric(12, 2), nullable=False, server_default=sa.text("0")),`
v2: regex parser for Meta UK template + accurate RSU tax attribution ## Context v1 shipped a Claude Haiku-based extractor that validated only 10/71 backfilled rows. Haiku fumbles the arithmetic on pension salary-sacrifice, conflates RSU vest with regular earnings, and occasionally misreads YTD vs this-period columns — so 86% of rows land with validated=false and the downstream dashboards under-report take-home. Meta UK uses a stable two-variant template (pre/post 2022-01-31 boundary), so a regex parser is both faster (ms vs. 30-90s + $0.01-0.05/call) and more accurate. v2 introduces that parser as the primary path, keeps Claude as the fallback for non-Meta payslips, and surfaces new fields the dashboard needs to attribute PAYE between cash salary and RSU vests correctly. ## This change ### Parser (new) `payslip_ingest/parsers/meta_uk.py` detects the layout variant by header presence: - Variant A (pre-2022): vertical Description/This Period/This Year. `AE Pension EE` is a positive deduction against a pre-sacrifice gross — maps to `pension_employee` for the existing validation formula to hold. - Variant B (post-2022): side-by-side Payments \| Deductions \| Year to Date. `AE Pension EE` is NEGATIVE in Payments (salary sacrifice) — maps to `pension_sacrifice` and is already netted into Total Payment. `rsu_vest = RSU Tax Offset + RSU Excs Refund` (Meta's template inflates Taxable Pay without using a matching offset deduction). Column boundaries come from the header row's anchor positions; each data row slices into 3 cells and the last numeric token per cell is the amount. Anchor misses raise ParserError so the caller falls back to Claude rather than silently returning bad data. 
### New fields Schema + DB + Claude prompt gain: - `salary`, `bonus`, `pension_sacrifice` — earnings decomposition for the dashboard's bonus-sacrifice visibility and earnings-breakdown chart - `taxable_pay`, `ytd_tax_paid`, `ytd_taxable_pay`, `ytd_gross` — powers the YTD-effective-rate method of attributing cash tax vs RSU tax, which is the only method that's accurate month-to-month All new columns default to 0 / null so v1 rows continue to round-trip. ### Orchestration processor.py tries `parse_meta_uk(pdftotext(pdf))` first. On success the result goes straight to the DB — zero Claude tokens spent, extraction in milliseconds. On ParserError it falls through to ClaudeExtractor as before. ProcessResult gains an `extractor` field ("meta_uk_regex" \| "claude") so backfill logs show the hit rate. ## Tests - `test_meta_uk_parser.py` — 11 tests covering variant A, variant B (standard + bonus month + bonus-sacrificed month), malformed inputs, and end-to-end totals validation for all 4 golden fixtures. - `test_processor.py` — 2 new tests proving the regex-first short-circuit and the Claude fallback on non-Meta inputs. Fixtures under `tests/fixtures/` are hand-crafted `pdftotext -layout` emulations — real Meta numbers from the plan's sample payslips for variant B, synthesized realistic variant A and bonus-sacrificed samples. 0001_initial.py reformat is yapf cleanup touched during the session's format pass; not a behavior change. ## Test Plan ### Automated ``` $ poetry run pytest ============================= test session starts ============================== collected 53 items tests/test_extractor.py ..... [ 9%] tests/test_meta_uk_parser.py ........... [ 30%] tests/test_paperless.py ...... [ 41%] tests/test_processor.py .............. [ 67%] tests/test_schema.py .... [ 75%] tests/test_tax_year.py ........ [ 90%] tests/test_webhook.py ..... [100%] ============================== 53 passed in 1.66s ============================== $ poetry run ruff check . All checks passed! 
$ poetry run mypy . Success: no issues found in 24 source files $ poetry run yapf --style pyproject.toml --diff --recursive payslip_ingest tests (no output — all files are yapf-clean) ``` ### Manual Verification Smoke-test the parser against a real Meta payslip PDF on the deploy host: ``` # After 0003 migration applied to prod DB $ poetry run python -c " from payslip_ingest.parsers import parse_meta_uk import subprocess text = subprocess.check_output(['pdftotext', '-layout', '/path/to/real.pdf', '-']).decode() p = parse_meta_uk(text) print(p.model_dump_json(indent=2)) " ``` Expected: JSON with salary/bonus/rsu_vest/pension_sacrifice populated and `validate_totals(p)` returning True. ## Reproduce locally 1. `cd payslip-ingest && poetry install` 2. `poetry run pytest tests/test_meta_uk_parser.py -v` 3. Expected: 11 tests pass, each fixture validates totals within 2p. Closes: code-un1 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-19 10:53:52 +00:00			`sa.Column("national_insurance",`
			`sa.Numeric(12, 2),`
			`nullable=False,`
			`server_default=sa.text("0")),`
			`sa.Column("pension_employee",`
			`sa.Numeric(12, 2),`
			`nullable=False,`
			`server_default=sa.text("0")),`
			`sa.Column("pension_employer",`
			`sa.Numeric(12, 2),`
			`nullable=False,`
			`server_default=sa.text("0")),`
Initial commit: event-driven UK payslip ingest service Extracted from /home/wizard/code monorepo into its own repo so Woodpecker CI can watch it. Identical content to /home/wizard/code commit e426028. See README.md for overview, env vars, and Paperless workflow config. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-18 22:10:23 +00:00			`sa.Column("student_loan", sa.Numeric(12, 2), nullable=False, server_default=sa.text("0")),`
			`sa.Column("other_deductions", postgresql.JSONB(), nullable=True),`
			`sa.Column("net_pay", sa.Numeric(12, 2), nullable=False),`
			`sa.Column("tax_year", sa.Text(), nullable=False),`
			`sa.Column("raw_extraction", postgresql.JSONB(), nullable=False),`
			`sa.Column("validated", sa.Boolean(), nullable=False, server_default=sa.text("true")),`
			`sa.Column(`
			`"created_at",`
			`sa.TIMESTAMP(timezone=True),`
			`nullable=False,`
			`server_default=sa.text("now()"),`
			`),`
			`schema=SCHEMA,`
			`)`
v2: regex parser for Meta UK template + accurate RSU tax attribution ## Context v1 shipped a Claude Haiku-based extractor that validated only 10/71 backfilled rows. Haiku fumbles the arithmetic on pension salary-sacrifice, conflates RSU vest with regular earnings, and occasionally misreads YTD vs this-period columns — so 86% of rows land with validated=false and the downstream dashboards under-report take-home. Meta UK uses a stable two-variant template (pre/post 2022-01-31 boundary), so a regex parser is both faster (ms vs. 30-90s + $0.01-0.05/call) and more accurate. v2 introduces that parser as the primary path, keeps Claude as the fallback for non-Meta payslips, and surfaces new fields the dashboard needs to attribute PAYE between cash salary and RSU vests correctly. ## This change ### Parser (new) `payslip_ingest/parsers/meta_uk.py` detects the layout variant by header presence: - Variant A (pre-2022): vertical Description/This Period/This Year. `AE Pension EE` is a positive deduction against a pre-sacrifice gross — maps to `pension_employee` for the existing validation formula to hold. - Variant B (post-2022): side-by-side Payments \| Deductions \| Year to Date. `AE Pension EE` is NEGATIVE in Payments (salary sacrifice) — maps to `pension_sacrifice` and is already netted into Total Payment. `rsu_vest = RSU Tax Offset + RSU Excs Refund` (Meta's template inflates Taxable Pay without using a matching offset deduction). Column boundaries come from the header row's anchor positions; each data row slices into 3 cells and the last numeric token per cell is the amount. Anchor misses raise ParserError so the caller falls back to Claude rather than silently returning bad data. 
### New fields Schema + DB + Claude prompt gain: - `salary`, `bonus`, `pension_sacrifice` — earnings decomposition for the dashboard's bonus-sacrifice visibility and earnings-breakdown chart - `taxable_pay`, `ytd_tax_paid`, `ytd_taxable_pay`, `ytd_gross` — powers the YTD-effective-rate method of attributing cash tax vs RSU tax, which is the only method that's accurate month-to-month All new columns default to 0 / null so v1 rows continue to round-trip. ### Orchestration processor.py tries `parse_meta_uk(pdftotext(pdf))` first. On success the result goes straight to the DB — zero Claude tokens spent, extraction in milliseconds. On ParserError it falls through to ClaudeExtractor as before. ProcessResult gains an `extractor` field ("meta_uk_regex" \| "claude") so backfill logs show the hit rate. ## Tests - `test_meta_uk_parser.py` — 11 tests covering variant A, variant B (standard + bonus month + bonus-sacrificed month), malformed inputs, and end-to-end totals validation for all 4 golden fixtures. - `test_processor.py` — 2 new tests proving the regex-first short-circuit and the Claude fallback on non-Meta inputs. Fixtures under `tests/fixtures/` are hand-crafted `pdftotext -layout` emulations — real Meta numbers from the plan's sample payslips for variant B, synthesized realistic variant A and bonus-sacrificed samples. 0001_initial.py reformat is yapf cleanup touched during the session's format pass; not a behavior change. ## Test Plan ### Automated ``` $ poetry run pytest ============================= test session starts ============================== collected 53 items tests/test_extractor.py ..... [ 9%] tests/test_meta_uk_parser.py ........... [ 30%] tests/test_paperless.py ...... [ 41%] tests/test_processor.py .............. [ 67%] tests/test_schema.py .... [ 75%] tests/test_tax_year.py ........ [ 90%] tests/test_webhook.py ..... [100%] ============================== 53 passed in 1.66s ============================== $ poetry run ruff check . All checks passed! 
$ poetry run mypy . Success: no issues found in 24 source files $ poetry run yapf --style pyproject.toml --diff --recursive payslip_ingest tests (no output — all files are yapf-clean) ``` ### Manual Verification Smoke-test the parser against a real Meta payslip PDF on the deploy host: ``` # After 0003 migration applied to prod DB $ poetry run python -c " from payslip_ingest.parsers import parse_meta_uk import subprocess text = subprocess.check_output(['pdftotext', '-layout', '/path/to/real.pdf', '-']).decode() p = parse_meta_uk(text) print(p.model_dump_json(indent=2)) " ``` Expected: JSON with salary/bonus/rsu_vest/pension_sacrifice populated and `validate_totals(p)` returning True. ## Reproduce locally 1. `cd payslip-ingest && poetry install` 2. `poetry run pytest tests/test_meta_uk_parser.py -v` 3. Expected: 11 tests pass, each fixture validates totals within 2p. Closes: code-un1 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-19 10:53:52 +00:00			`op.create_index("idx_payslip_pay_date", "payslip", ["pay_date"], schema=SCHEMA)`
			`op.create_index("idx_payslip_tax_year", "payslip", ["tax_year"], schema=SCHEMA)`
Initial commit: event-driven UK payslip ingest service Extracted from /home/wizard/code monorepo into its own repo so Woodpecker CI can watch it. Identical content to /home/wizard/code commit e426028. See README.md for overview, env vars, and Paperless workflow config. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-18 22:10:23 +00:00

			`def downgrade() -> None:`
			`op.drop_index("idx_payslip_tax_year", table_name="payslip", schema=SCHEMA)`
			`op.drop_index("idx_payslip_pay_date", table_name="payslip", schema=SCHEMA)`
			`op.drop_table("payslip", schema=SCHEMA)`
			`op.execute(f"DROP SCHEMA IF EXISTS {SCHEMA}")`