"""Add p60_reference table for HMRC annual ground-truth reconciliation. P60 is the authoritative end-of-year certificate HMRC issues; its figures match what HMRC has on file. Storing one row per (tax_year, employer) lets the dashboard compare `SUM(payslip)` against the P60 totals and surface missing-month gaps or parser drift. Columns mirror what the P60 explicitly prints; everything derived (effective rate, deltas) stays in the dashboard SQL. `paperless_doc_id` is unique so re-uploading the same PDF is idempotent. `raw_extraction` keeps the full parsed dict for debugging parser regressions. """ import sqlalchemy as sa from sqlalchemy.dialects import postgresql from alembic import op revision = "0005" down_revision = "0004" branch_labels = None depends_on = None SCHEMA = "payslip_ingest" def upgrade() -> None: op.create_table( "p60_reference", sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), sa.Column("tax_year", sa.String(), nullable=False), sa.Column("employer", sa.String(), nullable=False), sa.Column("employer_paye_ref", sa.String(), nullable=True), sa.Column("gross_pay", sa.Numeric(12, 2), nullable=False), sa.Column("income_tax", sa.Numeric(12, 2), nullable=False), sa.Column("national_insurance", sa.Numeric(12, 2), nullable=False), sa.Column("student_loan", sa.Numeric(12, 2), nullable=True), sa.Column("tax_code", sa.String(), nullable=True), sa.Column("paperless_doc_id", sa.Integer(), nullable=False, unique=True), sa.Column( "raw_extraction", postgresql.JSONB().with_variant(sa.JSON(), "sqlite"), nullable=False, ), sa.Column( "created_at", sa.TIMESTAMP(timezone=True), nullable=False, server_default=sa.text("now()"), ), sa.UniqueConstraint("tax_year", "employer", name="uq_p60_tax_year_employer"), schema=SCHEMA, ) op.create_index( "ix_p60_reference_tax_year", "p60_reference", ["tax_year"], schema=SCHEMA, ) def downgrade() -> None: op.drop_index("ix_p60_reference_tax_year", table_name="p60_reference", schema=SCHEMA) op.drop_table("p60_reference", schema=SCHEMA)