payslip-ingest/payslip_ingest/db.py
Viktor Barzin 9105b6b79d extractor: track rsu_vest + rsu_offset separately from cash pay
UK payslips for equity-comp employees report RSU vests as notional pay
for HMRC only. A paired same-magnitude deduction (Shares Retained /
Stock Tax Withholding / RSU Offset) nets it back out of cash. The UK
payslip's income_tax line shows tax on the grossed-up total, but the
actual RSU tax is handled by Schwab (US broker) via share sale. No
cash flows through UK payroll for RSU.

Previously the extractor folded RSU notional into gross_pay and
income_tax, which inflated the dashboard numbers — a payslip with
£25k RSU vest looked like 2x salary with 80% tax rate.

Changes:
- schema: add rsu_vest + rsu_offset fields (default 0).
- db + alembic 0002: add two new NUMERIC(12,2) columns with server
  default 0 (backward-compatible; existing rows get 0).
- validate_totals: include rsu_offset in deductions sum so the
  gross + rsu_vest inflation is properly netted out.
- extraction prompt: explicit rules for identifying RSU lines by the
  common Meta/Sage/Workday labels, and to NOT put them in
  other_deductions.

Dashboards in a follow-up commit: cash_gross = gross_pay - rsu_vest,
effective tax rate based on cash metrics.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 23:37:25 +00:00

71 lines
3.7 KiB
Python

import os
from datetime import date, datetime
from decimal import Decimal
from typing import Any
from sqlalchemy import JSON, TIMESTAMP, Boolean, Date, Integer, Numeric, String, text
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.ext.asyncio import AsyncEngine, async_sessionmaker, create_async_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
SCHEMA_NAME = "payslip_ingest"
class Base(DeclarativeBase):
pass
# JSONB on Postgres, plain JSON (as text) on SQLite — tests use SQLite, prod uses Postgres.
JSON_TYPE = JSONB().with_variant(JSON(), "sqlite")
class Payslip(Base):
__tablename__ = "payslip"
__table_args__ = {"schema": SCHEMA_NAME} # noqa: RUF012
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
paperless_doc_id: Mapped[int] = mapped_column(Integer, unique=True, nullable=False)
pay_date: Mapped[date] = mapped_column(Date, nullable=False)
pay_period_start: Mapped[date | None] = mapped_column(Date, nullable=True)
pay_period_end: Mapped[date | None] = mapped_column(Date, nullable=True)
employer: Mapped[str | None] = mapped_column(String, nullable=True)
currency: Mapped[str] = mapped_column(String(3), nullable=False, server_default="GBP")
gross_pay: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False)
income_tax: Mapped[Decimal] = mapped_column(Numeric(12, 2),
nullable=False,
server_default=text("0"))
national_insurance: Mapped[Decimal] = mapped_column(Numeric(12, 2),
nullable=False,
server_default=text("0"))
pension_employee: Mapped[Decimal] = mapped_column(Numeric(12, 2),
nullable=False,
server_default=text("0"))
pension_employer: Mapped[Decimal] = mapped_column(Numeric(12, 2),
nullable=False,
server_default=text("0"))
student_loan: Mapped[Decimal] = mapped_column(Numeric(12, 2),
nullable=False,
server_default=text("0"))
rsu_vest: Mapped[Decimal] = mapped_column(Numeric(12, 2),
nullable=False,
server_default=text("0"))
rsu_offset: Mapped[Decimal] = mapped_column(Numeric(12, 2),
nullable=False,
server_default=text("0"))
other_deductions: Mapped[dict[str, Any] | None] = mapped_column(JSON_TYPE, nullable=True)
net_pay: Mapped[Decimal] = mapped_column(Numeric(12, 2), nullable=False)
tax_year: Mapped[str] = mapped_column(String, nullable=False)
raw_extraction: Mapped[dict[str, Any]] = mapped_column(JSON_TYPE, nullable=False)
validated: Mapped[bool] = mapped_column(Boolean, nullable=False, server_default=text("true"))
created_at: Mapped[datetime] = mapped_column(TIMESTAMP(timezone=True),
nullable=False,
server_default=text("now()"))
def create_engine_from_env() -> AsyncEngine:
url = os.environ["DB_CONNECTION_STRING"]
return create_async_engine(url, pool_pre_ping=True)
def make_session_factory(engine: AsyncEngine) -> async_sessionmaker[Any]:
return async_sessionmaker(engine, expire_on_commit=False)