payslip-ingest/payslip_ingest/__main__.py

import asyncio
import json
import logging
import os
import subprocess
import sys
from pathlib import Path

import click
import uvicorn

from payslip_ingest.db import create_engine_from_env, make_session_factory
from payslip_ingest.extractor import ClaudeExtractor
from payslip_ingest.paperless import PaperlessClient
from payslip_ingest.processor import process_document
from payslip_ingest.schema import validate_totals

# Module-level logger, named after this module's ``__name__``.
log = logging.getLogger(__name__)


@click.group()
def cli() -> None:
    # Group callback for the CLI: sets up root logging from the LOG_LEVEL
    # environment variable (default INFO).
    #
    # The value is normalised before it reaches ``logging.basicConfig``, which
    # only recognises upper-case level names: "debug" or " info " would
    # otherwise raise ValueError, as would an empty value or a numeric string
    # such as "10". Unknown names still raise, so typos fail fast.
    raw_level = os.environ.get("LOG_LEVEL", "").strip()
    level: int | str
    if raw_level.isdecimal():
        level = int(raw_level)
    else:
        level = raw_level.upper() or "INFO"
    logging.basicConfig(level=level)


@cli.command()
def serve() -> None:
    """Run the webhook HTTP server (K8s entrypoint)."""
    # The app is handed to uvicorn as an import string; bind address and port
    # are fixed: every interface, port 8080.
    app_import_path = "payslip_ingest.app:app"
    uvicorn.run(app_import_path, host="0.0.0.0", port=8080)


@cli.command()
@click.option("--all", "process_all", is_flag=True, help="Process every payslip-tagged doc.")
@click.option("--limit", type=int, default=None, help="Cap the number of documents processed.")
@click.option("--tag", default="payslip", help="Paperless tag name to enumerate.")
def backfill(process_all: bool, limit: int | None, tag: str) -> None:
    """Enumerate every payslip-tagged Paperless doc and process sequentially."""
    # --all is a mandatory opt-in: without it the command ends with a usage
    # error instead of walking the whole tag.
    if process_all:
        asyncio.run(_backfill(tag, limit))
        return
    raise click.UsageError("pass --all to opt in to the full enumeration")


async def _backfill(tag: str, limit: int | None) -> None:
    """Process every Paperless document carrying ``tag``, one at a time.

    Each document is passed to ``process_document``; its outcome is echoed to
    stdout. A failing document is echoed to stderr, logged with its traceback
    and counted, but never aborts the run. A final summary line reports the
    number of documents attempted and the number that failed.

    Args:
        tag: Paperless tag name whose documents are enumerated.
        limit: Maximum number of documents to attempt (failed documents count
            towards it); ``None`` means no cap.

    Raises:
        KeyError: If PAPERLESS_URL, PAPERLESS_API_TOKEN, CLAUDE_AGENT_URL or
            CLAUDE_AGENT_BEARER_TOKEN is missing from the environment.
    """
    engine = create_engine_from_env()
    session_factory = make_session_factory(engine)
    paperless = PaperlessClient(
        base_url=os.environ["PAPERLESS_URL"],
        api_token=os.environ["PAPERLESS_API_TOKEN"],
    )
    extractor = ClaudeExtractor(
        base_url=os.environ["CLAUDE_AGENT_URL"],
        bearer_token=os.environ["CLAUDE_AGENT_BEARER_TOKEN"],
    )
    processed = 0
    failed = 0
    try:
        # Resolve the P60 tag if present — needed for the dispatch branch even
        # when backfilling a non-p60 tag (a P60-tagged doc carrying the payslip
        # tag too should still route to the P60 handler).
        #
        # This lookup awaits network I/O, so it lives inside the try/finally:
        # task cancellation (e.g. Ctrl-C under asyncio.run) is a BaseException,
        # is not caught by `except Exception`, and would otherwise skip the
        # cleanup below and leave the clients and engine open.
        p60_tag_id: int | None = None
        try:
            p60_tag_id = await paperless.get_tag_id("p60")
        except Exception as exc:
            click.echo(f"warning: p60 tag resolution failed — dispatch disabled: {exc}", err=True)

        async for doc in paperless.list_tagged_documents(tag):
            # `processed` counts attempts, so the cap applies to successes and
            # failures alike.
            if limit is not None and processed >= limit:
                break
            doc_id = int(doc["id"])
            try:
                result = await process_document(doc_id, session_factory, paperless, extractor,
                                                p60_tag_id)
                click.echo(f"doc_id={doc_id} status={result.status} validated={result.validated}")
            except Exception as exc:
                # Don't let a single bad doc (wrong tag, non-payslip PDF, Claude
                # hallucinating null fields) abort the whole backfill. Log + continue.
                failed += 1
                click.echo(f"doc_id={doc_id} status=failed error={type(exc).__name__}: {exc}",
                           err=True)
                log.exception("backfill: doc_id=%s failed", doc_id)
            processed += 1
        click.echo(f"backfill complete: processed={processed} failed={failed}")
    finally:
        await paperless.aclose()
        await extractor.aclose()
        await engine.dispose()


@cli.command("extract-one")
@click.argument("path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
def extract_one(path: Path) -> None:
    """Smoke-test extraction on a local PDF — no DB writes."""
    # Synchronous click entry point: drive the async implementation to completion.
    smoke_test = _extract_one(path)
    asyncio.run(smoke_test)


async def _extract_one(path: Path) -> None:
    """Extract a single local PDF and report whether its totals validate.

    Prints the extracted model as indented JSON, followed by a one-key JSON
    object (``totals_validated``) holding the ``validate_totals`` result. The
    extractor is closed whether or not extraction succeeds.

    Args:
        path: Location of the PDF to read.

    Raises:
        KeyError: If CLAUDE_AGENT_URL or CLAUDE_AGENT_BEARER_TOKEN is unset.
        SystemExit: With status 1 when ``validate_totals`` reports failure.
    """
    payload = path.read_bytes()
    agent_url = os.environ["CLAUDE_AGENT_URL"]
    agent_token = os.environ["CLAUDE_AGENT_BEARER_TOKEN"]
    client = ClaudeExtractor(base_url=agent_url, bearer_token=agent_token)
    # There is no Paperless document behind a local file, hence the null id.
    doc_meta = {"id": None, "source": str(path)}
    try:
        extracted = await client.extract(payload, doc_meta)
    finally:
        await client.aclose()

    click.echo(extracted.model_dump_json(indent=2))
    totals_ok = validate_totals(extracted)
    click.echo(json.dumps({"totals_validated": totals_ok}))
    if totals_ok:
        return
    sys.exit(1)


@cli.command()
def migrate() -> None:
    """Run `alembic upgrade head`."""
    # check=False: a failed migration is reported through this process's exit
    # status (mirroring alembic's) rather than a CalledProcessError.
    alembic_cmd = ["alembic", "upgrade", "head"]
    completed = subprocess.run(alembic_cmd, check=False)
    sys.exit(completed.returncode)


@cli.command("sync-meta-deposits")
def sync_meta_deposits_cmd() -> None:
    """Pull Meta payroll deposits from ActualBudget into external_meta_deposits.

    Reads from the jhonderson/actual-http-api sidecar. Requires env vars:
    ACTUALBUDGET_HTTP_API_URL, ACTUALBUDGET_API_KEY,
    ACTUALBUDGET_ENCRYPTION_PASSWORD, ACTUALBUDGET_BUDGET_SYNC_ID.
    """
    # Synchronous click entry point: drive the async implementation to completion.
    sync_job = _sync_meta_deposits()
    asyncio.run(sync_job)


async def _sync_meta_deposits() -> None:
    """Run the ActualBudget deposit sync and print a one-line summary.

    Builds the DB engine via ``create_engine_from_env`` and an
    ``ActualBudgetClient`` from the four ACTUALBUDGET_* environment variables,
    then delegates to ``sync_meta_deposits``. The client is closed and the
    engine disposed whether or not the sync succeeds.

    Raises:
        KeyError: If any of the ACTUALBUDGET_* environment variables is unset.
    """
    # NOTE(review): the import is function-local in the original; presumably
    # to keep this dependency off the other commands' import path — confirm.
    from payslip_ingest.sync.actualbudget import ActualBudgetClient, sync_meta_deposits

    engine = create_engine_from_env()
    session_factory = make_session_factory(engine)
    env = os.environ
    client = ActualBudgetClient(
        base_url=env["ACTUALBUDGET_HTTP_API_URL"],
        api_key=env["ACTUALBUDGET_API_KEY"],
        encryption_password=env["ACTUALBUDGET_ENCRYPTION_PASSWORD"],
        budget_sync_id=env["ACTUALBUDGET_BUDGET_SYNC_ID"],
    )
    try:
        outcome = await sync_meta_deposits(client, session_factory)
        summary = (f"sync complete: accounts={outcome.accounts_scanned} "
                   f"transactions={outcome.transactions_fetched} "
                   f"meta_matched={outcome.meta_deposits_matched} "
                   f"inserted={outcome.inserted} existing={outcome.skipped_existing}")
        click.echo(summary)
    finally:
        await client.aclose()
        await engine.dispose()


@cli.command("backfill-cash-tax")
@click.option("--limit", type=int, default=None, help="Cap the number of rows processed.")
def backfill_cash_tax(limit: int | None) -> None:
    """Back-fill cash_income_tax on rows where it's NULL (vest months only).

    Uses the widened regex parser first; falls back to Claude. Writes the
    provenance source into `cash_income_tax_source`. Idempotent — only
    touches NULL rows.
    """
    # Synchronous click entry point: drive the async implementation to completion.
    backfill_job = _backfill_cash_tax(limit)
    asyncio.run(backfill_job)


async def _backfill_cash_tax(limit: int | None) -> None:
    """Run the cash_income_tax back-fill and print a one-line summary.

    Builds the DB engine, a Paperless client and a Claude extractor from the
    environment, then delegates to ``backfill_cash_income_tax``. Both clients
    are closed and the engine disposed whether or not the back-fill succeeds.

    Args:
        limit: Forwarded to ``backfill_cash_income_tax`` as its ``limit``
            keyword; ``None`` when the CLI option was not given.

    Raises:
        KeyError: If PAPERLESS_URL, PAPERLESS_API_TOKEN, CLAUDE_AGENT_URL or
            CLAUDE_AGENT_BEARER_TOKEN is missing from the environment.
    """
    # NOTE(review): the import is function-local in the original; presumably
    # to keep this dependency off the other commands' import path — confirm.
    from payslip_ingest.backfill_cash_tax import backfill_cash_income_tax

    engine = create_engine_from_env()
    session_factory = make_session_factory(engine)
    env = os.environ
    paperless = PaperlessClient(
        base_url=env["PAPERLESS_URL"],
        api_token=env["PAPERLESS_API_TOKEN"],
    )
    extractor = ClaudeExtractor(
        base_url=env["CLAUDE_AGENT_URL"],
        bearer_token=env["CLAUDE_AGENT_BEARER_TOKEN"],
    )
    try:
        outcome = await backfill_cash_income_tax(
            session_factory, paperless, extractor, limit=limit
        )
        summary = (f"back-fill complete: processed={outcome.processed} "
                   f"regex={outcome.regex_hits} claude={outcome.claude_hits} "
                   f"fallback_null={outcome.fallback_null} errors={outcome.errors}")
        click.echo(summary)
    finally:
        await paperless.aclose()
        await extractor.aclose()
        await engine.dispose()


# Script entry point: when this module is executed directly, hand control to
# the click group, which parses argv and dispatches to the chosen subcommand.
if __name__ == "__main__":
    cli()
Initial commit: event-driven UK payslip ingest service Extracted from /home/wizard/code monorepo into its own repo so Woodpecker CI can watch it. Identical content to /home/wizard/code commit e426028. See README.md for overview, env vars, and Paperless workflow config. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-18 22:10:23 +00:00			`import asyncio`
			`import json`
			`import logging`
			`import os`
			`import subprocess`
			`import sys`
			`from pathlib import Path`

			`import click`
			`import uvicorn`

			`from payslip_ingest.db import create_engine_from_env, make_session_factory`
			`from payslip_ingest.extractor import ClaudeExtractor`
			`from payslip_ingest.paperless import PaperlessClient`
			`from payslip_ingest.processor import process_document`
			`from payslip_ingest.schema import validate_totals`

			`log = logging.getLogger(__name__)`


			`@click.group()`
			`def cli() -> None:`
			`logging.basicConfig(level=os.environ.get("LOG_LEVEL", "INFO"))`


			`@cli.command()`
			`def serve() -> None:`
			`"""Run the webhook HTTP server (K8s entrypoint)."""`
			`uvicorn.run("payslip_ingest.app:app", host="0.0.0.0", port=8080)`


			`@cli.command()`
			`@click.option("--all", "process_all", is_flag=True, help="Process every payslip-tagged doc.")`
			`@click.option("--limit", type=int, default=None, help="Cap the number of documents processed.")`
			`@click.option("--tag", default="payslip", help="Paperless tag name to enumerate.")`
			`def backfill(process_all: bool, limit: int \| None, tag: str) -> None:`
			`"""Enumerate every payslip-tagged Paperless doc and process sequentially."""`
			`if not process_all:`
			`raise click.UsageError("pass --all to opt in to the full enumeration")`
			`asyncio.run(_backfill(tag, limit))`


			`async def _backfill(tag: str, limit: int \| None) -> None:`
			`engine = create_engine_from_env()`
			`session_factory = make_session_factory(engine)`
			`paperless = PaperlessClient(`
			`base_url=os.environ["PAPERLESS_URL"],`
			`api_token=os.environ["PAPERLESS_API_TOKEN"],`
			`)`
			`extractor = ClaudeExtractor(`
			`base_url=os.environ["CLAUDE_AGENT_URL"],`
			`bearer_token=os.environ["CLAUDE_AGENT_BEARER_TOKEN"],`
			`)`
parser + P60 ingest: split income_tax cash/RSU, add P60 ground-truth Meta variant-B payslips gross up Taxable Pay for RSU and compute PAYE on the grossed-up figure, so `income_tax` on the slip is the total PAYE (cash + RSU-attributed). Dashboards that stacked the raw figure made vest-month tax look ~2x higher than "cash tax paid". Introduce `cash_income_tax = income_tax * (gross_pay - pension_sacrifice) / taxable_pay` as a derived column alongside the raw figure. Dashboards can now stack cash vs RSU-attributed tax as separate segments. Also capture YTD column values of `RSU Tax Offset` and `RSU Excs Refund` from the Payments grid — needed for reconciliation against HMRC annual figures. P60 ingest: new parser under `parsers/p60.py` anchoring on statutory HMRC line labels (`Tax year to 5 April YYYY`, `Employer PAYE reference`, `In this employment` pay/tax row, NI letter bands). Processor routes documents carrying the `p60` Paperless tag to `_handle_p60` which writes to the new `payslip_ingest.p60_reference` table (one row per tax_year+employer). App lifespan resolves the tag id at startup; missing tag disables dispatch without breaking payslip ingest. Paperless tag creation + webhook config are manual follow-ups. Migrations: - 0004 — cash_income_tax + ytd_rsu_tax_offset + ytd_rsu_excs_refund on payslip, all nullable. - 0005 — p60_reference table with (tax_year, employer) unique + paperless_doc_id unique for idempotent re-uploads. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-19 15:23:05 +00:00			`# Resolve the P60 tag if present — needed for the dispatch branch even`
			`# when backfilling a non-p60 tag (a P60-tagged doc carrying the payslip`
			`# tag too should still route to the P60 handler).`
			`p60_tag_id: int \| None = None`
			`try:`
			`p60_tag_id = await paperless.get_tag_id("p60")`
			`except Exception as exc:`
			`click.echo(f"warning: p60 tag resolution failed — dispatch disabled: {exc}", err=True)`
Initial commit: event-driven UK payslip ingest service Extracted from /home/wizard/code monorepo into its own repo so Woodpecker CI can watch it. Identical content to /home/wizard/code commit e426028. See README.md for overview, env vars, and Paperless workflow config. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-18 22:10:23 +00:00			`processed = 0`
backfill: continue on per-document errors instead of aborting A single doc that isn't a real payslip (e.g., an RSU letter wrongly tagged as payslip in Paperless) makes Claude return pay_date=null, which pydantic rejects with ValidationError. Previously this killed the whole backfill at the first bad doc, leaving 60 of 88 docs unprocessed. Catch + log + continue so the backfill processes every doc. Failed docs can be re-tagged or fixed individually later. 2026-04-18 23:25:36 +00:00			`failed = 0`
Initial commit: event-driven UK payslip ingest service Extracted from /home/wizard/code monorepo into its own repo so Woodpecker CI can watch it. Identical content to /home/wizard/code commit e426028. See README.md for overview, env vars, and Paperless workflow config. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-18 22:10:23 +00:00			`try:`
			`async for doc in paperless.list_tagged_documents(tag):`
			`if limit is not None and processed >= limit:`
			`break`
			`doc_id = int(doc["id"])`
backfill: continue on per-document errors instead of aborting A single doc that isn't a real payslip (e.g., an RSU letter wrongly tagged as payslip in Paperless) makes Claude return pay_date=null, which pydantic rejects with ValidationError. Previously this killed the whole backfill at the first bad doc, leaving 60 of 88 docs unprocessed. Catch + log + continue so the backfill processes every doc. Failed docs can be re-tagged or fixed individually later. 2026-04-18 23:25:36 +00:00			`try:`
parser + P60 ingest: split income_tax cash/RSU, add P60 ground-truth Meta variant-B payslips gross up Taxable Pay for RSU and compute PAYE on the grossed-up figure, so `income_tax` on the slip is the total PAYE (cash + RSU-attributed). Dashboards that stacked the raw figure made vest-month tax look ~2x higher than "cash tax paid". Introduce `cash_income_tax = income_tax * (gross_pay - pension_sacrifice) / taxable_pay` as a derived column alongside the raw figure. Dashboards can now stack cash vs RSU-attributed tax as separate segments. Also capture YTD column values of `RSU Tax Offset` and `RSU Excs Refund` from the Payments grid — needed for reconciliation against HMRC annual figures. P60 ingest: new parser under `parsers/p60.py` anchoring on statutory HMRC line labels (`Tax year to 5 April YYYY`, `Employer PAYE reference`, `In this employment` pay/tax row, NI letter bands). Processor routes documents carrying the `p60` Paperless tag to `_handle_p60` which writes to the new `payslip_ingest.p60_reference` table (one row per tax_year+employer). App lifespan resolves the tag id at startup; missing tag disables dispatch without breaking payslip ingest. Paperless tag creation + webhook config are manual follow-ups. Migrations: - 0004 — cash_income_tax + ytd_rsu_tax_offset + ytd_rsu_excs_refund on payslip, all nullable. - 0005 — p60_reference table with (tax_year, employer) unique + paperless_doc_id unique for idempotent re-uploads. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-19 15:23:05 +00:00			`result = await process_document(doc_id, session_factory, paperless, extractor,`
			`p60_tag_id)`
backfill: continue on per-document errors instead of aborting A single doc that isn't a real payslip (e.g., an RSU letter wrongly tagged as payslip in Paperless) makes Claude return pay_date=null, which pydantic rejects with ValidationError. Previously this killed the whole backfill at the first bad doc, leaving 60 of 88 docs unprocessed. Catch + log + continue so the backfill processes every doc. Failed docs can be re-tagged or fixed individually later. 2026-04-18 23:25:36 +00:00			`click.echo(f"doc_id={doc_id} status={result.status} validated={result.validated}")`
			`except Exception as exc:`
			`# Don't let a single bad doc (wrong tag, non-payslip PDF, Claude`
			`# hallucinating null fields) abort the whole backfill. Log + continue.`
			`failed += 1`
			`click.echo(f"doc_id={doc_id} status=failed error={type(exc).__name__}: {exc}",`
			`err=True)`
			`log.exception("backfill: doc_id=%s failed", doc_id)`
Initial commit: event-driven UK payslip ingest service Extracted from /home/wizard/code monorepo into its own repo so Woodpecker CI can watch it. Identical content to /home/wizard/code commit e426028. See README.md for overview, env vars, and Paperless workflow config. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-18 22:10:23 +00:00			`processed += 1`
backfill: continue on per-document errors instead of aborting A single doc that isn't a real payslip (e.g., an RSU letter wrongly tagged as payslip in Paperless) makes Claude return pay_date=null, which pydantic rejects with ValidationError. Previously this killed the whole backfill at the first bad doc, leaving 60 of 88 docs unprocessed. Catch + log + continue so the backfill processes every doc. Failed docs can be re-tagged or fixed individually later. 2026-04-18 23:25:36 +00:00			`click.echo(f"backfill complete: processed={processed} failed={failed}")`
Initial commit: event-driven UK payslip ingest service Extracted from /home/wizard/code monorepo into its own repo so Woodpecker CI can watch it. Identical content to /home/wizard/code commit e426028. See README.md for overview, env vars, and Paperless workflow config. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-18 22:10:23 +00:00			`finally:`
			`await paperless.aclose()`
			`await extractor.aclose()`
			`await engine.dispose()`


			`@cli.command("extract-one")`
			`@click.argument("path", type=click.Path(exists=True, dir_okay=False, path_type=Path))`
			`def extract_one(path: Path) -> None:`
			`"""Smoke-test extraction on a local PDF — no DB writes."""`
			`asyncio.run(_extract_one(path))`


			`async def _extract_one(path: Path) -> None:`
			`pdf_bytes = path.read_bytes()`
			`extractor = ClaudeExtractor(`
			`base_url=os.environ["CLAUDE_AGENT_URL"],`
			`bearer_token=os.environ["CLAUDE_AGENT_BEARER_TOKEN"],`
			`)`
			`try:`
			`extracted = await extractor.extract(pdf_bytes, {"id": None, "source": str(path)})`
			`finally:`
			`await extractor.aclose()`
			`click.echo(extracted.model_dump_json(indent=2))`
			`ok = validate_totals(extracted)`
			`click.echo(json.dumps({"totals_validated": ok}))`
			`if not ok:`
			`sys.exit(1)`


			`@cli.command()`
			`def migrate() -> None:`
			"""Run `alembic upgrade head`."""
			`result = subprocess.run(["alembic", "upgrade", "head"], check=False)`
			`sys.exit(result.returncode)`


sync: ActualBudget Meta deposit overlay (Phase C) Adds daily sync of Meta payroll deposits from ActualBudget into payslip_ingest.external_meta_deposits, enabling the dashboard to overlay bank deposits against payslip net_pay and surface parser drift on net. - Migration 0007: new table external_meta_deposits, unique on actualbudget_tx_id, indexed on deposit_date. - payslip_ingest.sync.actualbudget: narrow client for the jhonderson/actual-http-api sidecar (list accounts + transactions). Filters on payee regex (META\|FACEBOOK, word-boundary). Idempotent upsert — ON CONFLICT DO NOTHING on actualbudget_tx_id. Surfaces clear error if the transactions endpoint is missing so the operator can switch to a SQLite-mount fallback. - CLI command: `python -m payslip_ingest sync-meta-deposits` driven by 4 env vars (ACTUALBUDGET_HTTP_API_URL, API_KEY, ENCRYPTION_PASSWORD, BUDGET_SYNC_ID). - Tests: 5 — regex positive/negative, full sync insert, idempotency, 404-endpoint failure mode. Part of: code-860 2026-04-19 18:20:50 +00:00			`@cli.command("sync-meta-deposits")`
			`def sync_meta_deposits_cmd() -> None:`
			`"""Pull Meta payroll deposits from ActualBudget into external_meta_deposits.`

			`Reads from the jhonderson/actual-http-api sidecar. Requires env vars:`
			`ACTUALBUDGET_HTTP_API_URL, ACTUALBUDGET_API_KEY,`
			`ACTUALBUDGET_ENCRYPTION_PASSWORD, ACTUALBUDGET_BUDGET_SYNC_ID.`
			`"""`
			`asyncio.run(_sync_meta_deposits())`


			`async def _sync_meta_deposits() -> None:`
			`from payslip_ingest.sync.actualbudget import ActualBudgetClient, sync_meta_deposits`

			`engine = create_engine_from_env()`
			`session_factory = make_session_factory(engine)`
			`client = ActualBudgetClient(`
			`base_url=os.environ["ACTUALBUDGET_HTTP_API_URL"],`
			`api_key=os.environ["ACTUALBUDGET_API_KEY"],`
			`encryption_password=os.environ["ACTUALBUDGET_ENCRYPTION_PASSWORD"],`
			`budget_sync_id=os.environ["ACTUALBUDGET_BUDGET_SYNC_ID"],`
			`)`
			`try:`
			`result = await sync_meta_deposits(client, session_factory)`
			`click.echo(f"sync complete: accounts={result.accounts_scanned} "`
			`f"transactions={result.transactions_fetched} "`
			`f"meta_matched={result.meta_deposits_matched} "`
			`f"inserted={result.inserted} existing={result.skipped_existing}")`
			`finally:`
			`await client.aclose()`
			`await engine.dispose()`


backfill: cash_income_tax back-fill for variant-A NULL rows Phase B of RSU tax spike fix. Vest-month spikes on the dashboard trace to variant-A slips (2019–mid-2022) where `cash_income_tax` is NULL — the dashboard's COALESCE fallback returns full PAYE, masquerading as cash tax. Three changes: 1. Widen variant-A Taxable Pay regex. Original pattern only matched `Taxable Pay : This Period £...`; add case-insensitive variants that tolerate missing/different colons, elided "This", and uppercase labels. Covers older 2019-2020 templates that failed the previous match. 2. New `backfill_cash_income_tax` module — walks every NULL-cash-tax row with rsu_vest > 0, re-downloads the PDF from Paperless, runs the widened regex parser, falls back to Claude for taxable_pay extraction if regex still misses, and derives cash_income_tax pro-rata. Records provenance in new `cash_income_tax_source` column (regex/claude/ fallback_null). Idempotent — only touches NULL rows. 3. Migration 0006 adds the `cash_income_tax_source` audit column. CLI: `python -m payslip_ingest backfill-cash-tax [--limit N]`. Meant to run as a one-shot K8s Job after `alembic upgrade head`. Part of: code-860 2026-04-19 18:15:18 +00:00			`@cli.command("backfill-cash-tax")`
			`@click.option("--limit", type=int, default=None, help="Cap the number of rows processed.")`
			`def backfill_cash_tax(limit: int \| None) -> None:`
			`"""Back-fill cash_income_tax on rows where it's NULL (vest months only).`

			`Uses the widened regex parser first; falls back to Claude. Writes the`
			provenance source into `cash_income_tax_source`. Idempotent — only
			`touches NULL rows.`
			`"""`
			`asyncio.run(_backfill_cash_tax(limit))`


			`async def _backfill_cash_tax(limit: int \| None) -> None:`
			`from payslip_ingest.backfill_cash_tax import backfill_cash_income_tax`

			`engine = create_engine_from_env()`
			`session_factory = make_session_factory(engine)`
			`paperless = PaperlessClient(`
			`base_url=os.environ["PAPERLESS_URL"],`
			`api_token=os.environ["PAPERLESS_API_TOKEN"],`
			`)`
			`extractor = ClaudeExtractor(`
			`base_url=os.environ["CLAUDE_AGENT_URL"],`
			`bearer_token=os.environ["CLAUDE_AGENT_BEARER_TOKEN"],`
			`)`
			`try:`
			`result = await backfill_cash_income_tax(session_factory, paperless, extractor, limit=limit)`
			`click.echo(f"back-fill complete: processed={result.processed} "`
			`f"regex={result.regex_hits} claude={result.claude_hits} "`
			`f"fallback_null={result.fallback_null} errors={result.errors}")`
			`finally:`
			`await paperless.aclose()`
			`await extractor.aclose()`
			`await engine.dispose()`


Initial commit: event-driven UK payslip ingest service Extracted from /home/wizard/code monorepo into its own repo so Woodpecker CI can watch it. Identical content to /home/wizard/code commit e426028. See README.md for overview, env vars, and Paperless workflow config. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-18 22:10:23 +00:00			`if __name__ == "__main__":`
			`cli()`