payslip-ingest/payslip_ingest/app.py
Viktor Barzin 57484619c1 Initial commit: event-driven UK payslip ingest service
Extracted from /home/wizard/code monorepo into its own repo so Woodpecker CI
can watch it. Identical content to /home/wizard/code commit e426028.

See README.md for overview, env vars, and Paperless workflow config.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 22:10:23 +00:00

118 lines
3.9 KiB
Python

import asyncio
import contextlib
import hmac
import logging
import os
from collections.abc import AsyncIterator, Awaitable, Callable
from contextlib import asynccontextmanager
from typing import Any
from fastapi import FastAPI, Header, HTTPException, status
from prometheus_fastapi_instrumentator import Instrumentator
from sqlalchemy.ext.asyncio import async_sessionmaker
from payslip_ingest.db import create_engine_from_env, make_session_factory
from payslip_ingest.extractor import ClaudeExtractor
from payslip_ingest.paperless import PaperlessClient
from payslip_ingest.processor import process_document
from payslip_ingest.schema import WebhookPayload
# Module-level logger, named after this module.
log: logging.Logger = logging.getLogger(__name__)

# Environment variables the service cannot start without. ``_verify_env``
# treats a variable that is unset or set to the empty string as missing.
REQUIRED_ENV: list[str] = [
    "PAPERLESS_URL",
    "PAPERLESS_API_TOKEN",
    "CLAUDE_AGENT_URL",
    "CLAUDE_AGENT_BEARER_TOKEN",
    "DB_CONNECTION_STRING",
    "WEBHOOK_BEARER_TOKEN",
]
# Signature of the per-document processing coroutine run by the background
# worker: (document id, session factory, Paperless client, extractor) -> awaitable.
# Naming the shape makes it explicit what a test replacement must look like.
ProcessorFn = Callable[
    [
        int,
        async_sessionmaker[Any],
        PaperlessClient,
        ClaudeExtractor,
    ],
    Awaitable[Any],
]
def _verify_env() -> None:
    """Fail fast when a variable listed in ``REQUIRED_ENV`` is unset or empty.

    Raises:
        RuntimeError: naming every missing variable, in ``REQUIRED_ENV`` order.
    """
    absent: list[str] = []
    for name in REQUIRED_ENV:
        # An empty string counts as missing, same as an unset variable.
        if not os.environ.get(name):
            absent.append(name)
    if not absent:
        return
    raise RuntimeError(f"Missing required env vars: {', '.join(absent)}")
def _verify_bearer(authorization: str | None, expected: str) -> None:
if not expected:
raise HTTPException(status_code=401, detail="Service unauthenticated")
if not authorization or not authorization.startswith("Bearer "):
raise HTTPException(status_code=401, detail="Missing bearer token")
token = authorization.removeprefix("Bearer ")
if not hmac.compare_digest(token, expected):
raise HTTPException(status_code=401, detail="Invalid token")
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Create the service's shared resources on startup and release them on shutdown.

    Startup validates the environment, then builds the database engine and
    session factory, the Paperless client, the Claude extractor, and an
    unbounded in-memory queue of document ids drained by a single background
    task. The queue, session factory and both clients are published on
    ``app.state``.

    Shutdown cancels the worker, closes both clients and disposes the engine.
    Document ids still sitting in the queue at that point are not processed.

    Args:
        app: The application whose ``state`` receives the shared resources.
            If ``app.state.processor_fn`` is set, it is used in place of
            ``process_document``.

    Raises:
        RuntimeError: from ``_verify_env`` when a required variable is missing.
    """
    _verify_env()
    engine = create_engine_from_env()
    session_factory = make_session_factory(engine)
    paperless = PaperlessClient(
        base_url=os.environ["PAPERLESS_URL"],
        api_token=os.environ["PAPERLESS_API_TOKEN"],
    )
    extractor = ClaudeExtractor(
        base_url=os.environ["CLAUDE_AGENT_URL"],
        bearer_token=os.environ["CLAUDE_AGENT_BEARER_TOKEN"],
    )
    queue: asyncio.Queue[int] = asyncio.Queue()
    # Read the override with getattr(): Starlette's State keeps attributes
    # assigned as ``app.state.x = ...`` in an internal mapping, not in the
    # instance ``__dict__``, so a ``__dict__`` lookup would never see them.
    # getattr() still finds values placed directly in ``__dict__``.
    processor: ProcessorFn = getattr(app.state, "processor_fn", process_document)

    async def worker() -> None:
        """Process queued document ids one at a time until cancelled."""
        while True:
            doc_id = await queue.get()
            try:
                await processor(doc_id, session_factory, paperless, extractor)
            except Exception:
                # A failing document must not stop the worker: log and go on.
                log.exception("processing failed for doc_id=%s", doc_id)
            finally:
                queue.task_done()

    worker_task = asyncio.create_task(worker())
    app.state.queue = queue
    app.state.session_factory = session_factory
    app.state.paperless = paperless
    app.state.extractor = extractor
    try:
        yield
    finally:
        worker_task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await worker_task
        # Chain the closes so that a failure in one still releases the rest.
        try:
            await paperless.aclose()
        finally:
            try:
                await extractor.aclose()
            finally:
                await engine.dispose()
# The ASGI application; ``lifespan`` sets up and tears down its shared resources.
app = FastAPI(
    title="Payslip Ingest",
    lifespan=lifespan,
)

# Instrument the app and expose the collected metrics at /metrics.
Instrumentator().instrument(app).expose(
    app,
    endpoint="/metrics",
)
@app.post("/webhook", status_code=status.HTTP_202_ACCEPTED)
async def webhook(
    payload: WebhookPayload,
    authorization: str | None = Header(default=None),
) -> dict[str, Any]:
    """Authenticate the caller and enqueue the referenced document.

    The request is only acknowledged here (HTTP 202); the document is
    processed later by the background worker that drains ``app.state.queue``.

    Raises:
        HTTPException: 401 from ``_verify_bearer`` when the ``Authorization``
            header does not carry the token in ``WEBHOOK_BEARER_TOKEN``.
    """
    expected_token = os.environ.get("WEBHOOK_BEARER_TOKEN", "")
    _verify_bearer(authorization, expected_token)

    pending: asyncio.Queue[int] = app.state.queue
    document_id = payload.document_id
    await pending.put(document_id)

    return {"status": "accepted", "document_id": document_id}
@app.get("/healthz")
async def healthz() -> dict[str, Any]:
    """Report liveness together with the number of queued document ids.

    The depth is reported as 0 when no queue is attached to ``app.state``.
    """
    pending: asyncio.Queue[int] | None = getattr(app.state, "queue", None)
    if pending is None:
        depth = 0
    else:
        depth = pending.qsize()
    return {"status": "ok", "queue_depth": depth}