Initial commit: event-driven UK payslip ingest service

Extracted from /home/wizard/code monorepo into its own repo so Woodpecker CI
can watch it. Identical content to /home/wizard/code commit e426028.

See README.md for overview, env vars, and Paperless workflow config.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-04-18 22:10:23 +00:00
commit 57484619c1
27 changed files with 2878 additions and 0 deletions

0
tests/__init__.py Normal file
View file

8
tests/conftest.py Normal file
View file

@ -0,0 +1,8 @@
import os
# Baseline settings for the test session. ``setdefault`` leaves any value that
# is already present in the environment untouched.
_TEST_ENV_DEFAULTS = {
    "PAPERLESS_URL": "http://paperless.test",
    "PAPERLESS_API_TOKEN": "test-paperless-token",
    "CLAUDE_AGENT_URL": "http://agent.test",
    "CLAUDE_AGENT_BEARER_TOKEN": "test-agent-token",
    "DB_CONNECTION_STRING": "sqlite+aiosqlite:///:memory:",
    "WEBHOOK_BEARER_TOKEN": "test-webhook-token",
}
for _name, _fallback in _TEST_ENV_DEFAULTS.items():
    os.environ.setdefault(_name, _fallback)

138
tests/test_extractor.py Normal file
View file

@ -0,0 +1,138 @@
import json
import httpx
import pytest
import respx
from payslip_ingest import extractor as extractor_module
from payslip_ingest.extractor import ClaudeExtractor, ExtractorError
@pytest.fixture(autouse=True)
def _tighten_retries(monkeypatch: pytest.MonkeyPatch) -> None:
    """Patch the extractor module's timing constants for every test here.

    Poll interval and busy-retry delay become 0 and the polling budget becomes
    1 second; ``monkeypatch`` restores the original values after each test.
    """
    timing_overrides = {
        "POLL_INTERVAL_SECONDS": 0,
        "MAX_POLL_SECONDS": 1,
        "BUSY_RETRY_DELAY_SECONDS": 0,
    }
    for constant_name, test_value in timing_overrides.items():
        monkeypatch.setattr(extractor_module, constant_name, test_value)
def _sample_extraction() -> dict[str, object]:
    """Return a fresh JSON-style payslip payload (ISO date strings, float amounts)."""
    extraction: dict[str, object] = dict(
        pay_date="2026-03-28",
        pay_period_start="2026-03-01",
        pay_period_end="2026-03-31",
        employer="Acme Ltd",
        currency="GBP",
        gross_pay=5000.0,
        income_tax=800.0,
        national_insurance=350.0,
        pension_employee=250.0,
        pension_employer=150.0,
        student_loan=100.0,
    )
    # Appended last so the key order matches the payload layout used elsewhere.
    extraction["other_deductions"] = dict(cycle_to_work=50.0)
    extraction["net_pay"] = 3450.0
    return extraction
def _agent_output(payload: dict[str, object]) -> list[str]:
    """Simulate claude CLI --output-format json stdout.

    Produces three newline-terminated JSON lines: a system/init event, an
    assistant message whose text is the JSON-encoded ``payload``, and a final
    result event carrying the same JSON-encoded ``payload``.
    """
    encoded_payload = json.dumps(payload)
    events = (
        {"type": "system", "subtype": "init"},
        {
            "type": "assistant",
            "message": {"content": [{"type": "text", "text": encoded_payload}]},
        },
        {"type": "result", "result": encoded_payload},
    )
    return [json.dumps(event) + "\n" for event in events]
@pytest.fixture()
def client() -> ClaudeExtractor:
    """Provide a ``ClaudeExtractor`` pointed at the mocked agent host."""
    agent_client = ClaudeExtractor(bearer_token="tok", base_url="http://agent.test")
    return agent_client
async def test_extract_happy_path(client: ClaudeExtractor) -> None:
    """A job that is accepted and then reported completed yields the parsed payslip."""
    expected = _sample_extraction()
    job_accepted = httpx.Response(202, json={"job_id": "abc123", "status": "running"})
    job_completed = httpx.Response(
        200,
        json={"status": "completed", "output": _agent_output(expected)},
    )
    with respx.mock(base_url="http://agent.test") as router:
        router.post("/execute").mock(return_value=job_accepted)
        router.get("/jobs/abc123").mock(return_value=job_completed)
        payslip = await client.extract(b"PDFDATA", {"id": 42})
        assert float(payslip.gross_pay) == 5000.0
        assert payslip.employer == "Acme Ltd"
async def test_extract_retries_on_409(client: ClaudeExtractor) -> None:
    """A 409 from /execute is retried; the second submission is accepted."""
    expected = _sample_extraction()
    busy_then_accepted = [
        httpx.Response(409, json={"detail": "busy"}),
        httpx.Response(202, json={"job_id": "abc123"}),
    ]
    job_completed = httpx.Response(
        200,
        json={"status": "completed", "output": _agent_output(expected)},
    )
    with respx.mock(base_url="http://agent.test") as router:
        execute_route = router.post("/execute")
        execute_route.side_effect = busy_then_accepted
        router.get("/jobs/abc123").mock(return_value=job_completed)
        payslip = await client.extract(b"PDFDATA", {"id": 42})
        assert payslip.net_pay.is_finite()
        # Checked while the router is still active so the call stats are intact.
        assert execute_route.call_count == 2
async def test_extract_polling_timeout_raises(client: ClaudeExtractor) -> None:
    """A job that is always reported as "running" makes ``extract`` raise TimeoutError."""
    job_accepted = httpx.Response(202, json={"job_id": "abc123"})
    still_running = httpx.Response(200, json={"status": "running", "output": []})
    with respx.mock(base_url="http://agent.test") as router:
        router.post("/execute").mock(return_value=job_accepted)
        router.get("/jobs/abc123").mock(return_value=still_running)
        with pytest.raises(TimeoutError):
            await client.extract(b"PDFDATA", {"id": 42})
async def test_extract_malformed_json_raises(client: ClaudeExtractor) -> None:
    """Completed output that contains no JSON lines is reported as ExtractorError."""
    garbage_lines = ["this is not json\n", "still not json\n"]
    job_accepted = httpx.Response(202, json={"job_id": "abc123"})
    job_completed = httpx.Response(
        200,
        json={"status": "completed", "output": garbage_lines},
    )
    with respx.mock(base_url="http://agent.test") as router:
        router.post("/execute").mock(return_value=job_accepted)
        router.get("/jobs/abc123").mock(return_value=job_completed)
        with pytest.raises(ExtractorError):
            await client.extract(b"PDFDATA", {"id": 42})
async def test_extract_failed_status_raises(client: ClaudeExtractor) -> None:
    """A job reported with status "failed" is surfaced as ExtractorError."""
    job_accepted = httpx.Response(202, json={"job_id": "abc123"})
    job_failed = httpx.Response(
        200,
        json={"status": "failed", "output": [], "exit_code": 1},
    )
    with respx.mock(base_url="http://agent.test") as router:
        router.post("/execute").mock(return_value=job_accepted)
        router.get("/jobs/abc123").mock(return_value=job_failed)
        with pytest.raises(ExtractorError):
            await client.extract(b"PDFDATA", {"id": 42})

96
tests/test_paperless.py Normal file
View file

@ -0,0 +1,96 @@
import httpx
import pytest
import respx
from payslip_ingest.paperless import PaperlessClient, PaperlessError
@pytest.fixture()
def client() -> PaperlessClient:
    """Provide a ``PaperlessClient`` pointed at the mocked Paperless host."""
    paperless_client = PaperlessClient(api_token="tok", base_url="http://paperless.test")
    return paperless_client
async def test_get_tag_id_happy_path(client: PaperlessClient) -> None:
    """Exactly one tag in the listing: its id is returned."""
    tag_listing = httpx.Response(200, json={"results": [{"id": 7, "name": "payslip"}]})
    name_filter = {"name__iexact": "payslip"}
    with respx.mock(base_url="http://paperless.test") as router:
        router.get("/api/tags/", params=name_filter).mock(return_value=tag_listing)
        tag_id = await client.get_tag_id("payslip")
        assert tag_id == 7
async def test_get_tag_id_zero_results_raises(client: PaperlessClient) -> None:
    """An empty tag listing is reported as PaperlessError."""
    empty_listing = httpx.Response(200, json={"results": []})
    with respx.mock(base_url="http://paperless.test") as router:
        router.get("/api/tags/").mock(return_value=empty_listing)
        with pytest.raises(PaperlessError):
            await client.get_tag_id("payslip")
async def test_get_tag_id_many_results_raises(client: PaperlessClient) -> None:
    """More than one tag in the listing is ambiguous and raises PaperlessError."""
    duplicate_tags = [
        {"id": 1, "name": "payslip"},
        {"id": 2, "name": "Payslip"},
    ]
    ambiguous_listing = httpx.Response(200, json={"results": duplicate_tags})
    with respx.mock(base_url="http://paperless.test") as router:
        router.get("/api/tags/").mock(return_value=ambiguous_listing)
        with pytest.raises(PaperlessError):
            await client.get_tag_id("payslip")
async def test_list_tagged_documents_paginates(client: PaperlessClient) -> None:
    """Documents from both pages are yielded in order by following ``next``."""
    first_page_url = "http://paperless.test/api/documents/?tags__id=7"
    second_page_url = "http://paperless.test/api/documents/?tags__id=7&page=2"

    tag_listing = httpx.Response(200, json={"results": [{"id": 7, "name": "payslip"}]})
    first_page = httpx.Response(
        200,
        json={"results": [{"id": 1}, {"id": 2}], "next": second_page_url},
    )
    last_page = httpx.Response(200, json={"results": [{"id": 3}], "next": None})

    with respx.mock() as router:
        router.get("http://paperless.test/api/tags/").mock(return_value=tag_listing)
        router.get(first_page_url).mock(return_value=first_page)
        router.get(second_page_url).mock(return_value=last_page)
        collected = [doc["id"] async for doc in client.list_tagged_documents("payslip")]
        assert collected == [1, 2, 3]
async def test_get_document_returns_metadata(client: PaperlessClient) -> None:
    """``get_document`` hands back the JSON body served for the document."""
    document_json = httpx.Response(200, json={"id": 42, "title": "Payslip Mar"})
    with respx.mock(base_url="http://paperless.test") as router:
        router.get("/api/documents/42/").mock(return_value=document_json)
        metadata = await client.get_document(42)
        assert metadata["title"] == "Payslip Mar"
async def test_download_document_returns_bytes(client: PaperlessClient) -> None:
    """``download_document`` returns the raw response body as bytes."""
    pdf_response = httpx.Response(200, content=b"PDFDATA")
    with respx.mock(base_url="http://paperless.test") as router:
        router.get("/api/documents/42/download/").mock(return_value=pdf_response)
        body = await client.download_document(42)
        assert body == b"PDFDATA"

127
tests/test_processor.py Normal file
View file

@ -0,0 +1,127 @@
from datetime import date
from decimal import Decimal
from typing import Any
from unittest.mock import AsyncMock, MagicMock
import pytest
from payslip_ingest.processor import process_document
from payslip_ingest.schema import ExtractedPayslip
def _sample_extraction() -> ExtractedPayslip:
    """Build a fresh ``ExtractedPayslip`` with typed dates and Decimal amounts."""
    field_values: dict[str, Any] = {
        "pay_date": date(2026, 3, 28),
        "pay_period_start": date(2026, 3, 1),
        "pay_period_end": date(2026, 3, 31),
        "employer": "Acme Ltd",
        "currency": "GBP",
        "gross_pay": Decimal("5000.00"),
        "income_tax": Decimal("800.00"),
        "national_insurance": Decimal("350.00"),
        "pension_employee": Decimal("250.00"),
        "pension_employer": Decimal("150.00"),
        "student_loan": Decimal("100.00"),
        "other_deductions": {"cycle_to_work": Decimal("50.00")},
        "net_pay": Decimal("3450.00"),
    }
    return ExtractedPayslip(**field_values)
class _FakeSession:
    """Minimal AsyncSession stand-in that records added rows and ``begin`` calls.

    The object is its own async context manager and is also what ``begin()``
    returns, so both ``async with session`` and ``async with session.begin()``
    work without a database.

    Args:
        existing_ids: Values handed out, in order, by successive ``execute``
            calls through ``result.scalar()``. Once they run out,
            ``scalar()`` returns ``None``.
    """

    def __init__(self, existing_ids: list[int]):
        # Private copy: ``execute`` drains this queue with ``pop(0)`` and must
        # not empty a list the caller still holds a reference to.
        self._existing_ids = list(existing_ids)
        self.added: list[Any] = []
        self.begin_calls = 0

    async def __aenter__(self) -> "_FakeSession":
        return self

    async def __aexit__(self, *exc: object) -> None:
        # Returning None never suppresses an exception raised in the block.
        return None

    def begin(self) -> "_FakeSession":
        """Count the call and return ``self`` as the transaction context."""
        self.begin_calls += 1
        return self

    async def execute(self, stmt: Any) -> Any:
        """Ignore ``stmt`` and return a result whose ``scalar()`` is the next queued id."""
        result = MagicMock()
        # scalar() returns None when we treat the row as missing.
        result.scalar.return_value = self._existing_ids.pop(0) if self._existing_ids else None
        return result

    def add(self, row: Any) -> None:
        """Record ``row`` and give it a fixed primary key of 1."""
        row.id = 1
        self.added.append(row)

    async def flush(self) -> None:
        """No-op; nothing is persisted."""
        return None
class _SessionFactory:
    """Callable that hands out pre-built fake sessions in the order given.

    Every session returned is also appended to ``used`` so tests can inspect
    it afterwards.
    """

    def __init__(self, sessions: list[_FakeSession]):
        self.used: list[_FakeSession] = []
        # Copied so the caller's list is left alone while this one is drained.
        self._sessions = [*sessions]

    def __call__(self) -> _FakeSession:
        next_session = self._sessions.pop(0)
        self.used.append(next_session)
        return next_session
@pytest.fixture()
def paperless() -> AsyncMock:
    """Async mock of the Paperless client with canned metadata and PDF bytes."""
    fake_client = AsyncMock()
    fake_client.download_document.return_value = b"PDFDATA"
    fake_client.get_document.return_value = {"id": 42, "title": "Payslip"}
    return fake_client
@pytest.fixture()
def extractor() -> AsyncMock:
    """Async mock of the extractor whose ``extract`` returns the sample payslip."""
    fake_extractor = AsyncMock()
    fake_extractor.extract.return_value = _sample_extraction()
    return fake_extractor
async def test_process_document_inserts_new(paperless: AsyncMock, extractor: AsyncMock) -> None:
    """An unseen document is fetched, extracted once, and stored as validated."""
    no_existing_row = _FakeSession(existing_ids=[])
    insert_session = _FakeSession(existing_ids=[])
    session_factory = _SessionFactory([no_existing_row, insert_session])

    outcome = await process_document(42, session_factory, paperless, extractor)

    assert outcome.status == "inserted"
    assert outcome.validated is True
    paperless.get_document.assert_awaited_once_with(42)
    paperless.download_document.assert_awaited_once_with(42)
    extractor.extract.assert_awaited_once()
    # The row is expected on the second session the factory handed out.
    stored_row = session_factory.used[1].added[0]
    assert stored_row.paperless_doc_id == 42
    assert stored_row.tax_year == "2025/26"
async def test_process_document_skips_existing(paperless: AsyncMock, extractor: AsyncMock) -> None:
    """When the lookup returns an id, nothing is fetched or extracted."""
    row_already_present = _FakeSession(existing_ids=[99])
    session_factory = _SessionFactory([row_already_present])

    outcome = await process_document(42, session_factory, paperless, extractor)

    assert outcome.status == "skipped"
    paperless.get_document.assert_not_called()
    extractor.extract.assert_not_called()
async def test_process_document_flags_validation_failure(
    paperless: AsyncMock,
    extractor: AsyncMock,
) -> None:
    """A payslip with an altered net pay is still inserted, but with validated=False."""
    mismatched_fields = {
        **_sample_extraction().model_dump(),
        "net_pay": Decimal("9999.00"),
    }
    extractor.extract.return_value = ExtractedPayslip.model_validate(mismatched_fields)
    session_factory = _SessionFactory(
        [_FakeSession(existing_ids=[]), _FakeSession(existing_ids=[])]
    )

    outcome = await process_document(42, session_factory, paperless, extractor)

    assert outcome.status == "inserted"
    assert outcome.validated is False
    stored_row = session_factory.used[1].added[0]
    assert stored_row.validated is False

52
tests/test_schema.py Normal file
View file

@ -0,0 +1,52 @@
from decimal import Decimal
import pytest
from pydantic import ValidationError
from payslip_ingest.schema import ExtractedPayslip, validate_totals
def _sample_payload() -> dict[str, object]:
    """Return a fresh raw payload with ISO date strings and amounts as decimal strings."""
    payload: dict[str, object] = dict(
        pay_date="2026-03-28",
        pay_period_start="2026-03-01",
        pay_period_end="2026-03-31",
        employer="Acme Ltd",
        currency="GBP",
        gross_pay="5000.00",
        income_tax="800.00",
        national_insurance="350.00",
        pension_employee="250.00",
        pension_employer="150.00",
        student_loan="100.00",
    )
    payload["other_deductions"] = dict(cycle_to_work="50.00")
    payload["net_pay"] = "3450.00"
    return payload
def test_schema_accepts_realistic_payload() -> None:
    """String amounts in the sample payload are parsed into Decimal fields."""
    parsed = ExtractedPayslip.model_validate(_sample_payload())
    expected_deductions = {"cycle_to_work": Decimal("50.00")}
    assert parsed.employer == "Acme Ltd"
    assert parsed.gross_pay == Decimal("5000.00")
    assert parsed.other_deductions == expected_deductions
def test_schema_rejects_extra_fields() -> None:
    """A key the schema does not declare causes a ValidationError."""
    payload_with_extra = {**_sample_payload(), "bonus_field": "not allowed"}
    with pytest.raises(ValidationError):
        ExtractedPayslip.model_validate(payload_with_extra)
def test_validate_totals_true_for_matched_numbers() -> None:
    """The unmodified sample payload passes ``validate_totals``."""
    payslip = ExtractedPayslip.model_validate(_sample_payload())
    totals_ok = validate_totals(payslip)
    assert totals_ok is True
def test_validate_totals_false_for_mismatch() -> None:
    """Changing only ``net_pay`` makes ``validate_totals`` return False."""
    mismatched = {**_sample_payload(), "net_pay": "4000.00"}
    payslip = ExtractedPayslip.model_validate(mismatched)
    totals_ok = validate_totals(payslip)
    assert totals_ok is False

22
tests/test_tax_year.py Normal file
View file

@ -0,0 +1,22 @@
from datetime import date
import pytest
from payslip_ingest.tax_year import derive_tax_year
# (pay_date, expected label) pairs. The label changes between 5 April and
# 6 April of each year; mid-year and new-year dates stay in the same label.
_TAX_YEAR_CASES = [
    (date(2025, 4, 5), "2024/25"),
    (date(2025, 4, 6), "2025/26"),
    (date(2026, 4, 5), "2025/26"),
    (date(2026, 4, 6), "2026/27"),
    (date(2026, 12, 31), "2026/27"),
    (date(2027, 1, 1), "2026/27"),
    (date(2027, 4, 5), "2026/27"),
    (date(2027, 4, 6), "2027/28"),
]


@pytest.mark.parametrize(("pay_date", "expected"), _TAX_YEAR_CASES)
def test_derive_tax_year(pay_date: date, expected: str) -> None:
    """``derive_tax_year`` maps each pay date to its "YYYY/YY" label."""
    label = derive_tax_year(pay_date)
    assert label == expected

111
tests/test_webhook.py Normal file
View file

@ -0,0 +1,111 @@
import asyncio
import contextlib
import os
from collections.abc import AsyncIterator, Iterator
from contextlib import asynccontextmanager
import pytest
from fastapi import FastAPI, Header, HTTPException, status
from fastapi.testclient import TestClient
from payslip_ingest.app import _verify_bearer
from payslip_ingest.schema import WebhookPayload
def _build_app() -> tuple[FastAPI, list[int]]:
    """Assemble a stripped-down FastAPI app that imitates the real /webhook route.

    The route is re-declared here instead of importing the production app so
    the tests do not boot the SQLAlchemy / httpx clients that the real
    `lifespan` constructs on startup.

    Returns:
        The app, plus the list to which the background worker appends every
        document id it takes off the queue.
    """
    processed: list[int] = []

    @asynccontextmanager
    async def lifespan(app: FastAPI) -> AsyncIterator[None]:
        # Startup: publish a fresh queue on app.state and start draining it.
        pending: asyncio.Queue[int] = asyncio.Queue()
        app.state.queue = pending

        async def worker() -> None:
            while True:
                processed.append(await pending.get())
                pending.task_done()

        consumer = asyncio.create_task(worker())
        try:
            yield
        finally:
            # Shutdown: cancel the worker and absorb the resulting cancellation.
            consumer.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await consumer

    app = FastAPI(lifespan=lifespan)

    @app.post("/webhook", status_code=status.HTTP_202_ACCEPTED)
    async def webhook(
        payload: WebhookPayload,
        authorization: str | None = Header(default=None),
    ) -> dict[str, object]:
        # Check the bearer token before anything is enqueued.
        expected_token = os.environ.get("WEBHOOK_BEARER_TOKEN", "")
        _verify_bearer(authorization, expected_token)
        pending: asyncio.Queue[int] = app.state.queue
        await pending.put(payload.document_id)
        return {"status": "accepted", "document_id": payload.document_id}

    return app, processed
@pytest.fixture()
def client() -> Iterator[TestClient]:
    """Yield a ``TestClient`` for the mirrored app, entered as a context manager.

    The list of processed document ids is also stored on ``app.state.seen`` so
    tests can reach it through ``client.app``.
    """
    mirrored_app, processed = _build_app()
    mirrored_app.state.seen = processed
    with TestClient(mirrored_app) as test_client:
        yield test_client
def test_webhook_rejects_missing_auth(client: TestClient) -> None:
    """A request without an Authorization header is answered with 401."""
    response = client.post("/webhook", json={"document_id": 42})
    assert response.status_code == 401
def test_webhook_rejects_wrong_bearer(client: TestClient) -> None:
    """A bearer token that differs from the configured one is answered with 401."""
    wrong_credentials = {"Authorization": "Bearer wrong"}
    body = {"document_id": 42}
    response = client.post("/webhook", json=body, headers=wrong_credentials)
    assert response.status_code == 401
def test_webhook_accepts_valid_request(client: TestClient) -> None:
    """A correctly authenticated request returns 202 and reaches the worker.

    Checks the response status and body, then waits (up to two seconds) for
    the background worker to record document id 42 in ``app.state.seen``.
    """
    import time

    resp = client.post(
        "/webhook",
        json={"document_id": 42},
        headers={"Authorization": f"Bearer {os.environ['WEBHOOK_BEARER_TOKEN']}"},
    )
    assert resp.status_code == 202
    assert resp.json() == {"status": "accepted", "document_id": 42}

    # The queue and its worker run on the TestClient's own event loop, in
    # another thread. asyncio queues are not thread-safe, so awaiting
    # ``queue.join()`` from a separate loop here could hang or raise if the
    # worker had not finished yet. Poll the list the worker appends to instead.
    seen: list[int] = client.app.state.seen  # type: ignore[attr-defined]
    deadline = time.monotonic() + 2.0
    while 42 not in seen and time.monotonic() < deadline:
        time.sleep(0.01)
    assert 42 in seen
def test_webhook_rejects_malformed_body(client: TestClient) -> None:
    """A non-integer ``document_id`` is rejected with 422 even when the token is valid."""
    valid_credentials = {"Authorization": f"Bearer {os.environ['WEBHOOK_BEARER_TOKEN']}"}
    malformed_body = {"document_id": "not-an-int"}
    response = client.post("/webhook", json=malformed_body, headers=valid_credentials)
    assert response.status_code == 422
def test_verify_bearer_rejects_unconfigured_service() -> None:
    """With an empty expected token, ``_verify_bearer`` raises for a Bearer header."""
    unconfigured_token = ""
    with pytest.raises(HTTPException):
        _verify_bearer("Bearer anything", unconfigured_token)