payslip-ingest/tests/test_extractor.py
Viktor Barzin 57484619c1 Initial commit: event-driven UK payslip ingest service
Extracted from /home/wizard/code monorepo into its own repo so Woodpecker CI
can watch it. Identical content to /home/wizard/code commit e426028.

See README.md for overview, env vars, and Paperless workflow config.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 22:10:23 +00:00

138 lines
4.9 KiB
Python

import json
import httpx
import pytest
import respx
from payslip_ingest import extractor as extractor_module
from payslip_ingest.extractor import ClaudeExtractor, ExtractorError
@pytest.fixture(autouse=True)
def _tighten_retries(monkeypatch: pytest.MonkeyPatch) -> None:
    """Override the extractor module's timing constants for every test.

    Autouse fixture: sets ``POLL_INTERVAL_SECONDS`` and
    ``BUSY_RETRY_DELAY_SECONDS`` to 0 and ``MAX_POLL_SECONDS`` to 1 on the
    extractor module. ``monkeypatch`` restores the originals after each test.
    """
    # Presumably these keep the poll/retry loops short in tests -- the
    # extractor implementation itself is not visible from this file.
    timing_overrides = {
        "POLL_INTERVAL_SECONDS": 0,
        "MAX_POLL_SECONDS": 1,
        "BUSY_RETRY_DELAY_SECONDS": 0,
    }
    for constant_name, patched_value in timing_overrides.items():
        monkeypatch.setattr(extractor_module, constant_name, patched_value)
def _sample_extraction() -> dict[str, object]:
    """Return a freshly built sample payslip extraction payload.

    The figures are internally consistent: gross pay of 5000.0 minus income
    tax, national insurance, employee pension, student loan and the single
    "other" deduction leaves the stated net pay of 3450.0.
    """
    extra_deductions: dict[str, float] = {"cycle_to_work": 50.0}
    # Keyword order fixes the key order of the resulting dict, which in turn
    # fixes the text produced when the payload is JSON-encoded.
    sample: dict[str, object] = dict(
        pay_date="2026-03-28",
        pay_period_start="2026-03-01",
        pay_period_end="2026-03-31",
        employer="Acme Ltd",
        currency="GBP",
        gross_pay=5000.0,
        income_tax=800.0,
        national_insurance=350.0,
        pension_employee=250.0,
        pension_employer=150.0,
        student_loan=100.0,
        other_deductions=extra_deductions,
        net_pay=3450.0,
    )
    return sample
def _agent_output(payload: dict[str, object]) -> list[str]:
    """Simulate claude CLI --output-format json stdout.

    Returns three newline-terminated JSON lines, in order: a ``system``/``init``
    event, an ``assistant`` event whose single text content item carries
    *payload* as a JSON string, and a ``result`` event carrying the same
    JSON string.
    """
    encoded_payload = json.dumps(payload)
    init_event = {"type": "system", "subtype": "init"}
    assistant_event = {
        "type": "assistant",
        "message": {
            "content": [{"type": "text", "text": encoded_payload}],
        },
    }
    result_event = {"type": "result", "result": encoded_payload}
    events = (init_event, assistant_event, result_event)
    return [json.dumps(event) + "\n" for event in events]
@pytest.fixture()
def client() -> ClaudeExtractor:
    """Provide a ClaudeExtractor configured for the fake ``agent.test`` host."""
    connection_settings = {
        "base_url": "http://agent.test",
        "bearer_token": "tok",
    }
    return ClaudeExtractor(**connection_settings)
async def test_extract_happy_path(client: ClaudeExtractor) -> None:
    """A 202 submission followed by a completed job yields the extracted fields."""
    expected = _sample_extraction()
    # Canned agent replies: job accepted, then job finished with CLI output.
    accepted = httpx.Response(
        202,
        json={"job_id": "abc123", "status": "running"},
    )
    finished = httpx.Response(
        200,
        json={"status": "completed", "output": _agent_output(expected)},
    )

    with respx.mock(base_url="http://agent.test") as agent:
        agent.post("/execute").mock(return_value=accepted)
        agent.get("/jobs/abc123").mock(return_value=finished)
        result = await client.extract(b"PDFDATA", {"id": 42})

    assert float(result.gross_pay) == 5000.0
    assert result.employer == "Acme Ltd"
async def test_extract_retries_on_409(client: ClaudeExtractor) -> None:
    """A 409 on the first POST /execute is followed by a second, accepted POST."""
    expected = _sample_extraction()
    busy = httpx.Response(409, json={"detail": "busy"})
    accepted = httpx.Response(202, json={"job_id": "abc123"})
    finished = httpx.Response(
        200,
        json={"status": "completed", "output": _agent_output(expected)},
    )

    with respx.mock(base_url="http://agent.test") as agent:
        execute_route = agent.post("/execute")
        # Responses are served in order: first call is rejected, second accepted.
        execute_route.side_effect = [busy, accepted]
        agent.get("/jobs/abc123").mock(return_value=finished)
        result = await client.extract(b"PDFDATA", {"id": 42})

    assert result.net_pay.is_finite()
    assert execute_route.call_count == 2
async def test_extract_polling_timeout_raises(client: ClaudeExtractor) -> None:
    """A job that never leaves the "running" status makes extract() raise TimeoutError."""
    accepted = httpx.Response(202, json={"job_id": "abc123"})
    # Every poll of the job endpoint gets this same "still running" reply.
    still_running = httpx.Response(
        200,
        json={"status": "running", "output": []},
    )

    with respx.mock(base_url="http://agent.test") as agent:
        agent.post("/execute").mock(return_value=accepted)
        agent.get("/jobs/abc123").mock(return_value=still_running)
        with pytest.raises(TimeoutError):
            await client.extract(b"PDFDATA", {"id": 42})
async def test_extract_malformed_json_raises(client: ClaudeExtractor) -> None:
    """A completed job whose output lines are not JSON makes extract() raise ExtractorError."""
    accepted = httpx.Response(202, json={"job_id": "abc123"})
    garbage_lines = ["this is not json\n", "still not json\n"]
    finished_with_garbage = httpx.Response(
        200,
        json={"status": "completed", "output": garbage_lines},
    )

    with respx.mock(base_url="http://agent.test") as agent:
        agent.post("/execute").mock(return_value=accepted)
        agent.get("/jobs/abc123").mock(return_value=finished_with_garbage)
        with pytest.raises(ExtractorError):
            await client.extract(b"PDFDATA", {"id": 42})
async def test_extract_failed_status_raises(client: ClaudeExtractor) -> None:
    """A job reported as "failed" (exit code 1, no output) makes extract() raise ExtractorError."""
    accepted = httpx.Response(202, json={"job_id": "abc123"})
    failed_job = httpx.Response(
        200,
        json={"status": "failed", "output": [], "exit_code": 1},
    )

    with respx.mock(base_url="http://agent.test") as agent:
        agent.post("/execute").mock(return_value=accepted)
        agent.get("/jobs/abc123").mock(return_value=failed_job)
        with pytest.raises(ExtractorError):
            await client.extract(b"PDFDATA", {"id": 42})