From c696bf32f0c10c280d6b34bff921b3cb6bf5ec84 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 23:25:36 +0000 Subject: [PATCH] backfill: continue on per-document errors instead of aborting A single doc that isn't a real payslip (e.g., an RSU letter wrongly tagged as payslip in Paperless) makes Claude return pay_date=null, which pydantic rejects with ValidationError. Previously this killed the whole backfill at the first bad doc, leaving 60 of 88 docs unprocessed. Catch + log + continue so the backfill processes every doc. Failed docs can be re-tagged or fixed individually later. --- payslip_ingest/__main__.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/payslip_ingest/__main__.py b/payslip_ingest/__main__.py index ffce922..f30293d 100644 --- a/payslip_ingest/__main__.py +++ b/payslip_ingest/__main__.py @@ -52,14 +52,24 @@ async def _backfill(tag: str, limit: int | None) -> None: bearer_token=os.environ["CLAUDE_AGENT_BEARER_TOKEN"], ) processed = 0 + failed = 0 try: async for doc in paperless.list_tagged_documents(tag): if limit is not None and processed >= limit: break doc_id = int(doc["id"]) - result = await process_document(doc_id, session_factory, paperless, extractor) - click.echo(f"doc_id={doc_id} status={result.status} validated={result.validated}") + try: + result = await process_document(doc_id, session_factory, paperless, extractor) + click.echo(f"doc_id={doc_id} status={result.status} validated={result.validated}") + except Exception as exc: + # Don't let a single bad doc (wrong tag, non-payslip PDF, Claude + # hallucinating null fields) abort the whole backfill. Log + continue. + failed += 1 + click.echo(f"doc_id={doc_id} status=failed error={type(exc).__name__}: {exc}", + err=True) + log.exception("backfill: doc_id=%s failed", doc_id) processed += 1 + click.echo(f"backfill complete: processed={processed} failed={failed}") finally: await paperless.aclose() await extractor.aclose()