backfill: continue on per-document errors instead of aborting
A single doc that isn't a real payslip (e.g., an RSU letter wrongly tagged as payslip in Paperless) makes Claude return pay_date=null, which pydantic rejects with ValidationError. Previously this killed the whole backfill at the first bad doc, leaving 60 of 88 docs unprocessed. Catch + log + continue so the backfill processes every doc. Failed docs can be re-tagged or fixed individually later.
This commit is contained in:
parent
3da24fdf7a
commit
c696bf32f0
1 changed file with 12 additions and 2 deletions
|
|
@@ -52,14 +52,24 @@ async def _backfill(tag: str, limit: int | None) -> None:
|
|||
bearer_token=os.environ["CLAUDE_AGENT_BEARER_TOKEN"],
|
||||
)
|
||||
processed = 0
|
||||
failed = 0
|
||||
try:
|
||||
async for doc in paperless.list_tagged_documents(tag):
|
||||
if limit is not None and processed >= limit:
|
||||
break
|
||||
doc_id = int(doc["id"])
|
||||
result = await process_document(doc_id, session_factory, paperless, extractor)
|
||||
click.echo(f"doc_id={doc_id} status={result.status} validated={result.validated}")
|
||||
try:
|
||||
result = await process_document(doc_id, session_factory, paperless, extractor)
|
||||
click.echo(f"doc_id={doc_id} status={result.status} validated={result.validated}")
|
||||
except Exception as exc:
|
||||
# Don't let a single bad doc (wrong tag, non-payslip PDF, Claude
|
||||
# hallucinating null fields) abort the whole backfill. Log + continue.
|
||||
failed += 1
|
||||
click.echo(f"doc_id={doc_id} status=failed error={type(exc).__name__}: {exc}",
|
||||
err=True)
|
||||
log.exception("backfill: doc_id=%s failed", doc_id)
|
||||
processed += 1
|
||||
click.echo(f"backfill complete: processed={processed} failed={failed}")
|
||||
finally:
|
||||
await paperless.aclose()
|
||||
await extractor.aclose()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue