backfill: continue on per-document errors instead of aborting
A single doc that isn't a real payslip (e.g., an RSU letter wrongly tagged as payslip in Paperless) makes Claude return pay_date=null, which pydantic rejects with a ValidationError. Previously this aborted the whole backfill at the first bad doc, leaving 60 of 88 docs unprocessed. Now each per-document exception is caught, logged, and counted as a failure, and the loop continues so the backfill attempts every doc. Failed docs can be re-tagged or fixed individually later.
This commit is contained in:
parent
3da24fdf7a
commit
c696bf32f0
1 changed files with 12 additions and 2 deletions
|
|
@@ -52,14 +52,24 @@ async def _backfill(tag: str, limit: int | None) -> None:
|
||||||
bearer_token=os.environ["CLAUDE_AGENT_BEARER_TOKEN"],
|
bearer_token=os.environ["CLAUDE_AGENT_BEARER_TOKEN"],
|
||||||
)
|
)
|
||||||
processed = 0
|
processed = 0
|
||||||
|
failed = 0
|
||||||
try:
|
try:
|
||||||
async for doc in paperless.list_tagged_documents(tag):
|
async for doc in paperless.list_tagged_documents(tag):
|
||||||
if limit is not None and processed >= limit:
|
if limit is not None and processed >= limit:
|
||||||
break
|
break
|
||||||
doc_id = int(doc["id"])
|
doc_id = int(doc["id"])
|
||||||
result = await process_document(doc_id, session_factory, paperless, extractor)
|
try:
|
||||||
click.echo(f"doc_id={doc_id} status={result.status} validated={result.validated}")
|
result = await process_document(doc_id, session_factory, paperless, extractor)
|
||||||
|
click.echo(f"doc_id={doc_id} status={result.status} validated={result.validated}")
|
||||||
|
except Exception as exc:
|
||||||
|
# Don't let a single bad doc (wrong tag, non-payslip PDF, Claude
|
||||||
|
# hallucinating null fields) abort the whole backfill. Log + continue.
|
||||||
|
failed += 1
|
||||||
|
click.echo(f"doc_id={doc_id} status=failed error={type(exc).__name__}: {exc}",
|
||||||
|
err=True)
|
||||||
|
log.exception("backfill: doc_id=%s failed", doc_id)
|
||||||
processed += 1
|
processed += 1
|
||||||
|
click.echo(f"backfill complete: processed={processed} failed={failed}")
|
||||||
finally:
|
finally:
|
||||||
await paperless.aclose()
|
await paperless.aclose()
|
||||||
await extractor.aclose()
|
await extractor.aclose()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue