examples: serialize LLM calls via Semaphore (default 1) to dodge the claude-agent-service (CAS) busy-lock

2026-06-02 13:54:15 +00:00 · 2026-06-02 13:54:15 +00:00 · 9d308c36dc
commit 9d308c36dc
parent c1c1e2202b
1 changed files with 33 additions and 7 deletions
--- a/fire_planner/examples/cli.py
+++ b/fire_planner/examples/cli.py
@ -52,20 +52,38 @@ async def ingest_subreddit(
    claude_bearer: str,
    client: httpx.AsyncClient,
    fx_rates: dict[str, Decimal],
    llm_semaphore: asyncio.Semaphore | None = None,
 ) -> tuple[int, int]:
    """Return (inserted, skipped) counts for one (sub, when) bucket.
    `llm_semaphore` serializes the LLM call across parallel sub-runs;
    claude-agent-service's `/v1/chat/completions` has a single-flight
    busy-lock, so 12 concurrent fan-outs trample each other. Default
    None = no serialization (test path).
    """
    inserted = 0
    skipped = 0
    async for post in fetch_top(reddit, sub, when, limit=limit):
        if not is_candidate(post):
            skipped += 1
            continue
-        extracted = await extract_with_fallback(
+        if llm_semaphore is not None:
-            post,
+            async with llm_semaphore:
-            llama_url=llama_url,
+                extracted = await extract_with_fallback(
-            claude_url=claude_url,
+                    post,
-            claude_bearer=claude_bearer,
+                    llama_url=llama_url,
-            client=client,
+                    claude_url=claude_url,
-        )
+                    claude_bearer=claude_bearer,
                    client=client,
                )
        else:
            extracted = await extract_with_fallback(
                post,
                llama_url=llama_url,
                claude_url=claude_url,
                claude_bearer=claude_bearer,
                client=client,
            )
        if extracted is None:
            log.info("dropping %s — both LLM tiers failed", post.reddit_id)
            skipped += 1
@ -96,6 +114,13 @@ async def _ingest_all(
    claude_url = os.environ["CLAUDE_AGENT_SERVICE_URL"]
    claude_bearer = os.environ["CLAUDE_AGENT_BEARER"]
    # Cap concurrent LLM calls across all sub-runs. claude-agent-service's
    # /v1/chat/completions busy-locks (single-flight), so a 12-sub fan-out
    # would otherwise lose 11 calls to 503. LLM_CONCURRENCY defaults to 1
    # (strict serial); bump to 2-3 once the chat endpoint drops its busy-lock.
    llm_concurrency = int(os.environ.get("LLM_CONCURRENCY", "1"))
    llm_semaphore = asyncio.Semaphore(llm_concurrency)
    async def _one(sub: str, when: TopWhen) -> tuple[int, int]:
        async with factory() as session, httpx.AsyncClient() as client:
            return await ingest_subreddit(
@ -106,6 +131,7 @@ async def _ingest_all(
                claude_bearer=claude_bearer,
                client=client,
                fx_rates=rates,
                llm_semaphore=llm_semaphore,
            )
    tasks = [_one(s, w) for s in subs for w in when_list]