examples: disable qwen3 thinking output in llama-swap requests

By default, qwen3-8b emits a <think>...</think> chain-of-thought block before its JSON response, which trips the JSON parser and forces ~50% of posts to escalate to claude-agent-service (discovered during the first bulk ingest, 2026-06-05).

Fix: pass chat_template_kwargs.enable_thinking=false in the request body for the Tier 1 (llama-swap) call. Claude calls are unaffected.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-05 08:40:25 +00:00 · 2026-06-05 08:40:25 +00:00 · 25c948c933
commit 25c948c933
parent 2e38934010
2 changed files with 55 additions and 1 deletions
--- a/fire_planner/examples/llm_extract.py
+++ b/fire_planner/examples/llm_extract.py
@@ -62,6 +62,8 @@ async def extract_with_qwen(
        post=post,
        client=client,
        record_model=QWEN_MODEL,
+        # Suppress qwen3's chain-of-thought prefix so the response is bare JSON.
+        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )


@@ -73,8 +75,9 @@ async def _call_openai_chat(
    client: httpx.AsyncClient,
    record_model: str,
    extra_headers: dict[str, str] | None = None,
+    extra_body: dict[str, Any] | None = None,
 ) -> ExtractedExample | None:
-    body = {
+    body: dict[str, Any] = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": PROMPT_SYSTEM},
@@ -83,6 +86,8 @@ async def _call_openai_chat(
        "temperature": 0.0,
        "max_tokens": 512,
    }
+    if extra_body:
+        body.update(extra_body)
    try:
        resp = await client.post(
            url,