examples: disable qwen3 thinking output in llama-swap requests
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
qwen3-8b emits <think>...</think> chain-of-thought before its JSON response by default. That prefix trips the JSON parser and forces ~50% of posts to escalate to claude-agent-service (discovered during the first bulk ingest, 2026-06-05).

Fix: pass chat_template_kwargs.enable_thinking=false in the request body for the Tier 1 (llama-swap) call. Claude calls are unaffected.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2e38934010
commit
25c948c933
2 changed files with 55 additions and 1 deletions
|
|
@ -62,6 +62,8 @@ async def extract_with_qwen(
|
||||||
post=post,
|
post=post,
|
||||||
client=client,
|
client=client,
|
||||||
record_model=QWEN_MODEL,
|
record_model=QWEN_MODEL,
|
||||||
|
# Suppress qwen3's chain-of-thought prefix so the response is bare JSON.
|
||||||
|
extra_body={"chat_template_kwargs": {"enable_thinking": False}},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -73,8 +75,9 @@ async def _call_openai_chat(
|
||||||
client: httpx.AsyncClient,
|
client: httpx.AsyncClient,
|
||||||
record_model: str,
|
record_model: str,
|
||||||
extra_headers: dict[str, str] | None = None,
|
extra_headers: dict[str, str] | None = None,
|
||||||
|
extra_body: dict[str, Any] | None = None,
|
||||||
) -> ExtractedExample | None:
|
) -> ExtractedExample | None:
|
||||||
body = {
|
body: dict[str, Any] = {
|
||||||
"model": model_name,
|
"model": model_name,
|
||||||
"messages": [
|
"messages": [
|
||||||
{"role": "system", "content": PROMPT_SYSTEM},
|
{"role": "system", "content": PROMPT_SYSTEM},
|
||||||
|
|
@ -83,6 +86,8 @@ async def _call_openai_chat(
|
||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
"max_tokens": 512,
|
"max_tokens": 512,
|
||||||
}
|
}
|
||||||
|
if extra_body:
|
||||||
|
body.update(extra_body)
|
||||||
try:
|
try:
|
||||||
resp = await client.post(
|
resp = await client.post(
|
||||||
url,
|
url,
|
||||||
|
|
|
||||||
|
|
@ -174,6 +174,55 @@ async def test_fallback_keeps_high_confidence_qwen_result() -> None:
|
||||||
assert claude_route.called is False # high-confidence qwen → claude not hit
|
assert claude_route.called is False # high-confidence qwen → claude not hit
|
||||||
|
|
||||||
|
|
||||||
|
@respx.mock
@pytest.mark.asyncio
async def test_qwen_request_disables_thinking() -> None:
    """The qwen request body must carry chat_template_kwargs.enable_thinking=False.

    A mocked endpoint at LLAMA_URL records every JSON body it receives and
    answers with a minimal chat-completion payload; the first recorded body
    is then inspected for the flag.
    """
    seen_bodies: list[dict] = []

    def record_and_reply(request: httpx.Request) -> httpx.Response:
        # Keep the decoded request body so it can be checked after the call.
        seen_bodies.append(json.loads(request.content))
        content = json.dumps({"country": "UK", "confidence": 0.9})
        reply = {"choices": [{"message": {"content": content}}]}
        return httpx.Response(200, json=reply)

    respx.post(LLAMA_URL).mock(side_effect=record_and_reply)

    async with httpx.AsyncClient() as client:
        await extract_with_qwen(_post(), llama_url=LLAMA_URL, client=client)

    assert seen_bodies, "No request made"
    template_kwargs = seen_bodies[0].get("chat_template_kwargs", {})
    # Identity check: a missing key (None) or a truthy value must both fail.
    assert template_kwargs.get("enable_thinking") is False
|
||||||
|
|
||||||
|
|
||||||
|
@respx.mock
@pytest.mark.asyncio
async def test_claude_request_omits_thinking_kwarg() -> None:
    """The Claude request body must NOT contain chat_template_kwargs.

    A mocked endpoint at CLAUDE_URL records every JSON body it receives and
    answers with a minimal chat-completion payload; the first recorded body
    is then checked for the absence of the key.
    """
    from fire_planner.examples.llm_extract import extract_with_claude

    seen_bodies: list[dict] = []

    def record_and_reply(request: httpx.Request) -> httpx.Response:
        # Keep the decoded request body so it can be checked after the call.
        seen_bodies.append(json.loads(request.content))
        content = json.dumps({"country": "UK", "confidence": 0.9})
        reply = {"choices": [{"message": {"content": content}}]}
        return httpx.Response(200, json=reply)

    respx.post(CLAUDE_URL).mock(side_effect=record_and_reply)

    async with httpx.AsyncClient() as client:
        await extract_with_claude(
            _post(),
            claude_url=CLAUDE_URL,
            bearer="t",
            client=client,
        )

    assert seen_bodies, "No request made"
    first_body = seen_bodies[0]
    assert "chat_template_kwargs" not in first_body
|
||||||
|
|
||||||
|
|
||||||
def test_to_gbp_converts_usd() -> None:
|
def test_to_gbp_converts_usd() -> None:
|
||||||
rates = {"GBP": Decimal("1"), "USD": Decimal("0.80")}
|
rates = {"GBP": Decimal("1"), "USD": Decimal("0.80")}
|
||||||
assert to_gbp(Decimal("100"), "USD", rates) == Decimal("80.00")
|
assert to_gbp(Decimal("100"), "USD", rates) == Decimal("80.00")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue