examples: disable qwen3 thinking output in llama-swap requests
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
qwen3-8b emits `<think>...</think>` chain-of-thought before the JSON response by default, which trips the JSON parser and forces ~50% of posts to escalate to claude-agent-service (discovered during the first bulk ingest, 2026-06-05).

Fix: pass `chat_template_kwargs.enable_thinking=false` in the request body for the Tier 1 (llama-swap) call. Claude calls are unaffected.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2e38934010
commit
25c948c933
2 changed files with 55 additions and 1 deletion
|
|
@@ -62,6 +62,8 @@ async def extract_with_qwen(
|
|||
post=post,
|
||||
client=client,
|
||||
record_model=QWEN_MODEL,
|
||||
# Suppress qwen3's chain-of-thought prefix so the response is bare JSON.
|
||||
extra_body={"chat_template_kwargs": {"enable_thinking": False}},
|
||||
)
|
||||
|
||||
|
||||
|
|
@@ -73,8 +75,9 @@ async def _call_openai_chat(
|
|||
client: httpx.AsyncClient,
|
||||
record_model: str,
|
||||
extra_headers: dict[str, str] | None = None,
|
||||
extra_body: dict[str, Any] | None = None,
|
||||
) -> ExtractedExample | None:
|
||||
body = {
|
||||
body: dict[str, Any] = {
|
||||
"model": model_name,
|
||||
"messages": [
|
||||
{"role": "system", "content": PROMPT_SYSTEM},
|
||||
|
|
@@ -83,6 +86,8 @@ async def _call_openai_chat(
|
|||
"temperature": 0.0,
|
||||
"max_tokens": 512,
|
||||
}
|
||||
if extra_body:
|
||||
body.update(extra_body)
|
||||
try:
|
||||
resp = await client.post(
|
||||
url,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue