examples: LLM_MODEL env var (default qwen3-8b; swap to qwen3vl-4b in K8s)

2026-06-01 19:07:32 +00:00 · 2026-06-01 19:07:32 +00:00 · 536f432a46
commit 536f432a46
parent 9b32247fea
1 changed file with 6 additions and 1 deletion
--- a/fire_planner/examples/llm_extract.py
+++ b/fire_planner/examples/llm_extract.py
@@ -11,6 +11,7 @@ from __future__ import annotations

 import json
 import logging
+import os
 from decimal import Decimal, InvalidOperation
 from typing import Any

@@ -21,7 +22,11 @@ from fire_planner.examples.models import ExtractedExample, RawPost

 log = logging.getLogger(__name__)

-QWEN_MODEL = "qwen3-8b"
+# `LLM_MODEL` lets the deployment swap to a smaller model when the GPU is
+# contested. Default stays on qwen3-8b for local dev / tests. The "qwen" name
+# in the constant is historical — the value can be any llama-swap model id
+# (e.g. `qwen3vl-4b` when k8s-node1's VRAM is mostly held by immich-ml).
+QWEN_MODEL = os.environ.get("LLM_MODEL", "qwen3-8b")
 CLAUDE_AGENT_MODEL = "claude-haiku-4-5"
 HTTP_TIMEOUT = httpx.Timeout(60.0)