From 536f432a46345b6d65b3a109c2645055868edfd4 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Mon, 1 Jun 2026 19:07:32 +0000
Subject: [PATCH] examples: LLM_MODEL env var (default qwen3-8b; swap to qwen3vl-4b in K8s)

---
 fire_planner/examples/llm_extract.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fire_planner/examples/llm_extract.py b/fire_planner/examples/llm_extract.py
index 2f7d04b..c6e7ca1 100644
--- a/fire_planner/examples/llm_extract.py
+++ b/fire_planner/examples/llm_extract.py
@@ -11,6 +11,7 @@ from __future__ import annotations
 
 import json
 import logging
+import os
 from decimal import Decimal, InvalidOperation
 from typing import Any
 
@@ -21,7 +22,11 @@ from fire_planner.examples.models import ExtractedExample, RawPost
 
 log = logging.getLogger(__name__)
 
-QWEN_MODEL = "qwen3-8b"
+# `LLM_MODEL` lets the deployment swap to a smaller model when the GPU is
+# contested. Default stays on qwen3-8b for local dev / tests. The "qwen" name
+# in the constant is historical — the value can be any llama-swap model id
+# (e.g. `qwen3vl-4b` when k8s-node1's VRAM is mostly held by immich-ml).
+QWEN_MODEL = os.environ.get("LLM_MODEL", "qwen3-8b")
 CLAUDE_AGENT_MODEL = "claude-haiku-4-5"
 HTTP_TIMEOUT = httpx.Timeout(60.0)
 