recruiter-responder: deploy stack + llama-cpp qwen3-8b + openclaw plugin mount

Three coupled changes for the new recruiter-responder pipeline:

1. stacks/llama-cpp/: add qwen3-8b text-only model to llama-swap. Uses
   unsloth/Qwen3-8B-GGUF Q4_K_M, 16k context, no mmproj. Refactored the
   download Job script + cmd renderer to handle text_only=true (skip
   mmproj download + --mmproj flag). The 3 existing vision models stay
   on text_only=false; no behaviour change for them.

2. stacks/recruiter-responder/: new stack. Namespace, 2 ExternalSecrets
   (app secrets from secret/recruiter-responder, DB creds from Vault DB
   engine static-creds/pg-recruiter-responder), Deployment (replicas=1,
   Recreate -- IMAP IDLE + APScheduler want single leader), Service
   ClusterIP. Image: forgejo.viktorbarzin.me/viktor/recruiter-responder.

3. stacks/openclaw/: add init container `install-recruiter-plugin` that
   uses the recruiter-responder image to copy the .mjs plugin into
   /home/node/.openclaw/extensions/recruiter-api/ on NFS. Couples plugin
   version to the recruiter-responder image tag. Also injects
   RECRUITER_RESPONDER_URL + RECRUITER_RESPONDER_TOKEN env vars (token
   from openclaw-secrets.recruiter_responder_bearer_token, optional).

Pre-apply checklist for recruiter-responder stack:
  - Vault: seed secret/recruiter-responder with webhook_bearer_token,
    imap_{me,spam}_{user,pass}, smtp_password, claude_agent_token,
    task_webhook_token.
  - Vault: add secret/openclaw.recruiter_responder_bearer_token (same
    value as webhook_bearer_token above).
  - dbaas: create DB recruiter_responder + role recruiter_responder,
    and Vault DB-engine role static-creds/pg-recruiter-responder.
  - Build + push image via Woodpecker (recruiter-responder repo CI).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-05-15 22:38:53 +00:00 committed by Viktor Barzin
parent 95b9f7bc89
commit 7e1580ba8c
4 changed files with 409 additions and 12 deletions

View file

@ -17,6 +17,8 @@ locals {
# snapshot_download with allow_patterns). Stable symlinks model.gguf /
# mmproj.gguf are created after download so llama-swap config can be
# filename-agnostic.
# `text_only = true` skips mmproj download + --mmproj flag (text-only LLM).
# Vision models keep `text_only = false` (default).
models = {
qwen3vl-8b = {
hf_repo = "Qwen/Qwen3-VL-8B-Instruct-GGUF"
@ -24,6 +26,7 @@ locals {
mmproj_pattern = "*mmproj*.gguf"
ctx_size = 3072
gpu_layers = 99
text_only = false
}
minicpm-v-4-5 = {
hf_repo = "openbmb/MiniCPM-V-4_5-gguf"
@ -31,6 +34,7 @@ locals {
mmproj_pattern = "*mmproj*.gguf"
ctx_size = 3072
gpu_layers = 99
text_only = false
}
qwen3vl-4b = {
hf_repo = "Qwen/Qwen3-VL-4B-Instruct-GGUF"
@ -38,6 +42,21 @@ locals {
mmproj_pattern = "*mmproj*.gguf"
ctx_size = 3072
gpu_layers = 99
text_only = false
}
# Text-only triage / drafting model for recruiter-responder.
# Q4_K_M, ~4.7GB, 32k native context (capped at 16k here -- plenty
# for recruiter emails + extraction prompt + JSON output).
# Unsloth's GGUF: well-maintained, includes Q4_K_M. Qwen3 is a
# thinking-capable model; recruiter-responder disables thinking via
# `enable_thinking=false` in the chat-template kwargs.
qwen3-8b = {
hf_repo = "unsloth/Qwen3-8B-GGUF"
gguf_pattern = "*Q4_K_M*.gguf"
mmproj_pattern = ""
ctx_size = 16384
gpu_layers = 99
text_only = true
}
}
@ -55,18 +74,20 @@ locals {
models = {
for mid, cfg in local.models : mid => {
cmd = join(" ", [
cmd = join(" ", concat([
"/app/llama-server",
"--host 0.0.0.0",
"--port $${PORT}",
"-m /models/${mid}/model.gguf",
], cfg.text_only ? [] : [
"--mmproj /models/${mid}/mmproj.gguf",
], [
"-ngl ${cfg.gpu_layers}",
"-c ${cfg.ctx_size}",
"-np 1",
"--jinja",
"-fa on",
])
]))
ttl = 600 # unload after 10 min idle
checkEndpoint = "/health"
}
@ -133,11 +154,15 @@ resource "kubernetes_job_v1" "download_models" {
for mid, cfg in models.items():
local_dir = f"/models/{mid}"
os.makedirs(local_dir, exist_ok=True)
print(f"==> downloading {mid} from {cfg['hf_repo']} -> {local_dir}", flush=True)
text_only = cfg.get("text_only", False)
patterns = [cfg["gguf_pattern"]]
if not text_only and cfg.get("mmproj_pattern"):
patterns.append(cfg["mmproj_pattern"])
print(f"==> downloading {mid} from {cfg['hf_repo']} -> {local_dir} (text_only={text_only})", flush=True)
snapshot_download(
repo_id=cfg["hf_repo"],
local_dir=local_dir,
allow_patterns=[cfg["gguf_pattern"], cfg["mmproj_pattern"]],
allow_patterns=patterns,
token=os.environ.get("HF_TOKEN") or None,
# Single-threaded download -- multi-worker buffers
# multi-GB chunks per worker and OOMs the Job at 2Gi.
@ -146,17 +171,20 @@ resource "kubernetes_job_v1" "download_models" {
# Resolve actual filenames and create stable symlinks so
# llama-swap config is filename-agnostic.
ggufs = [p for p in glob.glob(f"{local_dir}/*Q4_K_M*.gguf") if "mmproj" not in p.lower()]
mmprojs = glob.glob(f"{local_dir}/*mmproj*.gguf")
if not ggufs:
raise SystemExit(f"no GGUF found in {local_dir}")
if not mmprojs:
raise SystemExit(f"no mmproj found in {local_dir}")
gguf_link = f"{local_dir}/model.gguf"
mmproj_link = f"{local_dir}/mmproj.gguf"
for link, target in ((gguf_link, ggufs[0]), (mmproj_link, mmprojs[0])):
if os.path.islink(link) or os.path.exists(link):
os.unlink(link)
os.symlink(os.path.basename(target), link)
if os.path.islink(gguf_link) or os.path.exists(gguf_link):
os.unlink(gguf_link)
os.symlink(os.path.basename(ggufs[0]), gguf_link)
if not text_only:
mmprojs = glob.glob(f"{local_dir}/*mmproj*.gguf")
if not mmprojs:
raise SystemExit(f"no mmproj found in {local_dir}")
mmproj_link = f"{local_dir}/mmproj.gguf"
if os.path.islink(mmproj_link) or os.path.exists(mmproj_link):
os.unlink(mmproj_link)
os.symlink(os.path.basename(mmprojs[0]), mmproj_link)
print(f"==> done {mid}", flush=True)
for f in sorted(os.listdir(local_dir)):
full = os.path.join(local_dir, f)