Three coupled changes for the new recruiter-responder pipeline:
1. stacks/llama-cpp/: add qwen3-8b text-only model to llama-swap. Uses
unsloth/Qwen3-8B-GGUF Q4_K_M, 16k context, no mmproj. Refactored the
download Job script + cmd renderer to handle text_only=true (skip
mmproj download + --mmproj flag). The 3 existing vision models stay
on text_only=false; no behaviour change for them.
2. stacks/recruiter-responder/: new stack. Namespace, 2 ExternalSecrets
(app secrets from secret/recruiter-responder, DB creds from Vault DB
engine static-creds/pg-recruiter-responder), Deployment (replicas=1,
Recreate -- IMAP IDLE + APScheduler want single leader), Service
ClusterIP. Image: forgejo.viktorbarzin.me/viktor/recruiter-responder.
3. stacks/openclaw/: add init container `install-recruiter-plugin` that
uses the recruiter-responder image to copy the .mjs plugin into
/home/node/.openclaw/extensions/recruiter-api/ on NFS. Couples plugin
version to the recruiter-responder image tag. Also injects
RECRUITER_RESPONDER_URL + RECRUITER_RESPONDER_TOKEN env vars (token
from openclaw-secrets.recruiter_responder_bearer_token, optional).
Pre-apply checklist for recruiter-responder stack:
- Vault: seed secret/recruiter-responder with webhook_bearer_token,
imap_{me,spam}_{user,pass}, smtp_password, claude_agent_token,
task_webhook_token.
- Vault: add secret/openclaw.recruiter_responder_bearer_token (same as
above webhook_bearer_token).
- dbaas: create DB recruiter_responder + role recruiter_responder,
and Vault DB-engine role static-creds/pg-recruiter-responder.
- Build + push image via Woodpecker (recruiter-responder repo CI).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
399 lines
13 KiB
HCL
399 lines
13 KiB
HCL
locals {
  namespace = "llama-cpp"
  labels    = { app = "llama-cpp" }

  # llama-swap fronts per-model llama.cpp instances. The :cuda image
  # ships a recent llama-server inside, which is what gets spawned per
  # model. One Service, one /v1 endpoint, model selected by the
  # OpenAI `model` field. mostlygeek/llama-swap is production-grade
  # (3.9k★, v211, May 2026).
  llamaswap_image = "ghcr.io/mostlygeek/llama-swap:cuda"

  # Three vision models for the benchmark sweep. All Apache-2.0, all GGUF
  # Q4_K_M (T4 has no FP8/BF16 — INT4 is the right knob). Image long-edge
  # capped at 1024 px to keep prefill <2s on the T4.
  #
  # Filenames are matched by glob in the download Job (huggingface_hub
  # snapshot_download with allow_patterns). Stable symlinks model.gguf /
  # mmproj.gguf are created after download so llama-swap config can be
  # filename-agnostic.
  # `text_only = true` skips mmproj download + --mmproj flag (text-only LLM).
  # Vision models keep `text_only = false`; both the download script and
  # the cmd renderer below treat a missing `text_only` as false.
  models = {
    qwen3vl-8b = {
      hf_repo        = "Qwen/Qwen3-VL-8B-Instruct-GGUF"
      gguf_pattern   = "*Q4_K_M*.gguf"
      mmproj_pattern = "*mmproj*.gguf"
      ctx_size       = 3072
      gpu_layers     = 99
      text_only      = false
    }
    minicpm-v-4-5 = {
      hf_repo        = "openbmb/MiniCPM-V-4_5-gguf"
      gguf_pattern   = "*Q4_K_M*.gguf"
      mmproj_pattern = "*mmproj*.gguf"
      ctx_size       = 3072
      gpu_layers     = 99
      text_only      = false
    }
    qwen3vl-4b = {
      hf_repo        = "Qwen/Qwen3-VL-4B-Instruct-GGUF"
      gguf_pattern   = "*Q4_K_M*.gguf"
      mmproj_pattern = "*mmproj*.gguf"
      ctx_size       = 3072
      gpu_layers     = 99
      text_only      = false
    }
    # Text-only triage / drafting model for recruiter-responder.
    # Q4_K_M, ~4.7GB, 32k native context (capped at 16k here — plenty
    # for recruiter emails + extraction prompt + JSON output).
    # Unsloth's GGUF: well-maintained, includes Q4_K_M. Qwen3 is a
    # thinking-capable model; recruiter-responder disables thinking via
    # `enable_thinking=false` in the chat-template kwargs.
    qwen3-8b = {
      hf_repo        = "unsloth/Qwen3-8B-GGUF"
      gguf_pattern   = "*Q4_K_M*.gguf"
      mmproj_pattern = ""
      ctx_size       = 16384
      gpu_layers     = 99
      text_only      = true
    }
  }

  # YAML config rendered into the ConfigMap. llama-swap reads /app/config.yaml.
  # ${PORT} is substituted by llama-swap; ${MODEL_ID} is the model key.
  # (`$${PORT}` below is the Terraform escape that emits a literal `${PORT}`.)
  llama_swap_config = yamlencode({
    healthCheckTimeout = 180 # 60-90s is typical model load on NFS-SSD
    logLevel           = "info"
    logToStdout        = "both"
    startPort          = 5800

    macros = {
      # Not referenced by the per-model cmd strings below; kept in sync
      # with their flags (`-fa on`, not the bare `-fa` form) so it is safe
      # to adopt later.
      llama_server_base = "/app/llama-server --host 0.0.0.0 --port $${PORT} --jinja -fa on -np 1"
    }

    models = {
      for mid, cfg in local.models : mid => {
        # Command line = common prefix + optional vision projector + tuning
        # flags. try() makes `text_only` optional (default false), matching
        # the download script's cfg.get("text_only", False).
        cmd = join(" ", concat([
          "/app/llama-server",
          "--host 0.0.0.0",
          "--port $${PORT}",
          "-m /models/${mid}/model.gguf",
          ], try(cfg.text_only, false) ? [] : [
          "--mmproj /models/${mid}/mmproj.gguf",
          ], [
          "-ngl ${cfg.gpu_layers}",
          "-c ${cfg.ctx_size}",
          "-np 1",
          "--jinja",
          "-fa on",
        ]))
        ttl           = 600 # unload after 10 min idle
        checkEndpoint = "/health"
      }
    }
  })
}
|
|
|
|
# Namespace that holds every llama-cpp resource in this stack.
resource "kubernetes_namespace" "llama_cpp" {
  metadata {
    name = local.namespace

    labels = {
      "istio-injection" = "disabled"
      tier              = local.tiers.gpu
    }
  }

  lifecycle {
    # This label is excluded from drift detection so changes made to it
    # outside Terraform are not reverted on apply.
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
|
|
|
|
# Shared model store. NFS-RWX so the download Job can write while
|
|
# the llama-swap Deployment mounts it. Path /srv/nfs-ssd/llamacpp on
|
|
# the Proxmox host (SSD-backed for fast model load — Q4_K_M 8B mmaps in
|
|
# ~2s vs ~10s on HDD NFS). Page-cache is warmed by the download Job so
|
|
# first inference reads from warm cache.
|
|
# NFS-backed volume for model files; its claim is mounted at /models by
# both the download Job and the llama-swap Deployment.
module "nfs_models" {
  source = "../../modules/kubernetes/nfs_volume"

  name      = "llama-cpp-models"
  namespace = kubernetes_namespace.llama_cpp.metadata[0].name
  storage   = "30Gi"

  nfs_server = "192.168.1.127"
  nfs_path   = "/srv/nfs-ssd/llamacpp"
}
|
|
|
|
# One-shot download Job. For every model in local.models it pulls the
# Q4_K_M GGUF (plus the mmproj for vision models; `text_only = true`
# models skip it) into /models/<id>/, creates stable model.gguf /
# mmproj.gguf symlinks, then warms the page cache. Idempotent —
# huggingface_hub's snapshot_download skips files that already exist
# with matching size; symlinks are recreated each run.
|
|
resource "kubernetes_job_v1" "download_models" {
  metadata {
    name      = "download-models"
    namespace = kubernetes_namespace.llama_cpp.metadata[0].name
    labels    = local.labels
  }

  spec {
    backoff_limit              = 2
    ttl_seconds_after_finished = 86400

    template {
      metadata { labels = local.labels }

      spec {
        restart_policy = "OnFailure"

        container {
          name  = "download"
          image = "python:3.12-slim"

          # The script has two phases: a Python heredoc that downloads each
          # model and (re)creates the stable symlinks, then a bash `find`
          # that reads every real GGUF once to warm the page cache.
          command = ["/bin/bash", "-c", <<-EOT
            set -euo pipefail
            pip install --quiet --no-cache-dir 'huggingface_hub>=0.24'
            python - <<'PY'
            import json, os, glob
            from huggingface_hub import snapshot_download

            # Stable link names that llama-swap's config points at. They live
            # next to the real files, so they must never be picked as a link
            # target: "*mmproj*.gguf" matches "mmproj.gguf" itself, and
            # linking it to itself leaves a symlink loop.
            STABLE_LINKS = ("model.gguf", "mmproj.gguf")

            def candidates(local_dir, pattern):
                # Real files matching `pattern`, in sorted order so the pick
                # is the same on every run when several files match.
                return sorted(
                    p for p in glob.glob(os.path.join(local_dir, pattern))
                    if os.path.basename(p) not in STABLE_LINKS
                )

            def relink(link, target):
                # Replace whatever is at `link` with a relative symlink to
                # `target` (same directory), so reruns stay idempotent.
                if os.path.islink(link) or os.path.exists(link):
                    os.unlink(link)
                os.symlink(os.path.basename(target), link)

            models = json.loads(os.environ["MODELS_JSON"])
            for mid, cfg in models.items():
                local_dir = f"/models/{mid}"
                os.makedirs(local_dir, exist_ok=True)
                text_only = cfg.get("text_only", False)
                patterns = [cfg["gguf_pattern"]]
                if not text_only and cfg.get("mmproj_pattern"):
                    patterns.append(cfg["mmproj_pattern"])
                print(f"==> downloading {mid} from {cfg['hf_repo']} -> {local_dir} (text_only={text_only})", flush=True)
                snapshot_download(
                    repo_id=cfg["hf_repo"],
                    local_dir=local_dir,
                    allow_patterns=patterns,
                    token=os.environ.get("HF_TOKEN") or None,
                    # Single-threaded download — multi-worker buffers
                    # multi-GB chunks per worker and OOMs the Job at 2Gi.
                    max_workers=1,
                )
                # Resolve actual filenames and create stable symlinks so
                # llama-swap config is filename-agnostic. The same per-model
                # patterns that drove the download are used here.
                ggufs = [
                    p for p in candidates(local_dir, cfg["gguf_pattern"])
                    if "mmproj" not in os.path.basename(p).lower()
                ]
                if not ggufs:
                    raise SystemExit(f"no GGUF found in {local_dir}")
                relink(f"{local_dir}/model.gguf", ggufs[0])
                if not text_only:
                    mmprojs = candidates(local_dir, cfg.get("mmproj_pattern") or "*mmproj*.gguf")
                    if not mmprojs:
                        raise SystemExit(f"no mmproj found in {local_dir}")
                    relink(f"{local_dir}/mmproj.gguf", mmprojs[0])
                print(f"==> done {mid}", flush=True)
                for f in sorted(os.listdir(local_dir)):
                    full = os.path.join(local_dir, f)
                    if os.path.isfile(full) and not os.path.islink(full):
                        print(f"  {f} ({os.path.getsize(full):,} bytes)", flush=True)
            print("==> warming page cache", flush=True)
            PY
            # Warm the kernel page cache so first inference reads warm.
            # Wrapped in bash (not the Python heredoc) to keep the cat
            # output out of stdout buffering.
            find /models -type f -name '*.gguf' ! -name 'model.gguf' ! -name 'mmproj.gguf' \
              -exec sh -c 'cat "$1" > /dev/null' _ {} \;
            echo "ALL DONE"
            EOT
          ]

          # Model table consumed by the Python script above.
          env {
            name  = "MODELS_JSON"
            value = jsonencode(local.models)
          }
          env {
            name  = "HF_HUB_ENABLE_HF_TRANSFER"
            value = "0"
          }
          # Optional: HF token from Vault (rate-limit avoidance). Sourced
          # from the existing `viktor` Vault path which holds personal
          # creds. Empty string is acceptable (anonymous downloads).
          env {
            name = "HF_TOKEN"
            value_from {
              secret_key_ref {
                name     = "hf-token"
                key      = "token"
                optional = true
              }
            }
          }

          volume_mount {
            name       = "models"
            mount_path = "/models"
          }

          resources {
            requests = { cpu = "100m", memory = "256Mi" }
            # 4Gi covers the worst-case huggingface_hub buffer (single
            # 5GB GGUF chunked over HTTP) plus interpreter overhead.
            # 2Gi was hit by the previous run.
            limits = { memory = "4Gi" }
          }
        }

        volume {
          name = "models"
          persistent_volume_claim {
            claim_name = module.nfs_models.claim_name
          }
        }
      }
    }
  }

  # Create the Job and return; apply does not wait for it to finish.
  wait_for_completion = false

  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
      metadata[0].annotations,
    ]
  }
}
|
|
|
|
# Holds the rendered llama-swap YAML under the key `config.yaml`; the
# Deployment mounts that key at /app/config.yaml.
resource "kubernetes_config_map" "llama_swap_config" {
  metadata {
    name      = "llama-swap-config"
    namespace = kubernetes_namespace.llama_cpp.metadata[0].name
    labels    = local.labels
  }

  data = { "config.yaml" = local.llama_swap_config }
}
|
|
|
|
# Single Deployment running llama-swap. Spawns per-model llama-server
|
|
# subprocesses on demand and unloads them after `ttl` seconds idle.
|
|
# The whole T4 is allocated to this pod via nvidia.com/gpu=1; immich-ml
|
|
# must be scaled to 0 during benchmark runs.
|
|
resource "kubernetes_deployment" "llama_swap" {
  # Ordered after the download Job resource. The Job itself is created
  # with wait_for_completion = false, so this orders creation only.
  depends_on = [kubernetes_job_v1.download_models]

  metadata {
    name      = "llama-swap"
    namespace = kubernetes_namespace.llama_cpp.metadata[0].name
    labels    = merge(local.labels, { tier = local.tiers.gpu })
  }

  # Don't block apply on rollout — the GPU is shared with immich-ml and
  # the pod stays Pending until the operator scales immich-ml=0 for a
  # benchmark window. Apply is "create the desired state, don't wait
  # for it to be reachable".
  wait_for_rollout = false

  spec {
    replicas = 1

    strategy {
      type = "Recreate"
    }

    selector {
      match_labels = {
        app       = "llama-cpp"
        component = "llama-swap"
      }
    }

    template {
      metadata {
        labels = {
          app       = "llama-cpp"
          component = "llama-swap"
        }
        annotations = {
          # Bounce the pod whenever the configmap content changes.
          "checksum/config" = sha256(local.llama_swap_config)
        }
      }

      spec {
        node_selector = { gpu = "true" }

        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }

        container {
          name  = "llama-swap"
          image = local.llamaswap_image
          args  = ["-config", "/app/config.yaml", "-listen", ":8080"]

          port {
            name           = "http"
            container_port = 8080
          }

          volume_mount {
            name       = "models"
            mount_path = "/models"
          }
          # Only the config.yaml key is projected, as a single file.
          volume_mount {
            name       = "config"
            mount_path = "/app/config.yaml"
            sub_path   = "config.yaml"
          }

          # llama-swap returns 200 on / once running; per-model readiness
          # is gated by the model's own /health endpoint (configured in
          # the YAML as checkEndpoint).
          readiness_probe {
            initial_delay_seconds = 5
            period_seconds        = 10
            failure_threshold     = 6
            http_get {
              path = "/"
              port = 8080
            }
          }

          liveness_probe {
            initial_delay_seconds = 30
            period_seconds        = 30
            failure_threshold     = 5
            http_get {
              path = "/"
              port = 8080
            }
          }

          resources {
            requests = {
              cpu    = "200m"
              memory = "2Gi"
            }
            limits = {
              memory           = "12Gi"
              "nvidia.com/gpu" = "1"
            }
          }
        }

        volume {
          name = "models"
          persistent_volume_claim {
            claim_name = module.nfs_models.claim_name
          }
        }
        volume {
          name = "config"
          config_map {
            name = kubernetes_config_map.llama_swap_config.metadata[0].name
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
    ]
  }
}
|
|
|
|
# ClusterIP Service in front of the llama-swap pod; port 8080 forwards to
# the container's 8080 listener.
resource "kubernetes_service" "llama_swap" {
  metadata {
    name      = "llama-swap"
    namespace = kubernetes_namespace.llama_cpp.metadata[0].name
    labels    = local.labels
  }

  spec {
    type = "ClusterIP"

    # Same label pair as the Deployment's pod template.
    selector = {
      component = "llama-swap"
      app       = "llama-cpp"
    }

    port {
      name        = "http"
      port        = 8080
      target_port = 8080
    }
  }
}
|