locals {
  namespace = "llama-cpp"
  labels    = { app = "llama-cpp" }

  # llama-swap fronts per-model llama.cpp instances. The :cuda image
  # ships a recent llama-server inside, which is what gets spawned per
  # model. One Service, one /v1 endpoint, model selected by the
  # OpenAI `model` field. mostlygeek/llama-swap is production-grade
  # (3.9k★, v211, May 2026).
  llamaswap_image = "ghcr.io/mostlygeek/llama-swap:cuda"

  # Three vision models for the benchmark sweep. All Apache-2.0, all GGUF
  # Q4_K_M (T4 has no FP8/BF16 — INT4 is the right knob). Image long-edge
  # capped at 1024 px to keep prefill <2s on the T4.
  #
  # Filenames are matched by glob in the download Job (huggingface_hub
  # snapshot_download with allow_patterns). Stable symlinks model.gguf /
  # mmproj.gguf are created after download so the llama-swap config can
  # be filename-agnostic.
  models = {
    qwen3vl-8b = {
      hf_repo        = "Qwen/Qwen3-VL-8B-Instruct-GGUF"
      gguf_pattern   = "*Q4_K_M*.gguf"
      mmproj_pattern = "*mmproj*.gguf"
      ctx_size       = 3072
      gpu_layers     = 99
    }
    minicpm-v-4-5 = {
      hf_repo        = "openbmb/MiniCPM-V-4_5-gguf"
      gguf_pattern   = "*Q4_K_M*.gguf"
      mmproj_pattern = "*mmproj*.gguf"
      ctx_size       = 3072
      gpu_layers     = 99
    }
    qwen3vl-4b = {
      hf_repo        = "Qwen/Qwen3-VL-4B-Instruct-GGUF"
      gguf_pattern   = "*Q4_K_M*.gguf"
      mmproj_pattern = "*mmproj*.gguf"
      ctx_size       = 3072
      gpu_layers     = 99
    }
  }

  # YAML config rendered into the ConfigMap. llama-swap reads
  # /app/config.yaml. The PORT placeholder is substituted by llama-swap at
  # spawn time; it is written with a doubled dollar sign below so that
  # Terraform emits it literally instead of trying to interpolate it.
  llama_swap_config = yamlencode({
    healthCheckTimeout = 180 # 60-90s is typical model load on NFS-SSD
    logLevel           = "info"
    logToStdout        = "both"
    startPort          = 5800

    macros = {
      # Kept in sync with the flags used by every per-model `cmd` below.
      # Note `-fa on`: the flag takes an explicit value, so a bare `-fa`
      # would consume the next argument as its value.
      llama_server_base = "/app/llama-server --host 0.0.0.0 --port $${PORT} -np 1 --jinja -fa on"
    }

    models = {
      for mid, cfg in local.models : mid => {
        cmd = join(" ", [
          "/app/llama-server",
          "--host 0.0.0.0",
          "--port $${PORT}",
          "-m /models/${mid}/model.gguf",
          "--mmproj /models/${mid}/mmproj.gguf",
          "-ngl ${cfg.gpu_layers}",
          "-c ${cfg.ctx_size}",
          "-np 1",
          "--jinja",
          "-fa on",
        ])
        ttl           = 600 # unload after 10 min idle
        checkEndpoint = "/health"
      }
    }
  })
}

# Dedicated namespace for the llama.cpp stack. Istio sidecar injection is
# disabled; the goldilocks VPA label is managed out-of-band and ignored.
resource "kubernetes_namespace" "llama_cpp" {
  metadata {
    name = local.namespace
    labels = {
      tier              = local.tiers.gpu
      "istio-injection" = "disabled"
    }
  }

  lifecycle {
    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
  }
}

# Shared model store. NFS-RWX so the download Job can write while
# the llama-swap Deployment mounts it. Path /srv/nfs-ssd/llamacpp on
# the Proxmox host (SSD-backed for fast model load — Q4_K_M 8B mmaps in
# ~2s vs ~10s on HDD NFS). Page-cache is warmed by the download Job so
# first inference reads from warm cache.
module "nfs_models" {
  source     = "../../modules/kubernetes/nfs_volume"
  name       = "llama-cpp-models"
  namespace  = kubernetes_namespace.llama_cpp.metadata[0].name
  nfs_server = "192.168.1.127"
  nfs_path   = "/srv/nfs-ssd/llamacpp"
  storage    = "30Gi"
}

# One-shot download Job. Pulls Q4_K_M GGUF + mmproj for every model in
# locals.models into /models/<model-id>/, creates stable model.gguf /
# mmproj.gguf symlinks, then warms the page cache. Idempotent —
# huggingface_hub's snapshot_download skips files that already exist
# with matching size; symlinks are recreated each run.
resource "kubernetes_job_v1" "download_models" {
  metadata {
    name      = "download-models"
    namespace = kubernetes_namespace.llama_cpp.metadata[0].name
    labels    = local.labels
  }

  spec {
    backoff_limit              = 2
    ttl_seconds_after_finished = 86400

    template {
      metadata {
        labels = local.labels
      }

      spec {
        restart_policy = "OnFailure"

        container {
          name  = "download"
          image = "python:3.12-slim"

          # NOTE: every top-level line of the script (bash and Python alike)
          # sits at the same indentation so that, once the indented heredoc
          # is dedented, the Python code and the `PY` terminator land in
          # column 0 as bash and Python both require.
          command = ["/bin/bash", "-c", <<-EOT
            set -euo pipefail
            pip install --quiet --no-cache-dir 'huggingface_hub>=0.24'
            python - <<'PY'
            import glob
            import json
            import os
            from huggingface_hub import snapshot_download
            #
            # Stable link names. They must never be picked as a link target:
            # "*mmproj*.gguf" also matches "mmproj.gguf", so on a re-run the
            # link left by the previous run could be chosen and replaced by a
            # symlink pointing at itself.
            STABLE_NAMES = ("model.gguf", "mmproj.gguf")
            #
            def candidates(local_dir, pattern):
                """Return sorted paths in local_dir matching pattern, minus the stable links."""
                found = glob.glob(os.path.join(local_dir, pattern))
                return sorted(p for p in found if os.path.basename(p) not in STABLE_NAMES)
            #
            models = json.loads(os.environ["MODELS_JSON"])
            for mid, cfg in models.items():
                local_dir = f"/models/{mid}"
                os.makedirs(local_dir, exist_ok=True)
                print(f"==> downloading {mid} from {cfg['hf_repo']} -> {local_dir}", flush=True)
                snapshot_download(
                    repo_id=cfg["hf_repo"],
                    local_dir=local_dir,
                    allow_patterns=[cfg["gguf_pattern"], cfg["mmproj_pattern"]],
                    token=os.environ.get("HF_TOKEN") or None,
                    # Single-threaded download — multi-worker buffers
                    # multi-GB chunks per worker and OOMs the Job at 2Gi.
                    max_workers=1,
                )
                # Resolve actual filenames and create stable symlinks so
                # llama-swap config is filename-agnostic. The same per-model
                # patterns used for the download are used here, and the
                # candidates are sorted so the choice is deterministic (and
                # the first shard wins for split GGUFs).
                ggufs = [
                    p for p in candidates(local_dir, cfg["gguf_pattern"])
                    if "mmproj" not in os.path.basename(p).lower()
                ]
                mmprojs = candidates(local_dir, cfg["mmproj_pattern"])
                if not ggufs:
                    raise SystemExit(f"no GGUF found in {local_dir}")
                if not mmprojs:
                    raise SystemExit(f"no mmproj found in {local_dir}")
                gguf_link = f"{local_dir}/model.gguf"
                mmproj_link = f"{local_dir}/mmproj.gguf"
                for link, target in ((gguf_link, ggufs[0]), (mmproj_link, mmprojs[0])):
                    # lexists is also true for dangling symlinks.
                    if os.path.lexists(link):
                        os.unlink(link)
                    # Relative target: the link stays valid wherever the
                    # volume is mounted.
                    os.symlink(os.path.basename(target), link)
                print(f"==> done {mid}", flush=True)
                for name in sorted(os.listdir(local_dir)):
                    full = os.path.join(local_dir, name)
                    if os.path.isfile(full) and not os.path.islink(full):
                        print(f" {name} ({os.path.getsize(full):,} bytes)", flush=True)
            print("==> warming page cache", flush=True)
            PY
            # Warm the kernel page cache so first inference reads warm.
            # Wrapped in bash (not the Python heredoc) to keep the cat
            # output out of stdout buffering.
            find /models -type f -name '*.gguf' ! -name 'model.gguf' ! -name 'mmproj.gguf' \
              -exec sh -c 'cat "$1" > /dev/null' _ {} \;
            echo "ALL DONE"
            EOT
          ]

          env {
            name  = "MODELS_JSON"
            value = jsonencode(local.models)
          }

          env {
            name  = "HF_HUB_ENABLE_HF_TRANSFER"
            value = "0"
          }

          # Optional: HF token from Vault (rate-limit avoidance). Sourced
          # from the existing `viktor` Vault path which holds personal
          # creds. Empty string is acceptable (anonymous downloads).
          env {
            name = "HF_TOKEN"
            value_from {
              secret_key_ref {
                name     = "hf-token"
                key      = "token"
                optional = true
              }
            }
          }

          volume_mount {
            name       = "models"
            mount_path = "/models"
          }

          resources {
            requests = { cpu = "100m", memory = "256Mi" }
            # 4Gi covers the worst-case huggingface_hub buffer (single
            # 5GB GGUF chunked over HTTP) plus interpreter overhead.
            # 2Gi was hit by the previous run.
            limits = { memory = "4Gi" }
          }
        }

        volume {
          name = "models"
          persistent_volume_claim {
            claim_name = module.nfs_models.claim_name
          }
        }
      }
    }
  }

  # Fire-and-forget: apply does not block on multi-GB downloads.
  wait_for_completion = false

  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
      metadata[0].annotations,
    ]
  }
}

# Rendered llama-swap configuration, mounted into the Deployment as
# /app/config.yaml.
resource "kubernetes_config_map" "llama_swap_config" {
  metadata {
    name      = "llama-swap-config"
    namespace = kubernetes_namespace.llama_cpp.metadata[0].name
    labels    = local.labels
  }

  data = {
    "config.yaml" = local.llama_swap_config
  }
}

# Single Deployment running llama-swap. Spawns per-model llama-server
# subprocesses on demand and unloads them after `ttl` seconds idle.
# The whole T4 is allocated to this pod via nvidia.com/gpu=1; immich-ml
# must be scaled to 0 during benchmark runs.
resource "kubernetes_deployment" "llama_swap" {
  metadata {
    name      = "llama-swap"
    namespace = kubernetes_namespace.llama_cpp.metadata[0].name
    labels    = merge(local.labels, { tier = local.tiers.gpu })
  }

  # Don't block apply on rollout — the GPU is shared with immich-ml and
  # the pod stays Pending until the operator scales immich-ml=0 for a
  # benchmark window. Apply is "create the desired state, don't wait
  # for it to be reachable".
  wait_for_rollout = false

  spec {
    replicas = 1

    # Recreate: the old pod must release the single GPU before the new
    # one can be scheduled.
    strategy {
      type = "Recreate"
    }

    selector {
      match_labels = { app = "llama-cpp", component = "llama-swap" }
    }

    template {
      metadata {
        labels = { app = "llama-cpp", component = "llama-swap" }
        annotations = {
          # Bounce the pod whenever the configmap content changes.
          "checksum/config" = sha256(local.llama_swap_config)
        }
      }

      spec {
        node_selector = { gpu = "true" }

        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }

        container {
          name  = "llama-swap"
          image = local.llamaswap_image
          args  = ["-config", "/app/config.yaml", "-listen", ":8080"]

          port {
            container_port = 8080
            name           = "http"
          }

          volume_mount {
            name       = "models"
            mount_path = "/models"
          }

          # sub_path mounts do not pick up ConfigMap updates in place;
          # the checksum annotation above handles reloads by restarting
          # the pod.
          volume_mount {
            name       = "config"
            mount_path = "/app/config.yaml"
            sub_path   = "config.yaml"
          }

          # llama-swap returns 200 on / once running; per-model readiness
          # is gated by the model's own /health endpoint (configured in
          # the YAML as checkEndpoint).
          readiness_probe {
            http_get {
              path = "/"
              port = 8080
            }
            initial_delay_seconds = 5
            period_seconds        = 10
            failure_threshold     = 6
          }

          liveness_probe {
            http_get {
              path = "/"
              port = 8080
            }
            initial_delay_seconds = 30
            period_seconds        = 30
            failure_threshold     = 5
          }

          resources {
            requests = {
              cpu    = "200m"
              memory = "2Gi"
            }
            limits = {
              memory           = "12Gi"
              "nvidia.com/gpu" = "1"
            }
          }
        }

        volume {
          name = "models"
          persistent_volume_claim {
            claim_name = module.nfs_models.claim_name
          }
        }

        volume {
          name = "config"
          config_map {
            name = kubernetes_config_map.llama_swap_config.metadata[0].name
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
    ]
  }

  # Create the download Job first so the model store is being populated
  # before the server comes up (the Job itself is not waited on).
  depends_on = [kubernetes_job_v1.download_models]
}

# In-cluster endpoint for the OpenAI-compatible API served by llama-swap.
resource "kubernetes_service" "llama_swap" {
  metadata {
    name      = "llama-swap"
    namespace = kubernetes_namespace.llama_cpp.metadata[0].name
    labels    = local.labels
  }

  spec {
    type = "ClusterIP"

    selector = {
      app       = "llama-cpp"
      component = "llama-swap"
    }

    port {
      name        = "http"
      port        = 8080
      target_port = 8080
    }
  }
}