infra/stacks/llama-cpp/terragrunt.hcl

# Inherit the shared root Terragrunt configuration. find_in_parent_folders()
# walks up from this directory to locate the parent config file, so this
# stack does not hard-code the path to the repository root.
include "root" {
  path = find_in_parent_folders()
}

# Declare a dependency on the sibling "platform" stack. skip_outputs = true
# means none of its outputs are read by this stack.
# NOTE(review): presumably this exists only to order applies (platform
# before llama-cpp) — confirm no inputs are expected from it.
dependency "platform" {
  config_path  = "../platform"
  skip_outputs = true
}

# Declare a dependency on the sibling "vault" stack. skip_outputs = true
# means none of its outputs are read by this stack.
# NOTE(review): presumably this exists only to order applies (vault before
# llama-cpp) — confirm no inputs are expected from it.
dependency "vault" {
  config_path  = "../vault"
  skip_outputs = true
}

# llama-cpp: in-cluster vision-LLM server. One Deployment of
# `mostlygeek/llama-swap:cuda` fronts three models (qwen3vl-8b,
# minicpm-v-4-5, qwen3vl-4b) at a single OpenAI-compat /v1 endpoint
# on Service `llama-swap`. llama-swap loads/unloads per-model
# llama-server subprocesses on demand (idle TTL 10 min). The T4 is
# allocated wholly to this pod; immich-ml must be scaled to 0 during
# benchmark runs. See infra/docs/architecture/llama-cpp.md for the
# full rationale (build ≥ b6907 for Qwen3-VL, T4 FP16/INT4 only,
# llama-swap over Ollama, etc.).
infra/llama-cpp: add stack — llama-swap fronting Qwen3-VL + MiniCPM-V

Single Deployment of mostlygeek/llama-swap:cuda hot-swaps three GGUF vision models (qwen3vl-8b, minicpm-v-4-5, qwen3vl-4b) at one OpenAI-compat /v1 endpoint on Service llama-swap.llama-cpp.svc. Idle TTL 10 min so models unload between benchmark batches.

Storage: NFS-RWX from /srv/nfs-ssd/llamacpp (30Gi). One-shot download Job pulls Q4_K_M GGUF + mmproj per model, creates stable model.gguf / mmproj.gguf symlinks so the llama-swap config is filename-agnostic, then warms the kernel page cache.

GPU: nvidia.com/gpu=1 = whole T4 — operator must scale immich-ml to 0 during benchmark windows. wait_for_rollout=false so apply doesn't block on GPU availability.

Initial use case: vision-LLM benchmark for instagram-poster candidate scoring; future consumers (HA, agentic tooling) hit the same endpoint via LiteLLM at the gateway.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

2026-05-10 14:13:40 +00:00
			`include "root" {`
			`path = find_in_parent_folders()`
			`}`

			`dependency "platform" {`
			`config_path = "../platform"`
			`skip_outputs = true`
			`}`

			`dependency "vault" {`
			`config_path = "../vault"`
			`skip_outputs = true`
			`}`

			`# llama-cpp: in-cluster vision-LLM server. One Deployment of`
			`` # `mostlygeek/llama-swap:cuda` fronts three models (qwen3vl-8b, ``
			`# minicpm-v-4-5, qwen3vl-4b) at a single OpenAI-compat /v1 endpoint`
			`` # on Service `llama-swap`. llama-swap loads/unloads per-model ``
			`# llama-server subprocesses on demand (idle TTL 10 min). The T4 is`
			`# allocated wholly to this pod; immich-ml must be scaled to 0 during`
			`# benchmark runs. See infra/docs/architecture/llama-cpp.md for the`
			`# full rationale (build ≥ b6907 for Qwen3-VL, T4 FP16/INT4 only,`
			`# llama-swap over Ollama, etc.).`