Phase 7 of the vision-LLM benchmark plan. Adds: - docs/benchmarks/2026-05-10-vision-llm.md — curated report (TL;DR, per-model analysis, top-N agreement, cost vs cloud APIs, sample captions). Verdict: qwen3vl-4b for the request path (3.55 s p50, 100% parse, decisive top-N distro); qwen3vl-8b for caption polish. - docs/benchmarks/benchmark-2026-05-10-1424.json — raw 300-row dump for diff-checking against future runs. - main.tf: -fa -> -fa on (b9085 llama.cpp removed the no-value form of the flash-attention flag; without the value llama-server exits before serving any request). - llama-cpp.md architecture doc links the report so future operators land on the deployed-and-evaluated model from one entry point. 300/300 calls, 0 parse errors, 33m32s wall on a single T4 with the GPU exclusively allocated. immich-ml was scaled to 0 for the run (node1 RAM constraint, not GPU - bumping node1 RAM is tracked as a follow-up). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
37 lines
677 B
HCL
37 lines
677 B
HCL
# Generated by Terragrunt. Sig: nIlQXj57tbuaRZEa
|
|
terraform {
|
|
required_providers {
|
|
vault = {
|
|
source = "hashicorp/vault"
|
|
version = "~> 4.0"
|
|
}
|
|
cloudflare = {
|
|
source = "cloudflare/cloudflare"
|
|
version = "~> 4"
|
|
}
|
|
authentik = {
|
|
source = "goauthentik/authentik"
|
|
version = "~> 2024.10"
|
|
}
|
|
}
|
|
}
|
|
|
|
variable "kube_config_path" {
|
|
type = string
|
|
default = "~/.kube/config"
|
|
}
|
|
|
|
provider "kubernetes" {
|
|
config_path = var.kube_config_path
|
|
}
|
|
|
|
provider "helm" {
|
|
kubernetes = {
|
|
config_path = var.kube_config_path
|
|
}
|
|
}
|
|
|
|
provider "vault" {
|
|
address = "https://vault.viktorbarzin.me"
|
|
skip_child_token = true
|
|
}
|