From b10233975b8a22ee6270c86fe99ae627029a07a0 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 29 May 2026 06:20:03 +0000 Subject: [PATCH] llama-cpp: restore replicas to 1; fire-planner: fix llama-swap URL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit llama-cpp was scaled to 0 during 2026-05-25 IO-storm recovery (TEMP-SCALEDOWN). Cluster is now stable; only frigate competes for the GPU on k8s-node1. Restoring to 1 to unblock fire-planner's Reddit examples ingest, which needs qwen3-8b for structured extraction. fire-planner's llama_cpp_base_url default pointed at a non-existent service:port (llama-cpp:8000) — the real service is `llama-swap` on port 8080. First 2026-05-28 bulk Job exited 0 with 0 rows because of this. Correcting. --- stacks/fire-planner/main.tf | 7 +++++-- stacks/llama-cpp/main.tf | 16 +++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/stacks/fire-planner/main.tf b/stacks/fire-planner/main.tf index 36cf36e1..fa3fe4f7 100644 --- a/stacks/fire-planner/main.tf +++ b/stacks/fire-planner/main.tf @@ -620,8 +620,11 @@ resource "kubernetes_config_map" "grafana_fire_planner_datasource" { variable "llama_cpp_base_url" { type = string - description = "llama-cpp /v1/chat/completions endpoint for primary LLM extraction" - default = "http://llama-cpp.llama-cpp.svc.cluster.local:8000/v1/chat/completions" + description = "llama-swap /v1/chat/completions endpoint for primary LLM extraction" + # Service is named `llama-swap`, NOT `llama-cpp` — the proxy in front of + # the actual llama-cpp pod. Port 8080. (Initial 2026-05-28 value pointed + # at a non-existent service:port and the bulk Job produced 0 rows.) + default = "http://llama-swap.llama-cpp.svc.cluster.local:8080/v1/chat/completions" } variable "claude_agent_service_url" { diff --git a/stacks/llama-cpp/main.tf b/stacks/llama-cpp/main.tf index 6e2a324c..c0719bbf 100644 --- a/stacks/llama-cpp/main.tf +++ b/stacks/llama-cpp/main.tf @@ -99,8 +99,8 @@ resource "kubernetes_namespace" "llama_cpp" { metadata { name = local.namespace labels = { - tier = local.tiers.gpu - "istio-injection" = "disabled" + tier = local.tiers.gpu + "istio-injection" = "disabled" "keel.sh/enrolled" = "true" } } @@ -280,10 +280,12 @@ resource "kubernetes_deployment" "llama_swap" { # for it to be reachable". wait_for_rollout = false spec { - # TEMP-SCALEDOWN-2026-05-25-IO-STORM: scaled to 0 during cluster recovery. - # Restore to 1 when cluster is fully stable. See post-mortem - # docs/post-mortems/2026-05-25-immich-anca-elements-io-storm.md. - replicas = 0 + # Restored to 1 on 2026-05-29 (was 0 during 2026-05-25 IO-storm recovery — + # see docs/post-mortems/2026-05-25-immich-anca-elements-io-storm.md). The + # immediate trigger was fire-planner's examples ingest needing qwen3-8b for + # bulk Reddit-post extraction; only frigate is currently on the GPU on + # k8s-node1 so contention is minimal. + replicas = 1 strategy { type = "Recreate" } selector { @@ -380,7 +382,7 @@ resource "kubernetes_deployment" "llama_swap" { metadata[0].annotations["keel.sh/policy"], metadata[0].annotations["keel.sh/trigger"], metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 - spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE + spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE # KEEL_LIFECYCLE_V1 — stop the apply→keel fight: every keel digest # update patches `keel.sh/update-time` on the pod template and # `kubernetes.io/change-cause` + bumps the K8s rollout revision on