From b10233975b8a22ee6270c86fe99ae627029a07a0 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Fri, 29 May 2026 06:20:03 +0000
Subject: [PATCH] llama-cpp: restore replicas to 1; fire-planner: fix
 llama-swap URL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

llama-cpp was scaled to 0 replicas during the 2026-05-25 IO-storm
recovery (TEMP-SCALEDOWN). The cluster is now stable, and only frigate
competes for the GPU on k8s-node1. Restoring replicas to 1 unblocks
fire-planner's Reddit examples ingest, which needs qwen3-8b for
structured extraction.

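fire-planner's llama_cpp_base_url default pointed at a non-existent
service:port (llama-cpp:8000) — the real service is `llama-swap` on
port 8080. Because of this, the first bulk Job (2026-05-28) exited 0
but ingested 0 rows. This commit corrects the default.

Also includes whitespace-only alignment changes in
stacks/llama-cpp/main.tf (namespace labels, ignore_changes comment).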
---
 stacks/fire-planner/main.tf |  7 +++++--
 stacks/llama-cpp/main.tf    | 16 +++++++++-------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/stacks/fire-planner/main.tf b/stacks/fire-planner/main.tf
index 36cf36e1..fa3fe4f7 100644
--- a/stacks/fire-planner/main.tf
+++ b/stacks/fire-planner/main.tf
@@ -620,8 +620,11 @@ resource "kubernetes_config_map" "grafana_fire_planner_datasource" {
 
 variable "llama_cpp_base_url" {
   type        = string
-  description = "llama-cpp /v1/chat/completions endpoint for primary LLM extraction"
-  default     = "http://llama-cpp.llama-cpp.svc.cluster.local:8000/v1/chat/completions"
+  description = "llama-swap /v1/chat/completions endpoint for primary LLM extraction"
+  # Points at the `llama-swap` Service (the proxy in front of the llama-cpp
+  # pod) on port 8080. The previous default, `llama-cpp:8000`, does not exist;
+  # the first bulk Job (2026-05-28) exited 0 with 0 rows because of it.
+  default = "http://llama-swap.llama-cpp.svc.cluster.local:8080/v1/chat/completions"
 }
 
 variable "claude_agent_service_url" {
diff --git a/stacks/llama-cpp/main.tf b/stacks/llama-cpp/main.tf
index 6e2a324c..c0719bbf 100644
--- a/stacks/llama-cpp/main.tf
+++ b/stacks/llama-cpp/main.tf
@@ -99,8 +99,8 @@ resource "kubernetes_namespace" "llama_cpp" {
   metadata {
     name = local.namespace
     labels = {
-      tier              = local.tiers.gpu
-      "istio-injection" = "disabled"
+      tier               = local.tiers.gpu
+      "istio-injection"  = "disabled"
       "keel.sh/enrolled" = "true"
     }
   }
@@ -280,10 +280,12 @@ resource "kubernetes_deployment" "llama_swap" {
   # for it to be reachable".
   wait_for_rollout = false
   spec {
-    # TEMP-SCALEDOWN-2026-05-25-IO-STORM: scaled to 0 during cluster recovery.
-    # Restore to 1 when cluster is fully stable. See post-mortem
-    # docs/post-mortems/2026-05-25-immich-anca-elements-io-storm.md.
-    replicas = 0
+    # Restored to 1 on 2026-05-29 (was 0 during 2026-05-25 IO-storm recovery —
+    # see docs/post-mortems/2026-05-25-immich-anca-elements-io-storm.md). The
+    # immediate trigger was fire-planner's examples ingest needing qwen3-8b for
+    # bulk Reddit-post extraction; only frigate is currently on the GPU on
+    # k8s-node1 so contention is minimal.
+    replicas = 1
     strategy { type = "Recreate" }
 
     selector {
@@ -380,7 +382,7 @@ resource "kubernetes_deployment" "llama_swap" {
       metadata[0].annotations["keel.sh/policy"],
       metadata[0].annotations["keel.sh/trigger"],
       metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
-      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE
+      spec[0].template[0].spec[0].container[0].image,  # KEEL_IGNORE_IMAGE
       # KEEL_LIFECYCLE_V1 — stop the apply→keel fight: every keel digest
       # update patches `keel.sh/update-time` on the pod template and
       # `kubernetes.io/change-cause` + bumps the K8s rollout revision on