infra/stacks/portal-stt/main.tf

# =============================================================================
# portal-stt — Speaches STT (Whisper large-v3-turbo int8) for portal-assistant
# =============================================================================
#
# DRAFT for operator review (portal-assistant issue #2). HITL apply: an agent
# drafts; the operator applies via GitOps (presence-claimed) and verifies the
# rollout. Do NOT `terragrunt apply` this from a worktree.
#
# WHAT: a single WARM-RESIDENT Speaches deployment (OpenAI-compatible
# faster-whisper server) serving `large-v3-turbo` int8, multilingual (Bulgarian
# + English), on the shared Tesla T4 (one time-slice). ClusterIP only — audio
# never leaves the LAN; the portal-assistant Gateway is the only externally
# exposed component (ADR-0001), so no ingress/auth here.
#
# WHY WARM-RESIDENT, NOT THE CHATTERBOX DEMAND-GATE:
#   The TTS (chatterbox) stack scales 0<->1 behind a free-VRAM CronJob gate
#   because it is a best-effort BATCH tenant (tripit narration) that can wait.
#   STT here is INTERACTIVE voice — every Turn would pay a multi-second cold
#   model load (download/mmap + CUDA init) if we scaled to zero. So this stack
#   keeps the model permanently loaded: replicas=1 + Speaches STT_MODEL_TTL=-1
#   (never unload) + PRELOAD_MODELS (load at startup). See portal-assistant
#   CONTEXT.md "Warm window" + ADR-0003.
#
# OOM HISTORY / VRAM MATH — the binding constraint is the shared T4 (16 GiB,
# time-sliced across immich-ml / frigate / llama-swap / android-emulator with
# NO per-tenant VRAM isolation). See
# docs/post-mortems/2026-06-02-immich-ml-ttl-gpu-oom-recruiter.md (immich-ml's
# unbounded onnxruntime arena starved llama-swap's qwen3-8b -> recruiter down).
#
#   Live residents measured 2026-06-17 (gpu_pod_memory_used_bytes):
#     immich-ml        ~2.1 GiB  (capped: MACHINE_LEARNING_MODEL_TTL=600)
#     frigate (8 proc) ~1.9 GiB  (detector + ffmpeg decode)
#     android-emulator ~0.15 GiB
#     llama-swap        0 idle, but loads qwen3-8b on demand = ~4.35 GiB peak
#                       (cudaMalloc 4455 MiB, per the post-mortem)
#   Worst-case concurrent baseline (everything hot): 2.1 + 1.9 + 0.15 + 4.35
#                                                   = ~8.5 GiB.
#   Speaches large-v3-turbo int8 weights ~= 0.8 GiB on disk; resident CTranslate2
#   int8 + CUDA context + decode buffers budget conservatively to ~1.5 GiB
#   (VERIFY at apply against gpu_pod_memory_used_bytes{namespace="portal-stt"}).
#
#   8.5 (residents) + 1.5 (this) = ~10.0 GiB used  =>  ~6 GiB T4 headroom.
#   That headroom is the safety margin against onnxruntime arena drift (the
#   exact failure mode from 2026-06-02). If a future resident grows, this is the
#   FIRST place to re-measure. The conservative int8 (not fp16) choice halves
#   our weight footprint precisely to protect this margin.
#
# GPU PRIORITY: this pod requests nvidia.com/gpu, so the Kyverno
# `inject-gpu-workload-priority` ClusterPolicy auto-stamps the immich-equal
# `gpu-workload` (1,200,000) priority — portal-stt is NOT in that policy's
# exclude list (only `tts` is, to keep chatterbox demotable). That is CORRECT
# here: warm interactive STT is a first-class GPU resident, never the first
# evicted. We also set priority_class_name explicitly so intent is legible at
# the call site and survives a policy fail-open. (Contrast tts/main.tf, which
# pins tier-2-gpu precisely so chatterbox IS evicted first.)
# =============================================================================

# Name of the TLS secret; forwarded unchanged to the setup_tls_secret module.
# Flagged sensitive, so Terraform redacts the value in plan/apply output.
variable "tls_secret_name" {
  sensitive = true
  type      = string
}

# NFS endpoint used by both the model volume module and the dir-init Job.
variable "nfs_server" {
  description = "NFS server (Proxmox host). From config.tfvars (192.168.1.127)."
  type        = string
}

# Container image for the Speaches server (ghcr.io/speaches-ai/speaches, CUDA
# build).
#
# Tag choice: the registry currently offers 0.9.0-rc.3-cuda (plus sha-/cuda-12.x
# variants) and a moving :latest-cuda; the last stable has no published
# :0.8.3-cuda. We pin the rc.3 CUDA tag because it is immutable-ish and, given
# the OOM/Keel-churn history, preferable to :latest.
#
# Driver compatibility: the image ships a CUDA 12.4/12.6 runtime, which runs
# fine on our 570.195.03 driver (CUDA 12.8, backward-compatible).
#
# OPEN ITEM for operator: at apply time, confirm the tag still resolves — or
# bump to the newest -cuda tag.
variable "speaches_image" {
  description = "Speaches CUDA image. Pin a -cuda tag, not :latest-cuda."
  type        = string
  default     = "ghcr.io/speaches-ai/speaches:0.9.0-rc.3-cuda"
}

# Which whisper model Speaches keeps warm, given as a HuggingFace repo id
# (Speaches resolves whisper models by HF repo id).
#
# The default is deepdml's CTranslate2 conversion of openai large-v3-turbo: the
# canonical community ct2 build, multilingual (incl. Bulgarian), and the model
# ADR-0003's FLEURS-bg bake-off measured at 8.3% WER.
variable "stt_model_id" {
  description = "HuggingFace repo id of the warm-resident whisper model."
  type        = string
  default     = "deepdml/faster-whisper-large-v3-turbo-ct2"
}

locals {
  # ---------------------------------------------------------------------------
  # How Speaches is configured (reference for the env blocks on the Deployment)
  # ---------------------------------------------------------------------------
  # Speaches reads its settings from env vars via pydantic-settings: a scalar
  # setting maps from its UPPER_SNAKE name, a nested whisper.* setting from
  # WHISPER__FIELD. The knobs that make this stack WARM-RESIDENT and int8:
  #
  #   PRELOAD_MODELS
  #       JSON list of models, loaded one after another at startup. The pod
  #       does not go Ready until the model is in VRAM, so the first Turn is
  #       never cold.
  #   STT_MODEL_TTL=-1
  #       Never unload an idle STT model (0 = unload immediately; the default
  #       is 300s). This is the warm-resident lever.
  #   WHISPER__COMPUTE_TYPE
  #       int8 — the conservative-VRAM choice (default "default" = fp16).
  #   WHISPER__INFERENCE_DEVICE
  #       cuda (default "auto").
  #
  # The HF cache is pointed at the NFS-SSD PVC so weights are downloaded once
  # and survive pod restarts; the image's default cache location,
  # /home/ubuntu/.cache/huggingface/hub, is ephemeral. Speaches runs as
  # uid 1000 (ubuntu).

  # Common label set for this stack's objects.
  labels = {
    app = "portal-stt"
  }

  # Namespace that holds every resource in this stack.
  namespace = "portal-stt"
}

# Namespace for the stack. It carries the GPU tier label, sets
# istio-injection=disabled and keel.sh/enrolled=true.
resource "kubernetes_namespace" "portal_stt" {
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: the goldilocks-vpa-auto-mode ClusterPolicy stamps
    # this label on every namespace, so Terraform must not fight it.
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }

  metadata {
    name = local.namespace

    labels = {
      "tier"             = local.tiers.gpu
      "istio-injection"  = "disabled"
      "keel.sh/enrolled" = "true"
    }
  }
}

# Calls the shared setup_tls_secret module for the portal-stt namespace.
# NOTE(review): the module source is not visible here — presumably it creates
# the TLS secret named by var.tls_secret_name in that namespace; confirm.
module "tls_secret" {
  source = "../../modules/kubernetes/setup_tls_secret"

  tls_secret_name = var.tls_secret_name
  namespace       = kubernetes_namespace.portal_stt.metadata[0].name
}

# NFS-SSD volume holding the model weights and HF cache: fast first load, and
# the data outlives pod restarts. Backed by /srv/nfs-ssd/portal-stt on the
# Proxmox host (192.168.1.127), following the chatterbox nfs_models pattern.
# RWX, so a future seed/inspect pod can mount it alongside the server.
module "nfs_models" {
  source = "../../modules/kubernetes/nfs_volume"

  name      = "portal-stt-models"
  namespace = kubernetes_namespace.portal_stt.metadata[0].name

  nfs_server = var.nfs_server
  nfs_path   = "/srv/nfs-ssd/portal-stt"

  # large-v3-turbo ct2 (~0.8Gi) + HF cache headroom
  storage = "10Gi"
}

# One-shot bootstrap: /srv/nfs-ssd is exported whole-tree but the portal-stt
# SUBDIR must exist before kubelet can bind-mount it (chatterbox hit exit 32 on
# a missing subdir the first window — see stacks/tts/main.tf). Mount the export
# ROOT (which exists) and mkdir the subtree; kubelet's mount retry then heals
# the main pod.
#
# OWNERSHIP: this container sets no security_context, so the script runs as the
# image's default user (root for stock busybox), whereas Speaches runs as
# uid 1000 and has to WRITE the HF cache under this tree. After mkdir we
# therefore hand both directories to 1000:1000. The chown is BEST-EFFORT: on a
# root_squash/all_squash export it is not permitted, so a failure only prints a
# warning instead of failing the Job. The closing `ls -lan` prints numeric
# owners so the operator can verify from the Job log that uid 1000 can write.
#
# LIFECYCLE: the script is idempotent (mkdir -p; chown of a fixed pair of
# dirs). Note ttl_seconds_after_finished garbage-collects the finished Job
# after 24h, so expect a later plan to propose re-creating it — re-running is
# harmless.
resource "kubernetes_job" "models_dir_init" {
  metadata {
    name      = "portal-stt-models-dir-init"
    namespace = kubernetes_namespace.portal_stt.metadata[0].name
    labels    = local.labels
  }
  spec {
    backoff_limit              = 3
    ttl_seconds_after_finished = 86400
    template {
      metadata { labels = local.labels }
      spec {
        restart_policy = "Never"
        container {
          name  = "mkdir"
          image = "busybox:1.37"
          command = ["sh", "-c", join(" && ", [
            # 1. Create the PVC subdir and the HF hub cache dir beneath it.
            "mkdir -p /mnt/portal-stt/hub",
            # 2. Best-effort hand-over to the Speaches runtime user (uid 1000).
            "{ chown 1000:1000 /mnt/portal-stt /mnt/portal-stt/hub || echo 'WARN: chown 1000:1000 failed (squashed NFS export?) - verify uid 1000 can write /srv/nfs-ssd/portal-stt' >&2; }",
            # 3. Log the result with numeric uid/gid for operator verification.
            "ls -lan /mnt/portal-stt",
          ])]
          volume_mount {
            name       = "nfs-ssd-root"
            mount_path = "/mnt"
          }
        }
        # The export ROOT, not the portal-stt subdir: the subdir may not exist
        # yet, which is exactly what this Job fixes.
        volume {
          name = "nfs-ssd-root"
          nfs {
            server = var.nfs_server
            path   = "/srv/nfs-ssd"
          }
        }
      }
    }
  }
  # Block the apply until the directory exists (or the Job has failed).
  wait_for_completion = true
  timeouts { create = "3m" }
}

# Warm-resident Speaches. replicas=1, NEVER scaled to zero (no off-peak gate,
# unlike tts) — the model stays in VRAM so interactive Turns never pay a cold
# load. wait_for_rollout left default (true): a plain apply SHOULD block until
# the model is loaded and the pod is Ready, surfacing a bad image/model early.
#
# ROLLOUT BUDGET: the startup probe alone allows ~10 min (60 x 10s) for the
# first model download, and the CUDA image pull comes on top of that. The
# provider's default create/update timeout is 10m, so a perfectly healthy first
# boot could time the apply out. `timeouts` below raises the wait to 20m.
#
# ORDERING: depends_on makes Terraform finish the dir-init Job first, so the pod
# is not created while the NFS subdirectory it mounts is still missing (kubelet
# mount retry would eventually heal it, but that burns rollout budget and emits
# FailedMount noise).
resource "kubernetes_deployment" "portal_stt" {
  depends_on = [kubernetes_job.models_dir_init]

  metadata {
    name      = "portal-stt"
    namespace = kubernetes_namespace.portal_stt.metadata[0].name
    labels    = merge(local.labels, { tier = local.tiers.gpu })
  }
  spec {
    replicas = 1
    # RWO is not in play (model PVC is RWX NFS), but Recreate avoids two pods
    # briefly double-loading the model into the shared T4 during a rollout.
    strategy { type = "Recreate" }
    selector {
      match_labels = { app = "portal-stt" }
    }
    template {
      metadata {
        labels = { app = "portal-stt" }
      }
      spec {
        # Schedule onto the GPU node and tolerate its NoSchedule taint.
        node_selector = { "nvidia.com/gpu.present" = "true" }
        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }
        # First-class GPU resident (warm interactive STT) — same priority as
        # immich-ml. Kyverno would stamp this anyway (portal-stt is not in the
        # gpu-priority exclude list); set explicitly for legibility + fail-open
        # safety. NOT tier-2-gpu (that is chatterbox's evict-first demotion).
        priority_class_name = "gpu-workload"

        container {
          name  = "portal-stt"
          image = var.speaches_image

          # --- warm-resident + int8 + cuda config (see locals) ---
          env {
            name  = "PRELOAD_MODELS"
            value = jsonencode([var.stt_model_id]) # JSON list, e.g. ["org/repo"]
          }
          env {
            name  = "STT_MODEL_TTL"
            value = "-1" # never unload — the warm-resident lever
          }
          env {
            name  = "WHISPER__INFERENCE_DEVICE"
            value = "cuda"
          }
          env {
            name  = "WHISPER__COMPUTE_TYPE"
            value = "int8" # conservative VRAM (vs fp16 default)
          }
          env {
            name  = "LOG_LEVEL"
            value = "info" # image default is debug
          }
          # Persist the HF model cache on the NFS-SSD PVC (image default cache
          # dir is ephemeral). Speaches/HF honour HF_HUB_CACHE + HF_HOME.
          # /data is the PVC mount below; /data/hub is the `hub` subdir the
          # dir-init Job pre-creates.
          env {
            name  = "HF_HUB_CACHE"
            value = "/data/hub"
          }
          env {
            name  = "HF_HOME"
            value = "/data"
          }

          port {
            container_port = 8000
            name           = "http"
          }

          volume_mount {
            name       = "models"
            mount_path = "/data"
          }

          # /health is Speaches' liveness/readiness path. Generous startup
          # allowance: the first boot downloads large-v3-turbo to the PVC before
          # the server reports healthy (PRELOAD blocks startup). After the model
          # is cached on NFS-SSD, subsequent boots load in seconds.
          startup_probe {
            http_get {
              path = "/health"
              port = 8000
            }
            period_seconds    = 10
            failure_threshold = 60 # up to ~10 min for the first model download
          }
          # Pulled from Service endpoints after 4 misses (~60s).
          readiness_probe {
            http_get {
              path = "/health"
              port = 8000
            }
            period_seconds    = 15
            failure_threshold = 4
          }
          # Restarted only after 5 consecutive misses (~150s): a restart costs a
          # full model reload, so liveness is deliberately slower than readiness.
          liveness_probe {
            http_get {
              path = "/health"
              port = 8000
            }
            initial_delay_seconds = 30
            period_seconds        = 30
            failure_threshold     = 5
          }

          resources {
            requests = {
              cpu    = "200m"
              memory = "2Gi"
            }
            limits = {
              memory           = "4Gi"
              "nvidia.com/gpu" = "1" # ONE time-slice (operator advertises 100), NOT the whole card
            }
          }
        }

        volume {
          name = "models"
          persistent_volume_claim {
            claim_name = module.nfs_models.claim_name
          }
        }
      }
    }
  }

  # Rollout wait must exceed startup-probe budget (~10m) + image pull; the
  # provider default of 10m does not.
  timeouts {
    create = "20m"
    update = "20m"
  }

  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
      # image is TF-OWNED (pinned -cuda tag) — Keel can manage the digest on
      # this tag if desired, so ignore keel's annotation churn but NOT the image
      # itself (we want tag pins to apply). Mirrors tts: keel annotations only.
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
      metadata[0].annotations["keel.sh/match-tag"],
      metadata[0].annotations["kubernetes.io/change-cause"],
      metadata[0].annotations["deployment.kubernetes.io/revision"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
    ]
  }
}

# In-cluster Service (ClusterIP) in front of the Speaches pod. Only the Gateway
# calls it, so audio stays on the LAN. There is deliberately no ingress and no
# Authentik here: per ADR-0001 the Gateway is the sole externally exposed
# component and owns edge auth. The OpenAI-compatible transcription endpoint is
# http://portal-stt.portal-stt.svc.cluster.local:8000/v1/audio/transcriptions
resource "kubernetes_service" "portal_stt" {
  metadata {
    namespace = kubernetes_namespace.portal_stt.metadata[0].name
    name      = "portal-stt"
    labels    = local.labels

    # Annotation-based Prometheus scrape of Speaches' /metrics endpoint
    # (Ready-endpoint relabeling already filters non-Ready pods).
    annotations = {
      "prometheus.io/scrape" = "true"
      "prometheus.io/port"   = "8000"
      "prometheus.io/path"   = "/metrics"
    }
  }

  spec {
    type = "ClusterIP"

    # Must match the pod template labels on the Deployment.
    selector = {
      app = "portal-stt"
    }

    port {
      name        = "http"
      target_port = 8000
      port        = 8000
    }
  }
}