# infra/stacks/portal-stt/main.tf

# =============================================================================
# portal-stt — Speaches STT (Whisper large-v3-turbo int8) for portal-assistant
# =============================================================================
#
# DRAFT for operator review (portal-assistant issue #2). HITL apply: an agent
# drafts; the operator applies via GitOps (presence-claimed) and verifies the
# rollout. Do NOT `terragrunt apply` this from a worktree.
#
# WHAT: a single WARM-RESIDENT Speaches deployment (OpenAI-compatible
# faster-whisper server) serving `large-v3-turbo` int8, multilingual (Bulgarian
# + English), on the shared Tesla T4 (one time-slice). ClusterIP only — audio
# never leaves the LAN; the portal-assistant Gateway is the only externally
# exposed component (ADR-0001), so no ingress/auth here.
#
# WHY WARM-RESIDENT, NOT THE CHATTERBOX DEMAND-GATE:
#   The TTS (chatterbox) stack scales 0<->1 behind a free-VRAM CronJob gate
#   because it is a best-effort BATCH tenant (tripit narration) that can wait.
#   STT here is INTERACTIVE voice — every Turn would pay a multi-second cold
#   model load (download/mmap + CUDA init) if we scaled to zero. So this stack
#   keeps the model permanently loaded: replicas=1 + Speaches STT_MODEL_TTL=-1
#   (never unload) + PRELOAD_MODELS (load at startup). See portal-assistant
#   CONTEXT.md "Warm window" + ADR-0003.
#
# OOM HISTORY / VRAM MATH — the binding constraint is the shared T4 (16 GiB,
# time-sliced across immich-ml / frigate / llama-swap / android-emulator with
# NO per-tenant VRAM isolation). See
# docs/post-mortems/2026-06-02-immich-ml-ttl-gpu-oom-recruiter.md (immich-ml's
# unbounded onnxruntime arena starved llama-swap's qwen3-8b -> recruiter down).
#
#   Live residents measured 2026-06-17 (gpu_pod_memory_used_bytes):
#     immich-ml        ~2.1 GiB  (capped: MACHINE_LEARNING_MODEL_TTL=600)
#     frigate (8 proc) ~1.9 GiB  (detector + ffmpeg decode)
#     android-emulator ~0.15 GiB
#     llama-swap        0 idle, but loads qwen3-8b on demand = ~4.35 GiB peak
#                       (cudaMalloc 4455 MiB, per the post-mortem)
#   Worst-case concurrent baseline (everything hot): 2.1 + 1.9 + 0.15 + 4.35
#                                                   = ~8.5 GiB.
#   Speaches large-v3-turbo int8 weights ~= 0.8 GiB on disk; resident CTranslate2
#   int8 + CUDA context + decode buffers budget conservatively to ~1.5 GiB
#   (VERIFY at apply against gpu_pod_memory_used_bytes{namespace="portal-stt"}).
#
#   8.5 (residents) + 1.5 (this) = ~10.0 GiB used  =>  ~6 GiB T4 headroom.
#   That headroom is the safety margin against onnxruntime arena drift (the
#   exact failure mode from 2026-06-02). If a future resident grows, this is the
#   FIRST place to re-measure. The conservative int8 (not fp16) choice halves
#   our weight footprint precisely to protect this margin.
#
# GPU PRIORITY: this pod requests nvidia.com/gpu, so the Kyverno
# `inject-gpu-workload-priority` ClusterPolicy auto-stamps the immich-equal
# `gpu-workload` (1,200,000) priority — portal-stt is NOT in that policy's
# exclude list (only `tts` is, to keep chatterbox demotable). That is CORRECT
# here: warm interactive STT is a first-class GPU resident, never the first
# evicted. We also set priority_class_name explicitly so intent is legible at
# the call site and survives a policy fail-open. (Contrast tts/main.tf, which
# pins tier-2-gpu precisely so chatterbox IS evicted first.)
# =============================================================================

# Address of the NFS server that backs this stack's model volume and the
# bootstrap Job's export-root mount. No default: the caller must supply it.
variable "nfs_server" {
  description = "NFS server (Proxmox host). From config.tfvars (192.168.1.127)."
  type        = string
}

# Container image for the Speaches STT server (CUDA build).
variable "speaches_image" {
  description = "Speaches CUDA image. Pin a -cuda tag, not :latest-cuda."
  type        = string

  # ghcr.io/speaches-ai/speaches CUDA build. The live registry currently
  # publishes 0.9.0-rc.3-cuda (+ sha-/cuda-12.x variants) and a moving
  # :latest-cuda; there is no published :0.8.3-cuda for the last stable. Pinned
  # to the rc.3 CUDA tag (immutable-ish, beats :latest for the OOM/Keel-churn
  # history). CUDA 12.4/12.6 image runtime is fine under our 570.195.03 driver
  # (CUDA 12.8, backward-compatible). OPEN ITEM for operator: confirm this tag
  # still resolves at apply, or bump to the newest -cuda tag.
  default = "ghcr.io/speaches-ai/speaches:0.9.0-rc.3-cuda"

  # Enforce the "pin a tag" contract stated in the description instead of
  # relying on reviewers: a reference ending in :latest or :latest-<suffix>
  # (e.g. :latest-cuda) is rejected at plan time. regex() errors when there is
  # no match, so can(...) is true only for the moving tags we want to refuse.
  validation {
    condition     = !can(regex(":latest(-[A-Za-z0-9._-]+)?$", var.speaches_image))
    error_message = "The speaches_image value must pin an explicit tag such as 0.9.0-rc.3-cuda; moving :latest and :latest-* tags are not allowed."
  }
}

# Which whisper model the server preloads and keeps resident.
variable "stt_model_id" {
  description = "HuggingFace repo id of the warm-resident whisper model."
  type        = string

  # Speaches looks whisper models up by their HF repo id. The default is the
  # deepdml CTranslate2 conversion of openai large-v3-turbo — the canonical
  # community ct2 build (multilingual, Bulgarian included) and the one the
  # ADR-0003 FLEURS-bg bake-off measured at 8.3% WER.
  default = "deepdml/faster-whisper-large-v3-turbo-ct2"
}

# Stack-wide constants shared by the resources below.
locals {
  # Namespace that holds every object in this stack.
  namespace = "portal-stt"

  # Base label set attached to the stack's objects.
  labels = { app = "portal-stt" }

  # How the container is configured (the env blocks live on the Deployment):
  # Speaches reads its settings from env vars via pydantic-settings — scalar
  # settings from UPPER_SNAKE names, nested whisper.* settings from
  # WHISPER__FIELD names. The knobs that make this WARM-RESIDENT, int8 and
  # CUDA-backed:
  #   PRELOAD_MODELS            — JSON list, loaded sequentially at startup, so
  #                               the first Turn is never cold (the pod does not
  #                               go Ready until the model is in VRAM).
  #   STT_MODEL_TTL=-1          — never unload an idle STT model (0=immediate,
  #                               default 300s). This is the warm-resident lever.
  #   WHISPER__COMPUTE_TYPE     — int8 (conservative VRAM; default "default"=fp16).
  #   WHISPER__INFERENCE_DEVICE — cuda (default "auto").
  # The HF cache is pointed at the NFS-SSD PVC so the weights are downloaded
  # once and survive pod restarts (the image's default cache,
  # /home/ubuntu/.cache/huggingface/hub, is ephemeral). Speaches runs as
  # uid 1000 (ubuntu).
}

# Namespace for the STT stack: GPU tier, no Istio sidecar injection, enrolled
# with Keel.
resource "kubernetes_namespace" "portal_stt" {
  metadata {
    name = local.namespace

    labels = {
      "keel.sh/enrolled" = "true"
      "istio-injection"  = "disabled"
      tier               = local.tiers.gpu
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: the goldilocks-vpa-auto-mode ClusterPolicy stamps
    # this label on every namespace, so Terraform must not treat it as drift.
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}

# portal-stt is ClusterIP-only (no ingress) — the Gateway is the sole
# externally-exposed component (ADR-0001), so there is NO TLS secret / no
# setup_tls_secret module here (it would demand secrets/fullchain.pem that this
# stack does not ship).

# Persistent store for the model weights and the HF cache, on NFS-SSD (fast
# first load, survives restarts). Backed by /srv/nfs-ssd/portal-stt on the
# Proxmox host (192.168.1.127), following the chatterbox nfs_models pattern.
# RWX so a future seed/inspect pod can touch it.
module "nfs_models" {
  source = "../../modules/kubernetes/nfs_volume"

  name      = "portal-stt-models"
  namespace = kubernetes_namespace.portal_stt.metadata[0].name

  nfs_server = var.nfs_server
  nfs_path   = "/srv/nfs-ssd/portal-stt"

  # Sized for the large-v3-turbo ct2 weights (~0.8Gi) plus HF cache headroom.
  storage = "10Gi"
}

# One-shot bootstrap: /srv/nfs-ssd is exported whole-tree but the portal-stt
# SUBDIR must exist before kubelet can bind-mount it (chatterbox hit exit 32 on
# a missing subdir the first window — see stacks/tts/main.tf). Mount the export
# ROOT (which exists) and mkdir the subtree; kubelet's mount retry then heals
# the main pod. Idempotent; immutable-once-created.
#
# NOTE(review): the directories are created by whatever user the busybox
# container runs as, while the locals comment says Speaches runs as uid 1000.
# Confirm uid 1000 can write to portal-stt/hub under the export's squash
# settings.
# NOTE(review): once ttl_seconds_after_finished garbage-collects the finished
# Job, a later plan will presumably show it being re-created. That is harmless
# (mkdir -p is idempotent) but confirm the churn is acceptable.
resource "kubernetes_job" "models_dir_init" {
  metadata {
    name      = "portal-stt-models-dir-init"
    namespace = kubernetes_namespace.portal_stt.metadata[0].name
    labels    = local.labels
  }
  spec {
    backoff_limit              = 3
    ttl_seconds_after_finished = 86400
    template {
      metadata {
        # Deliberately NOT local.labels: the portal-stt Service selects pods by
        # `app = portal-stt`, so a bootstrap pod carrying that label would be
        # picked up as a Service (and metrics-scrape) endpoint while it runs.
        # A distinct app label keeps the busybox pod out of the STT endpoints
        # and out of `-l app=portal-stt` queries.
        labels = { app = "portal-stt-models-dir-init" }
      }
      spec {
        restart_policy = "Never"
        container {
          name  = "mkdir"
          image = "busybox:1.37"
          # Create the PVC subdir and the HF hub cache dir beneath it, then
          # list the result so the Job log shows what was created.
          command = ["sh", "-c", "mkdir -p /mnt/portal-stt/hub && ls -la /mnt/portal-stt"]
          volume_mount {
            name       = "nfs-ssd-root"
            mount_path = "/mnt"
          }
        }
        # The export ROOT, not the portal-stt subdir: the root is what exists
        # before this Job has run.
        volume {
          name = "nfs-ssd-root"
          nfs {
            server = var.nfs_server
            path   = "/srv/nfs-ssd"
          }
        }
      }
    }
  }
  # Block the apply until the mkdir has actually succeeded (bounded at 3m).
  wait_for_completion = true
  timeouts { create = "3m" }
}

# Warm-resident Speaches. replicas=1, NEVER scaled to zero (no off-peak gate,
# unlike tts) — the model stays in VRAM so interactive Turns never pay a cold
# load. wait_for_rollout left default (true): a plain apply SHOULD block until
# the model is loaded and the pod is Ready, surfacing a bad image/model early.
resource "kubernetes_deployment" "portal_stt" {
  metadata {
    name      = "portal-stt"
    namespace = kubernetes_namespace.portal_stt.metadata[0].name
    labels    = merge(local.labels, { tier = local.tiers.gpu })
  }
  spec {
    replicas = 1
    # RWO is not in play (model PVC is RWX NFS), but Recreate avoids two pods
    # briefly double-loading the model into the shared T4 during a rollout.
    strategy { type = "Recreate" }
    selector {
      match_labels = { app = "portal-stt" }
    }
    template {
      metadata {
        labels = { app = "portal-stt" }
      }
      spec {
        # Schedule only onto GPU nodes and tolerate their NoSchedule taint.
        node_selector = { "nvidia.com/gpu.present" = "true" }
        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }
        # First-class GPU resident (warm interactive STT) — same priority as
        # immich-ml. Kyverno would stamp this anyway (portal-stt is not in the
        # gpu-priority exclude list); set explicitly for legibility + fail-open
        # safety. NOT tier-2-gpu (that is chatterbox's evict-first demotion).
        priority_class_name = "gpu-workload"

        container {
          name  = "portal-stt"
          image = var.speaches_image

          # --- warm-resident + int8 + cuda config (see locals) ---
          env {
            name  = "PRELOAD_MODELS"
            value = jsonencode([var.stt_model_id])
          }
          env {
            name  = "STT_MODEL_TTL"
            value = "-1" # never unload — the warm-resident lever
          }
          env {
            name  = "WHISPER__INFERENCE_DEVICE"
            value = "cuda"
          }
          env {
            name  = "WHISPER__COMPUTE_TYPE"
            value = "int8" # conservative VRAM (vs fp16 default)
          }
          env {
            name  = "LOG_LEVEL"
            value = "info" # image default is debug
          }
          # Persist the HF model cache on the NFS-SSD PVC (image default cache
          # dir is ephemeral). Speaches/HF honour HF_HUB_CACHE + HF_HOME.
          env {
            name  = "HF_HUB_CACHE"
            value = "/data/hub"
          }
          env {
            name  = "HF_HOME"
            value = "/data"
          }

          port {
            container_port = 8000
            name           = "http"
          }

          # The NFS-SSD model PVC; HF_HOME / HF_HUB_CACHE above point into it.
          volume_mount {
            name       = "models"
            mount_path = "/data"
          }

          # /health is Speaches' liveness/readiness path. Generous startup
          # allowance: the first boot downloads large-v3-turbo to the PVC before
          # the server reports healthy (PRELOAD blocks startup). After the model
          # is cached on NFS-SSD, subsequent boots load in seconds.
          startup_probe {
            http_get {
              path = "/health"
              port = 8000
            }
            period_seconds    = 10
            failure_threshold = 60 # up to ~10 min for the first model download
          }
          readiness_probe {
            http_get {
              path = "/health"
              port = 8000
            }
            period_seconds    = 15
            failure_threshold = 4
          }
          liveness_probe {
            http_get {
              path = "/health"
              port = 8000
            }
            initial_delay_seconds = 30
            period_seconds        = 30
            failure_threshold     = 5
          }

          resources {
            requests = {
              cpu    = "200m"
              memory = "2Gi"
            }
            limits = {
              memory           = "4Gi"
              "nvidia.com/gpu" = "1" # ONE time-slice (operator advertises 100), NOT the whole card
            }
          }
        }

        volume {
          name = "models"
          persistent_volume_claim {
            claim_name = module.nfs_models.claim_name
          }
        }
      }
    }
  }

  # The apply waits for the rollout (wait_for_rollout = true), and the startup
  # probe alone may legitimately take 60 x 10s = ~10 min on the first boot —
  # before counting the CUDA image pull and any NFS mount retry while the
  # bootstrap Job creates the subdir. The provider's default create/update
  # timeout is also 10 min, so a healthy first rollout could still fail the
  # apply. Give the wait explicit headroom over the probe budget.
  timeouts {
    create = "20m"
    update = "20m"
  }

  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
      # image is TF-OWNED (pinned -cuda tag) — Keel can manage the digest on
      # this tag if desired, so ignore keel's annotation churn but NOT the image
      # itself (we want tag pins to apply). Mirrors tts: keel annotations only.
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
      metadata[0].annotations["keel.sh/match-tag"],
      metadata[0].annotations["kubernetes.io/change-cause"],
      metadata[0].annotations["deployment.kubernetes.io/revision"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
    ]
  }
}

# In-cluster entry point for the STT server. ClusterIP only: the Gateway is the
# caller and audio stays on the LAN. There is no ingress and no Authentik here
# because the Gateway is the only externally exposed component (ADR-0001) and
# holds the edge auth. The OpenAI-compatible transcription endpoint is
# http://portal-stt.portal-stt.svc.cluster.local:8000/v1/audio/transcriptions
resource "kubernetes_service" "portal_stt" {
  metadata {
    namespace = kubernetes_namespace.portal_stt.metadata[0].name
    name      = "portal-stt"
    labels    = local.labels

    # Annotation-based Prometheus scrape of Speaches' /metrics endpoint
    # (Ready-endpoint relabeling already filters out non-Ready pods).
    annotations = {
      "prometheus.io/port"   = "8000"
      "prometheus.io/path"   = "/metrics"
      "prometheus.io/scrape" = "true"
    }
  }

  spec {
    selector = { app = "portal-stt" }
    type     = "ClusterIP"

    port {
      name        = "http"
      target_port = 8000
      port        = 8000
    }
  }
}
portal-assistant: land voice stacks + switch TTS to edge-tts (intelligible Bulgarian) The portal-assistant voice-assistant stacks (portal-tts, portal-stt, portal-assistant) were applied to the live cluster from feature branches but never landed on master — the GitOps source of truth. This lands all three and, in portal-tts, fixes Bulgarian speech. Bulgarian was unintelligible: the local Piper voice (bg_BG-dimitar-medium via espeak-ng) mangles Bulgarian consonants — a synth->Whisper round-trip turned "Добър ден" into "Обърден", and a user heard pure gibberish. English was fine. portal-tts now runs openai-edge-tts (Microsoft edge-tts neural voices) for BOTH languages instead of Piper — ADR-0003 always named edge-tts as the online Bulgarian-quality fallback. Validated before landing: edge bg round-trips through Whisper verbatim ("Добър ден! Как сте днес? ..."). The gateway maps detected language bg/en to the edge voice names via new TTS_VOICE_BG / TTS_VOICE_EN env (bg-BG-KalinaNeural / en-US-AvaNeural). No GPU, no NFS model store, no secrets — edge fetches voices from Microsoft per request (egress verified). The assistant already needs the internet for the Claude brain, so an online TTS adds no new failure mode. The brain stays Sonnet with no extended thinking (already the default — a live turn answers directly in ~3.4s), per the latency-over-smartness ask. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> 2026-06-17 20:25:29 +00:00			`# =============================================================================`
			`# portal-stt — Speaches STT (Whisper large-v3-turbo int8) for portal-assistant`
			`# =============================================================================`
			`#`
			`# DRAFT for operator review (portal-assistant issue #2). HITL apply: an agent`
			`# drafts; the operator applies via GitOps (presence-claimed) and verifies the`
			# rollout. Do NOT `terragrunt apply` this from a worktree.
			`#`
			`# WHAT: a single WARM-RESIDENT Speaches deployment (OpenAI-compatible`
			# faster-whisper server) serving `large-v3-turbo` int8, multilingual (Bulgarian
			`# + English), on the shared Tesla T4 (one time-slice). ClusterIP only — audio`
			`# never leaves the LAN; the portal-assistant Gateway is the only externally`
			`# exposed component (ADR-0001), so no ingress/auth here.`
			`#`
			`# WHY WARM-RESIDENT, NOT THE CHATTERBOX DEMAND-GATE:`
			`# The TTS (chatterbox) stack scales 0<->1 behind a free-VRAM CronJob gate`
			`# because it is a best-effort BATCH tenant (tripit narration) that can wait.`
			`# STT here is INTERACTIVE voice — every Turn would pay a multi-second cold`
			`# model load (download/mmap + CUDA init) if we scaled to zero. So this stack`
			`# keeps the model permanently loaded: replicas=1 + Speaches STT_MODEL_TTL=-1`
			`# (never unload) + PRELOAD_MODELS (load at startup). See portal-assistant`
			`# CONTEXT.md "Warm window" + ADR-0003.`
			`#`
			`# OOM HISTORY / VRAM MATH — the binding constraint is the shared T4 (16 GiB,`
			`# time-sliced across immich-ml / frigate / llama-swap / android-emulator with`
			`# NO per-tenant VRAM isolation). See`
			`# docs/post-mortems/2026-06-02-immich-ml-ttl-gpu-oom-recruiter.md (immich-ml's`
			`# unbounded onnxruntime arena starved llama-swap's qwen3-8b -> recruiter down).`
			`#`
			`# Live residents measured 2026-06-17 (gpu_pod_memory_used_bytes):`
			`# immich-ml ~2.1 GiB (capped: MACHINE_LEARNING_MODEL_TTL=600)`
			`# frigate (8 proc) ~1.9 GiB (detector + ffmpeg decode)`
			`# android-emulator ~0.15 GiB`
			`# llama-swap 0 idle, but loads qwen3-8b on demand = ~4.35 GiB peak`
			`# (cudaMalloc 4455 MiB, per the post-mortem)`
			`# Worst-case concurrent baseline (everything hot): 2.1 + 1.9 + 0.15 + 4.35`
			`# = ~8.5 GiB.`
			`# Speaches large-v3-turbo int8 weights ~= 0.8 GiB on disk; resident CTranslate2`
			`# int8 + CUDA context + decode buffers budget conservatively to ~1.5 GiB`
			`# (VERIFY at apply against gpu_pod_memory_used_bytes{namespace="portal-stt"}).`
			`#`
			`# 8.5 (residents) + 1.5 (this) = ~10.0 GiB used => ~6 GiB T4 headroom.`
			`# That headroom is the safety margin against onnxruntime arena drift (the`
			`# exact failure mode from 2026-06-02). If a future resident grows, this is the`
			`# FIRST place to re-measure. The conservative int8 (not fp16) choice halves`
			`# our weight footprint precisely to protect this margin.`
			`#`
			`# GPU PRIORITY: this pod requests nvidia.com/gpu, so the Kyverno`
			# `inject-gpu-workload-priority` ClusterPolicy auto-stamps the immich-equal
			# `gpu-workload` (1,200,000) priority — portal-stt is NOT in that policy's
			# exclude list (only `tts` is, to keep chatterbox demotable). That is CORRECT
			`# here: warm interactive STT is a first-class GPU resident, never the first`
			`# evicted. We also set priority_class_name explicitly so intent is legible at`
			`# the call site and survives a policy fail-open. (Contrast tts/main.tf, which`
			`# pins tier-2-gpu precisely so chatterbox IS evicted first.)`
			`# =============================================================================`

			`variable "nfs_server" {`
			`type = string`
			`description = "NFS server (Proxmox host). From config.tfvars (192.168.1.127)."`
			`}`

			`variable "speaches_image" {`
			`type = string`
			`# ghcr.io/speaches-ai/speaches CUDA build. The live registry currently`
			`# publishes 0.9.0-rc.3-cuda (+ sha-/cuda-12.x variants) and a moving`
			`# :latest-cuda; there is no published :0.8.3-cuda for the last stable. Pinned`
			`# to the rc.3 CUDA tag (immutable-ish, beats :latest for the OOM/Keel-churn`
			`# history). CUDA 12.4/12.6 image runtime is fine under our 570.195.03 driver`
			`# (CUDA 12.8, backward-compatible). OPEN ITEM for operator: confirm this tag`
			`# still resolves at apply, or bump to the newest -cuda tag.`
			`default = "ghcr.io/speaches-ai/speaches:0.9.0-rc.3-cuda"`
			`description = "Speaches CUDA image. Pin a -cuda tag, not :latest-cuda."`
			`}`

			`variable "stt_model_id" {`
			`type = string`
			`# HF repo id of the CTranslate2 large-v3-turbo conversion. deepdml's is the`
			`# canonical community ct2 build of openai large-v3-turbo (multilingual,`
			`# incl. Bulgarian) and is what ADR-0003's FLEURS-bg bake-off measured at`
			`# 8.3% WER. Speaches resolves whisper models by HF repo id.`
			`default = "deepdml/faster-whisper-large-v3-turbo-ct2"`
			`description = "HuggingFace repo id of the warm-resident whisper model."`
			`}`

			`locals {`
			`namespace = "portal-stt"`
			`labels = { app = "portal-stt" }`

			`# Speaches is configured via env vars (pydantic-settings): scalars map from`
			`# UPPER_SNAKE, nested whisper.* settings from WHISPER__FIELD. The three knobs`
			`# that make this WARM-RESIDENT and int8:`
			`# PRELOAD_MODELS — JSON list, loaded sequentially at startup so the`
			`# first Turn is never cold (pod won't go Ready until`
			`# the model is in VRAM).`
			`# STT_MODEL_TTL=-1 — never unload an idle STT model (0=immediate,`
			`# default 300s). This is the warm-resident lever.`
			`# WHISPER__COMPUTE_TYPE — int8 (conservative VRAM; default "default"=fp16).`
			`# WHISPER__INFERENCE_DEVICE — cuda (default "auto").`
			`# HF cache is redirected onto the NFS-SSD PVC so weights download once and`
			`# persist across pod restarts (image default cache is /home/ubuntu/.cache/`
			`# huggingface/hub — ephemeral). Speaches runs as uid 1000 (ubuntu).`
			`}`

			`resource "kubernetes_namespace" "portal_stt" {`
			`metadata {`
			`name = local.namespace`
			`labels = {`
			`tier = local.tiers.gpu`
			`"istio-injection" = "disabled"`
			`"keel.sh/enrolled" = "true"`
			`}`
			`}`
			`lifecycle {`
			`# KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace`
			`ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]`
			`}`
			`}`

portal-stt: drop setup_tls_secret module (ClusterIP-only, no fullchain.pem) The landed portal-stt source still declared the setup_tls_secret module + tls_secret_name variable, which file()-reads secrets/fullchain.pem — a file this stack does not ship. portal-stt is ClusterIP-only (no ingress; the Gateway is the sole externally-exposed component, ADR-0001), so it needs no TLS secret. The live deployment never had it (removed during the original apply); this aligns the source with reality so CI applies cleanly. Fixes the pipeline-229 portal-stt apply failure. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> 2026-06-17 20:29:31 +00:00			`# portal-stt is ClusterIP-only (no ingress) — the Gateway is the sole`
			`# externally-exposed component (ADR-0001), so there is NO TLS secret / no`
			`# setup_tls_secret module here (it would demand secrets/fullchain.pem that this`
			`# stack does not ship).`
portal-assistant: land voice stacks + switch TTS to edge-tts (intelligible Bulgarian) The portal-assistant voice-assistant stacks (portal-tts, portal-stt, portal-assistant) were applied to the live cluster from feature branches but never landed on master — the GitOps source of truth. This lands all three and, in portal-tts, fixes Bulgarian speech. Bulgarian was unintelligible: the local Piper voice (bg_BG-dimitar-medium via espeak-ng) mangles Bulgarian consonants — a synth->Whisper round-trip turned "Добър ден" into "Обърден", and a user heard pure gibberish. English was fine. portal-tts now runs openai-edge-tts (Microsoft edge-tts neural voices) for BOTH languages instead of Piper — ADR-0003 always named edge-tts as the online Bulgarian-quality fallback. Validated before landing: edge bg round-trips through Whisper verbatim ("Добър ден! Как сте днес? ..."). The gateway maps detected language bg/en to the edge voice names via new TTS_VOICE_BG / TTS_VOICE_EN env (bg-BG-KalinaNeural / en-US-AvaNeural). No GPU, no NFS model store, no secrets — edge fetches voices from Microsoft per request (egress verified). The assistant already needs the internet for the Claude brain, so an online TTS adds no new failure mode. The brain stays Sonnet with no extended thinking (already the default — a live turn answers directly in ~3.4s), per the latency-over-smartness ask. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> 2026-06-17 20:25:29 +00:00
			`# Model + HF cache on NFS-SSD (fast first-load, persists across restarts). Path`
			`# /srv/nfs-ssd/portal-stt on the Proxmox host (192.168.1.127). Mirrors the`
			`# chatterbox nfs_models pattern. RWX so a future seed/inspect pod can touch it.`
			`module "nfs_models" {`
			`source = "../../modules/kubernetes/nfs_volume"`
			`name = "portal-stt-models"`
			`namespace = kubernetes_namespace.portal_stt.metadata[0].name`
			`nfs_server = var.nfs_server`
			`nfs_path = "/srv/nfs-ssd/portal-stt"`
			`storage = "10Gi" # large-v3-turbo ct2 (~0.8Gi) + HF cache headroom`
			`}`

			`# One-shot bootstrap: /srv/nfs-ssd is exported whole-tree but the portal-stt`
			`# SUBDIR must exist before kubelet can bind-mount it (chatterbox hit exit 32 on`
			`# a missing subdir the first window — see stacks/tts/main.tf). Mount the export`
			`# ROOT (which exists) and mkdir the subtree; kubelet's mount retry then heals`
			`# the main pod. Idempotent; immutable-once-created.`
			`resource "kubernetes_job" "models_dir_init" {`
			`metadata {`
			`name = "portal-stt-models-dir-init"`
			`namespace = kubernetes_namespace.portal_stt.metadata[0].name`
			`labels = local.labels`
			`}`
			`spec {`
			`backoff_limit = 3`
			`ttl_seconds_after_finished = 86400`
			`template {`
			`metadata { labels = local.labels }`
			`spec {`
			`restart_policy = "Never"`
			`container {`
			`name = "mkdir"`
			`image = "busybox:1.37"`
			`command = ["sh", "-c", "mkdir -p /mnt/portal-stt/hub && ls -la /mnt/portal-stt"]`
			`volume_mount {`
			`name = "nfs-ssd-root"`
			`mount_path = "/mnt"`
			`}`
			`}`
			`volume {`
			`name = "nfs-ssd-root"`
			`nfs {`
			`server = var.nfs_server`
			`path = "/srv/nfs-ssd"`
			`}`
			`}`
			`}`
			`}`
			`}`
			`wait_for_completion = true`
			`timeouts { create = "3m" }`
			`}`

			`# Warm-resident Speaches. replicas=1, NEVER scaled to zero (no off-peak gate,`
			`# unlike tts) — the model stays in VRAM so interactive Turns never pay a cold`
			`# load. wait_for_rollout left default (true): a plain apply SHOULD block until`
			`# the model is loaded and the pod is Ready, surfacing a bad image/model early.`
			`resource "kubernetes_deployment" "portal_stt" {`
			`metadata {`
			`name = "portal-stt"`
			`namespace = kubernetes_namespace.portal_stt.metadata[0].name`
			`labels = merge(local.labels, { tier = local.tiers.gpu })`
			`}`
			`spec {`
			`replicas = 1`
			`# RWO is not in play (model PVC is RWX NFS), but Recreate avoids two pods`
			`# briefly double-loading the model into the shared T4 during a rollout.`
			`strategy { type = "Recreate" }`
			`selector {`
			`match_labels = { app = "portal-stt" }`
			`}`
			`template {`
			`metadata {`
			`labels = { app = "portal-stt" }`
			`}`
			`spec {`
			`node_selector = { "nvidia.com/gpu.present" = "true" }`
			`toleration {`
			`key = "nvidia.com/gpu"`
			`operator = "Equal"`
			`value = "true"`
			`effect = "NoSchedule"`
			`}`
			`# First-class GPU resident (warm interactive STT) — same priority as`
			`# immich-ml. Kyverno would stamp this anyway (portal-stt is not in the`
			`# gpu-priority exclude list); set explicitly for legibility + fail-open`
			`# safety. NOT tier-2-gpu (that is chatterbox's evict-first demotion).`
			`priority_class_name = "gpu-workload"`

			`container {`
			`name = "portal-stt"`
			`image = var.speaches_image`

			`# --- warm-resident + int8 + cuda config (see locals) ---`
			`env {`
			`name = "PRELOAD_MODELS"`
			`value = jsonencode([var.stt_model_id])`
			`}`
			`env {`
			`name = "STT_MODEL_TTL"`
			`value = "-1" # never unload — the warm-resident lever`
			`}`
			`env {`
			`name = "WHISPER__INFERENCE_DEVICE"`
			`value = "cuda"`
			`}`
			`env {`
			`name = "WHISPER__COMPUTE_TYPE"`
			`value = "int8" # conservative VRAM (vs fp16 default)`
			`}`
			`env {`
			`name = "LOG_LEVEL"`
			`value = "info" # image default is debug`
			`}`
			`# Persist the HF model cache on the NFS-SSD PVC (image default cache`
			`# dir is ephemeral). Speaches/HF honour HF_HUB_CACHE + HF_HOME.`
			`env {`
			`name = "HF_HUB_CACHE"`
			`value = "/data/hub"`
			`}`
			`env {`
			`name = "HF_HOME"`
			`value = "/data"`
			`}`

			`port {`
			`container_port = 8000`
			`name = "http"`
			`}`

			`volume_mount {`
			`name = "models"`
			`mount_path = "/data"`
			`}`

			`# /health is Speaches' liveness/readiness path. Generous startup`
			`# allowance: the first boot downloads large-v3-turbo to the PVC before`
			`# the server reports healthy (PRELOAD blocks startup). After the model`
			`# is cached on NFS-SSD, subsequent boots load in seconds.`
			`startup_probe {`
			`http_get {`
			`path = "/health"`
			`port = 8000`
			`}`
			`period_seconds = 10`
			`failure_threshold = 60 # up to ~10 min for the first model download`
			`}`
			`readiness_probe {`
			`http_get {`
			`path = "/health"`
			`port = 8000`
			`}`
			`period_seconds = 15`
			`failure_threshold = 4`
			`}`
			`liveness_probe {`
			`http_get {`
			`path = "/health"`
			`port = 8000`
			`}`
			`initial_delay_seconds = 30`
			`period_seconds = 30`
			`failure_threshold = 5`
			`}`

			`resources {`
			`requests = {`
			`cpu = "200m"`
			`memory = "2Gi"`
			`}`
			`limits = {`
			`memory = "4Gi"`
			`"nvidia.com/gpu" = "1" # ONE time-slice (operator advertises 100), NOT the whole card`
			`}`
			`}`
			`}`

			`volume {`
			`name = "models"`
			`persistent_volume_claim {`
			`claim_name = module.nfs_models.claim_name`
			`}`
			`}`
			`}`
			`}`
			`}`
			`lifecycle {`
			`ignore_changes = [`
			`spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1`
			`# image is TF-OWNED (pinned -cuda tag) — Keel can manage the digest on`
			`# this tag if desired, so ignore keel's annotation churn but NOT the image`
			`# itself (we want tag pins to apply). Mirrors tts: keel annotations only.`
			`metadata[0].annotations["keel.sh/policy"],`
			`metadata[0].annotations["keel.sh/trigger"],`
			`metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2`
			`metadata[0].annotations["keel.sh/match-tag"],`
			`metadata[0].annotations["kubernetes.io/change-cause"],`
			`metadata[0].annotations["deployment.kubernetes.io/revision"],`
			`spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1`
			`]`
			`}`
			`}`

			`# ClusterIP — in-cluster only (the Gateway calls this; audio stays on the LAN).`
			`# No ingress, no Authentik: the Gateway is the only externally exposed component`
			`# (ADR-0001) and holds the edge auth. OpenAI transcription path is`
			`# http://portal-stt.portal-stt.svc.cluster.local:8000/v1/audio/transcriptions`
			`resource "kubernetes_service" "portal_stt" {`
			`metadata {`
			`name = "portal-stt"`
			`namespace = kubernetes_namespace.portal_stt.metadata[0].name`
			`labels = local.labels`
			`annotations = {`
			`# Speaches exposes Prometheus metrics at /metrics — wire annotation-based`
			`# scrape (Ready-endpoint relabeling already filters non-Ready pods).`
			`"prometheus.io/scrape" = "true"`
			`"prometheus.io/path" = "/metrics"`
			`"prometheus.io/port" = "8000"`
			`}`
			`}`
			`spec {`
			`type = "ClusterIP"`
			`selector = { app = "portal-stt" }`
			`port {`
			`name = "http"`
			`port = 8000`
			`target_port = 8000`
			`}`
			`}`
			`}`