infra/stacks/portal-tts/main.tf

# =============================================================================
# portal-tts — edge-tts (CPU, always-on) for the portal-assistant Gateway
# =============================================================================
#
# WHAT: a single ALWAYS-ON openai-edge-tts deployment (travisvn/openai-edge-tts),
# an OpenAI-compatible /v1/audio/speech proxy over Microsoft edge-tts neural
# voices, serving Bulgarian (bg-BG-KalinaNeural) AND English (en-US-AvaNeural),
# the voice chosen PER REQUEST by the Gateway, behind a ClusterIP Service
# `portal-tts.portal-tts.svc:8000`. CPU-only — no GPU, no NFS model store.
#
# WHY edge-tts (REPLACED Piper / openedai-speech on 2026-06-17): the local Piper
# Bulgarian voice (bg_BG-dimitar-medium, espeak-ng phonemes) was garbled and
# unintelligible — espeak mangles Bulgarian consonants (a synth->Whisper
# round-trip turned "Добър ден" into "Обърден"; a user heard pure gibberish).
# ADR-0003 always named Microsoft edge-tts as the online Bulgarian-quality
# fallback; the operator chose it for BOTH languages (validated 2026-06-17: edge
# bg round-trips through Whisper verbatim — "Добър ден! Как сте днес? ..."). The
# assistant already depends on the internet for the Claude brain, so an online
# TTS adds no new failure mode. English moved to edge too (one engine, higher
# quality) — the previous local Piper English worked but is no longer needed.
#
# NO GPU, NO NFS, NO SECRETS: edge-tts fetches voices from Microsoft on demand
# (nothing to persist), so the NFS model PVC + download init-container + voice
# ConfigMap of the old Piper design are all gone. The container needs EGRESS to
# speech.platform.bing.com (verified reachable from this namespace). The Service
# is ClusterIP-only and the Gateway is the sole externally-exposed component
# (ADR-0001) holding the edge auth, so REQUIRE_API_KEY=False here (the Gateway's
# TTSClient sends no Authorization to TTS).
#
# API SHAPE (unchanged Gateway contract): OpenAI /v1/audio/speech
#   POST /v1/audio/speech
#   { "model":"tts-1", "input":"<text>", "voice":"<edge voice name>",
#     "response_format":"wav" }  -> 200, body = raw PCM16 wav bytes
# The Gateway maps detected lang bg/en -> TTS_VOICE_BG / TTS_VOICE_EN (the edge
# voice names, set on the gateway Deployment), and openai-edge-tts accepts edge
# voice names directly. The `-ffmpeg` image variant is REQUIRED for wav output
# (the base image only emits mp3; ffmpeg transcodes to PCM16 wav).
# =============================================================================

variable "edge_tts_image" {
  type = string
  # openai-edge-tts, the OpenAI-compatible edge-tts proxy. The `-ffmpeg` variant
  # bundles ffmpeg so response_format=wav (PCM16) works. Floating tag (no semver
  # discipline upstream) — the namespace is Keel-enrolled so digest bumps roll in
  # automatically; TF owns only the tag string.
  default     = "travisvn/openai-edge-tts:latest-ffmpeg"
  description = "openai-edge-tts image (ffmpeg variant — needed for wav output)."
}

variable "bg_voice" {
  type        = string
  default     = "bg-BG-KalinaNeural"
  description = "Microsoft edge-tts neural Bulgarian voice (the Gateway's TTS_VOICE_BG must match)."
}

variable "en_voice" {
  type        = string
  default     = "en-US-AvaNeural"
  description = "Microsoft edge-tts neural English voice (the Gateway's TTS_VOICE_EN must match)."
}

locals {
  namespace = "portal-tts"
  labels    = { app = "portal-tts" }
}

resource "kubernetes_namespace" "portal_tts" {
  metadata {
    name = local.namespace
    labels = {
      tier               = local.tiers.aux # CPU-only best-effort helper, not a GPU tenant
      "istio-injection"  = "disabled"
      "keel.sh/enrolled" = "true"
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
  }
}

# Always-on openai-edge-tts. replicas=1, never scaled to zero (no GPU to free,
# negligible idle cost — it's a thin proxy to Microsoft edge-tts). CPU-only: NO
# node_selector / toleration / nvidia.com/gpu. No init container and no volumes:
# voices are fetched from Microsoft per request, so the pod is stateless.
resource "kubernetes_deployment" "portal_tts" {
  metadata {
    name      = "portal-tts"
    namespace = kubernetes_namespace.portal_tts.metadata[0].name
    labels    = merge(local.labels, { tier = local.tiers.aux })
  }
  spec {
    replicas = 1
    strategy { type = "Recreate" }
    selector {
      match_labels = { app = "portal-tts" }
    }
    template {
      metadata {
        labels = { app = "portal-tts" }
      }
      spec {
        container {
          name  = "portal-tts"
          image = var.edge_tts_image

          # openai-edge-tts listens on :5050 by default; the Service maps 8000 ->
          # 5050 so the Gateway's TTS_URL (:8000) is unchanged.
          port {
            container_port = 5050
            name           = "http"
          }
          # No API key: ClusterIP-only, the Gateway holds edge auth and sends no
          # Authorization header to TTS. DEFAULT_VOICE is a fallback only — every
          # request carries an explicit voice + response_format.
          env {
            name  = "REQUIRE_API_KEY"
            value = "False"
          }
          env {
            name  = "DEFAULT_VOICE"
            value = var.en_voice
          }

          # TCP probes — uvicorn binds :5050 only once the app is ready. No model
          # download, so startup is fast; egress to Microsoft happens per request.
          startup_probe {
            tcp_socket { port = 5050 }
            period_seconds    = 5
            failure_threshold = 24 # ~2 min
          }
          readiness_probe {
            tcp_socket { port = 5050 }
            period_seconds    = 15
            failure_threshold = 4
          }
          liveness_probe {
            tcp_socket { port = 5050 }
            initial_delay_seconds = 20
            period_seconds        = 30
            failure_threshold     = 5
          }

          resources {
            # Thin HTTP proxy to Microsoft edge-tts + ffmpeg transcode. Light on
            # CPU (no CPU limit — cluster CFS-throttling policy). VERIFY with krr
            # after real traffic and tighten.
            requests = {
              cpu    = "50m"
              memory = "256Mi"
            }
            limits = {
              memory = "512Mi"
            }
          }
        }
      }
    }
  }
  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
      # Keel is enrolled (floating tag) — ignore its annotation churn but let the
      # tag string keep applying from TF.
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
      metadata[0].annotations["keel.sh/match-tag"],
      metadata[0].annotations["kubernetes.io/change-cause"],
      metadata[0].annotations["deployment.kubernetes.io/revision"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
    ]
  }
}

# ClusterIP — in-cluster only (the Gateway calls this; audio stays on the LAN
# until the Gateway speaks it to the Portal). No ingress, no Authentik: the
# Gateway is the only externally exposed component (ADR-0001). OpenAI speech path:
# http://portal-tts.portal-tts.svc.cluster.local:8000/v1/audio/speech
resource "kubernetes_service" "portal_tts" {
  metadata {
    name      = "portal-tts"
    namespace = kubernetes_namespace.portal_tts.metadata[0].name
    labels    = local.labels
    annotations = {
      # openai-edge-tts has no /metrics; annotation-based scrape kept on a live
      # path so the Service stays in the scrape set (Ready-endpoint relabeling
      # filters non-Ready pods). /v1/models is the OpenAI model list.
      "prometheus.io/scrape" = "true"
      "prometheus.io/path"   = "/v1/models"
      "prometheus.io/port"   = "8000"
    }
  }
  spec {
    type     = "ClusterIP"
    selector = { app = "portal-tts" }
    port {
      name        = "http"
      port        = 8000
      target_port = 5050
    }
  }
}