infra/stacks/portal-tts/main.tf

# =============================================================================
# portal-tts — edge-tts (CPU, always-on) for the portal-assistant Gateway
# =============================================================================
#
# WHAT: a single ALWAYS-ON openai-edge-tts deployment (travisvn/openai-edge-tts),
# an OpenAI-compatible /v1/audio/speech proxy over Microsoft edge-tts neural
# voices, serving Bulgarian (bg-BG-KalinaNeural) AND English (en-US-AvaNeural),
# the voice chosen PER REQUEST by the Gateway, behind a ClusterIP Service
# `portal-tts.portal-tts.svc:8000`. CPU-only — no GPU, no NFS model store.
#
# WHY edge-tts (REPLACED Piper / openedai-speech on 2026-06-17): the local Piper
# Bulgarian voice (bg_BG-dimitar-medium, espeak-ng phonemes) was garbled and
# unintelligible — espeak mangles Bulgarian consonants (a synth->Whisper
# round-trip turned "Добър ден" into "Обърден"; a user heard pure gibberish).
# ADR-0003 always named Microsoft edge-tts as the online Bulgarian-quality
# fallback; the operator chose it for BOTH languages (validated 2026-06-17: edge
# bg round-trips through Whisper verbatim — "Добър ден! Как сте днес? ..."). The
# assistant already depends on the internet for the Claude brain, so an online
# TTS adds no new failure mode. English moved to edge too (one engine, higher
# quality) — the previous local Piper English worked but is no longer needed.
#
# NO GPU, NO NFS, NO SECRETS: edge-tts fetches voices from Microsoft on demand
# (nothing to persist), so the NFS model PVC + download init-container + voice
# ConfigMap of the old Piper design are all gone. The container needs EGRESS to
# speech.platform.bing.com (verified reachable from this namespace). The Service
# is ClusterIP-only and the Gateway is the sole externally-exposed component
# (ADR-0001) holding the edge auth, so REQUIRE_API_KEY=False here (the Gateway's
# TTSClient sends no Authorization to TTS).
#
# API SHAPE (unchanged Gateway contract): OpenAI /v1/audio/speech
#   POST /v1/audio/speech
#   { "model":"tts-1", "input":"<text>", "voice":"<edge voice name>",
#     "response_format":"wav" }  -> 200, body = raw PCM16 wav bytes
# The Gateway maps detected lang bg/en -> TTS_VOICE_BG / TTS_VOICE_EN (the edge
# voice names, set on the gateway Deployment), and openai-edge-tts accepts edge
# voice names directly. The `-ffmpeg` image variant is REQUIRED for wav output
# (the base image only emits mp3; ffmpeg transcodes to PCM16 wav).
# =============================================================================

variable "edge_tts_image" {
  description = "openai-edge-tts image (ffmpeg variant — needed for wav output)."
  type        = string

  # Image reference for the OpenAI-compatible edge-tts proxy container.
  #   * The `-ffmpeg` flavour ships ffmpeg, which is what makes
  #     response_format=wav (PCM16) possible.
  #   * The tag floats on purpose: upstream has no semver discipline, and the
  #     namespace is Keel-enrolled, so new digests behind this tag roll in on
  #     their own. Terraform is responsible for the tag string only.
  default = "travisvn/openai-edge-tts:latest-ffmpeg"
}

variable "bg_voice" {
  description = "Microsoft edge-tts neural Bulgarian voice (the Gateway's TTS_VOICE_BG must match)."
  type        = string

  # Bulgarian voice name. The voice is selected per request by the Gateway, so
  # this value has to stay in step with the Gateway's TTS_VOICE_BG setting.
  # NOTE(review): no resource in the visible part of this file reads
  # var.bg_voice (only var.en_voice is wired into the container) — confirm it
  # is consumed elsewhere in the stack, otherwise it is documentation only.
  default = "bg-BG-KalinaNeural"
}

variable "en_voice" {
  description = "Microsoft edge-tts neural English voice (the Gateway's TTS_VOICE_EN must match)."
  type        = string

  # English voice name. Besides mirroring the Gateway's TTS_VOICE_EN, this value
  # is passed to the container as DEFAULT_VOICE in the Deployment.
  default = "en-US-AvaNeural"
}

locals {
  # Base label set attached to the Deployment and Service metadata.
  labels = { app = "portal-tts" }

  # Name given to the namespace this stack creates.
  # NOTE(review): resources in this file also read local.tiers.aux, which is not
  # declared in this block — presumably provided by another file in the stack.
  namespace = "portal-tts"
}

# Dedicated namespace for the TTS proxy; every other resource in this file
# takes its namespace from this resource's metadata.
resource "kubernetes_namespace" "portal_tts" {
  metadata {
    name = local.namespace

    labels = {
      # Sidecar injection label set to "disabled" for this namespace.
      "istio-injection" = "disabled"
      # Marks the namespace as Keel-enrolled (the image uses a floating tag).
      "keel.sh/enrolled" = "true"
      # CPU-only best-effort helper, not a GPU tenant.
      tier = local.tiers.aux
    }
  }

  lifecycle {
    ignore_changes = [
      # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this
      # label on every namespace, so Terraform must not try to remove it.
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}

# Always-on openai-edge-tts. replicas=1, never scaled to zero (no GPU to free,
# negligible idle cost — it's a thin proxy to Microsoft edge-tts). CPU-only: NO
# node_selector / toleration / nvidia.com/gpu. No init container and no volumes:
# voices are fetched from Microsoft per request, so the pod is stateless.
resource "kubernetes_deployment" "portal_tts" {
  metadata {
    name      = "portal-tts"
    namespace = kubernetes_namespace.portal_tts.metadata[0].name
    labels    = merge(local.labels, { tier = local.tiers.aux })
  }
  spec {
    replicas = 1

    # Surge-then-retire rollout. The pod is stateless (no volumes, no GPU, no
    # exclusive resource), so nothing requires the old pod to be gone before the
    # new one starts. With `Recreate`, every Keel-driven digest bump of the
    # floating tag took the only replica down for the whole image pull + startup
    # window, which contradicts "always-on". max_unavailable = 0 keeps the old
    # pod serving until the replacement passes its readiness probe.
    strategy {
      type = "RollingUpdate"
      rolling_update {
        max_surge       = "1"
        max_unavailable = "0"
      }
    }
    selector {
      # Kept as a literal (not local.labels): the selector is immutable, so it
      # must not change if the shared label set ever grows.
      match_labels = { app = "portal-tts" }
    }
    template {
      metadata {
        labels = { app = "portal-tts" }
      }
      spec {
        container {
          name  = "portal-tts"
          image = var.edge_tts_image

          # openai-edge-tts listens on :5050 by default; the Service maps 8000 ->
          # 5050 so the Gateway's TTS_URL (:8000) is unchanged.
          port {
            container_port = 5050
            name           = "http"
          }
          # No API key: ClusterIP-only, the Gateway holds edge auth and sends no
          # Authorization header to TTS. DEFAULT_VOICE is a fallback only — every
          # request carries an explicit voice + response_format.
          env {
            name  = "REQUIRE_API_KEY"
            value = "False"
          }
          env {
            name  = "DEFAULT_VOICE"
            value = var.en_voice
          }

          # TCP probes — uvicorn binds :5050 only once the app is ready. No model
          # download, so startup is fast; egress to Microsoft happens per request.
          # These probes only prove the socket is open; they do not exercise the
          # upstream edge-tts path.
          startup_probe {
            tcp_socket { port = 5050 }
            period_seconds    = 5
            failure_threshold = 24 # 24 x 5s = ~2 min
          }
          readiness_probe {
            tcp_socket { port = 5050 }
            period_seconds    = 15
            failure_threshold = 4 # 4 x 15s = 1 min before leaving the endpoints
          }
          liveness_probe {
            tcp_socket { port = 5050 }
            initial_delay_seconds = 20
            period_seconds        = 30
            failure_threshold     = 5 # 5 x 30s = 2.5 min before a restart
          }

          resources {
            # Thin HTTP proxy to Microsoft edge-tts + ffmpeg transcode. Light on
            # CPU (no CPU limit — cluster CFS-throttling policy). VERIFY with krr
            # after real traffic and tighten.
            requests = {
              cpu    = "50m"
              memory = "256Mi"
            }
            limits = {
              memory = "512Mi"
            }
          }
        }
      }
    }
  }
  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
      # Keel is enrolled (floating tag) — ignore its annotation churn but let the
      # tag string keep applying from TF.
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
      metadata[0].annotations["keel.sh/match-tag"],
      metadata[0].annotations["kubernetes.io/change-cause"],
      metadata[0].annotations["deployment.kubernetes.io/revision"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
    ]
  }
}

# ClusterIP — in-cluster only (the Gateway calls this; audio stays on the LAN
# until the Gateway speaks it to the Portal). No ingress, no Authentik: the
# Gateway is the only externally exposed component (ADR-0001). OpenAI speech path:
# http://portal-tts.portal-tts.svc.cluster.local:8000/v1/audio/speech
resource "kubernetes_service" "portal_tts" {
  metadata {
    name      = "portal-tts"
    namespace = kubernetes_namespace.portal_tts.metadata[0].name
    labels    = local.labels
    annotations = {
      # openai-edge-tts has no /metrics; annotation-based scrape kept on a live
      # path so the Service stays in the scrape set (Ready-endpoint relabeling
      # filters non-Ready pods). /v1/models is the OpenAI model list.
      #
      # The port annotation is the CONTAINER port (5050), not the Service port
      # (8000): endpoint-based annotation scraping rewrites the target to
      # <pod IP>:<prometheus.io/port>, and nothing in the pod listens on 8000 —
      # only the Service performs the 8000 -> 5050 mapping.
      # NOTE(review): this assumes the cluster's annotation scrape job uses the
      # endpoints role, as the Ready-endpoint remark above implies; if it targets
      # the Service DNS name instead, "8000" is the right value — confirm against
      # the Prometheus relabel config.
      # NOTE(review): /v1/models presumably returns JSON rather than Prometheus
      # exposition format, so the target may report up=0 even when reachable —
      # verify no target-down alert keys off this job.
      "prometheus.io/scrape" = "true"
      "prometheus.io/path"   = "/v1/models"
      "prometheus.io/port"   = "5050"
    }
  }
  spec {
    type     = "ClusterIP"
    selector = { app = "portal-tts" }
    # Service port 8000 is what the Gateway calls; it forwards to the
    # container's listening port 5050.
    port {
      name        = "http"
      port        = 8000
      target_port = 5050
    }
  }
}
portal-assistant: land voice stacks + switch TTS to edge-tts (intelligible Bulgarian) The portal-assistant voice-assistant stacks (portal-tts, portal-stt, portal-assistant) were applied to the live cluster from feature branches but never landed on master — the GitOps source of truth. This lands all three and, in portal-tts, fixes Bulgarian speech. Bulgarian was unintelligible: the local Piper voice (bg_BG-dimitar-medium via espeak-ng) mangles Bulgarian consonants — a synth->Whisper round-trip turned "Добър ден" into "Обърден", and a user heard pure gibberish. English was fine. portal-tts now runs openai-edge-tts (Microsoft edge-tts neural voices) for BOTH languages instead of Piper — ADR-0003 always named edge-tts as the online Bulgarian-quality fallback. Validated before landing: edge bg round-trips through Whisper verbatim ("Добър ден! Как сте днес? ..."). The gateway maps detected language bg/en to the edge voice names via new TTS_VOICE_BG / TTS_VOICE_EN env (bg-BG-KalinaNeural / en-US-AvaNeural). No GPU, no NFS model store, no secrets — edge fetches voices from Microsoft per request (egress verified). The assistant already needs the internet for the Claude brain, so an online TTS adds no new failure mode. The brain stays Sonnet with no extended thinking (already the default — a live turn answers directly in ~3.4s), per the latency-over-smartness ask. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> 2026-06-17 20:25:29 +00:00			`# =============================================================================`
			`# portal-tts — edge-tts (CPU, always-on) for the portal-assistant Gateway`
			`# =============================================================================`
			`#`
			`# WHAT: a single ALWAYS-ON openai-edge-tts deployment (travisvn/openai-edge-tts),`
			`# an OpenAI-compatible /v1/audio/speech proxy over Microsoft edge-tts neural`
			`# voices, serving Bulgarian (bg-BG-KalinaNeural) AND English (en-US-AvaNeural),`
			`# the voice chosen PER REQUEST by the Gateway, behind a ClusterIP Service`
			# `portal-tts.portal-tts.svc:8000`. CPU-only — no GPU, no NFS model store.
			`#`
			`# WHY edge-tts (REPLACED Piper / openedai-speech on 2026-06-17): the local Piper`
			`# Bulgarian voice (bg_BG-dimitar-medium, espeak-ng phonemes) was garbled and`
			`# unintelligible — espeak mangles Bulgarian consonants (a synth->Whisper`
			`# round-trip turned "Добър ден" into "Обърден"; a user heard pure gibberish).`
			`# ADR-0003 always named Microsoft edge-tts as the online Bulgarian-quality`
			`# fallback; the operator chose it for BOTH languages (validated 2026-06-17: edge`
			`# bg round-trips through Whisper verbatim — "Добър ден! Как сте днес? ..."). The`
			`# assistant already depends on the internet for the Claude brain, so an online`
			`# TTS adds no new failure mode. English moved to edge too (one engine, higher`
			`# quality) — the previous local Piper English worked but is no longer needed.`
			`#`
			`# NO GPU, NO NFS, NO SECRETS: edge-tts fetches voices from Microsoft on demand`
			`# (nothing to persist), so the NFS model PVC + download init-container + voice`
			`# ConfigMap of the old Piper design are all gone. The container needs EGRESS to`
			`# speech.platform.bing.com (verified reachable from this namespace). The Service`
			`# is ClusterIP-only and the Gateway is the sole externally-exposed component`
			`# (ADR-0001) holding the edge auth, so REQUIRE_API_KEY=False here (the Gateway's`
			`# TTSClient sends no Authorization to TTS).`
			`#`
			`# API SHAPE (unchanged Gateway contract): OpenAI /v1/audio/speech`
			`# POST /v1/audio/speech`
			`# { "model":"tts-1", "input":"<text>", "voice":"<edge voice name>",`
			`# "response_format":"wav" } -> 200, body = raw PCM16 wav bytes`
			`# The Gateway maps detected lang bg/en -> TTS_VOICE_BG / TTS_VOICE_EN (the edge`
			`# voice names, set on the gateway Deployment), and openai-edge-tts accepts edge`
			# voice names directly. The `-ffmpeg` image variant is REQUIRED for wav output
			`# (the base image only emits mp3; ffmpeg transcodes to PCM16 wav).`
			`# =============================================================================`

			`variable "edge_tts_image" {`
			`type = string`
			# openai-edge-tts, the OpenAI-compatible edge-tts proxy. The `-ffmpeg` variant
			`# bundles ffmpeg so response_format=wav (PCM16) works. Floating tag (no semver`
			`# discipline upstream) — the namespace is Keel-enrolled so digest bumps roll in`
			`# automatically; TF owns only the tag string.`
			`default = "travisvn/openai-edge-tts:latest-ffmpeg"`
			`description = "openai-edge-tts image (ffmpeg variant — needed for wav output)."`
			`}`

			`variable "bg_voice" {`
			`type = string`
			`default = "bg-BG-KalinaNeural"`
			`description = "Microsoft edge-tts neural Bulgarian voice (the Gateway's TTS_VOICE_BG must match)."`
			`}`

			`variable "en_voice" {`
			`type = string`
			`default = "en-US-AvaNeural"`
			`description = "Microsoft edge-tts neural English voice (the Gateway's TTS_VOICE_EN must match)."`
			`}`

			`locals {`
			`namespace = "portal-tts"`
			`labels = { app = "portal-tts" }`
			`}`

			`resource "kubernetes_namespace" "portal_tts" {`
			`metadata {`
			`name = local.namespace`
			`labels = {`
			`tier = local.tiers.aux # CPU-only best-effort helper, not a GPU tenant`
			`"istio-injection" = "disabled"`
			`"keel.sh/enrolled" = "true"`
			`}`
			`}`
			`lifecycle {`
			`# KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace`
			`ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]`
			`}`
			`}`

			`# Always-on openai-edge-tts. replicas=1, never scaled to zero (no GPU to free,`
			`# negligible idle cost — it's a thin proxy to Microsoft edge-tts). CPU-only: NO`
			`# node_selector / toleration / nvidia.com/gpu. No init container and no volumes:`
			`# voices are fetched from Microsoft per request, so the pod is stateless.`
			`resource "kubernetes_deployment" "portal_tts" {`
			`metadata {`
			`name = "portal-tts"`
			`namespace = kubernetes_namespace.portal_tts.metadata[0].name`
			`labels = merge(local.labels, { tier = local.tiers.aux })`
			`}`
			`spec {`
			`replicas = 1`
			`strategy { type = "Recreate" }`
			`selector {`
			`match_labels = { app = "portal-tts" }`
			`}`
			`template {`
			`metadata {`
			`labels = { app = "portal-tts" }`
			`}`
			`spec {`
			`container {`
			`name = "portal-tts"`
			`image = var.edge_tts_image`

			`# openai-edge-tts listens on :5050 by default; the Service maps 8000 ->`
			`# 5050 so the Gateway's TTS_URL (:8000) is unchanged.`
			`port {`
			`container_port = 5050`
			`name = "http"`
			`}`
			`# No API key: ClusterIP-only, the Gateway holds edge auth and sends no`
			`# Authorization header to TTS. DEFAULT_VOICE is a fallback only — every`
			`# request carries an explicit voice + response_format.`
			`env {`
			`name = "REQUIRE_API_KEY"`
			`value = "False"`
			`}`
			`env {`
			`name = "DEFAULT_VOICE"`
			`value = var.en_voice`
			`}`

			`# TCP probes — uvicorn binds :5050 only once the app is ready. No model`
			`# download, so startup is fast; egress to Microsoft happens per request.`
			`startup_probe {`
			`tcp_socket { port = 5050 }`
			`period_seconds = 5`
			`failure_threshold = 24 # ~2 min`
			`}`
			`readiness_probe {`
			`tcp_socket { port = 5050 }`
			`period_seconds = 15`
			`failure_threshold = 4`
			`}`
			`liveness_probe {`
			`tcp_socket { port = 5050 }`
			`initial_delay_seconds = 20`
			`period_seconds = 30`
			`failure_threshold = 5`
			`}`

			`resources {`
			`# Thin HTTP proxy to Microsoft edge-tts + ffmpeg transcode. Light on`
			`# CPU (no CPU limit — cluster CFS-throttling policy). VERIFY with krr`
			`# after real traffic and tighten.`
			`requests = {`
			`cpu = "50m"`
			`memory = "256Mi"`
			`}`
			`limits = {`
			`memory = "512Mi"`
			`}`
			`}`
			`}`
			`}`
			`}`
			`}`
			`lifecycle {`
			`ignore_changes = [`
			`spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1`
			`# Keel is enrolled (floating tag) — ignore its annotation churn but let the`
			`# tag string keep applying from TF.`
			`metadata[0].annotations["keel.sh/policy"],`
			`metadata[0].annotations["keel.sh/trigger"],`
			`metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2`
			`metadata[0].annotations["keel.sh/match-tag"],`
			`metadata[0].annotations["kubernetes.io/change-cause"],`
			`metadata[0].annotations["deployment.kubernetes.io/revision"],`
			`spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1`
			`]`
			`}`
			`}`

			`# ClusterIP — in-cluster only (the Gateway calls this; audio stays on the LAN`
			`# until the Gateway speaks it to the Portal). No ingress, no Authentik: the`
			`# Gateway is the only externally exposed component (ADR-0001). OpenAI speech path:`
			`# http://portal-tts.portal-tts.svc.cluster.local:8000/v1/audio/speech`
			`resource "kubernetes_service" "portal_tts" {`
			`metadata {`
			`name = "portal-tts"`
			`namespace = kubernetes_namespace.portal_tts.metadata[0].name`
			`labels = local.labels`
			`annotations = {`
			`# openai-edge-tts has no /metrics; annotation-based scrape kept on a live`
			`# path so the Service stays in the scrape set (Ready-endpoint relabeling`
			`# filters non-Ready pods). /v1/models is the OpenAI model list.`
			`"prometheus.io/scrape" = "true"`
			`"prometheus.io/path" = "/v1/models"`
			`"prometheus.io/port" = "8000"`
			`}`
			`}`
			`spec {`
			`type = "ClusterIP"`
			`selector = { app = "portal-tts" }`
			`port {`
			`name = "http"`
			`port = 8000`
			`target_port = 5050`
			`}`
			`}`
			`}`