From e7b9a7475658d5d601eff36fe449cc8c385add08 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Wed, 17 Jun 2026 20:25:29 +0000 Subject: [PATCH] portal-assistant: land voice stacks + switch TTS to edge-tts (intelligible Bulgarian) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The portal-assistant voice-assistant stacks (portal-tts, portal-stt, portal-assistant) were applied to the live cluster from feature branches but never landed on master — the GitOps source of truth. This lands all three and, in portal-tts, fixes Bulgarian speech. Bulgarian was unintelligible: the local Piper voice (bg_BG-dimitar-medium via espeak-ng) mangles Bulgarian consonants — a synth->Whisper round-trip turned "Добър ден" into "Обърден", and a user heard pure gibberish. English was fine. portal-tts now runs openai-edge-tts (Microsoft edge-tts neural voices) for BOTH languages instead of Piper — ADR-0003 always named edge-tts as the online Bulgarian-quality fallback. Validated before landing: edge bg round-trips through Whisper verbatim ("Добър ден! Как сте днес? ..."). The gateway maps detected language bg/en to the edge voice names via new TTS_VOICE_BG / TTS_VOICE_EN env (bg-BG-KalinaNeural / en-US-AvaNeural). No GPU, no NFS model store, no secrets — edge fetches voices from Microsoft per request (egress verified). The assistant already needs the internet for the Claude brain, so an online TTS adds no new failure mode. The brain stays Sonnet with no extended thinking (already the default — a live turn answers directly in ~3.4s), per the latency-over-smartness ask. 
Co-Authored-By: Claude Opus 4.8 --- stacks/portal-assistant/main.tf | 230 ++++++++++++++++ stacks/portal-assistant/terragrunt.hcl | 13 + stacks/portal-stt/main.tf | 360 +++++++++++++++++++++++++ stacks/portal-stt/terragrunt.hcl | 33 +++ stacks/portal-tts/main.tf | 203 ++++++++++++++ stacks/portal-tts/terragrunt.hcl | 33 +++ 6 files changed, 872 insertions(+) create mode 100644 stacks/portal-assistant/main.tf create mode 100644 stacks/portal-assistant/terragrunt.hcl create mode 100644 stacks/portal-stt/main.tf create mode 100644 stacks/portal-stt/terragrunt.hcl create mode 100644 stacks/portal-tts/main.tf create mode 100644 stacks/portal-tts/terragrunt.hcl diff --git a/stacks/portal-assistant/main.tf b/stacks/portal-assistant/main.tf new file mode 100644 index 00000000..10020d47 --- /dev/null +++ b/stacks/portal-assistant/main.tf @@ -0,0 +1,230 @@ +# ============================================================================= +# portal-assistant gateway — voice orchestrator (STT -> Brain -> TTS) +# ============================================================================= +# The single service the Client app talks to: POST /v1/talk takes a WAV + a +# client id, runs Speaches STT -> the claude-agent-service conversational Brain +# -> edge-tts TTS (portal-tts), and returns the spoken reply. Reached through +# the public Cloudflare ingress declared at the bottom of this file and guarded +# by the DEVICE_TOKEN bearer. In-memory sessions (no SESSION_DB_DSN). See +# portal-assistant ADR-0001/0002/0003. 
+# ============================================================================= + +data "vault_kv_secret_v2" "viktor" { + mount = "secret" + name = "viktor" +} + +data "vault_kv_secret_v2" "cas" { + mount = "secret" + name = "claude-agent-service" +} + +data "vault_kv_secret_v2" "pa" { + mount = "secret" + name = "portal-assistant" +} + +locals { + namespace = "portal-assistant" + labels = { app = "portal-assistant-gateway" } + image = "ghcr.io/viktorbarzin/portal-assistant-gateway:latest" +} + +resource "kubernetes_namespace" "portal_assistant" { + metadata { + name = local.namespace + labels = { + tier = local.tiers.edge + "istio-injection" = "disabled" + "keel.sh/enrolled" = "true" + } + } + lifecycle { + ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] + } +} + +# Pull secret — the gateway image is a PRIVATE ghcr package. Uses the read-only +# ghcr_pull_token (secret/viktor), the same cred the cluster-wide allowlist uses. +resource "kubernetes_secret" "ghcr" { + metadata { + name = "ghcr-pull" + namespace = kubernetes_namespace.portal_assistant.metadata[0].name + } + type = "kubernetes.io/dockerconfigjson" + data = { + ".dockerconfigjson" = jsonencode({ + auths = { + "ghcr.io" = { + username = "viktorbarzin" + password = data.vault_kv_secret_v2.viktor.data["ghcr_pull_token"] + auth = base64encode("viktorbarzin:${data.vault_kv_secret_v2.viktor.data["ghcr_pull_token"]}") + } + } + }) + } +} + +# Tokens the gateway needs: BRAIN_TOKEN = claude-agent-service's bearer (to call +# the conversational endpoint); DEVICE_TOKEN = the per-Client secret the Portal +# app carries to authenticate to /v1/talk. 
+resource "kubernetes_secret" "gateway" { + metadata { + name = "portal-assistant-gateway-secrets" + namespace = kubernetes_namespace.portal_assistant.metadata[0].name + } + data = { + BRAIN_TOKEN = data.vault_kv_secret_v2.cas.data["api_bearer_token"] + DEVICE_TOKEN = data.vault_kv_secret_v2.pa.data["device_token"] + } +} + +resource "kubernetes_deployment" "gateway" { + metadata { + name = "portal-assistant-gateway" + namespace = kubernetes_namespace.portal_assistant.metadata[0].name + labels = merge(local.labels, { tier = local.tiers.edge }) + } + spec { + replicas = 1 + selector { + match_labels = { app = "portal-assistant-gateway" } + } + template { + metadata { + labels = { app = "portal-assistant-gateway" } + } + spec { + image_pull_secrets { + name = kubernetes_secret.ghcr.metadata[0].name + } + container { + name = "gateway" + image = local.image + image_pull_policy = "Always" + port { + container_port = 8000 + name = "http" + } + # STT -> Speaches; TTS -> edge-tts (portal-tts); Brain -> claude-agent-service. + env { + name = "STT_URL" + value = "http://portal-stt.portal-stt.svc.cluster.local:8000" + } + env { + name = "STT_MODEL" + value = "deepdml/faster-whisper-large-v3-turbo-ct2" + } + env { + name = "TTS_URL" + value = "http://portal-tts.portal-tts.svc.cluster.local:8000" + } + # portal-tts now serves Microsoft edge-tts neural voices (Piper's + # Bulgarian was garbled; 2026-06-17). The gateway maps detected lang + # bg/en -> these edge voice names, which openai-edge-tts accepts directly. 
+ env { + name = "TTS_VOICE_BG" + value = "bg-BG-KalinaNeural" + } + env { + name = "TTS_VOICE_EN" + value = "en-US-AvaNeural" + } + env { + name = "BRAIN_URL" + value = "http://claude-agent-service.claude-agent.svc.cluster.local:8080" + } + env { + name = "BRAIN_TOKEN" + value_from { + secret_key_ref { + name = kubernetes_secret.gateway.metadata[0].name + key = "BRAIN_TOKEN" + } + } + } + env { + name = "DEVICE_TOKEN" + value_from { + secret_key_ref { + name = kubernetes_secret.gateway.metadata[0].name + key = "DEVICE_TOKEN" + } + } + } + readiness_probe { + http_get { + path = "/health" + port = 8000 + } + period_seconds = 10 + } + liveness_probe { + http_get { + path = "/health" + port = 8000 + } + initial_delay_seconds = 15 + period_seconds = 30 + } + resources { + requests = { + cpu = "50m" + memory = "256Mi" + } + limits = { + memory = "512Mi" + } + } + } + } + } + } + lifecycle { + ignore_changes = [ + spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1 + ] + } +} + +# ClusterIP — in-cluster address of the gateway and the backend of the public +# Cloudflare ingress declared below; also what the in-cluster E2E smoke +# targets. /metrics scraped by Prometheus. +resource "kubernetes_service" "gateway" { + metadata { + name = "portal-assistant-gateway" + namespace = kubernetes_namespace.portal_assistant.metadata[0].name + labels = local.labels + annotations = { + "prometheus.io/scrape" = "true" + "prometheus.io/path" = "/metrics" + "prometheus.io/port" = "8000" + } + } + spec { + type = "ClusterIP" + selector = { app = "portal-assistant-gateway" } + port { + name = "http" + port = 8000 + target_port = 8000 + } + } +} + +# Public Cloudflare ingress — the Portal app reaches the gateway at +# https://portal-assistant.viktorbarzin.me/v1/talk. tls-secret is Kyverno-synced +# into the namespace. The gateway holds its own edge auth (the DEVICE_TOKEN +# bearer), so no Authentik in front. 
+module "ingress" { + source = "../../modules/kubernetes/ingress_factory" + name = "portal-assistant" + namespace = kubernetes_namespace.portal_assistant.metadata[0].name + service_name = kubernetes_service.gateway.metadata[0].name + port = 8000 + tls_secret_name = "tls-secret" + # auth = "app": the gateway enforces its own DEVICE_TOKEN bearer on /v1/talk; Authentik would break the native Portal client (it has no browser login). + auth = "app" + dns_type = "proxied" + max_body_size = "25m" # audio (WAV) uploads +} diff --git a/stacks/portal-assistant/terragrunt.hcl b/stacks/portal-assistant/terragrunt.hcl new file mode 100644 index 00000000..222cd339 --- /dev/null +++ b/stacks/portal-assistant/terragrunt.hcl @@ -0,0 +1,13 @@ +include "root" { + path = find_in_parent_folders() +} + +dependency "platform" { + config_path = "../platform" + skip_outputs = true +} + +# portal-assistant gateway — the voice-assistant orchestrator (STT -> Brain -> +# TTS). The Service is ClusterIP; the Portal app reaches it through the public +# Cloudflare ingress declared in main.tf. In-memory sessions for now (no +# SESSION_DB_DSN); CNPG Postgres is a later add. portal-assistant issue #10. diff --git a/stacks/portal-stt/main.tf b/stacks/portal-stt/main.tf new file mode 100644 index 00000000..7c0e00bb --- /dev/null +++ b/stacks/portal-stt/main.tf @@ -0,0 +1,360 @@ +# ============================================================================= +# portal-stt — Speaches STT (Whisper large-v3-turbo int8) for portal-assistant +# ============================================================================= +# +# DRAFT for operator review (portal-assistant issue #2). HITL apply: an agent +# drafts; the operator applies via GitOps (presence-claimed) and verifies the +# rollout. Do NOT `terragrunt apply` this from a worktree. 
+# +# WHAT: a single WARM-RESIDENT Speaches deployment (OpenAI-compatible +# faster-whisper server) serving `large-v3-turbo` int8, multilingual (Bulgarian +# + English), on the shared Tesla T4 (one time-slice). ClusterIP only — audio +# never leaves the LAN; the portal-assistant Gateway is the only externally +# exposed component (ADR-0001), so no ingress/auth here. +# +# WHY WARM-RESIDENT, NOT THE CHATTERBOX DEMAND-GATE: +# The TTS (chatterbox) stack scales 0<->1 behind a free-VRAM CronJob gate +# because it is a best-effort BATCH tenant (tripit narration) that can wait. +# STT here is INTERACTIVE voice — every Turn would pay a multi-second cold +# model load (download/mmap + CUDA init) if we scaled to zero. So this stack +# keeps the model permanently loaded: replicas=1 + Speaches STT_MODEL_TTL=-1 +# (never unload) + PRELOAD_MODELS (load at startup). See portal-assistant +# CONTEXT.md "Warm window" + ADR-0003. +# +# OOM HISTORY / VRAM MATH — the binding constraint is the shared T4 (16 GiB, +# time-sliced across immich-ml / frigate / llama-swap / android-emulator with +# NO per-tenant VRAM isolation). See +# docs/post-mortems/2026-06-02-immich-ml-ttl-gpu-oom-recruiter.md (immich-ml's +# unbounded onnxruntime arena starved llama-swap's qwen3-8b -> recruiter down). +# +# Live residents measured 2026-06-17 (gpu_pod_memory_used_bytes): +# immich-ml ~2.1 GiB (capped: MACHINE_LEARNING_MODEL_TTL=600) +# frigate (8 proc) ~1.9 GiB (detector + ffmpeg decode) +# android-emulator ~0.15 GiB +# llama-swap 0 idle, but loads qwen3-8b on demand = ~4.35 GiB peak +# (cudaMalloc 4455 MiB, per the post-mortem) +# Worst-case concurrent baseline (everything hot): 2.1 + 1.9 + 0.15 + 4.35 +# = ~8.5 GiB. +# Speaches large-v3-turbo int8 weights ~= 0.8 GiB on disk; resident CTranslate2 +# int8 + CUDA context + decode buffers budget conservatively to ~1.5 GiB +# (VERIFY at apply against gpu_pod_memory_used_bytes{namespace="portal-stt"}). 
+# +# 8.5 (residents) + 1.5 (this) = ~10.0 GiB used => ~6 GiB T4 headroom. +# That headroom is the safety margin against onnxruntime arena drift (the +# exact failure mode from 2026-06-02). If a future resident grows, this is the +# FIRST place to re-measure. The conservative int8 (not fp16) choice halves +# our weight footprint precisely to protect this margin. +# +# GPU PRIORITY: this pod requests nvidia.com/gpu, so the Kyverno +# `inject-gpu-workload-priority` ClusterPolicy auto-stamps the immich-equal +# `gpu-workload` (1,200,000) priority — portal-stt is NOT in that policy's +# exclude list (only `tts` is, to keep chatterbox demotable). That is CORRECT +# here: warm interactive STT is a first-class GPU resident, never the first +# evicted. We also set priority_class_name explicitly so intent is legible at +# the call site and survives a policy fail-open. (Contrast tts/main.tf, which +# pins tier-2-gpu precisely so chatterbox IS evicted first.) +# ============================================================================= + +variable "tls_secret_name" { + type = string + sensitive = true +} + +variable "nfs_server" { + type = string + description = "NFS server (Proxmox host). From config.tfvars (192.168.1.127)." +} + +variable "speaches_image" { + type = string + # ghcr.io/speaches-ai/speaches CUDA build. The live registry currently + # publishes 0.9.0-rc.3-cuda (+ sha-/cuda-12.x variants) and a moving + # :latest-cuda; there is no published :0.8.3-cuda for the last stable. Pinned + # to the rc.3 CUDA tag (immutable-ish, beats :latest for the OOM/Keel-churn + # history). CUDA 12.4/12.6 image runtime is fine under our 570.195.03 driver + # (CUDA 12.8, backward-compatible). OPEN ITEM for operator: confirm this tag + # still resolves at apply, or bump to the newest -cuda tag. + default = "ghcr.io/speaches-ai/speaches:0.9.0-rc.3-cuda" + description = "Speaches CUDA image. Pin a -cuda tag, not :latest-cuda." 
+} + +variable "stt_model_id" { + type = string + # HF repo id of the CTranslate2 large-v3-turbo conversion. deepdml's is the + # canonical community ct2 build of openai large-v3-turbo (multilingual, + # incl. Bulgarian) and is what ADR-0003's FLEURS-bg bake-off measured at + # 8.3% WER. Speaches resolves whisper models by HF repo id. + default = "deepdml/faster-whisper-large-v3-turbo-ct2" + description = "HuggingFace repo id of the warm-resident whisper model." +} + +locals { + namespace = "portal-stt" + labels = { app = "portal-stt" } + + # Speaches is configured via env vars (pydantic-settings): scalars map from + # UPPER_SNAKE, nested whisper.* settings from WHISPER__FIELD. The three knobs + # that make this WARM-RESIDENT and int8: + # PRELOAD_MODELS — JSON list, loaded sequentially at startup so the + # first Turn is never cold (pod won't go Ready until + # the model is in VRAM). + # STT_MODEL_TTL=-1 — never unload an idle STT model (0=immediate, + # default 300s). This is the warm-resident lever. + # WHISPER__COMPUTE_TYPE — int8 (conservative VRAM; default "default"=fp16). + # WHISPER__INFERENCE_DEVICE — cuda (default "auto"). + # HF cache is redirected onto the NFS-SSD PVC so weights download once and + # persist across pod restarts (image default cache is /home/ubuntu/.cache/ + # huggingface/hub — ephemeral). Speaches runs as uid 1000 (ubuntu). 
+} + +resource "kubernetes_namespace" "portal_stt" { + metadata { + name = local.namespace + labels = { + tier = local.tiers.gpu + "istio-injection" = "disabled" + "keel.sh/enrolled" = "true" + } + } + lifecycle { + # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace + ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] + } +} + +module "tls_secret" { + source = "../../modules/kubernetes/setup_tls_secret" + namespace = kubernetes_namespace.portal_stt.metadata[0].name + tls_secret_name = var.tls_secret_name +} + +# Model + HF cache on NFS-SSD (fast first-load, persists across restarts). Path +# /srv/nfs-ssd/portal-stt on the Proxmox host (192.168.1.127). Mirrors the +# chatterbox nfs_models pattern. RWX so a future seed/inspect pod can touch it. +module "nfs_models" { + source = "../../modules/kubernetes/nfs_volume" + name = "portal-stt-models" + namespace = kubernetes_namespace.portal_stt.metadata[0].name + nfs_server = var.nfs_server + nfs_path = "/srv/nfs-ssd/portal-stt" + storage = "10Gi" # large-v3-turbo ct2 (~0.8Gi) + HF cache headroom +} + +# One-shot bootstrap: /srv/nfs-ssd is exported whole-tree but the portal-stt +# SUBDIR must exist before kubelet can bind-mount it (chatterbox hit exit 32 on +# a missing subdir the first window — see stacks/tts/main.tf). Mount the export +# ROOT (which exists) and mkdir the subtree; kubelet's mount retry then heals +# the main pod. Idempotent; immutable-once-created. 
+resource "kubernetes_job" "models_dir_init" { + metadata { + name = "portal-stt-models-dir-init" + namespace = kubernetes_namespace.portal_stt.metadata[0].name + labels = local.labels + } + spec { + backoff_limit = 3 + ttl_seconds_after_finished = 86400 + template { + metadata { labels = local.labels } + spec { + restart_policy = "Never" + container { + name = "mkdir" + image = "busybox:1.37" + command = ["sh", "-c", "mkdir -p /mnt/portal-stt/hub && ls -la /mnt/portal-stt"] + volume_mount { + name = "nfs-ssd-root" + mount_path = "/mnt" + } + } + volume { + name = "nfs-ssd-root" + nfs { + server = var.nfs_server + path = "/srv/nfs-ssd" + } + } + } + } + } + wait_for_completion = true + timeouts { create = "3m" } +} + +# Warm-resident Speaches. replicas=1, NEVER scaled to zero (no off-peak gate, +# unlike tts) — the model stays in VRAM so interactive Turns never pay a cold +# load. wait_for_rollout left default (true): a plain apply SHOULD block until +# the model is loaded and the pod is Ready, surfacing a bad image/model early. +resource "kubernetes_deployment" "portal_stt" { + metadata { + name = "portal-stt" + namespace = kubernetes_namespace.portal_stt.metadata[0].name + labels = merge(local.labels, { tier = local.tiers.gpu }) + } + spec { + replicas = 1 + # RWO is not in play (model PVC is RWX NFS), but Recreate avoids two pods + # briefly double-loading the model into the shared T4 during a rollout. + strategy { type = "Recreate" } + selector { + match_labels = { app = "portal-stt" } + } + template { + metadata { + labels = { app = "portal-stt" } + } + spec { + node_selector = { "nvidia.com/gpu.present" = "true" } + toleration { + key = "nvidia.com/gpu" + operator = "Equal" + value = "true" + effect = "NoSchedule" + } + # First-class GPU resident (warm interactive STT) — same priority as + # immich-ml. Kyverno would stamp this anyway (portal-stt is not in the + # gpu-priority exclude list); set explicitly for legibility + fail-open + # safety. 
NOT tier-2-gpu (that is chatterbox's evict-first demotion). + priority_class_name = "gpu-workload" + + container { + name = "portal-stt" + image = var.speaches_image + + # --- warm-resident + int8 + cuda config (see locals) --- + env { + name = "PRELOAD_MODELS" + value = jsonencode([var.stt_model_id]) + } + env { + name = "STT_MODEL_TTL" + value = "-1" # never unload — the warm-resident lever + } + env { + name = "WHISPER__INFERENCE_DEVICE" + value = "cuda" + } + env { + name = "WHISPER__COMPUTE_TYPE" + value = "int8" # conservative VRAM (vs fp16 default) + } + env { + name = "LOG_LEVEL" + value = "info" # image default is debug + } + # Persist the HF model cache on the NFS-SSD PVC (image default cache + # dir is ephemeral). Speaches/HF honour HF_HUB_CACHE + HF_HOME. + env { + name = "HF_HUB_CACHE" + value = "/data/hub" + } + env { + name = "HF_HOME" + value = "/data" + } + + port { + container_port = 8000 + name = "http" + } + + volume_mount { + name = "models" + mount_path = "/data" + } + + # /health is Speaches' liveness/readiness path. Generous startup + # allowance: the first boot downloads large-v3-turbo to the PVC before + # the server reports healthy (PRELOAD blocks startup). After the model + # is cached on NFS-SSD, subsequent boots load in seconds. 
+ startup_probe { + http_get { + path = "/health" + port = 8000 + } + period_seconds = 10 + failure_threshold = 60 # up to ~10 min for the first model download + } + readiness_probe { + http_get { + path = "/health" + port = 8000 + } + period_seconds = 15 + failure_threshold = 4 + } + liveness_probe { + http_get { + path = "/health" + port = 8000 + } + initial_delay_seconds = 30 + period_seconds = 30 + failure_threshold = 5 + } + + resources { + requests = { + cpu = "200m" + memory = "2Gi" + } + limits = { + memory = "4Gi" + "nvidia.com/gpu" = "1" # ONE time-slice (operator advertises 100), NOT the whole card + } + } + } + + volume { + name = "models" + persistent_volume_claim { + claim_name = module.nfs_models.claim_name + } + } + } + } + } + lifecycle { + ignore_changes = [ + spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1 + # image is TF-OWNED (pinned -cuda tag) — Keel can manage the digest on + # this tag if desired, so ignore keel's annotation churn but NOT the image + # itself (we want tag pins to apply). Mirrors tts: keel annotations only. + metadata[0].annotations["keel.sh/policy"], + metadata[0].annotations["keel.sh/trigger"], + metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 + metadata[0].annotations["keel.sh/match-tag"], + metadata[0].annotations["kubernetes.io/change-cause"], + metadata[0].annotations["deployment.kubernetes.io/revision"], + spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1 + ] + } +} + +# ClusterIP — in-cluster only (the Gateway calls this; audio stays on the LAN). +# No ingress, no Authentik: the Gateway is the only externally exposed component +# (ADR-0001) and holds the edge auth. 
OpenAI transcription path is +# http://portal-stt.portal-stt.svc.cluster.local:8000/v1/audio/transcriptions +resource "kubernetes_service" "portal_stt" { + metadata { + name = "portal-stt" + namespace = kubernetes_namespace.portal_stt.metadata[0].name + labels = local.labels + annotations = { + # Speaches exposes Prometheus metrics at /metrics — wire annotation-based + # scrape (Ready-endpoint relabeling already filters non-Ready pods). + "prometheus.io/scrape" = "true" + "prometheus.io/path" = "/metrics" + "prometheus.io/port" = "8000" + } + } + spec { + type = "ClusterIP" + selector = { app = "portal-stt" } + port { + name = "http" + port = 8000 + target_port = 8000 + } + } +} diff --git a/stacks/portal-stt/terragrunt.hcl b/stacks/portal-stt/terragrunt.hcl new file mode 100644 index 00000000..b35880be --- /dev/null +++ b/stacks/portal-stt/terragrunt.hcl @@ -0,0 +1,33 @@ +include "root" { + path = find_in_parent_folders() +} + +dependency "platform" { + config_path = "../platform" + skip_outputs = true +} + +# portal-stt: in-cluster speech-to-text for the portal-assistant Gateway +# (portal-assistant issue #2, ADR-0003). One Deployment of Speaches +# (ghcr.io/speaches-ai/speaches, OpenAI-compatible faster-whisper) serving +# `large-v3-turbo` int8, multilingual (Bulgarian + English), behind a single +# ClusterIP Service `portal-stt.portal-stt.svc:8000`. Transcription path: +# /v1/audio/transcriptions. Requests ONE time-slice of the shared T4 +# (nvidia.com/gpu=1) — a slice, not the card. +# +# WARM-RESIDENT (NOT the tts/chatterbox demand-gate): replicas=1, never scaled +# to zero. The model is preloaded at startup (PRELOAD_MODELS) and never unloaded +# (STT_MODEL_TTL=-1) so interactive voice Turns never pay a cold model load. +# Chatterbox can scale 0<->1 because it is best-effort batch narration; STT is +# latency-critical and must stay warm. See portal-assistant CONTEXT.md +# "Warm window". 
+# +# VRAM safety on the shared T4 (16 GiB, no per-tenant isolation): int8 weights +# budget ~1.5 GiB; worst-case alongside immich-ml (~2.1) + frigate (~1.9) + +# llama-swap qwen3-8b (~4.35) leaves ~6 GiB headroom. This pod is NOT excluded +# from the kyverno gpu-priority policy, so it correctly gets the immich-equal +# `gpu-workload` priority (first-class resident, never evicted first) — the +# inverse of tts. Full VRAM math + the OOM post-mortem reference are in main.tf. +# +# HITL: agent drafts; operator presence-claims the T4 and applies via GitOps, +# then verifies the rollout + a bg/en transcription smoke test. diff --git a/stacks/portal-tts/main.tf b/stacks/portal-tts/main.tf new file mode 100644 index 00000000..18ef8fb3 --- /dev/null +++ b/stacks/portal-tts/main.tf @@ -0,0 +1,203 @@ +# ============================================================================= +# portal-tts — edge-tts (CPU, always-on) for the portal-assistant Gateway +# ============================================================================= +# +# WHAT: a single ALWAYS-ON openai-edge-tts deployment (travisvn/openai-edge-tts), +# an OpenAI-compatible /v1/audio/speech proxy over Microsoft edge-tts neural +# voices, serving Bulgarian (bg-BG-KalinaNeural) AND English (en-US-AvaNeural), +# the voice chosen PER REQUEST by the Gateway, behind a ClusterIP Service +# `portal-tts.portal-tts.svc:8000`. CPU-only — no GPU, no NFS model store. +# +# WHY edge-tts (REPLACED Piper / openedai-speech on 2026-06-17): the local Piper +# Bulgarian voice (bg_BG-dimitar-medium, espeak-ng phonemes) was garbled and +# unintelligible — espeak mangles Bulgarian consonants (a synth->Whisper +# round-trip turned "Добър ден" into "Обърден"; a user heard pure gibberish). +# ADR-0003 always named Microsoft edge-tts as the online Bulgarian-quality +# fallback; the operator chose it for BOTH languages (validated 2026-06-17: edge +# bg round-trips through Whisper verbatim — "Добър ден! Как сте днес? ..."). 
The +# assistant already depends on the internet for the Claude brain, so an online +# TTS adds no new failure mode. English moved to edge too (one engine, higher +# quality) — the previous local Piper English worked but is no longer needed. +# +# NO GPU, NO NFS, NO SECRETS: edge-tts fetches voices from Microsoft on demand +# (nothing to persist), so the NFS model PVC + download init-container + voice +# ConfigMap of the old Piper design are all gone. The container needs EGRESS to +# speech.platform.bing.com (verified reachable from this namespace). The Service +# is ClusterIP-only and the Gateway is the sole externally-exposed component +# (ADR-0001) holding the edge auth, so REQUIRE_API_KEY=False here (the Gateway's +# TTSClient sends no Authorization to TTS). +# +# API SHAPE (unchanged Gateway contract): OpenAI /v1/audio/speech +# POST /v1/audio/speech +# { "model":"tts-1", "input":"", "voice":"", +# "response_format":"wav" } -> 200, body = raw PCM16 wav bytes +# The Gateway maps detected lang bg/en -> TTS_VOICE_BG / TTS_VOICE_EN (the edge +# voice names, set on the gateway Deployment), and openai-edge-tts accepts edge +# voice names directly. The `-ffmpeg` image variant is REQUIRED for wav output +# (the base image only emits mp3; ffmpeg transcodes to PCM16 wav). +# ============================================================================= + +variable "edge_tts_image" { + type = string + # openai-edge-tts, the OpenAI-compatible edge-tts proxy. The `-ffmpeg` variant + # bundles ffmpeg so response_format=wav (PCM16) works. Floating tag (no semver + # discipline upstream) — the namespace is Keel-enrolled so digest bumps roll in + # automatically; TF owns only the tag string. + default = "travisvn/openai-edge-tts:latest-ffmpeg" + description = "openai-edge-tts image (ffmpeg variant — needed for wav output)." 
+} + +variable "bg_voice" { + type = string + default = "bg-BG-KalinaNeural" + description = "Microsoft edge-tts neural Bulgarian voice (the Gateway's TTS_VOICE_BG must match)." +} + +variable "en_voice" { + type = string + default = "en-US-AvaNeural" + description = "Microsoft edge-tts neural English voice (the Gateway's TTS_VOICE_EN must match)." +} + +locals { + namespace = "portal-tts" + labels = { app = "portal-tts" } +} + +resource "kubernetes_namespace" "portal_tts" { + metadata { + name = local.namespace + labels = { + tier = local.tiers.aux # CPU-only best-effort helper, not a GPU tenant + "istio-injection" = "disabled" + "keel.sh/enrolled" = "true" + } + } + lifecycle { + # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace + ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] + } +} + +# Always-on openai-edge-tts. replicas=1, never scaled to zero (no GPU to free, +# negligible idle cost — it's a thin proxy to Microsoft edge-tts). CPU-only: NO +# node_selector / toleration / nvidia.com/gpu. No init container and no volumes: +# voices are fetched from Microsoft per request, so the pod is stateless. +resource "kubernetes_deployment" "portal_tts" { + metadata { + name = "portal-tts" + namespace = kubernetes_namespace.portal_tts.metadata[0].name + labels = merge(local.labels, { tier = local.tiers.aux }) + } + spec { + replicas = 1 + strategy { type = "Recreate" } + selector { + match_labels = { app = "portal-tts" } + } + template { + metadata { + labels = { app = "portal-tts" } + } + spec { + container { + name = "portal-tts" + image = var.edge_tts_image + + # openai-edge-tts listens on :5050 by default; the Service maps 8000 -> + # 5050 so the Gateway's TTS_URL (:8000) is unchanged. + port { + container_port = 5050 + name = "http" + } + # No API key: ClusterIP-only, the Gateway holds edge auth and sends no + # Authorization header to TTS. 
DEFAULT_VOICE is a fallback only — every + # request carries an explicit voice + response_format. + env { + name = "REQUIRE_API_KEY" + value = "False" + } + env { + name = "DEFAULT_VOICE" + value = var.en_voice + } + + # TCP probes — uvicorn binds :5050 only once the app is ready. No model + # download, so startup is fast; egress to Microsoft happens per request. + startup_probe { + tcp_socket { port = 5050 } + period_seconds = 5 + failure_threshold = 24 # ~2 min + } + readiness_probe { + tcp_socket { port = 5050 } + period_seconds = 15 + failure_threshold = 4 + } + liveness_probe { + tcp_socket { port = 5050 } + initial_delay_seconds = 20 + period_seconds = 30 + failure_threshold = 5 + } + + resources { + # Thin HTTP proxy to Microsoft edge-tts + ffmpeg transcode. Light on + # CPU (no CPU limit — cluster CFS-throttling policy). VERIFY with krr + # after real traffic and tighten. + requests = { + cpu = "50m" + memory = "256Mi" + } + limits = { + memory = "512Mi" + } + } + } + } + } + } + lifecycle { + ignore_changes = [ + spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1 + # Keel is enrolled (floating tag) — ignore its annotation churn but let the + # tag string keep applying from TF. + metadata[0].annotations["keel.sh/policy"], + metadata[0].annotations["keel.sh/trigger"], + metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 + metadata[0].annotations["keel.sh/match-tag"], + metadata[0].annotations["kubernetes.io/change-cause"], + metadata[0].annotations["deployment.kubernetes.io/revision"], + spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1 + ] + } +} + +# ClusterIP — in-cluster only (the Gateway calls this; audio stays on the LAN +# until the Gateway speaks it to the Portal). No ingress, no Authentik: the +# Gateway is the only externally exposed component (ADR-0001). 
OpenAI speech path: +# http://portal-tts.portal-tts.svc.cluster.local:8000/v1/audio/speech +resource "kubernetes_service" "portal_tts" { + metadata { + name = "portal-tts" + namespace = kubernetes_namespace.portal_tts.metadata[0].name + labels = local.labels + annotations = { + # openai-edge-tts exposes no Prometheus /metrics endpoint, so annotation- + # based scraping is disabled. Pointing the scraper at /v1/models (a JSON + # model list) cannot be parsed as metrics and would leave a permanently + # failing target (up == 0). Pod health is covered by the Deployment's + # TCP startup/readiness/liveness probes instead. + "prometheus.io/scrape" = "false" + } + } + spec { + type = "ClusterIP" + selector = { app = "portal-tts" } + port { + name = "http" + port = 8000 + target_port = 5050 + } + } +} diff --git a/stacks/portal-tts/terragrunt.hcl b/stacks/portal-tts/terragrunt.hcl new file mode 100644 index 00000000..8e25ac63 --- /dev/null +++ b/stacks/portal-tts/terragrunt.hcl @@ -0,0 +1,33 @@ +include "root" { + path = find_in_parent_folders() +} + +dependency "platform" { + config_path = "../platform" + skip_outputs = true +} + +# portal-tts: in-cluster text-to-speech for the portal-assistant Gateway +# (portal-assistant issue #3, ADR-0003). One ALWAYS-ON Deployment of +# openai-edge-tts (travisvn/openai-edge-tts, OpenAI-compatible /v1/audio/speech) +# serving Bulgarian `bg-BG-KalinaNeural` + English `en-US-AvaNeural`, voice +# chosen PER REQUEST, behind a single ClusterIP Service +# `portal-tts.portal-tts.svc:8000`. Speech path: /v1/audio/speech. +# +# CPU-ONLY: a thin proxy over Microsoft edge-tts — NO GPU node selector / +# toleration / nvidia.com/gpu request. This deliberately keeps TTS off the +# OOM-prone shared T4 (the two GPU siblings tts/chatterbox + portal-stt already +# contend for it); Bulgarian isn't available on chatterbox anyway (ADR-0003). +# replicas=1, never scaled to zero — no off-peak gate needed with no GPU to free. 
+# +# STATELESS: voices are fetched from Microsoft per request, so there is no NFS +# model PVC, no download init container and no voice ConfigMap. The pod needs +# EGRESS to speech.platform.bing.com; see main.tf for the full rationale. +# +# PLUGGABLE: ADR-0003 keeps TTS a swappable backend; edge-tts replaced the local +# Piper voices on 2026-06-17 because Piper's Bulgarian was unintelligible. +# +# No nfs_server input is needed any more (this stack mounts no NFS volume). +# +# HITL: agent drafts; operator applies via GitOps, then verifies the rollout + +# a bg/en /v1/audio/speech smoke test (curl returns audio bytes).