diff --git a/stacks/portal-assistant/main.tf b/stacks/portal-assistant/main.tf new file mode 100644 index 00000000..10020d47 --- /dev/null +++ b/stacks/portal-assistant/main.tf @@ -0,0 +1,239 @@ +# ============================================================================= +# portal-assistant gateway — voice orchestrator (STT -> Brain -> TTS) +# ============================================================================= +# The single service the Client app talks to: POST /v1/talk takes a WAV + a +# client id, runs Speaches STT -> the claude-agent-service conversational Brain +# -> edge-tts TTS (portal-tts), and returns the spoken reply. Exposed through the +# public Cloudflare ingress at the bottom of this file; /v1/talk is guarded by +# the DEVICE_TOKEN bearer. In-memory sessions (no SESSION_DB_DSN). See +# portal-assistant ADR-0001/0002/0003. +# ============================================================================= + +data "vault_kv_secret_v2" "viktor" { + mount = "secret" + name = "viktor" +} + +data "vault_kv_secret_v2" "cas" { + mount = "secret" + name = "claude-agent-service" +} + +data "vault_kv_secret_v2" "pa" { + mount = "secret" + name = "portal-assistant" +} + +locals { + namespace = "portal-assistant" + labels = { app = "portal-assistant-gateway" } + image = "ghcr.io/viktorbarzin/portal-assistant-gateway:latest" +} + +resource "kubernetes_namespace" "portal_assistant" { + metadata { + name = local.namespace + labels = { + tier = local.tiers.edge + "istio-injection" = "disabled" + "keel.sh/enrolled" = "true" + } + } + lifecycle { + ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] + } +} + +# Pull secret — the gateway image is a PRIVATE ghcr package. Uses the read-only +# ghcr_pull_token (secret/viktor), the same cred the cluster-wide allowlist uses. 
+resource "kubernetes_secret" "ghcr" { + metadata { + name = "ghcr-pull" + namespace = kubernetes_namespace.portal_assistant.metadata[0].name + } + type = "kubernetes.io/dockerconfigjson" + data = { + ".dockerconfigjson" = jsonencode({ + auths = { + "ghcr.io" = { + username = "viktorbarzin" + password = data.vault_kv_secret_v2.viktor.data["ghcr_pull_token"] + auth = base64encode("viktorbarzin:${data.vault_kv_secret_v2.viktor.data["ghcr_pull_token"]}") + } + } + }) + } +} + +# Tokens the gateway needs: BRAIN_TOKEN = claude-agent-service's bearer (to call +# the conversational endpoint); DEVICE_TOKEN = the per-Client secret the Portal +# app carries to authenticate to /v1/talk. +resource "kubernetes_secret" "gateway" { + metadata { + name = "portal-assistant-gateway-secrets" + namespace = kubernetes_namespace.portal_assistant.metadata[0].name + } + data = { + BRAIN_TOKEN = data.vault_kv_secret_v2.cas.data["api_bearer_token"] + DEVICE_TOKEN = data.vault_kv_secret_v2.pa.data["device_token"] + } +} + +resource "kubernetes_deployment" "gateway" { + metadata { + name = "portal-assistant-gateway" + namespace = kubernetes_namespace.portal_assistant.metadata[0].name + labels = merge(local.labels, { tier = local.tiers.edge }) + } + spec { + replicas = 1 + selector { + match_labels = { app = "portal-assistant-gateway" } + } + template { + metadata { + labels = { app = "portal-assistant-gateway" } + } + spec { + image_pull_secrets { + name = kubernetes_secret.ghcr.metadata[0].name + } + container { + name = "gateway" + image = local.image + image_pull_policy = "Always" + port { + container_port = 8000 + name = "http" + } + # STT -> Speaches; TTS -> edge-tts (portal-tts); Brain -> claude-agent-service. 
+ env { + name = "STT_URL" + value = "http://portal-stt.portal-stt.svc.cluster.local:8000" + } + env { + name = "STT_MODEL" + value = "deepdml/faster-whisper-large-v3-turbo-ct2" + } + env { + name = "TTS_URL" + value = "http://portal-tts.portal-tts.svc.cluster.local:8000" + } + # portal-tts now serves Microsoft edge-tts neural voices (Piper's + # Bulgarian was garbled; 2026-06-17). The gateway maps detected lang + # bg/en -> these edge voice names, which openai-edge-tts accepts directly. + env { + name = "TTS_VOICE_BG" + value = "bg-BG-KalinaNeural" + } + env { + name = "TTS_VOICE_EN" + value = "en-US-AvaNeural" + } + env { + name = "BRAIN_URL" + value = "http://claude-agent-service.claude-agent.svc.cluster.local:8080" + } + env { + name = "BRAIN_TOKEN" + value_from { + secret_key_ref { + name = kubernetes_secret.gateway.metadata[0].name + key = "BRAIN_TOKEN" + } + } + } + env { + name = "DEVICE_TOKEN" + value_from { + secret_key_ref { + name = kubernetes_secret.gateway.metadata[0].name + key = "DEVICE_TOKEN" + } + } + } + readiness_probe { + http_get { + path = "/health" + port = 8000 + } + period_seconds = 10 + } + liveness_probe { + http_get { + path = "/health" + port = 8000 + } + initial_delay_seconds = 15 + period_seconds = 30 + } + resources { + requests = { + cpu = "50m" + memory = "256Mi" + } + limits = { + memory = "512Mi" + } + } + } + } + } + } + lifecycle { + ignore_changes = [ + spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1 + # Namespace is Keel-enrolled and the image floats on :latest — ignore Keel's + # annotation churn like portal-stt / portal-tts do; the tag string stays TF-owned. 
+ metadata[0].annotations["keel.sh/policy"], + metadata[0].annotations["keel.sh/trigger"], + metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 + metadata[0].annotations["keel.sh/match-tag"], + metadata[0].annotations["kubernetes.io/change-cause"], + metadata[0].annotations["deployment.kubernetes.io/revision"], + spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1 + ] + } +} + +# ClusterIP fronted by the public Cloudflare ingress below (module.ingress); the +# gateway is the only externally-exposed component (ADR-0001). /metrics scraped by +# Prometheus — NOTE(review): confirm the ingress does not expose /metrics publicly. 
+resource "kubernetes_service" "gateway" { + metadata { + name = "portal-assistant-gateway" + namespace = kubernetes_namespace.portal_assistant.metadata[0].name + labels = local.labels + annotations = { + "prometheus.io/scrape" = "true" + "prometheus.io/path" = "/metrics" + "prometheus.io/port" = "8000" + } + } + spec { + type = "ClusterIP" + selector = { app = "portal-assistant-gateway" } + port { + name = "http" + port = 8000 + target_port = 8000 + } + } +} + +# Public Cloudflare ingress — the Portal app reaches the gateway at +# https://portal-assistant.viktorbarzin.me/v1/talk. tls-secret is Kyverno-synced +# into the namespace. The gateway holds its own edge auth (the DEVICE_TOKEN +# bearer), so no Authentik in front. +module "ingress" { + source = "../../modules/kubernetes/ingress_factory" + name = "portal-assistant" + namespace = kubernetes_namespace.portal_assistant.metadata[0].name + service_name = kubernetes_service.gateway.metadata[0].name + port = 8000 + tls_secret_name = "tls-secret" + # auth = "app": the gateway enforces its own DEVICE_TOKEN bearer on /v1/talk; Authentik would break the native Portal client (it has no browser login). + auth = "app" + dns_type = "proxied" + max_body_size = "25m" # audio (WAV) uploads +} diff --git a/stacks/portal-assistant/terragrunt.hcl b/stacks/portal-assistant/terragrunt.hcl new file mode 100644 index 00000000..222cd339 --- /dev/null +++ b/stacks/portal-assistant/terragrunt.hcl @@ -0,0 +1,13 @@ +include "root" { + path = find_in_parent_folders() +} + +dependency "platform" { + config_path = "../platform" + skip_outputs = true +} + +# portal-assistant gateway — the voice-assistant orchestrator (STT -> Brain -> +# TTS). ClusterIP Service fronted by the public Cloudflare ingress declared in +# main.tf (module.ingress) for the Portal app. In-memory sessions for now (no +# SESSION_DB_DSN); CNPG Postgres is a later add. portal-assistant issue #10. 
diff --git a/stacks/portal-stt/main.tf b/stacks/portal-stt/main.tf new file mode 100644 index 00000000..7c0e00bb --- /dev/null +++ b/stacks/portal-stt/main.tf @@ -0,0 +1,360 @@ +# ============================================================================= +# portal-stt — Speaches STT (Whisper large-v3-turbo int8) for portal-assistant +# ============================================================================= +# +# DRAFT for operator review (portal-assistant issue #2). HITL apply: an agent +# drafts; the operator applies via GitOps (presence-claimed) and verifies the +# rollout. Do NOT `terragrunt apply` this from a worktree. +# +# WHAT: a single WARM-RESIDENT Speaches deployment (OpenAI-compatible +# faster-whisper server) serving `large-v3-turbo` int8, multilingual (Bulgarian +# + English), on the shared Tesla T4 (one time-slice). ClusterIP only — audio +# never leaves the LAN; the portal-assistant Gateway is the only externally +# exposed component (ADR-0001), so no ingress/auth here. +# +# WHY WARM-RESIDENT, NOT THE CHATTERBOX DEMAND-GATE: +# The TTS (chatterbox) stack scales 0<->1 behind a free-VRAM CronJob gate +# because it is a best-effort BATCH tenant (tripit narration) that can wait. +# STT here is INTERACTIVE voice — every Turn would pay a multi-second cold +# model load (download/mmap + CUDA init) if we scaled to zero. So this stack +# keeps the model permanently loaded: replicas=1 + Speaches STT_MODEL_TTL=-1 +# (never unload) + PRELOAD_MODELS (load at startup). See portal-assistant +# CONTEXT.md "Warm window" + ADR-0003. +# +# OOM HISTORY / VRAM MATH — the binding constraint is the shared T4 (16 GiB, +# time-sliced across immich-ml / frigate / llama-swap / android-emulator with +# NO per-tenant VRAM isolation). See +# docs/post-mortems/2026-06-02-immich-ml-ttl-gpu-oom-recruiter.md (immich-ml's +# unbounded onnxruntime arena starved llama-swap's qwen3-8b -> recruiter down). 
+# +# Live residents measured 2026-06-17 (gpu_pod_memory_used_bytes): +# immich-ml ~2.1 GiB (capped: MACHINE_LEARNING_MODEL_TTL=600) +# frigate (8 proc) ~1.9 GiB (detector + ffmpeg decode) +# android-emulator ~0.15 GiB +# llama-swap 0 idle, but loads qwen3-8b on demand = ~4.35 GiB peak +# (cudaMalloc 4455 MiB, per the post-mortem) +# Worst-case concurrent baseline (everything hot): 2.1 + 1.9 + 0.15 + 4.35 +# = ~8.5 GiB. +# Speaches large-v3-turbo int8 weights ~= 0.8 GiB on disk; resident CTranslate2 +# int8 + CUDA context + decode buffers budget conservatively to ~1.5 GiB +# (VERIFY at apply against gpu_pod_memory_used_bytes{namespace="portal-stt"}). +# +# 8.5 (residents) + 1.5 (this) = ~10.0 GiB used => ~6 GiB T4 headroom. +# That headroom is the safety margin against onnxruntime arena drift (the +# exact failure mode from 2026-06-02). If a future resident grows, this is the +# FIRST place to re-measure. The conservative int8 (not fp16) choice halves +# our weight footprint precisely to protect this margin. +# +# GPU PRIORITY: this pod requests nvidia.com/gpu, so the Kyverno +# `inject-gpu-workload-priority` ClusterPolicy auto-stamps the immich-equal +# `gpu-workload` (1,200,000) priority — portal-stt is NOT in that policy's +# exclude list (only `tts` is, to keep chatterbox demotable). That is CORRECT +# here: warm interactive STT is a first-class GPU resident, never the first +# evicted. We also set priority_class_name explicitly so intent is legible at +# the call site and survives a policy fail-open. (Contrast tts/main.tf, which +# pins tier-2-gpu precisely so chatterbox IS evicted first.) +# ============================================================================= + +variable "tls_secret_name" { + type = string + sensitive = true +} + +variable "nfs_server" { + type = string + description = "NFS server (Proxmox host). From config.tfvars (192.168.1.127)." +} + +variable "speaches_image" { + type = string + # ghcr.io/speaches-ai/speaches CUDA build. 
The live registry currently + # publishes 0.9.0-rc.3-cuda (+ sha-/cuda-12.x variants) and a moving + # :latest-cuda; there is no published :0.8.3-cuda for the last stable. Pinned + # to the rc.3 CUDA tag (immutable-ish, beats :latest for the OOM/Keel-churn + # history). CUDA 12.4/12.6 image runtime is fine under our 570.195.03 driver + # (CUDA 12.8, backward-compatible). OPEN ITEM for operator: confirm this tag + # still resolves at apply, or bump to the newest -cuda tag. + default = "ghcr.io/speaches-ai/speaches:0.9.0-rc.3-cuda" + description = "Speaches CUDA image. Pin a -cuda tag, not :latest-cuda." +} + +variable "stt_model_id" { + type = string + # HF repo id of the CTranslate2 large-v3-turbo conversion. deepdml's is the + # canonical community ct2 build of openai large-v3-turbo (multilingual, + # incl. Bulgarian) and is what ADR-0003's FLEURS-bg bake-off measured at + # 8.3% WER. Speaches resolves whisper models by HF repo id. + default = "deepdml/faster-whisper-large-v3-turbo-ct2" + description = "HuggingFace repo id of the warm-resident whisper model." +} + +locals { + namespace = "portal-stt" + labels = { app = "portal-stt" } + + # Speaches is configured via env vars (pydantic-settings): scalars map from + # UPPER_SNAKE, nested whisper.* settings from WHISPER__FIELD. The four knobs + # that make this WARM-RESIDENT, int8 and CUDA-backed: + # PRELOAD_MODELS — JSON list, loaded sequentially at startup so the + # first Turn is never cold (pod won't go Ready until + # the model is in VRAM). + # STT_MODEL_TTL=-1 — never unload an idle STT model (0=immediate, + # default 300s). This is the warm-resident lever. + # WHISPER__COMPUTE_TYPE — int8 (conservative VRAM; default "default"=fp16). + # WHISPER__INFERENCE_DEVICE — cuda (default "auto"). 
+ # HF cache is redirected onto the NFS-SSD PVC so weights download once and + # persist across pod restarts (image default cache is /home/ubuntu/.cache/ + # huggingface/hub — ephemeral). Speaches runs as uid 1000 (ubuntu) — NOTE(review): confirm uid 1000 can write the dirs the models-dir-init Job creates. 
+} + +resource "kubernetes_namespace" "portal_stt" { + metadata { + name = local.namespace + labels = { + tier = local.tiers.gpu + "istio-injection" = "disabled" + "keel.sh/enrolled" = "true" + } + } + lifecycle { + # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace + ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] + } +} + +module "tls_secret" { + source = "../../modules/kubernetes/setup_tls_secret" + namespace = kubernetes_namespace.portal_stt.metadata[0].name + tls_secret_name = var.tls_secret_name +} + +# Model + HF cache on NFS-SSD (fast first-load, persists across restarts). Path +# /srv/nfs-ssd/portal-stt on the Proxmox host (192.168.1.127). Mirrors the +# chatterbox nfs_models pattern. RWX so a future seed/inspect pod can touch it. +module "nfs_models" { + source = "../../modules/kubernetes/nfs_volume" + name = "portal-stt-models" + namespace = kubernetes_namespace.portal_stt.metadata[0].name + nfs_server = var.nfs_server + nfs_path = "/srv/nfs-ssd/portal-stt" + storage = "10Gi" # large-v3-turbo ct2 (~0.8Gi) + HF cache headroom +} + +# One-shot bootstrap: /srv/nfs-ssd is exported whole-tree but the portal-stt +# SUBDIR must exist before kubelet can bind-mount it (chatterbox hit exit 32 on +# a missing subdir the first window — see stacks/tts/main.tf). Mount the export +# ROOT (which exists) and mkdir the subtree; kubelet's mount retry then heals +# the main pod. Idempotent; immutable-once-created. 
+resource "kubernetes_job" "models_dir_init" { + metadata { + name = "portal-stt-models-dir-init" + namespace = kubernetes_namespace.portal_stt.metadata[0].name + labels = local.labels + } + spec { + backoff_limit = 3 + ttl_seconds_after_finished = 86400 + template { + metadata { labels = local.labels } + spec { + restart_policy = "Never" + container { + name = "mkdir" + image = "busybox:1.37" + command = ["sh", "-c", "mkdir -p /mnt/portal-stt/hub && ls -la /mnt/portal-stt"] + volume_mount { + name = "nfs-ssd-root" + mount_path = "/mnt" + } + } + volume { + name = "nfs-ssd-root" + nfs { + server = var.nfs_server + path = "/srv/nfs-ssd" + } + } + } + } + } + wait_for_completion = true + timeouts { create = "3m" } +} + +# Warm-resident Speaches. replicas=1, NEVER scaled to zero (no off-peak gate, +# unlike tts) — the model stays in VRAM so interactive Turns never pay a cold +# load. wait_for_rollout left default (true): a plain apply SHOULD block until +# the model is loaded and the pod is Ready, surfacing a bad image/model early. +resource "kubernetes_deployment" "portal_stt" { + metadata { + name = "portal-stt" + namespace = kubernetes_namespace.portal_stt.metadata[0].name + labels = merge(local.labels, { tier = local.tiers.gpu }) + } + spec { + replicas = 1 + # RWO is not in play (model PVC is RWX NFS), but Recreate avoids two pods + # briefly double-loading the model into the shared T4 during a rollout. + strategy { type = "Recreate" } + selector { + match_labels = { app = "portal-stt" } + } + template { + metadata { + labels = { app = "portal-stt" } + } + spec { + node_selector = { "nvidia.com/gpu.present" = "true" } + toleration { + key = "nvidia.com/gpu" + operator = "Equal" + value = "true" + effect = "NoSchedule" + } + # First-class GPU resident (warm interactive STT) — same priority as + # immich-ml. Kyverno would stamp this anyway (portal-stt is not in the + # gpu-priority exclude list); set explicitly for legibility + fail-open + # safety. 
NOT tier-2-gpu (that is chatterbox's evict-first demotion). + priority_class_name = "gpu-workload" + + container { + name = "portal-stt" + image = var.speaches_image + + # --- warm-resident + int8 + cuda config (see locals) --- + env { + name = "PRELOAD_MODELS" + value = jsonencode([var.stt_model_id]) + } + env { + name = "STT_MODEL_TTL" + value = "-1" # never unload — the warm-resident lever + } + env { + name = "WHISPER__INFERENCE_DEVICE" + value = "cuda" + } + env { + name = "WHISPER__COMPUTE_TYPE" + value = "int8" # conservative VRAM (vs fp16 default) + } + env { + name = "LOG_LEVEL" + value = "info" # image default is debug + } + # Persist the HF model cache on the NFS-SSD PVC (image default cache + # dir is ephemeral). Speaches/HF honour HF_HUB_CACHE + HF_HOME. + env { + name = "HF_HUB_CACHE" + value = "/data/hub" + } + env { + name = "HF_HOME" + value = "/data" + } + + port { + container_port = 8000 + name = "http" + } + + volume_mount { + name = "models" + mount_path = "/data" + } + + # /health is Speaches' liveness/readiness path. Generous startup + # allowance: the first boot downloads large-v3-turbo to the PVC before + # the server reports healthy (PRELOAD blocks startup). After the model + # is cached on NFS-SSD, subsequent boots load in seconds. 
+ startup_probe { + http_get { + path = "/health" + port = 8000 + } + period_seconds = 10 + failure_threshold = 60 # up to ~10 min for the first model download + } + readiness_probe { + http_get { + path = "/health" + port = 8000 + } + period_seconds = 15 + failure_threshold = 4 + } + liveness_probe { + http_get { + path = "/health" + port = 8000 + } + initial_delay_seconds = 30 + period_seconds = 30 + failure_threshold = 5 + } + + resources { + requests = { + cpu = "200m" + memory = "2Gi" + } + limits = { + memory = "4Gi" + "nvidia.com/gpu" = "1" # ONE time-slice (operator advertises 100), NOT the whole card + } + } + } + + volume { + name = "models" + persistent_volume_claim { + claim_name = module.nfs_models.claim_name + } + } + } + } + } + lifecycle { + ignore_changes = [ + spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1 + # image is TF-OWNED (pinned -cuda tag) — Keel can manage the digest on + # this tag if desired, so ignore keel's annotation churn but NOT the image + # itself (we want tag pins to apply). Mirrors tts: keel annotations only. + metadata[0].annotations["keel.sh/policy"], + metadata[0].annotations["keel.sh/trigger"], + metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 + metadata[0].annotations["keel.sh/match-tag"], + metadata[0].annotations["kubernetes.io/change-cause"], + metadata[0].annotations["deployment.kubernetes.io/revision"], + spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1 + ] + } +} + +# ClusterIP — in-cluster only (the Gateway calls this; audio stays on the LAN). +# No ingress, no Authentik: the Gateway is the only externally exposed component +# (ADR-0001) and holds the edge auth. 
OpenAI transcription path is +# http://portal-stt.portal-stt.svc.cluster.local:8000/v1/audio/transcriptions +resource "kubernetes_service" "portal_stt" { + metadata { + name = "portal-stt" + namespace = kubernetes_namespace.portal_stt.metadata[0].name + labels = local.labels + annotations = { + # Speaches exposes Prometheus metrics at /metrics — wire annotation-based + # scrape (Ready-endpoint relabeling already filters non-Ready pods). + "prometheus.io/scrape" = "true" + "prometheus.io/path" = "/metrics" + "prometheus.io/port" = "8000" + } + } + spec { + type = "ClusterIP" + selector = { app = "portal-stt" } + port { + name = "http" + port = 8000 + target_port = 8000 + } + } +} diff --git a/stacks/portal-stt/terragrunt.hcl b/stacks/portal-stt/terragrunt.hcl new file mode 100644 index 00000000..b35880be --- /dev/null +++ b/stacks/portal-stt/terragrunt.hcl @@ -0,0 +1,33 @@ +include "root" { + path = find_in_parent_folders() +} + +dependency "platform" { + config_path = "../platform" + skip_outputs = true +} + +# portal-stt: in-cluster speech-to-text for the portal-assistant Gateway +# (portal-assistant issue #2, ADR-0003). One Deployment of Speaches +# (ghcr.io/speaches-ai/speaches, OpenAI-compatible faster-whisper) serving +# `large-v3-turbo` int8, multilingual (Bulgarian + English), behind a single +# ClusterIP Service `portal-stt.portal-stt.svc:8000`. Transcription path: +# /v1/audio/transcriptions. Requests ONE time-slice of the shared T4 +# (nvidia.com/gpu=1) — a slice, not the card. +# +# WARM-RESIDENT (NOT the tts/chatterbox demand-gate): replicas=1, never scaled +# to zero. The model is preloaded at startup (PRELOAD_MODELS) and never unloaded +# (STT_MODEL_TTL=-1) so interactive voice Turns never pay a cold model load. +# Chatterbox can scale 0<->1 because it is best-effort batch narration; STT is +# latency-critical and must stay warm. See portal-assistant CONTEXT.md +# "Warm window". 
+# +# VRAM safety on the shared T4 (16 GiB, no per-tenant isolation): int8 weights +# budget ~1.5 GiB; worst-case alongside immich-ml (~2.1) + frigate (~1.9) + +# llama-swap qwen3-8b (~4.35) leaves ~6 GiB headroom. This pod is NOT excluded +# from the kyverno gpu-priority policy, so it correctly gets the immich-equal +# `gpu-workload` priority (first-class resident, never evicted first) — the +# inverse of tts. Full VRAM math + the OOM post-mortem reference are in main.tf. +# +# HITL: agent drafts; operator presence-claims the T4 and applies via GitOps, +# then verifies the rollout + a bg/en transcription smoke test. diff --git a/stacks/portal-tts/main.tf b/stacks/portal-tts/main.tf new file mode 100644 index 00000000..18ef8fb3 --- /dev/null +++ b/stacks/portal-tts/main.tf @@ -0,0 +1,203 @@ +# ============================================================================= +# portal-tts — edge-tts (CPU, always-on) for the portal-assistant Gateway +# ============================================================================= +# +# WHAT: a single ALWAYS-ON openai-edge-tts deployment (travisvn/openai-edge-tts), +# an OpenAI-compatible /v1/audio/speech proxy over Microsoft edge-tts neural +# voices, serving Bulgarian (bg-BG-KalinaNeural) AND English (en-US-AvaNeural), +# the voice chosen PER REQUEST by the Gateway, behind a ClusterIP Service +# `portal-tts.portal-tts.svc:8000`. CPU-only — no GPU, no NFS model store. +# +# WHY edge-tts (REPLACED Piper / openedai-speech on 2026-06-17): the local Piper +# Bulgarian voice (bg_BG-dimitar-medium, espeak-ng phonemes) was garbled and +# unintelligible — espeak mangles Bulgarian consonants (a synth->Whisper +# round-trip turned "Добър ден" into "Обърден"; a user heard pure gibberish). +# ADR-0003 always named Microsoft edge-tts as the online Bulgarian-quality +# fallback; the operator chose it for BOTH languages (validated 2026-06-17: edge +# bg round-trips through Whisper verbatim — "Добър ден! Как сте днес? ..."). 
The +# assistant already depends on the internet for the Claude brain, so an online +# TTS adds no new failure mode. English moved to edge too (one engine, higher +# quality) — the previous local Piper English worked but is no longer needed. +# +# NO GPU, NO NFS, NO SECRETS: edge-tts fetches voices from Microsoft on demand +# (nothing to persist), so the NFS model PVC + download init-container + voice +# ConfigMap of the old Piper design are all gone. The container needs EGRESS to +# speech.platform.bing.com (verified reachable from this namespace). The Service +# is ClusterIP-only and the Gateway is the sole externally-exposed component +# (ADR-0001) holding the edge auth, so REQUIRE_API_KEY=False here (the Gateway's +# TTSClient sends no Authorization to TTS). +# +# API SHAPE (unchanged Gateway contract): OpenAI /v1/audio/speech +# POST /v1/audio/speech +# { "model":"tts-1", "input":"TEXT_TO_SPEAK", "voice":"EDGE_VOICE_NAME", +# "response_format":"wav" } -> 200, body = raw PCM16 wav bytes +# The Gateway maps detected lang bg/en -> TTS_VOICE_BG / TTS_VOICE_EN (the edge +# voice names, set on the gateway Deployment), and openai-edge-tts accepts edge +# voice names directly. The `-ffmpeg` image variant is REQUIRED for wav output +# (the base image only emits mp3; ffmpeg transcodes to PCM16 wav). +# ============================================================================= + +variable "edge_tts_image" { + type = string + # openai-edge-tts, the OpenAI-compatible edge-tts proxy. The `-ffmpeg` variant + # bundles ffmpeg so response_format=wav (PCM16) works. Floating tag (no semver + # discipline upstream) — the namespace is Keel-enrolled so digest bumps roll in + # automatically; TF owns only the tag string. + default = "travisvn/openai-edge-tts:latest-ffmpeg" + description = "openai-edge-tts image (ffmpeg variant — needed for wav output)." 
+} + +variable "bg_voice" { + type = string + default = "bg-BG-KalinaNeural" + description = "Microsoft edge-tts neural Bulgarian voice (the Gateway's TTS_VOICE_BG must match)." +} + +variable "en_voice" { + type = string + default = "en-US-AvaNeural" + description = "Microsoft edge-tts neural English voice (the Gateway's TTS_VOICE_EN must match)." +} + +locals { + namespace = "portal-tts" + labels = { app = "portal-tts" } +} + +resource "kubernetes_namespace" "portal_tts" { + metadata { + name = local.namespace + labels = { + tier = local.tiers.aux # CPU-only best-effort helper, not a GPU tenant + "istio-injection" = "disabled" + "keel.sh/enrolled" = "true" + } + } + lifecycle { + # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace + ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] + } +} + +# Always-on openai-edge-tts. replicas=1, never scaled to zero (no GPU to free, +# negligible idle cost — it's a thin proxy to Microsoft edge-tts). CPU-only: NO +# node_selector / toleration / nvidia.com/gpu. No init container and no volumes: +# voices are fetched from Microsoft per request, so the pod is stateless. +resource "kubernetes_deployment" "portal_tts" { + metadata { + name = "portal-tts" + namespace = kubernetes_namespace.portal_tts.metadata[0].name + labels = merge(local.labels, { tier = local.tiers.aux }) + } + spec { + replicas = 1 + strategy { type = "RollingUpdate" } # stateless, no GPU/volume to contend for: surge the new pod first so Keel digest bumps cause no TTS gap + selector { + match_labels = { app = "portal-tts" } + } + template { + metadata { + labels = { app = "portal-tts" } + } + spec { + container { + name = "portal-tts" + image = var.edge_tts_image + + # openai-edge-tts listens on :5050 by default; the Service maps 8000 -> + # 5050 so the Gateway's TTS_URL (:8000) is unchanged. 
+ port { + container_port = 5050 + name = "http" + } + # No API key: ClusterIP-only, the Gateway holds edge auth and sends no + # Authorization header to TTS. 
DEFAULT_VOICE is a fallback only — every + # request carries an explicit voice + response_format. + env { + name = "REQUIRE_API_KEY" + value = "False" + } + env { + name = "DEFAULT_VOICE" + value = var.en_voice + } + + # TCP probes — assumes the server binds :5050 only once the app is ready + # (TODO confirm). No model download; egress to Microsoft happens per request. + startup_probe { + tcp_socket { port = 5050 } + period_seconds = 5 + failure_threshold = 24 # ~2 min + } + readiness_probe { + tcp_socket { port = 5050 } + period_seconds = 15 + failure_threshold = 4 + } + liveness_probe { + tcp_socket { port = 5050 } + initial_delay_seconds = 20 + period_seconds = 30 + failure_threshold = 5 + } + + resources { + # Thin HTTP proxy to Microsoft edge-tts + ffmpeg transcode. Light on + # CPU (no CPU limit — cluster CFS-throttling policy). VERIFY with krr + # after real traffic and tighten. + requests = { + cpu = "50m" + memory = "256Mi" + } + limits = { + memory = "512Mi" + } + } + } + } + } + } + lifecycle { + ignore_changes = [ + spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1 + # Keel is enrolled (floating tag) — ignore its annotation churn but let the + # tag string keep applying from TF. + metadata[0].annotations["keel.sh/policy"], + metadata[0].annotations["keel.sh/trigger"], + metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 + metadata[0].annotations["keel.sh/match-tag"], + metadata[0].annotations["kubernetes.io/change-cause"], + metadata[0].annotations["deployment.kubernetes.io/revision"], + spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1 + ] + } +} + +# ClusterIP — in-cluster only (only the Gateway calls this). Unlike STT, the text +# to speak DOES leave the LAN: the pod forwards it to Microsoft edge-tts. No ingress, no Authentik: the +# Gateway is the only externally exposed component (ADR-0001). 
OpenAI speech path: +# http://portal-tts.portal-tts.svc.cluster.local:8000/v1/audio/speech +resource "kubernetes_service" "portal_tts" { + metadata { + name = "portal-tts" + namespace = kubernetes_namespace.portal_tts.metadata[0].name + labels = local.labels + annotations = { + # openai-edge-tts has no /metrics, and /v1/models returns OpenAI-style JSON + # that Prometheus cannot parse (failed scrape, up == 0), so scraping is OFF. + # Point path at a real exposition-format endpoint before re-enabling it. + "prometheus.io/scrape" = "false" + "prometheus.io/path" = "/v1/models" + "prometheus.io/port" = "8000" + } + } + spec { + type = "ClusterIP" + selector = { app = "portal-tts" } + port { + name = "http" + port = 8000 + target_port = 5050 + } + } +} diff --git a/stacks/portal-tts/terragrunt.hcl b/stacks/portal-tts/terragrunt.hcl new file mode 100644 index 00000000..8e25ac63 --- /dev/null +++ b/stacks/portal-tts/terragrunt.hcl @@ -0,0 +1,33 @@ +include "root" { + path = find_in_parent_folders() +} + +dependency "platform" { + config_path = "../platform" + skip_outputs = true +} + +# portal-tts: in-cluster text-to-speech for the portal-assistant Gateway +# (portal-assistant issue #3, ADR-0003). One ALWAYS-ON Deployment of +# openai-edge-tts (travisvn/openai-edge-tts, OpenAI-compatible /v1/audio/speech) +# serving Bulgarian `bg-BG-KalinaNeural` + English `en-US-AvaNeural`, voice +# chosen PER REQUEST, behind a single ClusterIP Service +# `portal-tts.portal-tts.svc:8000`. Speech path: /v1/audio/speech. +# +# CPU-ONLY: a thin proxy to Microsoft edge-tts — NO GPU node selector / toleration / +# nvidia.com/gpu request. This deliberately keeps TTS off the OOM-prone shared +# T4 (the two GPU siblings tts/chatterbox + portal-stt already contend for it); +# Bulgarian isn't available on chatterbox anyway (ADR-0003). replicas=1, never +# scaled to zero — no off-peak gate needed when there's no GPU to free. 
+# +# STATELESS: edge-tts fetches voices from Microsoft per request, so there is no +# NFS model PVC, no download init container and no voice ConfigMap (those were +# part of the Piper design replaced on 2026-06-17). Needs EGRESS to Microsoft. +# +# PLUGGABLE: ADR-0003 keeps TTS a swappable backend; edge-tts, formerly the online +# Bulgarian fallback, now serves both languages (rationale in main.tf's header). +# +# No stack-specific inputs: every variable declared in main.tf has a default. +# +# HITL: agent drafts; operator applies via GitOps, then verifies the rollout + +# a bg/en /v1/audio/speech smoke test (curl returns audio bytes).