# =============================================================================
# portal-tts — Piper TTS (CPU, always-on) for the portal-assistant Gateway
# =============================================================================
#
# DRAFT for operator review (portal-assistant issue #3). HITL apply: an agent
# drafts; the operator applies via GitOps and verifies the rollout. Do NOT
# `terragrunt apply` this from a worktree.
#
# WHAT: a single ALWAYS-ON Piper deployment serving Bulgarian
# (`bg_BG-dimitar-medium`) AND English (`en_US-lessac-medium`), with the voice
# chosen PER REQUEST, behind a ClusterIP Service `portal-tts.portal-tts.svc:8000`.
# CPU-ONLY — no GPU node selector / toleration / nvidia.com/gpu request (Piper
# is a fast CPU neural TTS; ADR-0003). Audio never leaves the LAN; the
# portal-assistant Gateway is the only externally exposed component (ADR-0001),
# so there is no ingress / Authentik here.
#
# WHY CPU + ALWAYS-ON (contrast the two GPU siblings):
#   * tts/ (chatterbox) scales 0<->1 behind a free-VRAM CronJob gate — it is a
#     best-effort BATCH tenant on the shared T4.
#   * portal-stt/ (Speaches) is warm-resident on ONE T4 slice — interactive STT
#     that must not pay a cold model load.
# Piper needs neither: it runs in real time on CPU (no GPU contention at all),
# so the simplest correct design is replicas=1, always up. Keeping it off the
# T4 also REMOVES one tenant from the OOM-prone shared card
# (docs/post-mortems/2026-06-02-immich-ml-ttl-gpu-oom-recruiter.md) — Bulgarian
# isn't on chatterbox anyway (its 23 langs exclude bg; ADR-0003).
#
# API SHAPE (the Gateway team's contract): openedai-speech is OpenAI-compatible.
#   POST /v1/audio/speech
#   Content-Type: application/json
#   { "model": "tts-1", "input": "<text>", "voice": "<bg|en>",
#     "response_format": "wav", "speed": 1.0 }
#   -> 200, body = raw audio bytes (wav/mp3/opus/flac/aac/pcm per response_format)
# This matches the Gateway's tts.synthesize(text, lang) -> bytes interface
# (portal-assistant gateway/app/pipeline.py): map lang "bg"->voice "bg",
# anything else -> "en". Same OpenAI shape chatterbox already uses, so the
# Gateway can treat Piper and the edge-tts fallback identically.
#
# PLUGGABLE FALLBACK (noted, NOT built here — a Gateway-side concern): ADR-0003
# keeps TTS a swappable backend with Microsoft edge-tts as an online
# Bulgarian-quality fallback. The Gateway selects Piper (this Service, on-LAN
# default) vs edge-tts (cloud) per its own config; nothing in THIS stack needs
# to change to add edge-tts. If a second in-cluster engine is ever wanted,
# add a sibling Deployment+Service and let the Gateway choose.
#
# IMAGE CHOICE (OPEN ITEM — operator please confirm before apply):
# Primary: ghcr.io/matatonic/openedai-speech-min — the CPU-only, piper-only
# (~1 GiB) variant of openedai-speech. OpenAI /v1/audio/speech, multi-voice via
# a voice_to_speaker.yaml map, returns raw audio bytes. Pre-built for
# linux/amd64+arm64, pullable from ghcr (tags: latest, 0.18.2, 0.18.1, ...).
# CAVEAT: the upstream repo was ARCHIVED 2026-01-04 (read-only, no further
# updates / security patches). It is feature-complete and stable for this use,
# but pinned (not Keel-tracked) precisely because it is frozen upstream.
# Alternative if a maintained image is preferred: arkdevuk/Webpiper (FastAPI,
# actively developed) — but it returns JSON {url} requiring a SECOND fetch to
# retrieve the wav, a worse fit for the Gateway's bytes contract; it would
# need a small Gateway adapter. The rhasspy/wyoming-piper HTTP server is NOT
# suitable: it loads ONE voice per process (no per-request voice switch).
# =============================================================================

variable "nfs_server" {
  type        = string
  description = "NFS server (Proxmox host). From config.tfvars (192.168.1.127)."
}

variable "piper_image" {
  type = string
  # CPU-only piper-only openedai-speech. Pinned to 0.18.2 (the newest published
  # tag; repo archived 2026-01-04 so this is effectively the final release) for
  # reproducibility — NOT :latest, which would also drift to the same frozen
  # build but loses the explicit version record. linux/amd64 confirmed.
  default     = "ghcr.io/matatonic/openedai-speech-min:0.18.2"
  description = "openedai-speech CPU/piper-only image. See IMAGE CHOICE note in main.tf."
}

variable "bg_voice" {
  type = string
  # The single Bulgarian Piper voice (rhasspy/piper-voices). ADR-0003 names this
  # exact model; it was reviewed against edge-tts in the M0.2 bake-off.
  default     = "bg_BG-dimitar-medium"
  description = "Bulgarian Piper voice model stem (rhasspy/piper-voices)."
}

variable "en_voice" {
  type = string
  # English Piper voice. lessac-medium is the canonical balanced en_US voice
  # (the upstream openedai-speech default-quality pick). Swappable to any
  # rhasspy/piper-voices en stem (e.g. en_US-amy-medium, en_GB-alba-medium).
  default     = "en_US-lessac-medium"
  description = "English Piper voice model stem (rhasspy/piper-voices)."
}

locals {
  namespace = "portal-tts"
  labels = {
    app = "portal-tts"
  }

  # rhasspy/piper-voices HuggingFace layout:
  #   resolve/main/<lang>/<locale>/<name>/<quality>/<locale>-<name>-<quality>.onnx
  # Each voice needs BOTH the .onnx and the matching .onnx.json (Piper reads the
  # sample rate / phoneme map from the .json next to the model). All 4 URLs for
  # the default stems were HEAD-verified 200 on 2026-06-17. Derived from the
  # voice stems so changing a voice variable updates both the download URL and
  # the config map together.
  hf_base = "https://huggingface.co/rhasspy/piper-voices/resolve/main"

  # voice stem => HF directory path "<lang>/<locale>/<name>/<quality>".
  # The directory is COMPUTED from the stem "<locale>-<name>-<quality>" rather
  # than hard-coded, so swapping var.en_voice / var.bg_voice cannot leave the
  # download URL pointing at the previous voice's directory:
  #   <lang>                      = text before the first "_" of the stem
  #   <locale>/<name>/<quality>   = the stem with every "-" turned into "/"
  # e.g. "bg_BG-dimitar-medium" => "bg/bg_BG/dimitar/medium".
  # distinct() keeps the map well-formed if both variables name the same voice.
  # Assumes stems follow the rhasspy naming scheme (no "-" inside <name>).
  voice_models = {
    for stem in distinct([var.bg_voice, var.en_voice]) :
    stem => format("%s/%s", split("_", stem)[0], replace(stem, "-", "/"))
  }

  # voice_to_speaker.yaml: the openedai-speech config that maps a REQUEST voice
  # name ("bg" / "en") to a Piper .onnx on the PVC. The Gateway sends
  # voice="bg" or "en"; the server resolves it here. (We expose short logical
  # names, not the long model stems, so the Gateway's lang->voice map is trivial
  # and stable even if the underlying model stem changes.) The default model
  # name is "tts-1" (openedai-speech's piper model id).
  voice_to_speaker = yamlencode({
    "tts-1" = {
      bg = {
        model   = "voices/${var.bg_voice}.onnx"
        speaker = null # single-speaker model -> default
      }
      en = {
        model   = "voices/${var.en_voice}.onnx"
        speaker = null
      }
    }
  })

  # Init-container provisioning script. Downloads each voice's .onnx + .onnx.json
  # into the PVC's voices/ dir IF MISSING (idempotent — re-runs skip
  # already-present files), then copies the config map's voice_to_speaker.yaml
  # into the PVC's config/ dir. Pure POSIX + wget (busybox has wget).
  # Downloads land in "<dest>.tmp" and are renamed only after wget succeeds, so
  # an interrupted transfer is never mistaken for a complete model file.
  download_script = <<-EOT
    set -eu
    mkdir -p /data/voices /data/config
    fetch() { # $1 = url, $2 = dest
      if [ -s "$2" ]; then echo "have $2"; return 0; fi
      echo "get $1 -> $2"
      wget -q -O "$2.tmp" "$1"
      mv "$2.tmp" "$2"
    }
    %{for stem, dir in local.voice_models~}
    fetch "${local.hf_base}/${dir}/${stem}.onnx" "/data/voices/${stem}.onnx"
    fetch "${local.hf_base}/${dir}/${stem}.onnx.json" "/data/voices/${stem}.onnx.json"
    %{endfor~}
    cp /config-src/voice_to_speaker.yaml /data/config/voice_to_speaker.yaml
    echo "voices:"; ls -la /data/voices
    echo "config:"; cat /data/config/voice_to_speaker.yaml
  EOT
}

resource "kubernetes_namespace" "portal_tts" {
  metadata {
    name = local.namespace
    labels = {
      tier               = local.tiers.aux # CPU-only best-effort helper, not a GPU tenant
      "istio-injection"  = "disabled"
      "keel.sh/enrolled" = "true"
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace
    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
  }
}

# Voice models + config on NFS-SSD: fast first-load, persists across restarts,
# and RWX so a future seed/inspect pod can touch it. Path /srv/nfs-ssd/portal-tts
# on the Proxmox host. Small — two medium Piper voices are ~60-120 MiB each.
# Mirrors portal-stt's nfs_models pattern.
module "nfs_models" {
  source     = "../../modules/kubernetes/nfs_volume"
  name       = "portal-tts-models"
  namespace  = kubernetes_namespace.portal_tts.metadata[0].name
  nfs_server = var.nfs_server
  nfs_path   = "/srv/nfs-ssd/portal-tts"
  storage    = "2Gi" # 2 medium Piper voices + headroom for more
}

# One-shot bootstrap: /srv/nfs-ssd is exported whole-tree but the portal-tts
# SUBDIR must exist before kubelet can bind-mount it (chatterbox/portal-stt both
# hit exit 32 on a missing subdir). Mount the export ROOT (which exists) and
# mkdir the subtree; kubelet's mount retry then heals the main pod. Idempotent;
# immutable-once-created.
resource "kubernetes_job" "models_dir_init" {
  # Creates the voices/ and config/ subdirectories under the portal-tts path by
  # mounting the NFS export root, then lists the result. Terraform blocks until
  # the Job completes (3 minute create timeout).
  wait_for_completion = true

  timeouts {
    create = "3m"
  }

  metadata {
    namespace = kubernetes_namespace.portal_tts.metadata[0].name
    name      = "portal-tts-models-dir-init"
    labels    = local.labels
  }

  spec {
    ttl_seconds_after_finished = 86400
    backoff_limit              = 3

    template {
      metadata {
        labels = local.labels
      }

      spec {
        restart_policy = "Never"

        volume {
          name = "nfs-ssd-root"

          nfs {
            path   = "/srv/nfs-ssd"
            server = var.nfs_server
          }
        }

        container {
          image   = "busybox:1.37"
          name    = "mkdir"
          command = ["sh", "-c", "mkdir -p /mnt/portal-tts/voices /mnt/portal-tts/config && ls -la /mnt/portal-tts"]

          volume_mount {
            mount_path = "/mnt"
            name       = "nfs-ssd-root"
          }
        }
      }
    }
  }
}

# Holds the rendered voice_to_speaker.yaml. The Deployment's init container
# mounts this map and copies the file onto the PVC (openedai-speech reads config
# from a writable dir). The pod-template checksum annotation on the Deployment
# forces a rollout whenever the voice map changes.
resource "kubernetes_config_map" "voices" {
  data = {
    "voice_to_speaker.yaml" = local.voice_to_speaker
  }

  metadata {
    namespace = kubernetes_namespace.portal_tts.metadata[0].name
    name      = "portal-tts-voices"
    labels    = local.labels
  }
}

# The always-on Piper server: replicas=1, never scaled to zero (no off-peak
# gate). CPU-only, so there is NO node_selector / toleration / nvidia.com/gpu
# request and it can schedule on any worker. An init container downloads the
# voices to the PVC and seeds the config before the server starts;
# openedai-speech then serves both voices, selectable per request.
resource "kubernetes_deployment" "portal_tts" {
  metadata {
    namespace = kubernetes_namespace.portal_tts.metadata[0].name
    name      = "portal-tts"
    labels    = merge(local.labels, { tier = local.tiers.aux })
  }

  # wait_for_rollout is left at its default (true): a plain apply SHOULD block
  # until the voices have downloaded and the server reports healthy, surfacing a
  # bad image/voice early. First boot pulls ~2 voices from HuggingFace onto the
  # PVC.
  spec {
    replicas = 1

    selector {
      match_labels = {
        app = "portal-tts"
      }
    }

    # The NFS PVC is RWX so RollingUpdate would be safe, but Recreate keeps it
    # simple and avoids two pods racing the same voices/ dir on first download.
    strategy {
      type = "Recreate"
    }

    template {
      metadata {
        annotations = {
          "checksum/voices" = sha256(local.voice_to_speaker)
        }
        labels = {
          app = "portal-tts"
        }
      }

      spec {
        volume {
          name = "models"

          persistent_volume_claim {
            claim_name = module.nfs_models.claim_name
          }
        }

        volume {
          name = "config-src"

          config_map {
            name = kubernetes_config_map.voices.metadata[0].name
          }
        }

        # Runs first: fetches the voice files and seeds the config onto the PVC.
        init_container {
          image   = "busybox:1.37"
          name    = "fetch-voices"
          command = ["sh", "-c", local.download_script]

          resources {
            limits = {
              memory = "64Mi"
            }
            requests = {
              cpu    = "20m"
              memory = "32Mi"
            }
          }

          volume_mount {
            name       = "models"
            mount_path = "/data"
          }

          volume_mount {
            name       = "config-src"
            mount_path = "/config-src"
          }
        }

        container {
          image = var.piper_image
          name  = "portal-tts"

          # openedai-speech serves /v1/audio/speech on :8000.
          port {
            name           = "http"
            container_port = 8000
          }

          env {
            name  = "OPENEDAI_LOG_LEVEL"
            value = "INFO" # image default is INFO; explicit for legibility
          }

          # The server reads config/voice_to_speaker.yaml and voices/*.onnx
          # relative to its workdir (/app), so the PVC's two subdirectories are
          # mounted at /app/voices and /app/config where the init-seeded files
          # are found.
          volume_mount {
            name       = "models"
            sub_path   = "voices"
            mount_path = "/app/voices"
          }

          volume_mount {
            name       = "models"
            sub_path   = "config"
            mount_path = "/app/config"
          }

          # openedai-speech has no /health on its OpenAI surface, hence TCP
          # probes: the uvicorn socket binds only after the app (and the piper
          # voices index) is ready. The init container has already downloaded
          # the voices, so process start is fast.
          startup_probe {
            failure_threshold = 24 # ~2 min
            period_seconds    = 5

            tcp_socket {
              port = 8000
            }
          }

          readiness_probe {
            failure_threshold = 4
            period_seconds    = 15

            tcp_socket {
              port = 8000
            }
          }

          liveness_probe {
            failure_threshold     = 5
            period_seconds        = 30
            initial_delay_seconds = 20

            tcp_socket {
              port = 8000
            }
          }

          resources {
            # Piper is light on CPU. No CPU limit (cluster policy: CFS
            # throttling avoided). Memory is sized for the python runtime + 2
            # loaded onnx voices (each medium model ~60-120 MiB + onnxruntime
            # arena). VERIFY with krr after a few days of real traffic and
            # tighten.
            limits = {
              memory = "1Gi"
            }
            requests = {
              cpu    = "100m"
              memory = "512Mi"
            }
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [
      spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1
      # image is TF-OWNED (pinned tag on a FROZEN upstream) — do NOT let Keel
      # churn it. Ignore keel's annotation noise but keep the image pin applying.
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
      metadata[0].annotations["keel.sh/match-tag"],
      metadata[0].annotations["kubernetes.io/change-cause"],
      metadata[0].annotations["deployment.kubernetes.io/revision"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1
    ]
  }
}

# In-cluster ClusterIP only: the Gateway is the sole caller and audio stays on
# the LAN. No ingress and no Authentik — the Gateway is the only externally
# exposed component (ADR-0001) and holds the edge auth. OpenAI speech path:
#   http://portal-tts.portal-tts.svc.cluster.local:8000/v1/audio/speech
resource "kubernetes_service" "portal_tts" {
  metadata {
    namespace = kubernetes_namespace.portal_tts.metadata[0].name
    name      = "portal-tts"
    labels    = local.labels

    annotations = {
      # openedai-speech has no /metrics endpoint; annotation-based scrape is
      # kept on a liveness path so the Service stays in the scrape set
      # (Ready-endpoint relabeling filters non-Ready pods). Probes the OpenAI
      # models list.
      # NOTE(review): /v1/models presumably returns JSON rather than Prometheus
      # exposition format — confirm the scrape job tolerates that, since a plain
      # scrape would likely record this target as failing.
      "prometheus.io/port"   = "8000"
      "prometheus.io/path"   = "/v1/models"
      "prometheus.io/scrape" = "true"
    }
  }

  spec {
    type = "ClusterIP"

    selector = {
      app = "portal-tts"
    }

    port {
      name        = "http"
      target_port = 8000
      port        = 8000
    }
  }
}