diff --git a/stacks/portal-tts/main.tf b/stacks/portal-tts/main.tf new file mode 100644 index 00000000..1b423ff5 --- /dev/null +++ b/stacks/portal-tts/main.tf @@ -0,0 +1,405 @@ +# ============================================================================= +# portal-tts — Piper TTS (CPU, always-on) for the portal-assistant Gateway +# ============================================================================= +# +# DRAFT for operator review (portal-assistant issue #3). HITL apply: an agent +# drafts; the operator applies via GitOps and verifies the rollout. Do NOT +# `terragrunt apply` this from a worktree. +# +# WHAT: a single ALWAYS-ON Piper deployment serving Bulgarian +# (`bg_BG-dimitar-medium`) AND English (`en_US-lessac-medium`), with the voice +# chosen PER REQUEST, behind a ClusterIP Service `portal-tts.portal-tts.svc:8000`. +# CPU-ONLY — no GPU node selector / toleration / nvidia.com/gpu request (Piper +# is a fast CPU neural TTS; ADR-0003). Audio never leaves the LAN; the +# portal-assistant Gateway is the only externally exposed component (ADR-0001), +# so there is no ingress / Authentik here. +# +# WHY CPU + ALWAYS-ON (contrast the two GPU siblings): +# * tts/ (chatterbox) scales 0<->1 behind a free-VRAM CronJob gate — it is a +# best-effort BATCH tenant on the shared T4. +# * portal-stt/ (Speaches) is warm-resident on ONE T4 slice — interactive STT +# that must not pay a cold model load. +# Piper needs neither: it runs in real time on CPU (no GPU contention at all), +# so the simplest correct design is replicas=1, always up. Keeping it off the +# T4 also REMOVES one tenant from the OOM-prone shared card +# (docs/post-mortems/2026-06-02-immich-ml-ttl-gpu-oom-recruiter.md) — Bulgarian +# isn't on chatterbox anyway (its 23 langs exclude bg; ADR-0003). +# +# API SHAPE (the Gateway team's contract): openedai-speech is OpenAI-compatible. +# POST /v1/audio/speech +# Content-Type: application/json +# { "model": "tts-1", "input": "", "voice": "", +# "response_format": "wav", "speed": 1.0 } +# -> 200, body = raw audio bytes (wav/mp3/opus/flac/aac/pcm per response_format) +# This matches the Gateway's tts.synthesize(text, lang) -> bytes interface +# (portal-assistant gateway/app/pipeline.py): map lang "bg"->voice "bg", +# anything else -> "en". Same OpenAI shape chatterbox already uses, so the +# Gateway can treat Piper and the edge-tts fallback identically. +# +# PLUGGABLE FALLBACK (noted, NOT built here — a Gateway-side concern): ADR-0003 +# keeps TTS a swappable backend with Microsoft edge-tts as an online +# Bulgarian-quality fallback. The Gateway selects Piper (this Service, on-LAN +# default) vs edge-tts (cloud) per its own config; nothing in THIS stack needs +# to change to add edge-tts. If a second in-cluster engine is ever wanted, +# add a sibling Deployment+Service and let the Gateway choose. +# +# IMAGE CHOICE (OPEN ITEM — operator please confirm before apply): +# Primary: ghcr.io/matatonic/openedai-speech-min — the CPU-only, piper-only +# (~1 GiB) variant of openedai-speech. OpenAI /v1/audio/speech, multi-voice via +# a voice_to_speaker.yaml map, returns raw audio bytes. Pre-built for +# linux/amd64+arm64, pullable from ghcr (tags: latest, 0.18.2, 0.18.1, ...). +# CAVEAT: the upstream repo was ARCHIVED 2026-01-04 (read-only, no further +# updates / security patches). It is feature-complete and stable for this use, +# but pinned (not Keel-tracked) precisely because it is frozen upstream. +# Alternative if a maintained image is preferred: arkdevuk/Webpiper (FastAPI, +# actively developed) — but it returns JSON {url} requiring a SECOND fetch to +# retrieve the wav, a worse fit for the Gateway's bytes contract; it would +# need a small Gateway adapter. The rhasspy/wyoming-piper HTTP server is NOT +# suitable: it loads ONE voice per process (no per-request voice switch). +# ============================================================================= + +variable "nfs_server" { + type = string + description = "NFS server (Proxmox host). From config.tfvars (192.168.1.127)." +} + +variable "piper_image" { + type = string + # CPU-only piper-only openedai-speech. Pinned to 0.18.2 (the newest published + # tag; repo archived 2026-01-04 so this is effectively the final release) for + # reproducibility — NOT :latest, which would also drift to the same frozen + # build but loses the explicit version record. linux/amd64 confirmed. + default = "ghcr.io/matatonic/openedai-speech-min:0.18.2" + description = "openedai-speech CPU/piper-only image. See IMAGE CHOICE note in main.tf." +} + +variable "bg_voice" { + type = string + # The single Bulgarian Piper voice (rhasspy/piper-voices). ADR-0003 names this + # exact model; it was reviewed against edge-tts in the M0.2 bake-off. + default = "bg_BG-dimitar-medium" + description = "Bulgarian Piper voice model stem (rhasspy/piper-voices)." +} + +variable "en_voice" { + type = string + # English Piper voice. lessac-medium is the canonical balanced en_US voice + # (the upstream openedai-speech default-quality pick). Swappable to any + # rhasspy/piper-voices en stem (e.g. en_US-amy-medium, en_GB-alba-medium). + default = "en_US-lessac-medium" + description = "English Piper voice model stem (rhasspy/piper-voices)." +} + +locals { + namespace = "portal-tts" + labels = { app = "portal-tts" } + + # rhasspy/piper-voices HuggingFace layout: + # resolve/main/////--.onnx + # Each voice needs BOTH the .onnx and the matching .onnx.json (Piper reads the + # sample rate / phoneme map from the .json next to the model). All 4 URLs were + # HEAD-verified 200 on 2026-06-17. Derived from the voice stems so changing a + # voice variable updates both the download URL and the config map together. + hf_base = "https://huggingface.co/rhasspy/piper-voices/resolve/main" + voice_models = { + # voice stem => HF directory path "///" + (var.bg_voice) = "bg/bg_BG/dimitar/medium" + (var.en_voice) = "en/en_US/lessac/medium" + } + + # voice_to_speaker.yaml: the openedai-speech config that maps a REQUEST voice + # name ("bg" / "en") to a Piper .onnx on the PVC. The Gateway sends + # voice="bg" or "en"; the server resolves it here. (We expose short logical + # names, not the long model stems, so the Gateway's lang->voice map is trivial + # and stable even if the underlying model stem changes.) The default model + # name is "tts-1" (openedai-speech's piper model id). + voice_to_speaker = yamlencode({ + "tts-1" = { + bg = { + model = "voices/${var.bg_voice}.onnx" + speaker = null # single-speaker model -> default + } + en = { + model = "voices/${var.en_voice}.onnx" + speaker = null + } + } + }) + + # Init-container provisioning script. Downloads each voice's .onnx + .onnx.json + # into the PVC's voices/ dir IF MISSING (idempotent — re-runs skip + # already-present files), then copies the config map's voice_to_speaker.yaml + # into the PVC's config/ dir. Pure POSIX + wget (busybox has wget). + download_script = <<-EOT + set -eu + mkdir -p /data/voices /data/config + fetch() { # $1 = url, $2 = dest + if [ -s "$2" ]; then echo "have $2"; return 0; fi + echo "get $1 -> $2" + wget -q -O "$2.tmp" "$1" + mv "$2.tmp" "$2" + } + %{for stem, dir in local.voice_models~} + fetch "${local.hf_base}/${dir}/${stem}.onnx" "/data/voices/${stem}.onnx" + fetch "${local.hf_base}/${dir}/${stem}.onnx.json" "/data/voices/${stem}.onnx.json" + %{endfor~} + cp /config-src/voice_to_speaker.yaml /data/config/voice_to_speaker.yaml + echo "voices:"; ls -la /data/voices + echo "config:"; cat /data/config/voice_to_speaker.yaml + EOT +} + +resource "kubernetes_namespace" "portal_tts" { + metadata { + name = local.namespace + labels = { + tier = local.tiers.aux # CPU-only best-effort helper, not a GPU tenant + "istio-injection" = "disabled" + "keel.sh/enrolled" = "true" + } + } + lifecycle { + # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label on every namespace + ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] + } +} + +# Voice models + config on NFS-SSD: fast first-load, persists across restarts, +# and RWX so a future seed/inspect pod can touch it. Path /srv/nfs-ssd/portal-tts +# on the Proxmox host. Small — two medium Piper voices are ~60-120 MiB each. +# Mirrors portal-stt's nfs_models pattern. +module "nfs_models" { + source = "../../modules/kubernetes/nfs_volume" + name = "portal-tts-models" + namespace = kubernetes_namespace.portal_tts.metadata[0].name + nfs_server = var.nfs_server + nfs_path = "/srv/nfs-ssd/portal-tts" + storage = "2Gi" # 2 medium Piper voices + headroom for more +} + +# One-shot bootstrap: /srv/nfs-ssd is exported whole-tree but the portal-tts +# SUBDIR must exist before kubelet can bind-mount it (chatterbox/portal-stt both +# hit exit 32 on a missing subdir). Mount the export ROOT (which exists) and +# mkdir the subtree; kubelet's mount retry then heals the main pod. Idempotent; +# immutable-once-created. +resource "kubernetes_job" "models_dir_init" { + metadata { + name = "portal-tts-models-dir-init" + namespace = kubernetes_namespace.portal_tts.metadata[0].name + labels = local.labels + } + spec { + backoff_limit = 3 + ttl_seconds_after_finished = 86400 + template { + metadata { labels = local.labels } + spec { + restart_policy = "Never" + container { + name = "mkdir" + image = "busybox:1.37" + command = ["sh", "-c", "mkdir -p /mnt/portal-tts/voices /mnt/portal-tts/config && ls -la /mnt/portal-tts"] + volume_mount { + name = "nfs-ssd-root" + mount_path = "/mnt" + } + } + volume { + name = "nfs-ssd-root" + nfs { + server = var.nfs_server + path = "/srv/nfs-ssd" + } + } + } + } + } + wait_for_completion = true + timeouts { create = "3m" } +} + +# The voice_to_speaker.yaml map, mounted into the init container which copies it +# onto the PVC (openedai-speech reads config from a writable dir). Checksum drives +# a rollout when the voice map changes. +resource "kubernetes_config_map" "voices" { + metadata { + name = "portal-tts-voices" + namespace = kubernetes_namespace.portal_tts.metadata[0].name + labels = local.labels + } + data = { + "voice_to_speaker.yaml" = local.voice_to_speaker + } +} + +# Always-on Piper. replicas=1, never scaled to zero (no off-peak gate). CPU-only: +# NO node_selector / toleration / nvidia.com/gpu — it runs on any worker. The +# init container downloads the voices to the PVC and seeds the config before the +# server starts; openedai-speech then serves both voices, selectable per request. +resource "kubernetes_deployment" "portal_tts" { + metadata { + name = "portal-tts" + namespace = kubernetes_namespace.portal_tts.metadata[0].name + labels = merge(local.labels, { tier = local.tiers.aux }) + } + # wait_for_rollout left default (true): a plain apply SHOULD block until the + # voices download + the server reports healthy, surfacing a bad image/voice + # early. First boot pulls ~2 voices from HuggingFace onto the PVC. + spec { + replicas = 1 + # NFS PVC is RWX so RollingUpdate would be safe, but Recreate keeps it simple + # and avoids two pods racing the same voices/ dir on first download. + strategy { type = "Recreate" } + selector { + match_labels = { app = "portal-tts" } + } + template { + metadata { + labels = { app = "portal-tts" } + annotations = { + "checksum/voices" = sha256(local.voice_to_speaker) + } + } + spec { + # Download voices + seed config onto the PVC before the server starts. + init_container { + name = "fetch-voices" + image = "busybox:1.37" + command = ["sh", "-c", local.download_script] + volume_mount { + name = "models" + mount_path = "/data" + } + volume_mount { + name = "config-src" + mount_path = "/config-src" + } + resources { + requests = { cpu = "20m", memory = "32Mi" } + limits = { memory = "64Mi" } + } + } + + container { + name = "portal-tts" + image = var.piper_image + + # openedai-speech serves /v1/audio/speech on :8000. It reads + # config/voice_to_speaker.yaml and voices/*.onnx relative to its + # workdir (/app); we mount the PVC at /app/voices and /app/config so + # the init-seeded files are found. + port { + container_port = 8000 + name = "http" + } + env { + name = "OPENEDAI_LOG_LEVEL" + value = "INFO" # image default is INFO; explicit for legibility + } + + volume_mount { + name = "models" + mount_path = "/app/voices" + sub_path = "voices" + } + volume_mount { + name = "models" + mount_path = "/app/config" + sub_path = "config" + } + + # openedai-speech has no /health on its OpenAI surface. Use a TCP probe + # — the uvicorn socket binds only after the app (and the piper voices + # index) is ready. The init container already downloaded the voices, so + # process start is fast. + startup_probe { + tcp_socket { port = 8000 } + period_seconds = 5 + failure_threshold = 24 # ~2 min + } + readiness_probe { + tcp_socket { port = 8000 } + period_seconds = 15 + failure_threshold = 4 + } + liveness_probe { + tcp_socket { port = 8000 } + initial_delay_seconds = 20 + period_seconds = 30 + failure_threshold = 5 + } + + resources { + # Piper is light on CPU. No CPU limit (cluster policy: CFS throttling + # avoided). Memory sized for the python runtime + 2 loaded onnx voices + # (each medium model ~60-120 MiB + onnxruntime arena). VERIFY with krr + # after a few days of real traffic and tighten. + requests = { + cpu = "100m" + memory = "512Mi" + } + limits = { + memory = "1Gi" + } + } + } + + volume { + name = "models" + persistent_volume_claim { + claim_name = module.nfs_models.claim_name + } + } + volume { + name = "config-src" + config_map { + name = kubernetes_config_map.voices.metadata[0].name + } + } + } + } + } + lifecycle { + ignore_changes = [ + spec[0].template[0].spec[0].dns_config, # KYVERNO_LIFECYCLE_V1 + # image is TF-OWNED (pinned tag on a FROZEN upstream) — do NOT let Keel + # churn it. Ignore keel's annotation noise but keep the image pin applying. + metadata[0].annotations["keel.sh/policy"], + metadata[0].annotations["keel.sh/trigger"], + metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2 + metadata[0].annotations["keel.sh/match-tag"], + metadata[0].annotations["kubernetes.io/change-cause"], + metadata[0].annotations["deployment.kubernetes.io/revision"], + spec[0].template[0].metadata[0].annotations["keel.sh/update-time"], # KEEL_LIFECYCLE_V1 + ] + } +} + +# ClusterIP — in-cluster only (the Gateway calls this; audio stays on the LAN). +# No ingress, no Authentik: the Gateway is the only externally exposed component +# (ADR-0001) and holds the edge auth. OpenAI speech path: +# http://portal-tts.portal-tts.svc.cluster.local:8000/v1/audio/speech +resource "kubernetes_service" "portal_tts" { + metadata { + name = "portal-tts" + namespace = kubernetes_namespace.portal_tts.metadata[0].name + labels = local.labels + annotations = { + # openedai-speech has no /metrics endpoint; annotation-based scrape kept on + # a liveness path so the Service stays in the scrape set (Ready-endpoint + # relabeling filters non-Ready pods). Probes the OpenAI models list. + "prometheus.io/scrape" = "true" + "prometheus.io/path" = "/v1/models" + "prometheus.io/port" = "8000" + } + } + spec { + type = "ClusterIP" + selector = { app = "portal-tts" } + port { + name = "http" + port = 8000 + target_port = 8000 + } + } +} diff --git a/stacks/portal-tts/terragrunt.hcl b/stacks/portal-tts/terragrunt.hcl new file mode 100644 index 00000000..8e25ac63 --- /dev/null +++ b/stacks/portal-tts/terragrunt.hcl @@ -0,0 +1,33 @@ +include "root" { + path = find_in_parent_folders() +} + +dependency "platform" { + config_path = "../platform" + skip_outputs = true +} + +# portal-tts: in-cluster text-to-speech for the portal-assistant Gateway +# (portal-assistant issue #3, ADR-0003). One ALWAYS-ON Deployment of Piper +# (ghcr.io/matatonic/openedai-speech-min, OpenAI-compatible /v1/audio/speech) +# serving Bulgarian `bg_BG-dimitar-medium` + English `en_US-lessac-medium`, voice +# chosen PER REQUEST, behind a single ClusterIP Service +# `portal-tts.portal-tts.svc:8000`. Speech path: /v1/audio/speech. +# +# CPU-ONLY: Piper is a fast CPU neural TTS — NO GPU node selector / toleration / +# nvidia.com/gpu request. This deliberately keeps TTS off the OOM-prone shared +# T4 (the two GPU siblings tts/chatterbox + portal-stt already contend for it); +# Bulgarian isn't available on chatterbox anyway (ADR-0003). replicas=1, never +# scaled to zero — no off-peak gate needed when there's no GPU to free. +# +# Voices live on an NFS-SSD PVC, downloaded from rhasspy/piper-voices by an init +# container on first boot (both .onnx + .onnx.json), then persist. A ConfigMap +# supplies voice_to_speaker.yaml mapping request voice "bg"/"en" -> .onnx model. +# +# PLUGGABLE: ADR-0003 keeps TTS a swappable backend with edge-tts as an online +# Bulgarian fallback — that switch is Gateway-side; nothing here changes for it. +# +# nfs_server comes from config.tfvars (192.168.1.127) via the root inputs. +# +# HITL: agent drafts; operator applies via GitOps, then verifies the rollout + +# a bg/en /v1/audio/speech smoke test (curl returns audio bytes).