From 53117b193af41114a4715f6d293a0cd21654822b Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sat, 20 Jun 2026 08:21:43 +0000
Subject: [PATCH] portal-realtime: deploy the v2 full-duplex voice agent (Pipecat)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New stack for the realtime voice agent — v2 of the portal-assistant brain
path. One persistent WebSocket per conversation: continuous mic audio ->
Silero VAD turn-taking -> Whisper STT (portal-stt) -> streaming Claude brain
(claude-agent-service) -> edge-tts (portal-tts) -> audio out, with barge-in.
Reuses all three upstream cluster services; nothing new is spun up.

Public Cloudflare ingress (proxied, WebSocket) at
portal-realtime.viktorbarzin.me with the app's own DEVICE_TOKEN as the edge
gate (auth="app" — Authentik would break the native Portal client). No
buffering middleware: it would break the streaming WebSocket.

Image ghcr.io/viktorbarzin/portal-assistant-realtime (private ghcr, pulled
with ghcr_pull_token). Sibling to the v1 portal-assistant gateway, which
stays live.

Co-Authored-By: Claude Opus 4.8
---
 stacks/portal-realtime/main.tf        | 253 ++++++++++++++++++++++++++
 stacks/portal-realtime/terragrunt.hcl |  17 ++
 2 files changed, 270 insertions(+)
 create mode 100644 stacks/portal-realtime/main.tf
 create mode 100644 stacks/portal-realtime/terragrunt.hcl

diff --git a/stacks/portal-realtime/main.tf b/stacks/portal-realtime/main.tf
new file mode 100644
index 00000000..53ba1a3a
--- /dev/null
+++ b/stacks/portal-realtime/main.tf
@@ -0,0 +1,253 @@
+# =============================================================================
+# portal-realtime — Pipecat full-duplex voice agent served over one WebSocket
+# =============================================================================
+# Second generation (v2) of the portal-assistant brain path. Where v1 is a
+# tap-to-talk request/response gateway, v2 holds a persistent conversation: the
+# Portal opens ONE WebSocket (/ws) and streams raw PCM16 mic audio nonstop. The
+# Pipecat pipeline runs Silero VAD turn-taking -> Whisper STT (portal-stt) ->
+# streaming Claude brain (claude-agent-service /v1/chat/completions) ->
+# edge-tts (portal-tts) -> audio out, with barge-in. Every upstream is an
+# existing cluster service being REUSED (nothing new is spun up); the brain
+# streams token-by-token over the free CLI/subscription (no API key).
+# Bilingual bg/en: the TTS voice follows the script of the reply.
+#
+# EXPOSURE: exactly one public Cloudflare ingress (proxied, WebSocket),
+# wss://portal-realtime.viktorbarzin.me/ws. Edge auth is done by the agent
+# itself (the DEVICE_TOKEN the Portal sends as ?token=), hence auth="app" —
+# Authentik would break the native Portal client (no browser login). There is
+# NO buffering middleware (max_body_size unset) because Traefik's Buffering
+# middleware would break the streaming WebSocket.
+#
+# IMAGE: ghcr.io/viktorbarzin/portal-assistant-realtime, a PRIVATE ghcr package
+# pulled with the read-only ghcr_pull_token and built from the realtime/ dir of
+# the portal-assistant repo. :latest + Keel auto-roll (namespace keel-enrolled).
+# =============================================================================
+
+data "vault_kv_secret_v2" "viktor" {
+  name  = "viktor"
+  mount = "secret"
+}
+
+data "vault_kv_secret_v2" "cas" {
+  name  = "claude-agent-service"
+  mount = "secret"
+}
+
+# The device token is shared with portal-assistant: it is the same physical
+# Portal device and the same edge secret, so no separate credential is needed.
+data "vault_kv_secret_v2" "pa" {
+  name  = "portal-assistant"
+  mount = "secret"
+}
+
+locals {
+  image     = "ghcr.io/viktorbarzin/portal-assistant-realtime:latest"
+  labels    = { app = "portal-realtime" }
+  namespace = "portal-realtime"
+}
+
+resource "kubernetes_namespace" "portal_realtime" {
+  metadata {
+    name = local.namespace
+    labels = {
+      "keel.sh/enrolled" = "true"
+      "istio-injection"  = "disabled"
+      tier               = local.tiers.edge # local.tiers is declared outside this file
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: goldilocks-vpa-auto-mode ClusterPolicy stamps this label.
+    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
+  }
+}
+
+# Pull secret for the PRIVATE ghcr package holding the realtime image. It wraps
+# the read-only ghcr_pull_token (secret/viktor) the cluster-wide allowlist uses.
+resource "kubernetes_secret" "ghcr" {
+  metadata {
+    namespace = kubernetes_namespace.portal_realtime.metadata[0].name
+    name      = "ghcr-pull"
+  }
+  type = "kubernetes.io/dockerconfigjson"
+  data = {
+    ".dockerconfigjson" = jsonencode({
+      auths = {
+        "ghcr.io" = {
+          username = "viktorbarzin"
+          password = data.vault_kv_secret_v2.viktor.data["ghcr_pull_token"]
+          auth     = base64encode(join(":", ["viktorbarzin", data.vault_kv_secret_v2.viktor.data["ghcr_pull_token"]]))
+        }
+      }
+    })
+  }
+}
+
+# Tokens: BRAIN_TOKEN is the claude-agent-service bearer used to call the
+# streaming conversational endpoint; DEVICE_TOKEN is the per-Portal secret the
+# app sends as ?token= on the WebSocket and the agent verifies before accepting.
+resource "kubernetes_secret" "realtime" {
+  metadata {
+    namespace = kubernetes_namespace.portal_realtime.metadata[0].name
+    name      = "portal-realtime-secrets"
+  }
+  data = {
+    DEVICE_TOKEN = data.vault_kv_secret_v2.pa.data["device_token"]
+    BRAIN_TOKEN  = data.vault_kv_secret_v2.cas.data["api_bearer_token"]
+  }
+}
+
+resource "kubernetes_deployment" "realtime" {
+  metadata {
+    namespace = kubernetes_namespace.portal_realtime.metadata[0].name
+    name      = "portal-realtime"
+    labels    = merge(local.labels, { tier = local.tiers.edge })
+  }
+  spec {
+    replicas = 1
+    selector {
+      match_labels = local.labels
+    }
+    template {
+      metadata {
+        labels = local.labels
+      }
+      spec {
+        image_pull_secrets {
+          name = kubernetes_secret.ghcr.metadata[0].name
+        }
+        container {
+          name              = "realtime"
+          image             = local.image
+          image_pull_policy = "Always"
+          port {
+            name           = "http"
+            container_port = 8000
+          }
+
+          # Base URLs for STT/Brain/TTS end in /v1; the agent's OpenAI-SDK clients
+          # then append /audio/transcriptions, /chat/completions, /audio/speech.
+          env {
+            name  = "STT_URL"
+            value = "http://portal-stt.portal-stt.svc.cluster.local:8000/v1"
+          }
+          env {
+            name  = "STT_MODEL"
+            value = "deepdml/faster-whisper-large-v3-turbo-ct2"
+          }
+          env {
+            name  = "BRAIN_URL"
+            value = "http://claude-agent-service.claude-agent.svc.cluster.local:8080/v1"
+          }
+          env {
+            name  = "BRAIN_MODEL"
+            value = "sonnet" # favour latency over smartness in live conversation
+          }
+          env {
+            name  = "TTS_URL"
+            value = "http://portal-tts.portal-tts.svc.cluster.local:8000/v1"
+          }
+          # edge-tts neural voices: the agent picks one per reply script (bg/en).
+          env {
+            name  = "TTS_VOICE_BG"
+            value = "bg-BG-KalinaNeural"
+          }
+          env {
+            name  = "TTS_VOICE_EN"
+            value = "en-US-AvaNeural"
+          }
+          env {
+            name = "BRAIN_TOKEN"
+            value_from {
+              secret_key_ref {
+                key  = "BRAIN_TOKEN"
+                name = kubernetes_secret.realtime.metadata[0].name
+              }
+            }
+          }
+          env {
+            name = "DEVICE_TOKEN"
+            value_from {
+              secret_key_ref {
+                key  = "DEVICE_TOKEN"
+                name = kubernetes_secret.realtime.metadata[0].name
+              }
+            }
+          }
+
+          readiness_probe {
+            http_get {
+              port = 8000
+              path = "/health"
+            }
+            period_seconds        = 10
+            initial_delay_seconds = 10
+          }
+          liveness_probe {
+            http_get {
+              port = 8000
+              path = "/health"
+            }
+            period_seconds        = 30
+            initial_delay_seconds = 20
+          }
+
+          resources {
+            # Sized for Pipecat + onnxruntime (Silero VAD) per live connection. CPU
+            # is request-only: no limit, per the cluster CFS-throttling policy.
+            # Memory is burstable (tier-edge). VERIFY with krr after real traffic.
+            limits = {
+              memory = "1Gi"
+            }
+            requests = {
+              memory = "512Mi"
+              cpu    = "200m"
+            }
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    ignore_changes = [
+      spec[0].template[0].spec[0].dns_config,         # KYVERNO_LIFECYCLE_V1
+      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel rolls :latest; don't let TF revert the digest
+    ]
+  }
+}
+
+# ClusterIP fronted by the ingress below. The agent serves no /metrics endpoint,
+# so it is left out of the annotation scrape set (Pipecat metrics are internal).
+resource "kubernetes_service" "realtime" {
+  metadata {
+    namespace = kubernetes_namespace.portal_realtime.metadata[0].name
+    name      = "portal-realtime"
+    labels    = local.labels
+  }
+  spec {
+    type     = "ClusterIP"
+    selector = local.labels
+    port {
+      name        = "http"
+      target_port = 8000
+      port        = 8000
+    }
+  }
+}
+
+# Public Cloudflare ingress for wss://portal-realtime.viktorbarzin.me/ws. The
+# standard Traefik HTTP router upgrades WebSocket without a special annotation,
+# and the entrypoint's writeTimeout=0 keeps long-lived streams open. tls-secret
+# is Kyverno-synced into the namespace. max_body_size stays unset on purpose: a
+# Buffering middleware would break the streaming WebSocket.
+module "ingress" {
+  source          = "../../modules/kubernetes/ingress_factory"
+  namespace       = kubernetes_namespace.portal_realtime.metadata[0].name
+  name            = "portal-realtime"
+  service_name    = kubernetes_service.realtime.metadata[0].name
+  port            = 8000
+  tls_secret_name = "tls-secret"
+  # auth = "app": the agent gates /ws with its own DEVICE_TOKEN check; Authentik
+  # would break the native Portal client, which has no browser login.
+  auth     = "app"
+  dns_type = "proxied"
+}
diff --git a/stacks/portal-realtime/terragrunt.hcl b/stacks/portal-realtime/terragrunt.hcl
new file mode 100644
index 00000000..da842ec8
--- /dev/null
+++ b/stacks/portal-realtime/terragrunt.hcl
@@ -0,0 +1,17 @@
+include "root" {
+  path = find_in_parent_folders()
+}
+
+dependency "platform" {
+  skip_outputs = true
+  config_path  = "../platform"
+}
+
+# portal-realtime: the v2 full-duplex voice agent (Pipecat). Each conversation
+# holds one persistent WebSocket: continuous mic audio -> Silero VAD turn-taking
+# -> Whisper STT (portal-stt) -> streaming Claude brain (claude-agent-service)
+# -> edge-tts (portal-tts) -> audio out, with barge-in. All three upstream
+# cluster services are reused; nothing new is spun up. The public Cloudflare
+# ingress (proxied, WebSocket) is gated at the edge by the app's own
+# DEVICE_TOKEN. Sibling to portal-assistant (the v1 tap-to-talk gateway, still
+# live). portal-assistant realtime Phase 3.