portal-realtime: deploy the v2 full-duplex voice agent (Pipecat)
All checks were successful
ci/woodpecker/push/default Pipeline was successful
All checks were successful
ci/woodpecker/push/default Pipeline was successful
New stack for the realtime voice agent — v2 of the portal-assistant brain path. One persistent WebSocket per conversation: continuous mic audio -> Silero VAD turn-taking -> Whisper STT (portal-stt) -> streaming Claude brain (claude-agent-service) -> edge-tts (portal-tts) -> audio out, with barge-in. Reuses all three upstream cluster services; nothing new is spun up. Public Cloudflare ingress (proxied, WebSocket) at portal-realtime.viktorbarzin.me with the app's own DEVICE_TOKEN as the edge gate (auth="app" — Authentik would break the native Portal client). No buffering middleware: it would break the streaming WebSocket. Image ghcr.io/viktorbarzin/portal-assistant-realtime (private ghcr, pulled with ghcr_pull_token). Sibling to the v1 portal-assistant gateway, which stays live. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
44cac6f4e2
commit
53117b193a
2 changed files with 270 additions and 0 deletions
253
stacks/portal-realtime/main.tf
Normal file
253
stacks/portal-realtime/main.tf
Normal file
|
|
@ -0,0 +1,253 @@
|
|||
# =============================================================================
|
||||
# portal-realtime — full-duplex voice agent (Pipecat) over a WebSocket
|
||||
# =============================================================================
|
||||
# v2 of the portal-assistant brain path. Instead of the v1 tap-to-talk
|
||||
# request/response gateway, this is a persistent conversation: the Portal opens
|
||||
# ONE WebSocket (/ws) and streams raw PCM16 mic audio continuously; the Pipecat
|
||||
# pipeline does Silero VAD turn-taking -> Whisper STT (portal-stt) -> streaming
|
||||
# Claude brain (claude-agent-service /v1/chat/completions) -> edge-tts
|
||||
# (portal-tts) -> audio out, with barge-in. All three upstreams are REUSED
|
||||
# cluster services (nothing new spun up); the brain streams token-by-token over
|
||||
# the free CLI/subscription (no API key). Bilingual bg/en: the TTS voice follows
|
||||
# the reply's script.
|
||||
#
|
||||
# EXPOSURE: a single public Cloudflare ingress (proxied, WebSocket) at
|
||||
# wss://portal-realtime.viktorbarzin.me/ws. The agent enforces its own edge auth
|
||||
# (the DEVICE_TOKEN the Portal carries as ?token=), so auth="app" — Authentik
|
||||
# would break the native Portal client (no browser login). NO buffering
|
||||
# middleware (max_body_size unset): Traefik's Buffering middleware would break
|
||||
# the streaming WebSocket.
|
||||
#
|
||||
# IMAGE: ghcr.io/viktorbarzin/portal-assistant-realtime (PRIVATE ghcr package,
|
||||
# pulled with the read-only ghcr_pull_token). Built from the portal-assistant
|
||||
# repo's realtime/ dir. :latest + Keel auto-roll (namespace is keel-enrolled).
|
||||
# =============================================================================
|
||||
|
||||
# Vault KV v2 entry "viktor" on the "secret" mount. This stack reads its
# ghcr_pull_token key to build the image pull secret.
data "vault_kv_secret_v2" "viktor" {
  name  = "viktor"
  mount = "secret"
}
|
||||
|
||||
# Vault KV v2 entry "claude-agent-service" on the "secret" mount. Its
# api_bearer_token key becomes the agent's BRAIN_TOKEN.
data "vault_kv_secret_v2" "cas" {
  name  = "claude-agent-service"
  mount = "secret"
}
|
||||
|
||||
# The realtime agent shares the portal-assistant device token: it is the same
# physical Portal device and the same edge secret, so no separate credential
# is minted. Its device_token key becomes the agent's DEVICE_TOKEN.
data "vault_kv_secret_v2" "pa" {
  name  = "portal-assistant"
  mount = "secret"
}
|
||||
|
||||
# Values shared by the resources in this stack.
locals {
  # :latest tag of the private ghcr package that the Deployment runs.
  image = "ghcr.io/viktorbarzin/portal-assistant-realtime:latest"

  # Base label set applied to the Deployment and Service metadata.
  labels = { app = "portal-realtime" }

  # Name of the namespace created below.
  namespace = "portal-realtime"
}
|
||||
|
||||
# Namespace that holds every resource in this stack. It is labelled with the
# edge tier, opted out of Istio sidecar injection, and enrolled with Keel.
resource "kubernetes_namespace" "portal_realtime" {
  metadata {
    name = local.namespace

    labels = {
      "keel.sh/enrolled" = "true"
      "istio-injection"  = "disabled"
      tier               = local.tiers.edge
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: the goldilocks-vpa-auto-mode ClusterPolicy stamps
    # this label, so it is ignored here rather than treated as drift.
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
|
||||
|
||||
# Image pull secret. The realtime image is a PRIVATE ghcr package, so the pod
# needs registry credentials; this uses the read-only ghcr_pull_token from
# secret/viktor, the same credential the cluster-wide allowlist uses.
resource "kubernetes_secret" "ghcr" {
  type = "kubernetes.io/dockerconfigjson"

  metadata {
    namespace = kubernetes_namespace.portal_realtime.metadata[0].name
    name      = "ghcr-pull"
  }

  data = {
    # Docker config JSON with one registry entry for ghcr.io. "auth" is the
    # base64 of "username:password", matching the two fields beside it.
    ".dockerconfigjson" = jsonencode({
      auths = {
        "ghcr.io" = {
          auth = base64encode(join(":", [
            "viktorbarzin",
            data.vault_kv_secret_v2.viktor.data["ghcr_pull_token"],
          ]))
          password = data.vault_kv_secret_v2.viktor.data["ghcr_pull_token"]
          username = "viktorbarzin"
        }
      }
    })
  }
}
|
||||
|
||||
# Runtime tokens for the agent, injected into the container as env vars:
#   DEVICE_TOKEN - the per-Portal secret the app carries as ?token= on the
#                  WebSocket; the agent verifies it before accepting.
#   BRAIN_TOKEN  - claude-agent-service's bearer, used to call the streaming
#                  conversational endpoint.
resource "kubernetes_secret" "realtime" {
  metadata {
    namespace = kubernetes_namespace.portal_realtime.metadata[0].name
    name      = "portal-realtime-secrets"
  }

  data = {
    DEVICE_TOKEN = data.vault_kv_secret_v2.pa.data["device_token"]
    BRAIN_TOKEN  = data.vault_kv_secret_v2.cas.data["api_bearer_token"]
  }
}
|
||||
|
||||
# Single-replica Deployment for the realtime agent container. The image comes
# from the private ghcr package via the ghcr-pull secret; upstream endpoints
# and voices are plain env vars, while the two tokens are read from the
# portal-realtime-secrets Secret.
resource "kubernetes_deployment" "realtime" {
  metadata {
    namespace = kubernetes_namespace.portal_realtime.metadata[0].name
    name      = "portal-realtime"
    labels    = merge(local.labels, { tier = local.tiers.edge })
  }

  spec {
    replicas = 1

    selector {
      match_labels = { app = "portal-realtime" }
    }

    template {
      metadata {
        labels = { app = "portal-realtime" }
      }

      spec {
        image_pull_secrets {
          name = kubernetes_secret.ghcr.metadata[0].name
        }

        container {
          name              = "realtime"
          image             = local.image
          image_pull_policy = "Always"

          port {
            name           = "http"
            container_port = 8000
          }

          resources {
            # Sized for Pipecat + onnxruntime (Silero VAD) per live connection.
            # CPU is request-only: no CPU limit, per the cluster CFS-throttling
            # policy. Memory is burstable (tier-edge). VERIFY with krr after
            # real traffic.
            limits = {
              memory = "1Gi"
            }
            requests = {
              memory = "512Mi"
              cpu    = "200m"
            }
          }

          # The STT, brain, and TTS base URLs all end in /v1 because the
          # agent's OpenAI-SDK clients append /audio/transcriptions,
          # /chat/completions, and /audio/speech to them.
          env {
            name  = "STT_URL"
            value = "http://portal-stt.portal-stt.svc.cluster.local:8000/v1"
          }
          env {
            name  = "STT_MODEL"
            value = "deepdml/faster-whisper-large-v3-turbo-ct2"
          }
          env {
            name  = "BRAIN_URL"
            value = "http://claude-agent-service.claude-agent.svc.cluster.local:8080/v1"
          }
          env {
            name = "BRAIN_MODEL"
            # Latency is preferred over smartness for live conversation.
            value = "sonnet"
          }
          env {
            name  = "TTS_URL"
            value = "http://portal-tts.portal-tts.svc.cluster.local:8000/v1"
          }

          # edge-tts neural voices; the agent picks one per reply according to
          # the reply's script (bg/en).
          env {
            name  = "TTS_VOICE_BG"
            value = "bg-BG-KalinaNeural"
          }
          env {
            name  = "TTS_VOICE_EN"
            value = "en-US-AvaNeural"
          }

          # Tokens are read from the Secret rather than inlined here.
          env {
            name = "BRAIN_TOKEN"
            value_from {
              secret_key_ref {
                key  = "BRAIN_TOKEN"
                name = kubernetes_secret.realtime.metadata[0].name
              }
            }
          }
          env {
            name = "DEVICE_TOKEN"
            value_from {
              secret_key_ref {
                key  = "DEVICE_TOKEN"
                name = kubernetes_secret.realtime.metadata[0].name
              }
            }
          }

          # Both probes hit GET /health on the container port; liveness
          # starts later and polls less often than readiness.
          readiness_probe {
            initial_delay_seconds = 10
            period_seconds        = 10

            http_get {
              port = 8000
              path = "/health"
            }
          }

          liveness_probe {
            initial_delay_seconds = 20
            period_seconds        = 30

            http_get {
              port = 8000
              path = "/health"
            }
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [
      # KYVERNO_LIFECYCLE_V1
      spec[0].template[0].spec[0].dns_config,
      # KEEL_IGNORE_IMAGE: Keel rolls :latest; don't let TF revert the digest.
      spec[0].template[0].spec[0].container[0].image,
    ]
  }
}
|
||||
|
||||
# In-cluster ClusterIP Service that the ingress below fronts. The agent has no
# /metrics endpoint, so the Service is kept out of the annotation scrape set
# (Pipecat metrics are internal only).
resource "kubernetes_service" "realtime" {
  metadata {
    namespace = kubernetes_namespace.portal_realtime.metadata[0].name
    name      = "portal-realtime"
    labels    = local.labels
  }

  spec {
    type     = "ClusterIP"
    selector = { app = "portal-realtime" }

    # Service port 8000 forwards to the same port on the selected pods.
    port {
      name        = "http"
      target_port = 8000
      port        = 8000
    }
  }
}
|
||||
|
||||
# Public Cloudflare ingress for wss://portal-realtime.viktorbarzin.me/ws.
#  - Traefik upgrades WebSocket on the standard HTTP router, so no special
#    annotation is needed; the entrypoint's writeTimeout=0 keeps long-lived
#    streams open.
#  - tls-secret is Kyverno-synced into the namespace.
#  - max_body_size is deliberately NOT set: a Buffering middleware would break
#    the streaming WebSocket.
module "ingress" {
  source = "../../modules/kubernetes/ingress_factory"

  name      = "portal-realtime"
  namespace = kubernetes_namespace.portal_realtime.metadata[0].name

  service_name = kubernetes_service.realtime.metadata[0].name
  port         = 8000

  tls_secret_name = "tls-secret"
  dns_type        = "proxied"

  # auth = "app" because the agent enforces its own DEVICE_TOKEN edge gate on
  # /ws. Authentik would break the native Portal client, which has no browser
  # login.
  auth = "app"
}
|
||||
17
stacks/portal-realtime/terragrunt.hcl
Normal file
17
stacks/portal-realtime/terragrunt.hcl
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
# Pull in the configuration located by find_in_parent_folders().
include "root" {
  path = find_in_parent_folders()
}
|
||||
|
||||
# Declares a dependency on the sibling ../platform stack. skip_outputs = true
# means none of its outputs are read here.
dependency "platform" {
  skip_outputs = true
  config_path  = "../platform"
}
|
||||
|
||||
# portal-realtime — the v2 full-duplex voice agent (Pipecat). One persistent
|
||||
# WebSocket per conversation: continuous mic audio -> Silero VAD turn-taking ->
|
||||
# Whisper STT (portal-stt) -> streaming Claude brain (claude-agent-service) ->
|
||||
# edge-tts (portal-tts) -> audio out, with barge-in. Reuses all three upstream
|
||||
# cluster services; nothing new is spun up. Public Cloudflare ingress (proxied,
|
||||
# WebSocket) with the app's own DEVICE_TOKEN as the edge gate. Sibling to
|
||||
# portal-assistant (the v1 tap-to-talk gateway, still live). This stack is
|
||||
# Phase 3 of the portal-assistant realtime work.
|
||||
Loading…
Add table
Add a link
Reference in a new issue