infra/stacks/tts/main.tf
Viktor Barzin 48013a4a92 feat(tts): Chatterbox TTS stack + off-peak T4 gate, wire tripit narration [ci skip]
New `infra/stacks/tts/` deploys devnen/Chatterbox-TTS-Server (OpenAI-compatible
/v1/audio/speech) as ClusterIP `chatterbox-tts.tts.svc:8000` (server listens on
8004; Service remaps), requesting ONE T4 time-slice. Mirrors stacks/llama-cpp/.

Option A off-peak control (no VRAM isolation on the time-sliced T4 — see
post-mortem 2026-06-02): Deployment sits at replicas=0; three Europe/London
CronJobs own the replica count — `chatterbox-window-up` scales to 1 at 02:00
ONLY IF a free-VRAM preflight passes (sum gpu_pod_memory_used_bytes from
gpu-pod-exporter; free = 16GiB - used >= floor), `chatterbox-vram-guard` yields
the card mid-window if a resident wakes, `chatterbox-window-down` scales to 0 at
06:00. tripit's bake is best-effort + cached-forever (ADR-0002/0004) so a
skipped/aborted window backfills next time. SA+Role+RoleBinding grant the
CronJobs deployments/scale (nextcloud-watchdog pattern).

Polite-tenant hardening: kyverno `inject-gpu-workload-priority` now excludes the
`tts` namespace (new `gpu_priority_excluded_namespaces` local) so Chatterbox
keeps tier-2-gpu priority (600k) and is always evicted first under GPU pressure
— never immich-ml/frigate/llama-swap. The LimitRange-fallback policy still uses
the base exclude list (tts untouched there).

tripit: add TTS_MODE=openai_compatible, TTS_BASE_URL, TTS_MODEL=chatterbox to
local.app_env (no token — ClusterIP only). No tripit code change.

Image build is documented in stacks/tts/README.md (devnen cu128 target ->
forgejo.viktorbarzin.me/viktor/chatterbox-tts) — build is impractical inline
(large CUDA image + needs the upstream repo). NOT APPLIED — review branch only.
Free-VRAM floor (var.vram_free_floor_bytes, default 6GiB) must be set from the
measured chatterbox-multilingual T4 peak during the first bake.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 21:21:39 +00:00

474 lines
18 KiB
HCL
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Tag appended to the chatterbox-tts image reference built in local.image.
variable "image_tag" {
  description = "chatterbox-tts image tag. Use the 8-char git SHA in CI; :latest for local trials."
  type        = string
  default     = "latest"
}
# ─────────────────────────────────────────────────────────────────────────────
# Option-A off-peak control (see docs/plans/2026-06-08-chatterbox-tts-infra.md §3).
# The Deployment sits at replicas=0; a CronJob scales it to 1 at the window start
# ONLY IF a free-VRAM preflight passes, and another scales it back to 0 at window
# end. A guard CronJob yields the card mid-window if free VRAM drops below the
# floor (a resident woke up). tripit's bake is best-effort + idempotent, so a
# skipped/aborted window simply backfills on the next one (ADR-0002/0004).
# ─────────────────────────────────────────────────────────────────────────────
variable "vram_free_floor_bytes" {
  description = "Minimum free GPU VRAM (bytes) required before scaling Chatterbox up; guard yields below it."
  type        = number

  # OPEN ITEM: this value is a placeholder until it is measured (§5 smoke test /
  # §3.X). It is used twice by the gate script: as the minimum free VRAM the
  # window-up preflight demands, and as the threshold below which the guard
  # scales Chatterbox back to 0.
  #
  # 6 GiB = a conservative ~4 GiB guess for the chatterbox-multilingual FP16
  # peak, plus ~2 GiB of headroom for the read -> cudaMalloc race. Adjust once
  # the real T4 peak has been read from
  # gpu_pod_memory_used_bytes{namespace="tts"} during an actual synth.
  default = 6442450944
}
variable "gpu_total_bytes" {
  description = "Total VRAM on the shared GPU. Free = this minus sum(gpu_pod_memory_used_bytes)."
  type        = number

  # 16 GiB, the nominal capacity of a Tesla T4.
  # NOTE(review): the driver usually reports somewhat less than the nominal
  # 16 GiB as usable on a T4 — confirm against nvidia-smi on the node and lower
  # this if the computed "free" figure should not be optimistic.
  default = 17179869184
}
variable "offpeak_window_up_schedule" {
  description = "Cron schedule that fires the free-VRAM preflight + scale-up at window start."
  type        = string

  # 02:00; interpreted in the timezone set on the CronJob (Europe/London).
  default = "0 2 * * *"
}
variable "offpeak_window_down_schedule" {
  description = "Cron schedule that scales Chatterbox back to 0 at window end."
  type        = string

  # 06:00; interpreted in the timezone set on the CronJob (Europe/London).
  default = "0 6 * * *"
}
variable "offpeak_guard_schedule" {
  description = "Cron schedule for the mid-window guard that yields the card if free VRAM drops."
  type        = string

  # Every 5 minutes inside the 02:00-06:00 window: first run at 02:00, last
  # run at 05:55 (hours 2 through 5 inclusive).
  default = "*/5 2-5 * * *"
}
locals {
  namespace = "tts"
  labels    = { app = "chatterbox-tts" }
  image     = "forgejo.viktorbarzin.me/viktor/chatterbox-tts:${var.image_tag}"

  # config.yaml rendered into a ConfigMap, mounted at /app/config.yaml (the
  # server's WORKDIR is /app). Voices, reference audio and the HF model cache
  # all live on the NFS-SSD PVC (mounted at /data) so weights persist across
  # restarts and load fast. server.port stays at the devnen default 8004; the
  # Service remaps 8000->8004 so tripit's default TTS_BASE_URL works unchanged.
  #
  # model.repo_id = chatterbox-multilingual (ADR-0004; 23 languages for
  # worldwide place-names). If the measured T4 VRAM peak is too high to coexist
  # even off-peak, fall back to "chatterbox" (English, lighter) — a one-line
  # change here (§3.X / §6 decision 3).
  chatterbox_config = yamlencode({
    server = {
      host = "0.0.0.0"
      port = 8004
    }
    model = {
      repo_id = "chatterbox-multilingual"
    }
    tts_engine = {
      device                 = "cuda"
      predefined_voices_path = "/data/voices"
      reference_audio_path   = "/data/reference_audio"
    }
  })

  # Shared script for the off-peak CronJobs. It scrapes the in-cluster
  # gpu_pod_memory_used_bytes gauge (the per-namespace gauge the 2026-06-02
  # post-mortem built — host-PID attribution, no new exporter needed), sums the
  # usage of every tenant EXCEPT Chatterbox itself, and computes
  # free = GPU_TOTAL - used. It needs bash, curl, awk and kubectl in the
  # CronJob image. ACTION is "up" | "down" | "guard":
  #   up    — scale to 1 ONLY IF free >= FLOOR (positive admission).
  #   guard — scale to 0 IF free < FLOOR (a resident woke mid-window; yield).
  #   down  — scale to 0 unconditionally (window end).
  # Any other ACTION is a hard error (exit 2) rather than a silent no-op.
  #
  # Why Chatterbox's own series are excluded: FLOOR is sized as "Chatterbox
  # peak + headroom". If the guard counted Chatterbox's own allocation as
  # "used", a window admitted with free only just above FLOOR would be torn
  # down by the first guard run after the model loaded, even though no resident
  # woke up. Excluding the tts namespace makes up and guard apply the same test.
  #
  # Heredoc escaping: only `$${...}` (literal `${...}`) is escaped — Terraform
  # would otherwise try to interpolate it. Bare `$(...)`, `$((...))` and awk's
  # `$NF` / `$0` are literal `$` and pass through unescaped.
  vram_gate_script = <<-EOT
    set -eu
    : "$${ACTION:?}" "$${FLOOR:?}" "$${GPU_TOTAL:?}"
    METRICS_URL="http://gpu-pod-exporter.nvidia.svc.cluster.local:80/metrics"
    # Series carrying this label pair are Chatterbox's own usage and are left
    # out of the sum. Assumes the exporter labels each series with
    # namespace="<ns>"; if it does not, nothing is excluded and every series
    # is summed.
    OWN_LABEL='namespace="tts"'
    # -f turns a non-2xx scrape into a failure. A failed scrape is handled
    # per ACTION: "up" refuses to scale (fail-safe); "down" still scales to 0;
    # "guard" continues with an empty body, i.e. used=0, and therefore reports
    # OK because it cannot prove pressure.
    if ! BODY="$(curl -sf -m 10 "$${METRICS_URL}")"; then
      echo "WARN: could not scrape $${METRICS_URL}"
      if [ "$${ACTION}" = "up" ]; then
        echo "preflight: scrape failed -> NOT scaling up (fail-safe)"; exit 0
      fi
      BODY=""
    fi
    # A successful scrape with no matching series also yields used=0 (card
    # idle). %.0f instead of %d: byte counts exceed 2^31 and some awk builds
    # clamp %d to a 32-bit int.
    USED="$(printf '%s\n' "$${BODY}" \
      | awk -v own="$${OWN_LABEL}" '
          /^gpu_pod_memory_used_bytes\{/ && index($0, own) == 0 { s += $NF }
          END { printf "%.0f", s }')"
    USED="$${USED:-0}"
    FREE="$(( GPU_TOTAL - USED ))"
    echo "GPU VRAM (excluding tts): used=$${USED} free=$${FREE} floor=$${FLOOR} (total=$${GPU_TOTAL})"
    case "$${ACTION}" in
      up)
        if [ "$${FREE}" -ge "$${FLOOR}" ]; then
          echo "preflight PASS: free >= floor -> scaling chatterbox-tts to 1"
          kubectl -n tts scale deploy/chatterbox-tts --replicas=1
        else
          echo "preflight SKIP: free < floor -> leaving chatterbox-tts at 0 (retry next window)"
        fi
        ;;
      guard)
        if [ "$${FREE}" -lt "$${FLOOR}" ]; then
          echo "guard TRIP: free < floor -> yielding the card, scaling chatterbox-tts to 0"
          kubectl -n tts scale deploy/chatterbox-tts --replicas=0
        else
          echo "guard OK: free >= floor -> chatterbox-tts may keep running"
        fi
        ;;
      down)
        echo "window end -> scaling chatterbox-tts to 0"
        kubectl -n tts scale deploy/chatterbox-tts --replicas=0
        ;;
      *)
        echo "ERROR: unknown ACTION '$${ACTION}' (expected up|down|guard)" >&2
        exit 2
        ;;
    esac
  EOT

  # Common spec for the three off-peak CronJobs. Each runs one bitnami/kubectl
  # pod (in-cluster SA, no kubeconfig) executing the shared gate script with a
  # different ACTION. The CronJob's timezone pins the window to Europe/London
  # regardless of node TZ.
  offpeak_cronjobs = {
    chatterbox-window-up = {
      schedule = var.offpeak_window_up_schedule
      action   = "up"
    }
    chatterbox-window-down = {
      schedule = var.offpeak_window_down_schedule
      action   = "down"
    }
    chatterbox-vram-guard = {
      schedule = var.offpeak_guard_schedule
      action   = "guard"
    }
  }
}
# Namespace that holds every object in this stack.
resource "kubernetes_namespace" "tts" {
  metadata {
    name = local.namespace

    labels = {
      "keel.sh/enrolled" = "true"
      "istio-injection"  = "disabled"
      tier               = local.tiers.gpu
    }
  }

  lifecycle {
    # This one label is written outside Terraform; never plan a diff for it.
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
# NFS-SSD backed volume for model weights, the HF cache and voices. It is RWX
# so that a seed Job or `kubectl cp` can write the predefined voices and the
# narrator reference WAV while the Deployment has it mounted. The export lives
# at /srv/nfs-ssd/chatterbox on the Proxmox host and mirrors llama-cpp's
# nfs_models. The first start downloads the model into /data/hf_cache (see
# HF_HOME on the Deployment), so weights survive pod restarts.
module "nfs_models" {
  source = "../../modules/kubernetes/nfs_volume"

  namespace = kubernetes_namespace.tts.metadata[0].name
  name      = "chatterbox-models"

  nfs_server = "192.168.1.127"
  nfs_path   = "/srv/nfs-ssd/chatterbox"

  # Sized for multilingual weights + HF cache + voices, with headroom.
  storage = "20Gi"
}
# Carries the rendered server config (local.chatterbox_config) under the key
# config.yaml; the Deployment mounts that key at /app/config.yaml.
resource "kubernetes_config_map" "chatterbox_config" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-config"
    labels    = local.labels
  }

  data = {
    "config.yaml" = local.chatterbox_config
  }
}
# Single Deployment running the devnen Chatterbox-TTS-Server (OpenAI-compatible
# /v1/audio/speech). Sits at replicas=0 — the off-peak CronJobs below scale it
# to 1 only when the free-VRAM preflight passes (Option A), and back to 0 at
# window end. wait_for_rollout=false so apply never blocks on a pod that is
# intentionally scaled to 0.
resource "kubernetes_deployment" "chatterbox" {
  metadata {
    name      = "chatterbox-tts"
    namespace = kubernetes_namespace.tts.metadata[0].name
    labels    = merge(local.labels, { tier = local.tiers.gpu })
  }

  wait_for_rollout = false

  spec {
    # Off-peak control owns the replica count at runtime (CronJobs scale 0<->1).
    # Declare 0 here so a plain `tg apply` outside the window doesn't wake the
    # card. ignore_changes on replicas (below) stops apply from fighting the
    # CronJob's scale.
    replicas = 0

    strategy { type = "Recreate" }

    selector {
      match_labels = { app = "chatterbox-tts" }
    }

    template {
      metadata {
        labels = { app = "chatterbox-tts" }
        annotations = {
          # Roll the pod whenever the rendered config changes.
          "checksum/config" = sha256(local.chatterbox_config)
        }
      }

      spec {
        node_selector = { "nvidia.com/gpu.present" = "true" }

        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }

        # C-hardening (§3.RECOMMENDATION.3): Chatterbox is a polite, best-effort
        # batch tenant — give it the regular tier-2-gpu priority (600000) so it
        # is ALWAYS the pod evicted under GPU-node pressure, never immich-ml /
        # frigate / llama-swap. This relies on the `tts` namespace being EXCLUDED
        # from the Kyverno `inject-gpu-workload-priority` policy (which would
        # otherwise stamp the immich-equal gpu-workload=1,200,000 priority on any
        # nvidia.com/gpu pod). That exclusion is the two-line edit to the kyverno
        # stack flagged in the PR. Without it, this priority_class_name is
        # overwritten on pod CREATE and Chatterbox would compete as an equal.
        priority_class_name = "tier-2-gpu"

        image_pull_secrets { name = "registry-credentials" }

        container {
          name  = "chatterbox-tts"
          image = local.image

          port {
            container_port = 8004
            name           = "http"
          }

          # T4 is Turing — NO bf16 (ADR-0004). Pin off; run FP16/FP32.
          env {
            name  = "TTS_BF16"
            value = "off"
          }

          # Park the HuggingFace cache on the NFS-SSD PVC so model weights
          # download once and persist across pod restarts (the pod is recreated
          # every window). The devnen compose mounts HF cache at /app/hf_cache;
          # point HF_HOME at the PVC instead.
          env {
            name  = "HF_HOME"
            value = "/data/hf_cache"
          }
          env {
            name  = "HF_HUB_CACHE"
            value = "/data/hf_cache"
          }

          volume_mount {
            name       = "config"
            mount_path = "/app/config.yaml"
            sub_path   = "config.yaml"
          }
          volume_mount {
            name       = "models"
            mount_path = "/data"
          }

          # /v1/audio/voices is cheap and only 200s once the model is loaded,
          # so all three probes use it.
          #
          # startup_probe: the first start downloads the model, which is slow.
          # Until this probe succeeds the kubelet runs neither the liveness nor
          # the readiness probe, so a long download cannot get the container
          # killed and restarted mid-download. Budget: 180 x 10s = 30 minutes.
          # Without it, the liveness probe below would restart the container
          # roughly 120s + 5 x 30s = 270s after start if the model was not yet
          # loaded.
          startup_probe {
            http_get {
              path = "/v1/audio/voices"
              port = 8004
            }
            period_seconds    = 10
            failure_threshold = 180
          }

          # Readiness only gates Service traffic; it never restarts the pod.
          readiness_probe {
            http_get {
              path = "/v1/audio/voices"
              port = 8004
            }
            initial_delay_seconds = 20
            period_seconds        = 15
            failure_threshold     = 12
          }

          # Liveness restarts a server that stops answering after it has
          # started (it only begins once the startup probe has passed).
          liveness_probe {
            http_get {
              path = "/v1/audio/voices"
              port = 8004
            }
            initial_delay_seconds = 120
            period_seconds        = 30
            failure_threshold     = 5
          }

          resources {
            requests = {
              cpu    = "200m"
              memory = "2Gi"
            }
            limits = {
              memory           = "8Gi"
              "nvidia.com/gpu" = "1" # ONE time-slice (operator advertises 100), NOT the whole card
            }
          }
        }

        volume {
          name = "config"
          config_map {
            name = kubernetes_config_map.chatterbox_config.metadata[0].name
          }
        }
        volume {
          name = "models"
          persistent_volume_claim {
            claim_name = module.nfs_models.claim_name
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [
      # Off-peak CronJobs own the replica count — don't let apply reset it.
      spec[0].replicas,
      spec[0].template[0].spec[0].dns_config,         # KYVERNO_LIFECYCLE_V1
      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE
      metadata[0].annotations["keel.sh/match-tag"],
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
      metadata[0].annotations["kubernetes.io/change-cause"],
      metadata[0].annotations["deployment.kubernetes.io/revision"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"],
    ]
  }
}
# In-cluster front door for the TTS server: clients connect to port 8000 and
# are forwarded to the container's 8004.
resource "kubernetes_service" "chatterbox" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-tts"
    labels    = local.labels

    # Annotation-based Prometheus scrape, mirroring tripit. The devnen server
    # exposes no /metrics, so the path points at the voices endpoint as a
    # liveness signal and keeps the Service in the scrape set should a
    # /metrics endpoint appear later.
    # NOTE(review): whether port 8000 is correct depends on whether the scrape
    # job targets the Service address or the pod IP (the pod listens on 8004)
    # — confirm against the Prometheus relabel config.
    annotations = {
      "prometheus.io/port"   = "8000"
      "prometheus.io/path"   = "/v1/audio/voices"
      "prometheus.io/scrape" = "true"
    }
  }

  spec {
    # ClusterIP only: the Service is never ingressed, so no token is needed.
    type     = "ClusterIP"
    selector = { app = "chatterbox-tts" }

    port {
      name        = "http"
      target_port = 8004 # port the devnen server actually listens on
      port        = 8000 # tripit's default TTS_BASE_URL port
    }
  }
}
# ─────────────────────────────────────────────────────────────────────────────
# Option-A off-peak control: SA + Role (scale the Deployment) + RoleBinding +
# three CronJobs (window-up preflight, mid-window guard, window-down). Mirrors
# the nextcloud-watchdog in-cluster-kubectl pattern (SA → Role → bitnami/kubectl
# CronJob, no kubeconfig).
# ─────────────────────────────────────────────────────────────────────────────
# Identity the off-peak CronJob pods run as; its permissions come from the
# chatterbox-offpeak Role via the RoleBinding.
resource "kubernetes_service_account" "offpeak" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-offpeak"
  }
}
# Least-privilege Role for the off-peak gate. `kubectl scale deploy/<name>`
# first GETs the Deployment and then PATCHes its scale subresource, so those
# are the only verbs granted, and only on the one Deployment the gate manages.
# In particular the gate cannot patch the Deployment object itself (image, pod
# template, volumes).
resource "kubernetes_role" "offpeak" {
  metadata {
    name      = "chatterbox-offpeak"
    namespace = kubernetes_namespace.tts.metadata[0].name
  }

  # Read-only access to the Deployment object (kubectl resolves it by name).
  rule {
    api_groups     = ["apps"]
    resources      = ["deployments"]
    resource_names = [kubernetes_deployment.chatterbox.metadata[0].name]
    verbs          = ["get"]
  }

  # get + patch on the scale subresource is what actually changes replicas.
  rule {
    api_groups     = ["apps"]
    resources      = ["deployments/scale"]
    resource_names = [kubernetes_deployment.chatterbox.metadata[0].name]
    verbs          = ["get", "patch"]
  }
}
# Binds the chatterbox-offpeak Role to the chatterbox-offpeak ServiceAccount
# inside the tts namespace.
resource "kubernetes_role_binding" "offpeak" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-offpeak"
  }

  subject {
    kind      = "ServiceAccount"
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = kubernetes_service_account.offpeak.metadata[0].name
  }

  role_ref {
    kind      = "Role"
    api_group = "rbac.authorization.k8s.io"
    name      = kubernetes_role.offpeak.metadata[0].name
  }
}
# One CronJob per entry in local.offpeak_cronjobs (window-up, window-down,
# vram-guard). Each runs the shared gate script with its own ACTION, under the
# chatterbox-offpeak ServiceAccount.
resource "kubernetes_cron_job_v1" "offpeak" {
  for_each = local.offpeak_cronjobs

  metadata {
    name      = each.key
    namespace = kubernetes_namespace.tts.metadata[0].name
    labels    = local.labels
  }

  spec {
    schedule                      = each.value.schedule
    timezone                      = "Europe/London"
    concurrency_policy            = "Forbid"
    starting_deadline_seconds     = 120
    successful_jobs_history_limit = 1
    failed_jobs_history_limit     = 3

    job_template {
      # Jobs and their pods deliberately do NOT carry app=chatterbox-tts: the
      # chatterbox-tts Service selects on exactly that label, so a gate pod
      # wearing it would be added to the Service's endpoints while it runs,
      # and TTS requests could be routed to a pod with nothing listening on
      # 8004.
      metadata {
        labels = { app = "chatterbox-offpeak" }
      }

      spec {
        backoff_limit              = 1
        active_deadline_seconds    = 120
        ttl_seconds_after_finished = 300

        template {
          metadata {
            labels = { app = "chatterbox-offpeak" }
          }

          spec {
            service_account_name = kubernetes_service_account.offpeak.metadata[0].name
            restart_policy       = "Never"

            container {
              name    = "vram-gate"
              image   = "bitnami/kubectl:latest"
              command = ["/bin/bash", "-c", local.vram_gate_script]

              # Inputs the gate script requires (it aborts if any is unset).
              env {
                name  = "ACTION"
                value = each.value.action
              }
              env {
                name  = "FLOOR"
                value = tostring(var.vram_free_floor_bytes)
              }
              env {
                name  = "GPU_TOTAL"
                value = tostring(var.gpu_total_bytes)
              }

              resources {
                requests = { cpu = "20m", memory = "64Mi" }
                limits   = { memory = "128Mi" }
              }
            }
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno mutates dns_config with ndots=2 on CronJobs.
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}