New `infra/stacks/tts/` deploys devnen/Chatterbox-TTS-Server (OpenAI-compatible /v1/audio/speech) as ClusterIP `chatterbox-tts.tts.svc:8000` (server listens on 8004; Service remaps), requesting ONE T4 time-slice. Mirrors stacks/llama-cpp/. Option A off-peak control (no VRAM isolation on the time-sliced T4 — see post-mortem 2026-06-02): Deployment sits at replicas=0; three Europe/London CronJobs own the replica count — `chatterbox-window-up` scales to 1 at 02:00 ONLY IF a free-VRAM preflight passes (sum gpu_pod_memory_used_bytes from gpu-pod-exporter; free = 16GiB - used >= floor), `chatterbox-vram-guard` yields the card mid-window if a resident wakes, `chatterbox-window-down` scales to 0 at 06:00. tripit's bake is best-effort + cached-forever (ADR-0002/0004) so a skipped/aborted window backfills next time. SA+Role+RoleBinding grant the CronJobs deployments/scale (nextcloud-watchdog pattern). Polite-tenant hardening: kyverno `inject-gpu-workload-priority` now excludes the `tts` namespace (new `gpu_priority_excluded_namespaces` local) so Chatterbox keeps tier-2-gpu priority (600k) and is always evicted first under GPU pressure — never immich-ml/frigate/llama-swap. The LimitRange-fallback policy still uses the base exclude list (tts untouched there). tripit: add TTS_MODE=openai_compatible, TTS_BASE_URL, TTS_MODEL=chatterbox to local.app_env (no token — ClusterIP only). No tripit code change. Image build is documented in stacks/tts/README.md (devnen cu128 target -> forgejo.viktorbarzin.me/viktor/chatterbox-tts) — build is impractical inline (large CUDA image + needs the upstream repo). NOT APPLIED — review branch only. Free-VRAM floor (var.vram_free_floor_bytes, default 6GiB) must be set from the measured chatterbox-multilingual T4 peak during the first bake. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
474 lines
18 KiB
HCL
474 lines
18 KiB
HCL
# Tag appended to the chatterbox-tts image reference built in `local.image`.
variable "image_tag" {
  description = "chatterbox-tts image tag. Use the 8-char git SHA in CI; :latest for local trials."
  type        = string
  default     = "latest"
}
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Option-A off-peak control (see docs/plans/2026-06-08-chatterbox-tts-infra.md §3).
|
||
# The Deployment sits at replicas=0; a CronJob scales it to 1 at the window start
|
||
# ONLY IF a free-VRAM preflight passes, and another scales it back to 0 at window
|
||
# end. A guard CronJob yields the card mid-window if free VRAM drops below the
|
||
# floor (a resident woke up). tripit's bake is best-effort + idempotent, so a
|
||
# skipped/aborted window simply backfills on the next one (ADR-0002/0004).
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Free-VRAM floor shared by the window-up preflight and the mid-window guard.
#
# OPEN ITEM — the default is a placeholder until the real peak is measured
# (§5 smoke test / §3.X). 6 GiB ≈ a conservative guess of ~4 GiB for the
# chatterbox-multilingual FP16 peak plus ~2 GiB of headroom for the
# read→cudaMalloc race. Adjust once the T4 peak has been captured from
# gpu_pod_memory_used_bytes{namespace="tts"} during a real synth.
variable "vram_free_floor_bytes" {
  description = "Minimum free GPU VRAM (bytes) required before scaling Chatterbox up; guard yields below it."
  type        = number
  default     = 6442450944 # 6 GiB
}
|
||
|
||
# Capacity of the shared card; the gate script subtracts the summed per-pod
# usage from this value to obtain free VRAM.
variable "gpu_total_bytes" {
  description = "Total VRAM on the shared GPU. Free = this minus sum(gpu_pod_memory_used_bytes)."
  type        = number
  default     = 17179869184 # Tesla T4 = 16 GiB
}
|
||
|
||
# Window start. The CronJob that consumes this pins its timezone to
# Europe/London, so the default means 02:00 London time.
variable "offpeak_window_up_schedule" {
  description = "Cron schedule that fires the free-VRAM preflight + scale-up at window start."
  type        = string
  default     = "0 2 * * *"
}
|
||
|
||
# Window end. Evaluated in Europe/London by the consuming CronJob, so the
# default means 06:00 London time.
variable "offpeak_window_down_schedule" {
  description = "Cron schedule that scales Chatterbox back to 0 at window end."
  type        = string
  default     = "0 6 * * *"
}
|
||
|
||
# Mid-window guard cadence. The default fires every 5 minutes during hours
# 02–05, i.e. inside the default 02:00–06:00 window; keep it aligned with the
# window-up / window-down schedules if those are changed.
variable "offpeak_guard_schedule" {
  description = "Cron schedule for the mid-window guard that yields the card if free VRAM drops."
  type        = string
  default     = "*/5 2-5 * * *"
}
|
||
|
||
locals {
  namespace = "tts"
  labels    = { app = "chatterbox-tts" }
  image     = "forgejo.viktorbarzin.me/viktor/chatterbox-tts:${var.image_tag}"

  # config.yaml rendered into a ConfigMap, mounted at /app/config.yaml (the
  # server's WORKDIR is /app). Voices, reference audio and the HF model cache
  # all live on the NFS-SSD PVC (mounted at /data) so weights persist across
  # restarts and load fast. server.port stays at the devnen default 8004; the
  # Service remaps 8000->8004 so tripit's default TTS_BASE_URL works unchanged.
  #
  # model.repo_id = chatterbox-multilingual (ADR-0004; 23 languages for
  # worldwide place-names). If the measured T4 VRAM peak is too high to coexist
  # even off-peak, fall back to "chatterbox" (English, lighter) — a one-line
  # change here (§3.X / §6 decision 3).
  chatterbox_config = yamlencode({
    server = {
      host = "0.0.0.0"
      port = 8004
    }
    model = {
      repo_id = "chatterbox-multilingual"
    }
    tts_engine = {
      device                 = "cuda"
      predefined_voices_path = "/data/voices"
      reference_audio_path   = "/data/reference_audio"
    }
  })

  # Shared script for the off-peak CronJobs. Reads the in-cluster
  # gpu_pod_memory_used_bytes gauge (the per-namespace gauge the 2026-06-02
  # post-mortem built — host-PID attribution, no new exporter needed), sums it,
  # and computes free = GPU_TOTAL - used. The script assumes the CronJob
  # container image provides bash, curl, awk and kubectl.
  # ACTION is "up" | "down" | "guard"; anything else is rejected with exit 2.
  #   up    — scale to 1 ONLY IF free >= FLOOR (positive admission).
  #   guard — scale to 0 IF free < FLOOR (a resident woke mid-window; yield).
  #           Chatterbox's own series (namespace="tts") are EXCLUDED from the
  #           sum here, so the guard compares the same quantity the preflight
  #           admitted on (VRAM left by everyone else) and does not trip on
  #           Chatterbox's own footprint. A failed scrape also yields: a guard
  #           that cannot see the card must not claim it is safe to keep it.
  #   down  — scale to 0 unconditionally (window end).
  # Heredoc escaping: only `$${...}` (literal `${...}`) is escaped — Terraform
  # would otherwise try to interpolate it. Bare `$(...)`, `$((...))` and awk's
  # `$NF` / `$0` are literal `$` and pass through unescaped.
  vram_gate_script = <<-EOT
    set -eu
    : "$${ACTION:?}" "$${FLOOR:?}" "$${GPU_TOTAL:?}"
    METRICS_URL="http://gpu-pod-exporter.nvidia.svc.cluster.local:80/metrics"

    # Fail loudly on a misconfigured ACTION instead of exiting 0 having done
    # nothing.
    case "$${ACTION}" in
      up|down|guard) ;;
      *) echo "ERROR: unknown ACTION='$${ACTION}' (expected up|down|guard)" >&2; exit 2 ;;
    esac

    # Scrape the exporter. -f turns a non-200 response into a hard error, and
    # every action then takes its conservative path:
    #   up    -> do not admit,
    #   guard -> yield the card,
    #   down  -> carry on (scaling to 0 needs no metrics).
    if ! BODY="$(curl -sf -m 10 "$${METRICS_URL}")"; then
      echo "WARN: could not scrape $${METRICS_URL}"
      case "$${ACTION}" in
        up)
          echo "preflight: scrape failed -> NOT scaling up (fail-safe)"; exit 0
          ;;
        guard)
          echo "guard: scrape failed -> free VRAM unknown, yielding the card (fail-safe)"
          kubectl -n tts scale deploy/chatterbox-tts --replicas=0
          exit 0
          ;;
      esac
      BODY=""
    fi

    # Sum gpu_pod_memory_used_bytes across pods. A successful scrape with no
    # matching series => used=0 (card idle). In guard mode the tts namespace's
    # own series are skipped (see the ACTION notes above the script).
    # %.0f rather than %d: byte counts exceed 2^31 and some awk builds clamp %d.
    USED="$(printf '%s\n' "$${BODY}" \
      | awk -v action="$${ACTION}" '
          /^gpu_pod_memory_used_bytes\{/ {
            if (action == "guard" && $0 ~ /namespace="tts"/) next
            used += $NF
          }
          END { printf "%.0f", used }')"
    USED="$${USED:-0}"
    FREE="$(( GPU_TOTAL - USED ))"
    echo "GPU VRAM: used=$${USED} free=$${FREE} floor=$${FLOOR} (total=$${GPU_TOTAL})"

    case "$${ACTION}" in
      up)
        if [ "$${FREE}" -ge "$${FLOOR}" ]; then
          echo "preflight PASS: free >= floor -> scaling chatterbox-tts to 1"
          kubectl -n tts scale deploy/chatterbox-tts --replicas=1
        else
          echo "preflight SKIP: free < floor -> leaving chatterbox-tts at 0 (retry next window)"
        fi
        ;;
      guard)
        if [ "$${FREE}" -lt "$${FLOOR}" ]; then
          echo "guard TRIP: free < floor -> yielding the card, scaling chatterbox-tts to 0"
          kubectl -n tts scale deploy/chatterbox-tts --replicas=0
        else
          echo "guard OK: free >= floor -> chatterbox-tts may keep running"
        fi
        ;;
      down)
        echo "window end -> scaling chatterbox-tts to 0"
        kubectl -n tts scale deploy/chatterbox-tts --replicas=0
        ;;
    esac
  EOT

  # Common spec for the three off-peak CronJobs. Each runs one bitnami/kubectl
  # pod (in-cluster SA, no kubeconfig) executing the shared gate script with a
  # different ACTION. timezone pins the window to Europe/London regardless of
  # node TZ.
  offpeak_cronjobs = {
    chatterbox-window-up = {
      schedule = var.offpeak_window_up_schedule
      action   = "up"
    }
    chatterbox-window-down = {
      schedule = var.offpeak_window_down_schedule
      action   = "down"
    }
    chatterbox-vram-guard = {
      schedule = var.offpeak_guard_schedule
      action   = "guard"
    }
  }
}
|
||
|
||
# Namespace that holds the Chatterbox Deployment, Service and off-peak CronJobs.
resource "kubernetes_namespace" "tts" {
  metadata {
    name = local.namespace

    labels = {
      "istio-injection"  = "disabled"
      "keel.sh/enrolled" = "true"
      tier               = local.tiers.gpu
    }
  }

  lifecycle {
    # This label is not declared here; ignore it so apply does not strip it
    # when something else sets it on the namespace.
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
|
||
|
||
# Model weights + voices on NFS-SSD (fast load), RWX so a seed Job / kubectl cp
|
||
# can write the predefined voices + narrator reference WAV while the Deployment
|
||
# mounts it. Path /srv/nfs-ssd/chatterbox on the Proxmox host. Mirrors
|
||
# llama-cpp's nfs_models. First start downloads the model into /data/hf_cache
|
||
# (HF_HOME below), so weights persist across pod restarts.
|
||
# NFS-backed volume that the Deployment mounts at /data (model cache, voices,
# reference audio).
module "nfs_models" {
  source = "../../modules/kubernetes/nfs_volume"

  name      = "chatterbox-models"
  namespace = kubernetes_namespace.tts.metadata[0].name

  nfs_server = "192.168.1.127"
  nfs_path   = "/srv/nfs-ssd/chatterbox"

  # Sized for the multilingual weights + HF cache + voices headroom.
  storage = "20Gi"
}
|
||
|
||
# Publishes the rendered server configuration (local.chatterbox_config) under
# the single key "config.yaml".
resource "kubernetes_config_map" "chatterbox_config" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-config"
    labels    = local.labels
  }

  data = { "config.yaml" = local.chatterbox_config }
}
|
||
|
||
# Single Deployment running the devnen Chatterbox-TTS-Server (OpenAI-compatible
|
||
# /v1/audio/speech). Sits at replicas=0 — the off-peak CronJobs below scale it
|
||
# to 1 only when the free-VRAM preflight passes (Option A), and back to 0 at
|
||
# window end. wait_for_rollout=false so apply never blocks on a pod that is
|
||
# intentionally scaled to 0.
|
||
resource "kubernetes_deployment" "chatterbox" {
  metadata {
    name      = "chatterbox-tts"
    namespace = kubernetes_namespace.tts.metadata[0].name
    labels    = merge(local.labels, { tier = local.tiers.gpu })
  }

  # The Deployment is intentionally declared at 0 replicas, so apply must not
  # wait for a rollout.
  wait_for_rollout = false

  spec {
    # Off-peak control owns the replica count at runtime (CronJobs scale 0<->1).
    # Declare 0 here so a plain `tg apply` outside the window doesn't wake the
    # card. ignore_changes on replicas (below) stops apply from fighting the
    # CronJob's scale.
    replicas = 0

    strategy {
      type = "Recreate"
    }

    selector {
      match_labels = { app = "chatterbox-tts" }
    }

    template {
      metadata {
        labels = { app = "chatterbox-tts" }
        annotations = {
          # Changes whenever the rendered config changes, which alters the pod
          # template.
          "checksum/config" = sha256(local.chatterbox_config)
        }
      }

      spec {
        node_selector = { "nvidia.com/gpu.present" = "true" }

        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }

        # C-hardening (§3.RECOMMENDATION.3): Chatterbox is a polite, best-effort
        # batch tenant — give it the regular tier-2-gpu priority (600000) so it
        # is ALWAYS the pod evicted under GPU-node pressure, never immich-ml /
        # frigate / llama-swap. This relies on the `tts` namespace being EXCLUDED
        # from the Kyverno `inject-gpu-workload-priority` policy (which would
        # otherwise stamp the immich-equal gpu-workload=1,200,000 priority on any
        # nvidia.com/gpu pod). That exclusion is the two-line edit to the kyverno
        # stack flagged in the PR. Without it, this priority_class_name is
        # overwritten on pod CREATE and Chatterbox would compete as an equal.
        priority_class_name = "tier-2-gpu"

        image_pull_secrets {
          name = "registry-credentials"
        }

        container {
          name  = "chatterbox-tts"
          image = local.image

          port {
            container_port = 8004
            name           = "http"
          }

          # T4 is Turing — NO bf16 (ADR-0004). Pin off; run FP16/FP32.
          env {
            name  = "TTS_BF16"
            value = "off"
          }

          # Park the HuggingFace cache on the NFS-SSD PVC so model weights
          # download once and persist across pod restarts (the pod is recreated
          # every window). The devnen compose mounts HF cache at /app/hf_cache;
          # point HF_HOME at the PVC instead.
          env {
            name  = "HF_HOME"
            value = "/data/hf_cache"
          }
          env {
            name  = "HF_HUB_CACHE"
            value = "/data/hf_cache"
          }

          volume_mount {
            name       = "config"
            mount_path = "/app/config.yaml"
            sub_path   = "config.yaml"
          }
          volume_mount {
            name       = "models"
            mount_path = "/data"
          }

          # /v1/audio/voices is cheap and only 200s once the model is loaded.
          # The very first start downloads the weights, which can take far
          # longer than the liveness budget below (120s + 5 x 30s). Without a
          # startup probe the kubelet would kill the container mid-download
          # and it would restart-loop. The startup probe holds liveness and
          # readiness off until the endpoint first answers, allowing up to
          # 120 x 15s = 30 min for model download + load.
          startup_probe {
            http_get {
              path = "/v1/audio/voices"
              port = 8004
            }
            period_seconds    = 15
            failure_threshold = 120
          }

          # Gates Service traffic on the model being loaded. A failing
          # readiness probe only marks the pod unready; it never restarts it.
          readiness_probe {
            http_get {
              path = "/v1/audio/voices"
              port = 8004
            }
            initial_delay_seconds = 20
            period_seconds        = 15
            failure_threshold     = 12
          }

          # Restarts a server that stops answering after it has started.
          liveness_probe {
            http_get {
              path = "/v1/audio/voices"
              port = 8004
            }
            initial_delay_seconds = 120
            period_seconds        = 30
            failure_threshold     = 5
          }

          resources {
            requests = {
              cpu    = "200m"
              memory = "2Gi"
            }
            limits = {
              memory           = "8Gi"
              "nvidia.com/gpu" = "1" # ONE time-slice (operator advertises 100), NOT the whole card
            }
          }
        }

        volume {
          name = "config"
          config_map {
            name = kubernetes_config_map.chatterbox_config.metadata[0].name
          }
        }
        volume {
          name = "models"
          persistent_volume_claim {
            claim_name = module.nfs_models.claim_name
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [
      # Off-peak CronJobs own the replica count — don't let apply reset it.
      spec[0].replicas,
      spec[0].template[0].spec[0].dns_config,         # KYVERNO_LIFECYCLE_V1
      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE
      metadata[0].annotations["keel.sh/match-tag"],
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
      metadata[0].annotations["kubernetes.io/change-cause"],
      metadata[0].annotations["deployment.kubernetes.io/revision"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"],
    ]
  }
}
|
||
|
||
# In-cluster front door for the TTS server: exposes port 8000 and forwards to
# the container's 8004.
resource "kubernetes_service" "chatterbox" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-tts"
    labels    = local.labels

    # Prometheus annotation-based scrape (mirrors tripit). The devnen server
    # has no /metrics; this monitors liveness via the blackbox path and keeps
    # the Service in the scrape set if a /metrics endpoint is added later.
    # NOTE(review): whether a scrape of a non-metrics path reports "up"
    # depends on the cluster's scrape configuration — confirm the target is
    # not permanently marked down.
    annotations = {
      "prometheus.io/scrape" = "true"
      "prometheus.io/port"   = "8000"
      "prometheus.io/path"   = "/v1/audio/voices"
    }
  }

  spec {
    # in-cluster only — never ingressed (no token needed)
    type     = "ClusterIP"
    selector = { app = "chatterbox-tts" }

    port {
      name        = "http"
      target_port = 8004 # the devnen server's actual listen port
      port        = 8000 # tripit's default TTS_BASE_URL port
    }
  }
}
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Option-A off-peak control: SA + Role (scale the Deployment) + RoleBinding +
|
||
# three CronJobs (window-up preflight, mid-window guard, window-down). Mirrors
|
||
# the nextcloud-watchdog in-cluster-kubectl pattern (SA → Role → bitnami/kubectl
|
||
# CronJob, no kubeconfig).
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Identity the off-peak CronJob pods run as.
resource "kubernetes_service_account" "offpeak" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-offpeak"
  }
}
|
||
|
||
# Namespace-scoped permissions for the gate script: get + patch on Deployments
# and on their scale subresource, which is what `kubectl scale` is given here.
resource "kubernetes_role" "offpeak" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-offpeak"
  }

  rule {
    api_groups = ["apps"]
    resources = [
      "deployments",
      "deployments/scale",
    ]
    verbs = [
      "get",
      "patch",
    ]
  }
}
|
||
|
||
# Binds the chatterbox-offpeak Role to the chatterbox-offpeak ServiceAccount.
resource "kubernetes_role_binding" "offpeak" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-offpeak"
  }

  subject {
    kind      = "ServiceAccount"
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = kubernetes_service_account.offpeak.metadata[0].name
  }

  role_ref {
    kind      = "Role"
    api_group = "rbac.authorization.k8s.io"
    name      = kubernetes_role.offpeak.metadata[0].name
  }
}
|
||
|
||
# One CronJob per entry in local.offpeak_cronjobs (window-up, window-down,
# vram-guard). All three run the same gate script; only the schedule and the
# ACTION env var differ.
resource "kubernetes_cron_job_v1" "offpeak" {
  for_each = local.offpeak_cronjobs

  metadata {
    name      = each.key
    namespace = kubernetes_namespace.tts.metadata[0].name
    labels    = local.labels
  }

  spec {
    schedule = each.value.schedule
    timezone = "Europe/London"

    concurrency_policy = "Forbid"
    # NOTE(review): a run that cannot start within 120s of its slot is
    # skipped. For window-down that would leave Chatterbox up until the next
    # window end — consider a longer deadline for that job.
    starting_deadline_seconds     = 120
    successful_jobs_history_limit = 1
    failed_jobs_history_limit     = 3

    job_template {
      metadata {
        labels = local.labels
      }

      spec {
        backoff_limit              = 1
        active_deadline_seconds    = 120
        ttl_seconds_after_finished = 300

        template {
          metadata {
            # Deliberately NOT local.labels: the chatterbox-tts Service in
            # this namespace selects pods on app=chatterbox-tts, so gate pods
            # carrying that label would be added to the Service endpoints
            # while they run, and TTS requests routed to them would be
            # refused. Use a label the Service does not select.
            labels = {
              app              = "chatterbox-offpeak"
              "offpeak-action" = each.value.action
            }
          }

          spec {
            service_account_name = kubernetes_service_account.offpeak.metadata[0].name
            restart_policy       = "Never"

            container {
              name = "vram-gate"
              # NOTE(review): `latest` is a floating tag; consider pinning a
              # version so the gate's tooling cannot change underneath it.
              image   = "bitnami/kubectl:latest"
              command = ["/bin/bash", "-c", local.vram_gate_script]

              # Inputs consumed by the gate script.
              env {
                name  = "ACTION"
                value = each.value.action
              }
              env {
                name  = "FLOOR"
                value = tostring(var.vram_free_floor_bytes)
              }
              env {
                name  = "GPU_TOTAL"
                value = tostring(var.gpu_total_bytes)
              }

              resources {
                requests = { cpu = "20m", memory = "64Mi" }
                limits   = { memory = "128Mi" }
              }
            }
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno mutates dns_config with ndots=2 on CronJobs.
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
|