infra/stacks/tts/main.tf
Viktor Barzin 798b025580
All checks were successful
ci/woodpecker/push/default Pipeline was successful
ci/woodpecker/push/build-cli Pipeline was successful
tts+kyverno: non-merge apply trigger (merge-commit diff hid stacks/tts from the stack detector)
The Woodpecker default pipeline selects stacks via git diff HEAD~1 HEAD;
on a merge commit that is the first-parent diff, which contained only the
concurrently-landed files — stacks/tts never got applied (namespace still
absent) and the kyverno re-trigger push got no pipeline at all. Single
non-merge commit touching both stacks so the detector sees them; the
sorted loop applies kyverno before tts, the order tripit#26 requires.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-11 19:08:23 +00:00

484 lines
18 KiB
HCL
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Tag appended to the chatterbox-tts image reference built in local.image.
variable "image_tag" {
  description = "chatterbox-tts image tag. Use the 8-char git SHA in CI; :latest for local trials."
  type        = string
  default     = "latest"
}
# ─────────────────────────────────────────────────────────────────────────────
# Option-A off-peak control (see docs/plans/2026-06-08-chatterbox-tts-infra.md §3).
# The Deployment sits at replicas=0; a CronJob scales it to 1 at the window start
# ONLY IF a free-VRAM preflight passes, and another scales it back to 0 at window
# end. A guard CronJob yields the card mid-window if free VRAM drops below the
# floor (a resident woke up). tripit's bake is best-effort + idempotent, so a
# skipped/aborted window simply backfills on the next one (ADR-0002/0004).
# ─────────────────────────────────────────────────────────────────────────────
# Free-VRAM threshold shared by the window-up preflight and the mid-window
# guard. It is passed to the gate script as the FLOOR environment variable.
variable "vram_free_floor_bytes" {
  type = number
  # OPEN ITEM - must be measured (section 5 smoke test / section 3.X). This is
  # the minimum free VRAM the preflight requires before it will scale
  # Chatterbox up, and the floor the guard yields below. Default = 6 GiB, a
  # conservative guess: chatterbox-multilingual FP16 peak ~4 GiB plus ~2 GiB
  # headroom for the read->cudaMalloc race. RAISE/LOWER once the real T4 peak
  # is captured from gpu_pod_memory_used_bytes{namespace="tts"} during a real
  # synth.
  default     = 6442450944
  description = "Minimum free GPU VRAM (bytes) required before scaling Chatterbox up; guard yields below it."

  # The gate script compares FLOOR with the shell's integer-only `[ -ge ]` and
  # `[ -lt ]` tests. A fractional value makes those tests error out inside the
  # `if`, which silently turns the preflight into "always skip" and the guard
  # into "never trip" - so reject it at plan time instead.
  validation {
    condition     = var.vram_free_floor_bytes >= 0 && floor(var.vram_free_floor_bytes) == var.vram_free_floor_bytes
    error_message = "The vram_free_floor_bytes value must be a non-negative whole number of bytes."
  }
}
# Card capacity used by the gate script (GPU_TOTAL) to derive free VRAM.
# NOTE(review): 17179869184 is the nominal 16 GiB; the driver may report a
# lower usable figure on a T4 - confirm which one the floor was sized against.
variable "gpu_total_bytes" {
  type        = number
  default     = 17179869184 # Tesla T4 = 16 GiB
  description = "Total VRAM on the shared GPU. Free = this minus sum(gpu_pod_memory_used_bytes)."

  # GPU_TOTAL is fed into shell integer arithmetic, `$(( GPU_TOTAL - USED ))`,
  # which runs for every ACTION under `set -e`. A fractional value would abort
  # the script before it reaches even the unconditional window-end scale-down.
  validation {
    condition     = var.gpu_total_bytes > 0 && floor(var.gpu_total_bytes) == var.gpu_total_bytes
    error_message = "The gpu_total_bytes value must be a positive whole number of bytes."
  }
}
# Window start: schedule of the CronJob that runs the gate script with ACTION=up.
variable "offpeak_window_up_schedule" {
  description = "Cron schedule that fires the free-VRAM preflight + scale-up at window start."
  type        = string
  default     = "0 2 * * *" # 02:00 Europe/London (see timezone on the CronJob)
}
# Window end: schedule of the CronJob that runs the gate script with ACTION=down.
variable "offpeak_window_down_schedule" {
  description = "Cron schedule that scales Chatterbox back to 0 at window end."
  type        = string
  default     = "0 6 * * *" # 06:00 Europe/London
}
# Mid-window guard: schedule of the CronJob that runs the gate script with
# ACTION=guard.
variable "offpeak_guard_schedule" {
  description = "Cron schedule for the mid-window guard that yields the card if free VRAM drops."
  type        = string
  default     = "*/5 2-5 * * *" # every 5 min inside the 02:00-06:00 window (02:00 through 05:55)
}
locals {
  namespace = "tts"
  labels    = { app = "chatterbox-tts" }
  image     = "forgejo.viktorbarzin.me/viktor/chatterbox-tts:${var.image_tag}"

  # config.yaml rendered into a ConfigMap, mounted at /app/config.yaml (the
  # server's WORKDIR is /app). Reference audio and the HF model cache live on
  # the NFS-SSD PVC (mounted at /data) so weights persist across restarts and
  # load fast. server.port stays at the devnen default 8004; the Service remaps
  # 8000->8004 so tripit's default TTS_BASE_URL works unchanged.
  #
  # model.repo_id = chatterbox-multilingual (ADR-0004; 23 languages for
  # worldwide place-names). If the measured T4 VRAM peak is too high to coexist
  # even off-peak, fall back to "chatterbox" (English, lighter) - a one-line
  # change here (section 3.X / section 6 decision 3).
  chatterbox_config = yamlencode({
    server = {
      host = "0.0.0.0"
      port = 8004
    }
    model = {
      repo_id = "chatterbox-multilingual"
    }
    tts_engine = {
      device = "cuda"
      # Predefined voices come from the IMAGE's bundled set (28 reference WAVs
      # under the devnen server's /app/voices) rather than the NFS PVC: nobody
      # can seed /data/voices without NFS-host shell access, and an empty
      # predefined dir means /v1/audio/voices serves nothing (it gates the
      # readiness probe). tripit's Voice catalog (tripit#30) names a subset of
      # these stems. /data keeps reference_audio (future cloning) + HF cache.
      predefined_voices_path = "/app/voices"
      reference_audio_path   = "/data/reference_audio"
    }
  })

  # Shared script for the off-peak CronJobs. Reads the in-cluster
  # gpu_pod_memory_used_bytes gauge (the per-namespace gauge the 2026-06-02
  # post-mortem built - host-PID attribution, no new exporter needed), sums it
  # over every namespace EXCEPT tts, and computes free = GPU_TOTAL - used.
  # Needs bash, awk, curl and kubectl in the CronJob image.
  # ACTION is "up" | "down" | "guard".
  #   up    - scale to 1 ONLY IF free >= FLOOR (positive admission).
  #   guard - scale to 0 IF free < FLOOR (a resident woke mid-window; yield),
  #           and also when the scrape fails (cannot prove the card is free).
  #   down  - scale to 0 unconditionally (window end).
  #
  # Why tts is excluded from the sum: once Chatterbox has loaded its model, its
  # own VRAM shows up in the gauge. Counting it would let the preflight admit
  # the pod and the very next guard tick evict it without any resident having
  # changed. Excluding it makes "up" and "guard" apply the same criterion: do
  # the OTHER tenants leave at least FLOOR bytes?
  # NOTE(review): the filter assumes the exporter's raw series carry a
  # namespace="tts" label (as in the PromQL selector used elsewhere in this
  # file) - confirm against the exporter's /metrics output.
  #
  # Heredoc escaping: only `$${...}` (literal `${...}`) is escaped - Terraform
  # would otherwise try to interpolate it. Bare `$(...)`, `$((...))` and awk's
  # `$NF` are literal `$` and pass through unescaped.
  vram_gate_script = <<-EOT
    set -eu
    : "$${ACTION:?}" "$${FLOOR:?}" "$${GPU_TOTAL:?}"
    METRICS_URL="http://gpu-pod-exporter.nvidia.svc.cluster.local:80/metrics"
    # -f makes a non-200 scrape a hard error, handled conservatively per ACTION:
    #   up    -> do not occupy the card when blind.
    #   guard -> yield the card when blind.
    #   down  -> proceeds regardless; it never needs the measurement.
    if ! BODY="$(curl -sf -m 10 "$${METRICS_URL}")"; then
      echo "WARN: could not scrape $${METRICS_URL}"
      case "$${ACTION}" in
        up)
          echo "preflight: scrape failed -> NOT scaling up (fail-safe)"; exit 0
          ;;
        guard)
          echo "guard: scrape failed -> cannot verify free VRAM, yielding the card (fail-safe)"
          kubectl -n tts scale deploy/chatterbox-tts --replicas=0
          exit 0
          ;;
      esac
      BODY=""
    fi
    # Sum the gauge across all pods outside the tts namespace. Missing metric /
    # empty scrape => used=0 (card idle). %.0f rather than %d so byte counts
    # above 2^31 print correctly on every awk implementation.
    USED="$(printf '%s\n' "$${BODY}" \
      | awk '/^gpu_pod_memory_used_bytes\{/ && !/namespace="tts"/ { s += $NF } END { printf "%.0f", s }')"
    USED="$${USED:-0}"
    FREE="$(( GPU_TOTAL - USED ))"
    echo "GPU VRAM (excluding tts): used=$${USED} free=$${FREE} floor=$${FLOOR} (total=$${GPU_TOTAL})"
    case "$${ACTION}" in
      up)
        if [ "$${FREE}" -ge "$${FLOOR}" ]; then
          echo "preflight PASS: free >= floor -> scaling chatterbox-tts to 1"
          kubectl -n tts scale deploy/chatterbox-tts --replicas=1
        else
          echo "preflight SKIP: free < floor -> leaving chatterbox-tts at 0 (retry next window)"
        fi
        ;;
      guard)
        if [ "$${FREE}" -lt "$${FLOOR}" ]; then
          echo "guard TRIP: free < floor -> yielding the card, scaling chatterbox-tts to 0"
          kubectl -n tts scale deploy/chatterbox-tts --replicas=0
        else
          echo "guard OK: free >= floor -> chatterbox-tts may keep running"
        fi
        ;;
      down)
        echo "window end -> scaling chatterbox-tts to 0"
        kubectl -n tts scale deploy/chatterbox-tts --replicas=0
        ;;
      *)
        # A typo in ACTION must not look like a successful run.
        echo "ERROR: unknown ACTION '$${ACTION}' (expected up|guard|down)" >&2
        exit 2
        ;;
    esac
  EOT

  # Common spec for the three off-peak CronJobs. Each runs one bitnami/kubectl
  # pod (in-cluster SA, no kubeconfig) executing the shared gate script with a
  # different ACTION. timezone pins the window to Europe/London regardless of
  # node TZ.
  offpeak_cronjobs = {
    chatterbox-window-up = {
      schedule = var.offpeak_window_up_schedule
      action   = "up"
    }
    chatterbox-window-down = {
      schedule = var.offpeak_window_down_schedule
      action   = "down"
    }
    chatterbox-vram-guard = {
      schedule = var.offpeak_guard_schedule
      action   = "guard"
    }
  }
}
# Namespace that holds every object in this stack. local.tiers is not defined
# in this file - presumably injected by shared stack configuration.
resource "kubernetes_namespace" "tts" {
  metadata {
    name = local.namespace

    labels = {
      "keel.sh/enrolled" = "true"
      "istio-injection"  = "disabled"
      tier               = local.tiers.gpu
    }
  }

  lifecycle {
    # This label is written by something other than Terraform (presumably
    # Goldilocks); ignoring it keeps apply from stripping it again.
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}
# NFS-SSD backed volume mounted by the Deployment at /data. It carries the
# HuggingFace model cache (HF_HOME=/data/hf_cache, so the first start's
# download persists across pod restarts) and reference audio for future voice
# cloning; the predefined voices are served from the image, not from here.
# The volume is intended to be RWX so a seed Job / kubectl cp can write to it
# while the Deployment has it mounted (access mode is set inside the module -
# confirm there). Backing path on the Proxmox host: /srv/nfs-ssd/chatterbox.
# Mirrors llama-cpp's nfs_models.
module "nfs_models" {
  source = "../../modules/kubernetes/nfs_volume"

  name      = "chatterbox-models"
  namespace = kubernetes_namespace.tts.metadata[0].name
  storage   = "20Gi" # multilingual weights + HF cache + voices headroom

  nfs_path   = "/srv/nfs-ssd/chatterbox"
  nfs_server = "192.168.1.127"
}
# Publishes the rendered server configuration (local.chatterbox_config) under
# the key config.yaml; the Deployment mounts that key at /app/config.yaml.
resource "kubernetes_config_map" "chatterbox_config" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-config"
    labels    = local.labels
  }

  data = { "config.yaml" = local.chatterbox_config }
}
# Single Deployment running the devnen Chatterbox-TTS-Server (OpenAI-compatible
# /v1/audio/speech). Sits at replicas=0 - the off-peak CronJobs below scale it
# to 1 only when the free-VRAM preflight passes (Option A), and back to 0 at
# window end. wait_for_rollout=false so apply never blocks on a pod that is
# intentionally scaled to 0.
resource "kubernetes_deployment" "chatterbox" {
  metadata {
    name      = "chatterbox-tts"
    namespace = kubernetes_namespace.tts.metadata[0].name
    labels    = merge(local.labels, { tier = local.tiers.gpu })
  }

  wait_for_rollout = false

  spec {
    # Off-peak control owns the replica count at runtime (CronJobs scale 0<->1).
    # Declare 0 here so a plain `tg apply` outside the window doesn't wake the
    # card. ignore_changes on replicas (below) stops apply from fighting the
    # CronJob's scale.
    replicas = 0

    strategy { type = "Recreate" }

    selector {
      match_labels = { app = "chatterbox-tts" }
    }

    template {
      metadata {
        labels = { app = "chatterbox-tts" }
        annotations = {
          # Changes whenever the rendered config changes, so the pod template
          # (and therefore the pod) is replaced on a config edit.
          "checksum/config" = sha256(local.chatterbox_config)
        }
      }

      spec {
        node_selector = { "nvidia.com/gpu.present" = "true" }

        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }

        # C-hardening (section 3.RECOMMENDATION.3): Chatterbox is a polite,
        # best-effort batch tenant - give it the regular tier-2-gpu priority
        # (600000) so it is ALWAYS the pod evicted under GPU-node pressure,
        # never immich-ml / frigate / llama-swap. This relies on the `tts`
        # namespace being EXCLUDED from the Kyverno
        # `inject-gpu-workload-priority` policy (which would otherwise stamp
        # the immich-equal gpu-workload=1,200,000 priority on any
        # nvidia.com/gpu pod). That exclusion is the two-line edit to the
        # kyverno stack flagged in the PR. Without it, this
        # priority_class_name is overwritten on pod CREATE and Chatterbox
        # would compete as an equal.
        priority_class_name = "tier-2-gpu"

        image_pull_secrets { name = "registry-credentials" }

        container {
          name  = "chatterbox-tts"
          image = local.image

          port {
            container_port = 8004
            name           = "http"
          }

          # T4 is Turing - NO bf16 (ADR-0004). Pin off; run FP16/FP32.
          env {
            name  = "TTS_BF16"
            value = "off"
          }

          # Park the HuggingFace cache on the NFS-SSD PVC so model weights
          # download once and persist across pod restarts (the pod is recreated
          # every window). The devnen compose mounts HF cache at /app/hf_cache;
          # point HF_HOME at the PVC instead.
          env {
            name  = "HF_HOME"
            value = "/data/hf_cache"
          }
          env {
            name  = "HF_HUB_CACHE"
            value = "/data/hf_cache"
          }

          volume_mount {
            name       = "config"
            mount_path = "/app/config.yaml"
            sub_path   = "config.yaml"
          }
          volume_mount {
            name       = "models"
            mount_path = "/data"
          }

          # /v1/audio/voices only returns 200 once the model is loaded, and the
          # first start has to download the weights first. Without a startup
          # probe the liveness probe below (same endpoint, 120s + 5 x 30s
          # budget) would kill the container in the middle of a slow download
          # and restart it in a loop. The startup probe holds liveness and
          # readiness off until the first 200, allowing up to
          # 20s + 120 x 15s (about 30 min) for that first load.
          startup_probe {
            http_get {
              path = "/v1/audio/voices"
              port = 8004
            }
            initial_delay_seconds = 20
            period_seconds        = 15
            failure_threshold     = 120
          }

          # /v1/audio/voices is cheap and only 200s once the model is loaded -
          # so it gates real readiness.
          readiness_probe {
            http_get {
              path = "/v1/audio/voices"
              port = 8004
            }
            initial_delay_seconds = 20
            period_seconds        = 15
            failure_threshold     = 12
          }

          # Only starts counting after the startup probe has succeeded, so it
          # catches a server that wedges after a successful load.
          liveness_probe {
            http_get {
              path = "/v1/audio/voices"
              port = 8004
            }
            initial_delay_seconds = 120
            period_seconds        = 30
            failure_threshold     = 5
          }

          resources {
            requests = {
              cpu    = "200m"
              memory = "2Gi"
            }
            limits = {
              memory           = "8Gi"
              "nvidia.com/gpu" = "1" # ONE time-slice (operator advertises 100), NOT the whole card
            }
          }
        }

        volume {
          name = "config"
          config_map {
            name = kubernetes_config_map.chatterbox_config.metadata[0].name
          }
        }
        volume {
          name = "models"
          persistent_volume_claim {
            claim_name = module.nfs_models.claim_name
          }
        }
      }
    }
  }

  lifecycle {
    ignore_changes = [
      # Off-peak CronJobs own the replica count - don't let apply reset it.
      spec[0].replicas,
      spec[0].template[0].spec[0].dns_config,         # KYVERNO_LIFECYCLE_V1
      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE
      metadata[0].annotations["keel.sh/match-tag"],
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
      metadata[0].annotations["kubernetes.io/change-cause"],
      metadata[0].annotations["deployment.kubernetes.io/revision"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"],
    ]
  }
}
# In-cluster Service in front of the Chatterbox pod. Exposes port 8000 and
# forwards to the server's listen port 8004.
resource "kubernetes_service" "chatterbox" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-tts"
    labels    = local.labels

    # Prometheus annotation-based scrape (mirrors tripit). The devnen server
    # has no /metrics; this monitors liveness via the blackbox path and keeps
    # the Service in the scrape set if a /metrics endpoint is added later.
    # NOTE(review): 8000 is the Service port, while the pod listens on 8004.
    # If the scrape job targets endpoints (pod IPs) rather than the Service
    # address, this annotation would need to be 8004 - confirm against the
    # Prometheus scrape config.
    annotations = {
      "prometheus.io/port"   = "8000"
      "prometheus.io/path"   = "/v1/audio/voices"
      "prometheus.io/scrape" = "true"
    }
  }

  spec {
    selector = { app = "chatterbox-tts" }
    type     = "ClusterIP" # in-cluster only — never ingressed (no token needed)

    port {
      name        = "http"
      target_port = 8004 # the devnen server's actual listen port
      port        = 8000 # tripit's default TTS_BASE_URL port
    }
  }
}
# ─────────────────────────────────────────────────────────────────────────────
# Option-A off-peak control: SA + Role (scale the Deployment) + RoleBinding +
# three CronJobs (window-up preflight, mid-window guard, window-down). Mirrors
# the nextcloud-watchdog in-cluster-kubectl pattern (SA → Role → bitnami/kubectl
# CronJob, no kubeconfig).
# ─────────────────────────────────────────────────────────────────────────────
# Identity the off-peak CronJob pods run as; the Role and RoleBinding in this
# stack attach the scale permissions to it.
resource "kubernetes_service_account" "offpeak" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-offpeak"
  }
}
# Namespaced Role for the gate script's `kubectl scale` call.
resource "kubernetes_role" "offpeak" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-offpeak"
  }

  # get + patch on Deployments and on their scale subresource is all the gate
  # needs; nothing else in the namespace is granted.
  rule {
    verbs      = ["get", "patch"]
    api_groups = ["apps"]
    resources  = ["deployments", "deployments/scale"]
  }
}
# Binds the chatterbox-offpeak Role to the chatterbox-offpeak ServiceAccount
# inside the tts namespace.
resource "kubernetes_role_binding" "offpeak" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-offpeak"
  }

  subject {
    kind      = "ServiceAccount"
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = kubernetes_service_account.offpeak.metadata[0].name
  }

  role_ref {
    kind      = "Role"
    api_group = "rbac.authorization.k8s.io"
    name      = kubernetes_role.offpeak.metadata[0].name
  }
}
# One CronJob per entry in local.offpeak_cronjobs (window-up, window-down,
# guard). All three run the same gate script and differ only in schedule and
# in the ACTION environment variable.
resource "kubernetes_cron_job_v1" "offpeak" {
  for_each = local.offpeak_cronjobs

  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = each.key
    labels    = local.labels
  }

  spec {
    timezone                      = "Europe/London"
    schedule                      = each.value.schedule
    concurrency_policy            = "Forbid"
    starting_deadline_seconds     = 120
    failed_jobs_history_limit     = 3
    successful_jobs_history_limit = 1

    job_template {
      metadata { labels = local.labels }

      spec {
        active_deadline_seconds    = 120
        backoff_limit              = 1
        ttl_seconds_after_finished = 300

        template {
          metadata { labels = local.labels }

          spec {
            restart_policy       = "Never"
            service_account_name = kubernetes_service_account.offpeak.metadata[0].name

            container {
              name = "vram-gate"
              # NOTE(review): an unpinned :latest tag means the kubectl/bash/curl
              # versions can change between runs - consider pinning.
              image   = "bitnami/kubectl:latest"
              command = ["/bin/bash", "-c", local.vram_gate_script]

              # Inputs the gate script requires. Map keys iterate in lexical
              # order, which yields ACTION, FLOOR, GPU_TOTAL.
              dynamic "env" {
                for_each = {
                  ACTION    = each.value.action
                  FLOOR     = tostring(var.vram_free_floor_bytes)
                  GPU_TOTAL = tostring(var.gpu_total_bytes)
                }
                content {
                  name  = env.key
                  value = env.value
                }
              }

              resources {
                limits   = { memory = "128Mi" }
                requests = { cpu = "20m", memory = "64Mi" }
              }
            }
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno mutates dns_config with ndots=2 on CronJobs.
    ignore_changes = [
      spec[0].job_template[0].spec[0].template[0].spec[0].dns_config,
    ]
  }
}
# Apply trigger 2026-06-11 (tripit#26): the previous push was a merge commit, so
# the changed-stack detector (git diff HEAD~1 HEAD = first-parent diff) missed
# stacks/tts entirely. Non-merge commit so the diff names this stack.