diff --git a/stacks/kyverno/modules/kyverno/resource-governance.tf b/stacks/kyverno/modules/kyverno/resource-governance.tf index 855128f1..bcbaece0 100644 --- a/stacks/kyverno/modules/kyverno/resource-governance.tf +++ b/stacks/kyverno/modules/kyverno/resource-governance.tf @@ -15,6 +15,15 @@ locals { governance_tiers = ["0-core", "1-cluster", "2-gpu", "3-edge", "4-aux"] excluded_namespaces = ["kube-system", "metallb-system", "kyverno", "calico-system", "calico-apiserver"] + + # GPU-priority injection exclude list. Adds `tts` to the base set so the + # `inject-gpu-workload-priority` policy does NOT stamp the immich-equal + # gpu-workload (1,200,000) priority on Chatterbox-TTS pods. Chatterbox is a + # best-effort off-peak batch tenant on the shared T4: it must keep its + # tier-2-gpu (600,000) priority so it is ALWAYS the pod evicted under GPU-node + # pressure, never immich-ml/frigate/llama-swap. See the tts stack + # (stacks/tts/) + docs/plans/2026-06-08-chatterbox-tts-infra.md §3. + gpu_priority_excluded_namespaces = concat(local.excluded_namespaces, ["tts"]) } # ----------------------------------------------------------------------------- @@ -905,7 +914,10 @@ resource "kubectl_manifest" "mutate_gpu_priority" { any = [ { resources = { - namespaces = local.excluded_namespaces + # tts added so Chatterbox-TTS keeps tier-2-gpu priority (it's a + # best-effort off-peak batch tenant — must be evicted first, + # not promoted to immich-equal gpu-workload). See locals above. + namespaces = local.gpu_priority_excluded_namespaces } } ] diff --git a/stacks/tts/README.md b/stacks/tts/README.md new file mode 100644 index 00000000..f5a972dd --- /dev/null +++ b/stacks/tts/README.md @@ -0,0 +1,149 @@ +# tts — Chatterbox TTS (tripit narration) + +In-cluster text-to-speech for tripit's "Tour guide". 
Runs the +[devnen/Chatterbox-TTS-Server](https://github.com/devnen/Chatterbox-TTS-Server) +(Resemble AI Chatterbox under an OpenAI-compatible HTTP server) as a single +Deployment + ClusterIP Service `chatterbox-tts.tts.svc.cluster.local:8000`, +requesting **one time-slice** of the shared Tesla T4 (`nvidia.com/gpu: 1`). + +Full design + rationale (Option-A off-peak control, OOM analysis, ADR links): +`docs/plans/2026-06-08-chatterbox-tts-infra.md` (in the tripit-tour-guide repo) +and `infra/docs/post-mortems/2026-06-02-immich-ml-ttl-gpu-oom-recruiter.md`. + +> This stack mirrors `infra/stacks/llama-cpp/`. The scaffolding files +> (`backend.tf`, `providers.tf`, `cloudflare_provider.tf`, `tiers.tf`, +> `.terraform.lock.hcl`) are **generated by Terragrunt** on `init` and are +> git-ignored — only `main.tf`, `terragrunt.hcl` and this README are tracked. + +--- + +## What this stack creates + +- `kubernetes_namespace.tts` — tier `2-gpu`, keel-enrolled, istio off. +- `module.nfs_models` — RWX NFS-SSD PVC at `/srv/nfs-ssd/chatterbox`, mounted at + `/data` (predefined voices, narrator reference WAVs, **and** the HuggingFace + model cache via `HF_HOME=/data/hf_cache`, so weights download once and persist + across the per-window pod recreation). +- `kubernetes_config_map.chatterbox_config` — `config.yaml`: `server.port=8004`, + `model.repo_id=chatterbox-multilingual`, `tts_engine.device=cuda`, voices / + reference paths under `/data`. +- `kubernetes_deployment.chatterbox` — **starts at `replicas=0`**; the off-peak + CronJobs own the replica count at runtime. `TTS_BF16=off` (T4 = Turing, no + bf16). `priority_class_name=tier-2-gpu` (the polite-tenant demotion). +- `kubernetes_service.chatterbox` — ClusterIP, **`port 8000 → targetPort 8004`** + so tripit's default `TTS_BASE_URL` works unchanged. Prometheus scrape + annotations. +- **Off-peak control** (SA + Role + RoleBinding + 3 CronJobs): see below. 
+ +## Off-peak control (Option A — window + free-VRAM gate) + +The T4 is time-sliced with **zero VRAM isolation** (post-mortem 2026-06-02), so +`nvidia.com/gpu: 1` buys a scheduling turn, NOT memory. Chatterbox must only +allocate VRAM when the card is actually free. Implemented as three CronJobs +(all `Europe/London`), each a `bitnami/kubectl` pod using the namespace SA: + +| CronJob | Schedule (default) | Action | +|---|---|---| +| `chatterbox-window-up` | `0 2 * * *` | **Preflight**: scrape `gpu_pod_memory_used_bytes` from `gpu-pod-exporter.nvidia.svc:80/metrics`, compute `free = 16 GiB − Σused`; scale to **1 only if** `free ≥ vram_free_floor_bytes`. | +| `chatterbox-vram-guard` | `*/5 2-5 * * *` | **Guard**: every 5 min in-window, scale to **0** if `free < floor` (a resident woke; yield the card mid-bake). | +| `chatterbox-window-down` | `0 6 * * *` | **Window end**: scale to **0** unconditionally. | + +`tripit`'s bake is best-effort + cached-forever (ADR-0002/0004) — a skipped or +aborted window simply backfills on the next one. No latency SLA. + +### The free-VRAM floor — YOU MUST MEASURE THIS + +`var.vram_free_floor_bytes` defaults to **6 GiB** (a conservative guess: +~4 GiB assumed multilingual FP16 peak + ~2 GiB headroom for the +read→`cudaMalloc` race). **The real T4 peak of `chatterbox-multilingual` is not +published upstream.** Capture it during the first bake: + +```bash +# while a real synth is running on the freed T4: +kubectl -n monitoring exec deploy/prometheus -- \ + promtool query instant http://localhost:9090 \ + 'sum(gpu_pod_memory_used_bytes{namespace="tts"})' +# or read the gauge straight from the exporter: +kubectl -n nvidia exec ds/gpu-pod-exporter -- \ + sh -c 'curl -s localhost:9401/metrics | grep "namespace=\"tts\""' +``` + +Then set the floor to `measured_peak + ~2 GiB` (pass `-var` or add to the stack +tfvars). 
If the peak is too high to coexist even off-peak, switch +`model.repo_id` in `main.tf` to `chatterbox` (English, lighter) or +`chatterbox-turbo`, or escalate to Option B (scale `immich-machine-learning` to +0 for the window). + +--- + +## Build + push the image (do this BEFORE the first apply) + +`devnen/Chatterbox-TTS-Server` ships **no published image** — build from the +repo's **cu128** target (matches the cluster's pinned 570.195.03 / CUDA 12.8 +driver) and push to the private Forgejo registry. The devvm docker is pre-authed +to `forgejo.viktorbarzin.me`. Run on the devvm (large CUDA image — needs disk + +bandwidth): + +```bash +# 1. Clone the upstream server repo (outside the monorepo). +git clone https://github.com/devnen/Chatterbox-TTS-Server /tmp/chatterbox-tts-server +cd /tmp/chatterbox-tts-server + +# 2. Build the cu128 variant (Dockerfile.cu128 — PyTorch 2.9.0+cu128, the target +# the repo's docker-compose-cu128.yml uses) for linux/amd64. +SHA="$(git rev-parse --short=8 HEAD)" +docker build \ + --platform linux/amd64 \ + --build-arg RUNTIME=nvidia \ + -f Dockerfile.cu128 \ + -t forgejo.viktorbarzin.me/viktor/chatterbox-tts:latest \ + -t "forgejo.viktorbarzin.me/viktor/chatterbox-tts:${SHA}" \ + . + +# 3. Push both tags. (If docker isn't authed: log in with the viktor push PAT +# from Vault — `vault kv get -field=forgejo_push_token secret/ci/global` — +# `docker login forgejo.viktorbarzin.me -u viktor`.) +docker push forgejo.viktorbarzin.me/viktor/chatterbox-tts:latest +docker push "forgejo.viktorbarzin.me/viktor/chatterbox-tts:${SHA}" +``` + +> If `Dockerfile.cu128` is not a clean `docker build` target (e.g. it relies on +> build args defined only in `docker-compose-cu128.yml`), lift those args onto +> the `docker build` line or `docker compose -f docker-compose-cu128.yml build` +> then `docker tag` the resulting `chatterbox-tts-server:cu128` image to the +> Forgejo ref above before pushing. 
+ +--- + +## Apply (admin-gated — run in order) + +```bash +vault login -method=oidc +~/code/scripts/presence claim node:k8s-node1 --purpose "chatterbox-tts first apply (GPU)" +~/code/scripts/presence claim stack:tts --purpose "chatterbox-tts stack apply" + +# 1. The polite-tenant hardening (exclude tts from gpu-workload priority). +~/code/scripts/tg plan --stack kyverno +~/code/scripts/tg apply --stack kyverno + +# 2. This stack. +~/code/scripts/tg plan --stack tts +~/code/scripts/tg apply --stack tts # apply does NOT wake the GPU (replicas=0) + +# 3. Flip tripit narration on. +~/code/scripts/tg plan --stack tripit +~/code/scripts/tg apply --stack tripit +``` + +See `docs/plans/2026-06-08-chatterbox-tts-infra.md` §5 for the full go-live +checklist (seed voices on NFS-SSD, smoke-test a synth, watch the neighbours). + +## Rollback (instant, no data loss) + +- **Narration off:** set `TTS_MODE=none` (or drop the three `TTS_*` lines) in + `stacks/tripit/main.tf` → `tg apply --stack tripit`. The bake makes no audio; + playback falls back to browser TTS. Cached `story_audio` rows are harmless. +- **Chatterbox off the GPU:** `kubectl -n tts scale deploy/chatterbox-tts + --replicas=0` (transient) and/or `tg destroy --stack tts`. Best-effort synth + means tripit bakes keep running audio-less — no error. +- Neither touches the resident GPU tenants (Option A never modifies them). diff --git a/stacks/tts/main.tf b/stacks/tts/main.tf new file mode 100644 index 00000000..2afdfad8 --- /dev/null +++ b/stacks/tts/main.tf @@ -0,0 +1,474 @@ +variable "image_tag" { + type = string + default = "latest" + description = "chatterbox-tts image tag. Use the 8-char git SHA in CI; :latest for local trials." +} + +# ───────────────────────────────────────────────────────────────────────────── +# Option-A off-peak control (see docs/plans/2026-06-08-chatterbox-tts-infra.md §3). 
+# The Deployment sits at replicas=0; a CronJob scales it to 1 at the window start
+# ONLY IF a free-VRAM preflight passes, and another scales it back to 0 at window
+# end. A guard CronJob yields the card mid-window if free VRAM drops below the
+# floor (a resident woke up). tripit's bake is best-effort + idempotent, so a
+# skipped/aborted window simply backfills on the next one (ADR-0002/0004).
+# ─────────────────────────────────────────────────────────────────────────────
+
+variable "vram_free_floor_bytes" {
+  type = number
+  # OPEN ITEM — must be measured (§5 smoke test / §3.X). This is the minimum free
+  # VRAM the preflight requires before it will scale Chatterbox up, and the floor
+  # the guard yields below. Default = 6 GiB (a conservative guess: ~4 GiB assumed
+  # chatterbox-multilingual FP16 peak + ~2 GiB headroom for the
+  # read→cudaMalloc race). RAISE/LOWER once the real T4 peak is captured from
+  # gpu_pod_memory_used_bytes{namespace="tts"} during a real synth.
+  default     = 6442450944
+  description = "Minimum free GPU VRAM (bytes) required before scaling Chatterbox up; guard yields below it."
+}
+
+variable "gpu_total_bytes" {
+  type = number
+  # The gate script computes free = this value - sum(gpu_pod_memory_used_bytes),
+  # so an overstated total overstates free VRAM by the same amount.
+  # NOTE(review): a T4 is sold as 16 GB, but the driver commonly reports less
+  # usable memory (roughly 15 GiB with ECC enabled) — confirm the node's real
+  # total with nvidia-smi and lower this default if needed.
+  default     = 17179869184 # Tesla T4 = 16 GiB
+  description = "Total VRAM on the shared GPU. Free = this minus sum(gpu_pod_memory_used_bytes)."
+}
+
+variable "offpeak_window_up_schedule" {
+  type        = string
+  default     = "0 2 * * *" # 02:00 Europe/London (see timezone on the CronJob)
+  description = "Cron schedule that fires the free-VRAM preflight + scale-up at window start."
+}
+
+variable "offpeak_window_down_schedule" {
+  type        = string
+  default     = "0 6 * * *" # 06:00 Europe/London
+  description = "Cron schedule that scales Chatterbox back to 0 at window end."
+}
+
+variable "offpeak_guard_schedule" {
+  type        = string
+  default     = "*/5 2-5 * * *" # every 5 min inside the 02:00–06:00 window
+  description = "Cron schedule for the mid-window guard that yields the card if free VRAM drops."
+}
+
+locals {
+  namespace = "tts"
+  labels    = { app = "chatterbox-tts" }
+  image     = "forgejo.viktorbarzin.me/viktor/chatterbox-tts:${var.image_tag}"
+
+  # config.yaml rendered into a ConfigMap, mounted at /app/config.yaml (the
+  # server's WORKDIR is /app). Voices, reference audio and the HF model cache
+  # all live on the NFS-SSD PVC (mounted at /data) so weights persist across
+  # restarts and load fast. server.port stays at the devnen default 8004; the
+  # Service remaps 8000->8004 so tripit's default TTS_BASE_URL works unchanged.
+  #
+  # model.repo_id = chatterbox-multilingual (ADR-0004; 23 languages for
+  # worldwide place-names). If the measured T4 VRAM peak is too high to coexist
+  # even off-peak, fall back to "chatterbox" (English, lighter) — a one-line
+  # change here (§3.X / §6 decision 3).
+  chatterbox_config = yamlencode({
+    server = {
+      host = "0.0.0.0"
+      port = 8004
+    }
+    model = {
+      repo_id = "chatterbox-multilingual"
+    }
+    tts_engine = {
+      device                 = "cuda"
+      predefined_voices_path = "/data/voices"
+      reference_audio_path   = "/data/reference_audio"
+    }
+  })
+
+  # Shared script for the off-peak CronJobs. Reads the in-cluster
+  # gpu_pod_memory_used_bytes gauge (the per-namespace gauge the 2026-06-02
+  # post-mortem built — host-PID attribution, no new exporter needed), sums it,
+  # and computes free = GPU_TOTAL - used. Plain shell + awk; it also needs curl
+  # and kubectl from the CronJob container image (bitnami/kubectl, run via
+  # /bin/bash -c). ACTION is "up" | "down" | "guard".
+  #   up    — scale to 1 ONLY IF free >= FLOOR (positive admission).
+  #   guard — scale to 0 IF free < FLOOR (a resident woke mid-window; yield).
+  #   down  — scale to 0 unconditionally (window end).
+  # Heredoc escaping: only `$${...}` (literal `${...}`) is escaped — Terraform
+  # would otherwise try to interpolate it. Bare `$(...)`, `$((...))` and awk's
+  # `$NF` are literal `$` and pass through unescaped.
+  vram_gate_script = <<-EOT
+    set -eu
+    : "$${ACTION:?}" "$${FLOOR:?}" "$${GPU_TOTAL:?}"
+    METRICS_URL="http://gpu-pod-exporter.nvidia.svc.cluster.local:80/metrics"
+
+    # Scrape the exporter. -f turns a non-200 response into a hard error, and a
+    # blind gate always takes the conservative branch: "up" declines to start,
+    # "guard" yields the card (it cannot prove the card is still free), and
+    # "down" falls through to its unconditional scale-to-0.
+    if ! BODY="$(curl -sf -m 10 "$${METRICS_URL}")"; then
+      echo "WARN: could not scrape $${METRICS_URL}"
+      case "$${ACTION}" in
+        up)
+          echo "preflight: scrape failed -> NOT scaling up (fail-safe)"; exit 0
+          ;;
+        guard)
+          echo "guard: scrape failed -> yielding the card (fail-safe), scaling chatterbox-tts to 0"
+          kubectl -n tts scale deploy/chatterbox-tts --replicas=0
+          exit 0
+          ;;
+      esac
+      # Only "down" reaches this point; it ignores the numbers below.
+      BODY=""
+    fi
+
+    # Sum gpu_pod_memory_used_bytes over every pod EXCEPT Chatterbox's own
+    # (namespace="tts"). FLOOR already contains Chatterbox's peak, so counting
+    # its resident model as "used" would make the guard trip on Chatterbox's own
+    # allocation rather than on a resident waking up. Missing metric / empty
+    # scrape => used=0 (card idle). %.0f prints a plain integer for the shell
+    # arithmetic below regardless of how the awk in the image formats %d.
+    USED="$(printf '%s\n' "$${BODY}" \
+      | awk '/^gpu_pod_memory_used_bytes\{/ && !/namespace="tts"/ { s += $NF } END { printf "%.0f", s }')"
+    USED="$${USED:-0}"
+    FREE="$(( GPU_TOTAL - USED ))"
+    echo "GPU VRAM (excluding tts): used=$${USED} free=$${FREE} floor=$${FLOOR} (total=$${GPU_TOTAL})"
+
+    case "$${ACTION}" in
+      up)
+        if [ "$${FREE}" -ge "$${FLOOR}" ]; then
+          echo "preflight PASS: free >= floor -> scaling chatterbox-tts to 1"
+          kubectl -n tts scale deploy/chatterbox-tts --replicas=1
+        else
+          echo "preflight SKIP: free < floor -> leaving chatterbox-tts at 0 (retry next window)"
+        fi
+        ;;
+      guard)
+        if [ "$${FREE}" -lt "$${FLOOR}" ]; then
+          echo "guard TRIP: free < floor -> yielding the card, scaling chatterbox-tts to 0"
+          kubectl -n tts scale deploy/chatterbox-tts --replicas=0
+        else
+          echo "guard OK: free >= floor -> chatterbox-tts may keep running"
+        fi
+        ;;
+      down)
+        echo "window end -> scaling chatterbox-tts to 0"
+        kubectl -n tts scale deploy/chatterbox-tts --replicas=0
+        ;;
+    esac
+  EOT
+
+  # Common spec for the three off-peak CronJobs. Each runs one bitnami/kubectl
+  # pod (in-cluster SA, no kubeconfig) executing the shared gate script with a
+  # different ACTION. timezone pins the window to Europe/London regardless of
+  # node TZ.
+ offpeak_cronjobs = { + chatterbox-window-up = { + schedule = var.offpeak_window_up_schedule + action = "up" + } + chatterbox-window-down = { + schedule = var.offpeak_window_down_schedule + action = "down" + } + chatterbox-vram-guard = { + schedule = var.offpeak_guard_schedule + action = "guard" + } + } +} + +resource "kubernetes_namespace" "tts" { + metadata { + name = local.namespace + labels = { + tier = local.tiers.gpu + "istio-injection" = "disabled" + "keel.sh/enrolled" = "true" + } + } + lifecycle { + ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]] + } +} + +# Model weights + voices on NFS-SSD (fast load), RWX so a seed Job / kubectl cp +# can write the predefined voices + narrator reference WAV while the Deployment +# mounts it. Path /srv/nfs-ssd/chatterbox on the Proxmox host. Mirrors +# llama-cpp's nfs_models. First start downloads the model into /data/hf_cache +# (HF_HOME below), so weights persist across pod restarts. +module "nfs_models" { + source = "../../modules/kubernetes/nfs_volume" + name = "chatterbox-models" + namespace = kubernetes_namespace.tts.metadata[0].name + nfs_server = "192.168.1.127" + nfs_path = "/srv/nfs-ssd/chatterbox" + storage = "20Gi" # multilingual weights + HF cache + voices headroom +} + +resource "kubernetes_config_map" "chatterbox_config" { + metadata { + name = "chatterbox-config" + namespace = kubernetes_namespace.tts.metadata[0].name + labels = local.labels + } + data = { + "config.yaml" = local.chatterbox_config + } +} + +# Single Deployment running the devnen Chatterbox-TTS-Server (OpenAI-compatible +# /v1/audio/speech). Sits at replicas=0 — the off-peak CronJobs below scale it +# to 1 only when the free-VRAM preflight passes (Option A), and back to 0 at +# window end. wait_for_rollout=false so apply never blocks on a pod that is +# intentionally scaled to 0. 
+resource "kubernetes_deployment" "chatterbox" {
+  metadata {
+    name      = "chatterbox-tts"
+    namespace = kubernetes_namespace.tts.metadata[0].name
+    labels    = merge(local.labels, { tier = local.tiers.gpu })
+  }
+  wait_for_rollout = false
+  spec {
+    # Off-peak control owns the replica count at runtime (CronJobs scale 0<->1).
+    # Declare 0 here so a plain `tg apply` outside the window doesn't wake the
+    # card. ignore_changes on replicas (below) stops apply from fighting the
+    # CronJob's scale.
+    replicas = 0
+    strategy { type = "Recreate" }
+    selector {
+      match_labels = { app = "chatterbox-tts" }
+    }
+    template {
+      metadata {
+        labels = { app = "chatterbox-tts" }
+        annotations = {
+          # Changes whenever the rendered config.yaml changes.
+          "checksum/config" = sha256(local.chatterbox_config)
+        }
+      }
+      spec {
+        node_selector = { "nvidia.com/gpu.present" = "true" }
+        toleration {
+          key      = "nvidia.com/gpu"
+          operator = "Equal"
+          value    = "true"
+          effect   = "NoSchedule"
+        }
+        # C-hardening (§3.RECOMMENDATION.3): Chatterbox is a polite, best-effort
+        # batch tenant — give it the regular tier-2-gpu priority (600000) so it
+        # is ALWAYS the pod evicted under GPU-node pressure, never immich-ml /
+        # frigate / llama-swap. This relies on the `tts` namespace being EXCLUDED
+        # from the Kyverno `inject-gpu-workload-priority` policy (which would
+        # otherwise stamp the immich-equal gpu-workload=1,200,000 priority on any
+        # nvidia.com/gpu pod). That exclusion is the two-line edit to the kyverno
+        # stack flagged in the PR. Without it, this priority_class_name is
+        # overwritten on pod CREATE and Chatterbox would compete as an equal.
+        priority_class_name = "tier-2-gpu"
+
+        image_pull_secrets { name = "registry-credentials" }
+
+        container {
+          name  = "chatterbox-tts"
+          image = local.image
+          port {
+            container_port = 8004
+            name           = "http"
+          }
+
+          # T4 is Turing — NO bf16 (ADR-0004). Pin off; run FP16/FP32.
+          env {
+            name  = "TTS_BF16"
+            value = "off"
+          }
+          # Park the HuggingFace cache on the NFS-SSD PVC so model weights
+          # download once and persist across pod restarts (the pod is recreated
+          # every window). The devnen compose mounts HF cache at /app/hf_cache;
+          # point HF_HOME at the PVC instead.
+          env {
+            name  = "HF_HOME"
+            value = "/data/hf_cache"
+          }
+          env {
+            name  = "HF_HUB_CACHE"
+            value = "/data/hf_cache"
+          }
+
+          volume_mount {
+            name       = "config"
+            mount_path = "/app/config.yaml"
+            sub_path   = "config.yaml"
+          }
+          volume_mount {
+            name       = "models"
+            mount_path = "/data"
+          }
+
+          # /v1/audio/voices is cheap and only 200s once the model is loaded, and
+          # the first start downloads the model, which is slow. The startup
+          # probe gives that load up to 10s + 80 x 15s (about 20 min). Liveness
+          # and readiness stay disabled until it succeeds, so the liveness probe
+          # (otherwise 120s + 5 x 30s) cannot kill the container mid-download
+          # and force the load to start over.
+          startup_probe {
+            http_get {
+              path = "/v1/audio/voices"
+              port = 8004
+            }
+            initial_delay_seconds = 10
+            period_seconds        = 15
+            failure_threshold     = 80
+          }
+          # Same endpoint gates real readiness once the container has started.
+          readiness_probe {
+            http_get {
+              path = "/v1/audio/voices"
+              port = 8004
+            }
+            initial_delay_seconds = 20
+            period_seconds        = 15
+            failure_threshold     = 12
+          }
+          liveness_probe {
+            http_get {
+              path = "/v1/audio/voices"
+              port = 8004
+            }
+            initial_delay_seconds = 120
+            period_seconds        = 30
+            failure_threshold     = 5
+          }
+          resources {
+            requests = {
+              cpu    = "200m"
+              memory = "2Gi"
+            }
+            limits = {
+              memory           = "8Gi"
+              "nvidia.com/gpu" = "1" # ONE time-slice (operator advertises 100), NOT the whole card
+            }
+          }
+        }
+
+        volume {
+          name = "config"
+          config_map {
+            name = kubernetes_config_map.chatterbox_config.metadata[0].name
+          }
+        }
+        volume {
+          name = "models"
+          persistent_volume_claim {
+            claim_name = module.nfs_models.claim_name
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    ignore_changes = [
+      # Off-peak CronJobs own the replica count — don't let apply reset it.
+      spec[0].replicas,
+      spec[0].template[0].spec[0].dns_config,         # KYVERNO_LIFECYCLE_V1
+      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE
+      metadata[0].annotations["keel.sh/match-tag"],
+      metadata[0].annotations["keel.sh/policy"],
+      metadata[0].annotations["keel.sh/trigger"],
+      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
+      metadata[0].annotations["kubernetes.io/change-cause"],
+      metadata[0].annotations["deployment.kubernetes.io/revision"],
+      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"],
+    ]
+  }
+}
+
+resource "kubernetes_service" "chatterbox" {
+  metadata {
+    name      = "chatterbox-tts"
+    namespace = kubernetes_namespace.tts.metadata[0].name
+    labels    = local.labels
+    annotations = {
+      # Prometheus annotation-based scrape (mirrors tripit). The devnen server
+      # has no /metrics; this monitors liveness via the blackbox path and keeps
+      # the Service in the scrape set if a /metrics endpoint is added later.
+      "prometheus.io/scrape" = "true"
+      "prometheus.io/path"   = "/v1/audio/voices"
+      "prometheus.io/port"   = "8000"
+    }
+  }
+  spec {
+    type     = "ClusterIP" # in-cluster only — never ingressed (no token needed)
+    selector = { app = "chatterbox-tts" }
+    port {
+      name        = "http"
+      port        = 8000 # tripit's default TTS_BASE_URL port
+      target_port = 8004 # the devnen server's actual listen port
+    }
+  }
+}
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Option-A off-peak control: SA + Role (scale the Deployment) + RoleBinding +
+# three CronJobs (window-up preflight, mid-window guard, window-down). Mirrors
+# the nextcloud-watchdog in-cluster-kubectl pattern (SA → Role → bitnami/kubectl
+# CronJob, no kubeconfig).
+# ─────────────────────────────────────────────────────────────────────────────
+
+# Identity the three off-peak CronJob pods run as.
+resource "kubernetes_service_account" "offpeak" {
+  metadata {
+    name      = "chatterbox-offpeak"
+    namespace = kubernetes_namespace.tts.metadata[0].name
+  }
+}
+
+resource "kubernetes_role" "offpeak" {
+  metadata {
+    name      = "chatterbox-offpeak"
+    namespace = kubernetes_namespace.tts.metadata[0].name
+  }
+  # Grants get + patch on Deployments AND on their scale subresource in this
+  # namespace, for the gate script's `kubectl scale` calls.
+  # NOTE(review): patch on `deployments` itself is broader than scaling —
+  # confirm whether `kubectl scale` works with only get on `deployments` plus
+  # get/patch on `deployments/scale`, and narrow the rule if so.
+  rule {
+    api_groups = ["apps"]
+    resources  = ["deployments", "deployments/scale"]
+    verbs      = ["get", "patch"]
+  }
+}
+
+# Binds the Role above to the off-peak ServiceAccount.
+resource "kubernetes_role_binding" "offpeak" {
+  metadata {
+    name      = "chatterbox-offpeak"
+    namespace = kubernetes_namespace.tts.metadata[0].name
+  }
+  role_ref {
+    api_group = "rbac.authorization.k8s.io"
+    kind      = "Role"
+    name      = kubernetes_role.offpeak.metadata[0].name
+  }
+  subject {
+    kind      = "ServiceAccount"
+    name      = kubernetes_service_account.offpeak.metadata[0].name
+    namespace = kubernetes_namespace.tts.metadata[0].name
+  }
+}
+
+# One CronJob per entry in local.offpeak_cronjobs: the map key is the CronJob
+# name, `schedule` is its cron expression and `action` becomes the ACTION env
+# var read by the shared gate script.
+resource "kubernetes_cron_job_v1" "offpeak" {
+  for_each = local.offpeak_cronjobs
+
+  metadata {
+    name      = each.key
+    namespace = kubernetes_namespace.tts.metadata[0].name
+    labels    = local.labels
+  }
+  spec {
+    schedule                      = each.value.schedule
+    timezone                      = "Europe/London"
+    concurrency_policy            = "Forbid"
+    starting_deadline_seconds     = 120
+    successful_jobs_history_limit = 1
+    failed_jobs_history_limit     = 3
+    job_template {
+      metadata { labels = local.labels }
+      spec {
+        backoff_limit              = 1
+        active_deadline_seconds    = 120
+        ttl_seconds_after_finished = 300
+        template {
+          metadata { labels = local.labels }
+          spec {
+            service_account_name = kubernetes_service_account.offpeak.metadata[0].name
+            restart_policy       = "Never"
+            container {
+              name = "vram-gate"
+              # NOTE(review): `:latest` is unpinned, and window-down depends on
+              # this image being pullable — consider pinning a tag or digest.
+              image = "bitnami/kubectl:latest"
+              # The whole gate script is passed inline as the bash -c argument.
+              command = ["/bin/bash", "-c", local.vram_gate_script]
+              env {
+                name  = "ACTION"
+                value = each.value.action
+              }
+              env {
+                name  = "FLOOR"
+                value = tostring(var.vram_free_floor_bytes)
+              }
+              env {
+                name  = "GPU_TOTAL"
+                value = tostring(var.gpu_total_bytes)
+              }
+              resources {
+                requests = { cpu = "20m", memory = "64Mi" }
+                limits   = { memory = "128Mi" }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: Kyverno mutates dns_config with ndots=2 on CronJobs.
+    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
+  }
+}
diff --git a/stacks/tts/terragrunt.hcl b/stacks/tts/terragrunt.hcl
new file mode 100644
index 00000000..6bc02966
--- /dev/null
+++ b/stacks/tts/terragrunt.hcl
@@ -0,0 +1,36 @@
+include "root" {
+  path = find_in_parent_folders()
+}
+
+dependency "platform" {
+  config_path  = "../platform"
+  skip_outputs = true
+}
+
+dependency "vault" {
+  config_path  = "../vault"
+  skip_outputs = true
+}
+
+# tts: in-cluster text-to-speech for tripit's "Tour guide" narration.
+# One Deployment of `forgejo.viktorbarzin.me/viktor/chatterbox-tts` (devnen
+# Chatterbox-TTS-Server, OpenAI-compatible /v1/audio/speech) at a single
+# ClusterIP Service `chatterbox-tts.tts.svc:8000` (server listens on 8004;
+# the Service remaps). Requests ONE time-slice of the shared T4
+# (nvidia.com/gpu=1) — a slice, not the card.
+#
+# OOM-avoidance (Option A, docs/plans/2026-06-08-chatterbox-tts-infra.md §3):
+# the Deployment sits at replicas=0; an off-peak CronJob scales it to 1 at the
+# 02:00–06:00 Europe/London window ONLY IF a free-VRAM preflight passes
+# (gpu_pod_memory_used_bytes from gpu-pod-exporter), a guard CronJob yields the
+# card mid-window if a resident wakes, and a window-down CronJob scales back to
+# 0. tripit's bake is best-effort + cached-forever (ADR-0002/0004), so a
+# skipped/aborted window simply backfills next time — no latency SLA.
+# +# Polite-tenant hardening: the `tts` namespace must be EXCLUDED from the kyverno +# `inject-gpu-workload-priority` policy (a separate two-line edit to the kyverno +# stack) so Chatterbox keeps tier-2-gpu priority (600000) and is always the pod +# evicted under pressure — never immich-ml/frigate/llama-swap. +# +# Image is built from the devnen repo + pushed to Forgejo — see this stack's +# README.md for the exact docker build + push commands.