infra/stacks/tts/main.tf

# Tag appended to the chatterbox-tts image reference built in locals.image.
variable "image_tag" {
  description = "chatterbox-tts image tag. Use the 8-char git SHA in CI; :latest for local trials."
  type        = string
  default     = "latest"
}

# ─────────────────────────────────────────────────────────────────────────────
# Option-A off-peak control (see docs/plans/2026-06-08-chatterbox-tts-infra.md §3).
# The Deployment sits at replicas=0; a CronJob scales it to 1 at the window start
# ONLY IF a free-VRAM preflight passes, and another scales it back to 0 at window
# end. A guard CronJob yields the card mid-window if free VRAM drops below the
# floor (a resident woke up). tripit's bake is best-effort + idempotent, so a
# skipped/aborted window simply backfills on the next one (ADR-0002/0004).
# ─────────────────────────────────────────────────────────────────────────────

variable "vram_free_floor_bytes" {
  type = number
  # OPEN ITEM — must be measured (§5 smoke test / §3.X). This is the minimum free
  # VRAM the preflight requires before it will scale Chatterbox up, and the floor
  # the guard yields below. Default = 6 GiB ≈ (a conservative guess for
  # chatterbox-multilingual FP16 peak ~4 GiB + ~2 GiB headroom for the
  # read→cudaMalloc race). RAISE/LOWER once the real T4 peak is captured from
  # gpu_pod_memory_used_bytes{namespace="tts"} during a real synth.
  default     = 6442450944
  description = "Minimum free GPU VRAM (bytes) required before scaling Chatterbox up; guard yields below it."

  # The value is passed to the gate script as the FLOOR env var and compared
  # with the shell's integer tests (`[ ... -ge ... ]` / `[ ... -lt ... ]`). A
  # fractional value makes those tests error out, which the `if` treats as
  # "false" — i.e. the preflight never admits and the guard never yields.
  # Reject such values at plan time instead.
  validation {
    condition     = var.vram_free_floor_bytes > 0 && floor(var.vram_free_floor_bytes) == var.vram_free_floor_bytes
    error_message = "The vram_free_floor_bytes value must be a positive whole number of bytes."
  }
}

# Total VRAM of the shared card; the gate script receives it as GPU_TOTAL and
# subtracts the summed per-pod usage from it to obtain the free figure.
variable "gpu_total_bytes" {
  type        = number
  default     = 17179869184 # Tesla T4 = 16 GiB
  description = "Total VRAM on the shared GPU. Free = this minus sum(gpu_pod_memory_used_bytes)."

  # GPU_TOTAL is used inside shell integer arithmetic ($(( GPU_TOTAL - USED )));
  # a fractional or non-positive value would abort the gate script at runtime,
  # so fail at plan time instead.
  validation {
    condition     = var.gpu_total_bytes > 0 && floor(var.gpu_total_bytes) == var.gpu_total_bytes
    error_message = "The gpu_total_bytes value must be a positive whole number of bytes."
  }
}

# Window start. Evaluated in the timezone set on the CronJob resource
# (Europe/London), so the default means 02:00 local time.
variable "offpeak_window_up_schedule" {
  description = "Cron schedule that fires the free-VRAM preflight + scale-up at window start."
  type        = string
  default     = "0 2 * * *"
}

# Window end. Evaluated in the timezone set on the CronJob resource
# (Europe/London), so the default means 06:00 local time.
variable "offpeak_window_down_schedule" {
  description = "Cron schedule that scales Chatterbox back to 0 at window end."
  type        = string
  default     = "0 6 * * *"
}

# Mid-window guard cadence. The default fires every 5 minutes during hours
# 02 through 05, i.e. from 02:00 up to 05:55, inside the 02:00–06:00 window.
variable "offpeak_guard_schedule" {
  description = "Cron schedule for the mid-window guard that yields the card if free VRAM drops."
  type        = string
  default     = "*/5 2-5 * * *"
}

locals {
  namespace = "tts"
  labels    = { app = "chatterbox-tts" }
  image     = "forgejo.viktorbarzin.me/viktor/chatterbox-tts:${var.image_tag}"

  # config.yaml rendered into a ConfigMap, mounted at /app/config.yaml (the
  # server's WORKDIR is /app). Voices, reference audio and the HF model cache
  # all live on the NFS-SSD PVC (mounted at /data) so weights persist across
  # restarts and load fast. server.port stays at the devnen default 8004; the
  # Service remaps 8000->8004 so tripit's default TTS_BASE_URL works unchanged.
  #
  # model.repo_id = chatterbox-multilingual (ADR-0004; 23 languages for
  # worldwide place-names). If the measured T4 VRAM peak is too high to coexist
  # even off-peak, fall back to "chatterbox" (English, lighter) — a one-line
  # change here (§3.X / §6 decision 3).
  chatterbox_config = yamlencode({
    server = {
      host = "0.0.0.0"
      port = 8004
    }
    model = {
      repo_id = "chatterbox-multilingual"
    }
    tts_engine = {
      device                 = "cuda"
      predefined_voices_path = "/data/voices"
      reference_audio_path   = "/data/reference_audio"
    }
  })

  # Shared script for the off-peak CronJobs. Reads the in-cluster
  # gpu_pod_memory_used_bytes gauge (the per-namespace gauge the 2026-06-02
  # post-mortem built — host-PID attribution, no new exporter needed), sums the
  # usage of every OTHER tenant, and computes free = GPU_TOTAL - used. Runs
  # under bash in the CronJob's kubectl image, which must provide curl and awk.
  # ACTION is "up" | "down" | "guard".
  #   up    — scale to 1 ONLY IF free >= FLOOR (positive admission).
  #   guard — scale to 0 IF free < FLOOR (a resident woke mid-window; yield).
  #   down  — scale to 0 unconditionally (window end).
  #
  # Chatterbox's own series (namespace="tts") are excluded from the sum. FLOOR
  # is sized as "Chatterbox peak + headroom", so counting Chatterbox's own
  # footprint as "used" would make the guard trip as soon as the model is
  # loaded, even when no resident has woken up.
  # NOTE(review): assumes the exporter labels each series with
  # namespace="<pod namespace>"; if the label is named differently the filter
  # matches nothing and the sum covers all pods — confirm against a real scrape.
  #
  # Scrape failure is handled fail-safe per action: "up" does not admit,
  # "guard" yields (it cannot prove the card is still free), "down" proceeds.
  #
  # Heredoc escaping: only `$${...}` (literal `${...}`) is escaped — Terraform
  # would otherwise try to interpolate it. Bare `$(...)`, `$((...))` and awk's
  # `$NF` are literal `$` and pass through unescaped.
  vram_gate_script = <<-EOT
    set -eu
    : "$${ACTION:?}" "$${FLOOR:?}" "$${GPU_TOTAL:?}"
    METRICS_URL="http://gpu-pod-exporter.nvidia.svc.cluster.local:80/metrics"

    # -f makes a non-200 scrape a hard error, handled conservatively below.
    if ! BODY="$(curl -sf -m 10 "$${METRICS_URL}")"; then
      echo "WARN: could not scrape $${METRICS_URL}"
      case "$${ACTION}" in
        up)
          echo "preflight: scrape failed -> NOT scaling up (fail-safe)"; exit 0
          ;;
        guard)
          # Blind guard: we cannot show the card is still free, so yield.
          echo "guard: scrape failed -> yielding the card, scaling chatterbox-tts to 0 (fail-safe)"
          kubectl -n tts scale deploy/chatterbox-tts --replicas=0
          exit 0
          ;;
      esac
      # For down a failed scrape must NOT block yielding the card.
      BODY=""
    fi

    # Sum gpu_pod_memory_used_bytes across all pods except our own namespace.
    # Missing metric / empty scrape => used=0 (card idle). %.0f (not %d) keeps
    # multi-GiB totals intact on awk builds whose %d is limited to 32 bits.
    USED="$(printf '%s\n' "$${BODY}" \
      | awk '/^gpu_pod_memory_used_bytes\{/ && !/namespace="tts"/ { s += $NF } END { printf "%.0f", s }')"
    USED="$${USED:-0}"
    FREE="$(( GPU_TOTAL - USED ))"
    echo "GPU VRAM: used(others)=$${USED} free=$${FREE} floor=$${FLOOR} (total=$${GPU_TOTAL})"

    case "$${ACTION}" in
      up)
        if [ "$${FREE}" -ge "$${FLOOR}" ]; then
          echo "preflight PASS: free >= floor -> scaling chatterbox-tts to 1"
          kubectl -n tts scale deploy/chatterbox-tts --replicas=1
        else
          echo "preflight SKIP: free < floor -> leaving chatterbox-tts at 0 (retry next window)"
        fi
        ;;
      guard)
        if [ "$${FREE}" -lt "$${FLOOR}" ]; then
          echo "guard TRIP: free < floor -> yielding the card, scaling chatterbox-tts to 0"
          kubectl -n tts scale deploy/chatterbox-tts --replicas=0
        else
          echo "guard OK: free >= floor -> chatterbox-tts may keep running"
        fi
        ;;
      down)
        echo "window end -> scaling chatterbox-tts to 0"
        kubectl -n tts scale deploy/chatterbox-tts --replicas=0
        ;;
      *)
        # A typo in ACTION must not look like a successful, silent no-op.
        echo "ERROR: unknown ACTION '$${ACTION}' (expected up|guard|down)" >&2
        exit 2
        ;;
    esac
  EOT

  # Common spec for the three off-peak CronJobs. Each runs one bitnami/kubectl
  # pod (in-cluster SA, no kubeconfig) executing the shared gate script with a
  # different ACTION. timezone pins the window to Europe/London regardless of
  # node TZ.
  offpeak_cronjobs = {
    chatterbox-window-up = {
      schedule = var.offpeak_window_up_schedule
      action   = "up"
    }
    chatterbox-window-down = {
      schedule = var.offpeak_window_down_schedule
      action   = "down"
    }
    chatterbox-vram-guard = {
      schedule = var.offpeak_guard_schedule
      action   = "guard"
    }
  }
}

# Namespace that holds every object in this stack. Its name comes from
# local.namespace; the tier label value comes from local.tiers.gpu, which is
# declared outside this file section.
resource "kubernetes_namespace" "tts" {
  metadata {
    name = local.namespace
    labels = {
      "keel.sh/enrolled" = "true"
      "istio-injection"  = "disabled"
      tier               = local.tiers.gpu
    }
  }

  lifecycle {
    # This label is not declared above, so it is ignored to avoid plan drift
    # (presumably it is stamped on the namespace by Goldilocks — verify).
    ignore_changes = [
      metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"],
    ]
  }
}

# Model weights + voices on NFS-SSD (fast load), RWX so a seed Job / kubectl cp
# can write the predefined voices + narrator reference WAV while the Deployment
# mounts it. Path /srv/nfs-ssd/chatterbox on the Proxmox host. Mirrors
# llama-cpp's nfs_models. First start downloads the model into /data/hf_cache
# (HF_HOME below), so weights persist across pod restarts.
# NFS-backed volume for the Chatterbox data directory. The Deployment mounts
# the claim this module exposes (module.nfs_models.claim_name) at /data.
module "nfs_models" {
  source = "../../modules/kubernetes/nfs_volume"

  name      = "chatterbox-models"
  namespace = kubernetes_namespace.tts.metadata[0].name

  # Export location on the NFS server.
  nfs_server = "192.168.1.127"
  nfs_path   = "/srv/nfs-ssd/chatterbox"

  # Sized for multilingual weights + HF cache + voices headroom.
  storage = "20Gi"
}

# Publishes the YAML rendered in local.chatterbox_config under the key
# "config.yaml"; the Deployment mounts that key at /app/config.yaml.
resource "kubernetes_config_map" "chatterbox_config" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-config"
    labels    = local.labels
  }

  data = { "config.yaml" = local.chatterbox_config }
}

# Single Deployment running the devnen Chatterbox-TTS-Server (OpenAI-compatible
# /v1/audio/speech). Sits at replicas=0 — the off-peak CronJobs below scale it
# to 1 only when the free-VRAM preflight passes (Option A), and back to 0 at
# window end. wait_for_rollout=false so apply never blocks on a pod that is
# intentionally scaled to 0.
# Chatterbox-TTS-Server Deployment. Declared at replicas=0; the replica count
# is owned at runtime by the off-peak CronJobs (see ignore_changes below).
# wait_for_rollout=false so apply never blocks on a pod that is intentionally
# scaled to 0.
resource "kubernetes_deployment" "chatterbox" {
  metadata {
    name      = "chatterbox-tts"
    namespace = kubernetes_namespace.tts.metadata[0].name
    labels    = merge(local.labels, { tier = local.tiers.gpu })
  }
  wait_for_rollout = false
  spec {
    # Off-peak control owns the replica count at runtime (CronJobs scale 0<->1).
    # Declare 0 here so a plain `tg apply` outside the window doesn't wake the
    # card. ignore_changes on replicas (below) stops apply from fighting the
    # CronJob's scale.
    replicas = 0
    strategy { type = "Recreate" }
    selector {
      match_labels = { app = "chatterbox-tts" }
    }
    template {
      metadata {
        labels = { app = "chatterbox-tts" }
        annotations = {
          # Changing the rendered config changes the pod template, forcing a
          # new rollout so the pod picks up the new ConfigMap content.
          "checksum/config" = sha256(local.chatterbox_config)
        }
      }
      spec {
        node_selector = { "nvidia.com/gpu.present" = "true" }
        toleration {
          key      = "nvidia.com/gpu"
          operator = "Equal"
          value    = "true"
          effect   = "NoSchedule"
        }
        # C-hardening (§3.RECOMMENDATION.3): Chatterbox is a polite, best-effort
        # batch tenant — give it the regular tier-2-gpu priority (600000) so it
        # is ALWAYS the pod evicted under GPU-node pressure, never immich-ml /
        # frigate / llama-swap. This relies on the `tts` namespace being EXCLUDED
        # from the Kyverno `inject-gpu-workload-priority` policy (which would
        # otherwise stamp the immich-equal gpu-workload=1,200,000 priority on any
        # nvidia.com/gpu pod). That exclusion is the two-line edit to the kyverno
        # stack flagged in the PR. Without it, this priority_class_name is
        # overwritten on pod CREATE and Chatterbox would compete as an equal.
        priority_class_name = "tier-2-gpu"

        image_pull_secrets { name = "registry-credentials" }

        container {
          name  = "chatterbox-tts"
          image = local.image
          port {
            container_port = 8004
            name           = "http"
          }

          # T4 is Turing — NO bf16 (ADR-0004). Pin off; run FP16/FP32.
          env {
            name  = "TTS_BF16"
            value = "off"
          }
          # Park the HuggingFace cache on the NFS-SSD PVC so model weights
          # download once and persist across pod restarts (the pod is recreated
          # every window). The devnen compose mounts HF cache at /app/hf_cache;
          # point HF_HOME at the PVC instead.
          env {
            name  = "HF_HOME"
            value = "/data/hf_cache"
          }
          env {
            name  = "HF_HUB_CACHE"
            value = "/data/hf_cache"
          }

          volume_mount {
            name       = "config"
            mount_path = "/app/config.yaml"
            sub_path   = "config.yaml"
          }
          volume_mount {
            name       = "models"
            mount_path = "/data"
          }

          # /v1/audio/voices is cheap and only 200s once the model is loaded.
          # The first start downloads the model and every start loads it from
          # NFS, either of which can outlast the liveness budget below
          # (120s + 5 x 30s). The startup probe holds liveness and readiness
          # back until the endpoint answers once, allowing up to
          # 90 x 10s = 15 min for startup, so a slow load is not killed and
          # restarted in a loop.
          startup_probe {
            http_get {
              path = "/v1/audio/voices"
              port = 8004
            }
            period_seconds    = 10
            failure_threshold = 90
          }
          # Gates Service traffic on the model actually being loaded.
          readiness_probe {
            http_get {
              path = "/v1/audio/voices"
              port = 8004
            }
            initial_delay_seconds = 20
            period_seconds        = 15
            failure_threshold     = 12
          }
          # Restarts a server that stops answering after a successful startup.
          liveness_probe {
            http_get {
              path = "/v1/audio/voices"
              port = 8004
            }
            initial_delay_seconds = 120
            period_seconds        = 30
            failure_threshold     = 5
          }
          resources {
            requests = {
              cpu    = "200m"
              memory = "2Gi"
            }
            limits = {
              memory           = "8Gi"
              "nvidia.com/gpu" = "1" # ONE time-slice (operator advertises 100), NOT the whole card
            }
          }
        }

        volume {
          name = "config"
          config_map {
            name = kubernetes_config_map.chatterbox_config.metadata[0].name
          }
        }
        volume {
          name = "models"
          persistent_volume_claim {
            claim_name = module.nfs_models.claim_name
          }
        }
      }
    }
  }
  lifecycle {
    ignore_changes = [
      # Off-peak CronJobs own the replica count — don't let apply reset it.
      spec[0].replicas,
      spec[0].template[0].spec[0].dns_config,         # KYVERNO_LIFECYCLE_V1
      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE
      metadata[0].annotations["keel.sh/match-tag"],
      metadata[0].annotations["keel.sh/policy"],
      metadata[0].annotations["keel.sh/trigger"],
      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
      metadata[0].annotations["kubernetes.io/change-cause"],
      metadata[0].annotations["deployment.kubernetes.io/revision"],
      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"],
    ]
  }
}

# In-cluster Service in front of the Chatterbox pods. Exposes port 8000 and
# forwards to the container's listen port 8004; selects pods by the
# app=chatterbox-tts label.
resource "kubernetes_service" "chatterbox" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-tts"
    labels    = local.labels

    # Annotation-based Prometheus scrape (mirrors tripit). The devnen server
    # has no /metrics; the annotated path is the voices endpoint, which keeps
    # the Service in the scrape set if a /metrics endpoint is added later.
    # NOTE(review): the annotated port is the Service port (8000), not the pod
    # port (8004) — confirm the scrape job targets the Service and not pod IPs.
    annotations = {
      "prometheus.io/scrape" = "true"
      "prometheus.io/port"   = "8000"
      "prometheus.io/path"   = "/v1/audio/voices"
    }
  }

  spec {
    # ClusterIP: reachable only from inside the cluster — never ingressed
    # (no token needed).
    type     = "ClusterIP"
    selector = { app = "chatterbox-tts" }

    port {
      name        = "http"
      target_port = 8004 # the devnen server's actual listen port
      port        = 8000 # tripit's default TTS_BASE_URL port
    }
  }
}

# ─────────────────────────────────────────────────────────────────────────────
# Option-A off-peak control: SA + Role (scale the Deployment) + RoleBinding +
# three CronJobs (window-up preflight, mid-window guard, window-down). Mirrors
# the nextcloud-watchdog in-cluster-kubectl pattern (SA → Role → bitnami/kubectl
# CronJob, no kubeconfig).
# ─────────────────────────────────────────────────────────────────────────────

# Identity the off-peak CronJob pods run as; permissions are attached through
# the Role/RoleBinding of the same name.
resource "kubernetes_service_account" "offpeak" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-offpeak"
  }
}

# Least-privilege Role for the off-peak gate. The gate only runs
# `kubectl scale deploy/chatterbox-tts`, so it needs to read that one
# Deployment and patch its scale subresource — nothing else. Granting `patch`
# on `deployments` itself would let the job rewrite the whole Deployment spec
# (image, command, ...), so the verbs are split per resource and both rules are
# pinned to the single Deployment by name.
resource "kubernetes_role" "offpeak" {
  metadata {
    name      = "chatterbox-offpeak"
    namespace = kubernetes_namespace.tts.metadata[0].name
  }
  # Read-only access to the Deployment object (kubectl resolves it before scaling).
  rule {
    api_groups     = ["apps"]
    resources      = ["deployments"]
    resource_names = ["chatterbox-tts"]
    verbs          = ["get"]
  }
  # get + patch on the deployment scale subresource is all the gate needs.
  rule {
    api_groups     = ["apps"]
    resources      = ["deployments/scale"]
    resource_names = ["chatterbox-tts"]
    verbs          = ["get", "patch"]
  }
}

# Grants the chatterbox-offpeak Role to the chatterbox-offpeak ServiceAccount
# within the tts namespace.
resource "kubernetes_role_binding" "offpeak" {
  metadata {
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = "chatterbox-offpeak"
  }

  # Who receives the permissions.
  subject {
    kind      = "ServiceAccount"
    namespace = kubernetes_namespace.tts.metadata[0].name
    name      = kubernetes_service_account.offpeak.metadata[0].name
  }

  # Which permissions are granted.
  role_ref {
    kind      = "Role"
    api_group = "rbac.authorization.k8s.io"
    name      = kubernetes_role.offpeak.metadata[0].name
  }
}

# One CronJob per entry in local.offpeak_cronjobs (window-up, window-down,
# vram-guard). Each runs the shared gate script in a kubectl image under the
# chatterbox-offpeak ServiceAccount, differing only in schedule and ACTION.
resource "kubernetes_cron_job_v1" "offpeak" {
  for_each = local.offpeak_cronjobs

  metadata {
    name      = each.key
    namespace = kubernetes_namespace.tts.metadata[0].name
    labels    = local.labels
  }
  spec {
    schedule                      = each.value.schedule
    timezone                      = "Europe/London"
    concurrency_policy            = "Forbid"
    starting_deadline_seconds     = 120
    successful_jobs_history_limit = 1
    failed_jobs_history_limit     = 3
    job_template {
      # Jobs and their pods deliberately do NOT carry app=chatterbox-tts: the
      # chatterbox-tts Service selects pods on exactly that label, so a running
      # gate pod would be added to the Service endpoints (nothing listens on
      # 8004 in it) and could receive TTS requests while it runs.
      metadata { labels = { app = "chatterbox-offpeak" } }
      spec {
        backoff_limit              = 1
        active_deadline_seconds    = 120
        ttl_seconds_after_finished = 300
        template {
          metadata { labels = { app = "chatterbox-offpeak" } }
          spec {
            service_account_name = kubernetes_service_account.offpeak.metadata[0].name
            restart_policy       = "Never"
            container {
              name    = "vram-gate"
              image   = "bitnami/kubectl:latest"
              command = ["/bin/bash", "-c", local.vram_gate_script]
              # Inputs the gate script requires (it aborts if any is unset).
              env {
                name  = "ACTION"
                value = each.value.action
              }
              env {
                name  = "FLOOR"
                value = tostring(var.vram_free_floor_bytes)
              }
              env {
                name  = "GPU_TOTAL"
                value = tostring(var.gpu_total_bytes)
              }
              resources {
                requests = { cpu = "20m", memory = "64Mi" }
                limits   = { memory = "128Mi" }
              }
            }
          }
        }
      }
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno mutates dns_config with ndots=2 on CronJobs.
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
-												feat(tts): Chatterbox TTS stack + off-peak T4 gate, wire tripit narration [ci skip]

New `infra/stacks/tts/` deploys devnen/Chatterbox-TTS-Server (OpenAI-compatible
/v1/audio/speech) as ClusterIP `chatterbox-tts.tts.svc:8000` (server listens on
8004; Service remaps), requesting ONE T4 time-slice. Mirrors stacks/llama-cpp/.

Option A off-peak control (no VRAM isolation on the time-sliced T4 — see
post-mortem 2026-06-02): Deployment sits at replicas=0; three Europe/London
CronJobs own the replica count — `chatterbox-window-up` scales to 1 at 02:00
ONLY IF a free-VRAM preflight passes (sum gpu_pod_memory_used_bytes from
gpu-pod-exporter; free = 16GiB - used >= floor), `chatterbox-vram-guard` yields
the card mid-window if a resident wakes, `chatterbox-window-down` scales to 0 at
06:00. tripit's bake is best-effort + cached-forever (ADR-0002/0004) so a
skipped/aborted window backfills next time. SA+Role+RoleBinding grant the
CronJobs deployments/scale (nextcloud-watchdog pattern).

Polite-tenant hardening: kyverno `inject-gpu-workload-priority` now excludes the
`tts` namespace (new `gpu_priority_excluded_namespaces` local) so Chatterbox
keeps tier-2-gpu priority (600k) and is always evicted first under GPU pressure
— never immich-ml/frigate/llama-swap. The LimitRange-fallback policy still uses
the base exclude list (tts untouched there).

tripit: add TTS_MODE=openai_compatible, TTS_BASE_URL, TTS_MODEL=chatterbox to
local.app_env (no token — ClusterIP only). No tripit code change.

Image build is documented in stacks/tts/README.md (devnen cu128 target ->
forgejo.viktorbarzin.me/viktor/chatterbox-tts) — build is impractical inline
(large CUDA image + needs the upstream repo). NOT APPLIED — review branch only.
Free-VRAM floor (var.vram_free_floor_bytes, default 6GiB) must be set from the
measured chatterbox-multilingual T4 peak during the first bake.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

											
										
										
											2026-06-09 07:30:19 +00:00
+								variable "image_tag" {
 								  type        = string
 								  default     = "latest"
 								  description = "chatterbox-tts image tag. Use the 8-char git SHA in CI; :latest for local trials."
 								}
 								# ─────────────────────────────────────────────────────────────────────────────
 								# Option-A off-peak control (see docs/plans/2026-06-08-chatterbox-tts-infra.md §3).
 								# The Deployment sits at replicas=0; a CronJob scales it to 1 at the window start
 								# ONLY IF a free-VRAM preflight passes, and another scales it back to 0 at window
 								# end. A guard CronJob yields the card mid-window if free VRAM drops below the
 								# floor (a resident woke up). tripit's bake is best-effort + idempotent, so a
 								# skipped/aborted window simply backfills on the next one (ADR-0002/0004).
 								# ─────────────────────────────────────────────────────────────────────────────
 								variable "vram_free_floor_bytes" {
 								  type = number
 								  # OPEN ITEM — must be measured (§5 smoke test / §3.X). This is the minimum free
 								  # VRAM the preflight requires before it will scale Chatterbox up, and the floor
 								  # the guard yields below. Default = 6 GiB ≈ (a conservative guess for
 								  # chatterbox-multilingual FP16 peak ~4 GiB + ~2 GiB headroom for the
 								  # read→cudaMalloc race). RAISE/LOWER once the real T4 peak is captured from
 								  # gpu_pod_memory_used_bytes{namespace="tts"} during a real synth.
 								  default     = 6442450944
 								  description = "Minimum free GPU VRAM (bytes) required before scaling Chatterbox up; guard yields below it."
 								}
 								variable "gpu_total_bytes" {
 								  type        = number
 								  default     = 17179869184 # Tesla T4 = 16 GiB
 								  description = "Total VRAM on the shared GPU. Free = this minus sum(gpu_pod_memory_used_bytes)."
 								}
 								variable "offpeak_window_up_schedule" {
 								  type        = string
 								  default     = "0 2 * * *" # 02:00 Europe/London (see timezone on the CronJob)
 								  description = "Cron schedule that fires the free-VRAM preflight + scale-up at window start."
 								}
 								variable "offpeak_window_down_schedule" {
 								  type        = string
 								  default     = "0 6 * * *" # 06:00 Europe/London
 								  description = "Cron schedule that scales Chatterbox back to 0 at window end."
 								}
 								variable "offpeak_guard_schedule" {
 								  type        = string
 								  default     = "*/5 2-5 * * *" # every 5 min inside the 02:00–06:00 window
 								  description = "Cron schedule for the mid-window guard that yields the card if free VRAM drops."
 								}
 								locals {
 								  namespace = "tts"
 								  labels    = { app = "chatterbox-tts" }
 								  image     = "forgejo.viktorbarzin.me/viktor/chatterbox-tts:${var.image_tag}"
 								  # config.yaml rendered into a ConfigMap, mounted at /app/config.yaml (the
 								  # server's WORKDIR is /app). Voices, reference audio and the HF model cache
 								  # all live on the NFS-SSD PVC (mounted at /data) so weights persist across
 								  # restarts and load fast. server.port stays at the devnen default 8004; the
 								  # Service remaps 8000->8004 so tripit's default TTS_BASE_URL works unchanged.
 								  #
 								  # model.repo_id = chatterbox-multilingual (ADR-0004; 23 languages for
 								  # worldwide place-names). If the measured T4 VRAM peak is too high to coexist
 								  # even off-peak, fall back to "chatterbox" (English, lighter) — a one-line
 								  # change here (§3.X / §6 decision 3).
 								  chatterbox_config = yamlencode({
 								    server = {
 								      host = "0.0.0.0"
 								      port = 8004
 								    }
 								    model = {
 								      repo_id = "chatterbox-multilingual"
 								    }
 								    tts_engine = {
 								      device                 = "cuda"
 								      predefined_voices_path = "/data/voices"
 								      reference_audio_path   = "/data/reference_audio"
 								    }
 								  })
 								  # Shared script for the off-peak CronJobs. Reads the in-cluster
 								  # gpu_pod_memory_used_bytes gauge (the per-namespace gauge the 2026-06-02
 								  # post-mortem built — host-PID attribution, no new exporter needed), sums it,
 								  # and computes free = GPU_TOTAL - used. Pure POSIX + awk; curl must be
 								  # available in the CronJob's kubectl image. ACTION is "up" | "down" | "guard".
 								  #   up    — scale to 1 ONLY IF free >= FLOOR (positive admission).
 								  #   guard — scale to 0 IF free < FLOOR (a resident woke mid-window; yield).
 								  #   down  — scale to 0 unconditionally (window end).
 								  # Heredoc escaping: only `$${...}` (literal `${...}`) is escaped — Terraform
 								  # would otherwise try to interpolate it. Bare `$(...)`, `$((...))` and awk's
 								  # `$NF` are literal `$` and pass through unescaped.
 								  vram_gate_script = <<-EOT
 								    set -eu
 								    : "$${ACTION:?}" "$${FLOOR:?}" "$${GPU_TOTAL:?}"
 								    METRICS_URL="http://gpu-pod-exporter.nvidia.svc.cluster.local:80/metrics"
 								    # Sum gpu_pod_memory_used_bytes across all pods. Missing metric / empty
 								    # scrape => used=0 (card idle). -f so a non-200 scrape is a hard error we
 								    # treat conservatively (skip scale-up).
 								    if ! BODY="$(curl -sf -m 10 "$${METRICS_URL}")"; then
 								      echo "WARN: could not scrape $${METRICS_URL}"
 								      if [ "$${ACTION}" = "up" ]; then
 								        echo "preflight: scrape failed -> NOT scaling up (fail-safe)"; exit 0
 								      fi
 								      # For down/guard a failed scrape must NOT block yielding the card.
 								      BODY=""
 								    fi
 								    USED="$(printf '%s\n' "$${BODY}" \
 								      | awk '/^gpu_pod_memory_used_bytes\{/ { s += $NF } END { printf "%d", s }')"
 								    USED="$${USED:-0}"
 								    FREE="$(( GPU_TOTAL - USED ))"
 								    echo "GPU VRAM: used=$${USED} free=$${FREE} floor=$${FLOOR} (total=$${GPU_TOTAL})"
 								    case "$${ACTION}" in
 								      up)
 								        if [ "$${FREE}" -ge "$${FLOOR}" ]; then
 								          echo "preflight PASS: free >= floor -> scaling chatterbox-tts to 1"
 								          kubectl -n tts scale deploy/chatterbox-tts --replicas=1
 								        else
 								          echo "preflight SKIP: free < floor -> leaving chatterbox-tts at 0 (retry next window)"
 								        fi
 								        ;;
 								      guard)
 								        if [ "$${FREE}" -lt "$${FLOOR}" ]; then
 								          echo "guard TRIP: free < floor -> yielding the card, scaling chatterbox-tts to 0"
 								          kubectl -n tts scale deploy/chatterbox-tts --replicas=0
 								        else
 								          echo "guard OK: free >= floor -> chatterbox-tts may keep running"
 								        fi
 								        ;;
 								      down)
 								        echo "window end -> scaling chatterbox-tts to 0"
 								        kubectl -n tts scale deploy/chatterbox-tts --replicas=0
 								        ;;
 								    esac
 								  EOT
 								  # Common spec for the three off-peak CronJobs. Each runs one bitnami/kubectl
 								  # pod (in-cluster SA, no kubeconfig) executing the shared gate script with a
 								  # different ACTION. timezone pins the window to Europe/London regardless of
 								  # node TZ.
 								  offpeak_cronjobs = {
 								    chatterbox-window-up = {
 								      schedule = var.offpeak_window_up_schedule
 								      action   = "up"
 								    }
 								    chatterbox-window-down = {
 								      schedule = var.offpeak_window_down_schedule
 								      action   = "down"
 								    }
 								    chatterbox-vram-guard = {
 								      schedule = var.offpeak_guard_schedule
 								      action   = "guard"
 								    }
 								  }
 								}
 								resource "kubernetes_namespace" "tts" {
 								  metadata {
 								    name = local.namespace
 								    labels = {
 								      tier               = local.tiers.gpu
 								      "istio-injection"  = "disabled"
 								      "keel.sh/enrolled" = "true"
 								    }
 								  }
 								  lifecycle {
 								    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
 								  }
 								}
 								# Model weights + voices on NFS-SSD (fast load), RWX so a seed Job / kubectl cp
 								# can write the predefined voices + narrator reference WAV while the Deployment
 								# mounts it. Path /srv/nfs-ssd/chatterbox on the Proxmox host. Mirrors
 								# llama-cpp's nfs_models. First start downloads the model into /data/hf_cache
 								# (HF_HOME below), so weights persist across pod restarts.
 								module "nfs_models" {
 								  source     = "../../modules/kubernetes/nfs_volume"
 								  name       = "chatterbox-models"
 								  namespace  = kubernetes_namespace.tts.metadata[0].name
 								  nfs_server = "192.168.1.127"
 								  nfs_path   = "/srv/nfs-ssd/chatterbox"
 								  storage    = "20Gi" # multilingual weights + HF cache + voices headroom
 								}
 								resource "kubernetes_config_map" "chatterbox_config" {
 								  metadata {
 								    name      = "chatterbox-config"
 								    namespace = kubernetes_namespace.tts.metadata[0].name
 								    labels    = local.labels
 								  }
 								  data = {
 								    "config.yaml" = local.chatterbox_config
 								  }
 								}
 								# Single Deployment running the devnen Chatterbox-TTS-Server (OpenAI-compatible
 								# /v1/audio/speech). Sits at replicas=0 — the off-peak CronJobs below scale it
 								# to 1 only when the free-VRAM preflight passes (Option A), and back to 0 at
 								# window end. wait_for_rollout=false so apply never blocks on a pod that is
 								# intentionally scaled to 0.
 								resource "kubernetes_deployment" "chatterbox" {
 								  metadata {
 								    name      = "chatterbox-tts"
 								    namespace = kubernetes_namespace.tts.metadata[0].name
 								    labels    = merge(local.labels, { tier = local.tiers.gpu })
 								  }
 								  wait_for_rollout = false
 								  spec {
 								    # Off-peak control owns the replica count at runtime (CronJobs scale 0<->1).
 								    # Declare 0 here so a plain `tg apply` outside the window doesn't wake the
 								    # card. ignore_changes on replicas (below) stops apply from fighting the
 								    # CronJob's scale.
 								    replicas = 0
 								    strategy { type = "Recreate" }
 								    selector {
 								      match_labels = { app = "chatterbox-tts" }
 								    }
 								    template {
 								      metadata {
 								        labels = { app = "chatterbox-tts" }
 								        annotations = {
 								          "checksum/config" = sha256(local.chatterbox_config)
 								        }
 								      }
 								      spec {
 								        node_selector = { "nvidia.com/gpu.present" = "true" }
 								        toleration {
 								          key      = "nvidia.com/gpu"
 								          operator = "Equal"
 								          value    = "true"
 								          effect   = "NoSchedule"
 								        }
 								        # C-hardening (§3.RECOMMENDATION.3): Chatterbox is a polite, best-effort
 								        # batch tenant — give it the regular tier-2-gpu priority (600000) so it
 								        # is ALWAYS the pod evicted under GPU-node pressure, never immich-ml /
 								        # frigate / llama-swap. This relies on the `tts` namespace being EXCLUDED
 								        # from the Kyverno `inject-gpu-workload-priority` policy (which would
 								        # otherwise stamp the immich-equal gpu-workload=1,200,000 priority on any
 								        # nvidia.com/gpu pod). That exclusion is the two-line edit to the kyverno
 								        # stack flagged in the PR. Without it, this priority_class_name is
 								        # overwritten on pod CREATE and Chatterbox would compete as an equal.
 								        priority_class_name = "tier-2-gpu"
 								        image_pull_secrets { name = "registry-credentials" }
 								        container {
 								          name  = "chatterbox-tts"
 								          image = local.image
 								          port {
 								            container_port = 8004
 								            name           = "http"
 								          }
 								          # T4 is Turing — NO bf16 (ADR-0004). Pin off; run FP16/FP32.
 								          env {
 								            name  = "TTS_BF16"
 								            value = "off"
 								          }
 								          # Park the HuggingFace cache on the NFS-SSD PVC so model weights
 								          # download once and persist across pod restarts (the pod is recreated
 								          # every window). The devnen compose mounts HF cache at /app/hf_cache;
 								          # point HF_HOME at the PVC instead.
 								          env {
 								            name  = "HF_HOME"
 								            value = "/data/hf_cache"
 								          }
 								          env {
 								            name  = "HF_HUB_CACHE"
 								            value = "/data/hf_cache"
 								          }
 								          volume_mount {
 								            name       = "config"
 								            mount_path = "/app/config.yaml"
 								            sub_path   = "config.yaml"
 								          }
 								          volume_mount {
 								            name       = "models"
 								            mount_path = "/data"
 								          }
 								          # /v1/audio/voices is cheap and only 200s once the model is loaded —
 								          # so it gates real readiness. First start downloads the model, which
 								          # is slow; the generous failure_threshold absorbs that.
 								          readiness_probe {
 								            http_get {
 								              path = "/v1/audio/voices"
 								              port = 8004
 								            }
 								            initial_delay_seconds = 20
 								            period_seconds        = 15
 								            failure_threshold     = 12
 								          }
 								          liveness_probe {
 								            http_get {
 								              path = "/v1/audio/voices"
 								              port = 8004
 								            }
 								            initial_delay_seconds = 120
 								            period_seconds        = 30
 								            failure_threshold     = 5
 								          }
 								          resources {
 								            requests = {
 								              cpu    = "200m"
 								              memory = "2Gi"
 								            }
 								            limits = {
 								              memory           = "8Gi"
 								              "nvidia.com/gpu" = "1" # ONE time-slice (operator advertises 100), NOT the whole card
 								            }
 								          }
 								        }
 								        volume {
 								          name = "config"
 								          config_map {
 								            name = kubernetes_config_map.chatterbox_config.metadata[0].name
 								          }
 								        }
 								        volume {
 								          name = "models"
 								          persistent_volume_claim {
 								            claim_name = module.nfs_models.claim_name
 								          }
 								        }
 								      }
 								    }
 								  }
 								  lifecycle {
 								    ignore_changes = [
 								      # Off-peak CronJobs own the replica count — don't let apply reset it.
 								      spec[0].replicas,
 								      spec[0].template[0].spec[0].dns_config,         # KYVERNO_LIFECYCLE_V1
 								      spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE
 								      metadata[0].annotations["keel.sh/match-tag"],
 								      metadata[0].annotations["keel.sh/policy"],
 								      metadata[0].annotations["keel.sh/trigger"],
 								      metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
 								      metadata[0].annotations["kubernetes.io/change-cause"],
 								      metadata[0].annotations["deployment.kubernetes.io/revision"],
 								      spec[0].template[0].metadata[0].annotations["keel.sh/update-time"],
 								    ]
 								  }
 								}
 								# In-cluster Service fronting the Chatterbox pods. It exposes port 8000 (the
 								# port tripit's default TTS_BASE_URL expects) and forwards to 8004, where the
 								# devnen server actually listens.
 								resource "kubernetes_service" "chatterbox" {
 								  metadata {
 								    namespace = kubernetes_namespace.tts.metadata[0].name
 								    name      = "chatterbox-tts"
 								    labels    = local.labels
 								    annotations = {
 								      # Annotation-driven Prometheus discovery, same convention as tripit.
 								      # The devnen server does not serve /metrics, so the scrape path is
 								      # pointed at the voices endpoint as a liveness signal; the Service
 								      # also stays in the scrape set should a /metrics endpoint appear later.
 								      # NOTE(review): the annotated port is the Service port (8000), while
 								      # the container listens on 8004. If the scrape job discovers targets
 								      # by endpoints (pod IP) rather than by Service, this would need to be
 								      # 8004 — confirm against the Prometheus scrape config.
 								      "prometheus.io/scrape" = "true"
 								      "prometheus.io/port"   = "8000"
 								      "prometheus.io/path"   = "/v1/audio/voices"
 								    }
 								  }
 								  spec {
 								    # ClusterIP only: reachable from inside the cluster, never ingressed,
 								    # which is why no token is needed.
 								    type     = "ClusterIP"
 								    selector = { app = "chatterbox-tts" }
 								    port {
 								      name        = "http"
 								      target_port = 8004 # the devnen server's actual listen port
 								      port        = 8000 # tripit's default TTS_BASE_URL port
 								    }
 								  }
 								}
 								# ─────────────────────────────────────────────────────────────────────────────
 								# Option-A off-peak control: SA + Role (scale the Deployment) + RoleBinding +
 								# three CronJobs (window-up preflight, mid-window guard, window-down). Mirrors
 								# the nextcloud-watchdog in-cluster-kubectl pattern (SA → Role → bitnami/kubectl
 								# CronJob, no kubeconfig).
 								# ─────────────────────────────────────────────────────────────────────────────
 								# Identity the off-peak CronJob pods run as; its permissions come from the
 								# "chatterbox-offpeak" Role bound to it in this namespace.
 								resource "kubernetes_service_account" "offpeak" {
 								  metadata {
 								    namespace = kubernetes_namespace.tts.metadata[0].name
 								    name      = "chatterbox-offpeak"
 								  }
 								}
 								# Namespaced Role for the off-peak gate: read and patch access to Deployments
 								# and their scale subresource in the tts namespace, nothing else.
 								resource "kubernetes_role" "offpeak" {
 								  metadata {
 								    namespace = kubernetes_namespace.tts.metadata[0].name
 								    name      = "chatterbox-offpeak"
 								  }
 								  # The gate only needs to read the Deployment and patch its replica count.
 								  # NOTE(review): as written, "patch" also applies to the full `deployments`
 								  # resource, not just `deployments/scale`. If the gate script only scales,
 								  # `deployments` could be reduced to "get" — verify against the script
 								  # before tightening.
 								  rule {
 								    api_groups = ["apps"]
 								    resources = [
 								      "deployments",
 								      "deployments/scale",
 								    ]
 								    verbs = [
 								      "get",
 								      "patch",
 								    ]
 								  }
 								}
 								# Grants the chatterbox-offpeak Role to the chatterbox-offpeak ServiceAccount,
 								# both scoped to the tts namespace.
 								resource "kubernetes_role_binding" "offpeak" {
 								  metadata {
 								    namespace = kubernetes_namespace.tts.metadata[0].name
 								    name      = "chatterbox-offpeak"
 								  }
 								  # Who receives the permissions: the ServiceAccount the CronJob pods run as.
 								  subject {
 								    kind      = "ServiceAccount"
 								    namespace = kubernetes_namespace.tts.metadata[0].name
 								    name      = kubernetes_service_account.offpeak.metadata[0].name
 								  }
 								  # What is granted: the namespaced Role defined alongside this binding.
 								  role_ref {
 								    kind      = "Role"
 								    api_group = "rbac.authorization.k8s.io"
 								    name      = kubernetes_role.offpeak.metadata[0].name
 								  }
 								}
 								# One CronJob per entry in local.offpeak_cronjobs. The map key becomes the
 								# CronJob name; each value supplies the cron `schedule` and the `action`
 								# handed to the gate script through the ACTION environment variable.
 								resource "kubernetes_cron_job_v1" "offpeak" {
 								  for_each = local.offpeak_cronjobs
 								  metadata {
 								    namespace = kubernetes_namespace.tts.metadata[0].name
 								    name      = each.key
 								    labels    = local.labels
 								  }
 								  spec {
 								    # Schedules are interpreted in London local time.
 								    timezone = "Europe/London"
 								    schedule = each.value.schedule
 								    # Never overlap runs of the same gate, and give up on a run that
 								    # could not be started within two minutes of its scheduled time.
 								    concurrency_policy        = "Forbid"
 								    starting_deadline_seconds = 120
 								    # Keep a short trail: the last success and the last three failures.
 								    successful_jobs_history_limit = 1
 								    failed_jobs_history_limit     = 3
 								    job_template {
 								      metadata {
 								        labels = local.labels
 								      }
 								      spec {
 								        # One retry at most, a hard two-minute cap per Job, and finished
 								        # Jobs are garbage-collected after five minutes.
 								        backoff_limit              = 1
 								        active_deadline_seconds    = 120
 								        ttl_seconds_after_finished = 300
 								        template {
 								          metadata {
 								            labels = local.labels
 								          }
 								          spec {
 								            restart_policy       = "Never"
 								            service_account_name = kubernetes_service_account.offpeak.metadata[0].name
 								            container {
 								              name = "vram-gate"
 								              # NOTE(review): floating `latest` tag — the kubectl version can
 								              # drift between runs; consider pinning to the cluster's minor.
 								              image = "bitnami/kubectl:latest"
 								              # The gate logic lives in local.vram_gate_script (defined
 								              # elsewhere in this stack) and is run inline through bash.
 								              command = ["/bin/bash", "-c", local.vram_gate_script]
 								              # Which gate action this CronJob performs.
 								              env {
 								                name  = "ACTION"
 								                value = each.value.action
 								              }
 								              # Free-VRAM floor in bytes, passed as a string.
 								              env {
 								                name  = "FLOOR"
 								                value = tostring(var.vram_free_floor_bytes)
 								              }
 								              # Total VRAM of the shared GPU in bytes, passed as a string.
 								              env {
 								                name  = "GPU_TOTAL"
 								                value = tostring(var.gpu_total_bytes)
 								              }
 								              resources {
 								                requests = {
 								                  cpu    = "20m"
 								                  memory = "64Mi"
 								                }
 								                limits = {
 								                  memory = "128Mi"
 								                }
 								              }
 								            }
 								          }
 								        }
 								      }
 								    }
 								  }
 								  lifecycle {
 								    # KYVERNO_LIFECYCLE_V1: Kyverno injects dns_config (ndots=2) into CronJob
 								    # pod specs; ignore it so apply does not fight the mutation.
 								    ignore_changes = [
 								      spec[0].job_template[0].spec[0].template[0].spec[0].dns_config,
 								    ]
 								  }
 								}