tts: GPU-gated live narration — demand-gate CronJob + all-day VRAM guard
Viktor asked 'can't we make it live? why the cronjob?' — the overnight window guaranteed VRAM room on the shared T4, but immich/frigate models idle-unload during the day so the card often has room (measured 10.3 GiB free at 01:20). New 'demand' action every 3 min: scale Chatterbox up when tripit's audio queue is non-empty AND free VRAM >= floor; idle it back to 0 when the queue empties (also frees the card early inside the nightly window). Failed metrics scrape fail-safes to no-scale-up, same as the window preflight. The guard moves to all-day */5 — live synthesis can hold the card at any hour, so the yield-on-pressure watchdog must watch at any hour. tripit exposes the unauthenticated in-cluster queue count; a 404 from an older image reads as queued=0 (no-op). The 02:00 window-up stays as the guaranteed nightly catch-up. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
d818f7ed3b
commit
d3d37a15ec
1 changed files with 45 additions and 5 deletions
|
|
@ -44,9 +44,12 @@ variable "offpeak_window_down_schedule" {
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "offpeak_guard_schedule" {
|
variable "offpeak_guard_schedule" {
|
||||||
type = string
|
type = string
|
||||||
default = "*/5 2-5 * * *" # every 5 min inside the 02:00–06:00 window
|
# ALL-DAY since the demand gate (2026-06-12): live synthesis can hold the
|
||||||
description = "Cron schedule for the mid-window guard that yields the card if free VRAM drops."
|
# card at any hour, so the yield-on-VRAM-pressure guard must watch at any
|
||||||
|
# hour too. A guard tick while replicas=0 is a no-op.
|
||||||
|
default = "*/5 * * * *"
|
||||||
|
description = "Cron schedule for the guard that yields the card if free VRAM drops below the floor."
|
||||||
}
|
}
|
||||||
|
|
||||||
locals {
|
locals {
|
||||||
|
|
@ -106,8 +109,8 @@ locals {
|
||||||
# treat conservatively (skip scale-up).
|
# treat conservatively (skip scale-up).
|
||||||
if ! BODY="$(curl -sf -m 10 "$${METRICS_URL}")"; then
|
if ! BODY="$(curl -sf -m 10 "$${METRICS_URL}")"; then
|
||||||
echo "WARN: could not scrape $${METRICS_URL}"
|
echo "WARN: could not scrape $${METRICS_URL}"
|
||||||
if [ "$${ACTION}" = "up" ]; then
|
if [ "$${ACTION}" = "up" ] || [ "$${ACTION}" = "demand" ]; then
|
||||||
echo "preflight: scrape failed -> NOT scaling up (fail-safe)"; exit 0
|
echo "$${ACTION}: scrape failed -> NOT scaling up (fail-safe)"; exit 0
|
||||||
fi
|
fi
|
||||||
# For down/guard a failed scrape must NOT block yielding the card.
|
# For down/guard a failed scrape must NOT block yielding the card.
|
||||||
BODY=""
|
BODY=""
|
||||||
|
|
@ -139,6 +142,29 @@ locals {
|
||||||
echo "window end -> scaling chatterbox-tts to 0"
|
echo "window end -> scaling chatterbox-tts to 0"
|
||||||
kubectl -n tts scale deploy/chatterbox-tts --replicas=0
|
kubectl -n tts scale deploy/chatterbox-tts --replicas=0
|
||||||
;;
|
;;
|
||||||
|
demand)
|
||||||
|
# GPU-gated LIVE narration (tripit#24 amendment, 2026-06-12): scale up
|
||||||
|
# whenever tripit has audio waiting AND the card has room; idle back
|
||||||
|
# down when the queue empties (even inside the nightly window — done is
|
||||||
|
# done, free the card early). The 02:00 window-up stays the guaranteed
|
||||||
|
# nightly catch-up for days the daytime card never had room.
|
||||||
|
QUEUED="$(curl -sf -m 10 "$${QUEUE_URL}" \
|
||||||
|
| sed -n 's/.*"queued"[^0-9]*\([0-9][0-9]*\).*/\1/p')" || QUEUED=""
|
||||||
|
QUEUED="$${QUEUED:-0}"
|
||||||
|
REPLICAS="$(kubectl -n tts get deploy/chatterbox-tts -o jsonpath='{.spec.replicas}')"
|
||||||
|
echo "demand: queued=$${QUEUED} replicas=$${REPLICAS}"
|
||||||
|
if [ "$${QUEUED}" -gt 0 ] && [ "$${REPLICAS}" = "0" ]; then
|
||||||
|
if [ "$${FREE}" -ge "$${FLOOR}" ]; then
|
||||||
|
echo "demand: audio waiting + room on the card -> scaling chatterbox-tts to 1"
|
||||||
|
kubectl -n tts scale deploy/chatterbox-tts --replicas=1
|
||||||
|
else
|
||||||
|
echo "demand: audio waiting but free < floor -> staying down (nightly window catches up)"
|
||||||
|
fi
|
||||||
|
elif [ "$${QUEUED}" -eq 0 ] && [ "$${REPLICAS}" != "0" ]; then
|
||||||
|
echo "demand: queue empty -> idling chatterbox-tts back to 0"
|
||||||
|
kubectl -n tts scale deploy/chatterbox-tts --replicas=0
|
||||||
|
fi
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
|
|
@ -159,7 +185,17 @@ locals {
|
||||||
schedule = var.offpeak_guard_schedule
|
schedule = var.offpeak_guard_schedule
|
||||||
action = "guard"
|
action = "guard"
|
||||||
}
|
}
|
||||||
|
# GPU-gated live narration: every 3 min, scale up when tripit's audio queue
|
||||||
|
# is non-empty and the VRAM preflight passes; idle down when it empties.
|
||||||
|
chatterbox-demand-gate = {
|
||||||
|
schedule = "*/3 * * * *"
|
||||||
|
action = "demand"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# tripit's unauthenticated in-cluster queue probe (count only, non-sensitive).
|
||||||
|
# A 404 from an older tripit image yields QUEUED=0 -> the gate no-ops.
|
||||||
|
tripit_queue_url = "http://tripit.tripit.svc.cluster.local:8080/api/tour/tts-queue"
|
||||||
}
|
}
|
||||||
|
|
||||||
resource "kubernetes_namespace" "tts" {
|
resource "kubernetes_namespace" "tts" {
|
||||||
|
|
@ -463,6 +499,10 @@ resource "kubernetes_cron_job_v1" "offpeak" {
|
||||||
name = "GPU_TOTAL"
|
name = "GPU_TOTAL"
|
||||||
value = tostring(var.gpu_total_bytes)
|
value = tostring(var.gpu_total_bytes)
|
||||||
}
|
}
|
||||||
|
env {
|
||||||
|
name = "QUEUE_URL"
|
||||||
|
value = local.tripit_queue_url
|
||||||
|
}
|
||||||
resources {
|
resources {
|
||||||
requests = { cpu = "20m", memory = "64Mi" }
|
requests = { cpu = "20m", memory = "64Mi" }
|
||||||
limits = { memory = "128Mi" }
|
limits = { memory = "128Mi" }
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue