From bd0cb71f175f574fa8f7dc7fdc681ea69c3449bc Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Fri, 12 Jun 2026 20:57:27 +0000
Subject: [PATCH] =?UTF-8?q?tts:=20TCP=20probes=20=E2=80=94=20http=20livene?=
 =?UTF-8?q?ss=20killed=20the=20server=20mid-synthesis?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The devnen server runs chunked synthesis as a blocking call inside its
async handler, so the event loop (and every HTTP probe) hangs for the
whole multi-minute story. Kubelet's HTTP liveness probe (1s timeout)
then killed the container mid-story (exit 137, twice within 10 min of
the first real drain). Each kill reset the engine, so every following
pass started cold and tripit's 120s synthesis budget could never be
met — the queue would never drain.

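TCP probes keep the meaning that matters: uvicorn binds 8004 only
after the model finishes loading in the lifespan hook, so readiness
still gates 'model loaded'. A TCP connect is completed by the kernel
against the listen backlog, so it succeeds even while the event loop
is blocked, and a GPU-busy server is left alive.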

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 stacks/tts/main.tf | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/stacks/tts/main.tf b/stacks/tts/main.tf
index 5056fced..e8c4d76e 100644
--- a/stacks/tts/main.tf
+++ b/stacks/tts/main.tf
@@ -440,12 +440,18 @@ resource "kubernetes_deployment" "chatterbox" {
             mount_path = "/data"
           }
 
-          # /v1/audio/voices is cheap and only 200s once the model is loaded —
-          # so it gates real readiness. First start downloads the model, which
-          # is slow; the generous failure_threshold absorbs that.
+          # TCP probes, deliberately NOT HTTP: the server synthesizes chunks
+          # as a BLOCKING call inside its async handler, so the event loop —
+          # and any HTTP probe — hangs for the whole multi-minute story. The
+          # HTTP liveness probe killed the container mid-synthesis (exit 137,
+          # observed 2026-06-12 20:48–20:53: every drain pass then faced a
+          # cold engine and timed out forever). TCP keeps the original
+          # semantics where it matters: uvicorn only binds 8004 AFTER the
+          # lifespan hook finishes loading the model ("Application startup
+          # complete" precedes "Uvicorn running"), so a TCP readiness pass
+          # still means "model loaded", while a GPU-busy server stays alive.
           readiness_probe {
-            http_get {
-              path = "/v1/audio/voices"
+            tcp_socket {
               port = 8004
             }
             initial_delay_seconds = 20
@@ -453,8 +459,7 @@ resource "kubernetes_deployment" "chatterbox" {
             failure_threshold     = 12
           }
           liveness_probe {
-            http_get {
-              path = "/v1/audio/voices"
+            tcp_socket {
               port = 8004
             }
             initial_delay_seconds = 120