From dfcef89c3521a332986d9ca7af6ae07aef81b0a9 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Fri, 13 Mar 2026 10:23:21 +0000
Subject: [PATCH] fix Frigate GPU stall: add inference speed check to liveness probe

The existing probe only checked nvidia-smi + API availability, which
passes even when the detector falls back to CPU. Now also checks
/api/stats and restarts the pod if inference speed exceeds 100ms
(normal GPU: ~20ms, CPU fallback: 200ms+).

Co-Authored-By: Claude Opus 4.6 (1M context)
---
Reviewer notes (this section is ignored by git am):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; confirm the context lines match stacks/frigate/main.tf
  before applying (or apply with --ignore-whitespace).
- The stats JSON is now piped with printf instead of echo, and the hunk grew
  by one comment line, so the blob id on the index line predates this
  revision.
- curl uses --max-time 5; the probe's timeout_seconds is not visible in this
  hunk -- confirm it is large enough for nvidia-smi + curl + python3.

 stacks/frigate/main.tf | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/stacks/frigate/main.tf b/stacks/frigate/main.tf
index 241f7ce6..8bb0743b 100644
--- a/stacks/frigate/main.tf
+++ b/stacks/frigate/main.tf
@@ -128,10 +128,25 @@ resource "kubernetes_deployment" "frigate" {
             name       = "media"
             mount_path = "/media/frigate"
           }
-          # Restart pod if GPU becomes unavailable or Frigate hangs
+          # Restart pod if GPU becomes unavailable, Frigate hangs, or
+          # detector falls back to CPU (inference time spikes from ~20ms to 200ms+)
           liveness_probe {
             exec {
-              command = ["sh", "-c", "nvidia-smi > /dev/null 2>&1 && curl -sf http://localhost:5000/api/version > /dev/null"]
+              command = ["sh", "-c", <<-EOT
+                nvidia-smi > /dev/null 2>&1 || exit 1
+                STATS=$(curl -sf --max-time 5 http://localhost:5000/api/stats) || exit 1
+                # printf, not echo: some sh builtins (e.g. dash) expand backslash escapes in echo, corrupting the JSON
+                printf '%s\n' "$STATS" | python3 -c "
+import sys, json
+stats = json.load(sys.stdin)
+for name, det in stats.get('detectors', {}).items():
+    speed = det.get('inference_speed', 0)
+    if speed > 100:
+        print(f'UNHEALTHY: detector {name} inference {speed}ms > 100ms threshold')
+        sys.exit(1)
+"
+              EOT
+              ]
             }
             initial_delay_seconds = 120
             period_seconds        = 60