fix Frigate GPU stall: add inference speed check to liveness probe

The existing probe only checked nvidia-smi and API availability, which passes even when the detector falls back to CPU. The probe now also queries /api/stats and restarts the pod if any detector's inference speed exceeds 100ms (normal GPU: ~20ms, CPU fallback: 200ms+).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-13 10:23:21 +00:00 · 2026-03-13 10:23:21 +00:00 · dfcef89c35
commit dfcef89c35
parent 50d539908c
1 changed files with 16 additions and 2 deletions
--- a/stacks/frigate/main.tf
+++ b/stacks/frigate/main.tf
@@ -128,10 +128,24 @@ resource "kubernetes_deployment" "frigate" {
            name       = "media"
            mount_path = "/media/frigate"
          }
-          # Restart pod if GPU becomes unavailable or Frigate hangs
+          # Restart pod if GPU becomes unavailable, Frigate hangs, or
          # detector falls back to CPU (inference time spikes from ~20ms to 200ms+)
          liveness_probe {
            exec {
-              command = ["sh", "-c", "nvidia-smi > /dev/null 2>&1 && curl -sf http://localhost:5000/api/version > /dev/null"]
+              command = ["sh", "-c", <<-EOT
                nvidia-smi > /dev/null 2>&1 || exit 1
                STATS=$(curl -sf --max-time 5 http://localhost:5000/api/stats) || exit 1
                echo "$STATS" | python3 -c "
 import sys, json
 stats = json.load(sys.stdin)
 for name, det in stats.get('detectors', {}).items():
    speed = det.get('inference_speed', 0)
    if speed > 100:
        print(f'UNHEALTHY: detector {name} inference {speed}ms > 100ms threshold')
        sys.exit(1)
 "
              EOT
              ]
            }
            initial_delay_seconds = 120
            period_seconds        = 60