[ci skip] frigate: add liveness/startup probes for GPU recovery

When the GPU becomes unavailable (overloaded, CUDA context corruption), Frigate silently falls back to CPU detection, burning 4 cores, with no automatic recovery. Add a liveness probe that checks nvidia-smi and API health every 60s (3 consecutive failures = restart), and a startup probe that allows up to 5 minutes for TensorRT model loading.
2026-03-01 20:36:49 +00:00 · 2026-03-01 20:36:49 +00:00 · 14a5b4d7d5
commit 14a5b4d7d5
parent 78d5aeb5db
1 changed files with 19 additions and 0 deletions
--- a/stacks/frigate/main.tf
+++ b/stacks/frigate/main.tf
@@ -109,6 +109,25 @@ resource "kubernetes_deployment" "frigate" {
            name       = "media"
            mount_path = "/media/frigate"
          }
+          # Restart pod if GPU becomes unavailable or Frigate hangs; each check is time-bounded so a wedged nvidia-smi/curl exits on its own
+          liveness_probe {
+            exec {
+              command = ["sh", "-c", "timeout 6 nvidia-smi > /dev/null 2>&1 && curl -sf --max-time 3 http://localhost:5000/api/version > /dev/null"]
+            }
+            initial_delay_seconds = 120
+            period_seconds        = 60 # probe once a minute
+            timeout_seconds       = 10 # must stay above the 6s + 3s inner bounds
+            failure_threshold     = 3 # 3 consecutive failures trigger a restart
+          }
+          # Allow a long startup window: TensorRT model loading can take several minutes
+          startup_probe {
+            failure_threshold = 30 # 30 attempts x 10s period = up to 5 minutes
+            period_seconds    = 10
+            http_get {
+              port = 5000
+              path = "/api/version"
+            }
+          }
          security_context {
            privileged = true
          }