[ci skip] frigate: add liveness/startup probes for GPU recovery

When the GPU becomes unavailable (overloaded, CUDA context corruption), Frigate silently falls back to CPU detection, burning 4 cores with no automatic recovery. Add a liveness probe that checks nvidia-smi and API health every 60s (3 consecutive failures = restart), and a startup probe that allows up to 5 minutes for TensorRT model loading.
2026-03-01 20:36:49 +00:00 · 2026-03-01 20:36:49 +00:00 · ab7c655776
commit ab7c655776
parent 858377e257
1 changed files with 19 additions and 0 deletions
--- a/stacks/frigate/main.tf
+++ b/stacks/frigate/main.tf
@@ -109,6 +109,25 @@ resource "kubernetes_deployment" "frigate" {
            name       = "media"
            mount_path = "/media/frigate"
          }
          # Liveness check: the container is restarted once the command below
          # has failed on 3 consecutive probes, run 60s apart (10s timeout each).
          # It covers both an unavailable GPU and a hung Frigate API.
          liveness_probe {
            initial_delay_seconds = 120
            period_seconds        = 60
            timeout_seconds       = 10
            failure_threshold     = 3

            exec {
              # nvidia-smi and the local API request must both exit 0;
              # their output is discarded.
              command = [
                "sh",
                "-c",
                "nvidia-smi > /dev/null 2>&1 && curl -sf http://localhost:5000/api/version > /dev/null",
              ]
            }
          }
          # Startup gate: poll GET /api/version on port 5000 every 10s and
          # tolerate up to 30 failures (10s x 30 = 5 minutes), because TensorRT
          # model loading can take several minutes.
          startup_probe {
            period_seconds    = 10
            failure_threshold = 30

            http_get {
              port = 5000
              path = "/api/version"
            }
          }
          security_context {
            privileged = true
          }