[ci skip] frigate: add liveness/startup probes for GPU recovery

When the GPU becomes unavailable (overloaded, CUDA context corruption),
Frigate silently falls back to CPU detection, burning 4 cores, with no
automatic recovery. Add a liveness probe that checks nvidia-smi and API
health every 60s (3 failures = restart), and a startup probe that allows
up to 5 min for TensorRT model loading.
This commit is contained in:
Viktor Barzin 2026-03-01 20:36:49 +00:00
parent 78d5aeb5db
commit 14a5b4d7d5
No known key found for this signature in database
GPG key ID: 0EB088298288D958

View file

@ -109,6 +109,25 @@ resource "kubernetes_deployment" "frigate" {
name = "media"
mount_path = "/media/frigate"
}
# Restart the container if the GPU is no longer visible or Frigate's API hangs.
# The probe passes only when nvidia-smi exits 0 AND /api/version answers
# without an HTTP error (curl -f turns HTTP >= 400 into a non-zero exit).
liveness_probe {
  exec {
    # --max-time bounds curl itself: an exec probe process is not guaranteed
    # to be killed when timeout_seconds expires, so a hung request could
    # otherwise linger and pile up across probe periods.
    command = ["sh", "-c", "nvidia-smi > /dev/null 2>&1 && curl -sf --max-time 5 http://localhost:5000/api/version > /dev/null"]
  }
  initial_delay_seconds = 120
  period_seconds        = 60
  timeout_seconds       = 10
  failure_threshold     = 3 # 3 consecutive failures (~3 minutes) trigger a restart
}
# Give Frigate time to come up before other probes apply: TensorRT model
# loading can take several minutes.
startup_probe {
  # Budget: 30 attempts x 10 s apart = up to 5 minutes for startup.
  failure_threshold = 30
  period_seconds    = 10

  http_get {
    port = 5000
    path = "/api/version"
  }
}
security_context {
privileged = true
}