[ci skip] frigate: add liveness/startup probes for GPU recovery
When the GPU becomes unavailable (overloaded, CUDA context corruption), Frigate silently falls back to CPU detection, burning 4 cores with no automatic recovery. Add a liveness probe that checks nvidia-smi and API health every 60s (3 consecutive failures = restart), and a startup probe that allows up to 5 minutes for TensorRT model loading.
This commit is contained in:
parent
78d5aeb5db
commit
14a5b4d7d5
1 changed files with 19 additions and 0 deletions
|
|
@ -109,6 +109,25 @@ resource "kubernetes_deployment" "frigate" {
|
|||
name = "media"
|
||||
mount_path = "/media/frigate"
|
||||
}
|
||||
# Restart the container if the GPU becomes unavailable or Frigate hangs.
# The check is one shell command: nvidia-smi must exit 0 AND the HTTP request
# to the local API must succeed, otherwise the probe counts as a failure.
|
||||
liveness_probe {
|
||||
exec {
|
||||
command = ["sh", "-c", "nvidia-smi > /dev/null 2>&1 && curl -sf http://localhost:5000/api/version > /dev/null"] # output is discarded; only the exit status matters
|
||||
}
|
||||
initial_delay_seconds = 120
|
||||
period_seconds = 60 # run the check once a minute
|
||||
timeout_seconds = 10 # budget for nvidia-smi + curl together
|
||||
failure_threshold = 3 # 3 consecutive failures (~3 minutes at a 60s period) trigger the restart
|
||||
}
|
||||
# TensorRT model loading can take several minutes
# Polls GET /api/version on port 5000 until it answers. Per Kubernetes probe
# semantics, liveness checks are held off until this probe has succeeded.
|
||||
startup_probe {
|
||||
http_get {
|
||||
path = "/api/version"
|
||||
port = 5000
|
||||
}
|
||||
period_seconds = 10 # poll every 10 seconds
|
||||
failure_threshold = 30 # up to 5 minutes for startup
|
||||
}
|
||||
security_context {
|
||||
privileged = true # NOTE(review): presumably required for GPU device access from the container — confirm
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue