[ci skip] frigate: add liveness/startup probes for GPU recovery
When the GPU becomes unavailable (overloaded, CUDA context corruption), Frigate silently falls back to CPU detection, burning 4 cores with no automatic recovery. Add a liveness probe that checks nvidia-smi and API health every 60s (3 consecutive failures = container restart), and a startup probe that allows up to 5 minutes for TensorRT model loading.
This commit is contained in:
parent
858377e257
commit
ab7c655776
1 changed file with 19 additions and 0 deletions
|
|
@ -109,6 +109,25 @@ resource "kubernetes_deployment" "frigate" {
|
||||||
name = "media"
|
name = "media"
|
||||||
mount_path = "/media/frigate"
|
mount_path = "/media/frigate"
|
||||||
}
|
}
|
||||||
|
# Restart the container if the GPU becomes unavailable (nvidia-smi fails) or Frigate's API stops responding
|
||||||
|
liveness_probe {
|
||||||
|
exec {
|
||||||
|
command = ["sh", "-c", "nvidia-smi > /dev/null 2>&1 && curl -sf http://localhost:5000/api/version > /dev/null"]
|
||||||
|
}
|
||||||
|
initial_delay_seconds = 120
|
||||||
|
period_seconds = 60
|
||||||
|
timeout_seconds = 10
|
||||||
|
failure_threshold = 3
|
||||||
|
}
|
||||||
|
# Hold off liveness checks until the API is up: TensorRT model loading can take several minutes
|
||||||
|
startup_probe {
|
||||||
|
http_get {
|
||||||
|
path = "/api/version"
|
||||||
|
port = 5000
|
||||||
|
}
|
||||||
|
period_seconds = 10
|
||||||
|
failure_threshold = 30 # up to 5 minutes for startup
|
||||||
|
}
|
||||||
security_context {
|
security_context {
|
||||||
privileged = true
|
privileged = true
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue