From 14a5b4d7d50e4b681849e4928c4af01581ad57f3 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 1 Mar 2026 20:36:49 +0000 Subject: [PATCH] [ci skip] frigate: add liveness/startup probes for GPU recovery When the GPU becomes unavailable (overloaded, CUDA context corruption), Frigate silently falls back to CPU detection, burning 4 cores with no automatic recovery. Add a liveness probe checking nvidia-smi + API health every 60s (3 failures = restart), and a startup probe allowing up to 5 minutes for TensorRT model loading. --- stacks/frigate/main.tf | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/stacks/frigate/main.tf b/stacks/frigate/main.tf index 8d919bdc..ba28eb34 100644 --- a/stacks/frigate/main.tf +++ b/stacks/frigate/main.tf @@ -109,6 +109,25 @@ resource "kubernetes_deployment" "frigate" { name = "media" mount_path = "/media/frigate" } + # Restart pod if GPU becomes unavailable or Frigate hangs + liveness_probe { + exec { + command = ["sh", "-c", "nvidia-smi > /dev/null 2>&1 && curl -sf http://localhost:5000/api/version > /dev/null"] + } + initial_delay_seconds = 120 + period_seconds = 60 + timeout_seconds = 10 + failure_threshold = 3 + } + # TensorRT model loading can take several minutes + startup_probe { + http_get { + path = "/api/version" + port = 5000 + } + period_seconds = 10 + failure_threshold = 30 # up to 5 minutes for startup + } security_context { privileged = true }