From a4eafafe49be45f018eda59ce254be0aa194c0e4 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <vbarzin@gmail.com>
Date: Wed, 22 Apr 2026 14:05:12 +0000
Subject: [PATCH] =?UTF-8?q?[monitoring]=20Add=20GPUNodeUnschedulable=20ale?=
 =?UTF-8?q?rt=20=E2=80=94=20fires=20when=20GPU=20node=20is=20cordoned?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After k8s-node1 was silently cordoned and broke Frigate camera streams,
existing alerts (NvidiaExporterDown, PodUnschedulable) didn't catch the
root cause proactively. This alert fires once the GPU node has stayed
cordoned for 5m, ideally before a restarting pod tries to schedule and fails.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../modules/monitoring/prometheus_chart_values.tpl        | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index c90f2239..a293fee8 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -750,6 +750,14 @@ serverFiles:
               severity: critical
             annotations:
               summary: "NVIDIA GPU exporter is down - no GPU metrics available"
+          - alert: GPUNodeUnschedulable  # watches only the hard-coded GPU node k8s-node1
+            expr: kube_node_spec_unschedulable{node="k8s-node1"} == 1
+            for: 5m  # node must stay cordoned for 5m before the alert fires
+            labels:
+              severity: critical
+              subsystem: gpu
+            annotations:
+              summary: "GPU node {{ $labels.node }} is cordoned — Frigate and GPU workloads cannot schedule"
       - name: Power
         rules:
           - alert: OnBattery