diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index f39b1489..65cfd0a8 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -1290,6 +1290,42 @@ serverFiles:
             annotations:
               summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) cannot pull image"
               description: "Check the deployment's image reference — often a stale tag, a removed registry, or a credentials mismatch. `kubectl -n {{ $labels.namespace }} describe pod {{ $labels.pod }}` shows the pull error."
+          # N-1 capacity check: if any one non-GPU worker (node2/3/4) died, would its
+          # memory requests fit on the OTHER Ready workers (incl. node1 GPU node —
+          # taint is PreferNoSchedule, soft)? Per node, LHS = requests + own Ready free
+          # memory, so the node's own headroom cancels out of the cluster-wide RHS sum.
+          - alert: ClusterCannotTolerateNonGpuNodeLoss
+            expr: |
+              max(
+                sum by (node) (kube_pod_container_resource_requests{resource="memory",unit="byte",node=~"k8s-node[234]"})
+                + on(node) sum by (node) (
+                  clamp_min(
+                    kube_node_status_allocatable{resource="memory",unit="byte",node=~"k8s-node[234]"}
+                    - on(node) group_left() sum by (node) (kube_pod_container_resource_requests{resource="memory",unit="byte",node=~"k8s-node[234]"}),
+                    0
+                  ) * on(node) group_left() max by (node) (kube_node_status_condition{condition="Ready",status="true"})
+                )
+              )
+              > sum(
+                clamp_min(
+                  kube_node_status_allocatable{resource="memory",unit="byte",node=~"k8s-node[1234]"}
+                  - on(node) group_left() sum by (node) (kube_pod_container_resource_requests{resource="memory",unit="byte",node=~"k8s-node[1234]"}),
+                  0
+                ) and on(node) (kube_node_status_condition{condition="Ready",status="true"} == 1)
+              )
+            for: 15m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Cluster cannot tolerate losing any non-GPU worker — memory requests won't fit on the rest"
+              description: |
+                At least one non-GPU worker (k8s-node2/3/4) has more memory
+                requests pinned to it than the rest of the workers (incl. node1
+                GPU node) currently have free. If that node went down, its
+                pods would not reschedule and stay Pending.
+                Remediation: right-size top reservers via Goldilocks (immich-server,
+                frigate, prometheus, pg-cluster, paperless) or bump VM RAM on
+                k8s-node2/k8s-node3 from 32GB → 48GB to match node1.
       - name: Infrastructure Health
         rules:
           - alert: HomeAssistantDown