fix: increase tier-2-gpu quota to 12Gi, add NvidiaExporterDown alert

- Increase tier-2-gpu requests.memory from 8Gi to 12Gi to give immich
  ML pods scheduling headroom (was at 96% utilization)
- Add critical NvidiaExporterDown Prometheus alert that fires when GPU
  metrics are absent for >10 minutes (faster than generic ScrapeTargetDown)
This commit is contained in:
Viktor Barzin 2026-03-23 03:04:33 +02:00
parent 20d0404a42
commit 877cd15b45
2 changed files with 65 additions and 1 deletions

View file

@ -615,7 +615,7 @@ resource "kubernetes_manifest" "generate_resourcequota_by_tier" {
spec = {
hard = {
"requests.cpu" = "8"
"requests.memory" = "8Gi"
"requests.memory" = "12Gi"
"limits.memory" = "32Gi"
pods = "40"
}

View file

@ -388,6 +388,13 @@ serverFiles:
severity: info
annotations:
summary: "VRAM used: {{ $value | printf \"%.1f\" }} GB (threshold: 14 GB)"
- alert: NvidiaExporterDown
expr: absent(nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP) == 1
for: 10m
labels:
severity: critical
annotations:
summary: "NVIDIA GPU exporter is down - no GPU metrics available"
- name: Power
rules:
- alert: OnBattery
@ -1131,6 +1138,63 @@ serverFiles:
severity: warning
annotations:
summary: "Privatebin has no available replicas"
- name: "Network Traffic (GoFlow2)"
rules:
- alert: GoFlow2Down
expr: up{job="goflow2"} == 0
for: 10m
labels:
severity: warning
annotations:
summary: "GoFlow2 NetFlow collector is down — no network flow visibility"
- alert: NoNetFlowData
expr: absent(goflow2_flow_traffic_bytes_total) and on() up{job="goflow2"} == 1
for: 30m
labels:
severity: warning
annotations:
summary: "GoFlow2 is up but receiving no NetFlow data — check softflowd on pfSense"
- alert: NetFlowTrafficSpike
expr: |
rate(goflow2_flow_traffic_bytes_total[5m]) > 2 * avg_over_time(rate(goflow2_flow_traffic_bytes_total[5m])[1h:5m])
and rate(goflow2_flow_traffic_bytes_total[5m]) > 1048576
for: 10m
labels:
severity: warning
annotations:
summary: "NetFlow traffic spike: {{ $value | humanize1024 }}B/s — more than 2x the 1h average"
- alert: NetFlowHighErrorRate
expr: |
rate(goflow2_flow_decoder_error_total[5m]) /
(rate(goflow2_flow_process_nf_total[5m]) + 1) > 0.1
for: 15m
labels:
severity: warning
annotations:
summary: "GoFlow2 decoder error rate: {{ $value | printf \"%.1f\" }}% — possible malformed flows or attack"
- alert: NetFlowProcessingDelay
expr: goflow2_flow_process_nf_delay_seconds{quantile="0.5"} > 600
for: 15m
labels:
severity: info
annotations:
summary: "NetFlow processing delay p50: {{ $value | printf \"%.0f\" }}s — softflowd may be overloaded"
- name: "DNS Anomaly Detection"
rules:
- alert: DNSQuerySpike
expr: dns_anomaly_total_queries > 2 * dns_anomaly_avg_queries and dns_anomaly_total_queries > 1000
for: 0m
labels:
severity: warning
annotations:
summary: "DNS query spike: {{ $value | printf \"%.0f\" }} queries (>2x average)"
- alert: DNSHighErrorRate
expr: dns_anomaly_server_failure > 100
for: 0m
labels:
severity: warning
annotations:
summary: "High DNS SERVFAIL rate: {{ $value | printf \"%.0f\" }} failures detected"
extraScrapeConfigs: |
- job_name: 'proxmox-host'