From b92e1166a89e1fe71a5bad61da73823832405b5b Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Thu, 21 May 2026 08:32:57 +0000 Subject: [PATCH] monitoring: prometheus global scrape 1m -> 2m + UPS pinned 30s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Halves sample volume on all default-scrape jobs (cAdvisor, node-exporter, service-endpoints, etc.). Memory id 559's earlier scrape-2m tuning was applied live but not codified — this restores the Helm template. Companion changes to keep alerting fidelity: - evaluation_interval kept at 1m (alerts evaluate every minute) - snmp-ups job pinned to scrape_interval=30s so PowerOutage / LowUPSBattery see a fresh sample every 30s instead of every 2m (firing still waits out each alert's for: duration) - 3 alerts bumped from for:1m to for:3m (HighGPUTemp, LowUPSBattery, PowerOutage) for stability above the new 2m global cadence Other jobs that already had per-job overrides (snmp-idrac 1m, redfish-idrac 3m, kubernetes-pods 5m, kubernetes-services 5m) are unaffected. Expected: 50-150m (millicores) sustained CPU saving on Prometheus + apiserver. Verification ongoing — apiserver settles a few minutes after Prometheus config reload due to the initial-target-scrape burst. --- .../monitoring/prometheus_chart_values.tpl | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 69f0c2ec..2c4b59a1 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -238,6 +238,13 @@ prometheus-pushgateway: limits: memory: 256Mi server: + # Halve scrape load on apiserver + cAdvisor + node-exporter without losing + # alerting fidelity. Per-job overrides (snmp-ups 30s, snmp-idrac 1m, etc.) + # below keep critical metrics fresh; alert `for:` durations were audited and + # all 1m alerts were bumped to 3m to stay above the new scrape cadence.
+ global: + scrape_interval: 2m + evaluation_interval: 1m # Enable me to delete metrics extraFlags: - "web.enable-admin-api" @@ -798,7 +805,7 @@ serverFiles: rules: - alert: HighGPUTemp expr: nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP > 65 - for: 1m + for: 3m # bumped from 1m for global scrape_interval=2m labels: severity: warning annotations: @@ -851,14 +858,14 @@ serverFiles: summary: "UPS on battery: {{ $value | printf \"%.0f\" }}s" - alert: LowUPSBattery expr: ups_upsEstimatedMinutesRemaining < 25 and on(instance) ups_upsInputVoltage < 150 - for: 1m + for: 3m # bumped from 1m for global scrape_interval=2m; snmp-ups job pinned to 30s labels: severity: critical annotations: summary: "UPS battery low: {{ $value | printf \"%.0f\" }} min remaining (threshold: 25 min)" - alert: PowerOutage expr: ups_upsInputVoltage < 150 - for: 1m + for: 3m # bumped from 1m for global scrape_interval=2m; snmp-ups job pinned to 30s labels: severity: critical annotations: @@ -2930,6 +2937,9 @@ extraScrapeConfigs: | regex: '(.*)' replacement: 'openwrt_$${1}' - job_name: 'snmp-ups' + # Keep UPS fast: 30s overrides the 2m global so PowerOutage / LowUPSBattery + # see a fresh sample every 30s instead of every 2m; their `for:` delay still applies. + scrape_interval: 30s params: module: [huawei] static_configs: