monitoring: prometheus global scrape 1m -> 2m + UPS pinned 30s
Halves sample volume on all default-scrape jobs (cAdvisor, node-exporter, service-endpoints, etc.). The earlier scrape-2m tuning (memory id 559) was applied live but never codified; this commit restores it in the Helm template.

Companion changes to keep alerting fidelity:
- evaluation_interval kept at 1m (alerts evaluate every minute)
- snmp-ups job pinned to scrape_interval=30s so PowerOutage / LowUPSBattery conditions are sampled within ~30s instead of 2m (the alerts still wait out their for: duration before firing)
- 3 alerts bumped from for:1m to for:3m (HighGPUTemp, LowUPSBattery, PowerOutage) for stability above the new 2m global cadence

Other jobs that already had per-job overrides (snmp-idrac 1m, redfish-idrac 3m, kubernetes-pods 5m, kubernetes-services 5m) are unaffected.

Expected: 50-150m (millicores) sustained CPU saving on Prometheus + apiserver. Verification is ongoing — the apiserver settles a few minutes after a Prometheus config reload because of the initial-target-scrape burst.
This commit is contained in:
parent
aba061cf2e
commit
af6aa18b25
1 changed files with 13 additions and 3 deletions
|
|
@ -238,6 +238,13 @@ prometheus-pushgateway:
|
||||||
limits:
|
limits:
|
||||||
memory: 256Mi
|
memory: 256Mi
|
||||||
server:
|
server:
|
||||||
|
# Halve scrape load on apiserver + cAdvisor + node-exporter without losing
|
||||||
|
# alerting fidelity. Per-job overrides (snmp-ups 30s, snmp-idrac 1m, etc.)
|
||||||
|
# below keep critical metrics fresh; alert `for:` durations were audited and
|
||||||
|
# all 1m alerts were bumped to 3m to stay above the new scrape cadence.
|
||||||
|
global:
|
||||||
|
scrape_interval: 2m
|
||||||
|
evaluation_interval: 1m
|
||||||
# Enable me to delete metrics
|
# Enable me to delete metrics
|
||||||
extraFlags:
|
extraFlags:
|
||||||
- "web.enable-admin-api"
|
- "web.enable-admin-api"
|
||||||
|
|
@ -798,7 +805,7 @@ serverFiles:
|
||||||
rules:
|
rules:
|
||||||
- alert: HighGPUTemp
|
- alert: HighGPUTemp
|
||||||
expr: nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP > 65
|
expr: nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP > 65
|
||||||
for: 1m
|
for: 3m # bumped from 1m for global scrape_interval=2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
|
|
@ -851,14 +858,14 @@ serverFiles:
|
||||||
summary: "UPS on battery: {{ $value | printf \"%.0f\" }}s"
|
summary: "UPS on battery: {{ $value | printf \"%.0f\" }}s"
|
||||||
- alert: LowUPSBattery
|
- alert: LowUPSBattery
|
||||||
expr: ups_upsEstimatedMinutesRemaining < 25 and on(instance) ups_upsInputVoltage < 150
|
expr: ups_upsEstimatedMinutesRemaining < 25 and on(instance) ups_upsInputVoltage < 150
|
||||||
for: 1m
|
for: 3m # bumped from 1m for global scrape_interval=2m; snmp-ups job pinned to 30s
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: "UPS battery low: {{ $value | printf \"%.0f\" }} min remaining (threshold: 25 min)"
|
summary: "UPS battery low: {{ $value | printf \"%.0f\" }} min remaining (threshold: 25 min)"
|
||||||
- alert: PowerOutage
|
- alert: PowerOutage
|
||||||
expr: ups_upsInputVoltage < 150
|
expr: ups_upsInputVoltage < 150
|
||||||
for: 1m
|
for: 3m # bumped from 1m for global scrape_interval=2m; snmp-ups job pinned to 30s
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
|
|
@ -2930,6 +2937,9 @@ extraScrapeConfigs: |
|
||||||
regex: '(.*)'
|
regex: '(.*)'
|
||||||
replacement: 'openwrt_$${1}'
|
replacement: 'openwrt_$${1}'
|
||||||
- job_name: 'snmp-ups'
|
- job_name: 'snmp-ups'
|
||||||
|
# Keep UPS fast: 30s overrides the 2m global so PowerOutage / LowUPSBattery
|
||||||
|
# see fresh samples within ~30s instead of 2m (alerts still wait out their `for:`).
|
||||||
|
scrape_interval: 30s
|
||||||
params:
|
params:
|
||||||
module: [huawei]
|
module: [huawei]
|
||||||
static_configs:
|
static_configs:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue