monitoring: migrate R730 iDRAC scraping to SNMP (fast primary) + thin Redfish remnant
The Redfish exporter (mrlhansen, metrics:all:true) walked every BMC subtree on
each scrape — ~18.5s avg / 28s peak against the slow iDRAC — forcing a 3m
interval. Moved the fast path to SNMP via the (previously unmounted) dell_idrac
module: ~3.7s/scrape at 1m.
- snmp_exporter: merge dell_idrac into ups_snmp_values.yaml; hand-add fan-RPM
(coolingDeviceReading + location lookup) and an amperageProbeLocationName
lookup so the "System Board Pwr Consumption" watts probe is label-selectable.
- snmp-idrac job: params module=dell_idrac, auth=public_v2, 1m scrape interval /
30s scrape timeout (was 45s) — now the primary source for
health/thermal/power/fan/voltage (relabeled r730_idrac_*).
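- Re-point 9 iDRAC alerts to SNMP metrics + DellStatus enums (OK=3, on=4) and
fix the misnamed iDRACSNMPMetricsMissing/iDRACRedfishMetricsMissing probes
(both now use absent() on SNMP metrics; the alert names are unchanged, so
iDRACRedfishMetricsMissing now checks the SNMP PSU input-voltage metric).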
- Re-point Grafana panels (idrac.json, cluster_health.json) to SNMP names;
temperature readings are divided by 10 (SNMP reports tenths of a degree C);
DellStatus value-mappings updated.
- Demote the Redfish exporter to a slow remnant: trim collectors to
system/sensors/power/storage/network/memory, scrape 3m->10m. Kept only for
metrics SNMP can't serve (indicator LED, NIC Mbps, machine/BIOS, per-drive
table) AND to keep HA Sofia's sensor.r730_fan_speed working — it reads
idrac_sensors_fan_speed from the exporter directly, so no ha-sofia change.
SSD-wear alerts + SEL panel left as-is (already inert/empty today). Verified
live: snmp-idrac up, scrape 3.7s, all 9 re-pointed alerts resolve without
firing, HA fan metric (idrac_sensors_fan_speed=6) intact. Design/plan +
as-built docs: docs/plans/2026-06-05-idrac-snmp-migration-{design,plan}.md,
docs/architecture/monitoring.md.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
6442978f07
commit
6b1d23abbd
8 changed files with 1891 additions and 55 deletions
|
|
@ -227,7 +227,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "max(r730_idrac_idrac_sensors_temperature{name=\"CPU1 Temp\"})",
|
||||
"expr": "max(r730_idrac_temperatureProbeReading{temperatureProbeLocationName=\"CPU1 Temp\"}) / 10",
|
||||
"legendFormat": "CPU Temp",
|
||||
"refId": "A"
|
||||
}
|
||||
|
|
@ -891,7 +891,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "max by() (avg_over_time(r730_idrac_idrac_power_control_avg_consumed_watts[$__rate_interval]))",
|
||||
"expr": "max by() (avg_over_time(r730_idrac_amperageProbeReading{amperageProbeLocationName=\"System Board Pwr Consumption\"}[$__rate_interval]))",
|
||||
"legendFormat": "Consumed",
|
||||
"refId": "A"
|
||||
},
|
||||
|
|
@ -909,7 +909,7 @@
|
|||
"type": "prometheus",
|
||||
"uid": "${datasource}"
|
||||
},
|
||||
"expr": "max(r730_idrac_idrac_sensors_temperature{name=\"CPU1 Temp\"})",
|
||||
"expr": "max(r730_idrac_temperatureProbeReading{temperatureProbeLocationName=\"CPU1 Temp\"}) / 10",
|
||||
"legendFormat": "CPU Temp",
|
||||
"refId": "C"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -78,17 +78,35 @@
|
|||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"Critical": {
|
||||
"color": "light-red",
|
||||
"index": 2
|
||||
},
|
||||
"OK": {
|
||||
"color": "light-green",
|
||||
"index": 3
|
||||
},
|
||||
"Warning": {
|
||||
"1": {
|
||||
"color": "light-yellow",
|
||||
"index": 1
|
||||
"index": 4,
|
||||
"text": "Other"
|
||||
},
|
||||
"2": {
|
||||
"color": "light-yellow",
|
||||
"index": 5,
|
||||
"text": "Unknown"
|
||||
},
|
||||
"3": {
|
||||
"color": "light-green",
|
||||
"index": 0,
|
||||
"text": "OK"
|
||||
},
|
||||
"4": {
|
||||
"color": "light-yellow",
|
||||
"index": 1,
|
||||
"text": "Warning"
|
||||
},
|
||||
"5": {
|
||||
"color": "light-red",
|
||||
"index": 2,
|
||||
"text": "Critical"
|
||||
},
|
||||
"6": {
|
||||
"color": "light-red",
|
||||
"index": 3,
|
||||
"text": "Non-Recoverable"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
|
|
@ -137,7 +155,7 @@
|
|||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "/^status$/",
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"showPercentChange": false,
|
||||
|
|
@ -165,10 +183,10 @@
|
|||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "r730_idrac_idrac_system_health{instance=~\"$instance\", job=~\"$job\"}",
|
||||
"format": "table",
|
||||
"expr": "r730_idrac_globalSystemStatus{instance=~\"$instance\", job=~\"$job\"}",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"legendFormat": "{{status}}",
|
||||
"legendFormat": "",
|
||||
"metrics": [
|
||||
{
|
||||
"id": "1",
|
||||
|
|
@ -796,7 +814,7 @@
|
|||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "r730_idrac_idrac_system_memory_size_bytes{instance=\"$instance\", job=~\"$job\"}",
|
||||
"expr": "sum(r730_idrac_memoryDeviceSize{instance=\"$instance\", job=~\"$job\"}) * 1024",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"interval": "",
|
||||
|
|
@ -1026,7 +1044,7 @@
|
|||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "r730_idrac_idrac_system_cpu_count{instance=\"$instance\", job=~\"$job\"}",
|
||||
"expr": "count(r730_idrac_processorDeviceStatus{instance=\"$instance\", job=~\"$job\"})",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"interval": "",
|
||||
|
|
@ -1250,12 +1268,12 @@
|
|||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"3": {
|
||||
"color": "red",
|
||||
"index": 1,
|
||||
"text": "Off"
|
||||
},
|
||||
"1": {
|
||||
"4": {
|
||||
"color": "green",
|
||||
"index": 0,
|
||||
"text": "On"
|
||||
|
|
@ -1325,7 +1343,7 @@
|
|||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "r730_idrac_idrac_system_power_on{instance=\"$instance\", job=~\"$job\"}",
|
||||
"expr": "r730_idrac_systemPowerState{instance=\"$instance\", job=~\"$job\"}",
|
||||
"instant": true,
|
||||
"legendFormat": "",
|
||||
"metrics": [
|
||||
|
|
@ -1975,9 +1993,9 @@
|
|||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "avg_over_time(r730_idrac_idrac_sensors_temperature{instance=~\"$instance\",job=~\"$job\"}[$__rate_interval])",
|
||||
"expr": "avg_over_time(r730_idrac_temperatureProbeReading{instance=~\"$instance\",job=~\"$job\"}[$__rate_interval]) / 10",
|
||||
"instant": false,
|
||||
"legendFormat": "{{name}}",
|
||||
"legendFormat": "{{temperatureProbeLocationName}}",
|
||||
"metrics": [
|
||||
{
|
||||
"id": "1",
|
||||
|
|
@ -2095,9 +2113,9 @@
|
|||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "avg_over_time(r730_idrac_idrac_sensors_fan_speed{instance=~\"$instance\",job=~\"$job\"}[$__rate_interval])",
|
||||
"expr": "avg_over_time(r730_idrac_coolingDeviceReading{instance=~\"$instance\",job=~\"$job\"}[$__rate_interval])",
|
||||
"instant": false,
|
||||
"legendFormat": "{{name}}",
|
||||
"legendFormat": "{{coolingDeviceLocationName}}",
|
||||
"metrics": [
|
||||
{
|
||||
"id": "1",
|
||||
|
|
@ -2234,8 +2252,8 @@
|
|||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "avg_over_time(r730_idrac_idrac_power_supply_output_watts{instance=~\"$instance\",job=~\"$job\"}[$__rate_interval])",
|
||||
"legendFormat": "Output PSU-{{id}}",
|
||||
"expr": "avg_over_time(r730_idrac_powerSupplyOutputWatts{instance=~\"$instance\",job=~\"$job\"}[$__rate_interval])",
|
||||
"legendFormat": "Output {{powerSupplyLocationName}}",
|
||||
"metrics": [
|
||||
{
|
||||
"id": "1",
|
||||
|
|
@ -2331,7 +2349,7 @@
|
|||
},
|
||||
"editorMode": "code",
|
||||
"exemplar": false,
|
||||
"expr": "avg(r730_idrac_idrac_power_supply_input_voltage{instance=~\"$instance\", job=~\"$job\"})",
|
||||
"expr": "avg(r730_idrac_powerSupplyCurrentInputVoltage{instance=~\"$instance\", job=~\"$job\"})",
|
||||
"format": "time_series",
|
||||
"instant": true,
|
||||
"interval": "",
|
||||
|
|
@ -2447,8 +2465,8 @@
|
|||
"uid": "${datasource}"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "avg_over_time(r730_idrac_idrac_power_control_avg_consumed_watts{instance=~\"$instance\",job=~\"$job\"}[$__rate_interval])",
|
||||
"legendFormat": "{{name}}",
|
||||
"expr": "avg_over_time(r730_idrac_amperageProbeReading{amperageProbeLocationName=\"System Board Pwr Consumption\",instance=~\"$instance\",job=~\"$job\"}[$__rate_interval])",
|
||||
"legendFormat": "{{amperageProbeLocationName}}",
|
||||
"metrics": [
|
||||
{
|
||||
"id": "1",
|
||||
|
|
|
|||
|
|
@ -20,15 +20,22 @@ resource "kubernetes_config_map" "redfish-config" {
|
|||
username: root
|
||||
password: calvin
|
||||
metrics:
|
||||
all: true
|
||||
# system: true
|
||||
# sensors: true
|
||||
# power: true
|
||||
# sel: false # Disable SEL - often slow
|
||||
# storage: true # Disable storage - slowest endpoint
|
||||
# memory: true
|
||||
# network: false # Disable network adapters
|
||||
# firmware: false # Don't need this frequently
|
||||
# SNMP (snmp-idrac job, dell_idrac module) is the FAST primary source
|
||||
# for dynamic + health metrics since 2026-06-05. This Redfish exporter
|
||||
# is the slow remnant (10m Prometheus scrape) serving only what SNMP
|
||||
# cannot: indicator LED, NIC link-speed Mbps, SSD life %, machine/BIOS
|
||||
# info, per-DIMM / per-NIC inventory, PSU input-watts/capacity.
|
||||
# NOTE: HA Sofia's sensor.r730_fan_speed reads idrac_sensors_fan_speed
|
||||
# from THIS exporter directly, so `sensors` MUST stay enabled.
|
||||
# events (SEL empty on this box), processors (cpu count via SNMP),
|
||||
# manager, extra -> left disabled (default false) to trim the walk.
|
||||
all: false
|
||||
system: true
|
||||
sensors: true
|
||||
power: true
|
||||
storage: true
|
||||
network: true
|
||||
memory: true
|
||||
EOF
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -808,7 +808,7 @@ serverFiles:
|
|||
annotations:
|
||||
summary: "System load: {{ $value | printf \"%.0f\" }}% (threshold: 50%)"
|
||||
- alert: FanFailure
|
||||
expr: r730_idrac_redfish_chassis_fan_health != 1
|
||||
expr: r730_idrac_coolingDeviceStatus != 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -942,7 +942,7 @@ serverFiles:
|
|||
annotations:
|
||||
summary: "Power outage - input voltage: {{ $value | printf \"%.0f\" }}V (threshold: <150V)"
|
||||
- alert: HighPowerUsage
|
||||
expr: r730_idrac_idrac_power_control_consumed_watts > 300
|
||||
expr: r730_idrac_amperageProbeReading{amperageProbeLocationName="System Board Pwr Consumption"} > 300
|
||||
for: 60m
|
||||
labels:
|
||||
severity: info
|
||||
|
|
@ -1015,28 +1015,28 @@ serverFiles:
|
|||
- name: Server Health
|
||||
rules:
|
||||
- alert: iDRACSystemUnhealthy
|
||||
expr: r730_idrac_redfish_system_health_state != 1
|
||||
expr: r730_idrac_globalSystemStatus != 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "iDRAC system health state: {{ $value }} (expected 1=OK)"
|
||||
summary: "iDRAC system health state: {{ $value }} (expected 3=OK)"
|
||||
- alert: iDRACPowerSupplyUnhealthy
|
||||
expr: r730_idrac_redfish_chassis_power_powersupply_health != 1
|
||||
expr: r730_idrac_powerSupplyStatus != 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "iDRAC PSU {{ $labels.member_id }} unhealthy (state: {{ $value }})"
|
||||
- alert: iDRACMemoryUnhealthy
|
||||
expr: r730_idrac_redfish_system_memory_health_state != 1
|
||||
expr: r730_idrac_systemStateMemoryDeviceStatusCombined != 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "iDRAC memory subsystem unhealthy (state: {{ $value }})"
|
||||
- alert: iDRACStorageDriveUnhealthy
|
||||
expr: r730_idrac_redfish_system_storage_drive_health_state != 1
|
||||
expr: r730_idrac_physicalDiskComponentStatus != 3
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -1057,12 +1057,12 @@ serverFiles:
|
|||
annotations:
|
||||
summary: "SSD {{ $labels.id }} has {{ $value }}% life remaining"
|
||||
- alert: iDRACServerPoweredOff
|
||||
expr: r730_idrac_redfish_system_power_state != 2
|
||||
expr: r730_idrac_systemPowerState != 4
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "R730 server is not powered on (state: {{ $value }}, expected 2=On)"
|
||||
summary: "R730 server is not powered on (state: {{ $value }}, expected 4=On)"
|
||||
- alert: ProxmoxExporterDown
|
||||
expr: pve_up{id="node/pve"} == 0
|
||||
for: 5m
|
||||
|
|
@ -1171,19 +1171,19 @@ serverFiles:
|
|||
annotations:
|
||||
summary: "UPS metrics missing for 10m - check SNMP exporter and ups.viktorbarzin.lan"
|
||||
- alert: iDRACRedfishMetricsMissing
|
||||
expr: absent(r730_idrac_idrac_power_supply_input_voltage)
|
||||
expr: absent(r730_idrac_powerSupplyCurrentInputVoltage)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "iDRAC Redfish metrics missing for 10m - check idrac-redfish-exporter pod"
|
||||
summary: "iDRAC SNMP PSU input voltage metric missing for 10m - check SNMP exporter and idrac.viktorbarzin.lan"
|
||||
- alert: iDRACSNMPMetricsMissing
|
||||
expr: absent(r730_idrac_idrac_system_health)
|
||||
expr: absent(r730_idrac_globalSystemStatus)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "iDRAC SNMP metrics missing for 10m - check SNMP exporter and idrac.viktorbarzin.lan"
|
||||
summary: "iDRAC SNMP health metric (globalSystemStatus) missing for 10m - check SNMP exporter and idrac.viktorbarzin.lan"
|
||||
- alert: ATSMetricsMissing
|
||||
expr: absent(automatic_transfer_switch_power_mode)
|
||||
for: 15m
|
||||
|
|
@ -3149,7 +3149,10 @@ extraScrapeConfigs: |
|
|||
metrics_path: '/metrics'
|
||||
- job_name: 'snmp-idrac'
|
||||
scrape_interval: 1m
|
||||
scrape_timeout: 45s
|
||||
scrape_timeout: 30s
|
||||
params:
|
||||
module: [dell_idrac]
|
||||
auth: [public_v2]
|
||||
static_configs:
|
||||
- targets:
|
||||
- "idrac.viktorbarzin.lan.:161"
|
||||
|
|
@ -3168,7 +3171,12 @@ extraScrapeConfigs: |
|
|||
regex: '(.*)'
|
||||
replacement: 'r730_idrac_$${1}'
|
||||
- job_name: 'redfish-idrac'
|
||||
scrape_interval: 3m
|
||||
# Slow remnant since 2026-06-05: SNMP (snmp-idrac, 1m) is the fast primary
|
||||
# source. This Redfish job only feeds the few panels SNMP can't serve (LED,
|
||||
# NIC Mbps, SSD life %, machine/BIOS/DIMM/NIC inventory) and keeps the
|
||||
# exporter warm for HA Sofia's direct sensor.r730_fan_speed read. 10m is
|
||||
# plenty for slow-changing inventory/health.
|
||||
scrape_interval: 10m
|
||||
scrape_timeout: 45s
|
||||
metrics_path: /metrics
|
||||
static_configs:
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue