add separate idrac monitoring tool and dashboard [ci skip]

This commit is contained in:
Viktor Barzin 2025-12-14 09:48:59 +00:00
parent 7a6cf5647b
commit bc486227f7
4 changed files with 3310 additions and 1697 deletions

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,710 @@
{
"annotations": {
"list": [
{
"$$hashKey": "object:192",
"builtIn": 1,
"datasource": {
"type": "datasource",
"uid": "grafana"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.13+) cluster",
"editable": true,
"fiscalYearStartMonth": 0,
"gnetId": 12239,
"graphTooltip": 0,
"id": 26,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 18,
"x": 0,
"y": 0
},
"id": 12,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP",
"instant": false,
"interval": "",
"legendFormat": "GPU 0",
"refId": "A"
}
],
"title": "GPU Temperature",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "#EAB839",
"value": 70
},
{
"color": "red",
"value": 80
}
]
},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 0
},
"id": 14,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP",
"interval": "",
"legendFormat": "",
"range": true,
"refId": "A"
}
],
"title": "GPU Current Temp",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 18,
"x": 0,
"y": 8
},
"id": 10,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"pluginVersion": "6.5.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_POWER_USAGE",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Power Usage",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"max": 2400,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "#EAB839",
"value": 1800
},
{
"color": "red",
"value": 2200
}
]
},
"unit": "watt"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 6,
"x": 18,
"y": 8
},
"id": 16,
"options": {
"minVizHeight": 75,
"minVizWidth": 75,
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"sum"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true,
"sizing": "auto"
},
"pluginVersion": "11.1.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "sum(nvidia_tesla_t4_DCGM_FI_DEV_POWER_USAGE)",
"interval": "",
"legendFormat": "",
"range": true,
"refId": "A"
}
],
"title": "GPU Power Total",
"type": "gauge"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 6,
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_UTIL",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Utilization",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "decmbytes"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 18,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_FB_USED",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU Framebuffer Mem Used",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "hertz"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"id": 2,
"interval": "",
"options": {
"legend": {
"calcs": [
"mean",
"lastNotNull",
"max"
],
"displayMode": "table",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "nvidia_tesla_t4_DCGM_FI_DEV_SM_CLOCK* 1000000",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "GPU {{gpu}}",
"range": true,
"refId": "A"
}
],
"title": "GPU SM Clocks",
"type": "timeseries"
}
],
"refresh": "auto",
"schemaVersion": 39,
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-30m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
},
"timezone": "",
"title": "NVIDIA DCGM Exporter Dashboard",
"uid": "Oxed_c6Wz",
"version": 7,
"weekStart": ""
}

View file

@ -333,6 +333,8 @@ resource "kubernetes_config_map" "redfish-config" {
}
data = {
"config.yml" = <<-EOF
address: 0.0.0.0
port: 9610
hosts:
${var.idrac_host}:
username: ${var.idrac_username}
@ -340,10 +342,8 @@ resource "kubernetes_config_map" "redfish-config" {
default:
username: root
password: calvin
groups:
group1:
username: user
password: pass
metrics:
all: true
EOF
}
}
@ -374,20 +374,17 @@ resource "kubernetes_deployment" "idrac-redfish" {
}
spec {
container {
image = "viktorbarzin/redfish-exporter:latest"
# https://github.com/mrlhansen/idrac_exporter?tab=readme-ov-file
image = "ghcr.io/mrlhansen/idrac_exporter:latest"
name = "redfish-exporter"
# command = ["/bin/sh", "-c", "redfish-exporter --config.file /app/config.yml"]
# command = ["/usr/local/bin/redfish_exporter", "--config.file", "/etc/prometheus/redfish_exporter.yml"]
command = ["/usr/local/bin/redfish_exporter", "--config.file", "/app/config.yml"]
port {
container_port = 9610
}
volume_mount {
name = "redfish-exporter-config"
mount_path = "/app/config.yml"
# mount_path = "/etc/prometheus/redfish_exporter.yml"
sub_path = "config.yml"
mount_path = "/etc/prometheus/idrac.yml"
sub_path = "config.yml"
}
}
volume {

View file

@ -81,7 +81,7 @@ server:
# enabled: false
existingClaim: prometheus-iscsi-pvc
# storageClass: rook-cephfs
retention: "12w"
retention: "52w"
strategy:
type: Recreate
baseURL: "https://prometheus.viktorbarzin.me"
@ -222,8 +222,15 @@ serverFiles:
severity: page
annotations:
summary: Power voltage on a power supply is {{ $value }} indicating power outage.
- alert: HighGPUTemp
expr: nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP > 65
for: 1m
labels:
severity: page
annotations:
summary: "High GPU Temperature {{$value}}"
- alert: HighPowerUsage
expr: (max_over_time(r730_idrac_redfish_chassis_power_average_consumed_watts[20m])) > 180
expr: r730_idrac_idrac_power_control_consumed_watts > 200
for: 60m
labels:
severity: page
@ -237,7 +244,7 @@ serverFiles:
annotations:
summary: No node load data. Can signal that prometheus is not scraping
- alert: NoiDRACData
expr: (max(r730_idrac_redfish_chassis_power_average_consumed_watts) or on() vector(0)) == 0
expr: (max(r730_idrac_idrac_system_health + 1) or on() vector(0)) == 0
for: 30m
labels:
severity: page
@ -359,15 +366,13 @@ extraScrapeConfigs: |
- targets:
- "idrac.viktorbarzin.lan:161"
metrics_path: '/snmp'
params:
module: [dell_idrac]
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: 'prometheus-snmp-exporter.monitoring.svc.cluster.local:9116'
replacement: 'snmp-exporter.monitoring.svc.cluster.local:9116'
metric_relabel_configs:
- source_labels: [ __name__ ]
target_label: '__name__'
@ -375,9 +380,9 @@ extraScrapeConfigs: |
regex: '(.*)'
replacement: 'r730_idrac_$${1}'
- job_name: 'redfish-idrac'
scrape_interval: 5m
scrape_timeout: 4m
metrics_path: /redfish
scrape_interval: 3m
scrape_timeout: 1m
metrics_path: /metrics
static_configs:
- targets:
- 192.168.1.4
@ -492,3 +497,15 @@ extraScrapeConfigs: |
- "ha-sofia.viktorbarzin.lan:8123"
metrics_path: '/api/prometheus'
bearer_token: "${haos_api_token}"
- job_name: 'nvidia'
static_configs:
- targets:
- "nvidia-exporter.nvidia.svc.cluster.local"
metrics_path: '/metrics'
metric_relabel_configs:
- source_labels: [ __name__ ]
target_label: '__name__'
action: replace
regex: '(.*)'
replacement: 'nvidia_tesla_t4_$${1}'