From 608d5ab63624a0cf80ba5ce669f6348bfc0fd6fa Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 15 Feb 2026 19:38:28 +0000 Subject: [PATCH] Add Cluster Health Overview Grafana dashboard [ci skip] --- .../monitoring/dashboards/cluster_health.json | 1753 +++++++++++++++++ 1 file changed, 1753 insertions(+) create mode 100644 modules/kubernetes/monitoring/dashboards/cluster_health.json diff --git a/modules/kubernetes/monitoring/dashboards/cluster_health.json b/modules/kubernetes/monitoring/dashboards/cluster_health.json new file mode 100644 index 00000000..dcc123b3 --- /dev/null +++ b/modules/kubernetes/monitoring/dashboards/cluster_health.json @@ -0,0 +1,1753 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Single-pane-of-glass daily health overview — nodes, pods, quotas, storage, certs, GPU, power", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "Cluster Summary", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "orange", "value": 3 }, + { "color": "green", "value": 5 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "title": "Nodes Ready", + "type": "stat", + 
"targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", + "legendFormat": "Ready", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "blue", "value": null } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "title": "Total Pods", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "count(kube_pod_info)", + "legendFormat": "Pods", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { + "options": { "0": { "color": "green", "text": "0" } }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "noValue": "0" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "id": 4, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "title": "Firing Alerts", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "count(ALERTS{alertstate=\"firing\"}) OR vector(0)", + "legendFormat": "Firing", + "refId": "A" + } + ] + }, + { + "datasource": { "type": 
"prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 60 }, + { "color": "red", "value": 80 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, + "id": 5, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "title": "Cluster CPU %", + "type": "gauge", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "avg(1 - rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100", + "legendFormat": "CPU", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 60 }, + { "color": "red", "value": 80 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "id": 6, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "title": "Cluster Memory %", + "type": "gauge", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "(1 - sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)) * 100", + "legendFormat": "Memory", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + 
"fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 70 }, + { "color": "red", "value": 85 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "id": 7, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "title": "Cluster Disk %", + "type": "gauge", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "(1 - sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"})) * 100", + "legendFormat": "Disk", + "refId": "A" + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 10, + "panels": [], + "title": "Node Health", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 60 }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Node" }, + "properties": [{ "id": "custom.width", "value": 160 }] + }, + { + "matcher": { "id": "byName", "options": "CPU %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { "id": "decimals", "value": 1 }, + { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Memory %" }, + 
"properties": [ + { "id": "unit", "value": "percent" }, + { "id": "decimals", "value": 1 }, + { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Disk %" }, + "properties": [ + { "id": "unit", "value": "percent" }, + { "id": "decimals", "value": 1 }, + { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Pods" }, + "properties": [{ "id": "custom.width", "value": 60 }] + } + ] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 6 }, + "id": 11, + "options": { + "cellHeight": "sm", + "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "CPU %" }] + }, + "title": "Node Resource Table", + "type": "table", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "(1 - avg by(instance)(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "CPU" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "MEM" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "(1 - node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"}) * 100", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "DISK" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "count by(node)(kube_pod_info)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "PODS" + } + ], + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "organize", + "options": { + 
"excludeByName": { "Time": true }, + "renameByName": { + "instance": "Node", + "node": "Node", + "Value #CPU": "CPU %", + "Value #MEM": "Memory %", + "Value #DISK": "Disk %", + "Value #PODS": "Pods" + } + } + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + }, + "unit": "percent", + "max": 100, + "min": 0 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 6 }, + "id": 12, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "title": "CPU per Node (24h)", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "(1 - avg by(instance)(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100", + "legendFormat": "{{instance}}", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + 
"axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + }, + "unit": "percent", + "max": 100, + "min": 0 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 6 }, + "id": 13, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "title": "Memory per Node (24h)", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100", + "legendFormat": "{{instance}}", + "refId": "A" + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "id": 20, + "panels": [], + "title": "Resource Governance — Quota Utilization", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 70 }, + { "color": "red", "value": 90 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Namespace" }, + "properties": [{ "id": "custom.width", "value": 160 }] + }, + { 
+ "matcher": { "id": "byName", "options": "Resource" }, + "properties": [{ "id": "custom.width", "value": 120 }] + }, + { + "matcher": { "id": "byName", "options": "Usage %" }, + "properties": [ + { "id": "unit", "value": "percentunit" }, { "id": "min", "value": 0 }, { "id": "max", "value": 1 }, + { "id": "decimals", "value": 1 }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 0.7 }, { "color": "red", "value": 0.9 }] } }, + { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 }, + "id": 21, + "options": { + "cellHeight": "sm", + "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "Usage %" }] + }, + "title": "Quota Usage by Namespace", + "type": "table", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "max by(namespace, resourcequota, resource)(kube_resourcequota{type=\"used\"})", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "USED" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "max by(namespace, resourcequota, resource)(kube_resourcequota{type=\"hard\"})", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "HARD" + } + ], + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "calculateField", + "options": { + "alias": "Usage %", + "binary": { "left": "Value #USED", "operator": "/", "right": "Value #HARD", "reducer": "sum" }, + "mode": "binary", + "reduce": { "reducer": "sum" } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "type": true, "resourcequota": true }, + "renameByName": { + "namespace": "Namespace", + "resource": "Resource", + "Value #USED": "Used", + "Value #HARD": "Hard Limit", + "Usage %": "Usage %" + } + } + }, + { + "id": "filterByValue", + "options": { + "filters": [ + { + "config": { "id": "greater", "options": { "value": 0 } }, + "fieldName": "Hard Limit" + } + ], + "match": "all", + "type": "include" + } + } + ] + }, + { + "datasource": { "type": "prometheus",
"uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 70 }, + { "color": "red", "value": 90 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 }, + "id": 22, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "title": "Top Quota Consumers", + "type": "bargauge", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "topk(15, kube_resourcequota{type=\"used\",resource=~\"requests.cpu|requests.memory|limits.cpu|limits.memory|pods\"} / ignoring(type) kube_resourcequota{type=\"hard\",resource=~\"requests.cpu|requests.memory|limits.cpu|limits.memory|pods\"} * 100 > 0)", + "legendFormat": "{{namespace}} / {{resource}}", + "refId": "A" + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, + "id": 30, + "panels": [], + "title": "Pod Health", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "noValue": "0" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 3, "x": 0, "y": 24 }, + "id": 31, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "title":
"CrashLooping Pods", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "count(increase(kube_pod_container_status_restarts_total[1h]) > 5) OR vector(0)", + "legendFormat": "", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 1 }, + { "color": "red", "value": 5 } + ] + }, + "noValue": "0" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 3, "x": 3, "y": 24 }, + "id": 32, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "title": "OOMKilled (24h)", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(increase(kube_pod_container_status_restarts_total[24h]) and ignoring(reason) (kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"} == 1)) OR vector(0)", + "legendFormat": "", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "noValue": "0" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 3, "x": 6, "y": 24 }, + "id": 33, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "title": "Failed Pods", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr":
"sum(kube_pod_status_phase{phase=\"Failed\"}) OR vector(0)", + "legendFormat": "", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 1 }, + { "color": "red", "value": 5 } + ] + }, + "noValue": "0" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 3, "x": 9, "y": 24 }, + "id": 34, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "title": "Pending Pods", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(kube_pod_status_phase{phase=\"Pending\"}) OR vector(0)", + "legendFormat": "", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "min": 0 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "id": 35,
+ "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "title": "Restart Rate (24h)", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(increase(kube_pod_container_status_restarts_total[1h]))", + "legendFormat": "Total Restarts / hour", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 5 }, + { "color": "red", "value": 20 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Namespace" }, + "properties": [{ "id": "custom.width", "value": 140 }] + }, + { + "matcher": { "id": "byName", "options": "Pod" }, + "properties": [{ "id": "custom.width", "value": 260 }] + }, + { + "matcher": { "id": "byName", "options": "Restarts" }, + "properties": [ + { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 28 }, + "id": 36, + "options": { + "cellHeight": "sm", + "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "Restarts" }] + }, + "title": "High-Restart Pods", + "type": "table", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "topk(15, kube_pod_container_status_restarts_total)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": 
true, "instance": true, "uid": true, "container": true, "endpoint": true, "service": true }, + "renameByName": { + "namespace": "Namespace", + "pod": "Pod", + "Value": "Restarts" + } + } + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 36 }, + "id": 40, + "panels": [], + "title": "Top Resource Consumers", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 37 }, + "id": 41, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "title": "Top 10 Namespaces by CPU", + "type": "bargauge", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "topk(10, sum by(namespace)(rate(container_cpu_usage_seconds_total{container!=\"\"}[5m])))", + "legendFormat": "{{namespace}}", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 37 }, + "id": 42, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, +
"showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "title": "Top 10 Namespaces by Memory", + "type": "bargauge", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "topk(10, sum by(namespace)(container_memory_working_set_bytes{container!=\"\"}))", + "legendFormat": "{{namespace}}", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 37 }, + "id": 43, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "title": "Top 10 Pods by Memory", + "type": "bargauge", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "topk(10, sum by(namespace, pod)(container_memory_working_set_bytes{container!=\"\"}))", + "legendFormat": "{{namespace}}/{{pod}}", + "refId": "A" + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }, + "id": 50, + "panels": [], + "title": "Storage", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 70 }, + { "color": "red", "value": 85 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options":
"Namespace" }, + "properties": [{ "id": "custom.width", "value": 140 }] + }, + { + "matcher": { "id": "byName", "options": "PVC" }, + "properties": [{ "id": "custom.width", "value": 260 }] + }, + { + "matcher": { "id": "byName", "options": "Used %" }, + "properties": [ + { "id": "unit", "value": "percentunit" }, { "id": "min", "value": 0 }, { "id": "max", "value": 1 }, + { "id": "decimals", "value": 1 }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "orange", "value": 0.7 }, { "color": "red", "value": 0.85 }] } }, + { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } + ] + }, + { + "matcher": { "id": "byName", "options": "Used" }, + "properties": [{ "id": "unit", "value": "bytes" }] + }, + { + "matcher": { "id": "byName", "options": "Capacity" }, + "properties": [{ "id": "unit", "value": "bytes" }] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 46 }, + "id": 51, + "options": { + "cellHeight": "sm", + "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, + "showHeader": true, + "sortBy": [{ "desc": true, "displayName": "Used %" }] + }, + "title": "PV Usage Table", + "type": "table", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "max by(namespace, persistentvolumeclaim)(kubelet_volume_stats_used_bytes)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "USED" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "max by(namespace, persistentvolumeclaim)(kubelet_volume_stats_capacity_bytes)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "CAP" + } + ], + "transformations": [ + { + "id": "merge", + "options": {} + }, + { + "id": "calculateField", + "options": { + "alias": "Used %", + "binary": { "left": "Value #USED", "operator": "/", "right": "Value #CAP" }, + "mode": "binary" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true, "service": true, "node": true, "metrics_path": true }, + "renameByName": { + "namespace": "Namespace", + "persistentvolumeclaim": "PVC", + "Value #USED": "Used", + "Value #CAP": "Capacity", + "Used %":
"Used %" + } + } + }, + { + "id": "filterByValue", + "options": { + "filters": [ + { + "config": { "id": "greater", "options": { "value": 0 } }, + "fieldName": "Capacity" + } + ], + "match": "all", + "type": "include" + } + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 70 }, + { "color": "red", "value": 85 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 46 }, + "id": 52, + "options": { + "displayMode": "gradient", + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "title": "PV Usage Bar", + "type": "bargauge", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100", + "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + 
"stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 85 } + ] + }, + "unit": "percent", + "max": 100, + "min": 0 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 54 }, + "id": 53, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "title": "Node Disk Usage (24h)", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "(1 - node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"}) * 100", + "legendFormat": "{{instance}}", + "refId": "A" + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 62 }, + "id": 60, + "panels": [], + "title": "Certificate Expiry", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "orange", "value": 7 }, + { "color": "green", "value": 30 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Certificate" }, + "properties": [ + { "id": "custom.width", "value": 400 } + ] + }, + { + "matcher": { "id": "byName", "options": "Days Remaining" }, + "properties": [ + { "id": "decimals", "value": 0 }, + { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } + ] + } + ] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 63 }, + "id": 61, + "options": { + "cellHeight": "sm", + "footer": { 
"countRows": false, "fields": "", "reducer": ["sum"], "show": false }, + "showHeader": true, + "sortBy": [{ "desc": false, "displayName": "Days Remaining" }] + }, + "title": "TLS Certificate Expiry", + "type": "table", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "(traefik_tls_certs_not_after - time()) / 86400", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true, "service": true, "sans": true }, + "renameByName": { + "cn": "Certificate", + "serial": "Serial", + "Value": "Days Remaining" + } + } + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [{ "field": "Days Remaining", "desc": false }] + } + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 71 }, + "id": 70, + "panels": [], + "title": "Infrastructure", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 70 }, + { "color": "red", "value": 85 } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 72 }, + "id": 71, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "title": "GPU Temp", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP", + "legendFormat": "GPU Temp", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, 
+ "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 60 }, + { "color": "red", "value": 85 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 72 }, + "id": 72, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "title": "GPU Util %", + "type": "gauge", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_UTIL", + "legendFormat": "GPU Util", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "orange", "value": 12000 }, + { "color": "red", "value": 14000 } + ] + }, + "unit": "mbytes" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 72 }, + "id": 73, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "title": "GPU VRAM Used", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "nvidia_tesla_t4_DCGM_FI_DEV_FB_USED", + "legendFormat": "VRAM", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color":
"green", "value": null }, + { "color": "orange", "value": 400 }, + { "color": "red", "value": 600 } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 72 }, + "id": 74, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "title": "Server Power", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "sum(r730_idrac_idrac_power_supply_input_watts)", + "legendFormat": "Power", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "orange", "value": 10 }, + { "color": "green", "value": 30 } + ] + }, + "unit": "m" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 72 }, + "id": 75, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "title": "UPS Battery (min)", + "type": "stat", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "ups_upsEstimatedMinutesRemaining", + "legendFormat": "Battery", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "orange", "value": 30 }, + { "color": "green", "value": 80 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": {
"h": 4, "w": 4, "x": 20, "y": 72 }, + "id": 76, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "title": "UPS Charge %", + "type": "gauge", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "ups_upsEstimatedChargeRemaining", + "legendFormat": "Charge", + "refId": "A" + } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "watt", + "min": 0 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 76 }, + "id": 77, + "options": { + "legend": { "calcs": ["mean", "max", "min"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "title": "Power Draw (24h)", + "type": "timeseries", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "expr": "avg_over_time(r730_idrac_idrac_power_supply_input_watts[$__rate_interval])", + "legendFormat": "PSU {{id}}", + "refId": "A" + }, + { + "datasource": { "type": 
"prometheus", "uid": "${datasource}" }, + "expr": "avg_over_time(r730_idrac_idrac_power_control_avg_consumed_watts[$__rate_interval])", + "legendFormat": "Avg Consumed", + "refId": "B" + } + ] + } + ], + "refresh": "5m", + "schemaVersion": 39, + "tags": ["cluster-health", "daily-report"], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "PBFA97CFB590B2093" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Cluster Health Overview", + "uid": "cluster-health-overview", + "version": 1 +}