{ "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "datasource", "uid": "grafana" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "description": "Single-pane-of-glass daily health overview — nodes, pods, quotas, storage, certs, GPU, power", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, "id": null, "links": [], "panels": [ { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 1, "panels": [], "title": "Cluster Summary", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 3 }, { "color": "green", "value": 5 } ] } }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, "id": 2, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, "title": "Nodes Ready", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", "legendFormat": "Ready", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "blue", "value": null } ] } }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, "id": 3, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, "title": "Total Pods", "type": "stat", "targets": [ { "datasource": { "type": 
"prometheus", "uid": "${datasource}" }, "expr": "count(kube_pod_info)", "legendFormat": "Pods", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [ { "options": { "0": { "color": "green", "text": "0" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }, "noValue": "0" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, "id": 4, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, "title": "Firing Alerts", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "count(ALERTS{alertstate=\"firing\"}) OR vector(0)", "legendFormat": "Firing", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 60 }, { "color": "red", "value": 80 } ] }, "unit": "percent" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, "id": 5, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, "title": "Cluster CPU %", "type": "gauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "avg(1 - rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100", "legendFormat": "CPU", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, 
"mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 60 }, { "color": "red", "value": 80 } ] }, "unit": "percent" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, "id": 6, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, "title": "Cluster Memory %", "type": "gauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(1 - sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)) * 100", "legendFormat": "Memory", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": "percent" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, "id": 7, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, "title": "Cluster Disk %", "type": "gauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(1 - sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"})) * 100", "legendFormat": "Disk", "refId": "A" } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, "id": 10, "panels": [], "title": "Node Health", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": 
"thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 60 }, { "color": "red", "value": 80 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Node" }, "properties": [{ "id": "custom.width", "value": 160 }] }, { "matcher": { "id": "byName", "options": "CPU %" }, "properties": [ { "id": "unit", "value": "percent" }, { "id": "decimals", "value": 1 }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } ] }, { "matcher": { "id": "byName", "options": "Memory %" }, "properties": [ { "id": "unit", "value": "percent" }, { "id": "decimals", "value": 1 }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } ] }, { "matcher": { "id": "byName", "options": "Disk %" }, "properties": [ { "id": "unit", "value": "percent" }, { "id": "decimals", "value": 1 }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } ] }, { "matcher": { "id": "byName", "options": "Pods" }, "properties": [{ "id": "custom.width", "value": 60 }] } ] }, "gridPos": { "h": 8, "w": 8, "x": 0, "y": 6 }, "id": 11, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "CPU %" }] }, "title": "Node Resource Table", "type": "table", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(1 - avg by(instance)(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100", "format": "table", "instant": true, "legendFormat": "", "refId": "CPU" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100", "format": "table", "instant": true, "legendFormat": "", "refId": "MEM" }, { "datasource": { "type": 
"prometheus", "uid": "${datasource}" }, "expr": "(1 - node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"}) * 100", "format": "table", "instant": true, "legendFormat": "", "refId": "DISK" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "count by(node)(kube_pod_info)", "format": "table", "instant": true, "legendFormat": "", "refId": "PODS" } ], "transformations": [ { "id": "merge", "options": {} }, { "id": "organize", "options": { "excludeByName": { "Time": true }, "renameByName": { "instance": "Node", "node": "Node", "Value #CPU": "CPU %", "Value #MEM": "Memory %", "Value #DISK": "Disk %", "Value #PODS": "Pods" } } } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "percent", "max": 100, "min": 0 }, "overrides": [] }, "gridPos": { "h": 8, "w": 8, "x": 8, "y": 6 }, "id": 12, "options": { "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, "title": "CPU per Node (24h)", "type": "timeseries", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": 
"(1 - avg by(instance)(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100", "legendFormat": "{{instance}}", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "percent", "max": 100, "min": 0 }, "overrides": [] }, "gridPos": { "h": 8, "w": 8, "x": 16, "y": 6 }, "id": 13, "options": { "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, "title": "Memory per Node (24h)", "type": "timeseries", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100", "legendFormat": "{{instance}}", "refId": "A" } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, "id": 20, "panels": [], "title": "Resource Governance — Quota Utilization", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 
null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 90 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 160 }] }, { "matcher": { "id": "byName", "options": "Resource" }, "properties": [{ "id": "custom.width", "value": 120 }] }, { "matcher": { "id": "byName", "options": "Usage %" }, "properties": [ { "id": "unit", "value": "percentunit" }, { "id": "min", "value": 0 }, { "id": "max", "value": 1 }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 0.7 }, { "color": "red", "value": 0.9 } ] } }, { "id": "decimals", "value": 1 }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } ] } ] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 }, "id": 21, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "Usage %" }] }, "title": "Quota Usage by Namespace", "type": "table", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum by(namespace, resourcequota, resource)(kube_resourcequota{type=\"used\"})", "format": "table", "instant": true, "legendFormat": "", "refId": "USED" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum by(namespace, resourcequota, resource)(kube_resourcequota{type=\"hard\"})", "format": "table", "instant": true, "legendFormat": "", "refId": "HARD" } ], "transformations": [ { "id": "merge", "options": {} }, { "id": "calculateField", "options": { "alias": "Usage %", "binary": { "left": "Value #USED", "operator": "/", "right": "Value #HARD", "reducer": "sum" }, "mode": "binary", "reduce": { "reducer": "sum" } } }, { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "type": true, "resourcequota": true }, "renameByName": { "namespace": "Namespace", "resource": "Resource", "Value #USED": "Used", "Value #HARD": "Hard Limit", "Usage %": "Usage %" } } }, { "id": "filterByValue", "options": { "filters": [ { "config": { "id": "greater", "options": { "value": 0 } }, "fieldName": "Hard Limit" } ], "match": "all", "type": "include" } } 
] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percent" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 }, "id": 22, "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, "title": "Top Quota Consumers", "type": "bargauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "topk(15, kube_resourcequota{type=\"used\",resource=~\"requests.cpu|requests.memory|limits.cpu|limits.memory|pods\"} / ignoring(type) kube_resourcequota{type=\"hard\",resource=~\"requests.cpu|requests.memory|limits.cpu|limits.memory|pods\"} * 100 > 0)", "legendFormat": "{{namespace}} / {{resource}}", "refId": "A" } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, "id": 30, "panels": [], "title": "Pod Health", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }, "noValue": "0" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 0, "y": 24 }, "id": 31, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, "title": "CrashLooping Pods", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" 
}, "expr": "count(increase(kube_pod_container_status_restarts_total[1h]) > 5) OR vector(0)", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 5 } ] }, "noValue": "0" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 3, "y": 24 }, "id": 32, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, "title": "OOMKilled (24h)", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(increase(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"}[24h])) OR vector(0)", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }, "noValue": "0" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 6, "y": 24 }, "id": 33, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, "title": "Failed Pods", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(kube_pod_status_phase{phase=\"Failed\"}) OR vector(0)", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", 
"steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 5 } ] }, "noValue": "0" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 9, "y": 24 }, "id": 34, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, "title": "Pending Pods", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "count(kube_pod_status_phase{phase=\"Pending\"}) OR vector(0)", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "min": 0 }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, "id": 35, "options": { "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, "title": "Restart Rate (24h)", "type": "timeseries", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(increase(kube_pod_container_status_restarts_total[1h]))", "legendFormat": "Total Restarts / hour", "refId": "A" } ] }, { "datasource": { "type": 
"prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 5 }, { "color": "red", "value": 20 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 140 }] }, { "matcher": { "id": "byName", "options": "Pod" }, "properties": [{ "id": "custom.width", "value": 260 }] }, { "matcher": { "id": "byName", "options": "Restarts" }, "properties": [ { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } ] } ] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 28 }, "id": 36, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "Restarts" }] }, "title": "High-Restart Pods", "type": "table", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "topk(15, kube_pod_container_status_restarts_total)", "format": "table", "instant": true, "legendFormat": "", "refId": "A" } ], "transformations": [ { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "uid": true, "container": true, "endpoint": true, "service": true }, "renameByName": { "namespace": "Namespace", "pod": "Pod", "Value": "Restarts" } } } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 36 }, "id": 40, "panels": [], "title": "Top Resource Consumers", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] 
}, "gridPos": { "h": 8, "w": 8, "x": 0, "y": 37 }, "id": 41, "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, "title": "Top 10 Namespaces by CPU", "type": "bargauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "topk(10, sum by(namespace)(rate(container_cpu_usage_seconds_total{container!=\"\"}[5m])))", "legendFormat": "{{namespace}}", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "bytes" }, "overrides": [] }, "gridPos": { "h": 8, "w": 8, "x": 8, "y": 37 }, "id": 42, "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, "title": "Top 10 Namespaces by Memory", "type": "bargauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "topk(10, sum by(namespace)(container_memory_working_set_bytes{container!=\"\"}))", "legendFormat": "{{namespace}}", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "bytes" }, "overrides": [] }, "gridPos": { "h": 8, "w": 8, "x": 16, "y": 37 }, "id": 43, "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, 
"namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, "title": "Top 10 Pods by Memory", "type": "bargauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "topk(10, container_memory_working_set_bytes{container!=\"\"})", "legendFormat": "{{namespace}}/{{pod}}", "refId": "A" } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }, "id": 50, "panels": [], "title": "Storage", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 85 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Namespace" }, "properties": [{ "id": "custom.width", "value": 140 }] }, { "matcher": { "id": "byName", "options": "PVC" }, "properties": [{ "id": "custom.width", "value": 260 }] }, { "matcher": { "id": "byName", "options": "Used %" }, "properties": [ { "id": "unit", "value": "percent" }, { "id": "decimals", "value": 1 }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } ] }, { "matcher": { "id": "byName", "options": "Used" }, "properties": [{ "id": "unit", "value": "bytes" }] }, { "matcher": { "id": "byName", "options": "Capacity" }, "properties": [{ "id": "unit", "value": "bytes" }] } ] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 46 }, "id": 51, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "Used %" }] }, "title": "PV Usage Table", "type": "table", "targets": [ { "datasource": { 
"type": "prometheus", "uid": "${datasource}" }, "expr": "kubelet_volume_stats_used_bytes", "format": "table", "instant": true, "legendFormat": "", "refId": "USED" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "kubelet_volume_stats_capacity_bytes", "format": "table", "instant": true, "legendFormat": "", "refId": "CAP" } ], "transformations": [ { "id": "merge", "options": {} }, { "id": "calculateField", "options": { "alias": "Used %", "binary": { "left": "Value #USED", "operator": "/", "right": "Value #CAP" }, "mode": "binary" } }, { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true, "service": true, "node": true, "metrics_path": true }, "renameByName": { "namespace": "Namespace", "persistentvolumeclaim": "PVC", "Value #USED": "Used", "Value #CAP": "Capacity", "Used %": "Used %" } } }, { "id": "filterByValue", "options": { "filters": [ { "config": { "id": "greater", "options": { "value": 0 } }, "fieldName": "Capacity" } ], "match": "all", "type": "include" } } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": "percent" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 46 }, "id": 52, "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, "title": "PV Usage Bar", "type": "bargauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "kubelet_volume_stats_used_bytes / 
kubelet_volume_stats_capacity_bytes * 100", "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 85 } ] }, "unit": "percent", "max": 100, "min": 0 }, "overrides": [] }, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 54 }, "id": 53, "options": { "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, "title": "Node Disk Usage (24h)", "type": "timeseries", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(1 - node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"}) * 100", "legendFormat": "{{instance}}", "refId": "A" } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 62 }, "id": 60, "panels": [], "title": "Certificate Expiry", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", 
"steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 7 }, { "color": "green", "value": 30 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Certificate" }, "properties": [ { "id": "custom.width", "value": 400 } ] }, { "matcher": { "id": "byName", "options": "Days Remaining" }, "properties": [ { "id": "decimals", "value": 0 }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } ] } ] }, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 63 }, "id": 61, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": false, "displayName": "Days Remaining" }] }, "title": "TLS Certificate Expiry", "type": "table", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(traefik_tls_certs_not_after - time()) / 86400", "format": "table", "instant": true, "legendFormat": "", "refId": "A" } ], "transformations": [ { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true, "service": true, "sans": true }, "renameByName": { "cn": "Certificate", "serial": "Serial", "Value": "Days Remaining" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [{ "field": "Days Remaining", "desc": false }] } } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 71 }, "id": 70, "panels": [], "title": "Infrastructure", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": "celsius" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 0, "y": 72 }, "id": 71, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", 
"orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, "title": "GPU Temp", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP", "legendFormat": "GPU Temp", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 60 }, { "color": "red", "value": 85 } ] }, "unit": "percent" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 4, "y": 72 }, "id": 72, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, "title": "GPU Util %", "type": "gauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "nvidia_tesla_t4_DCGM_FI_DEV_GPU_UTIL", "legendFormat": "GPU Util", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 12000 }, { "color": "red", "value": 14000 } ] }, "unit": "decmbytes" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 8, "y": 72 }, "id": 73, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, "title": "GPU VRAM Used", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "nvidia_tesla_t4_DCGM_FI_DEV_FB_USED", "legendFormat": "VRAM", 
"refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 400 }, { "color": "red", "value": 600 } ] }, "unit": "watt" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 12, "y": 72 }, "id": 74, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, "title": "Server Power", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "avg(r730_idrac_idrac_power_supply_input_watts)", "legendFormat": "Power", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 10 }, { "color": "green", "value": 30 } ] }, "unit": "m" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 16, "y": 72 }, "id": 75, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, "title": "UPS Battery (min)", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "ups_upsEstimatedMinutesRemaining", "legendFormat": "Battery", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 30 }, { "color": "green", "value": 80 } ] }, "unit": 
"percent" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 20, "y": 72 }, "id": 76, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, "title": "UPS Charge %", "type": "gauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "ups_upsEstimatedChargeRemaining", "legendFormat": "Charge", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "watt", "min": 0 }, "overrides": [] }, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 76 }, "id": 77, "options": { "legend": { "calcs": ["mean", "max", "min"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, "title": "Power Draw (24h)", "type": "timeseries", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "avg_over_time(r730_idrac_idrac_power_supply_input_watts[$__rate_interval])", "legendFormat": "PSU {{id}}", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": 
"avg_over_time(r730_idrac_idrac_power_control_avg_consumed_watts[$__rate_interval])", "legendFormat": "Avg Consumed", "refId": "B" } ] } ], "refresh": "5m", "schemaVersion": 39, "tags": ["cluster-health", "daily-report"], "templating": { "list": [ { "current": { "selected": false, "text": "Prometheus", "value": "PBFA97CFB590B2093" }, "hide": 0, "includeAll": false, "multi": false, "name": "datasource", "options": [], "query": "prometheus", "queryValue": "", "refresh": 1, "regex": "", "skipUrlSync": false, "type": "datasource" } ] }, "time": { "from": "now-24h", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Cluster Health Overview", "uid": "cluster-health-overview", "version": 1 }