{ "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "datasource", "uid": "grafana" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "description": "Single-pane-of-glass daily health overview \u2014 nodes, pods, quotas, storage, certs, GPU, power", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, "id": null, "links": [ { "asDropdown": true, "icon": "external link", "includeVars": false, "keepTime": true, "tags": [], "targetBlank": true, "title": "Dashboards", "type": "dashboards", "url": "" }, { "title": "API Server", "type": "link", "url": "/d/k8s_system_apisrv", "icon": "cloud", "targetBlank": true, "keepTime": true, "tooltip": "Kubernetes API Server" }, { "title": "Nodes", "type": "link", "url": "/d/k8s_views_nodes", "icon": "dashboard", "targetBlank": true, "keepTime": true, "tooltip": "Kubernetes Nodes View" }, { "title": "Pods", "type": "link", "url": "/d/k8s_views_pods", "icon": "dashboard", "targetBlank": true, "keepTime": true, "tooltip": "Kubernetes Pods View" }, { "title": "GPU", "type": "link", "url": "/d/Oxed_c6Wz", "icon": "bolt", "targetBlank": true, "keepTime": true, "tooltip": "NVIDIA DCGM Exporter" }, { "title": "iDRAC", "type": "link", "url": "/d/YVz226S4z", "icon": "server", "targetBlank": true, "keepTime": true, "tooltip": "Dell iDRAC Hardware" }, { "title": "UPS", "type": "link", "url": "/d/ee70yskqw5u68f", "icon": "battery-full", "targetBlank": true, "keepTime": true, "tooltip": "Huawei UPS 2000" }, { "title": "CoreDNS", "type": "link", "url": "/d/wY4blRMGz", "icon": "signal", "targetBlank": true, "keepTime": true, "tooltip": "CoreDNS" }, { "title": "Node Exporter", "type": "link", "url": "/d/rYdddlPWk", "icon": "monitor", "targetBlank": true, "keepTime": true, "tooltip": "Node Exporter Full" }, { "title": "Docker Registry", "type": "link", "url": "/d/CoBSgj8iz", "icon": "cube", "targetBlank": true, "keepTime": true, "tooltip": 
"Docker Registry" }, { "title": "Traefik", "type": "link", "url": "/d/n5bu_kv45", "icon": "exchange-alt", "targetBlank": true, "keepTime": true, "tooltip": "Traefik Ingress" }, { "title": "Loki Logs", "type": "link", "url": "/d/o6-BGgnnk", "icon": "file-alt", "targetBlank": true, "keepTime": true, "tooltip": "Loki Kubernetes Logs" }, { "title": "kube-state-metrics", "type": "link", "url": "/d/garysdevil-kube-state-metrics-v2", "icon": "graph-bar", "targetBlank": true, "keepTime": true, "tooltip": "kube-state-metrics v2" }, { "title": "Proxmox", "type": "link", "url": "/d/rYdddlPW", "icon": "server", "targetBlank": true, "keepTime": true, "tooltip": "Proxmox Node Exporter" } ], "panels": [ { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 70, "panels": [], "title": "Infrastructure", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": "celsius" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 0, "y": 1 }, "id": 78, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "CPU Temp", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "max(r730_idrac_idrac_sensors_temperature{name=\"CPU1 Temp\"})", "legendFormat": "CPU Temp", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": 
"celsius" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 3, "y": 1 }, "id": 71, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "GPU Temp", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "max(nvidia_tesla_t4_DCGM_FI_DEV_GPU_TEMP)", "legendFormat": "GPU Temp", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [ { "options": { "from": 1, "to": 999999, "result": { "color": "red", "text": "OUTAGE" } }, "type": "range" }, { "options": { "0": { "color": "green", "text": "MAINS" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 6, "y": 1 }, "id": 93, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "Power Source", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "max(ups_upsSecondsOnBattery)", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 60 }, { "color": "red", "value": 85 } ] }, "unit": "percent" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 9, "y": 1 }, "id": 72, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], 
"fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, "title": "GPU Util %", "type": "gauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "max(nvidia_tesla_t4_DCGM_FI_DEV_GPU_UTIL)", "legendFormat": "GPU Util", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 12000 }, { "color": "red", "value": 14000 } ] }, "unit": "decmbytes" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 12, "y": 1 }, "id": 73, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "GPU VRAM Used", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "max(nvidia_tesla_t4_DCGM_FI_DEV_FB_USED)", "legendFormat": "VRAM", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 10 }, { "color": "green", "value": 30 } ] }, "unit": "m" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 15, "y": 1 }, "id": 75, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "UPS Battery (min)", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "max(ups_upsEstimatedMinutesRemaining)", "legendFormat": "Battery", "refId": "A" } ] }, { "datasource": { "type": 
"prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 30 }, { "color": "green", "value": 80 } ] }, "unit": "percent" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 18, "y": 1 }, "id": 76, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, "title": "UPS Charge %", "type": "gauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "max(ups_upsEstimatedChargeRemaining)", "legendFormat": "Charge", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [ { "options": { "0": { "color": "green", "text": "0" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }, "noValue": "0" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 21, "y": 1 }, "id": 94, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "UPS Alarms", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "max(ups_upsAlarmsPresent)", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": 
"line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": true, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "watt", "min": 0 }, "overrides": [ { "matcher": { "id": "byName", "options": "Solar Production" }, "properties": [ { "id": "color", "value": { "fixedColor": "#FADE2A", "mode": "fixed" } }, { "id": "custom.fillOpacity", "value": 20 } ] }, { "matcher": { "id": "byName", "options": "CPU Temp" }, "properties": [ { "id": "custom.axisPlacement", "value": "right" }, { "id": "unit", "value": "celsius" }, { "id": "color", "value": { "fixedColor": "#FF9830", "mode": "fixed" } }, { "id": "custom.fillOpacity", "value": 0 } ] } ] }, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 5 }, "id": 77, "options": { "legend": { "calcs": [ "mean", "max", "min" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, "title": "Power Draw (24h)", "type": "timeseries", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "max by() (avg_over_time(r730_idrac_idrac_power_control_avg_consumed_watts[$__rate_interval]))", "legendFormat": "Consumed", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "max(haos_sensor_power_w{entity=\"sensor.fv_b_pv_power\"})", "legendFormat": "Solar Production", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "max(r730_idrac_idrac_sensors_temperature{name=\"CPU1 Temp\"})", "legendFormat": "CPU Temp", "refId": "C" } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }, "id": 10, "panels": [], 
"title": "Node Health", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 60 }, { "color": "red", "value": 80 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Node" }, "properties": [ { "id": "custom.width", "value": 160 } ] }, { "matcher": { "id": "byName", "options": "CPU %" }, "properties": [ { "id": "unit", "value": "percent" }, { "id": "decimals", "value": 1 }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } ] }, { "matcher": { "id": "byName", "options": "Memory %" }, "properties": [ { "id": "unit", "value": "percent" }, { "id": "decimals", "value": 1 }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } ] }, { "matcher": { "id": "byName", "options": "Disk %" }, "properties": [ { "id": "unit", "value": "percent" }, { "id": "decimals", "value": 1 }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } ] }, { "matcher": { "id": "byName", "options": "Pods" }, "properties": [ { "id": "custom.width", "value": 60 } ] } ] }, "gridPos": { "h": 8, "w": 8, "x": 0, "y": 14 }, "id": 11, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true, "sortBy": [ { "desc": true, "displayName": "CPU %" } ] }, "title": "Node Resource Table", "type": "table", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(1 - avg by(instance)(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100", "format": "table", "instant": true, "legendFormat": "", "refId": "CPU" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(1 - 
node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100", "format": "table", "instant": true, "legendFormat": "", "refId": "MEM" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(1 - node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"}) * 100", "format": "table", "instant": true, "legendFormat": "", "refId": "DISK" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "count by(node)(kube_pod_info)", "format": "table", "instant": true, "legendFormat": "", "refId": "PODS" } ], "transformations": [ { "id": "merge", "options": {} }, { "id": "organize", "options": { "excludeByName": { "Time": true }, "renameByName": { "instance": "Node", "node": "Node", "Value #CPU": "CPU %", "Value #MEM": "Memory %", "Value #DISK": "Disk %", "Value #PODS": "Pods" } } } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "percent", "max": 100, "min": 0 }, "overrides": [] }, "gridPos": { "h": 8, "w": 8, "x": 8, "y": 14 }, "id": 12, "options": { "legend": { "calcs": [ "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": 
"multi", "sort": "desc" } }, "title": "CPU per Node (24h)", "type": "timeseries", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(1 - avg by(instance)(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))) * 100", "legendFormat": "{{instance}}", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "percent", "max": 100, "min": 0 }, "overrides": [] }, "gridPos": { "h": 8, "w": 8, "x": 16, "y": 14 }, "id": 13, "options": { "legend": { "calcs": [ "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, "title": "Memory per Node (24h)", "type": "timeseries", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100", "legendFormat": "{{instance}}", "refId": "A" } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, "id": 40, "panels": [], "title": "Top Resource Consumers", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], 
"thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] }, "gridPos": { "h": 8, "w": 8, "x": 0, "y": 23 }, "id": 41, "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, "title": "Top 10 Namespaces by CPU", "type": "bargauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "topk(10, sum by(namespace)(rate(container_cpu_usage_seconds_total{container!=\"\"}[5m])))", "refId": "A", "format": "table", "instant": true } ], "transformations": [ { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "container": true, "endpoint": true, "instance": true, "job": true, "service": true, "uid": true, "id": true, "metrics_path": true, "node": true }, "renameByName": {} } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Value", "desc": true } ] } }, { "id": "rowsToFields", "options": {} } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "bytes" }, "overrides": [] }, "gridPos": { "h": 8, "w": 8, "x": 8, "y": 23 }, "id": 42, "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, "title": "Top 10 Namespaces by Memory", "type": "bargauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "topk(10, sum 
by(namespace)(container_memory_working_set_bytes{container!=\"\"}))", "refId": "A", "format": "table", "instant": true } ], "transformations": [ { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "container": true, "endpoint": true, "instance": true, "job": true, "service": true, "uid": true, "id": true, "metrics_path": true, "node": true }, "renameByName": {} } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Value", "desc": true } ] } }, { "id": "rowsToFields", "options": {} } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 20, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": true, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "bytes", "min": 0 }, "overrides": [] }, "gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }, "id": 43, "options": { "legend": { "calcs": [ "lastNotNull" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, "title": "GPU Memory by Namespace", "type": "timeseries", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum by (namespace) (gpu_pod_memory_used_bytes) > 0", "format": "time_series", "instant": false, "legendFormat": "{{namespace}}", "refId": "A" } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 31 }, "id": 80, 
"panels": [], "title": "Key Services", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 50 }, { "color": "green", "value": 80 } ] }, "unit": "percent", "decimals": 1 }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 8, "y": 32 }, "id": 83, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "DNS Cache Hit %", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(rate(coredns_cache_hits_total[5m])) / (sum(rate(coredns_cache_hits_total[5m])) + sum(rate(coredns_cache_misses_total[5m]))) * 100", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "blue", "value": null } ] }, "unit": "reqps", "decimals": 1 }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 12, "y": 32 }, "id": 84, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "Traefik req/s", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(rate(traefik_entrypoint_requests_total[5m]))", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", 
"value": null }, { "color": "orange", "value": 5 }, { "color": "red", "value": 15 } ] }, "unit": "percent", "noValue": "0", "decimals": 1 }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 16, "y": 32 }, "id": 85, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "Traefik Error %", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(rate(traefik_entrypoint_requests_total{code=~\"[45]..\"}[5m])) / sum(rate(traefik_entrypoint_requests_total[5m])) * 100", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 50 }, { "color": "green", "value": 80 } ] }, "unit": "percent", "decimals": 1 }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 20, "y": 32 }, "id": 86, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "Registry Cache %", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(registry_registry_storage_cache_total{type=\"Hit\"}) / sum(registry_registry_storage_cache_total) * 100", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "blue", "value": null } ] }, "decimals": 0 }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 0, "y": 36 }, "id": 87, "options": { 
"colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "Open Connections", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(max by(entrypoint, protocol) (traefik_open_connections))", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "blue", "value": null } ] }, "unit": "reqps", "decimals": 1 }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 8, "y": 36 }, "id": 89, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "DNS req/s", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(rate(coredns_dns_requests_total[5m]))", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 0.1 }, { "color": "red", "value": 0.5 } ] }, "unit": "s", "decimals": 3 }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 12, "y": 36 }, "id": 90, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "DNS Latency P99", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "histogram_quantile(0.99, 
sum(rate(coredns_dns_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 5 } ] }, "unit": "s", "decimals": 2 }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 16, "y": 36 }, "id": 91, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "Ingress Latency P99", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "histogram_quantile(0.99, sum(rate(traefik_entrypoint_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "", "refId": "A" } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 40 }, "id": 1, "panels": [], "title": "Cluster Summary", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 3 }, { "color": "green", "value": 5 } ] } }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 0, "y": 41 }, "id": 2, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "Nodes Ready", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(kube_node_status_condition{condition=\"Ready\",status=\"true\"})", "legendFormat": "Ready", "refId": "A" } ] }, { "datasource": { "type": 
"prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "blue", "value": null } ] } }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 4, "y": 41 }, "id": 3, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "Total Pods", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "count(kube_pod_info)", "legendFormat": "Pods", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [ { "options": { "0": { "color": "green", "text": "0" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }, "noValue": "0" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 8, "y": 41 }, "id": 4, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "Firing Alerts", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "count(ALERTS{alertstate=\"firing\"}) OR vector(0)", "legendFormat": "Firing", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 60 }, { "color": "red", "value": 80 } ] }, "unit": "percent" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 12, "y": 41 }, "id": 5, "options": { 
"minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, "title": "Cluster CPU %", "type": "gauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "avg(1 - rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100", "legendFormat": "CPU", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 60 }, { "color": "red", "value": 80 } ] }, "unit": "percent" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 16, "y": 41 }, "id": 6, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, "title": "Cluster Memory %", "type": "gauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(1 - sum(node_memory_MemAvailable_bytes) / sum(node_memory_MemTotal_bytes)) * 100", "legendFormat": "Memory", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": "percent" }, "overrides": [] }, "gridPos": { "h": 4, "w": 4, "x": 20, "y": 41 }, "id": 7, "options": { "minVizHeight": 75, "minVizWidth": 75, "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showThresholdLabels": false, "showThresholdMarkers": true }, "title": 
"Cluster Disk %", "type": "gauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(1 - sum(node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"}) / sum(node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"})) * 100", "legendFormat": "Disk", "refId": "A" } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 45 }, "id": 20, "panels": [], "title": "Resource Governance \u2014 Quota Utilization", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 90 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Namespace" }, "properties": [ { "id": "custom.width", "value": 160 } ] }, { "matcher": { "id": "byName", "options": "Resource" }, "properties": [ { "id": "custom.width", "value": 120 } ] }, { "matcher": { "id": "byName", "options": "Usage %" }, "properties": [ { "id": "unit", "value": "percentunit" }, { "id": "decimals", "value": 1 }, { "id": "min", "value": 0 }, { "id": "max", "value": 1 }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 0.7 }, { "color": "red", "value": 0.9 } ] } }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } ] } ] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 46 }, "id": 21, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true, "sortBy": [ { "desc": true, "displayName": "Usage %" } ] }, "title": "Quota Usage by Namespace", "type": "table", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum by (namespace, resourcequota, resource) (kube_resourcequota{type=\"used\"})", "format": "table", "instant": true, "legendFormat": "", "refId": "USED" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum by (namespace, resourcequota, resource) (kube_resourcequota{type=\"hard\"})", "format": 
"table", "instant": true, "legendFormat": "", "refId": "HARD" } ], "transformations": [ { "id": "merge", "options": {} }, { "id": "calculateField", "options": { "alias": "Usage %", "binary": { "left": "Value #USED", "operator": "/", "right": "Value #HARD", "reducer": "sum" }, "mode": "binary", "reduce": { "reducer": "sum" } } }, { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "type": true, "resourcequota": true }, "renameByName": { "namespace": "Namespace", "resource": "Resource", "Value #USED": "Used", "Value #HARD": "Hard Limit", "Usage %": "Usage %" } } }, { "id": "filterByValue", "options": { "filters": [ { "config": { "id": "greater", "options": { "value": 0 } }, "fieldName": "Hard Limit" } ], "match": "all", "type": "include" } } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 90 } ] }, "unit": "percent" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 46 }, "id": 22, "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, "title": "Top Quota Consumers", "type": "bargauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "topk(15, kube_resourcequota{type=\"used\",resource=~\"requests.cpu|requests.memory|limits.cpu|limits.memory|pods\"} / ignoring(type) kube_resourcequota{type=\"hard\",resource=~\"requests.cpu|requests.memory|limits.cpu|limits.memory|pods\"} * 100 > 0)", "refId": "A", "format": "table", "instant": true } ], "transformations": [ { 
"id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true, "service": true, "uid": true, "type": true, "resourcequota": true, "metrics_path": true, "resource": true }, "renameByName": {} } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Value", "desc": true } ] } }, { "id": "rowsToFields", "options": {} } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 54 }, "id": 30, "panels": [], "title": "Pod Health", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }, "noValue": "0" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 0, "y": 55 }, "id": 31, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "CrashLooping Pods", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "count(increase(kube_pod_container_status_restarts_total[1h]) > 5) OR vector(0)", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 5 } ] }, "noValue": "0" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 3, "y": 55 }, "id": 32, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" },
"title": "OOMKilled (24h)", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "count((max_over_time(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\"}[24h]) == 1) and on(namespace, pod, container) (increase(kube_pod_container_status_restarts_total[24h]) > 0)) OR vector(0)", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }, "noValue": "0" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 6, "y": 55 }, "id": 33, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "Failed Pods", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(kube_pod_status_phase{phase=\"Failed\"}) OR vector(0)", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 }, { "color": "red", "value": 5 } ] }, "noValue": "0" }, "overrides": [] }, "gridPos": { "h": 4, "w": 3, "x": 9, "y": 55 }, "id": 34, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto" }, "title": "Pending Pods", "type": "stat", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(kube_pod_status_phase{phase=\"Pending\"}) OR vector(0)", "legendFormat": "", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { 
"color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "min": 0 }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 55 }, "id": 35, "options": { "legend": { "calcs": [ "sum" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, "title": "Restart Rate (24h)", "type": "timeseries", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "sum(increase(kube_pod_container_status_restarts_total[1h]))", "legendFormat": "Total Restarts / hour", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 5 }, { "color": "red", "value": 20 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Namespace" }, "properties": [ { "id": "custom.width", "value": 140 } ] }, { "matcher": { "id": "byName", "options": "Pod" }, "properties": [ { "id": "custom.width", "value": 260 } ] }, { "matcher": { "id": "byName", "options": "Restarts" }, "properties": [ { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": 
"gauge" } } ] } ] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 63 }, "id": 36, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true, "sortBy": [ { "desc": true, "displayName": "Restarts" } ] }, "title": "High-Restart Pods", "type": "table", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "topk(15, kube_pod_container_status_restarts_total)", "format": "table", "instant": true, "legendFormat": "", "refId": "A" } ], "transformations": [ { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "uid": true, "container": true, "endpoint": true, "service": true }, "renameByName": { "namespace": "Namespace", "pod": "Pod", "Value": "Restarts" } } } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 71 }, "id": 50, "panels": [], "title": "Storage", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 85 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Namespace" }, "properties": [ { "id": "custom.width", "value": 140 } ] }, { "matcher": { "id": "byName", "options": "PVC" }, "properties": [ { "id": "custom.width", "value": 260 } ] }, { "matcher": { "id": "byName", "options": "Used %" }, "properties": [ { "id": "unit", "value": "percentunit" }, { "id": "decimals", "value": 1 }, { "id": "min", "value": 0 }, { "id": "max", "value": 1 }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 0.7 }, { "color": "red", "value": 0.85 } ] } }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } ] }, { "matcher": { "id": "byName", "options": "Used" }, "properties": [ { "id": "unit", "value": "bytes" } ] }, { "matcher": { "id": "byName", "options": "Capacity" }, 
"properties": [ { "id": "unit", "value": "bytes" } ] } ] }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 72 }, "id": 51, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true, "sortBy": [ { "desc": true, "displayName": "Used %" } ] }, "title": "PV Usage Table", "type": "table", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "kubelet_volume_stats_used_bytes", "format": "table", "instant": true, "legendFormat": "", "refId": "USED" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "kubelet_volume_stats_capacity_bytes", "format": "table", "instant": true, "legendFormat": "", "refId": "CAP" } ], "transformations": [ { "id": "merge", "options": {} }, { "id": "calculateField", "options": { "alias": "Used %", "binary": { "left": "Value #USED", "operator": "/", "right": "Value #CAP" }, "mode": "binary" } }, { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true, "service": true, "node": true, "metrics_path": true }, "renameByName": { "namespace": "Namespace", "persistentvolumeclaim": "PVC", "Value #USED": "Used", "Value #CAP": "Capacity", "Used %": "Used %" } } }, { "id": "filterByValue", "options": { "filters": [ { "config": { "id": "greater", "options": { "value": 0 } }, "fieldName": "Capacity" } ], "match": "all", "type": "include" } } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 100, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 85 } ] }, "unit": "percent" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 72 }, "id": 52, "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, 
"minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, "title": "PV Usage Bar", "type": "bargauge", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100", "legendFormat": "{{namespace}}/{{persistentvolumeclaim}}", "refId": "A" } ] }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": true, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 85 } ] }, "unit": "percent", "max": 100, "min": 0 }, "overrides": [ { "matcher": { "id": "byRegexp", "options": "/^PVE /" }, "properties": [ { "id": "custom.lineStyle", "value": { "dash": [ 10, 10 ], "fill": "dash" } }, { "id": "custom.lineWidth", "value": 2 } ] } ] }, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 80 }, "id": 53, "options": { "legend": { "calcs": [ "mean", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, "title": "Node Disk Usage (24h)", "type": "timeseries", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(1 - 
node_filesystem_avail_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"} / node_filesystem_size_bytes{mountpoint=\"/\",fstype!~\"tmpfs\"}) * 100", "legendFormat": "{{instance}}", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "pve_disk_usage_bytes{id=~\"storage/.*\"} / pve_disk_size_bytes{id=~\"storage/.*\"} * 100", "legendFormat": "PVE {{id}}", "refId": "B" } ] }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 88 }, "id": 60, "panels": [], "title": "Certificate Expiry", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "orange", "value": 7 }, { "color": "green", "value": 30 } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Certificate" }, "properties": [ { "id": "custom.width", "value": 400 } ] }, { "matcher": { "id": "byName", "options": "Days Remaining" }, "properties": [ { "id": "decimals", "value": 0 }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } } ] } ] }, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 89 }, "id": 61, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true, "sortBy": [ { "desc": false, "displayName": "Days Remaining" } ] }, "title": "TLS Certificate Expiry", "type": "table", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "expr": "(traefik_tls_certs_not_after - time()) / 86400", "format": "table", "instant": true, "legendFormat": "", "refId": "A" } ], "transformations": [ { "id": "organize", "options": { "excludeByName": { "Time": true, "__name__": true, "job": true, "instance": true, "endpoint": true, "service": true, "sans": true }, 
"renameByName": { "cn": "Certificate", "serial": "Serial", "Value": "Days Remaining" } } }, { "id": "sortBy", "options": { "fields": {}, "sort": [ { "field": "Days Remaining", "desc": false } ] } } ] } ], "refresh": "5m", "schemaVersion": 39, "tags": [ "cluster-health", "daily-report" ], "templating": { "list": [ { "current": { "selected": false, "text": "Prometheus", "value": "PBFA97CFB590B2093" }, "hide": 0, "includeAll": false, "multi": false, "name": "datasource", "options": [], "query": "prometheus", "queryValue": "", "refresh": 1, "regex": "", "skipUrlSync": false, "type": "datasource" } ] }, "time": { "from": "now-24h", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Cluster Health Overview", "uid": "cluster-health-overview", "version": 1 }