monitoring: wire rpi-sofia (Sofia Pi) into Prometheus/Loki/alerts
The Sofia Raspberry Pi hung this morning (network wedged ~10:13, HA sensors dead, and its local journal had been silent since Apr 27 — a 2017 SD card intermittently flipping the rootfs read-only). Nothing was captured because logging lived only on the failing card.

Ship telemetry off-box so the next failure is diagnosable centrally:

- Prometheus scrape job `rpi-sofia` (rpi-sofia.viktorbarzin.lan:9100) — node_exporter + a vcgencmd textfile collector on the Pi exporting under-voltage/throttle/SoC-temp as rpi_* metrics.
- Alert group "RPi Sofia": node_exporter Down, rootfs ReadOnly (the exact SD-failure signature), Under-voltage since boot, High SoC temp.
- LAN-gated Loki write ingress (loki.viktorbarzin.lan) so the Pi's promtail can push its journal — Loki was ClusterIP-only.
- Grafana dashboard "RPi Sofia" (Hardware): status, undervoltage/throttle, temp, load, memory, disk, network.

The Pi separately got a systemd hardware watchdog (auto-reboot on a hard hang; today it stayed down ~5h until a manual power-cycle).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
5b96b841fc
commit
f9376a36ff
4 changed files with 298 additions and 0 deletions
230
stacks/monitoring/modules/monitoring/dashboards/rpi-sofia.json
Normal file
230
stacks/monitoring/modules/monitoring/dashboards/rpi-sofia.json
Normal file
|
|
@ -0,0 +1,230 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": { "type": "grafana", "uid": "-- Grafana --" },
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "rpi-sofia (Raspberry Pi 3, Sofia home site) — health + forensic signals. Frigate camera DNAT passthrough + solar inverter path + HA MQTT sensors run on this Pi. The rpi_* metrics come from a vcgencmd textfile collector; the rest from node_exporter.",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] }
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
|
||||
"title": "Status",
|
||||
"type": "stat",
|
||||
"targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "up{job=\"rpi-sofia\"}", "refId": "A" } ]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "options": { "0": { "color": "green", "text": "OK" }, "1": { "color": "red", "text": "YES" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
|
||||
"id": 2,
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
|
||||
"title": "Under-voltage (since boot)",
|
||||
"type": "stat",
|
||||
"targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_under_voltage_occurred{instance=\"rpi-sofia\"}", "refId": "A" } ]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "options": { "0": { "color": "green", "text": "No" }, "1": { "color": "red", "text": "THROTTLED" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
|
||||
"id": 3,
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
|
||||
"title": "Throttled now",
|
||||
"type": "stat",
|
||||
"targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_throttled_now{instance=\"rpi-sofia\"}", "refId": "A" } ]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{ "options": { "0": { "color": "green", "text": "rw" }, "1": { "color": "red", "text": "READ-ONLY" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
|
||||
"id": 4,
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
|
||||
"title": "Rootfs mount state",
|
||||
"type": "stat",
|
||||
"targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_filesystem_readonly{instance=\"rpi-sofia\", mountpoint=\"/\"}", "refId": "A" } ]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
|
||||
"unit": "celsius",
|
||||
"thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 80 } ] }
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
|
||||
"id": 5,
|
||||
"options": { "legend": { "calcs": ["last", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
|
||||
"title": "SoC Temperature",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_soc_temp_celsius{instance=\"rpi-sofia\"}", "legendFormat": "vcgencmd temp", "refId": "A" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_thermal_zone_temp{instance=\"rpi-sofia\"}", "legendFormat": "thermal zone", "refId": "B" }
|
||||
]
|
||||
},
|
||||
{
  "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
  "fieldConfig": {
    "defaults": {
      "color": { "mode": "palette-classic" },
      "custom": { "drawStyle": "line", "fillOpacity": 20, "lineInterpolation": "stepAfter", "lineWidth": 2, "showPoints": "never" },
      "max": 1,
      "min": 0
    },
    "overrides": []
  },
  "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
  "id": 6,
  "options": { "legend": { "calcs": ["max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
  "title": "Throttle / Under-voltage events (1 = active)",
  "type": "timeseries",
  "targets": [
    { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_under_voltage_now{instance=\"rpi-sofia\"}", "legendFormat": "under-voltage now", "refId": "A" },
    { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_under_voltage_occurred{instance=\"rpi-sofia\"}", "legendFormat": "under-voltage since boot", "refId": "B" },
    { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_throttled_now{instance=\"rpi-sofia\"}", "legendFormat": "throttled now", "refId": "C" },
    { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_throttled_occurred{instance=\"rpi-sofia\"}", "legendFormat": "throttled since boot", "refId": "D" }
  ]
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
|
||||
"id": 7,
|
||||
"options": { "legend": { "calcs": ["last", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
|
||||
"title": "CPU load average",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_load1{instance=\"rpi-sofia\"}", "legendFormat": "load1", "refId": "A" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_load5{instance=\"rpi-sofia\"}", "legendFormat": "load5", "refId": "B" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_load15{instance=\"rpi-sofia\"}", "legendFormat": "load15", "refId": "C" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
|
||||
"id": 8,
|
||||
"options": { "legend": { "calcs": ["last", "min"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
|
||||
"title": "Memory",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_memory_MemAvailable_bytes{instance=\"rpi-sofia\"}", "legendFormat": "available", "refId": "A" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_memory_MemTotal_bytes{instance=\"rpi-sofia\"}", "legendFormat": "total", "refId": "B" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
|
||||
"unit": "bytes"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
|
||||
"id": 9,
|
||||
"options": { "legend": { "calcs": ["last", "min"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
|
||||
"title": "Root filesystem free space",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_filesystem_avail_bytes{instance=\"rpi-sofia\", mountpoint=\"/\"}", "legendFormat": "/ available", "refId": "A" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
|
||||
"unit": "Bps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
|
||||
"id": 10,
|
||||
"options": { "legend": { "calcs": ["last", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
|
||||
"title": "Network throughput per interface",
|
||||
"type": "timeseries",
|
||||
"targets": [
|
||||
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rate(node_network_receive_bytes_total{instance=\"rpi-sofia\", device!=\"lo\"}[5m])", "legendFormat": "rx {{device}}", "refId": "A" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rate(node_network_transmit_bytes_total{instance=\"rpi-sofia\", device!=\"lo\"}[5m])", "legendFormat": "tx {{device}}", "refId": "B" }
|
||||
]
|
||||
}
|
||||
],
|
||||
"refresh": "1m",
|
||||
"schemaVersion": 39,
|
||||
"tags": ["rpi-sofia", "hardware", "sofia"],
|
||||
"templating": { "list": [] },
|
||||
"time": { "from": "now-24h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "RPi Sofia",
|
||||
"uid": "rpi-sofia",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
|
|
@ -124,6 +124,7 @@ locals {
|
|||
"idrac.json" = "Hardware"
|
||||
"ups.json" = "Hardware"
|
||||
"nvidia.json" = "Hardware"
|
||||
"rpi-sofia.json" = "Hardware"
|
||||
|
||||
# Operations
|
||||
"backup_health.json" = "Operations"
|
||||
|
|
|
|||
22
stacks/monitoring/modules/monitoring/loki_ingress.tf
Normal file
22
stacks/monitoring/modules/monitoring/loki_ingress.tf
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
# Loki write/push endpoint for EXTERNAL hosts (currently rpi-sofia's promtail).
|
||||
#
|
||||
# Loki runs SingleBinary with the gateway disabled and auth_enabled=false, so it
|
||||
# is ClusterIP-only (svc "loki":3100) and unreachable from off-cluster. An
|
||||
# external log shipper like the Sofia Raspberry Pi cannot POST to
|
||||
# /loki/api/v1/push without this ingress.
|
||||
#
|
||||
# auth = "none": promtail ships logs programmatically (no browser, no Authentik
|
||||
# SSO cookie dance). The allow_local_access_only middleware (192.168.0.0/16 +
|
||||
# 10.0.0.0/8) gates the endpoint to LAN/VPN only — the correct model for a
|
||||
# LAN-only Pi, mirroring the idrac-redfish-exporter ingress in this module.
|
||||
module "loki-write-ingress" {
  source = "../../../../modules/kubernetes/ingress_factory"

  # Where the ingress lives and what it fronts: the in-cluster Loki HTTP port.
  namespace   = kubernetes_namespace.monitoring.metadata[0].name
  name        = "loki"
  root_domain = "viktorbarzin.lan"
  port        = 3100

  # TLS material is supplied by the caller; plain HTTP is not force-redirected.
  tls_secret_name = var.tls_secret_name
  ssl_redirect    = false

  # Access model: no SSO in front of the endpoint, source-network gating only.
  # NOTE(review): no path restriction is passed here, so this presumably
  # exposes every Loki HTTP route on port 3100 (queries included), not only
  # /loki/api/v1/push — confirm against ingress_factory's defaults.
  auth                    = "none"
  allow_local_access_only = true
}
|
||||
|
|
@ -814,6 +814,36 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "Fan unhealthy on R730 - check iDRAC"
|
||||
# Alert rules for the Sofia Raspberry Pi (scrape job "rpi-sofia").
- name: RPi Sofia
  rules:
    # Scrape of the Pi's node_exporter has been failing for 5 minutes.
    - alert: RpiSofiaDown
      expr: up{job="rpi-sofia"} == 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "rpi-sofia node_exporter unreachable for 5m — Pi down or network/SD wedge (cameras + solar feed at risk)"

    # Root filesystem reported read-only for 2 minutes.
    - alert: RpiSofiaFilesystemReadonly
      expr: node_filesystem_readonly{instance="rpi-sofia", mountpoint="/"} == 1
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: "rpi-sofia rootfs is READ-ONLY — failing SD card (the silent-journal failure mode from this incident). Reflash/replace the card."

    # The "occurred" under-voltage flag has been set for 5 minutes.
    - alert: RpiSofiaUndervoltage
      expr: rpi_under_voltage_occurred{instance="rpi-sofia"} == 1
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "rpi-sofia under-voltage detected since last boot — check PSU/USB power cable"

    # SoC temperature above 75 for 10 minutes.
    - alert: RpiSofiaHighTemp
      expr: rpi_soc_temp_celsius{instance="rpi-sofia"} > 75
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "rpi-sofia SoC temp: {{ $value | printf \"%.0f\" }}°C (threshold: 75°C)"
|
||||
- name: Nvidia Tesla T4 GPU
|
||||
rules:
|
||||
- alert: HighGPUTemp
|
||||
|
|
@ -3058,6 +3088,21 @@ extraScrapeConfigs: |
|
|||
- source_labels: [__address__]
|
||||
target_label: instance
|
||||
replacement: 'pve-node-r730' # Giving it a friendly name
|
||||
# rpi-sofia: external Raspberry Pi 3 at the Sofia home site (Frigate camera
|
||||
# DNAT passthrough + solar inverter path + HA MQTT sensors). node_exporter
|
||||
# installed via apt; the rpi_* metrics come from a vcgencmd textfile collector
|
||||
# (undervoltage/throttle/SoC temp). Scraped by hostname (-> 192.168.1.10 wired).
|
||||
- job_name: 'rpi-sofia'
  metrics_path: '/metrics'
  static_configs:
    # Single fixed target: node_exporter on the Pi, addressed by hostname.
    # NOTE(review): the trailing dot makes the name absolute — presumably to
    # keep DNS search domains from being appended; confirm.
    - targets:
        - "rpi-sofia.viktorbarzin.lan.:9100"
      labels:
        node: 'rpi-sofia'
  relabel_configs:
    # Overwrite the instance label with a fixed short name, so queries and
    # alerts can select instance="rpi-sofia" instead of the scrape address.
    - source_labels: [__address__]
      target_label: instance
      replacement: 'rpi-sofia'
|
||||
- job_name: 'istiod'
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue