monitoring: wire rpi-sofia (Sofia Pi) into Prometheus/Loki/alerts
Some checks failed
ci/woodpecker/push/default Pipeline failed
ci/woodpecker/push/build-cli Pipeline was successful

The Sofia Raspberry Pi hung this morning (network wedged ~10:13, HA
sensors dead, and its local journal had been silent since Apr 27 — a
2017 SD card intermittently flipping the rootfs read-only). Nothing was
captured because logging lived only on the failing card. Ship telemetry
off-box so the next failure is diagnosable centrally:

- Prometheus scrape job `rpi-sofia` (rpi-sofia.viktorbarzin.lan:9100) —
  node_exporter + a vcgencmd textfile collector on the Pi exporting
  under-voltage/throttle/SoC-temp as rpi_* metrics.
- Alert group "RPi Sofia": node_exporter Down, rootfs ReadOnly (the
  exact SD-failure signature), Under-voltage since boot, High SoC temp.
- LAN-gated Loki write ingress (loki.viktorbarzin.lan) so the Pi's
  promtail can push its journal — Loki was ClusterIP-only.
- Grafana dashboard "RPi Sofia" (Hardware): status, undervoltage/
  throttle, temp, load, memory, disk, network.

The Pi separately got a systemd hardware watchdog (auto-reboot on a hard
hang; today it stayed down ~5h until a manual power-cycle).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-05 13:11:40 +00:00
parent 5b96b841fc
commit f9376a36ff
4 changed files with 298 additions and 0 deletions

View file

@ -0,0 +1,230 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": { "type": "grafana", "uid": "-- Grafana --" },
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "rpi-sofia (Raspberry Pi 3, Sofia home site) — health + forensic signals. Frigate camera DNAT passthrough + solar inverter path + HA MQTT sensors run on this Pi. The rpi_* metrics come from a vcgencmd textfile collector; the rest from node_exporter.",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
"id": 1,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"title": "Status",
"type": "stat",
"targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "up{job=\"rpi-sofia\"}", "refId": "A" } ]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "options": { "0": { "color": "green", "text": "OK" }, "1": { "color": "red", "text": "YES" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
"id": 2,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"title": "Under-voltage (since boot)",
"type": "stat",
"targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_under_voltage_occurred{instance=\"rpi-sofia\"}", "refId": "A" } ]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "options": { "0": { "color": "green", "text": "No" }, "1": { "color": "red", "text": "THROTTLED" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
"id": 3,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"title": "Throttled now",
"type": "stat",
"targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_throttled_now{instance=\"rpi-sofia\"}", "refId": "A" } ]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "options": { "0": { "color": "green", "text": "rw" }, "1": { "color": "red", "text": "READ-ONLY" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
"id": 4,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"title": "Rootfs mount state",
"type": "stat",
"targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_filesystem_readonly{instance=\"rpi-sofia\", mountpoint=\"/\"}", "refId": "A" } ]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
"unit": "celsius",
"thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 80 } ] }
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"id": 5,
"options": { "legend": { "calcs": ["last", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
"title": "SoC Temperature",
"type": "timeseries",
"targets": [
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_soc_temp_celsius{instance=\"rpi-sofia\"}", "legendFormat": "vcgencmd temp", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_thermal_zone_temp{instance=\"rpi-sofia\"}", "legendFormat": "thermal zone", "refId": "B" }
]
},
{
  "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
  "fieldConfig": {
    "defaults": {
      "color": { "mode": "palette-classic" },
      "custom": { "drawStyle": "line", "fillOpacity": 20, "lineInterpolation": "stepAfter", "lineWidth": 2, "showPoints": "never" },
      "max": 1,
      "min": 0
    },
    "overrides": []
  },
  "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
  "id": 6,
  "options": { "legend": { "calcs": ["max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
  "title": "Throttle / Under-voltage events (1 = active)",
  "type": "timeseries",
  "targets": [
    { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_under_voltage_now{instance=\"rpi-sofia\"}", "legendFormat": "under-voltage now", "refId": "A" },
    { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_under_voltage_occurred{instance=\"rpi-sofia\"}", "legendFormat": "under-voltage since boot", "refId": "B" },
    { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_throttled_now{instance=\"rpi-sofia\"}", "legendFormat": "throttled now", "refId": "C" },
    { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_throttled_occurred{instance=\"rpi-sofia\"}", "legendFormat": "throttled since boot", "refId": "D" }
  ]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
"id": 7,
"options": { "legend": { "calcs": ["last", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
"title": "CPU load average",
"type": "timeseries",
"targets": [
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_load1{instance=\"rpi-sofia\"}", "legendFormat": "load1", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_load5{instance=\"rpi-sofia\"}", "legendFormat": "load5", "refId": "B" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_load15{instance=\"rpi-sofia\"}", "legendFormat": "load15", "refId": "C" }
]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
"unit": "bytes"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
"id": 8,
"options": { "legend": { "calcs": ["last", "min"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
"title": "Memory",
"type": "timeseries",
"targets": [
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_memory_MemAvailable_bytes{instance=\"rpi-sofia\"}", "legendFormat": "available", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_memory_MemTotal_bytes{instance=\"rpi-sofia\"}", "legendFormat": "total", "refId": "B" }
]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
"unit": "bytes"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
"id": 9,
"options": { "legend": { "calcs": ["last", "min"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
"title": "Root filesystem free space",
"type": "timeseries",
"targets": [
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_filesystem_avail_bytes{instance=\"rpi-sofia\", mountpoint=\"/\"}", "legendFormat": "/ available", "refId": "A" }
]
},
{
"datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" },
"unit": "Bps"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
"id": 10,
"options": { "legend": { "calcs": ["last", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } },
"title": "Network throughput per interface",
"type": "timeseries",
"targets": [
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rate(node_network_receive_bytes_total{instance=\"rpi-sofia\", device!=\"lo\"}[5m])", "legendFormat": "rx {{device}}", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rate(node_network_transmit_bytes_total{instance=\"rpi-sofia\", device!=\"lo\"}[5m])", "legendFormat": "tx {{device}}", "refId": "B" }
]
}
],
"refresh": "1m",
"schemaVersion": 39,
"tags": ["rpi-sofia", "hardware", "sofia"],
"templating": { "list": [] },
"time": { "from": "now-24h", "to": "now" },
"timepicker": {},
"timezone": "",
"title": "RPi Sofia",
"uid": "rpi-sofia",
"version": 1,
"weekStart": ""
}

View file

@ -124,6 +124,7 @@ locals {
"idrac.json" = "Hardware"
"ups.json" = "Hardware"
"nvidia.json" = "Hardware"
"rpi-sofia.json" = "Hardware"
# Operations
"backup_health.json" = "Operations"

View file

@ -0,0 +1,22 @@
# Loki write/push endpoint for EXTERNAL hosts (currently rpi-sofia's promtail).
#
# Loki runs SingleBinary with the gateway disabled and auth_enabled=false, so it
# is ClusterIP-only (svc "loki":3100) and unreachable from off-cluster. An
# external log shipper like the Sofia Raspberry Pi cannot POST to
# /loki/api/v1/push without this ingress.
#
# auth = "none": promtail ships logs programmatically (no browser, no Authentik
# SSO cookie dance). The allow_local_access_only middleware (192.168.0.0/16 +
# 10.0.0.0/8) gates the endpoint to LAN/VPN only — the correct model for a
# LAN-only Pi, mirroring the idrac-redfish-exporter ingress in this module.
module "loki-write-ingress" {
  source = "../../../../modules/kubernetes/ingress_factory"

  # Where the ingress lives and what it is called. Combined with root_domain
  # this is presumably rendered as loki.viktorbarzin.lan by ingress_factory —
  # confirm against the module.
  name        = "loki"
  namespace   = kubernetes_namespace.monitoring.metadata[0].name
  root_domain = "viktorbarzin.lan"

  # Backend: the Loki service port (svc "loki":3100).
  port = 3100

  # Access model: no SSO in front of the endpoint (log shippers push
  # programmatically); reachability is restricted by the local-access-only
  # middleware instead.
  auth                    = "none"
  allow_local_access_only = true

  # TLS settings. NOTE(review): ssl_redirect is off, presumably so plain-HTTP
  # pushes are not redirected — confirm this is intentional.
  tls_secret_name = var.tls_secret_name
  ssl_redirect    = false
}

View file

@ -814,6 +814,36 @@ serverFiles:
severity: warning
annotations:
summary: "Fan unhealthy on R730 - check iDRAC"
# Alert group for the external rpi-sofia Raspberry Pi. The first rule selects
# on the scrape job label; the others select on instance="rpi-sofia".
- name: RPi Sofia
rules:
# Scrape target has reported up == 0 continuously for 5 minutes.
- alert: RpiSofiaDown
expr: up{job="rpi-sofia"} == 0
for: 5m
labels:
severity: critical
annotations:
summary: "rpi-sofia node_exporter unreachable for 5m — Pi down or network/SD wedge (cameras + solar feed at risk)"
# Root filesystem ("/") has been reported read-only for 2 minutes.
- alert: RpiSofiaFilesystemReadonly
expr: node_filesystem_readonly{instance="rpi-sofia", mountpoint="/"} == 1
for: 2m
labels:
severity: critical
annotations:
summary: "rpi-sofia rootfs is READ-ONLY — failing SD card (the silent-journal failure mode from this incident). Reflash/replace the card."
# Under-voltage flag has been 1 for 5 minutes.
# NOTE(review): per the summary this is a "since last boot" flag, so the alert
# presumably keeps firing until the Pi is rebooted — confirm with the collector.
- alert: RpiSofiaUndervoltage
expr: rpi_under_voltage_occurred{instance="rpi-sofia"} == 1
for: 5m
labels:
severity: warning
annotations:
summary: "rpi-sofia under-voltage detected since last boot — check PSU/USB power cable"
# SoC temperature has stayed above 75 for 10 minutes; the summary prints the
# current value rounded to a whole number.
- alert: RpiSofiaHighTemp
expr: rpi_soc_temp_celsius{instance="rpi-sofia"} > 75
for: 10m
labels:
severity: warning
annotations:
summary: "rpi-sofia SoC temp: {{ $value | printf \"%.0f\" }}°C (threshold: 75°C)"
- name: Nvidia Tesla T4 GPU
rules:
- alert: HighGPUTemp
@ -3058,6 +3088,21 @@ extraScrapeConfigs: |
- source_labels: [__address__]
target_label: instance
replacement: 'pve-node-r730' # Giving it a friendly name
# rpi-sofia: external Raspberry Pi 3 at the Sofia home site (Frigate camera
# DNAT passthrough + solar inverter path + HA MQTT sensors). node_exporter
# installed via apt; the rpi_* metrics come from a vcgencmd textfile collector
# (undervoltage/throttle/SoC temp). Scraped by hostname (-> 192.168.1.10 wired).
# Static scrape job for the Pi's node_exporter. The job name is what the
# up{job="rpi-sofia"} selectors match on.
- job_name: 'rpi-sofia'
static_configs:
- targets:
# Trailing dot makes the hostname fully qualified.
# NOTE(review): presumably there to bypass resolver search-domain expansion
# inside the cluster — confirm.
- "rpi-sofia.viktorbarzin.lan.:9100"
labels:
# Extra static label attached to every series from this target.
node: 'rpi-sofia'
metrics_path: '/metrics'
relabel_configs:
# Overwrite the instance label with a fixed value; selectors elsewhere use
# instance="rpi-sofia" rather than the host:port address.
- source_labels: [__address__]
target_label: instance
replacement: 'rpi-sofia' # Giving it a friendly name
- job_name: 'istiod'
kubernetes_sd_configs:
- role: endpoints