From f9376a36ff190e4d671cd304e19340af50ddec94 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 5 Jun 2026 13:11:40 +0000 Subject: [PATCH] monitoring: wire rpi-sofia (Sofia Pi) into Prometheus/Loki/alerts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Sofia Raspberry Pi hung this morning (network wedged ~10:13, HA sensors dead, and its local journal had been silent since Apr 27 — a 2017 SD card intermittently flipping the rootfs read-only). Nothing was captured because logging lived only on the failing card. Ship telemetry off-box so the next failure is diagnosable centrally: - Prometheus scrape job `rpi-sofia` (rpi-sofia.viktorbarzin.lan:9100) — node_exporter + a vcgencmd textfile collector on the Pi exporting under-voltage/throttle/SoC-temp as rpi_* metrics. - Alert group "RPi Sofia": node_exporter Down, rootfs ReadOnly (the exact SD-failure signature), Under-voltage since boot, High SoC temp. - LAN-gated Loki write ingress (loki.viktorbarzin.lan) so the Pi's promtail can push its journal — Loki was ClusterIP-only. - Grafana dashboard "RPi Sofia" (Hardware): status, undervoltage/ throttle, temp, load, memory, disk, network. The Pi separately got a systemd hardware watchdog (auto-reboot on a hard hang; today it stayed down ~5h until a manual power-cycle). 
Co-Authored-By: Claude Opus 4.8 --- .../monitoring/dashboards/rpi-sofia.json | 230 ++++++++++++++++++ .../monitoring/modules/monitoring/grafana.tf | 1 + .../modules/monitoring/loki_ingress.tf | 22 ++ .../monitoring/prometheus_chart_values.tpl | 45 ++++ 4 files changed, 298 insertions(+) create mode 100644 stacks/monitoring/modules/monitoring/dashboards/rpi-sofia.json create mode 100644 stacks/monitoring/modules/monitoring/loki_ingress.tf diff --git a/stacks/monitoring/modules/monitoring/dashboards/rpi-sofia.json b/stacks/monitoring/modules/monitoring/dashboards/rpi-sofia.json new file mode 100644 index 00000000..6f920b50 --- /dev/null +++ b/stacks/monitoring/modules/monitoring/dashboards/rpi-sofia.json @@ -0,0 +1,230 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { "type": "grafana", "uid": "-- Grafana --" }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "rpi-sofia (Raspberry Pi 3, Sofia home site) — health + forensic signals. Frigate camera DNAT passthrough + solar inverter path + HA MQTT sensors run on this Pi. 
The rpi_* metrics come from a vcgencmd textfile collector; the rest from node_exporter.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "fieldConfig": { + "defaults": { + "mappings": [ + { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } + ], + "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, + "id": 1, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "title": "Status", + "type": "stat", + "targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "up{job=\"rpi-sofia\"}", "refId": "A" } ] + }, + { + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "fieldConfig": { + "defaults": { + "mappings": [ + { "options": { "0": { "color": "green", "text": "OK" }, "1": { "color": "red", "text": "YES" } }, "type": "value" } + ], + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, + "id": 2, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "title": "Under-voltage (since boot)", + "type": "stat", + "targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_under_voltage_occurred{instance=\"rpi-sofia\"}", "refId": "A" } ] + }, + { + "datasource": { "type": "prometheus", "uid": 
"PBFA97CFB590B2093" }, + "fieldConfig": { + "defaults": { + "mappings": [ + { "options": { "0": { "color": "green", "text": "No" }, "1": { "color": "red", "text": "THROTTLED" } }, "type": "value" } + ], + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, + "id": 3, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "title": "Throttled now", + "type": "stat", + "targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_throttled_now{instance=\"rpi-sofia\"}", "refId": "A" } ] + }, + { + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "fieldConfig": { + "defaults": { + "mappings": [ + { "options": { "0": { "color": "green", "text": "rw" }, "1": { "color": "red", "text": "READ-ONLY" } }, "type": "value" } + ], + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, + "id": 4, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "title": "Rootfs mount state", + "type": "stat", + "targets": [ { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_filesystem_readonly{instance=\"rpi-sofia\", mountpoint=\"/\"}", "refId": "A" } ] + }, + { + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" }, + "unit": 
"celsius", + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 70 }, { "color": "red", "value": 80 } ] } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "id": 5, + "options": { "legend": { "calcs": ["last", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "title": "SoC Temperature", + "type": "timeseries", + "targets": [ + { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_soc_temp_celsius{instance=\"rpi-sofia\"}", "legendFormat": "vcgencmd temp", "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_thermal_zone_temp{instance=\"rpi-sofia\"}", "legendFormat": "thermal zone", "refId": "B" } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 20, "lineWidth": 2, "showPoints": "never", "lineInterpolation": "stepAfter" }, + "max": 1, + "min": 0 + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "id": 6, + "options": { "legend": { "calcs": ["max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "title": "Throttle / Under-voltage events (1 = active)", + "type": "timeseries", + "targets": [ + { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_under_voltage_now{instance=\"rpi-sofia\"}", "legendFormat": "under-voltage now", "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_under_voltage_occurred{instance=\"rpi-sofia\"}", "legendFormat": "under-voltage since boot", "refId": "B" }, + { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": 
"rpi_throttled_now{instance=\"rpi-sofia\"}", "legendFormat": "throttled now", "refId": "C" }, + { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rpi_throttled_occurred{instance=\"rpi-sofia\"}", "legendFormat": "throttled since boot", "refId": "D" } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "id": 7, + "options": { "legend": { "calcs": ["last", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "title": "CPU load average", + "type": "timeseries", + "targets": [ + { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_load1{instance=\"rpi-sofia\"}", "legendFormat": "load1", "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_load5{instance=\"rpi-sofia\"}", "legendFormat": "load5", "refId": "B" }, + { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_load15{instance=\"rpi-sofia\"}", "legendFormat": "load15", "refId": "C" } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, + "id": 8, + "options": { "legend": { "calcs": ["last", "min"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "title": "Memory", + "type": "timeseries", + "targets": [ + { "datasource": { "type": "prometheus", 
"uid": "PBFA97CFB590B2093" }, "expr": "node_memory_MemAvailable_bytes{instance=\"rpi-sofia\"}", "legendFormat": "available", "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_memory_MemTotal_bytes{instance=\"rpi-sofia\"}", "legendFormat": "total", "refId": "B" } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 }, + "id": 9, + "options": { "legend": { "calcs": ["last", "min"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "title": "Root filesystem free space", + "type": "timeseries", + "targets": [ + { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "node_filesystem_avail_bytes{instance=\"rpi-sofia\", mountpoint=\"/\"}", "legendFormat": "/ available", "refId": "A" } + ] + }, + { + "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2, "showPoints": "never" }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 }, + "id": 10, + "options": { "legend": { "calcs": ["last", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, + "title": "Network throughput per interface", + "type": "timeseries", + "targets": [ + { "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rate(node_network_receive_bytes_total{instance=\"rpi-sofia\", device!=\"lo\"}[5m])", "legendFormat": "rx {{device}}", "refId": "A" }, + { "datasource": { "type": 
"prometheus", "uid": "PBFA97CFB590B2093" }, "expr": "rate(node_network_transmit_bytes_total{instance=\"rpi-sofia\", device!=\"lo\"}[5m])", "legendFormat": "tx {{device}}", "refId": "B" } + ] + } + ], + "refresh": "1m", + "schemaVersion": 39, + "tags": ["rpi-sofia", "hardware", "sofia"], + "templating": { "list": [] }, + "time": { "from": "now-24h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "RPi Sofia", + "uid": "rpi-sofia", + "version": 1, + "weekStart": "" +} diff --git a/stacks/monitoring/modules/monitoring/grafana.tf b/stacks/monitoring/modules/monitoring/grafana.tf index 5df70818..1c9737dc 100644 --- a/stacks/monitoring/modules/monitoring/grafana.tf +++ b/stacks/monitoring/modules/monitoring/grafana.tf @@ -124,6 +124,7 @@ locals { "idrac.json" = "Hardware" "ups.json" = "Hardware" "nvidia.json" = "Hardware" + "rpi-sofia.json" = "Hardware" # Operations "backup_health.json" = "Operations" diff --git a/stacks/monitoring/modules/monitoring/loki_ingress.tf b/stacks/monitoring/modules/monitoring/loki_ingress.tf new file mode 100644 index 00000000..6180a209 --- /dev/null +++ b/stacks/monitoring/modules/monitoring/loki_ingress.tf @@ -0,0 +1,22 @@ +# Loki write/push endpoint for EXTERNAL hosts (currently rpi-sofia's promtail). +# +# Loki runs SingleBinary with the gateway disabled and auth_enabled=false, so it +# is ClusterIP-only (svc "loki":3100) and unreachable from off-cluster. An +# external log shipper like the Sofia Raspberry Pi cannot POST to +# /loki/api/v1/push without this ingress. +# +# auth = "none": promtail ships logs programmatically (no browser, no Authentik +# SSO cookie dance). The allow_local_access_only middleware (192.168.0.0/16 + +# 10.0.0.0/8) gates the endpoint to LAN/VPN only — the correct model for a +# LAN-only Pi, mirroring the idrac-redfish-exporter ingress in this module. 
+module "loki-write-ingress" { + source = "../../../../modules/kubernetes/ingress_factory" + auth = "none" + namespace = kubernetes_namespace.monitoring.metadata[0].name + name = "loki" + root_domain = "viktorbarzin.lan" + tls_secret_name = var.tls_secret_name + allow_local_access_only = true + ssl_redirect = false + port = 3100 +} diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl index 009f2798..50cb8f6c 100755 --- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl +++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl @@ -814,6 +814,36 @@ serverFiles: severity: warning annotations: summary: "Fan unhealthy on R730 - check iDRAC" + - name: RPi Sofia + rules: + - alert: RpiSofiaDown + expr: up{job="rpi-sofia"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "rpi-sofia node_exporter unreachable for 5m — Pi down or network/SD wedge (cameras + solar feed at risk)" + - alert: RpiSofiaFilesystemReadonly + expr: node_filesystem_readonly{instance="rpi-sofia", mountpoint="/"} == 1 + for: 2m + labels: + severity: critical + annotations: + summary: "rpi-sofia rootfs is READ-ONLY — failing SD card (the silent-journal failure mode from this incident). Reflash/replace the card." 
+ - alert: RpiSofiaUndervoltage + expr: rpi_under_voltage_occurred{instance="rpi-sofia"} == 1 + for: 5m + labels: + severity: warning + annotations: + summary: "rpi-sofia under-voltage detected since last boot — check PSU/USB power cable" + - alert: RpiSofiaHighTemp + expr: rpi_soc_temp_celsius{instance="rpi-sofia"} > 75 + for: 10m + labels: + severity: warning + annotations: + summary: "rpi-sofia SoC temp: {{ $value | printf \"%.0f\" }}°C (threshold: 75°C)" - name: Nvidia Tesla T4 GPU rules: - alert: HighGPUTemp @@ -3058,6 +3088,21 @@ extraScrapeConfigs: | - source_labels: [__address__] target_label: instance replacement: 'pve-node-r730' # Giving it a friendly name + # rpi-sofia: external Raspberry Pi 3 at the Sofia home site (Frigate camera + # DNAT passthrough + solar inverter path + HA MQTT sensors). node_exporter + # installed via apt; the rpi_* metrics come from a vcgencmd textfile collector + # (undervoltage/throttle/SoC temp). Scraped by hostname (-> 192.168.1.10 wired). + - job_name: 'rpi-sofia' + static_configs: + - targets: + - "rpi-sofia.viktorbarzin.lan.:9100" + labels: + node: 'rpi-sofia' + metrics_path: '/metrics' + relabel_configs: + - source_labels: [__address__] + target_label: instance + replacement: 'rpi-sofia' # Giving it a friendly name - job_name: 'istiod' kubernetes_sd_configs: - role: endpoints