monitoring + proxmox-csi: LVM snapshot RBAC, pushgateway NodePort, backup dashboard

- proxmox-csi: add RBAC for PVE host snapshot restore script
- monitoring: expose Pushgateway via NodePort for PVE LVM snapshot metrics
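- monitoring: add LVM thin snapshot panels to the backup health Grafana dashboard
- monitoring: add qBittorrent seeding & ratio Grafana dashboard
- monitoring: replace the status redirect Ingresses with Traefik IngressRoutes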
This commit is contained in:
Viktor Barzin 2026-04-06 11:57:41 +03:00
parent 72d832fee7
commit fe342a974b
4 changed files with 768 additions and 62 deletions

View file

@ -380,16 +380,184 @@
]
},
{
"title": "Active Backup Alerts",
"title": "LVM Thin Snapshots",
"type": "row",
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 42 },
"collapsed": false,
"panels": []
},
{
"title": "Time Since Last LVM Snapshot",
"type": "stat",
"gridPos": { "h": 6, "w": 8, "x": 0, "y": 43 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 50400 },
{ "color": "red", "value": 90000 }
]
},
"mappings": [
{
"type": "special",
"options": { "match": "null", "result": { "text": "No data", "color": "red" } }
}
]
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"colorMode": "background",
"graphMode": "none",
"textMode": "auto"
},
"targets": [
{
"expr": "time() - lvm_snapshot_last_run_timestamp{job=\"lvm-pvc-snapshot\"}",
"legendFormat": "LVM Snapshots",
"refId": "A"
}
]
},
{
"title": "LVM Snapshot Status",
"type": "stat",
"gridPos": { "h": 6, "w": 8, "x": 8, "y": 43 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"mappings": [
{
"type": "value",
"options": {
"0": { "text": "OK", "color": "green" },
"1": { "text": "PARTIAL", "color": "yellow" },
"2": { "text": "ABORTED", "color": "red" }
}
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "red", "value": 2 }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"colorMode": "background",
"graphMode": "none",
"textMode": "auto"
},
"targets": [
{
"expr": "lvm_snapshot_last_status{job=\"lvm-pvc-snapshot\"}",
"legendFormat": "Status",
"refId": "A"
}
]
},
{
"title": "Thin Pool Free Space",
"type": "gauge",
"gridPos": { "h": 6, "w": 8, "x": 16, "y": 43 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "yellow", "value": 15 },
{ "color": "green", "value": 30 }
]
},
"mappings": [
{
"type": "special",
"options": { "match": "null", "result": { "text": "No data", "color": "red" } }
}
]
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"targets": [
{
"expr": "lvm_snapshot_thinpool_free_pct{job=\"lvm-pvc-snapshot\"}",
"legendFormat": "Free %",
"refId": "A"
}
]
},
{
"title": "Snapshots Created / Pruned / Failed (Last Run)",
"type": "stat",
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 49 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "blue", "value": null }
]
}
},
"overrides": []
},
"options": {
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"colorMode": "background",
"graphMode": "none",
"textMode": "auto"
},
"targets": [
{
"expr": "lvm_snapshot_created_total{job=\"lvm-pvc-snapshot\"}",
"legendFormat": "Created",
"refId": "A"
},
{
"expr": "lvm_snapshot_pruned_total{job=\"lvm-pvc-snapshot\"}",
"legendFormat": "Pruned",
"refId": "B"
},
{
"expr": "lvm_snapshot_failed_total{job=\"lvm-pvc-snapshot\"}",
"legendFormat": "Failed",
"refId": "C"
}
]
},
{
"title": "Active Backup & Snapshot Alerts",
"type": "alertlist",
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 42 },
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 55 },
"datasource": { "type": "datasource", "uid": "grafana" },
"options": {
"showOptions": "current",
"maxItems": 20,
"sortOrder": 1,
"dashboardAlerts": false,
"alertName": "backup",
"alertName": "",
"stateFilter": {
"firing": true,
"pending": true,
@ -397,6 +565,7 @@
"normal": false,
"error": true
},
"alertInstanceLabelFilter": "{__alert_rule_title__=~\".*[Bb]ackup.*|.*[Ss]napshot.*|.*CloudSync.*|.*ThinPool.*\"}",
"folder": { "id": null, "title": "" },
"folderId": null
}
@ -404,7 +573,7 @@
{
"title": "CronJob Last Schedule",
"type": "table",
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 48 },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 61 },
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {},

View file

@ -0,0 +1,464 @@
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"title": "MAM Tracker",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "orange", "value": 0.8 },
{ "color": "green", "value": 1.0 }
]
},
"decimals": 3
}
},
"gridPos": { "h": 6, "w": 5, "x": 0, "y": 1 },
"id": 1,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "qbt_tracker_ratio{tracker=\"mam\"}",
"legendFormat": "MAM Ratio"
}
],
"title": "MAM Ratio",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": { "unit": "decbytes" }
},
"gridPos": { "h": 6, "w": 5, "x": 5, "y": 1 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "qbt_tracker_uploaded_bytes{tracker=\"mam\"}",
"legendFormat": "Uploaded"
},
{
"expr": "qbt_tracker_downloaded_bytes{tracker=\"mam\"}",
"legendFormat": "Downloaded"
}
],
"title": "MAM Transfer",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": { "unit": "short" }
},
"gridPos": { "h": 6, "w": 4, "x": 10, "y": 1 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "qbt_tracker_torrents_total{tracker=\"mam\"}",
"legendFormat": "Total"
},
{
"expr": "qbt_tracker_seeding{tracker=\"mam\"}",
"legendFormat": "Seeding"
},
{
"expr": "qbt_tracker_downloading{tracker=\"mam\"}",
"legendFormat": "Downloading"
}
],
"title": "MAM Torrents",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "orange", "value": 15 },
{ "color": "red", "value": 20 }
]
},
"unit": "short"
}
},
"gridPos": { "h": 6, "w": 4, "x": 14, "y": 1 },
"id": 4,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "qbt_tracker_unsatisfied{tracker=\"mam\"}",
"legendFormat": "Unsatisfied (<72h seed)"
}
],
"title": "MAM Unsatisfied",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"mappings": [
{ "options": { "0": { "color": "red", "text": "Disconnected" }, "1": { "color": "green", "text": "Connected" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1 }
]
}
}
},
"gridPos": { "h": 6, "w": 3, "x": 18, "y": 1 },
"id": 5,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"textMode": "value",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "qbt_connected",
"legendFormat": "Connection"
}
],
"title": "Connection",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": { "unit": "short" }
},
"gridPos": { "h": 6, "w": 3, "x": 21, "y": 1 },
"id": 6,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "qbt_dht_nodes",
"legendFormat": "DHT Nodes"
}
],
"title": "DHT Nodes",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"drawStyle": "line",
"fillOpacity": 10,
"lineWidth": 2,
"showPoints": "never",
"spanNulls": true,
"thresholdsStyle": { "mode": "line" }
},
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1.0 }
]
},
"decimals": 3
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 7 },
"id": 7,
"options": {
"legend": { "calcs": ["lastNotNull", "min"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"expr": "qbt_tracker_ratio{tracker=\"mam\"}",
"legendFormat": "MAM Ratio"
}
],
"title": "MAM Ratio Over Time",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "scheme",
"lineWidth": 2,
"showPoints": "never",
"spanNulls": true
},
"unit": "decbytes"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 7 },
"id": 8,
"options": {
"legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"expr": "qbt_tracker_uploaded_bytes{tracker=\"mam\"}",
"legendFormat": "MAM Uploaded"
},
{
"expr": "qbt_tracker_downloaded_bytes{tracker=\"mam\"}",
"legendFormat": "MAM Downloaded"
}
],
"title": "MAM Cumulative Transfer",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 },
"id": 101,
"title": "All Trackers Breakdown",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "orange", "value": 0.8 },
{ "color": "green", "value": 1.0 }
]
},
"decimals": 3
}
},
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 16 },
"id": 9,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"textMode": "value_and_name",
"reduceOptions": { "calcs": ["lastNotNull"] }
},
"targets": [
{
"expr": "qbt_tracker_ratio",
"legendFormat": "{{tracker}}"
}
],
"title": "Ratio by Tracker",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"drawStyle": "line",
"fillOpacity": 10,
"lineWidth": 2,
"showPoints": "never",
"spanNulls": true,
"thresholdsStyle": { "mode": "line" }
},
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "red", "value": null },
{ "color": "green", "value": 1.0 }
]
},
"decimals": 3
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 },
"id": 10,
"options": {
"legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"expr": "qbt_tracker_ratio",
"legendFormat": "{{tracker}}"
}
],
"title": "Ratio by Tracker Over Time",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"drawStyle": "bars",
"fillOpacity": 80,
"lineWidth": 1,
"stacking": { "mode": "normal" }
},
"unit": "short"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 },
"id": 11,
"options": {
"legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"expr": "qbt_tracker_torrents_total",
"legendFormat": "{{tracker}} total"
},
{
"expr": "qbt_tracker_seeding",
"legendFormat": "{{tracker}} seeding"
}
],
"title": "Torrents by Tracker",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 },
"id": 102,
"title": "Transfer Speeds",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "${datasource}" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisCenteredZero": false,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "scheme",
"lineWidth": 2,
"showPoints": "never",
"spanNulls": true
},
"unit": "Bps"
},
"overrides": [
{
"matcher": { "id": "byName", "options": "Download" },
"properties": [
{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } },
{ "id": "custom.transform", "value": "negative-Y" }
]
},
{
"matcher": { "id": "byName", "options": "Upload" },
"properties": [
{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }
]
}
]
},
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 31 },
"id": 12,
"options": {
"legend": { "calcs": ["mean", "max", "lastNotNull"], "displayMode": "table", "placement": "bottom" },
"tooltip": { "mode": "multi" }
},
"targets": [
{
"expr": "qbt_ul_speed_bytes",
"legendFormat": "Upload"
},
{
"expr": "qbt_dl_speed_bytes",
"legendFormat": "Download"
}
],
"title": "Transfer Speed (Global)",
"type": "timeseries"
}
],
"refresh": "1m",
"schemaVersion": 39,
"tags": ["qbittorrent", "torrents", "mam"],
"templating": {
"list": [
{
"current": { "selected": false, "text": "Prometheus", "value": "prometheus" },
"hide": 0,
"includeAll": false,
"label": "Data Source",
"multi": false,
"name": "datasource",
"options": [],
"query": "prometheus",
"refresh": 1,
"type": "datasource"
}
]
},
"time": { "from": "now-24h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "qBittorrent - Seeding & Ratio",
"uid": "qbittorrent-mam",
"version": 1
}

View file

@ -337,6 +337,27 @@ resource "kubernetes_cron_job_v1" "dns_anomaly_monitor" {
}
}
# NodePort Service in front of the Prometheus Pushgateway, so that the PVE host
# can push LVM snapshot metrics to it.
resource "kubernetes_service" "pushgateway_nodeport" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "pushgateway-nodeport"
  }

  spec {
    type = "NodePort"

    # Label selector for the pods that back this Service.
    selector = {
      "app.kubernetes.io/instance" = "prometheus"
      "app.kubernetes.io/name"     = "prometheus-pushgateway"
    }

    port {
      protocol    = "TCP"
      node_port   = 30091 # fixed node port (not auto-assigned)
      target_port = 9091
      port        = 9091
    }
  }
}
resource "kubernetes_manifest" "status_redirect_middleware" {
manifest = {
apiVersion = "traefik.io/v1alpha1"
@ -355,36 +376,30 @@ resource "kubernetes_manifest" "status_redirect_middleware" {
}
}
resource "kubernetes_ingress_v1" "status" {
metadata {
name = "hetrix-redirect-ingress"
namespace = kubernetes_namespace.monitoring.metadata[0].name
annotations = {
"traefik.ingress.kubernetes.io/router.middlewares" = "monitoring-status-redirect@kubernetescrd"
"traefik.ingress.kubernetes.io/router.entrypoints" = "websecure"
resource "kubernetes_manifest" "status_ingress_route" {
manifest = {
apiVersion = "traefik.io/v1alpha1"
kind = "IngressRoute"
metadata = {
name = "hetrix-redirect-ingress"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
}
spec {
ingress_class_name = "traefik"
tls {
hosts = ["status.viktorbarzin.me"]
secret_name = var.tls_secret_name
}
rule {
host = "status.viktorbarzin.me"
http {
path {
path = "/"
backend {
service {
name = "not-used"
port {
number = 80 # redirected by middleware
}
}
}
}
spec = {
entryPoints = ["websecure"]
routes = [{
match = "Host(`status.viktorbarzin.me`)"
kind = "Rule"
middlewares = [{
name = "status-redirect"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}]
services = [{
kind = "TraefikService"
name = "noop@internal"
}]
}]
tls = {
secretName = var.tls_secret_name
}
}
}
@ -408,36 +423,30 @@ resource "kubernetes_manifest" "yotovski_redirect_middleware" {
}
}
resource "kubernetes_ingress_v1" "status_yotovski" {
metadata {
name = "hetrix-yotovski-redirect-ingress"
namespace = kubernetes_namespace.monitoring.metadata[0].name
annotations = {
"traefik.ingress.kubernetes.io/router.middlewares" = "monitoring-yotovski-redirect@kubernetescrd"
"traefik.ingress.kubernetes.io/router.entrypoints" = "websecure"
resource "kubernetes_manifest" "yotovski_ingress_route" {
manifest = {
apiVersion = "traefik.io/v1alpha1"
kind = "IngressRoute"
metadata = {
name = "hetrix-yotovski-redirect-ingress"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
}
spec {
ingress_class_name = "traefik"
tls {
hosts = ["yotovski-status.viktorbarzin.me"]
secret_name = var.tls_secret_name
}
rule {
host = "yotovski-status.viktorbarzin.me"
http {
path {
path = "/"
backend {
service {
name = "not-used" # redirected by middleware
port {
number = 80
}
}
}
}
spec = {
entryPoints = ["websecure"]
routes = [{
match = "Host(`yotovski-status.viktorbarzin.me`)"
kind = "Rule"
middlewares = [{
name = "yotovski-redirect"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}]
services = [{
kind = "TraefikService"
name = "noop@internal"
}]
}]
tls = {
secretName = var.tls_secret_name
}
}
}

View file

@ -89,3 +89,67 @@ resource "null_resource" "node_labels" {
zone = each.value.proxmox_node
}
}
# --- RBAC for PVE host snapshot restore script ---
# Provides kubectl access from the Proxmox host for the lvm-pvc-snapshot restore subcommand.
# Permissions: read PVs/PVCs/Pods; get/list/update/patch Deployments/StatefulSets/ReplicaSets;
# get/update/patch the Deployment/StatefulSet scale subresources.
# ServiceAccount "pve-snapshot-admin" in kube-system. It is the identity that the
# token Secret and the ClusterRoleBinding of the same name refer to.
resource "kubernetes_service_account" "pve_snapshot_admin" {
  metadata {
    namespace = "kube-system"
    name      = "pve-snapshot-admin"
  }
}
# Service-account-token Secret for the pve-snapshot-admin ServiceAccount.
# The "kubernetes.io/service-account.name" annotation ties the Secret to that
# account; name and namespace are both read from the ServiceAccount resource so
# the Secret always lives alongside the account it belongs to.
resource "kubernetes_secret" "pve_snapshot_admin_token" {
  metadata {
    name      = "pve-snapshot-admin-token"
    namespace = kubernetes_service_account.pve_snapshot_admin.metadata[0].namespace

    annotations = {
      "kubernetes.io/service-account.name" = kubernetes_service_account.pve_snapshot_admin.metadata[0].name
    }
  }

  type = "kubernetes.io/service-account-token"
}
# ClusterRole "pve-snapshot-admin": cluster-wide permissions for the snapshot
# restore tooling.
# NOTE(review): update/patch is granted on whole Deployments/StatefulSets/ReplicaSets
# in addition to the scale subresources — confirm the restore script needs more
# than scaling.
resource "kubernetes_cluster_role" "pve_snapshot_admin" {
  metadata {
    name = "pve-snapshot-admin"
  }

  # Core API group: read-only access to volumes, claims and pods.
  rule {
    verbs      = ["get", "list"]
    resources  = ["persistentvolumes", "persistentvolumeclaims", "pods"]
    api_groups = [""]
  }

  # Workload objects: read plus update/patch.
  rule {
    verbs      = ["get", "list", "update", "patch"]
    resources  = ["deployments", "statefulsets", "replicasets"]
    api_groups = ["apps"]
  }

  # Scale subresources of Deployments and StatefulSets.
  rule {
    verbs      = ["get", "update", "patch"]
    resources  = ["deployments/scale", "statefulsets/scale"]
    api_groups = ["apps"]
  }
}
# Binds the pve-snapshot-admin ClusterRole to the pve-snapshot-admin ServiceAccount.
# Both the subject's name and its namespace are read from the ServiceAccount
# resource, so the binding keeps pointing at the right account if it is moved.
resource "kubernetes_cluster_role_binding" "pve_snapshot_admin" {
  metadata {
    name = "pve-snapshot-admin"
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = kubernetes_cluster_role.pve_snapshot_admin.metadata[0].name
  }

  subject {
    kind      = "ServiceAccount"
    name      = kubernetes_service_account.pve_snapshot_admin.metadata[0].name
    namespace = kubernetes_service_account.pve_snapshot_admin.metadata[0].namespace
  }
}