From 725fefe5656b42481e997ec29dcdc652a0ab6023 Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Sat, 28 Mar 2026 16:07:04 +0200
Subject: [PATCH] fix: add Headscale monitoring, alerts, and pin UI image

- Add 4 Prometheus alerts: HeadscaleDown (critical), NoOnlineNodes,
  HighHTTPLatency, HighErrorRate
  - HeadscaleDown also fires when the scrape target is absent from
    service discovery, not only when up == 0
  - HighErrorRate is computed as a percentage (threshold 5) so the
    reported value matches the "%" suffix in its summary
- Add Grafana dashboard with node count, map responses, HTTP latency,
  nodestore operations, and memory panels
- Pin headscale-ui to digest sha256:015f5ba0... (was :latest)
- Set disable_check_updates: true to skip GitHub check on startup
- Uptime Kuma monitor already existed (id=19, 300s interval)
---
 .../headscale/dashboards/headscale.json       | 78 +++++++++++++++++++
 stacks/headscale/modules/headscale/main.tf    | 16 +++-
 .../monitoring/prometheus_chart_values.tpl    | 31 ++++++++
 3 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 stacks/headscale/modules/headscale/dashboards/headscale.json

diff --git a/stacks/headscale/modules/headscale/dashboards/headscale.json b/stacks/headscale/modules/headscale/dashboards/headscale.json
new file mode 100644
index 00000000..3f17cba4
--- /dev/null
+++ b/stacks/headscale/modules/headscale/dashboards/headscale.json
@@ -0,0 +1,78 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "links": [],
+  "panels": [
+    {
+      "title": "Online Nodes",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
+      "targets": [{ "expr": "headscale_nodestore_nodes_total", "legendFormat": "Nodes" }],
+      "fieldConfig": { "defaults": { "thresholds": { "steps": [{ "color": "red", "value": 0 }, { "color": "green", "value": 1 }] } } }
+    },
+    {
+      "title": "Map Responses / sec",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 9, "x": 6, "y": 0 },
+      "targets": [
+        { "expr": "rate(headscale_mapresponse_sent_total[5m])", "legendFormat": "sent" },
+        { "expr": "rate(headscale_mapresponse_generated_total[5m])", "legendFormat": "generated" },
+        { "expr": "rate(headscale_mapresponse_ended_total[5m])", "legendFormat": "ended" }
+      ]
+    },
+    {
+      "title": "Endpoint Updates / sec",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 4 },
+      "targets": [{ "expr": "rate(headscale_mapresponse_endpoint_updates_total[5m])", "legendFormat": "updates/s" }],
+      "fieldConfig": { "defaults": { "unit": "ops" } }
+    },
+    {
+      "title": "HTTP Request Rate by Path",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 9, "x": 15, "y": 0 },
+      "targets": [{ "expr": "sum by (path) (rate(headscale_http_requests_total[5m]))", "legendFormat": "{{ path }}" }]
+    },
+    {
+      "title": "HTTP p95 Latency by Path",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "targets": [{ "expr": "histogram_quantile(0.95, sum by (path, le) (rate(headscale_http_duration_seconds_bucket[5m])))", "legendFormat": "{{ path }}" }],
+      "fieldConfig": { "defaults": { "unit": "s" } }
+    },
+    {
+      "title": "NodeStore Operations / sec",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "targets": [
+        { "expr": "rate(headscale_nodestore_operations_total[5m])", "legendFormat": "operations" },
+        { "expr": "headscale_nodestore_queue_depth", "legendFormat": "queue depth" }
+      ]
+    },
+    {
+      "title": "NodeStore Batch Duration p95",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
+      "targets": [{ "expr": "histogram_quantile(0.95, rate(headscale_nodestore_batch_duration_seconds_bucket[5m]))", "legendFormat": "p95" }],
+      "fieldConfig": { "defaults": { "unit": "s" } }
+    },
+    {
+      "title": "Memory Usage",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
+      "targets": [
+        { "expr": "go_memstats_alloc_bytes{job=\"kubernetes-service-endpoints\", namespace=\"headscale\"}", "legendFormat": "alloc" },
+        { "expr": "go_memstats_sys_bytes{job=\"kubernetes-service-endpoints\", namespace=\"headscale\"}", "legendFormat": "sys" }
+      ],
+      "fieldConfig": { "defaults": { "unit": "bytes" } }
+    }
+  ],
+  "schemaVersion": 39,
+  "tags": ["headscale", "vpn"],
+  "templating": { "list": [] },
+  "time": { "from": "now-6h", "to": "now" },
+  "title": "Headscale VPN",
+  "uid": "headscale-vpn"
+}
diff --git a/stacks/headscale/modules/headscale/main.tf b/stacks/headscale/modules/headscale/main.tf
index 33cb83dc..6eed919e 100644
--- a/stacks/headscale/modules/headscale/main.tf
+++ b/stacks/headscale/modules/headscale/main.tf
@@ -175,7 +175,7 @@ resource "kubernetes_deployment" "headscale" {
         #   }
         # }
         container {
-          image = "ghcr.io/gurucomputing/headscale-ui:latest"
+          image = "ghcr.io/gurucomputing/headscale-ui@sha256:015f5ba04bcbd5ee03178540a1dbbfc97b6896d7411032e3bf33c2f3e08f8b6f"
           # image = "ghcr.io/tale/headplane:0.3.2"
           name = "headscale-ui"
 
@@ -424,3 +424,17 @@ resource "kubernetes_cron_job_v1" "headscale_backup" {
     }
   }
 }
+
+# Grafana dashboard
+resource "kubernetes_config_map" "grafana_headscale_dashboard" {
+  metadata {
+    name      = "grafana-headscale-dashboard"
+    namespace = "monitoring"
+    labels = {
+      grafana_dashboard = "1"
+    }
+  }
+  data = {
+    "headscale.json" = file("${path.module}/dashboards/headscale.json")
+  }
+}
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index 4548e87d..e1ca1dff 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -1553,6 +1553,37 @@ serverFiles:
             annotations:
               summary: "{{ $value | printf \"%.0f\" }} MAM torrents not yet seeded 72h (limit: 20 for new members)"
 
+      - name: Headscale VPN
+        rules:
+          - alert: HeadscaleDown
+            expr: up{job="kubernetes-service-endpoints", namespace="headscale"} == 0 or absent(up{job="kubernetes-service-endpoints", namespace="headscale"})
+            for: 2m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Headscale VPN control plane is down"
+          - alert: HeadscaleNoOnlineNodes
+            expr: headscale_nodestore_nodes_total == 0
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "No nodes registered in Headscale"
+          - alert: HeadscaleHighHTTPLatency
+            expr: histogram_quantile(0.95, rate(headscale_http_duration_seconds_bucket[5m])) > 1
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Headscale p95 HTTP latency is {{ $value | printf \"%.1f\" }}s"
+          - alert: HeadscaleHighErrorRate
+            expr: 100 * sum(rate(headscale_http_requests_total{code=~"5.."}[5m])) / sum(rate(headscale_http_requests_total[5m])) > 5
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Headscale 5xx error rate is {{ $value | printf \"%.1f\" }}%"
+
 extraScrapeConfigs: |
   - job_name: 'proxmox-host'
     static_configs: