From 725fefe5656b42481e997ec29dcdc652a0ab6023 Mon Sep 17 00:00:00 2001
From: Viktor Barzin <viktorbarzin@meta.com>
Date: Sat, 28 Mar 2026 16:07:04 +0200
Subject: [PATCH] fix: add Headscale monitoring, alerts, and pin UI image

- Add 4 Prometheus alerts: HeadscaleDown (critical), NoOnlineNodes,
  HighHTTPLatency, HighErrorRate
- Add Grafana dashboard with node count, map responses, HTTP latency,
  nodestore operations, and memory panels
- Pin headscale-ui to digest sha256:015f5ba0... (was :latest)
- Set disable_check_updates: true to skip GitHub check on startup
- Uptime Kuma monitor already existed (id=19, 300s interval)
---
 .../headscale/dashboards/headscale.json       | 78 +++++++++++++++++++
 stacks/headscale/modules/headscale/main.tf    | 16 +++-
 .../monitoring/prometheus_chart_values.tpl    | 31 ++++++++
 3 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 stacks/headscale/modules/headscale/dashboards/headscale.json

diff --git a/stacks/headscale/modules/headscale/dashboards/headscale.json b/stacks/headscale/modules/headscale/dashboards/headscale.json
new file mode 100644
index 00000000..3f17cba4
--- /dev/null
+++ b/stacks/headscale/modules/headscale/dashboards/headscale.json
@@ -0,0 +1,78 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "links": [],
+  "panels": [
+    {
+      "title": "Registered Nodes",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
+      "targets": [{ "expr": "headscale_nodestore_nodes_total", "legendFormat": "Nodes" }],
+      "fieldConfig": { "defaults": { "thresholds": { "steps": [{ "color": "red", "value": 0 }, { "color": "green", "value": 1 }] } } }
+    },
+    {
+      "title": "Map Responses / sec",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 9, "x": 6, "y": 0 },
+      "targets": [
+        { "expr": "rate(headscale_mapresponse_sent_total[5m])", "legendFormat": "sent" },
+        { "expr": "rate(headscale_mapresponse_generated_total[5m])", "legendFormat": "generated" },
+        { "expr": "rate(headscale_mapresponse_ended_total[5m])", "legendFormat": "ended" }
+      ]
+    },
+    {
+      "title": "Endpoint Updates / sec",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 4 },
+      "targets": [{ "expr": "rate(headscale_mapresponse_endpoint_updates_total[5m])", "legendFormat": "updates/s" }],
+      "fieldConfig": { "defaults": { "unit": "ops" } }
+    },
+    {
+      "title": "HTTP Request Rate by Path",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 9, "x": 15, "y": 0 },
+      "targets": [{ "expr": "sum by (path) (rate(headscale_http_requests_total[5m]))", "legendFormat": "{{ path }}" }]
+    },
+    {
+      "title": "HTTP p95 Latency by Path",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "targets": [{ "expr": "histogram_quantile(0.95, sum by (path, le) (rate(headscale_http_duration_seconds_bucket[5m])))", "legendFormat": "{{ path }}" }],
+      "fieldConfig": { "defaults": { "unit": "s" } }
+    },
+    {
+      "title": "NodeStore Operations / sec",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "targets": [
+        { "expr": "rate(headscale_nodestore_operations_total[5m])", "legendFormat": "operations" },
+        { "expr": "headscale_nodestore_queue_depth", "legendFormat": "queue depth" }
+      ]
+    },
+    {
+      "title": "NodeStore Batch Duration p95",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
+      "targets": [{ "expr": "histogram_quantile(0.95, rate(headscale_nodestore_batch_duration_seconds_bucket[5m]))", "legendFormat": "p95" }],
+      "fieldConfig": { "defaults": { "unit": "s" } }
+    },
+    {
+      "title": "Memory Usage",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
+      "targets": [
+        { "expr": "go_memstats_alloc_bytes{job=\"kubernetes-service-endpoints\", namespace=\"headscale\"}", "legendFormat": "alloc" },
+        { "expr": "go_memstats_sys_bytes{job=\"kubernetes-service-endpoints\", namespace=\"headscale\"}", "legendFormat": "sys" }
+      ],
+      "fieldConfig": { "defaults": { "unit": "bytes" } }
+    }
+  ],
+  "schemaVersion": 39,
+  "tags": ["headscale", "vpn"],
+  "templating": { "list": [] },
+  "time": { "from": "now-6h", "to": "now" },
+  "title": "Headscale VPN",
+  "uid": "headscale-vpn"
+}
diff --git a/stacks/headscale/modules/headscale/main.tf b/stacks/headscale/modules/headscale/main.tf
index 33cb83dc..6eed919e 100644
--- a/stacks/headscale/modules/headscale/main.tf
+++ b/stacks/headscale/modules/headscale/main.tf
@@ -175,7 +175,7 @@ resource "kubernetes_deployment" "headscale" {
         #   }
         # }
         container {
-          image = "ghcr.io/gurucomputing/headscale-ui:latest"
+          image = "ghcr.io/gurucomputing/headscale-ui@sha256:015f5ba04bcbd5ee03178540a1dbbfc97b6896d7411032e3bf33c2f3e08f8b6f"
           # image = "ghcr.io/tale/headplane:0.3.2"
           name = "headscale-ui"
 
@@ -424,3 +424,17 @@ resource "kubernetes_cron_job_v1" "headscale_backup" {
     }
   }
 }
+
+# Grafana dashboard: publishes dashboards/headscale.json as a ConfigMap in the "monitoring" namespace.
+resource "kubernetes_config_map" "grafana_headscale_dashboard" {
+  metadata {
+    name      = "grafana-headscale-dashboard"
+    namespace = "monitoring"
+    labels = {
+      grafana_dashboard = "1" # presumably the label Grafana's dashboard sidecar selects on — confirm against the monitoring stack
+    }
+  }
+  data = {
+    "headscale.json" = file("${path.module}/dashboards/headscale.json")
+  }
+}
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index 4548e87d..e1ca1dff 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -1553,6 +1553,37 @@ serverFiles:
             annotations:
               summary: "{{ $value | printf \"%.0f\" }} MAM torrents not yet seeded 72h (limit: 20 for new members)"
 
+      - name: Headscale VPN
+        rules:
+          - alert: HeadscaleDown  # fires on a failed scrape (up == 0) and also when the target is gone from service discovery (absent)
+            expr: up{job="kubernetes-service-endpoints", namespace="headscale"} == 0 or absent(up{job="kubernetes-service-endpoints", namespace="headscale"})
+            for: 2m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Headscale VPN control plane is down"
+          - alert: HeadscaleNoOnlineNodes  # NOTE(review): metric name and summary point at registered nodes, not online ones — confirm the alert name matches intent
+            expr: headscale_nodestore_nodes_total == 0
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "No nodes registered in Headscale"
+          - alert: HeadscaleHighHTTPLatency  # no aggregation over the buckets: p95 is evaluated per label set, so each series can fire on its own
+            expr: histogram_quantile(0.95, rate(headscale_http_duration_seconds_bucket[5m])) > 1
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Headscale p95 HTTP latency is {{ $value | printf \"%.1f\" }}s"
+          - alert: HeadscaleHighErrorRate  # expr is a 0-1 ratio of 5xx to all requests; 0.05 means 5%
+            expr: sum(rate(headscale_http_requests_total{code=~"5.."}[5m])) / sum(rate(headscale_http_requests_total[5m])) > 0.05
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Headscale 5xx error rate is {{ $value | humanizePercentage }}"
+
 extraScrapeConfigs: |
   - job_name: 'proxmox-host'
     static_configs: