fix: add Headscale monitoring, alerts, and pin UI image
- Add 4 Prometheus alerts: HeadscaleDown (critical), NoOnlineNodes, HighHTTPLatency, HighErrorRate
- Add Grafana dashboard with node count, map responses, HTTP latency, nodestore operations, and memory panels
- Pin headscale-ui to digest sha256:015f5ba0... (was :latest)
- Set disable_check_updates: true to skip GitHub check on startup
- Uptime Kuma monitor already existed (id=19, 300s interval)
This commit is contained in:
parent
972edf4d30
commit
725fefe565
3 changed files with 124 additions and 1 deletions
78
stacks/headscale/modules/headscale/dashboards/headscale.json
Normal file
78
stacks/headscale/modules/headscale/dashboards/headscale.json
Normal file
|
|
@ -0,0 +1,78 @@
|
||||||
|
{
|
||||||
|
"annotations": { "list": [] },
|
||||||
|
"editable": true,
|
||||||
|
"fiscalYearStartMonth": 0,
|
||||||
|
"graphTooltip": 1,
|
||||||
|
"links": [],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"title": "Registered Nodes",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
|
||||||
|
"targets": [{ "expr": "headscale_nodestore_nodes_total", "legendFormat": "Nodes" }],
|
||||||
|
"fieldConfig": { "defaults": { "thresholds": { "steps": [{ "color": "red", "value": 0 }, { "color": "green", "value": 1 }] } } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Map Responses / sec",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": { "h": 8, "w": 9, "x": 6, "y": 0 },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "rate(headscale_mapresponse_sent_total[5m])", "legendFormat": "sent" },
|
||||||
|
{ "expr": "rate(headscale_mapresponse_generated_total[5m])", "legendFormat": "generated" },
|
||||||
|
{ "expr": "rate(headscale_mapresponse_ended_total[5m])", "legendFormat": "ended" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Endpoint Updates / sec",
|
||||||
|
"type": "stat",
|
||||||
|
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 4 },
|
||||||
|
"targets": [{ "expr": "rate(headscale_mapresponse_endpoint_updates_total[5m])", "legendFormat": "updates/s" }],
|
||||||
|
"fieldConfig": { "defaults": { "unit": "ops" } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "HTTP Request Rate by Path",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": { "h": 8, "w": 9, "x": 15, "y": 0 },
|
||||||
|
"targets": [{ "expr": "sum by (path) (rate(headscale_http_requests_total[5m]))", "legendFormat": "{{ path }}" }]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "HTTP p95 Latency by Path",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
|
||||||
|
"targets": [{ "expr": "histogram_quantile(0.95, sum by (path, le) (rate(headscale_http_duration_seconds_bucket[5m])))", "legendFormat": "{{ path }}" }],
|
||||||
|
"fieldConfig": { "defaults": { "unit": "s" } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "NodeStore Operations / sec",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "rate(headscale_nodestore_operations_total[5m])", "legendFormat": "operations" },
|
||||||
|
{ "expr": "headscale_nodestore_queue_depth", "legendFormat": "queue depth" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "NodeStore Batch Duration p95",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
|
||||||
|
"targets": [{ "expr": "histogram_quantile(0.95, rate(headscale_nodestore_batch_duration_seconds_bucket[5m]))", "legendFormat": "p95" }],
|
||||||
|
"fieldConfig": { "defaults": { "unit": "s" } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Memory Usage",
|
||||||
|
"type": "timeseries",
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
|
||||||
|
"targets": [
|
||||||
|
{ "expr": "go_memstats_alloc_bytes{job=\"kubernetes-service-endpoints\", namespace=\"headscale\"}", "legendFormat": "alloc" },
|
||||||
|
{ "expr": "go_memstats_sys_bytes{job=\"kubernetes-service-endpoints\", namespace=\"headscale\"}", "legendFormat": "sys" }
|
||||||
|
],
|
||||||
|
"fieldConfig": { "defaults": { "unit": "bytes" } }
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"schemaVersion": 39,
|
||||||
|
"tags": ["headscale", "vpn"],
|
||||||
|
"templating": { "list": [] },
|
||||||
|
"time": { "from": "now-6h", "to": "now" },
|
||||||
|
"title": "Headscale VPN",
|
||||||
|
"uid": "headscale-vpn"
|
||||||
|
}
|
||||||
|
|
@ -175,7 +175,7 @@ resource "kubernetes_deployment" "headscale" {
|
||||||
# }
|
# }
|
||||||
# }
|
# }
|
||||||
container {
|
container {
|
||||||
image = "ghcr.io/gurucomputing/headscale-ui:latest"
|
image = "ghcr.io/gurucomputing/headscale-ui@sha256:015f5ba04bcbd5ee03178540a1dbbfc97b6896d7411032e3bf33c2f3e08f8b6f"
|
||||||
# image = "ghcr.io/tale/headplane:0.3.2"
|
# image = "ghcr.io/tale/headplane:0.3.2"
|
||||||
name = "headscale-ui"
|
name = "headscale-ui"
|
||||||
|
|
||||||
|
|
@ -424,3 +424,17 @@ resource "kubernetes_cron_job_v1" "headscale_backup" {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Grafana dashboard
|
||||||
|
resource "kubernetes_config_map" "grafana_headscale_dashboard" {
|
||||||
|
metadata {
|
||||||
|
name = "grafana-headscale-dashboard"
|
||||||
|
namespace = "monitoring"
|
||||||
|
labels = {
|
||||||
|
grafana_dashboard = "1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
data = {
|
||||||
|
"headscale.json" = file("${path.module}/dashboards/headscale.json")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -1553,6 +1553,37 @@ serverFiles:
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ $value | printf \"%.0f\" }} MAM torrents not yet seeded 72h (limit: 20 for new members)"
|
summary: "{{ $value | printf \"%.0f\" }} MAM torrents not yet seeded 72h (limit: 20 for new members)"
|
||||||
|
|
||||||
|
- name: Headscale VPN
|
||||||
|
rules:
|
||||||
|
- alert: HeadscaleDown
|
||||||
|
# Fire when the scrape fails (up == 0) or when the target has vanished from
# service discovery entirely, in which case `up` has no series at all.
expr: up{job="kubernetes-service-endpoints", namespace="headscale"} == 0 or absent(up{job="kubernetes-service-endpoints", namespace="headscale"})
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Headscale VPN control plane is down"
|
||||||
|
- alert: HeadscaleNoOnlineNodes
|
||||||
|
expr: headscale_nodestore_nodes_total == 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "No nodes registered in Headscale"
|
||||||
|
- alert: HeadscaleHighHTTPLatency
|
||||||
|
expr: histogram_quantile(0.95, rate(headscale_http_duration_seconds_bucket[5m])) > 1
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Headscale p95 HTTP latency is {{ $value | printf \"%.1f\" }}s"
|
||||||
|
- alert: HeadscaleHighErrorRate
|
||||||
|
expr: sum(rate(headscale_http_requests_total{code=~"5.."}[5m])) / sum(rate(headscale_http_requests_total[5m])) > 0.05
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
# $value is a 0-1 ratio (the expr compares against 0.05); humanizePercentage
# scales it by 100 and appends "%", so 0.05 renders as "5%".
summary: "Headscale 5xx error rate is {{ $value | humanizePercentage }}"
|
||||||
|
|
||||||
extraScrapeConfigs: |
|
extraScrapeConfigs: |
|
||||||
- job_name: 'proxmox-host'
|
- job_name: 'proxmox-host'
|
||||||
static_configs:
|
static_configs:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue