fix: restore tree dropped by 6d224861; land stem95su gdrive-sync (10m) [ci skip]

6d224861 came from a --no-checkout worktree whose empty index made the commit drop every file except two. This commit restores 05b50d2b's full tree and correctly adds stacks/stem95su/gdrive-sync.tf plus the service-catalog stem95su entry. It is forward-only (parent=6d224861, no force-push) and marked [ci skip] because the live infra was never applied from the broken commit.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 08:45:33 +00:00 · 2026-06-09 08:45:33 +00:00 · fd0f4a0365
commit fd0f4a0365
parent 6d224861c4
1166 changed files with 358546 additions and 0 deletions
--- a/stacks/headscale/main.tf
+++ b/stacks/headscale/main.tf
@ -0,0 +1,24 @@
+# Name of the TLS secret; forwarded unchanged to the headscale module.
+variable "tls_secret_name" {
+  type = string
+}
+
+# NFS server address; forwarded unchanged to the headscale module.
+variable "nfs_server" {
+  type = string
+}
+
+# Vault KV v2 secret "platform" on the "secret" mount. Its data map is the
+# source of the Headscale configuration and credentials used in this file.
+data "vault_kv_secret_v2" "secrets" {
+  name  = "platform"
+  mount = "secret"
+}
+
+locals {
+  # The "homepage_credentials" field is stored as a JSON document; decode it
+  # once so individual entries can be looked up by key.
+  homepage_credentials = jsondecode(
+    data.vault_kv_secret_v2.secrets.data["homepage_credentials"]
+  )
+}
+
+# Instantiates the Headscale module with values defined outside this block
+# and with fields read from the Vault "platform" secret.
+module "headscale" {
+  source = "./modules/headscale"
+
+  # Values defined outside this block.
+  tier            = local.tiers.core
+  nfs_server      = var.nfs_server
+  tls_secret_name = var.tls_secret_name
+
+  # Headscale configuration documents read from Vault.
+  headscale_config   = data.vault_kv_secret_v2.secrets.data["headscale_config"]
+  headscale_acl      = data.vault_kv_secret_v2.secrets.data["headscale_acl"]
+  headscale_derp_map = data.vault_kv_secret_v2.secrets.data["headscale_derp_map"]
+
+  # Credentials. The homepage token falls back to "" when the decoded
+  # credentials have no headscale/api_key entry.
+  homepage_token   = try(local.homepage_credentials["headscale"]["api_key"], "")
+  ui_cookie_secret = data.vault_kv_secret_v2.secrets.data["headscale_ui_cookie_secret"]
+  ui_api_key       = data.vault_kv_secret_v2.secrets.data["headscale_ui_api_key"]
+}
--- a/stacks/headscale/modules/headscale/dashboards/headscale.json
+++ b/stacks/headscale/modules/headscale/dashboards/headscale.json
@ -0,0 +1,78 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "links": [],
+  "panels": [
+    {
+      "title": "Online Nodes",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
+      "targets": [{ "expr": "headscale_nodestore_nodes_total", "legendFormat": "Nodes" }],
+      "fieldConfig": { "defaults": { "thresholds": { "steps": [{ "color": "red", "value": 0 }, { "color": "green", "value": 1 }] } } }
+    },
+    {
+      "title": "Map Responses / sec",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 9, "x": 6, "y": 0 },
+      "targets": [
+        { "expr": "rate(headscale_mapresponse_sent_total[5m])", "legendFormat": "sent" },
+        { "expr": "rate(headscale_mapresponse_generated_total[5m])", "legendFormat": "generated" },
+        { "expr": "rate(headscale_mapresponse_ended_total[5m])", "legendFormat": "ended" }
+      ]
+    },
+    {
+      "title": "Endpoint Updates / sec",
+      "type": "stat",
+      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 4 },
+      "targets": [{ "expr": "rate(headscale_mapresponse_endpoint_updates_total[5m])", "legendFormat": "updates/s" }],
+      "fieldConfig": { "defaults": { "unit": "ops" } }
+    },
+    {
+      "title": "HTTP Request Rate by Path",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 9, "x": 15, "y": 0 },
+      "targets": [{ "expr": "sum by (path) (rate(headscale_http_requests_total[5m]))", "legendFormat": "{{ path }}" }]
+    },
+    {
+      "title": "HTTP p95 Latency by Path",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "targets": [{ "expr": "histogram_quantile(0.95, sum by (path, le) (rate(headscale_http_duration_seconds_bucket[5m])))", "legendFormat": "{{ path }}" }],
+      "fieldConfig": { "defaults": { "unit": "s" } }
+    },
+    {
+      "title": "NodeStore Operations / sec",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "targets": [
+        { "expr": "rate(headscale_nodestore_operations_total[5m])", "legendFormat": "operations" },
+        { "expr": "headscale_nodestore_queue_depth", "legendFormat": "queue depth" }
+      ]
+    },
+    {
+      "title": "NodeStore Batch Duration p95",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
+      "targets": [{ "expr": "histogram_quantile(0.95, rate(headscale_nodestore_batch_duration_seconds_bucket[5m]))", "legendFormat": "p95" }],
+      "fieldConfig": { "defaults": { "unit": "s" } }
+    },
+    {
+      "title": "Memory Usage",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
+      "targets": [
+        { "expr": "go_memstats_alloc_bytes{job=\"kubernetes-service-endpoints\", namespace=\"headscale\"}", "legendFormat": "alloc" },
+        { "expr": "go_memstats_sys_bytes{job=\"kubernetes-service-endpoints\", namespace=\"headscale\"}", "legendFormat": "sys" }
+      ],
+      "fieldConfig": { "defaults": { "unit": "bytes" } }
+    }
+  ],
+  "schemaVersion": 39,
+  "tags": ["headscale", "vpn"],
+  "templating": { "list": [] },
+  "time": { "from": "now-6h", "to": "now" },
+  "title": "Headscale VPN",
+  "uid": "headscale-vpn"
+}
--- a/stacks/headscale/modules/headscale/main.tf
+++ b/stacks/headscale/modules/headscale/main.tf
@ -0,0 +1,530 @@
+
+# Name of the TLS secret; passed to the tls_secret module, both ingress
+# modules and the DERP IngressRoute.
+variable "tls_secret_name" {
+  type = string
+}
+
+# Value applied as the "tier" label on the namespace and the deployment.
+variable "tier" {
+  type = string
+}
+
+# Content written to config.yaml in the headscale-config ConfigMap.
+variable "headscale_config" {
+  type = string
+}
+
+# Content written to acl.yaml in the headscale-config ConfigMap.
+variable "headscale_acl" {
+  type = string
+}
+
+# Content written to derp.yaml in the headscale-config ConfigMap.
+variable "headscale_derp_map" {
+  type = string
+}
+
+# NOTE(review): declared here, but nothing in this file reads it — confirm
+# whether it is still needed.
+variable "nfs_server" {
+  type = string
+}
+
+# NOTE(review): nothing in this file reads this value — confirm whether it is
+# still needed.
+variable "homepage_token" {
+  type      = string
+  default   = ""
+  sensitive = true
+}
+
+# Injected into the headscale-ui container as COOKIE_SECRET.
+variable "ui_cookie_secret" {
+  type      = string
+  sensitive = true
+}
+
+# Injected into the headscale-ui container as ROOT_API_KEY.
+variable "ui_api_key" {
+  type      = string
+  sensitive = true
+}
+
+# Namespace for the Headscale workload; the other resources in this file
+# (except the Grafana dashboard ConfigMap) are created inside it.
+resource "kubernetes_namespace" "headscale" {
+  metadata {
+    name = "headscale"
+    labels = {
+      "keel.sh/enrolled" = "true"
+      tier               = var.tier
+    }
+  }
+
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: the goldilocks-vpa-auto-mode ClusterPolicy stamps
+    # this label on every namespace, so drift on it is ignored.
+    ignore_changes = [metadata[0].labels["goldilocks.fairwinds.com/vpa-update-mode"]]
+  }
+}
+
+# Invokes the shared setup_tls_secret module for the headscale namespace,
+# using the secret name supplied by the caller.
+module "tls_secret" {
+  source          = "../../../../modules/kubernetes/setup_tls_secret"
+  tls_secret_name = var.tls_secret_name
+  namespace       = kubernetes_namespace.headscale.metadata[0].name
+}
+
+# NFS-backed volume; the backup CronJob mounts its claim at /backup.
+# NOTE(review): the server address is hard-coded here rather than taken from
+# var.nfs_server — confirm this is intentional.
+module "nfs_data_host" {
+  source     = "../../../../modules/kubernetes/nfs_volume"
+  namespace  = kubernetes_namespace.headscale.metadata[0].name
+  name       = "headscale-data-host"
+  nfs_server = "192.168.1.127"
+  nfs_path   = "/srv/nfs/headscale"
+}
+
+# Claim on the proxmox-lvm-encrypted storage class. The deployment mounts it
+# at /mnt and the backup CronJob mounts it read-only at /data.
+resource "kubernetes_persistent_volume_claim" "data_encrypted" {
+  wait_until_bound = false
+
+  metadata {
+    name      = "headscale-data-encrypted"
+    namespace = kubernetes_namespace.headscale.metadata[0].name
+    annotations = {
+      "resize.topolvm.io/storage_limit" = "5Gi"
+      "resize.topolvm.io/threshold"     = "10%"
+      "resize.topolvm.io/increase"      = "100%"
+    }
+  }
+
+  spec {
+    storage_class_name = "proxmox-lvm-encrypted"
+    access_modes       = ["ReadWriteOnce"]
+    resources {
+      requests = {
+        storage = "1Gi"
+      }
+    }
+  }
+
+  lifecycle {
+    # requests.storage is grown by the autoresizer (up to storage_limit) and a
+    # PVC cannot shrink. If Terraform managed this field, each apply would try
+    # to restore the value above, Kubernetes would reject the shrink, and the
+    # PVC would be left Terminating while still in use.
+    ignore_changes = [spec[0].resources[0].requests]
+  }
+}
+
+# Headscale server plus the headscale-ui container, run as a single-replica
+# deployment with the Recreate strategy. The server reads its configuration
+# from the headscale-config ConfigMap mounted at /etc/headscale and has the
+# encrypted PVC mounted at /mnt.
+resource "kubernetes_deployment" "headscale" {
+  metadata {
+    name      = "headscale"
+    namespace = kubernetes_namespace.headscale.metadata[0].name
+    labels = {
+      app  = "headscale"
+      tier = var.tier
+      # Scared to try, but non-HTTP traffic will probably fail
+      # "istio-injection" : "enabled"
+    }
+
+    annotations = {
+      "reloader.stakater.com/search" = "true"
+    }
+  }
+  spec {
+    replicas = 1
+    strategy {
+      type = "Recreate"
+    }
+    selector {
+      match_labels = {
+        app = "headscale"
+      }
+    }
+    template {
+      metadata {
+        labels = {
+          app = "headscale"
+        }
+        annotations = {
+          # "diun.enable"       = "true"
+          "diun.enable"       = "false"
+          "diun.include_tags" = "^\\d+(?:\\.\\d+)?(?:\\.\\d+)?$"
+        }
+      }
+      spec {
+        container {
+          image = "headscale/headscale:0.28.0"
+          # image   = "headscale/headscale:0.28.0-debug" # -debug is for debug images
+          name    = "headscale"
+          command = ["headscale", "serve"]
+
+          resources {
+            requests = {
+              cpu    = "50m"
+              memory = "128Mi"
+            }
+            limits = {
+              memory = "128Mi"
+            }
+          }
+
+          port {
+            container_port = 8080
+          }
+          port {
+            container_port = 9090
+          }
+          port {
+            container_port = 41641
+          }
+          port {
+            container_port = 3479
+            protocol       = "UDP"
+          }
+
+          liveness_probe {
+            http_get {
+              path = "/health"
+              port = 8080
+            }
+            initial_delay_seconds = 15
+            period_seconds        = 30
+            timeout_seconds       = 5
+            failure_threshold     = 5
+          }
+          readiness_probe {
+            http_get {
+              path = "/health"
+              port = 8080
+            }
+            initial_delay_seconds = 5
+            period_seconds        = 30
+            timeout_seconds       = 5
+            failure_threshold     = 3
+          }
+
+          volume_mount {
+            name       = "config-volume"
+            mount_path = "/etc/headscale"
+          }
+
+          volume_mount {
+            mount_path = "/mnt"
+            name       = "nfs-config"
+          }
+        }
+        volume {
+          name = "config-volume"
+          config_map {
+            # Reference the resource rather than the literal name so that
+            # Terraform creates the ConfigMap before this deployment.
+            name = kubernetes_config_map.headscale-config.metadata[0].name
+            items {
+              key  = "config.yaml"
+              path = "config.yaml"
+            }
+            items {
+              key  = "acl.yaml"
+              path = "acl.yaml"
+            }
+            items {
+              key  = "derp.yaml"
+              path = "derp.yaml"
+            }
+          }
+        }
+
+        # Despite the volume name, this is backed by the encrypted PVC.
+        volume {
+          name = "nfs-config"
+          persistent_volume_claim {
+            claim_name = kubernetes_persistent_volume_claim.data_encrypted.metadata[0].name
+          }
+        }
+        # container {
+        #   image = "simcu/headscale-ui:0.1.4"
+        #   name  = "headscale-ui"
+        #   port {
+        #     container_port = 80
+        #   }
+        # }
+        container {
+          image = "ghcr.io/gurucomputing/headscale-ui@sha256:015f5ba04bcbd5ee03178540a1dbbfc97b6896d7411032e3bf33c2f3e08f8b6f"
+          # image = "ghcr.io/tale/headplane:0.3.2"
+          name = "headscale-ui"
+
+          resources {
+            requests = {
+              cpu    = "25m"
+              memory = "128Mi"
+            }
+            limits = {
+              memory = "128Mi"
+            }
+          }
+
+          port {
+            container_port = 8081
+            # container_port = 3000
+          }
+          env {
+            name  = "HTTP_PORT"
+            value = "8081"
+          }
+          # env {
+          #   name  = "HTTPS_PORT"
+          #   value = "8082"
+          # }
+          env {
+            name  = "HEADSCALE_URL"
+            value = "http://localhost:8080"
+          }
+          env {
+            name  = "COOKIE_SECRET"
+            value = var.ui_cookie_secret
+          }
+          env {
+            name  = "ROOT_API_KEY"
+            value = var.ui_api_key
+          }
+        }
+        dns_config {
+          option {
+            name  = "ndots"
+            value = "2"
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
+    ignore_changes = [spec[0].template[0].spec[0].dns_config]
+  }
+}
+# Service in front of the headscale pods: the Headscale server on 8080, the
+# UI on 80 (forwarded to container port 8081) and metrics on 9090. The
+# prometheus.io annotations point scraping at port 9090.
+resource "kubernetes_service" "headscale" {
+  metadata {
+    name      = "headscale"
+    namespace = kubernetes_namespace.headscale.metadata[0].name
+    labels = {
+      "app" = "headscale"
+    }
+    annotations = {
+      "prometheus.io/scrape" = "true"
+      "prometheus.io/port"   = "9090"
+    }
+    # annotations = {
+    #   "metallb.universe.tf/allow-shared-ip" : "shared"
+    # }
+  }
+
+  spec {
+    # type                    = "LoadBalancer"
+    # external_traffic_policy = "Cluster"
+    selector = {
+      app = "headscale"
+
+    }
+    # Ports are numeric literals, matching how ports are written elsewhere in
+    # this file.
+    port {
+      name     = "headscale"
+      port     = 8080
+      protocol = "TCP"
+    }
+    port {
+      name        = "headscale-ui"
+      port        = 80
+      target_port = 8081
+      # target_port = 3000
+      protocol = "TCP"
+    }
+    port {
+      name     = "metrics"
+      port     = 9090
+      protocol = "TCP"
+    }
+  }
+}
+
+module "ingress" {
+  source = "../../../../modules/kubernetes/ingress_factory"
+  # This ingress fronts the Headscale control plane at
+  # headscale.viktorbarzin.me, where native Tailscale clients register,
+  # exchange keys, and pull DERP maps. Putting forward-auth in front of it
+  # would break every Tailscale client, so authentication is left to
+  # Headscale itself (OIDC + preauth keys). The web admin UI is served by a
+  # separate /web ingress that keeps auth=required.
+  # auth = "app": Headscale control plane — native Tailscale clients register + exchange keys using headscale's own OIDC + preauth-key auth; backend manages authentication.
+  auth            = "app"
+  dns_type        = "non-proxied"
+  name            = "headscale"
+  namespace       = kubernetes_namespace.headscale.metadata[0].name
+  port            = 8080
+  tls_secret_name = var.tls_secret_name
+  extra_annotations = {
+    "gethomepage.dev/enabled"      = "true"
+    "gethomepage.dev/group"        = "Identity & Security"
+    "gethomepage.dev/name"         = "Headscale"
+    "gethomepage.dev/description"  = "VPN mesh network"
+    "gethomepage.dev/icon"         = "headscale.png"
+    "gethomepage.dev/pod-selector" = ""
+  }
+}
+
+# Dedicated IngressRoute for DERP — skips the CrowdSec, anti-AI, and error
+# pages middlewares that interfere with the Upgrade: DERP protocol. Only the
+# retry and rate-limit middlewares listed in the route are attached.
+resource "kubernetes_manifest" "derp_ingress_route" {
+  manifest = {
+    apiVersion = "traefik.io/v1alpha1"
+    kind       = "IngressRoute"
+    metadata = {
+      name      = "headscale-derp"
+      namespace = kubernetes_namespace.headscale.metadata[0].name
+    }
+    spec = {
+      entryPoints = ["websecure"]
+      routes = [{
+        match = "Host(`headscale.viktorbarzin.me`) && PathPrefix(`/derp`)"
+        kind  = "Rule"
+        services = [{
+          name = kubernetes_service.headscale.metadata[0].name
+          port = 8080
+        }]
+        # Minimal middleware — retry + rate-limit. No CrowdSec/anti-AI (DERP is a relay protocol)
+        middlewares = [
+          {
+            name      = "retry"
+            namespace = "traefik"
+          },
+          {
+            name      = "rate-limit"
+            namespace = "traefik"
+          },
+        ]
+      }]
+      tls = {
+        secretName = var.tls_secret_name
+      }
+    }
+  }
+}
+
+# Ingress for the web admin UI: serves /web on the "headscale" host with
+# auth = "required", backed by port 80 of the headscale service.
+module "ingress-ui" {
+  source    = "../../../../modules/kubernetes/ingress_factory"
+  auth      = "required"
+  namespace = kubernetes_namespace.headscale.metadata[0].name
+  name      = "headscale-ui"
+  host      = "headscale"
+  # Reference the service resource (as the DERP IngressRoute does) instead of
+  # repeating its name, so the ingress is ordered after the service.
+  service_name    = kubernetes_service.headscale.metadata[0].name
+  port            = 80
+  ingress_path    = ["/web"]
+  tls_secret_name = var.tls_secret_name
+}
+
+# LoadBalancer service that exposes the headscale pods' UDP ports (41641 and
+# STUN on 3479) on the MetalLB address requested in the annotations.
+resource "kubernetes_service" "headscale-server" {
+  metadata {
+    name      = "headscale-server"
+    namespace = kubernetes_namespace.headscale.metadata[0].name
+    labels = {
+      "app" = "headscale"
+    }
+    annotations = {
+      "metallb.io/loadBalancerIPs" = "10.0.20.200"
+      "metallb.io/allow-shared-ip" = "shared"
+    }
+  }
+
+  spec {
+    type                    = "LoadBalancer"
+    external_traffic_policy = "Cluster"
+    selector = {
+      app = "headscale"
+
+    }
+    # port {
+    #   name     = "headscale-tcp"
+    #   port     = "41641"
+    #   protocol = "TCP"
+    # }
+    # Ports are numeric literals, matching how ports are written elsewhere in
+    # this file.
+    port {
+      name     = "headscale-udp"
+      port     = 41641
+      protocol = "UDP"
+    }
+    port {
+      name     = "stun"
+      port     = 3479
+      protocol = "UDP"
+    }
+  }
+}
+
+# ConfigMap holding the three files the headscale container finds under
+# /etc/headscale. The reloader "match" annotation pairs with the "search"
+# annotation on the deployment.
+resource "kubernetes_config_map" "headscale-config" {
+  metadata {
+    namespace = kubernetes_namespace.headscale.metadata[0].name
+    name      = "headscale-config"
+
+    annotations = {
+      "reloader.stakater.com/match" = "true"
+    }
+  }
+
+  # One key per file; values are passed in verbatim by the caller.
+  data = {
+    "config.yaml" = var.headscale_config
+    "derp.yaml"   = var.headscale_derp_map
+    "acl.yaml"    = var.headscale_acl
+  }
+}
+
+# Backup CronJob — sqlite3 .backup from proxmox-lvm to NFS for cloud sync pickup
+# Uses pod_affinity to co-locate with headscale pod (required for RWO PVC access)
+# Runs every six hours; each run overwrites the same /backup/db.sqlite.bak file.
+resource "kubernetes_cron_job_v1" "headscale_backup" {
+  metadata {
+    name      = "headscale-backup"
+    namespace = kubernetes_namespace.headscale.metadata[0].name
+  }
+  spec {
+    concurrency_policy            = "Replace"
+    schedule                      = "0 */6 * * *"
+    successful_jobs_history_limit = 1
+    failed_jobs_history_limit     = 3
+    job_template {
+      metadata {}
+      spec {
+        backoff_limit              = 3
+        ttl_seconds_after_finished = 10
+        template {
+          metadata {}
+          spec {
+            affinity {
+              pod_affinity {
+                required_during_scheduling_ignored_during_execution {
+                  label_selector {
+                    match_labels = {
+                      app = "headscale"
+                    }
+                  }
+                  topology_key = "kubernetes.io/hostname"
+                }
+              }
+            }
+            container {
+              name  = "backup"
+              image = "docker.io/library/alpine"
+              # The script fails fast (set -e/-u/pipefail) and traces each
+              # command (-x). sqlite is installed at run time because the
+              # base alpine image is used as-is.
+              command = ["/bin/sh", "-c", <<-EOT
+                set -euxo pipefail
+                apk add --no-cache sqlite
+                mkdir -p /backup
+                sqlite3 /data/db.sqlite ".backup /backup/db.sqlite.bak"
+                echo "Backup completed at $(date)"
+              EOT
+              ]
+              # Source database volume, mounted read-only.
+              volume_mount {
+                name       = "data"
+                mount_path = "/data"
+                read_only  = true
+              }
+              # Destination volume for the backup file.
+              volume_mount {
+                name       = "backup"
+                mount_path = "/backup"
+              }
+            }
+            volume {
+              name = "data"
+              persistent_volume_claim {
+                claim_name = kubernetes_persistent_volume_claim.data_encrypted.metadata[0].name
+              }
+            }
+            volume {
+              name = "backup"
+              persistent_volume_claim {
+                claim_name = module.nfs_data_host.claim_name
+              }
+            }
+            restart_policy = "OnFailure"
+          }
+        }
+      }
+    }
+  }
+  lifecycle {
+    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
+    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
+  }
+}
+
+# Grafana dashboard: publishes dashboards/headscale.json as a ConfigMap in the
+# "monitoring" namespace, labelled grafana_dashboard = "1" and annotated with
+# the "Networking" folder.
+resource "kubernetes_config_map" "grafana_headscale_dashboard" {
+  metadata {
+    namespace = "monitoring"
+    name      = "grafana-headscale-dashboard"
+    annotations = {
+      grafana_folder = "Networking"
+    }
+    labels = {
+      grafana_dashboard = "1"
+    }
+  }
+
+  # The dashboard JSON is read from this module's directory at plan time.
+  data = {
+    "headscale.json" = file("${path.module}/dashboards/headscale.json")
+  }
+}
--- a/stacks/headscale/secrets
+++ b/stacks/headscale/secrets
@ -0,0 +1 @@
+../../secrets
--- a/stacks/headscale/terragrunt.hcl
+++ b/stacks/headscale/terragrunt.hcl
@ -0,0 +1,8 @@
+# Pull in the configuration located by find_in_parent_folders().
+include "root" {
+  path = find_in_parent_folders()
+}
+
+# Declares a dependency on the sibling "infra" stack. skip_outputs = true
+# means none of its outputs are read here.
+dependency "infra" {
+  skip_outputs = true
+  config_path  = "../infra"
+}