[ci skip] Infrastructure hardening: security, monitoring, reliability, maintainability

Phase 1 - Critical Security:
- Netbox: move hardcoded DB/superuser passwords to variables
- MeshCentral: disable public registration, add Authentik auth
- Traefik: disable insecure API dashboard (api.insecure=false)
- Traefik: configure forwarded headers with Cloudflare trusted IPs

Phase 2 - Security Hardening:
- Add security headers middleware (HSTS, X-Frame-Options, nosniff, etc.)
- Add Kyverno pod security policies in audit mode (privileged, host namespaces, SYS_ADMIN, trusted registries)
- Tighten rate limiting (avg=10, burst=50)
- Add Authentik protection to grampsweb

Phase 3 - Monitoring & Alerting:
- Add critical service alerts (PostgreSQL, MySQL, Redis, Headscale, Authentik, Loki)
- Increase Loki retention from 7 to 30 days (720h)
- Add predictive PV filling alert (predict_linear)
- Re-enable Hackmd and Privatebin down alerts

Phase 4 - Reliability:
- Add resource requests/limits to Redis, DBaaS, Technitium, Headscale, Vaultwarden, Uptime Kuma
- Increase Alloy DaemonSet memory to 512Mi/1Gi

Phase 6 - Maintainability:
- Extract duplicated tiers locals to terragrunt.hcl generate block (removed from 67 stacks)
- Replace hardcoded NFS IP 10.0.10.15 with var.nfs_server (114 instances across 63 files)
- Replace hardcoded Redis/PostgreSQL/MySQL/Ollama/mail host references with variables across ~35 stacks
- Migrate xray raw ingress resources to ingress_factory modules
2026-02-23 22:05:28 +00:00 · 2026-02-23 22:05:28 +00:00 · 89a6e08245
commit 89a6e08245
parent 1b4737c90c
104 changed files with 773 additions and 920 deletions
--- a/stacks/platform/modules/monitoring/alloy.yaml
+++ b/stacks/platform/modules/monitoring/alloy.yaml
@ -125,7 +125,7 @@ alloy:
  resources:
    requests:
      cpu: 50m
-      memory: 256Mi
+      memory: 512Mi
    limits:
      cpu: 200m
-      memory: 768Mi
+      memory: 1Gi
--- a/stacks/platform/modules/monitoring/grafana.tf
+++ b/stacks/platform/modules/monitoring/grafana.tf
@ -1,4 +1,5 @@

+
 # resource "kubernetes_persistent_volume" "prometheus_grafana_pv" {
 #   metadata {
 #     name = "grafana-pv"
@ -11,7 +12,7 @@
 #     persistent_volume_source {
 #       nfs {
 #         path   = "/mnt/main/grafana"
-#         server = "10.0.10.15"
+#         server = var.nfs_server
 #       }
 #       # iscsi {
 #       #   target_portal = "iscsi.viktorbarzin.lan:3260"
@ -35,7 +36,7 @@ resource "kubernetes_persistent_volume" "alertmanager_pv" {
    persistent_volume_source {
      nfs {
        path   = "/mnt/main/alertmanager"
-        server = "10.0.10.15"
+        server = var.nfs_server
      }
    }
  }
@ -65,5 +66,5 @@ resource "helm_release" "grafana" {
  repository = "https://grafana.github.io/helm-charts"
  chart      = "grafana"

-  values = [templatefile("${path.module}/grafana_chart_values.yaml", { db_password = var.grafana_db_password, grafana_admin_password = var.grafana_admin_password })]
+  values = [templatefile("${path.module}/grafana_chart_values.yaml", { db_password = var.grafana_db_password, grafana_admin_password = var.grafana_admin_password, mysql_host = var.mysql_host })]
 }
--- a/stacks/platform/modules/monitoring/grafana_chart_values.yaml
+++ b/stacks/platform/modules/monitoring/grafana_chart_values.yaml
@ -48,7 +48,7 @@ env:
 grafana.ini:
  database:
    type: mysql
-    host: mysql.dbaas.svc.cluster.local:3306
+    host: ${mysql_host}:3306
    name: grafana
    user: grafana
    password: $__env{GF_DATABASE_PASSWORD}
--- a/stacks/platform/modules/monitoring/loki.tf
+++ b/stacks/platform/modules/monitoring/loki.tf
@ -1,3 +1,5 @@
+variable "nfs_server" { type = string } # NFS server used by this module's PVs. NOTE(review): sibling variables are declared in main.tf - consider moving this there.
+
 resource "helm_release" "loki" {
  namespace        = kubernetes_namespace.monitoring.metadata[0].name
  create_namespace = true
@ -24,7 +26,7 @@ resource "kubernetes_persistent_volume" "loki" {
    persistent_volume_source {
      nfs {
        path   = "/mnt/main/loki/loki"
-        server = "10.0.10.15"
+        server = var.nfs_server
      }
    }
    persistent_volume_reclaim_policy = "Retain"
--- a/stacks/platform/modules/monitoring/loki.yaml
+++ b/stacks/platform/modules/monitoring/loki.yaml
@ -22,7 +22,7 @@ loki:
  limits_config:
    allow_structured_metadata: true
    volume_enabled: true
-    retention_period: 168h
+    retention_period: 720h  # 720h = 30 days
  compactor:
    retention_enabled: true
    working_directory: /var/loki/compactor
--- a/stacks/platform/modules/monitoring/main.tf
+++ b/stacks/platform/modules/monitoring/main.tf
@ -16,6 +16,7 @@ variable "pve_password" { type = string }
 variable "grafana_db_password" { type = string }
 variable "grafana_admin_password" { type = string }
 variable "tier" { type = string }
+variable "mysql_host" { type = string } # MySQL hostname; passed to grafana_chart_values.yaml as the Grafana database host

 resource "kubernetes_namespace" "monitoring" {
  metadata {
--- a/stacks/platform/modules/monitoring/prometheus.tf
+++ b/stacks/platform/modules/monitoring/prometheus.tf
@ -1,4 +1,5 @@

+
 resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" {
  metadata {
    name      = "prometheus-iscsi-pvc"
@ -29,7 +30,7 @@ resource "kubernetes_persistent_volume" "prometheus_server_pvc" {
    persistent_volume_source {
      nfs {
        path   = "/mnt/main/prometheus"
-        server = "10.0.10.15"
+        server = var.nfs_server
      }
      # iscsi {
      #   fs_type       = "ext4"
--- a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
@ -316,6 +316,13 @@ serverFiles:
              severity: warning
            annotations:
              summary: "PV {{ $labels.persistentvolumeclaim }} in {{ $labels.namespace }}: {{ $value | printf \"%.0f\" }}% used (threshold: 85%)"
+          - alert: PVPredictedFull  # extrapolates the last 6h of used bytes 24h ahead (3600*24 s) and compares with capacity
+            expr: predict_linear(kubelet_volume_stats_used_bytes[6h], 3600*24) > kubelet_volume_stats_capacity_bytes
+            for: 1h  # the prediction must stay above capacity for a full hour before firing
+            labels:
+              severity: warning
+            annotations:
+              summary: "PV {{ $labels.persistentvolumeclaim }} in {{ $labels.namespace }} predicted to fill within 24h"
      - name: K8s Health
        rules:
          - alert: PodCrashLooping
@ -389,6 +396,50 @@ serverFiles:
              severity: warning
            annotations:
              summary: "Prometheus notification errors: {{ $value | printf \"%.2f\" }}/s"
+      - name: Critical Services  # one rule per service; each fires when no replica is reported available/ready
+        rules:  # "or on() vector(0)" substitutes 0 when the selector matches no series, so a missing metric also alerts
+          - alert: PostgreSQLDown  # NOTE(review): assumes PostgreSQL is a Deployment named postgresql* in dbaas; otherwise this always fires - confirm
+            expr: (kube_deployment_status_replicas_available{namespace="dbaas", deployment=~"postgresql.*"} or on() vector(0)) < 1
+            for: 5m
+            labels:
+              severity: critical  # NOTE(review): the mail-server rule in this file uses "page" - confirm Alertmanager routes "critical"
+            annotations:
+              summary: "PostgreSQL has no available replicas"
+          - alert: MySQLDown  # NOTE(review): assumes MySQL is a Deployment named mysql* in dbaas - confirm
+            expr: (kube_deployment_status_replicas_available{namespace="dbaas", deployment=~"mysql.*"} or on() vector(0)) < 1
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: "MySQL has no available replicas"
+          - alert: RedisDown  # no deployment filter: evaluated for every Deployment in the redis namespace
+            expr: (kube_deployment_status_replicas_available{namespace="redis"} or on() vector(0)) < 1
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Redis has no available replicas"
+          - alert: HeadscaleDown  # no deployment filter: evaluated for every Deployment in the headscale namespace
+            expr: (kube_deployment_status_replicas_available{namespace="headscale"} or on() vector(0)) < 1
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Headscale VPN has no available replicas"
+          - alert: AuthentikDown  # only the authentik-server Deployment is checked
+            expr: (kube_deployment_status_replicas_available{namespace="authentik", deployment="authentik-server"} or on() vector(0)) < 1
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Authentik auth server has no available replicas"
+          - alert: LokiDown  # uses the StatefulSet ready-replica metric, unlike the Deployment-based rules in this group
+            expr: (kube_statefulset_status_replicas_ready{namespace="monitoring", statefulset=~"loki.*"} or on() vector(0)) < 1
+            for: 5m
+            labels:
+              severity: warning  # the only rule in this group that is not "critical"
+            annotations:
+              summary: "Loki log aggregation has no ready replicas"
      - name: Cluster
        rules:
          - alert: NodeDown
@ -548,20 +599,20 @@ serverFiles:
              severity: page
            annotations:
              summary: Mail server has no available replicas. This means mail may not be received.
-          # - alert: Hackmd has no replicas available
-          #   expr: (kube_deployment_status_replicas_available{namespace="hackmd"} or on() vector(0)) < 1
-          #   for: 1m
-          #   labels:
-          #     severity: page
-          #   annotations:
-          #     summary: Hackmd has no available replicas.
-          # - alert: Privatebin has no replicas available
-          #   expr: (kube_deployment_status_replicas_available{namespace="privatebin"} or on() vector(0)) < 1
-          #   for: 10m
-          #   labels:
-          #     severity: page
-          #   annotations:
-          #     summary: Privatebin has no available replicas.
+          - alert: HackmdDown  # fires when a Deployment in the hackmd namespace reports zero available replicas, or no metric exists
+            expr: (kube_deployment_status_replicas_available{namespace="hackmd"} or on() vector(0)) < 1
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Hackmd has no available replicas"
+          - alert: PrivatebinDown  # fires when a Deployment in the privatebin namespace reports zero available replicas, or no metric exists
+            expr: (kube_deployment_status_replicas_available{namespace="privatebin"} or on() vector(0)) < 1
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Privatebin has no available replicas"
          # - name: London OpenWRT Down
          #   rules:
          #     - alert: OpenWRT client unreachable