[ci skip] Infrastructure hardening: security, monitoring, reliability, maintainability

Phase 1 - Critical Security:
- Netbox: move hardcoded DB/superuser passwords to variables
- MeshCentral: disable public registration, add Authentik auth
- Traefik: disable insecure API dashboard (api.insecure=false)
- Traefik: configure forwarded headers with Cloudflare trusted IPs

Phase 2 - Security Hardening:
- Add security headers middleware (HSTS, X-Frame-Options, nosniff, etc.)
- Add Kyverno pod security policies in audit mode (privileged, host
  namespaces, SYS_ADMIN, trusted registries)
- Tighten rate limiting (avg=10, burst=50)
- Add Authentik protection to grampsweb

Phase 3 - Monitoring & Alerting:
- Add critical service alerts (PostgreSQL, MySQL, Redis, Headscale,
  Authentik, Loki)
- Increase Loki retention from 7 to 30 days (720h)
- Add predictive PV filling alert (predict_linear)
- Re-enable Hackmd and Privatebin down alerts as HackmdDown and
  PrivatebinDown (severity lowered from page to warning)

Phase 4 - Reliability:
- Add resource requests/limits to Redis, DBaaS, Technitium, Headscale,
  Vaultwarden, Uptime Kuma
- Increase Alloy DaemonSet memory request/limit from 256Mi/768Mi to
  512Mi/1Gi

Phase 6 - Maintainability:
- Extract duplicated tiers locals to terragrunt.hcl generate block
  (removed from 67 stacks)
- Replace hardcoded NFS IP 10.0.10.15 with var.nfs_server (114
  instances across 63 files)
- Replace hardcoded Redis/PostgreSQL/MySQL/Ollama/mail host references
  with variables across ~35 stacks
- Migrate xray raw ingress resources to ingress_factory modules
This commit is contained in:
Viktor Barzin 2026-02-23 22:05:28 +00:00
parent 1b4737c90c
commit 89a6e08245
104 changed files with 773 additions and 920 deletions

View file

@ -125,7 +125,7 @@ alloy:
resources:
requests:
cpu: 50m
memory: 256Mi
memory: 512Mi
limits:
cpu: 200m
memory: 768Mi
memory: 1Gi

View file

@ -1,4 +1,5 @@
# resource "kubernetes_persistent_volume" "prometheus_grafana_pv" {
# metadata {
# name = "grafana-pv"
@ -11,7 +12,7 @@
# persistent_volume_source {
# nfs {
# path = "/mnt/main/grafana"
# server = "10.0.10.15"
# server = var.nfs_server
# }
# # iscsi {
# # target_portal = "iscsi.viktorbarzin.lan:3260"
@ -35,7 +36,7 @@ resource "kubernetes_persistent_volume" "alertmanager_pv" {
persistent_volume_source {
nfs {
path = "/mnt/main/alertmanager"
server = "10.0.10.15"
server = var.nfs_server
}
}
}
@ -65,5 +66,5 @@ resource "helm_release" "grafana" {
repository = "https://grafana.github.io/helm-charts"
chart = "grafana"
values = [templatefile("${path.module}/grafana_chart_values.yaml", { db_password = var.grafana_db_password, grafana_admin_password = var.grafana_admin_password })]
values = [templatefile("${path.module}/grafana_chart_values.yaml", { db_password = var.grafana_db_password, grafana_admin_password = var.grafana_admin_password, mysql_host = var.mysql_host })]
}

View file

@ -48,7 +48,7 @@ env:
grafana.ini:
database:
type: mysql
host: mysql.dbaas.svc.cluster.local:3306
host: ${mysql_host}:3306
name: grafana
user: grafana
password: $__env{GF_DATABASE_PASSWORD}

View file

@ -1,3 +1,5 @@
variable "nfs_server" { type = string }
resource "helm_release" "loki" {
namespace = kubernetes_namespace.monitoring.metadata[0].name
create_namespace = true
@ -24,7 +26,7 @@ resource "kubernetes_persistent_volume" "loki" {
persistent_volume_source {
nfs {
path = "/mnt/main/loki/loki"
server = "10.0.10.15"
server = var.nfs_server
}
}
persistent_volume_reclaim_policy = "Retain"

View file

@ -22,7 +22,7 @@ loki:
limits_config:
allow_structured_metadata: true
volume_enabled: true
retention_period: 168h
retention_period: 720h
compactor:
retention_enabled: true
working_directory: /var/loki/compactor

View file

@ -16,6 +16,7 @@ variable "pve_password" { type = string }
variable "grafana_db_password" { type = string }
variable "grafana_admin_password" { type = string }
variable "tier" { type = string }
variable "mysql_host" { type = string }
resource "kubernetes_namespace" "monitoring" {
metadata {

View file

@ -1,4 +1,5 @@
resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" {
metadata {
name = "prometheus-iscsi-pvc"
@ -29,7 +30,7 @@ resource "kubernetes_persistent_volume" "prometheus_server_pvc" {
persistent_volume_source {
nfs {
path = "/mnt/main/prometheus"
server = "10.0.10.15"
server = var.nfs_server
}
# iscsi {
# fs_type = "ext4"

View file

@ -316,6 +316,13 @@ serverFiles:
severity: warning
annotations:
summary: "PV {{ $labels.persistentvolumeclaim }} in {{ $labels.namespace }}: {{ $value | printf \"%.0f\" }}% used (threshold: 85%)"
- alert: PVPredictedFull
expr: predict_linear(kubelet_volume_stats_used_bytes[6h], 3600*24) > kubelet_volume_stats_capacity_bytes
for: 1h
labels:
severity: warning
annotations:
summary: "PV {{ $labels.persistentvolumeclaim }} in {{ $labels.namespace }} predicted to fill within 24h"
- name: K8s Health
rules:
- alert: PodCrashLooping
@ -389,6 +396,50 @@ serverFiles:
severity: warning
annotations:
summary: "Prometheus notification errors: {{ $value | printf \"%.2f\" }}/s"
- name: Critical Services
rules:
- alert: PostgreSQLDown
expr: (kube_deployment_status_replicas_available{namespace="dbaas", deployment=~"postgresql.*"} or on() vector(0)) < 1
for: 5m
labels:
severity: critical
annotations:
summary: "PostgreSQL has no available replicas"
- alert: MySQLDown
expr: (kube_deployment_status_replicas_available{namespace="dbaas", deployment=~"mysql.*"} or on() vector(0)) < 1
for: 5m
labels:
severity: critical
annotations:
summary: "MySQL has no available replicas"
- alert: RedisDown
expr: (kube_deployment_status_replicas_available{namespace="redis"} or on() vector(0)) < 1
for: 5m
labels:
severity: critical
annotations:
summary: "Redis has no available replicas"
- alert: HeadscaleDown
expr: (kube_deployment_status_replicas_available{namespace="headscale"} or on() vector(0)) < 1
for: 5m
labels:
severity: critical
annotations:
summary: "Headscale VPN has no available replicas"
- alert: AuthentikDown
expr: (kube_deployment_status_replicas_available{namespace="authentik", deployment="authentik-server"} or on() vector(0)) < 1
for: 5m
labels:
severity: critical
annotations:
summary: "Authentik auth server has no available replicas"
- alert: LokiDown
expr: (kube_statefulset_status_replicas_ready{namespace="monitoring", statefulset=~"loki.*"} or on() vector(0)) < 1
for: 5m
labels:
severity: warning
annotations:
summary: "Loki log aggregation has no ready replicas"
- name: Cluster
rules:
- alert: NodeDown
@ -548,20 +599,20 @@ serverFiles:
severity: page
annotations:
summary: Mail server has no available replicas. This means mail may not be received.
# - alert: Hackmd has no replicas available
# expr: (kube_deployment_status_replicas_available{namespace="hackmd"} or on() vector(0)) < 1
# for: 1m
# labels:
# severity: page
# annotations:
# summary: Hackmd has no available replicas.
# - alert: Privatebin has no replicas available
# expr: (kube_deployment_status_replicas_available{namespace="privatebin"} or on() vector(0)) < 1
# for: 10m
# labels:
# severity: page
# annotations:
# summary: Privatebin has no available replicas.
- alert: HackmdDown
expr: (kube_deployment_status_replicas_available{namespace="hackmd"} or on() vector(0)) < 1
for: 5m
labels:
severity: warning
annotations:
summary: "Hackmd has no available replicas"
- alert: PrivatebinDown
expr: (kube_deployment_status_replicas_available{namespace="privatebin"} or on() vector(0)) < 1
for: 10m
labels:
severity: warning
annotations:
summary: "Privatebin has no available replicas"
# - name: London OpenWRT Down
# rules:
# - alert: OpenWRT client unreachable