[ci skip] Infrastructure hardening: security, monitoring, reliability, maintainability
Phase 1 - Critical Security:
- Netbox: move hardcoded DB/superuser passwords to variables
- MeshCentral: disable public registration, add Authentik auth
- Traefik: disable insecure API dashboard (api.insecure=false)
- Traefik: configure forwarded headers with Cloudflare trusted IPs
Phase 2 - Security Hardening:
- Add security headers middleware (HSTS, X-Frame-Options, nosniff, etc.)
- Add Kyverno pod security policies in audit mode (privileged, host
namespaces, SYS_ADMIN, trusted registries)
- Tighten rate limiting (avg=10, burst=50)
- Add Authentik protection to grampsweb
Phase 3 - Monitoring & Alerting:
- Add critical service alerts (PostgreSQL, MySQL, Redis, Headscale,
Authentik, Loki)
- Increase Loki retention from 7 to 30 days (720h)
- Add predictive PV filling alert (predict_linear)
- Re-enable Hackmd and Privatebin down alerts
Phase 4 - Reliability:
- Add resource requests/limits to Redis, DBaaS, Technitium, Headscale,
Vaultwarden, Uptime Kuma
- Increase Alloy DaemonSet memory to 512Mi/1Gi
Phase 6 - Maintainability:
- Extract duplicated tiers locals to terragrunt.hcl generate block
(removed from 67 stacks)
- Replace hardcoded NFS IP 10.0.10.15 with var.nfs_server (114
instances across 63 files)
- Replace hardcoded Redis/PostgreSQL/MySQL/Ollama/mail host references
with variables across ~35 stacks
- Migrate xray raw ingress resources to ingress_factory modules
2026-02-23 22:05:28 +00:00
|
|
|
# Address of the NFS server, passed in by the caller so that stacks do not
# hard-code the NFS IP.
variable "nfs_server" {
  description = "Address (IP or hostname) of the NFS server."
  type        = string
}
|
|
|
|
|
|
2026-02-13 23:03:40 +00:00
|
|
|
# Helm release for Loki, installed from the Grafana chart repository into the
# namespace managed by kubernetes_namespace.monitoring.
resource "helm_release" "loki" {
  # Ordered after the alert-rules ConfigMap defined in this module.
  depends_on = [kubernetes_config_map.loki_alert_rules]

  # Chart coordinates.
  # NOTE(review): no chart `version` is set here — confirm that is intentional.
  name       = "loki"
  chart      = "loki"
  repository = "https://grafana.github.io/helm-charts"

  # Target namespace.
  create_namespace = true
  namespace        = kubernetes_namespace.monitoring.metadata[0].name

  # Values come from loki.yaml, rendered through templatefile() with an empty
  # variable map.
  values = [
    templatefile("${path.module}/loki.yaml", {}),
  ]

  # Time allowed for the Helm operation, in seconds.
  timeout = 600
}
|
2025-12-28 20:07:00 +00:00
|
|
|
|
|
|
|
|
# Helm release for Grafana Alloy, installed into the monitoring namespace after
# the Loki release.
# Reference: https://grafana.com/docs/alloy/latest/configure/kubernetes/
resource "helm_release" "alloy" {
  # Ordered after the Loki release defined in this module.
  depends_on = [helm_release.loki]

  # Chart coordinates.
  name       = "alloy"
  chart      = "alloy"
  repository = "https://grafana.github.io/helm-charts"

  # Target namespace.
  create_namespace = true
  namespace        = kubernetes_namespace.monitoring.metadata[0].name

  # alloy.yaml is read verbatim with file(); no template rendering is applied.
  values = [
    file("${path.module}/alloy.yaml"),
  ]

  # Atomic install/upgrade: a failed operation is not left half-applied.
  atomic = true
}
|
2025-12-28 20:07:00 +00:00
|
|
|
|
2026-02-13 23:03:40 +00:00
|
|
|
# DaemonSet in the monitoring namespace whose privileged init container raises
# the fs.inotify sysctl limits with `sysctl -w`. Once the init container has
# run, the only remaining container is a minimal pause container.
resource "kubernetes_daemon_set_v1" "sysctl-inotify" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "sysctl-inotify"
    labels    = { app = "sysctl-inotify" }
  }

  spec {
    # Selects the pods created from the template below (same `app` label).
    selector {
      match_labels = { app = "sysctl-inotify" }
    }

    template {
      metadata {
        labels = { app = "sysctl-inotify" }
      }

      spec {
        # NOTE(review): the pod shares the host PID namespace; confirm this is
        # still required for the sysctl writes.
        host_pid = true

        # No key or effect is specified, only the `Exists` operator.
        toleration {
          operator = "Exists"
        }

        dns_config {
          option {
            name  = "ndots"
            value = "2"
          }
        }

        # One-shot step: sets max_user_watches, max_user_instances and
        # max_queued_events. Runs privileged.
        init_container {
          name  = "sysctl"
          image = "busybox:1.37"

          security_context {
            privileged = true
          }

          command = [
            "sh",
            "-c",
            "sysctl -w fs.inotify.max_user_watches=1048576 && sysctl -w fs.inotify.max_user_instances=8192 && sysctl -w fs.inotify.max_queued_events=1048576",
          ]
        }

        # Main container: pause image with identical requests and limits.
        container {
          name  = "pause"
          image = "registry.k8s.io/pause:3.10"

          resources {
            limits = {
              memory = "4Mi"
              cpu    = "1m"
            }
            requests = {
              memory = "4Mi"
              cpu    = "1m"
            }
          }
        }
      }
    }
  }
}
|
2025-12-28 20:07:00 +00:00
|
|
|
|
|
|
|
|
# resource "helm_release" "k8s-monitoring" {
|
2025-12-29 10:23:42 +00:00
|
|
|
# namespace = kubernetes_namespace.monitoring.metadata[0].name
|
2025-12-28 20:07:00 +00:00
|
|
|
# create_namespace = true
|
|
|
|
|
# name = "k8s-monitoring"
|
|
|
|
|
|
|
|
|
|
# repository = "https://grafana.github.io/helm-charts"
|
|
|
|
|
# chart = "k8s-monitoring"
|
|
|
|
|
|
|
|
|
|
# values = [templatefile("${path.module}/k8s-monitoring-values.yaml", {})]
|
|
|
|
|
# atomic = true
|
|
|
|
|
# }
|
2026-02-13 23:03:40 +00:00
|
|
|
|
|
|
|
|
# ConfigMap carrying Loki alerting rules as a single `rules.yaml` document.
# All rules live in one group ("Node Health") and count matching lines from the
# `node-journal` job over a 5-minute window, summed per `node` label.
#
# Layout note: inside each rule object the `for` key is deliberately not the
# first key, because an HCL object constructor that begins with `for` is parsed
# as a for-expression.
resource "kubernetes_config_map" "loki_alert_rules" {
  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "loki-alert-rules"
  }

  data = {
    "rules.yaml" = yamlencode({
      groups = [
        {
          name = "Node Health"
          rules = [
            # Kernel OOM killer terminated a process.
            {
              alert       = "KernelOOMKiller"
              for         = "0m"
              labels      = { severity = "critical" }
              annotations = { summary = "OOM killer active on {{ $labels.node }}" }
              expr        = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Out of memory.*Killed process\" [5m])) > 0"
            },

            # Kernel panic message seen in the journal.
            {
              alert       = "KernelPanic"
              for         = "0m"
              labels      = { severity = "critical" }
              annotations = { summary = "Kernel panic on {{ $labels.node }}" }
              expr        = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Kernel panic\" [5m])) > 0"
            },

            # Hung-task message; unlike the other patterns this match is
            # case-sensitive (no `(?i)` flag).
            {
              alert       = "KernelHungTask"
              for         = "0m"
              labels      = { severity = "warning" }
              annotations = { summary = "Hung task detected on {{ $labels.node }}" }
              expr        = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"blocked for more than\" [5m])) > 0"
            },

            # CPU soft lockup message seen in the journal.
            {
              alert       = "KernelSoftLockup"
              for         = "0m"
              labels      = { severity = "critical" }
              annotations = { summary = "Soft lockup on {{ $labels.node }}" }
              expr        = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)soft lockup\" [5m])) > 0"
            },

            # containerd.service journal lines containing dead/failed/deactivating;
            # the only rule with a non-zero `for` duration.
            # NOTE(review): the pattern matches any line containing "failed" —
            # confirm it is not noisy in practice.
            {
              alert       = "ContainerdDown"
              for         = "1m"
              labels      = { severity = "critical" }
              annotations = { summary = "containerd service unhealthy on {{ $labels.node }}" }
              expr        = "sum by (node) (count_over_time({job=\"node-journal\", unit=\"containerd.service\"} |~ \"(?i)(dead|failed|deactivating)\" [5m])) > 0"
            },
          ]
        },
      ]
    })
  }
}
|
|
|
|
|
|
|
|
|
|
# ConfigMap containing a Grafana datasource provisioning document for Loki.
# NOTE(review): the `grafana_datasource = "1"` label is presumably what the
# Grafana datasource sidecar selects on — confirm against the Grafana chart values.
resource "kubernetes_config_map" "grafana_loki_datasource" {
  metadata {
    name      = "grafana-loki-datasource"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    labels = {
      grafana_datasource = "1"
    }
  }

  data = {
    "loki-datasource.yaml" = yamlencode({
      apiVersion = 1
      datasources = [{
        name   = "Loki"
        type   = "loki"
        access = "proxy"
        # The namespace part of the service DNS name comes from the managed
        # namespace resource rather than a literal, so the URL stays in step
        # with the namespace used by the rest of this module.
        url       = "http://loki.${kubernetes_namespace.monitoring.metadata[0].name}.svc.cluster.local:3100"
        isDefault = false
      }]
    })
  }
}
|