[ci skip] Infrastructure hardening: security, monitoring, reliability, maintainability
Phase 1 - Critical Security:
- Netbox: move hardcoded DB/superuser passwords to variables
- MeshCentral: disable public registration, add Authentik auth
- Traefik: disable insecure API dashboard (api.insecure=false)
- Traefik: configure forwarded headers with Cloudflare trusted IPs

Phase 2 - Security Hardening:
- Add security headers middleware (HSTS, X-Frame-Options, nosniff, etc.)
- Add Kyverno pod security policies in audit mode (privileged, host namespaces, SYS_ADMIN, trusted registries)
- Tighten rate limiting (avg=10, burst=50)
- Add Authentik protection to grampsweb

Phase 3 - Monitoring & Alerting:
- Add critical service alerts (PostgreSQL, MySQL, Redis, Headscale, Authentik, Loki)
- Increase Loki retention from 7 to 30 days (720h)
- Add predictive PV filling alert (predict_linear)
- Re-enable Hackmd and Privatebin down alerts

Phase 4 - Reliability:
- Add resource requests/limits to Redis, DBaaS, Technitium, Headscale, Vaultwarden, Uptime Kuma
- Increase Alloy DaemonSet memory to 512Mi/1Gi

Phase 6 - Maintainability:
- Extract duplicated tiers locals to terragrunt.hcl generate block (removed from 67 stacks)
- Replace hardcoded NFS IP 10.0.10.15 with var.nfs_server (114 instances across 63 files)
- Replace hardcoded Redis/PostgreSQL/MySQL/Ollama/mail host references with variables across ~35 stacks
- Migrate xray raw ingress resources to ingress_factory modules
This commit is contained in:
parent
1b4737c90c
commit
89a6e08245
104 changed files with 773 additions and 920 deletions
|
|
@ -125,7 +125,7 @@ alloy:
|
|||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 256Mi
|
||||
memory: 512Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 768Mi
|
||||
memory: 1Gi
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
|
||||
|
||||
# resource "kubernetes_persistent_volume" "prometheus_grafana_pv" {
|
||||
# metadata {
|
||||
# name = "grafana-pv"
|
||||
|
|
@ -11,7 +12,7 @@
|
|||
# persistent_volume_source {
|
||||
# nfs {
|
||||
# path = "/mnt/main/grafana"
|
||||
# server = "10.0.10.15"
|
||||
# server = var.nfs_server
|
||||
# }
|
||||
# # iscsi {
|
||||
# # target_portal = "iscsi.viktorbarzin.lan:3260"
|
||||
|
|
@ -35,7 +36,7 @@ resource "kubernetes_persistent_volume" "alertmanager_pv" {
|
|||
persistent_volume_source {
|
||||
nfs {
|
||||
path = "/mnt/main/alertmanager"
|
||||
server = "10.0.10.15"
|
||||
server = var.nfs_server
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -65,5 +66,5 @@ resource "helm_release" "grafana" {
|
|||
repository = "https://grafana.github.io/helm-charts"
|
||||
chart = "grafana"
|
||||
|
||||
values = [templatefile("${path.module}/grafana_chart_values.yaml", { db_password = var.grafana_db_password, grafana_admin_password = var.grafana_admin_password })]
|
||||
values = [templatefile("${path.module}/grafana_chart_values.yaml", { db_password = var.grafana_db_password, grafana_admin_password = var.grafana_admin_password, mysql_host = var.mysql_host })]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ env:
|
|||
grafana.ini:
|
||||
database:
|
||||
type: mysql
|
||||
host: mysql.dbaas.svc.cluster.local:3306
|
||||
host: ${mysql_host}:3306
|
||||
name: grafana
|
||||
user: grafana
|
||||
password: $__env{GF_DATABASE_PASSWORD}
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
variable "nfs_server" { type = string }
|
||||
|
||||
resource "helm_release" "loki" {
|
||||
namespace = kubernetes_namespace.monitoring.metadata[0].name
|
||||
create_namespace = true
|
||||
|
|
@ -24,7 +26,7 @@ resource "kubernetes_persistent_volume" "loki" {
|
|||
persistent_volume_source {
|
||||
nfs {
|
||||
path = "/mnt/main/loki/loki"
|
||||
server = "10.0.10.15"
|
||||
server = var.nfs_server
|
||||
}
|
||||
}
|
||||
persistent_volume_reclaim_policy = "Retain"
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ loki:
|
|||
limits_config:
|
||||
allow_structured_metadata: true
|
||||
volume_enabled: true
|
||||
retention_period: 168h
|
||||
retention_period: 720h
|
||||
compactor:
|
||||
retention_enabled: true
|
||||
working_directory: /var/loki/compactor
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ variable "pve_password" { type = string }
|
|||
variable "grafana_db_password" { type = string }
|
||||
variable "grafana_admin_password" { type = string }
|
||||
variable "tier" { type = string }
|
||||
variable "mysql_host" { type = string }
|
||||
|
||||
resource "kubernetes_namespace" "monitoring" {
|
||||
metadata {
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
|
||||
|
||||
resource "kubernetes_persistent_volume_claim" "prometheus_server_pvc" {
|
||||
metadata {
|
||||
name = "prometheus-iscsi-pvc"
|
||||
|
|
@ -29,7 +30,7 @@ resource "kubernetes_persistent_volume" "prometheus_server_pvc" {
|
|||
persistent_volume_source {
|
||||
nfs {
|
||||
path = "/mnt/main/prometheus"
|
||||
server = "10.0.10.15"
|
||||
server = var.nfs_server
|
||||
}
|
||||
# iscsi {
|
||||
# fs_type = "ext4"
|
||||
|
|
|
|||
|
|
@ -316,6 +316,13 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "PV {{ $labels.persistentvolumeclaim }} in {{ $labels.namespace }}: {{ $value | printf \"%.0f\" }}% used (threshold: 85%)"
|
||||
- alert: PVPredictedFull
|
||||
expr: predict_linear(kubelet_volume_stats_used_bytes[6h], 3600*24) > kubelet_volume_stats_capacity_bytes
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "PV {{ $labels.persistentvolumeclaim }} in {{ $labels.namespace }} predicted to fill within 24h"
|
||||
- name: K8s Health
|
||||
rules:
|
||||
- alert: PodCrashLooping
|
||||
|
|
@ -389,6 +396,50 @@ serverFiles:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: "Prometheus notification errors: {{ $value | printf \"%.2f\" }}/s"
|
||||
- name: Critical Services
|
||||
rules:
|
||||
- alert: PostgreSQLDown
|
||||
expr: (kube_deployment_status_replicas_available{namespace="dbaas", deployment=~"postgresql.*"} or on() vector(0)) < 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PostgreSQL has no available replicas"
|
||||
- alert: MySQLDown
|
||||
expr: (kube_deployment_status_replicas_available{namespace="dbaas", deployment=~"mysql.*"} or on() vector(0)) < 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "MySQL has no available replicas"
|
||||
- alert: RedisDown
|
||||
expr: (kube_deployment_status_replicas_available{namespace="redis"} or on() vector(0)) < 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Redis has no available replicas"
|
||||
- alert: HeadscaleDown
|
||||
expr: (kube_deployment_status_replicas_available{namespace="headscale"} or on() vector(0)) < 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Headscale VPN has no available replicas"
|
||||
- alert: AuthentikDown
|
||||
expr: (kube_deployment_status_replicas_available{namespace="authentik", deployment="authentik-server"} or on() vector(0)) < 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Authentik auth server has no available replicas"
|
||||
- alert: LokiDown
|
||||
expr: (kube_statefulset_status_replicas_ready{namespace="monitoring", statefulset=~"loki.*"} or on() vector(0)) < 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Loki log aggregation has no ready replicas"
|
||||
- name: Cluster
|
||||
rules:
|
||||
- alert: NodeDown
|
||||
|
|
@ -548,20 +599,20 @@ serverFiles:
|
|||
severity: page
|
||||
annotations:
|
||||
summary: Mail server has no available replicas. This means mail may not be received.
|
||||
# - alert: Hackmd has no replicas available
|
||||
# expr: (kube_deployment_status_replicas_available{namespace="hackmd"} or on() vector(0)) < 1
|
||||
# for: 1m
|
||||
# labels:
|
||||
# severity: page
|
||||
# annotations:
|
||||
# summary: Hackmd has no available replicas.
|
||||
# - alert: Privatebin has no replicas available
|
||||
# expr: (kube_deployment_status_replicas_available{namespace="privatebin"} or on() vector(0)) < 1
|
||||
# for: 10m
|
||||
# labels:
|
||||
# severity: page
|
||||
# annotations:
|
||||
# summary: Privatebin has no available replicas.
|
||||
- alert: HackmdDown
|
||||
expr: (kube_deployment_status_replicas_available{namespace="hackmd"} or on() vector(0)) < 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Hackmd has no available replicas"
|
||||
- alert: PrivatebinDown
|
||||
expr: (kube_deployment_status_replicas_available{namespace="privatebin"} or on() vector(0)) < 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Privatebin has no available replicas"
|
||||
# - name: London OpenWRT Down
|
||||
# rules:
|
||||
# - alert: OpenWRT client unreachable
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue