Phase 1 - Critical Security:
- Netbox: move hardcoded DB/superuser passwords to variables
- MeshCentral: disable public registration, add Authentik auth
- Traefik: disable insecure API dashboard (api.insecure=false)
- Traefik: configure forwarded headers with Cloudflare trusted IPs

Phase 2 - Security Hardening:
- Add security headers middleware (HSTS, X-Frame-Options, nosniff, etc.)
- Add Kyverno pod security policies in audit mode (privileged, host namespaces, SYS_ADMIN, trusted registries)
- Tighten rate limiting (avg=10, burst=50)
- Add Authentik protection to grampsweb

Phase 3 - Monitoring & Alerting:
- Add critical service alerts (PostgreSQL, MySQL, Redis, Headscale, Authentik, Loki)
- Increase Loki retention from 7 to 30 days (720h)
- Add predictive PV filling alert (predict_linear)
- Re-enable Hackmd and Privatebin down alerts

Phase 4 - Reliability:
- Add resource requests/limits to Redis, DBaaS, Technitium, Headscale, Vaultwarden, Uptime Kuma
- Increase Alloy DaemonSet memory to 512Mi/1Gi

Phase 6 - Maintainability:
- Extract duplicated tiers locals to terragrunt.hcl generate block (removed from 67 stacks)
- Replace hardcoded NFS IP 10.0.10.15 with var.nfs_server (114 instances across 63 files)
- Replace hardcoded Redis/PostgreSQL/MySQL/Ollama/mail host references with variables across ~35 stacks
- Migrate xray raw ingress resources to ingress_factory modules
131 lines
4.6 KiB
YAML
131 lines
4.6 KiB
YAML
# Helm values for the Grafana Alloy chart (DaemonSet).
# Alloy collects pod logs via the Kubernetes API plus the node audit log,
# and pushes everything to the in-cluster Loki.
alloy:
  configMap:
    # Alloy pipeline (River syntax):
    #   discovery.kubernetes -> discovery.relabel -> loki.source.kubernetes -> loki.process -> loki.write
    #   local.file_match     -> loki.source.file  ----------------------------------------^
    content: |-
      logging {
        level  = "info"
        format = "logfmt"
      }

      // Single Loki sink shared by both the pod-log and audit-log pipelines.
      loki.write "default" {
        endpoint {
          url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"
        }
      }

      // discovery.kubernetes allows you to find scrape targets from Kubernetes resources.
      // It watches cluster state and ensures targets are continually synced with what is currently running in your cluster.
      discovery.kubernetes "pod" {
        role = "pod"
      }

      // discovery.relabel rewrites the label set of the input targets by applying one or more relabeling rules.
      // If no rules are defined, then the input targets are exported as-is.
      discovery.relabel "pod_logs" {
        targets = discovery.kubernetes.pod.targets

        // Label creation - "namespace" field from "__meta_kubernetes_namespace"
        rule {
          source_labels = ["__meta_kubernetes_namespace"]
          action        = "replace"
          target_label  = "namespace"
        }

        // Label creation - "pod" field from "__meta_kubernetes_pod_name"
        rule {
          source_labels = ["__meta_kubernetes_pod_name"]
          action        = "replace"
          target_label  = "pod"
        }

        // Label creation - "container" field from "__meta_kubernetes_pod_container_name"
        rule {
          source_labels = ["__meta_kubernetes_pod_container_name"]
          action        = "replace"
          target_label  = "container"
        }

        // Label creation - "app" field from "__meta_kubernetes_pod_label_app_kubernetes_io_name"
        rule {
          source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
          action        = "replace"
          target_label  = "app"
        }

        // Label creation - "job" field from "__meta_kubernetes_namespace" and "__meta_kubernetes_pod_container_name"
        // Concatenate values __meta_kubernetes_namespace/__meta_kubernetes_pod_container_name
        // (default regex "(.*)" captures the separator-joined value, so $1 is "namespace/container")
        rule {
          source_labels = ["__meta_kubernetes_namespace", "__meta_kubernetes_pod_container_name"]
          action        = "replace"
          target_label  = "job"
          separator     = "/"
          replacement   = "$1"
        }

        // Label creation - "__path__" field from "__meta_kubernetes_pod_uid" and "__meta_kubernetes_pod_container_name"
        // Concatenate values __meta_kubernetes_pod_uid/__meta_kubernetes_pod_container_name.log
        rule {
          source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
          action        = "replace"
          target_label  = "__path__"
          separator     = "/"
          replacement   = "/var/log/pods/*$1/*.log"
        }

        // Label creation - "container_runtime" field from "__meta_kubernetes_pod_container_id"
        // The container ID looks like "<runtime>://<id>"; keep only the scheme (e.g. containerd, docker).
        rule {
          source_labels = ["__meta_kubernetes_pod_container_id"]
          action        = "replace"
          target_label  = "container_runtime"
          regex         = "^(\\S+):\\/\\/.+$"
          replacement   = "$1"
        }
      }

      // loki.source.kubernetes tails logs from Kubernetes containers using the Kubernetes API.
      loki.source.kubernetes "pod_logs" {
        targets    = discovery.relabel.pod_logs.output
        forward_to = [loki.process.pod_logs.receiver]
      }

      // loki.process receives log entries from other Loki components, applies one or more processing stages,
      // and forwards the results to the list of receivers in the component's arguments.
      loki.process "pod_logs" {
        stage.static_labels {
          values = {
            cluster = "default",
          }
        }

        forward_to = [loki.write.default.receiver]
      }

      // Kubernetes audit log collection from /var/log/kubernetes/audit.log
      // Requires alloy.mounts.varlog=true to mount /var/log from the host
      local.file_match "audit_logs" {
        path_targets = [{
          __path__ = "/var/log/kubernetes/audit.log",
          job      = "kubernetes-audit",
          node     = env("HOSTNAME"),
        }]
      }

      loki.source.file "audit_logs" {
        targets    = local.file_match.audit_logs.targets
        forward_to = [loki.write.default.receiver]
      }

  # Mount /var/log from the host for file-based log collection (audit logs)
  mounts:
    varlog: true

  # Resource limits for DaemonSet pods
  # Alloy tails logs from all containers on the node via K8s API and batches
  # them to Loki. Memory scales with number of active log streams (~30-50 per node).
  # 128Mi was OOMKilled; steady-state usage is ~400-450Mi per pod.
  resources:
    requests:
      cpu: 50m
      memory: 512Mi
    limits:
      cpu: 200m
      memory: 1Gi