[ci skip] Add centralized log collection: Loki + Alloy + sysctl DaemonSet

This commit is contained in:
Viktor Barzin 2026-02-13 23:03:40 +00:00
parent a44dfac721
commit 71ff803978
3 changed files with 219 additions and 69 deletions

View file

@@ -98,3 +98,12 @@ alloy:
forward_to = [loki.write.default.receiver]
}
# Resource limits for DaemonSet pods
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 200m
memory: 128Mi

View file

@@ -1,74 +1,106 @@
# resource "helm_release" "loki" {
# namespace = kubernetes_namespace.monitoring.metadata[0].name
# create_namespace = true
# name = "loki"
resource "helm_release" "loki" {
namespace = kubernetes_namespace.monitoring.metadata[0].name
create_namespace = true
name = "loki"
# repository = "https://grafana.github.io/helm-charts"
# chart = "loki"
repository = "https://grafana.github.io/helm-charts"
chart = "loki"
# values = [templatefile("${path.module}/loki.yaml", {})]
# atomic = true
# timeout = 120
# }
values = [templatefile("${path.module}/loki.yaml", {})]
timeout = 300
# resource "kubernetes_persistent_volume" "loki" {
# metadata {
# name = "loki"
# }
# spec {
# capacity = {
# storage = "15Gi"
# }
# access_modes = ["ReadWriteOnce"]
# persistent_volume_source {
# nfs {
# path = "/mnt/main/loki/loki"
# server = "10.0.10.15"
# }
# }
# persistent_volume_reclaim_policy = "Retain"
# volume_mode = "Filesystem"
# }
# }
# resource "kubernetes_persistent_volume" "loki-minio" {
# metadata {
# name = "loki-minio"
# }
# spec {
# capacity = {
# storage = "15Gi"
# }
# access_modes = ["ReadWriteMany"]
# persistent_volume_source {
# nfs {
# path = "/mnt/main/loki/minio"
# server = "10.0.10.15"
# }
# }
# persistent_volume_reclaim_policy = "Retain"
# volume_mode = "Filesystem"
# }
# }
depends_on = [kubernetes_config_map.loki_alert_rules]
}
# Statically provisioned, NFS-backed PersistentVolume for Loki's filesystem
# storage. "Retain" keeps the data on the NFS share even if the bound claim
# is deleted, so logs survive chart reinstalls.
resource "kubernetes_persistent_volume" "loki" {
metadata {
name = "loki"
}
spec {
capacity = {
storage = "15Gi" # keep in sync with singleBinary.persistence.size in loki.yaml
}
access_modes = ["ReadWriteOnce"]
persistent_volume_source {
nfs {
path = "/mnt/main/loki/loki" # dataset on the NAS dedicated to Loki chunks/index
server = "10.0.10.15"
}
}
persistent_volume_reclaim_policy = "Retain"
volume_mode = "Filesystem"
}
}
# https://grafana.com/docs/alloy/latest/configure/kubernetes/
# resource "helm_release" "alloy" {
# namespace = kubernetes_namespace.monitoring.metadata[0].name
# create_namespace = true
# name = "alloy"
resource "helm_release" "alloy" {
namespace = kubernetes_namespace.monitoring.metadata[0].name
create_namespace = true
name = "alloy"
# repository = "https://grafana.github.io/helm-charts"
# chart = "alloy"
repository = "https://grafana.github.io/helm-charts"
chart = "alloy"
# atomic = true
# }
values = [file("${path.module}/alloy.yaml")]
atomic = true
# Increase open file limits as alloy is reading files:
# https://serverfault.com/questions/1137211/failed-to-create-fsnotify-watcher-too-many-open-files
depends_on = [helm_release.loki]
}
# run for all nodes using :
# for n in $(kbn | awk '{print $1}'); do echo $n; s wizard@$n 'sudo sysctl -w fs.inotify.max_user_watches=2099999999; sudo sysctl -w fs.inotify.max_user_instances=2099999999;sudo sysctl -w fs.inotify.max_queued_events=2099999999'; done
# DaemonSet that raises inotify limits on every node via a privileged init
# container, then parks a pause container so the pod stays Running.
# Motivation: Alloy tails many log files and hits "too many open files"
# (see the serverfault link above).
# NOTE(review): these values (1048576 / 512 / 1048576) are far lower than the
# 2099999999 used in the ad-hoc ssh loop documented above — confirm which is
# intended. Also, the sysctls apply once at pod start; a value reset on the
# node is only re-applied when the pod restarts.
resource "kubernetes_daemon_set_v1" "sysctl-inotify" {
metadata {
name = "sysctl-inotify"
namespace = kubernetes_namespace.monitoring.metadata[0].name
labels = {
app = "sysctl-inotify"
}
}
spec {
selector {
match_labels = {
app = "sysctl-inotify"
}
}
template {
metadata {
labels = {
app = "sysctl-inotify"
}
}
spec {
# Privileged init container performs the one-shot sysctl writes.
init_container {
name = "sysctl"
image = "busybox:1.37"
command = [
"sh", "-c",
"sysctl -w fs.inotify.max_user_watches=1048576 && sysctl -w fs.inotify.max_user_instances=512 && sysctl -w fs.inotify.max_queued_events=1048576"
]
security_context {
privileged = true # required to write host sysctls
}
}
# Minimal long-running container: a DaemonSet pod must keep running,
# so use the pause image with the smallest possible resource footprint.
container {
name = "pause"
image = "registry.k8s.io/pause:3.10"
resources {
requests = {
cpu = "1m"
memory = "4Mi"
}
limits = {
cpu = "1m"
memory = "4Mi"
}
}
}
host_pid = true # share the host PID namespace so the sysctls target the node
# Empty-key "Exists" toleration matches all taints, so this runs on
# every node, including control-plane/tainted ones.
toleration {
operator = "Exists"
}
}
}
}
}
# resource "helm_release" "k8s-monitoring" {
# namespace = kubernetes_namespace.monitoring.metadata[0].name
@@ -81,3 +113,74 @@
# values = [templatefile("${path.module}/k8s-monitoring-values.yaml", {})]
# atomic = true
# }
# Loki ruler alert rules, delivered as a ConfigMap. The loki.yaml values file
# mounts this ConfigMap at /loki/rules/fake ("fake" is the tenant directory
# when auth_enabled is false) and points the ruler at Alertmanager.
# The expr strings are LogQL; double quotes inside them are escaped for HCL.
resource "kubernetes_config_map" "loki_alert_rules" {
metadata {
name = "loki-alert-rules" # referenced by name from extraVolumes in loki.yaml — keep in sync
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
data = {
"rules.yaml" = yamlencode({
groups = [{
name = "log-alerts"
rules = [
{
# Fires when any namespace logs >10 "error" lines/sec over 5m.
alert = "HighErrorRate"
expr = "sum(rate({namespace=~\".+\"} |= \"error\" [5m])) by (namespace) > 10"
for = "5m"
labels = {
severity = "warning"
}
annotations = {
summary = "High error rate in {{ $labels.namespace }}"
}
},
{
# Fires on any log line mentioning CrashLoopBackOff in the last 5m.
alert = "PodCrashLoopBackOff"
expr = "count_over_time({namespace=~\".+\"} |= \"CrashLoopBackOff\" [5m]) > 0"
for = "1m"
labels = {
severity = "critical"
}
annotations = {
summary = "CrashLoopBackOff detected in {{ $labels.namespace }}"
}
},
{
# Fires on any log line mentioning OOMKilled in the last 5m.
alert = "OOMKilled"
expr = "count_over_time({namespace=~\".+\"} |= \"OOMKilled\" [5m]) > 0"
for = "1m"
labels = {
severity = "critical"
}
annotations = {
summary = "OOMKilled detected in {{ $labels.namespace }}"
}
}
]
}]
})
}
}
# Grafana datasource provisioning for Loki. The grafana_datasource=1 label
# presumably marks this ConfigMap for pickup by a Grafana provisioning
# sidecar — TODO confirm the Grafana deployment runs one.
resource "kubernetes_config_map" "grafana_loki_datasource" {
metadata {
name = "grafana-loki-datasource"
namespace = kubernetes_namespace.monitoring.metadata[0].name
labels = {
grafana_datasource = "1"
}
}
data = {
"loki-datasource.yaml" = yamlencode({
apiVersion = 1
datasources = [{
name = "Loki"
type = "loki"
access = "proxy" # Grafana backend proxies queries; browser never hits Loki directly
url = "http://loki.monitoring.svc.cluster.local:3100" # in-cluster Loki service
isDefault = false
}]
})
}
}

View file

@ -10,22 +10,37 @@ loki:
index:
prefix: loki_index_
period: 24h
ingester:
chunk_idle_period: 12h
max_chunk_age: 24h
chunk_retain_period: 1m
chunk_target_size: 1572864
wal:
dir: /loki-wal
pattern_ingester:
enabled: true
limits_config:
allow_structured_metadata: true
volume_enabled: true
retention_period: 168h
compactor:
retention_enabled: true
working_directory: /loki/compactor
compaction_interval: 1h
delete_request_store: filesystem
ruler:
enable_api: true
storage:
type: local
local:
directory: /loki/rules
alertmanager_url: http://alertmanager.monitoring.svc.cluster.local:9093
ring:
kvstore:
store: inmemory
rule_path: /loki/scratch
storage:
type: "filesystem"
persistence:
enabled: true
size: 15Gi
accessModes:
- ReadWriteOnce
# Auth requires a reverse proxy providing basic auth
# https://grafana.com/docs/loki/latest/operations/authentication/
auth_enabled: false
minio:
@@ -35,6 +50,30 @@ deploymentMode: SingleBinary
singleBinary:
replicas: 1
persistence:
enabled: true
size: 15Gi
storageClass: ""
extraVolumes:
- name: wal
emptyDir:
medium: Memory
sizeLimit: 2Gi
- name: rules
configMap:
name: loki-alert-rules
extraVolumeMounts:
- name: wal
mountPath: /loki-wal
- name: rules
mountPath: /loki/rules/fake
resources:
requests:
cpu: 250m
memory: 4Gi
limits:
cpu: "1"
memory: 6Gi
# Zero out replica counts of other deployment modes
backend:
@@ -43,7 +82,6 @@ read:
replicas: 0
write:
replicas: 0
ingester:
replicas: 0
querier: