From 71ff8039788aab88b533ebc032fdcb0bc10a9802 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 13 Feb 2026 23:03:40 +0000 Subject: [PATCH] [ci skip] Add centralized log collection: Loki + Alloy + sysctl DaemonSet --- modules/kubernetes/monitoring/alloy.yaml | 9 + modules/kubernetes/monitoring/loki.tf | 225 +++++++++++++++++------ modules/kubernetes/monitoring/loki.yaml | 54 +++++- 3 files changed, 219 insertions(+), 69 deletions(-) diff --git a/modules/kubernetes/monitoring/alloy.yaml b/modules/kubernetes/monitoring/alloy.yaml index c7b0caaa..d3d6a0db 100644 --- a/modules/kubernetes/monitoring/alloy.yaml +++ b/modules/kubernetes/monitoring/alloy.yaml @@ -98,3 +98,12 @@ alloy: forward_to = [loki.write.default.receiver] } + + # Resource limits for DaemonSet pods + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi diff --git a/modules/kubernetes/monitoring/loki.tf b/modules/kubernetes/monitoring/loki.tf index 2be69371..b6446d09 100644 --- a/modules/kubernetes/monitoring/loki.tf +++ b/modules/kubernetes/monitoring/loki.tf @@ -1,74 +1,106 @@ -# resource "helm_release" "loki" { -# namespace = kubernetes_namespace.monitoring.metadata[0].name -# create_namespace = true -# name = "loki" +resource "helm_release" "loki" { + namespace = kubernetes_namespace.monitoring.metadata[0].name + create_namespace = true + name = "loki" -# repository = "https://grafana.github.io/helm-charts" -# chart = "loki" + repository = "https://grafana.github.io/helm-charts" + chart = "loki" -# values = [templatefile("${path.module}/loki.yaml", {})] -# atomic = true -# timeout = 120 -# } + values = [templatefile("${path.module}/loki.yaml", {})] + timeout = 300 -# resource "kubernetes_persistent_volume" "loki" { -# metadata { -# name = "loki" -# } -# spec { -# capacity = { -# storage = "15Gi" -# } -# access_modes = ["ReadWriteOnce"] -# persistent_volume_source { -# nfs { -# path = "/mnt/main/loki/loki" -# server = "10.0.10.15" -# } -# } -# persistent_volume_reclaim_policy = "Retain" -# volume_mode = "Filesystem" -# } -# } - -# resource "kubernetes_persistent_volume" "loki-minio" { -# metadata { -# name = "loki-minio" -# } -# spec { -# capacity = { -# storage = "15Gi" -# } -# access_modes = ["ReadWriteMany"] -# persistent_volume_source { -# nfs { -# path = "/mnt/main/loki/minio" -# server = "10.0.10.15" -# } -# } -# persistent_volume_reclaim_policy = "Retain" -# volume_mode = "Filesystem" -# } -# } + depends_on = [kubernetes_config_map.loki_alert_rules] +} +resource "kubernetes_persistent_volume" "loki" { + metadata { + name = "loki" + } + spec { + capacity = { + storage = "15Gi" + } + access_modes = ["ReadWriteOnce"] + persistent_volume_source { + nfs { + path = "/mnt/main/loki/loki" + server = "10.0.10.15" + } + } + persistent_volume_reclaim_policy = "Retain" + volume_mode = "Filesystem" + } +} # https://grafana.com/docs/alloy/latest/configure/kubernetes/ -# resource "helm_release" "alloy" { -# namespace = kubernetes_namespace.monitoring.metadata[0].name -# create_namespace = true -# name = "alloy" +resource "helm_release" "alloy" { + namespace = kubernetes_namespace.monitoring.metadata[0].name + create_namespace = true + name = "alloy" -# repository = "https://grafana.github.io/helm-charts" -# chart = "alloy" + repository = "https://grafana.github.io/helm-charts" + chart = "alloy" -# atomic = true -# } + values = [file("${path.module}/alloy.yaml")] + atomic = true -# Increase open file limits as alloy is reading files: -# https://serverfault.com/questions/1137211/failed-to-create-fsnotify-watcher-too-many-open-files + depends_on = [helm_release.loki] +} -# run for all nodes using : -# for n in $(kbn | awk '{print $1}'); do echo $n; s wizard@$n 'sudo sysctl -w fs.inotify.max_user_watches=2099999999; sudo sysctl -w fs.inotify.max_user_instances=2099999999;sudo sysctl -w fs.inotify.max_queued_events=2099999999'; done +resource "kubernetes_daemon_set_v1" "sysctl-inotify" { + metadata { + name = "sysctl-inotify" + namespace = kubernetes_namespace.monitoring.metadata[0].name + labels = { + app = "sysctl-inotify" + } + } + spec { + selector { + match_labels = { + app = "sysctl-inotify" + } + } + template { + metadata { + labels = { + app = "sysctl-inotify" + } + } + spec { + init_container { + name = "sysctl" + image = "busybox:1.37" + command = [ + "sh", "-c", + "sysctl -w fs.inotify.max_user_watches=1048576 && sysctl -w fs.inotify.max_user_instances=512 && sysctl -w fs.inotify.max_queued_events=1048576" + ] + security_context { + privileged = true + } + } + container { + name = "pause" + image = "registry.k8s.io/pause:3.10" + resources { + requests = { + cpu = "1m" + memory = "4Mi" + } + limits = { + cpu = "1m" + memory = "4Mi" + } + } + } + host_pid = true + toleration { + operator = "Exists" + } + } + } + } +} # resource "helm_release" "k8s-monitoring" { # namespace = kubernetes_namespace.monitoring.metadata[0].name @@ -81,3 +113,74 @@ # values = [templatefile("${path.module}/k8s-monitoring-values.yaml", {})] # atomic = true # } + +resource "kubernetes_config_map" "loki_alert_rules" { + metadata { + name = "loki-alert-rules" + namespace = kubernetes_namespace.monitoring.metadata[0].name + } + data = { + "rules.yaml" = yamlencode({ + groups = [{ + name = "log-alerts" + rules = [ + { + alert = "HighErrorRate" + expr = "sum(rate({namespace=~\".+\"} |= \"error\" [5m])) by (namespace) > 10" + for = "5m" + labels = { + severity = "warning" + } + annotations = { + summary = "High error rate in {{ $labels.namespace }}" + } + }, + { + alert = "PodCrashLoopBackOff" + expr = "count_over_time({namespace=~\".+\"} |= \"CrashLoopBackOff\" [5m]) > 0" + for = "1m" + labels = { + severity = "critical" + } + annotations = { + summary = "CrashLoopBackOff detected in {{ $labels.namespace }}" + } + }, + { + alert = "OOMKilled" + expr = "count_over_time({namespace=~\".+\"} |= \"OOMKilled\" [5m]) > 0" + for = "1m" + labels = { + severity = "critical" + } + annotations = { + summary = "OOMKilled detected in {{ $labels.namespace }}" + } + } + ] + }] + }) + } +} + +resource "kubernetes_config_map" "grafana_loki_datasource" { + metadata { + name = "grafana-loki-datasource" + namespace = kubernetes_namespace.monitoring.metadata[0].name + labels = { + grafana_datasource = "1" + } + } + data = { + "loki-datasource.yaml" = yamlencode({ + apiVersion = 1 + datasources = [{ + name = "Loki" + type = "loki" + access = "proxy" + url = "http://loki.monitoring.svc.cluster.local:3100" + isDefault = false + }] + }) + } +} diff --git a/modules/kubernetes/monitoring/loki.yaml b/modules/kubernetes/monitoring/loki.yaml index 3684e34e..06bb8765 100644 --- a/modules/kubernetes/monitoring/loki.yaml +++ b/modules/kubernetes/monitoring/loki.yaml @@ -10,22 +10,37 @@ loki: index: prefix: loki_index_ period: 24h + ingester: + chunk_idle_period: 12h + max_chunk_age: 24h + chunk_retain_period: 1m + chunk_target_size: 1572864 + wal: + dir: /loki-wal pattern_ingester: enabled: true limits_config: allow_structured_metadata: true volume_enabled: true + retention_period: 168h + compactor: + retention_enabled: true + working_directory: /loki/compactor + compaction_interval: 1h + delete_request_store: filesystem ruler: enable_api: true + storage: + type: local + local: + directory: /loki/rules + alertmanager_url: http://alertmanager.monitoring.svc.cluster.local:9093 + ring: + kvstore: + store: inmemory + rule_path: /loki/scratch storage: type: "filesystem" - persistence: - enabled: true - size: 15Gi - accessModes: - - ReadWriteOnce - # Auth requires a revers proxy providing basic auth - # https://grafana.com/docs/loki/latest/operations/authentication/ auth_enabled: false minio: @@ -35,6 +50,30 @@ deploymentMode: SingleBinary singleBinary: replicas: 1 + persistence: + enabled: true + size: 15Gi + storageClass: "" + extraVolumes: + - name: wal + emptyDir: + medium: Memory + sizeLimit: 2Gi + - name: rules + configMap: + name: loki-alert-rules + extraVolumeMounts: + - name: wal + mountPath: /loki-wal + - name: rules + mountPath: /loki/rules/fake + resources: + requests: + cpu: 250m + memory: 4Gi + limits: + cpu: "1" + memory: 6Gi # Zero out replica counts of other deployment modes backend: @@ -43,7 +82,6 @@ read: replicas: 0 write: replicas: 0 - ingester: replicas: 0 querier: