infra/stacks/monitoring/modules/monitoring/loki.tf
Viktor Barzin ae36dc253b extract monitoring, nvidia, mailserver, cloudflared, kyverno from platform [ci skip]
Phase 2 of platform stack split. 5 more modules extracted into
independent stacks. All applied successfully with zero destroys.
Cloudflared now reads k8s_users from Vault directly to compute
user_domains. Woodpecker pipeline runs all 8 extracted stacks
in parallel. Memory bumped to 6Gi for 9 concurrent TF processes.
Platform reduced from 27 to 19 modules.
2026-03-17 21:34:11 +00:00

220 lines
6.1 KiB
HCL

variable "nfs_server" { type = string }
# LOKI DISABLED - Uncomment to re-enable centralized logging
# Disabled due to operational overhead vs benefit analysis after node2 incident
# All configuration preserved in loki.yaml for future re-enabling
/*
resource "helm_release" "loki" {
namespace = kubernetes_namespace.monitoring.metadata[0].name
create_namespace = true
name = "loki"
repository = "https://grafana.github.io/helm-charts"
chart = "loki"
values = [templatefile("${path.module}/loki.yaml", {})]
timeout = 600
depends_on = [kubernetes_config_map.loki_alert_rules]
}
*/
# ALLOY DISABLED - Log collection agents (depends on Loki)
# https://grafana.com/docs/alloy/latest/configure/kubernetes/
# Configuration preserved in alloy.yaml for future re-enabling
/*
resource "helm_release" "alloy" {
namespace = kubernetes_namespace.monitoring.metadata[0].name
create_namespace = true
name = "alloy"
repository = "https://grafana.github.io/helm-charts"
chart = "alloy"
values = [file("${path.module}/alloy.yaml")]
atomic = true
depends_on = [helm_release.loki]
}
*/
# SYSCTL INOTIFY DISABLED - Was specifically for Loki file watching requirements
# Can be re-enabled when Loki is restored
/*
resource "kubernetes_daemon_set_v1" "sysctl-inotify" {
metadata {
name = "sysctl-inotify"
namespace = kubernetes_namespace.monitoring.metadata[0].name
labels = {
app = "sysctl-inotify"
}
}
spec {
selector {
match_labels = {
app = "sysctl-inotify"
}
}
template {
metadata {
labels = {
app = "sysctl-inotify"
}
}
spec {
init_container {
name = "sysctl"
image = "busybox:1.37"
command = [
"sh", "-c",
"sysctl -w fs.inotify.max_user_watches=1048576 && sysctl -w fs.inotify.max_user_instances=8192 && sysctl -w fs.inotify.max_queued_events=1048576"
]
security_context {
privileged = true
}
}
container {
name = "pause"
image = "registry.k8s.io/pause:3.10"
resources {
requests = {
cpu = "1m"
memory = "4Mi"
}
limits = {
cpu = "1m"
memory = "4Mi"
}
}
}
host_pid = true
toleration {
operator = "Exists"
}
dns_config {
option {
name = "ndots"
value = "2"
}
}
}
}
}
}
*/
# resource "helm_release" "k8s-monitoring" {
# namespace = kubernetes_namespace.monitoring.metadata[0].name
# create_namespace = true
# name = "k8s-monitoring"
# repository = "https://grafana.github.io/helm-charts"
# chart = "k8s-monitoring"
# values = [templatefile("${path.module}/k8s-monitoring-values.yaml", {})]
# atomic = true
# }
# LOKI ALERT RULES DISABLED - Depend on Loki log queries
# These alert on kernel events from systemd journal logs via Loki
# Can be re-enabled when Loki is restored
/*
resource "kubernetes_config_map" "loki_alert_rules" {
metadata {
name = "loki-alert-rules"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
data = {
"rules.yaml" = yamlencode({
groups = [
{
name = "Node Health"
rules = [
{
alert = "KernelOOMKiller"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Out of memory.*Killed process\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "OOM killer active on {{ $labels.node }}"
}
},
{
alert = "KernelPanic"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Kernel panic\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "Kernel panic on {{ $labels.node }}"
}
},
{
alert = "KernelHungTask"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"blocked for more than\" [5m])) > 0"
for = "0m"
labels = {
severity = "warning"
}
annotations = {
summary = "Hung task detected on {{ $labels.node }}"
}
},
{
alert = "KernelSoftLockup"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)soft lockup\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "Soft lockup on {{ $labels.node }}"
}
},
{
alert = "ContainerdDown"
expr = "sum by (node) (count_over_time({job=\"node-journal\", unit=\"containerd.service\"} |~ \"(?i)(dead|failed|deactivating)\" [5m])) > 0"
for = "1m"
labels = {
severity = "critical"
}
annotations = {
summary = "containerd service unhealthy on {{ $labels.node }}"
}
},
]
}
]
})
}
}
*/
# GRAFANA LOKI DATASOURCE DISABLED - Points to non-existent Loki service
# Can be re-enabled when Loki is restored
/*
resource "kubernetes_config_map" "grafana_loki_datasource" {
metadata {
name = "grafana-loki-datasource"
namespace = kubernetes_namespace.monitoring.metadata[0].name
labels = {
grafana_datasource = "1"
}
}
data = {
"loki-datasource.yaml" = yamlencode({
apiVersion = 1
datasources = [{
name = "Loki"
type = "loki"
access = "proxy"
url = "http://loki.monitoring.svc.cluster.local:3100"
isDefault = false
}]
})
}
}
*/