From d8bcdfef2efd55c14f69b3c477589ae22ad0d50b Mon Sep 17 00:00:00 2001
From: Viktor Barzin
Date: Mon, 9 Mar 2026 22:05:20 +0000
Subject: [PATCH] revert MaxRequestWorkers to 50, exclude nextcloud from 5xx
 alert
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- MaxRequestWorkers 25→50: with only 25 workers, ALL workers could block on
  SQLite locks at once, so liveness probes failed even faster (131 restarts
  vs 50 before). 50 is a compromise: enough workers that probes can still
  get through.
- Excluded nextcloud from the HighServiceErrorRate alert (chronic SQLite
  issue).
- MySQL migration attempted but hit: Group Replication error 3100 (fixed
  with GIPK, generated invisible primary keys), emoji in calendar/filecache
  (stripped), and SQLite corruption (pre-existing from crash-looping).
  Migration rolled back, Nextcloud restored to SQLite.
---
 stacks/nextcloud/main.tf                    | 11 ++--
 stacks/platform/modules/monitoring/loki.tf  | 63 ++++++++++++++++++-
 .../monitoring/prometheus_chart_values.tpl  | 21 +++++++
 3 files changed, 89 insertions(+), 6 deletions(-)

diff --git a/stacks/nextcloud/main.tf b/stacks/nextcloud/main.tf
index 41d773f0..cf5604e7 100644
--- a/stacks/nextcloud/main.tf
+++ b/stacks/nextcloud/main.tf
@@ -94,12 +94,13 @@ resource "kubernetes_config_map" "apache_tuning" {
   data = {
     "mpm_prefork.conf" = <<-EOF
       # Tuned for container with 6Gi memory limit
-      # Each worker uses ~220MB RSS, so 25 workers ≈ 5.5GB
+      # Each worker uses ~220MB RSS, so 50 workers ≈ 11GB (shared pages reduce actual)
+      # Need enough workers so probes can get through during SQLite locks

-      StartServers           3
-      MinSpareServers        2
-      MaxSpareServers        5
-      MaxRequestWorkers      25
+      StartServers           5
+      MinSpareServers        3
+      MaxSpareServers        10
+      MaxRequestWorkers      50
       MaxConnectionsPerChild 200
     EOF

diff --git a/stacks/platform/modules/monitoring/loki.tf b/stacks/platform/modules/monitoring/loki.tf
index 62f7d121..9b67e65a 100644
--- a/stacks/platform/modules/monitoring/loki.tf
+++ b/stacks/platform/modules/monitoring/loki.tf
@@ -109,7 +109,68 @@ resource "kubernetes_config_map" "loki_alert_rules" {
   }
   data = {
     "rules.yaml" = yamlencode({
-      groups = []
+      groups = [
+        {
+          name = "Node Health"
+          rules = [
+            {
+              alert = "KernelOOMKiller"
+              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Out of memory.*Killed process\" [5m])) > 0"
+              for   = "0m"
+              labels = {
+                severity = "critical"
+              }
+              annotations = {
+                summary = "OOM killer active on {{ $labels.node }}"
+              }
+            },
+            {
+              alert = "KernelPanic"
+              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Kernel panic\" [5m])) > 0"
+              for   = "0m"
+              labels = {
+                severity = "critical"
+              }
+              annotations = {
+                summary = "Kernel panic on {{ $labels.node }}"
+              }
+            },
+            {
+              alert = "KernelHungTask"
+              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"blocked for more than\" [5m])) > 0"
+              for   = "0m"
+              labels = {
+                severity = "warning"
+              }
+              annotations = {
+                summary = "Hung task detected on {{ $labels.node }}"
+              }
+            },
+            {
+              alert = "KernelSoftLockup"
+              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)soft lockup\" [5m])) > 0"
+              for   = "0m"
+              labels = {
+                severity = "critical"
+              }
+              annotations = {
+                summary = "Soft lockup on {{ $labels.node }}"
+              }
+            },
+            {
+              alert = "ContainerdDown"
+              expr  = "sum by (node) (count_over_time({job=\"node-journal\", unit=\"containerd.service\"} |~ \"(?i)(dead|failed|deactivating)\" [5m])) > 0"
+              for   = "1m"
+              labels = {
+                severity = "critical"
+              }
+              annotations = {
+                summary = "containerd service unhealthy on {{ $labels.node }}"
+              }
+            },
+          ]
+        }
+      ]
     })
   }
 }
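
The LogQL expressions above can be spot-checked against the running Loki
before the alert rules are trusted. A minimal sketch in Python, assuming
Loki is reachable on localhost:3100 (e.g. via a kubectl port-forward; the
address is an assumption, not part of this change):

import requests

LOKI_URL = "http://localhost:3100"  # assumed port-forwarded Loki

# Same expression as the KernelOOMKiller rule above.
QUERY = (
    'sum by (node) (count_over_time('
    '{job="node-journal"} |~ "(?i)Out of memory.*Killed process" [5m])) > 0'
)

resp = requests.get(
    f"{LOKI_URL}/loki/api/v1/query",  # Loki instant-query endpoint
    params={"query": QUERY},
    timeout=10,
)
resp.raise_for_status()

# A metric-style LogQL query returns a vector; any sample returned here
# means the alert would currently be firing for that node.
for sample in resp.json()["data"]["result"]:
    node = sample["metric"].get("node", "<unknown>")
    _ts, value = sample["value"]
    print(f"{node}: {value} OOM kill(s) in the last 5m")

An empty result is the healthy case; swapping QUERY for any of the other
rule expressions checks those the same way.
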
diff --git a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
index cc450d46..02967b7c 100755
--- a/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/platform/modules/monitoring/prometheus_chart_values.tpl
@@ -586,6 +586,27 @@ serverFiles:
           severity: warning
         annotations:
           summary: "{{ $labels.namespace }}/{{ $labels.daemonset }}: {{ $value | printf \"%.0f\" }} pod(s) missing"
+      - alert: NodeMemoryPressureTrending
+        expr: ((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100) > 85
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Memory usage on {{ $labels.instance }}: {{ $value | printf \"%.0f\" }}% (threshold: 85%)"
+      - alert: NodeExporterDown
+        expr: up{job="prometheus-prometheus-node-exporter"} == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Node exporter down: {{ $labels.instance }}"
+      - alert: NodeHighIOWait
+        expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 30
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "IOWait on {{ $labels.instance }}: {{ $value | printf \"%.0f\" }}% (threshold: 30%)"
       - alert: NoNodeLoadData
         expr: (node_load1 OR on() vector(0)) == 0
         for: 10m
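
The new Prometheus expressions can be sanity-checked the same way through
the Prometheus HTTP API before the alerts ship. A minimal sketch, assuming
the server is reachable on localhost:9090 (an assumption, e.g. via a
port-forward to the Prometheus server pod):

import requests

PROM_URL = "http://localhost:9090"  # assumed port-forwarded Prometheus

# Same expression as the NodeHighIOWait rule, without the > 30 filter so
# every node's current value is visible.
EXPR = 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100'

resp = requests.get(
    f"{PROM_URL}/api/v1/query",  # Prometheus instant-query endpoint
    params={"query": EXPR},
    timeout=10,
)
resp.raise_for_status()

for sample in resp.json()["data"]["result"]:
    instance = sample["metric"]["instance"]
    _ts, value = sample["value"]
    print(f"{instance}: {float(value):.1f}% iowait (alert fires above 30% for 10m)")

The memory-pressure and node-exporter expressions drop in the same way;
only the query string changes.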