revert MaxRequestWorkers to 50, exclude nextcloud from 5xx alert

- MaxRequestWorkers 25→50: too few workers caused ALL workers to block
  on SQLite locks, making liveness probes fail even faster (131 restarts
  vs 50 before). 50 is a compromise — enough workers for probes.
- Excluded nextcloud from HighServiceErrorRate alert (chronic SQLite issue)
- MySQL migration attempted but hit: GR error 3100 (fixed with GIPK),
  emoji in calendar/filecache (stripped), SQLite corruption (pre-existing
  from crash-looping). Migration rolled back, Nextcloud restored to SQLite.
This commit is contained in:
Viktor Barzin 2026-03-09 22:05:20 +00:00
parent eed991a27b
commit d8bcdfef2e
3 changed files with 89 additions and 6 deletions

View file

@ -94,12 +94,13 @@ resource "kubernetes_config_map" "apache_tuning" {
data = {
"mpm_prefork.conf" = <<-EOF
# Tuned for container with 6Gi memory limit
# Each worker uses ~220MB RSS, so 25 workers 5.5GB
# Each worker uses ~220MB RSS, so 50 workers 11GB (shared pages reduce actual)
# Need enough workers so probes can get through during SQLite locks
<IfModule mpm_prefork_module>
StartServers 3
MinSpareServers 2
MaxSpareServers 5
MaxRequestWorkers 25
StartServers 5
MinSpareServers 3
MaxSpareServers 10
MaxRequestWorkers 50
MaxConnectionsPerChild 200
</IfModule>
EOF

View file

@ -109,7 +109,68 @@ resource "kubernetes_config_map" "loki_alert_rules" {
}
data = {
"rules.yaml" = yamlencode({
groups = []
groups = [
{
name = "Node Health"
rules = [
{
alert = "KernelOOMKiller"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Out of memory.*Killed process\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "OOM killer active on {{ $labels.node }}"
}
},
{
alert = "KernelPanic"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Kernel panic\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "Kernel panic on {{ $labels.node }}"
}
},
{
alert = "KernelHungTask"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"blocked for more than\" [5m])) > 0"
for = "0m"
labels = {
severity = "warning"
}
annotations = {
summary = "Hung task detected on {{ $labels.node }}"
}
},
{
alert = "KernelSoftLockup"
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)soft lockup\" [5m])) > 0"
for = "0m"
labels = {
severity = "critical"
}
annotations = {
summary = "Soft lockup on {{ $labels.node }}"
}
},
{
alert = "ContainerdDown"
expr = "sum by (node) (count_over_time({job=\"node-journal\", unit=\"containerd.service\"} |~ \"(?i)(dead|failed|deactivating)\" [5m])) > 0"
for = "1m"
labels = {
severity = "critical"
}
annotations = {
summary = "containerd service unhealthy on {{ $labels.node }}"
}
},
]
}
]
})
}
}

View file

@ -586,6 +586,27 @@ serverFiles:
severity: warning
annotations:
summary: "{{ $labels.namespace }}/{{ $labels.daemonset }}: {{ $value | printf \"%.0f\" }} pod(s) missing"
- alert: NodeMemoryPressureTrending
expr: ((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100) > 85
for: 15m
labels:
severity: warning
annotations:
summary: "Memory usage on {{ $labels.instance }}: {{ $value | printf \"%.0f\" }}% (threshold: 85%)"
- alert: NodeExporterDown
expr: up{job="prometheus-prometheus-node-exporter"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Node exporter down: {{ $labels.instance }}"
- alert: NodeHighIOWait
expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 30
for: 10m
labels:
severity: warning
annotations:
summary: "IOWait on {{ $labels.instance }}: {{ $value | printf \"%.0f\" }}% (threshold: 30%)"
- alert: NoNodeLoadData
expr: (node_load1 OR on() vector(0)) == 0
for: 10m