revert MaxRequestWorkers to 50, exclude nextcloud from 5xx alert
- MaxRequestWorkers 25→50: with too few workers, all of them blocked on SQLite locks and liveness probes failed even faster (131 restarts vs. 50 before). 50 is a compromise that leaves enough workers free for probes.
- Excluded nextcloud from the HighServiceErrorRate alert (chronic SQLite issue).
- MySQL migration attempted but hit: GR error 3100 (fixed with GIPK), emoji in calendar/filecache (stripped), and pre-existing SQLite corruption from the crash loops. Migration rolled back; Nextcloud restored to SQLite.
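The HighServiceErrorRate change itself sits in a hunk not shown on this page. A minimal sketch of what the exclusion could look like, in the same HCL style as the loki_alert_rules map below; the LogQL expression, the "service" label, and the threshold are assumptions for illustration, not the repo's actual rule:

    {
      alert = "HighServiceErrorRate"
      # Hypothetical expression: the key part is the service!="nextcloud" matcher,
      # which drops Nextcloud's chronic SQLite-induced 5xx noise from the alert.
      expr = "sum by (service) (rate({service=~\".+\", service!=\"nextcloud\"} |~ \" 5[0-9]{2} \" [5m])) > 1"
      for  = "5m"
      labels = {
        severity = "warning"
      }
      annotations = {
        summary = "Elevated 5xx rate for {{ $labels.service }}"
      }
    }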
parent eed991a27b
commit d8bcdfef2e
3 changed files with 89 additions and 6 deletions
@@ -94,12 +94,13 @@ resource "kubernetes_config_map" "apache_tuning" {
   data = {
     "mpm_prefork.conf" = <<-EOF
       # Tuned for container with 6Gi memory limit
-      # Each worker uses ~220MB RSS, so 25 workers ≈ 5.5GB
+      # Each worker uses ~220MB RSS, so 50 workers ≈ 11GB (shared pages reduce actual)
+      # Need enough workers so probes can get through during SQLite locks
       <IfModule mpm_prefork_module>
-        StartServers              3
-        MinSpareServers           2
-        MaxSpareServers           5
-        MaxRequestWorkers         25
+        StartServers              5
+        MinSpareServers           3
+        MaxSpareServers           10
+        MaxRequestWorkers         50
         MaxConnectionsPerChild    200
       </IfModule>
     EOF
@@ -109,7 +109,68 @@ resource "kubernetes_config_map" "loki_alert_rules" {
   }
   data = {
     "rules.yaml" = yamlencode({
-      groups = []
+      groups = [
+        {
+          name  = "Node Health"
+          rules = [
+            {
+              alert = "KernelOOMKiller"
+              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Out of memory.*Killed process\" [5m])) > 0"
+              for   = "0m"
+              labels = {
+                severity = "critical"
+              }
+              annotations = {
+                summary = "OOM killer active on {{ $labels.node }}"
+              }
+            },
+            {
+              alert = "KernelPanic"
+              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Kernel panic\" [5m])) > 0"
+              for   = "0m"
+              labels = {
+                severity = "critical"
+              }
+              annotations = {
+                summary = "Kernel panic on {{ $labels.node }}"
+              }
+            },
+            {
+              alert = "KernelHungTask"
+              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"blocked for more than\" [5m])) > 0"
+              for   = "0m"
+              labels = {
+                severity = "warning"
+              }
+              annotations = {
+                summary = "Hung task detected on {{ $labels.node }}"
+              }
+            },
+            {
+              alert = "KernelSoftLockup"
+              expr  = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)soft lockup\" [5m])) > 0"
+              for   = "0m"
+              labels = {
+                severity = "critical"
+              }
+              annotations = {
+                summary = "Soft lockup on {{ $labels.node }}"
+              }
+            },
+            {
+              alert = "ContainerdDown"
+              expr  = "sum by (node) (count_over_time({job=\"node-journal\", unit=\"containerd.service\"} |~ \"(?i)(dead|failed|deactivating)\" [5m])) > 0"
+              for   = "1m"
+              labels = {
+                severity = "critical"
+              }
+              annotations = {
+                summary = "containerd service unhealthy on {{ $labels.node }}"
+              }
+            },
+          ]
+        }
+      ]
     })
   }
 }
@@ -586,6 +586,27 @@ serverFiles:
             severity: warning
           annotations:
             summary: "{{ $labels.namespace }}/{{ $labels.daemonset }}: {{ $value | printf \"%.0f\" }} pod(s) missing"
+        - alert: NodeMemoryPressureTrending
+          expr: ((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100) > 85
+          for: 15m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Memory usage on {{ $labels.instance }}: {{ $value | printf \"%.0f\" }}% (threshold: 85%)"
+        - alert: NodeExporterDown
+          expr: up{job="prometheus-prometheus-node-exporter"} == 0
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Node exporter down: {{ $labels.instance }}"
+        - alert: NodeHighIOWait
+          expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 30
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "IOWait on {{ $labels.instance }}: {{ $value | printf \"%.0f\" }}% (threshold: 30%)"
         - alert: NoNodeLoadData
           expr: (node_load1 OR on() vector(0)) == 0
           for: 10m