revert MaxRequestWorkers to 50, exclude nextcloud from 5xx alert
- MaxRequestWorkers 25→50: with too few workers, ALL workers blocked on SQLite locks, making liveness probes fail even faster (131 restarts vs. 50 before). 50 is a compromise — enough workers for probes to get through.
- Excluded Nextcloud from the HighServiceErrorRate alert (chronic SQLite issue).
- MySQL migration attempted but hit: GR error 3100 (fixed with GIPK), emoji in calendar/filecache (stripped), and SQLite corruption (pre-existing, from crash-looping). Migration rolled back; Nextcloud restored to SQLite.
This commit is contained in:
parent
eed991a27b
commit
d8bcdfef2e
3 changed files with 89 additions and 6 deletions
|
|
@@ -94,12 +94,13 @@ resource "kubernetes_config_map" "apache_tuning" {
|
||||||
data = {
|
data = {
|
||||||
"mpm_prefork.conf" = <<-EOF
|
"mpm_prefork.conf" = <<-EOF
|
||||||
# Tuned for container with 6Gi memory limit
|
# Tuned for container with 6Gi memory limit
|
||||||
# Each worker uses ~220MB RSS, so 25 workers ≈ 5.5GB
|
# Each worker uses ~220MB RSS, so 50 workers ≈ 11GB (shared pages reduce actual)
|
||||||
|
# Need enough workers so probes can get through during SQLite locks
|
||||||
<IfModule mpm_prefork_module>
|
<IfModule mpm_prefork_module>
|
||||||
StartServers 3
|
StartServers 5
|
||||||
MinSpareServers 2
|
MinSpareServers 3
|
||||||
MaxSpareServers 5
|
MaxSpareServers 10
|
||||||
MaxRequestWorkers 25
|
MaxRequestWorkers 50
|
||||||
MaxConnectionsPerChild 200
|
MaxConnectionsPerChild 200
|
||||||
</IfModule>
|
</IfModule>
|
||||||
EOF
|
EOF
|
||||||
|
|
|
||||||
|
|
@@ -109,7 +109,68 @@ resource "kubernetes_config_map" "loki_alert_rules" {
|
||||||
}
|
}
|
||||||
data = {
|
data = {
|
||||||
"rules.yaml" = yamlencode({
|
"rules.yaml" = yamlencode({
|
||||||
groups = []
|
groups = [
|
||||||
|
{
|
||||||
|
name = "Node Health"
|
||||||
|
rules = [
|
||||||
|
{
|
||||||
|
alert = "KernelOOMKiller"
|
||||||
|
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Out of memory.*Killed process\" [5m])) > 0"
|
||||||
|
for = "0m"
|
||||||
|
labels = {
|
||||||
|
severity = "critical"
|
||||||
|
}
|
||||||
|
annotations = {
|
||||||
|
summary = "OOM killer active on {{ $labels.node }}"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert = "KernelPanic"
|
||||||
|
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)Kernel panic\" [5m])) > 0"
|
||||||
|
for = "0m"
|
||||||
|
labels = {
|
||||||
|
severity = "critical"
|
||||||
|
}
|
||||||
|
annotations = {
|
||||||
|
summary = "Kernel panic on {{ $labels.node }}"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert = "KernelHungTask"
|
||||||
|
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"blocked for more than\" [5m])) > 0"
|
||||||
|
for = "0m"
|
||||||
|
labels = {
|
||||||
|
severity = "warning"
|
||||||
|
}
|
||||||
|
annotations = {
|
||||||
|
summary = "Hung task detected on {{ $labels.node }}"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert = "KernelSoftLockup"
|
||||||
|
expr = "sum by (node) (count_over_time({job=\"node-journal\"} |~ \"(?i)soft lockup\" [5m])) > 0"
|
||||||
|
for = "0m"
|
||||||
|
labels = {
|
||||||
|
severity = "critical"
|
||||||
|
}
|
||||||
|
annotations = {
|
||||||
|
summary = "Soft lockup on {{ $labels.node }}"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert = "ContainerdDown"
|
||||||
|
expr = "sum by (node) (count_over_time({job=\"node-journal\", unit=\"containerd.service\"} |~ \"(?i)(dead|failed|deactivating)\" [5m])) > 0"
|
||||||
|
for = "1m"
|
||||||
|
labels = {
|
||||||
|
severity = "critical"
|
||||||
|
}
|
||||||
|
annotations = {
|
||||||
|
summary = "containerd service unhealthy on {{ $labels.node }}"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@@ -586,6 +586,27 @@ serverFiles:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "{{ $labels.namespace }}/{{ $labels.daemonset }}: {{ $value | printf \"%.0f\" }} pod(s) missing"
|
summary: "{{ $labels.namespace }}/{{ $labels.daemonset }}: {{ $value | printf \"%.0f\" }} pod(s) missing"
|
||||||
|
- alert: NodeMemoryPressureTrending
|
||||||
|
expr: ((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100) > 85
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Memory usage on {{ $labels.instance }}: {{ $value | printf \"%.0f\" }}% (threshold: 85%)"
|
||||||
|
- alert: NodeExporterDown
|
||||||
|
expr: up{job="prometheus-prometheus-node-exporter"} == 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Node exporter down: {{ $labels.instance }}"
|
||||||
|
- alert: NodeHighIOWait
|
||||||
|
expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 30
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "IOWait on {{ $labels.instance }}: {{ $value | printf \"%.0f\" }}% (threshold: 30%)"
|
||||||
- alert: NoNodeLoadData
|
- alert: NoNodeLoadData
|
||||||
expr: (node_load1 OR on() vector(0)) == 0
|
expr: (node_load1 OR on() vector(0)) == 0
|
||||||
for: 10m
|
for: 10m
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue