ingress latency: add histogram buckets, fix restarts, right-size memory
- Traefik: add fine-grained Prometheus histogram buckets (0.01-30s) for meaningful P50/P99
- Calibre: relax liveness probe (timeout 5→10s, failure threshold 3→6) to stop NFS-caused restarts
- Novelapp: increase memory request/limit 128Mi/256Mi → 640Mi/640Mi (confirmed OOMKilled, VPA upper 505Mi)
- Forgejo: increase memory 256Mi → 384Mi (at 80% of limit, VPA upper 311Mi)
- ActualBudget: add explicit resources to prevent silent LimitRange defaults
- Docs: update Nextcloud note from 4Gi to 8Gi limit (Apache spike history)
This commit is contained in:
parent
5652972c53
commit
1639910043
6 changed files with 17 additions and 7 deletions
|
|
@ -103,7 +103,7 @@ Repo IDs: infra=1, Website=2, finance=3, health=4, travel_blog=5, webhook-handle
|
|||
## Service-Specific Notes
|
||||
| Service | Key Operational Knowledge |
|
||||
|---------|--------------------------|
|
||||
| Nextcloud | MaxRequestWorkers=150, needs 4Gi memory, very generous startup probe |
|
||||
| Nextcloud | MaxRequestWorkers=150, needs 8Gi limit (Apache transient memory spikes, see commit eb94144), very generous startup probe |
|
||||
| Immich | ML on SSD, disable ModSecurity (breaks streaming), CUDA for ML, frequent upgrades |
|
||||
| CrowdSec | Pin version, disable Metabase when not needed (CPU hog), LAPI scaled to 3 |
|
||||
| Frigate | GPU stall detection in liveness probe (inference speed check), high CPU |
|
||||
|
|
|
|||
|
|
@ -64,6 +64,15 @@ resource "kubernetes_deployment" "actualbudget" {
|
|||
port {
|
||||
container_port = 5006
|
||||
}
|
||||
resources {
|
||||
requests = {
|
||||
cpu = "15m"
|
||||
memory = "160Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "256Mi"
|
||||
}
|
||||
}
|
||||
volume_mount {
|
||||
name = "data"
|
||||
mount_path = "/data"
|
||||
|
|
|
|||
|
|
@ -244,9 +244,9 @@ resource "kubernetes_deployment" "calibre-web-automated" {
|
|||
path = "/"
|
||||
port = 8083
|
||||
}
|
||||
timeout_seconds = 5
|
||||
timeout_seconds = 10
|
||||
period_seconds = 30
|
||||
failure_threshold = 3
|
||||
failure_threshold = 6
|
||||
}
|
||||
resources {
|
||||
requests = {
|
||||
|
|
|
|||
|
|
@ -95,10 +95,10 @@ resource "kubernetes_deployment" "forgejo" {
|
|||
resources {
|
||||
requests = {
|
||||
cpu = "15m"
|
||||
memory = "256Mi"
|
||||
memory = "384Mi"
|
||||
}
|
||||
limits = {
|
||||
memory = "256Mi"
|
||||
memory = "384Mi"
|
||||
}
|
||||
}
|
||||
port {
|
||||
|
|
|
|||
|
|
@ -144,11 +144,11 @@ resource "kubernetes_deployment" "novelapp" {
|
|||
}
|
||||
resources {
|
||||
requests = {
|
||||
memory = "128Mi"
|
||||
memory = "640Mi"
|
||||
cpu = "10m"
|
||||
}
|
||||
limits = {
|
||||
memory = "256Mi"
|
||||
memory = "640Mi"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -172,6 +172,7 @@ resource "helm_release" "traefik" {
|
|||
addEntryPointsLabels = true
|
||||
addServicesLabels = true
|
||||
addRoutersLabels = true
|
||||
buckets = "0.01,0.05,0.1,0.2,0.5,1.0,2.0,5.0,10.0,30.0"
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue