fix OOM kills: tune MySQL memory, reduce Nextcloud workers, increase Uptime Kuma limit

MySQL (3 OOM kills):
- Cap group_replication_message_cache_size to 128MB (default 1GB caused OOM)
- Reduce innodb_log_buffer_size from 64MB to 16MB
- Lower max_connections from 151 to 80 (peak usage ~40)
- Increase memory limit from 3Gi to 4Gi for headroom

Nextcloud (30+ apache2 OOM kills per incident):
- Reduce MaxRequestWorkers from 50 to 10 to prevent fork bomb when SQLite locks cause request pileup
- Lower StartServers/MinSpare/MaxSpare proportionally

Uptime Kuma (Node.js memory leak):
- Increase memory limit from 256Mi to 512Mi
- Increase CPU limit from 200m to 500m

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-12 07:26:08 +00:00 · 2026-03-12 07:26:08 +00:00 · 81bfccaefc
commit 81bfccaefc
parent f2c7444159
3 changed files with 25 additions and 19 deletions
--- a/stacks/nextcloud/main.tf
+++ b/stacks/nextcloud/main.tf
@@ -1,9 +1,9 @@
 variable "tls_secret_name" {
-  type = string
+  type      = string
  sensitive = true
 }
 variable "nextcloud_db_password" {
-  type = string
+  type      = string
  sensitive = true
 }
 variable "nfs_server" { type = string }
@@ -93,15 +93,15 @@ resource "kubernetes_config_map" "apache_tuning" {
  }
  data = {
    "mpm_prefork.conf" = <<-EOF
-      # Tuned for container with 6Gi memory limit
-      # Each worker uses ~220MB RSS, so 50 workers ≈ 11GB (shared pages reduce actual)
-      # Need enough workers so probes can get through during SQLite locks
+      # Tuned for container with 6Gi memory limit and SQLite backend
+      # Each worker uses ~100-200MB RSS. 10 workers = ~2GB max
+      # Low count prevents fork bomb when SQLite locks cause request pileup
      <IfModule mpm_prefork_module>
-        StartServers            5
-        MinSpareServers         3
-        MaxSpareServers         10
-        MaxRequestWorkers       50
-        MaxConnectionsPerChild  200
+        StartServers            3
+        MinSpareServers         2
+        MaxSpareServers         5
+        MaxRequestWorkers       10
+        MaxConnectionsPerChild  100
      </IfModule>
    EOF
  }
@@ -223,12 +223,12 @@ module "ingress" {
  port            = 8080
  rybbit_site_id  = "5a3bfe59a3fe"
  extra_annotations = {
-    "gethomepage.dev/enabled"        = "true"
-    "gethomepage.dev/name"           = "Nextcloud"
-    "gethomepage.dev/description"    = "Cloud productivity suite"
-    "gethomepage.dev/icon"           = "nextcloud.png"
-    "gethomepage.dev/group"          = "Productivity"
-    "gethomepage.dev/pod-selector"   = ""
+    "gethomepage.dev/enabled"         = "true"
+    "gethomepage.dev/name"            = "Nextcloud"
+    "gethomepage.dev/description"     = "Cloud productivity suite"
+    "gethomepage.dev/icon"            = "nextcloud.png"
+    "gethomepage.dev/group"           = "Productivity"
+    "gethomepage.dev/pod-selector"    = ""
    "gethomepage.dev/widget.type"     = "nextcloud"
    "gethomepage.dev/widget.url"      = "https://nextcloud.viktorbarzin.me"
    "gethomepage.dev/widget.username" = var.homepage_credentials["nextcloud"]["username"]
--- a/stacks/platform/modules/dbaas/main.tf
+++ b/stacks/platform/modules/dbaas/main.tf
@@ -171,6 +171,12 @@ resource "helm_release" "mysql_cluster" {
        group_replication_member_expel_timeout=30
        group_replication_unreachable_majority_timeout=60
        group_replication_start_on_boot=ON
+        # Cap XCom cache to prevent unbounded growth (default 1GB causes OOM)
+        group_replication_message_cache_size=134217728
+        # Reduce log buffer (16MB sufficient for this workload, was 64MB)
+        innodb_log_buffer_size=16777216
+        # Limit connections (peak usage ~40, no need for 151)
+        max_connections=80
      EOT
    }

@@ -181,7 +187,7 @@ resource "helm_release" "mysql_cluster" {
      }
      limits = {
        cpu    = "2"
-        memory = "3Gi"
+        memory = "4Gi"
      }
    }

--- a/stacks/platform/modules/uptime-kuma/main.tf
+++ b/stacks/platform/modules/uptime-kuma/main.tf
@@ -71,8 +71,8 @@ resource "kubernetes_deployment" "uptime-kuma" {
              memory = "64Mi"
            }
            limits = {
-              cpu    = "200m"
-              memory = "256Mi"
+              cpu    = "500m"
+              memory = "512Mi"
            }
          }