From ff3cc44a2964b526dcc3f91adca278c38dfea7f5 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 13 Jun 2026 14:02:55 +0000 Subject: [PATCH] forgejo: raise memory limit from 3Gi to 6Gi (OOMKilled at 3Gi) Forgejo OOMKilled twice on 2026-06-13 at the 3Gi cap (exit 137), briefly taking the git remote and OCI registry down and spiking ingress TTFB to 4.7s and the 4xx rate to 51%. Steady-state is ~2.2Gi but it spiked into the cap (true demand above 3.2Gi). The 2026-06-09 bump to 3Gi was sized for tripit buildkit registry pushes, but that driver is gone now that the Forgejo registry was frozen and emptied today (ADR-0002, images on ghcr), so the spike is git ops / the integrity-probe catalog walk / a possible leak. 6Gi gives headroom on the critical git backbone while we watch whether working-set keeps climbing (which would indicate a leak). Co-Authored-By: Claude Opus 4.8 --- stacks/forgejo/main.tf | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/stacks/forgejo/main.tf b/stacks/forgejo/main.tf index e1b8c351..26e317a8 100644 --- a/stacks/forgejo/main.tf +++ b/stacks/forgejo/main.tf @@ -168,19 +168,25 @@ resource "kubernetes_deployment" "forgejo" { name = "data" mount_path = "/data" } - # Bumped 1Gi -> 3Gi 2026-06-09: Forgejo was OOMKilled (exit 137) - # under registry-push load from in-cluster CI builds (tripit - # buildkit pushes large layers into the OCI registry). VPA - # upperBound reads ~1.5Gi, but that's suppressed by the 1Gi cap it - # kept OOMing against — size for the push spike, not steady-state. + # Bumped 1Gi -> 3Gi 2026-06-09, then 3Gi -> 6Gi 2026-06-13. + # OOMKilled again (exit 137) at the 3Gi cap on 2026-06-13 (2 + # restarts; degraded the git backbone + spiked ingress TTFB/4xx). + # Steady-state is ~2.2Gi but it spiked into the 3Gi cap (true + # demand > 3.2Gi, ceiling unknown). 
The original 2026-06-09 driver (tripit + # buildkit registry pushes) is GONE — the Forgejo container registry + # was frozen + emptied 2026-06-13 (ADR-0002, images moved to ghcr) — + # so the remaining spike is git ops / the integrity-probe catalog + # walk / a possible leak. Sized for generous headroom on the + # critical git remote; if working-set creeps toward 6Gi over days + # that's a leak to fix, not a reason to add more RAM. # requests=limits (Guaranteed QoS) per the repo memory convention. resources { requests = { cpu = "15m" - memory = "3Gi" + memory = "6Gi" } limits = { - memory = "3Gi" + memory = "6Gi" } } port {