forgejo: survive CI-build registry-push storms (mem 3Gi + working retention)
Heavy in-cluster builds (e.g. tripit buildkit) were taking Forgejo down via two vectors. Fixes both, without moving Forgejo off the sdc HDD (code-oflt deferred):

- Memory 1Gi -> 3Gi (requests=limits). Forgejo was OOMKilled (exit 137) under registry-push load; VPA upperBound ~1.5Gi was suppressed by the 1Gi cap it kept OOMing against. Size for the push spike.
- Activate registry retention (DRY_RUN false). Verified the delete list against all running viktor/* images first: 0 running images affected. Pruned 478 -> 161 package versions; PVC was at its 50Gi autoresize ceiling.
- FIX broken retention auth: the cleanup PAT was ci-pusher's, but Forgejo scopes container packages per-user, so DELETE on viktor/* returned 403 (the dry-run only did GETs, hiding it). Repointed forgejo_cleanup_token to viktor's write:package PAT. Retention had never actually worked.
- Protect buildkit *cache* tags from retention (cleanup.sh keep-set) so the gentler-builds layer cache survives daily pruning.

[ci skip] — already applied via scripts/tg.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
fd0f4a0365
commit
c5bda77731
4 changed files with 39 additions and 14 deletions
|
|
@ -4,9 +4,17 @@
|
|||
# it's per-user runtime state inside the Forgejo DB. Driving retention from
|
||||
# a CronJob hitting the public API keeps the policy versioned in this repo.
|
||||
#
|
||||
# Auth: a write:package PAT belonging to ci-pusher (same user that pushes
|
||||
# from CI). DELETE on packages requires write:package scope. PAT lives in
|
||||
# Vault at secret/viktor/forgejo_cleanup_token.
|
||||
# Auth: a write:package PAT belonging to VIKTOR (the package OWNER). PAT
|
||||
# lives in Vault at secret/viktor/forgejo_cleanup_token.
|
||||
#
|
||||
# CORRECTION 2026-06-09: this previously said the PAT belonged to ci-pusher.
|
||||
# That was wrong and silently broke retention — Forgejo container packages
|
||||
# are scoped per-user, so ci-pusher gets HTTP 403 on DELETE of viktor/*
|
||||
# (the dry-run only does GETs, which DO work, so the 403 stayed hidden until
|
||||
# the first live run). DELETE requires a write:package PAT owned by viktor.
|
||||
# forgejo_cleanup_token is therefore set to viktor's write:package PAT (today
|
||||
# the same value as secret/ci/global/forgejo_push_token). IF that push token
|
||||
# is ever regenerated, re-mirror it here or retention silently 403s again.
|
||||
|
||||
data "vault_kv_secret_v2" "forgejo_viktor" {
|
||||
mount = "secret"
|
||||
|
|
@ -14,8 +22,12 @@ data "vault_kv_secret_v2" "forgejo_viktor" {
|
|||
}
|
||||
|
||||
locals {
|
||||
# Flip to false after first 7 days of dry-run logs look correct.
|
||||
forgejo_cleanup_dry_run = true
|
||||
# Activated 2026-06-09 after verifying a dry-run delete list against all
|
||||
# running viktor/* images cluster-wide: 0 running images on the delete set
|
||||
# (would prune 317 stale versions, keeping newest 10 + latest + cache tags).
|
||||
# Live retention is what keeps the registry PVC from filling on the HDD
|
||||
# (we deliberately did NOT move Forgejo to SSD — see beads code-oflt).
|
||||
forgejo_cleanup_dry_run = false
|
||||
}
|
||||
|
||||
resource "kubernetes_config_map" "forgejo_cleanup_script" {
|
||||
|
|
|
|||
|
|
@ -2,8 +2,13 @@
|
|||
# Forgejo container-package retention.
|
||||
#
|
||||
# For each container package owned by ${FORGEJO_OWNER}, keep newest
|
||||
# ${KEEP_LAST_N} versions + always keep tag "latest". Deletes the rest via
|
||||
# ${KEEP_LAST_N} versions + always keep tag "latest" + always keep any
|
||||
# buildkit cache tag (matches "cache", e.g. tripit:cache — these back
|
||||
# --cache-from/--cache-to and must survive retention or every build is a
|
||||
# cold rebuild). Deletes the rest via
|
||||
# DELETE /api/v1/packages/{owner}/container/{name}/{version}.
|
||||
# (Note: an 8-char SHA tag is pure hex and cannot contain "cache" — 'h' is
|
||||
# not a hex digit — so the cache match never catches a SHA-tagged image; any other tag containing "cache" (case-insensitive) is kept as well.)
|
||||
#
|
||||
# DRY_RUN=true logs what would be deleted but issues no DELETE calls.
|
||||
#
|
||||
|
|
@ -72,9 +77,11 @@ for NAME in $NAMES; do
|
|||
N_VERSIONS=$(jq 'length' "$TMPDIR/$NAME.json")
|
||||
echo "[$NAME] $N_VERSIONS version(s)"
|
||||
|
||||
# Build the keep set: top $KEEP + anything tagged 'latest'.
|
||||
# Build the keep set: top $KEEP + always 'latest' + any buildkit cache tag.
|
||||
jq -r --argjson keep "$KEEP" '
|
||||
[.[0:$keep][].version] + [.[] | select(.version == "latest") | .version]
|
||||
[.[0:$keep][].version]
|
||||
+ [.[] | select(.version == "latest") | .version]
|
||||
+ [.[] | select(.version | test("cache"; "i")) | .version]
|
||||
| unique
|
||||
| .[]
|
||||
' "$TMPDIR/$NAME.json" > "$TMPDIR/$NAME.keep"
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ resource "kubernetes_namespace" "forgejo" {
|
|||
name = "forgejo"
|
||||
labels = {
|
||||
"istio-injection" : "disabled"
|
||||
tier = local.tiers.edge
|
||||
tier = local.tiers.edge
|
||||
"keel.sh/enrolled" = "true"
|
||||
}
|
||||
}
|
||||
|
|
@ -94,7 +94,7 @@ resource "kubernetes_deployment" "forgejo" {
|
|||
fs_group = 1000
|
||||
}
|
||||
container {
|
||||
name = "forgejo"
|
||||
name = "forgejo"
|
||||
# Pinned to 11.0.14 (latest 11.x as of 2026-05-12) — was on
|
||||
# floating `:11`. On 2026-05-24T15:35:37Z Keel force-policy
|
||||
# rewrote the tag from `11.0.14 → 1.18` (Gitea-era Forgejo
|
||||
|
|
@ -168,13 +168,19 @@ resource "kubernetes_deployment" "forgejo" {
|
|||
name = "data"
|
||||
mount_path = "/data"
|
||||
}
|
||||
# Bumped 1Gi -> 3Gi 2026-06-09: Forgejo was OOMKilled (exit 137)
|
||||
# under registry-push load from in-cluster CI builds (tripit
|
||||
# buildkit pushes large layers into the OCI registry). VPA
|
||||
# upperBound reads ~1.5Gi, but that's suppressed by the 1Gi cap it
|
||||
# kept OOMing against — size for the push spike, not steady-state.
|
||||
# requests=limits (Guaranteed QoS) per the repo memory convention.
|
||||
resources {
|
||||
requests = {
|
||||
cpu = "15m"
|
||||
memory = "1Gi"
|
||||
memory = "3Gi"
|
||||
}
|
||||
limits = {
|
||||
memory = "1Gi"
|
||||
memory = "3Gi"
|
||||
}
|
||||
}
|
||||
port {
|
||||
|
|
@ -202,7 +208,7 @@ resource "kubernetes_deployment" "forgejo" {
|
|||
metadata[0].annotations["keel.sh/match-tag"],
|
||||
metadata[0].annotations["keel.sh/trigger"],
|
||||
metadata[0].annotations["keel.sh/pollSchedule"], # KYVERNO_LIFECYCLE_V2
|
||||
spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates
|
||||
spec[0].template[0].spec[0].container[0].image, # KEEL_IGNORE_IMAGE — Keel manages tag updates
|
||||
metadata[0].annotations["kubernetes.io/change-cause"],
|
||||
metadata[0].annotations["deployment.kubernetes.io/revision"],
|
||||
spec[0].template[0].metadata[0].annotations["keel.sh/update-time"],
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue