infra/stacks/forgejo/cleanup.tf

# Forgejo container-package retention CronJob.
#
# Forgejo's per-package "Cleanup Rules" UI is not exposed via Terraform —
# it's per-user runtime state inside the Forgejo DB. Driving retention from
# a CronJob hitting the public API keeps the policy versioned in this repo.
#
# Auth: a write:package PAT belonging to ci-pusher (same user that pushes
# from CI). DELETE on packages requires write:package scope. The PAT is the
# forgejo_cleanup_token key of the Vault KV v2 secret at secret/viktor.

# KV v2 secret "viktor" under the "secret" mount. The cleanup token is read
# from this secret's data map by key (forgejo_cleanup_token).
data "vault_kv_secret_v2" "forgejo_viktor" {
  name  = "viktor"
  mount = "secret"
}

locals {
  # Dry-run switch for the retention job; it is passed to the cleanup
  # container as the DRY_RUN env var ("true"/"false").
  # Flip to false after first 7 days of dry-run logs look correct.
  forgejo_cleanup_dry_run = true
}

# Ships files/cleanup.sh (relative to this module) into the Forgejo namespace
# as a ConfigMap entry named "cleanup.sh".
resource "kubernetes_config_map" "forgejo_cleanup_script" {
  metadata {
    namespace = kubernetes_namespace.forgejo.metadata[0].name
    name      = "forgejo-cleanup-script"
  }

  data = {
    # Script content is read from disk at plan time.
    "cleanup.sh" = file("${path.module}/files/cleanup.sh")
  }
}

# Opaque Secret holding the API token under the FORGEJO_TOKEN key, sourced
# from the forgejo_cleanup_token entry of the Vault secret read in this file.
resource "kubernetes_secret" "forgejo_cleanup_token" {
  type = "Opaque"

  metadata {
    namespace = kubernetes_namespace.forgejo.metadata[0].name
    name      = "forgejo-cleanup-token"
  }

  data = {
    # The try() fallback to "" lets the apply go through while the Vault key
    # is still unpopulated during Phase 0 bootstrap
    # (see docs/runbooks/forgejo-registry-setup.md). An empty token makes the
    # cleanup CronJob fail visibly, which is the intended signal.
    FORGEJO_TOKEN = try(data.vault_kv_secret_v2.forgejo_viktor.data["forgejo_cleanup_token"], "")
  }
}

# Retention CronJob: runs the ConfigMap-mounted cleanup.sh in an Alpine
# container, configured entirely through environment variables.
resource "kubernetes_cron_job_v1" "forgejo_cleanup" {
  metadata {
    namespace = kubernetes_namespace.forgejo.metadata[0].name
    name      = "forgejo-cleanup"
  }

  spec {
    # Once a day at 04:00; "Forbid" means a run is skipped while the previous
    # one is still active.
    schedule           = "0 4 * * *"
    concurrency_policy = "Forbid"

    # Keep the three most recent Jobs of each outcome for inspection.
    successful_jobs_history_limit = 3
    failed_jobs_history_limit     = 3

    job_template {
      metadata {}

      spec {
        # Finished Jobs are removed after an hour; at most one retry.
        ttl_seconds_after_finished = 3600
        backoff_limit              = 1

        template {
          metadata {}

          spec {
            restart_policy = "OnFailure"

            # cleanup.sh from the ConfigMap, mounted into the container below.
            volume {
              name = "scripts"

              config_map {
                default_mode = "0755"
                name         = kubernetes_config_map.forgejo_cleanup_script.metadata[0].name
              }
            }

            container {
              name    = "cleanup"
              image   = "docker.io/library/alpine:3.20"
              command = ["/bin/sh", "/scripts/cleanup.sh"]

              volume_mount {
                mount_path = "/scripts"
                name       = "scripts"
              }

              # Token comes from the Secret rather than a literal value.
              env {
                name = "FORGEJO_TOKEN"

                value_from {
                  secret_key_ref {
                    key  = "FORGEJO_TOKEN"
                    name = kubernetes_secret.forgejo_cleanup_token.metadata[0].name
                  }
                }
              }

              env {
                name  = "FORGEJO_HOST"
                value = "http://forgejo.forgejo.svc.cluster.local"
              }

              env {
                name  = "FORGEJO_OWNER"
                value = "viktor"
              }

              env {
                name  = "KEEP_LAST_N"
                value = "10"
              }

              # Boolean local rendered as the string "true" or "false".
              env {
                name  = "DRY_RUN"
                value = tostring(local.forgejo_cleanup_dry_run)
              }

              resources {
                limits = {
                  memory = "96Mi"
                }

                requests = {
                  memory = "32Mi"
                  cpu    = "10m"
                }
              }
            }
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
[forgejo] Phase 0 of registry consolidation: prepare Forgejo OCI registry Stage 1 of moving private images off the registry:2 container at registry.viktorbarzin.me:5050 (which has hit distribution#3324 corruption 3x in 3 weeks) onto Forgejo's built-in OCI registry. No cutover risk — pods still pull from the existing registry until Phase 3. What changes: * Forgejo deployment: memory 384Mi→1Gi, PVC 5Gi→15Gi (cap 50Gi). Explicit FORGEJO__packages__ENABLED + CHUNKED_UPLOAD_PATH (defensive, v11 default-on). * ingress_factory: max_body_size variable was declared but never wired in after the nginx→Traefik migration. Now creates a per-ingress Buffering middleware when set; default null = no limit (preserves existing behavior). Forgejo ingress sets max_body_size=5g to allow multi-GB layer pushes. * Cluster-wide registry-credentials Secret: 4th auths entry for forgejo.viktorbarzin.me, populated from Vault secret/viktor/forgejo_pull_token (cluster-puller PAT, read:package). Existing Kyverno ClusterPolicy syncs cluster-wide — no policy edits. * Containerd hosts.toml redirect: forgejo.viktorbarzin.me → in-cluster Traefik LB 10.0.20.200 (avoids hairpin NAT for in-cluster pulls). Cloud-init for new VMs + scripts/setup-forgejo-containerd-mirror.sh for existing nodes. * Forgejo retention CronJob (0 4 * * *): keeps newest 10 versions per package + always :latest. First 7 days dry-run (DRY_RUN=true); flip the local in cleanup.tf after log review. * Forgejo integrity probe CronJob (*/15): same algorithm as the existing registry-integrity-probe. Existing Prometheus alerts (RegistryManifestIntegrityFailure et al) made instance-aware so they cover both registries during the bake. Docs: design+plan in docs/plans/, setup runbook in docs/runbooks/. Operational note — the apply order is non-trivial because the new Vault keys (forgejo_pull_token, forgejo_cleanup_token, secret/ci/global/forgejo_*) must exist BEFORE terragrunt apply in the kyverno + monitoring + forgejo stacks. 
The setup runbook documents the bootstrap sequence. Phase 1 (per-project dual-push pipelines) follows in subsequent commits. Bake clock starts when the last project goes dual-push. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> 2026-05-07 15:51:34 +00:00			`# Forgejo container-package retention CronJob.`
			`#`
			`# Forgejo's per-package "Cleanup Rules" UI is not exposed via Terraform —`
			`# it's per-user runtime state inside the Forgejo DB. Driving retention from`
			`# a CronJob hitting the public API keeps the policy versioned in this repo.`
			`#`
			`# Auth: a write:package PAT belonging to ci-pusher (same user that pushes`
			`# from CI). DELETE on packages requires write:package scope. PAT lives in`
			`# Vault at secret/viktor/forgejo_cleanup_token.`

			`data "vault_kv_secret_v2" "forgejo_viktor" {`
			`mount = "secret"`
			`name = "viktor"`
			`}`

			`locals {`
			`# Flip to false after first 7 days of dry-run logs look correct.`
			`forgejo_cleanup_dry_run = true`
			`}`

			`resource "kubernetes_config_map" "forgejo_cleanup_script" {`
			`metadata {`
			`name = "forgejo-cleanup-script"`
			`namespace = kubernetes_namespace.forgejo.metadata[0].name`
			`}`
			`data = {`
			`"cleanup.sh" = file("${path.module}/files/cleanup.sh")`
			`}`
			`}`

			`resource "kubernetes_secret" "forgejo_cleanup_token" {`
			`metadata {`
			`name = "forgejo-cleanup-token"`
			`namespace = kubernetes_namespace.forgejo.metadata[0].name`
			`}`
			`type = "Opaque"`
			`data = {`
[forgejo] Tolerate missing Vault keys during Phase 0 bootstrap Wrap the three new Vault key reads in try(...) so the first apply succeeds even when forgejo_pull_token / forgejo_cleanup_token / secret/ci/global haven't been populated yet. Without this, CI auto-apply blocks on the very push that introduces the references — chicken-and-egg with the runbook order (which is: apply Forgejo bumps, then create users + PATs, then apply the rest). Empty tokens are intentionally visible-broken (auth fails, probe reports auth failure, cleanup CronJob errors) — that's the signal to run the bootstrap runbook. Subsequent apply picks up the real values. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> 2026-05-07 15:53:08 +00:00			`# try() so the apply succeeds before the Vault key is populated during`
			`# Phase 0 bootstrap (see docs/runbooks/forgejo-registry-setup.md). Empty`
			`# token causes the cleanup CronJob to fail visibly — that's intended.`
			`FORGEJO_TOKEN = try(data.vault_kv_secret_v2.forgejo_viktor.data["forgejo_cleanup_token"], "")`
[forgejo] Phase 0 of registry consolidation: prepare Forgejo OCI registry Stage 1 of moving private images off the registry:2 container at registry.viktorbarzin.me:5050 (which has hit distribution#3324 corruption 3x in 3 weeks) onto Forgejo's built-in OCI registry. No cutover risk — pods still pull from the existing registry until Phase 3. What changes: * Forgejo deployment: memory 384Mi→1Gi, PVC 5Gi→15Gi (cap 50Gi). Explicit FORGEJO__packages__ENABLED + CHUNKED_UPLOAD_PATH (defensive, v11 default-on). * ingress_factory: max_body_size variable was declared but never wired in after the nginx→Traefik migration. Now creates a per-ingress Buffering middleware when set; default null = no limit (preserves existing behavior). Forgejo ingress sets max_body_size=5g to allow multi-GB layer pushes. * Cluster-wide registry-credentials Secret: 4th auths entry for forgejo.viktorbarzin.me, populated from Vault secret/viktor/forgejo_pull_token (cluster-puller PAT, read:package). Existing Kyverno ClusterPolicy syncs cluster-wide — no policy edits. * Containerd hosts.toml redirect: forgejo.viktorbarzin.me → in-cluster Traefik LB 10.0.20.200 (avoids hairpin NAT for in-cluster pulls). Cloud-init for new VMs + scripts/setup-forgejo-containerd-mirror.sh for existing nodes. * Forgejo retention CronJob (0 4 * * *): keeps newest 10 versions per package + always :latest. First 7 days dry-run (DRY_RUN=true); flip the local in cleanup.tf after log review. * Forgejo integrity probe CronJob (*/15): same algorithm as the existing registry-integrity-probe. Existing Prometheus alerts (RegistryManifestIntegrityFailure et al) made instance-aware so they cover both registries during the bake. Docs: design+plan in docs/plans/, setup runbook in docs/runbooks/. Operational note — the apply order is non-trivial because the new Vault keys (forgejo_pull_token, forgejo_cleanup_token, secret/ci/global/forgejo_*) must exist BEFORE terragrunt apply in the kyverno + monitoring + forgejo stacks. 
The setup runbook documents the bootstrap sequence. Phase 1 (per-project dual-push pipelines) follows in subsequent commits. Bake clock starts when the last project goes dual-push. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> 2026-05-07 15:51:34 +00:00			`}`
			`}`

			`resource "kubernetes_cron_job_v1" "forgejo_cleanup" {`
			`metadata {`
			`name = "forgejo-cleanup"`
			`namespace = kubernetes_namespace.forgejo.metadata[0].name`
			`}`
			`spec {`
			`concurrency_policy = "Forbid"`
			`schedule = "0 4 * * *"`
			`failed_jobs_history_limit = 3`
			`successful_jobs_history_limit = 3`
			`job_template {`
			`metadata {}`
			`spec {`
			`backoff_limit = 1`
			`ttl_seconds_after_finished = 3600`
			`template {`
			`metadata {}`
			`spec {`
			`container {`
			`name = "cleanup"`
			`image = "docker.io/library/alpine:3.20"`
			`command = ["/bin/sh", "/scripts/cleanup.sh"]`
			`env {`
			`name = "FORGEJO_TOKEN"`
			`value_from {`
			`secret_key_ref {`
			`name = kubernetes_secret.forgejo_cleanup_token.metadata[0].name`
			`key = "FORGEJO_TOKEN"`
			`}`
			`}`
			`}`
			`env {`
			`name = "FORGEJO_HOST"`
			`value = "http://forgejo.forgejo.svc.cluster.local"`
			`}`
			`env {`
			`name = "FORGEJO_OWNER"`
			`value = "viktor"`
			`}`
			`env {`
			`name = "KEEP_LAST_N"`
			`value = "10"`
			`}`
			`env {`
			`name = "DRY_RUN"`
			`value = local.forgejo_cleanup_dry_run ? "true" : "false"`
			`}`
			`volume_mount {`
			`name = "scripts"`
			`mount_path = "/scripts"`
			`}`
			`resources {`
			`requests = {`
			`cpu = "10m"`
			`memory = "32Mi"`
			`}`
			`limits = {`
			`memory = "96Mi"`
			`}`
			`}`
			`}`
			`volume {`
			`name = "scripts"`
			`config_map {`
			`name = kubernetes_config_map.forgejo_cleanup_script.metadata[0].name`
			`default_mode = "0755"`
			`}`
			`}`
			`restart_policy = "OnFailure"`
			`}`
			`}`
			`}`
			`}`
			`}`
			`lifecycle {`
			`# KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2`
			`ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]`
			`}`
			`}`