[forgejo] Phase 0 of registry consolidation: prepare Forgejo OCI registry
Stage 1 of moving private images off the registry:2 container at registry.viktorbarzin.me:5050 (which has hit distribution#3324 corruption 3x in 3 weeks) onto Forgejo's built-in OCI registry. No cutover risk — pods still pull from the existing registry until Phase 3.

What changes:

* Forgejo deployment: memory 384Mi→1Gi, PVC 5Gi→15Gi (cap 50Gi). Explicit FORGEJO__packages__ENABLED + CHUNKED_UPLOAD_PATH (defensive, v11 default-on).
* ingress_factory: max_body_size variable was declared but never wired in after the nginx→Traefik migration. Now creates a per-ingress Buffering middleware when set; default null = no limit (preserves existing behavior). Forgejo ingress sets max_body_size=5g to allow multi-GB layer pushes.
* Cluster-wide registry-credentials Secret: 4th auths entry for forgejo.viktorbarzin.me, populated from Vault secret/viktor/forgejo_pull_token (cluster-puller PAT, read:package). Existing Kyverno ClusterPolicy syncs cluster-wide — no policy edits.
* Containerd hosts.toml redirect: forgejo.viktorbarzin.me → in-cluster Traefik LB 10.0.20.200 (avoids hairpin NAT for in-cluster pulls). Cloud-init for new VMs + scripts/setup-forgejo-containerd-mirror.sh for existing nodes.
* Forgejo retention CronJob (0 4 * * *): keeps newest 10 versions per package + always :latest. First 7 days dry-run (DRY_RUN=true); flip the local in cleanup.tf after log review.
* Forgejo integrity probe CronJob (*/15): same algorithm as the existing registry-integrity-probe. Existing Prometheus alerts (RegistryManifestIntegrityFailure et al) made instance-aware so they cover both registries during the bake.
* Docs: design+plan in docs/plans/, setup runbook in docs/runbooks/.

Operational note — the apply order is non-trivial because the new Vault keys (forgejo_pull_token, forgejo_cleanup_token, secret/ci/global/forgejo_*) must exist BEFORE terragrunt apply in the kyverno + monitoring + forgejo stacks. The setup runbook documents the bootstrap sequence.
Phase 1 (per-project dual-push pipelines) follows in subsequent commits. Bake clock starts when the last project goes dual-push. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
b1c21f78b9
commit
5d22b449f9
13 changed files with 1072 additions and 10 deletions
120
stacks/forgejo/cleanup.tf
Normal file
120
stacks/forgejo/cleanup.tf
Normal file
|
|
@ -0,0 +1,120 @@
|
|||
# Forgejo container-package retention CronJob.
|
||||
#
|
||||
# Forgejo's per-package "Cleanup Rules" UI is not exposed via Terraform —
|
||||
# it's per-user runtime state inside the Forgejo DB. Driving retention from
|
||||
# a CronJob hitting the public API keeps the policy versioned in this repo.
|
||||
#
|
||||
# Auth: a write:package PAT belonging to ci-pusher (same user that pushes
|
||||
# from CI). DELETE on packages requires write:package scope. PAT lives in
|
||||
# Vault at secret/viktor/forgejo_cleanup_token.
|
||||
|
||||
# Vault KV v2 document "viktor" on the "secret" mount; the cleanup PAT is read
# from its forgejo_cleanup_token key.
data "vault_kv_secret_v2" "forgejo_viktor" {
  name  = "viktor"
  mount = "secret"
}
|
||||
|
||||
locals {
  # Safety switch for the retention CronJob: while true, DRY_RUN=true is
  # passed to the job. Set to false once the first 7 days of dry-run logs
  # have been reviewed and look correct.
  forgejo_cleanup_dry_run = true
}
|
||||
|
||||
# Publishes files/cleanup.sh as a ConfigMap in the forgejo namespace so the
# retention CronJob can mount it as a volume.
resource "kubernetes_config_map" "forgejo_cleanup_script" {
  metadata {
    namespace = kubernetes_namespace.forgejo.metadata[0].name
    name      = "forgejo-cleanup-script"
  }

  data = {
    "cleanup.sh" = file("${path.module}/files/cleanup.sh")
  }
}
|
||||
|
||||
# Kubernetes Secret carrying the cleanup PAT copied out of Vault; exposed to
# the retention job under the key FORGEJO_TOKEN.
resource "kubernetes_secret" "forgejo_cleanup_token" {
  type = "Opaque"

  metadata {
    namespace = kubernetes_namespace.forgejo.metadata[0].name
    name      = "forgejo-cleanup-token"
  }

  data = {
    FORGEJO_TOKEN = data.vault_kv_secret_v2.forgejo_viktor.data["forgejo_cleanup_token"]
  }
}
|
||||
|
||||
# Daily (04:00) retention job: runs the mounted cleanup.sh in an Alpine
# container, pointed at the forgejo.forgejo.svc.cluster.local service.
resource "kubernetes_cron_job_v1" "forgejo_cleanup" {
  metadata {
    namespace = kubernetes_namespace.forgejo.metadata[0].name
    name      = "forgejo-cleanup"
  }

  spec {
    schedule                      = "0 4 * * *"
    concurrency_policy            = "Forbid"
    successful_jobs_history_limit = 3
    failed_jobs_history_limit     = 3

    job_template {
      metadata {}

      spec {
        ttl_seconds_after_finished = 3600
        backoff_limit              = 1

        template {
          metadata {}

          spec {
            restart_policy = "OnFailure"

            # The retention script, mounted from the ConfigMap.
            volume {
              name = "scripts"

              config_map {
                default_mode = "0755"
                name         = kubernetes_config_map.forgejo_cleanup_script.metadata[0].name
              }
            }

            container {
              name    = "cleanup"
              image   = "docker.io/library/alpine:3.20"
              command = ["/bin/sh", "/scripts/cleanup.sh"]

              # API token comes from the Secret, never from plain config.
              env {
                name = "FORGEJO_TOKEN"

                value_from {
                  secret_key_ref {
                    key  = "FORGEJO_TOKEN"
                    name = kubernetes_secret.forgejo_cleanup_token.metadata[0].name
                  }
                }
              }

              # Plain-text settings read by cleanup.sh, emitted in this order.
              dynamic "env" {
                for_each = [
                  { name = "FORGEJO_HOST", value = "http://forgejo.forgejo.svc.cluster.local" },
                  { name = "FORGEJO_OWNER", value = "viktor" },
                  { name = "KEEP_LAST_N", value = "10" },
                  { name = "DRY_RUN", value = local.forgejo_cleanup_dry_run ? "true" : "false" },
                ]

                content {
                  name  = env.value.name
                  value = env.value.value
                }
              }

              volume_mount {
                mount_path = "/scripts"
                name       = "scripts"
              }

              resources {
                limits = {
                  memory = "96Mi"
                }
                requests = {
                  memory = "32Mi"
                  cpu    = "10m"
                }
              }
            }
          }
        }
      }
    }
  }

  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
|
||||
109
stacks/forgejo/files/cleanup.sh
Normal file
109
stacks/forgejo/files/cleanup.sh
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
#!/bin/sh
# Forgejo container-package retention.
#
# For each container package owned by ${FORGEJO_OWNER}, keep the newest
# ${KEEP_LAST_N} versions (by created_at) and always keep the version named
# "latest". Every other version is removed via
#   DELETE /api/v1/packages/{owner}/container/{name}/{version}
#
# DRY_RUN=true logs what would be deleted but issues no DELETE calls.
#
# Required env:
#   FORGEJO_HOST   e.g. http://forgejo.forgejo.svc.cluster.local
#   FORGEJO_OWNER  e.g. viktor
#   FORGEJO_TOKEN  PAT, sent as "Authorization: token <PAT>"
# Optional env:
#   KEEP_LAST_N    non-negative integer (default 10)
#   DRY_RUN        true|false (default true)
#
# Exit status:
#   0  listing succeeded and every DELETE (if any) was accepted
#   1  at least one DELETE was rejected or failed in transport
#   2  KEEP_LAST_N is not a non-negative integer
#   (any other non-zero: a listing/jq step failed under `set -e`)

set -eu

apk add --no-cache curl jq >/dev/null

OWNER="${FORGEJO_OWNER}"
KEEP="${KEEP_LAST_N:-10}"
DRY="${DRY_RUN:-true}"
BASE="${FORGEJO_HOST%/}/api/v1"

# KEEP is handed to jq as a raw JSON value and used as a slice bound, so a
# typo such as "null" or "-1" would silently change which versions survive.
case "$KEEP" in
  '' | *[!0-9]*)
    echo "ERROR: KEEP_LAST_N must be a non-negative integer, got '$KEEP'" >&2
    exit 2
    ;;
esac

AUTH_HEADER="Authorization: token $FORGEJO_TOKEN"

echo "Forgejo cleanup: owner=$OWNER keep_last=$KEEP dry_run=$DRY"
echo "API base: $BASE"

# Page through ALL container packages, accumulating them in $ALL.
WORKDIR=$(mktemp -d)
trap 'rm -rf "$WORKDIR"' EXIT
ALL="$WORKDIR/all.json"
echo "[]" > "$ALL"

PAGE=1
while :; do
  RESP=$(curl -sf -H "$AUTH_HEADER" \
    "$BASE/packages/$OWNER?type=container&limit=50&page=$PAGE")
  # Stage the page in a file: POSIX sh has no <(...) process substitution.
  printf '%s\n' "$RESP" > "$WORKDIR/page.json"
  COUNT=$(jq 'length' "$WORKDIR/page.json")
  if [ "$COUNT" = "0" ]; then break; fi
  jq -s '.[0] + .[1]' "$ALL" "$WORKDIR/page.json" > "$WORKDIR/merged.json"
  mv "$WORKDIR/merged.json" "$ALL"
  PAGE=$((PAGE + 1))
  # Safety: never run away.
  if [ "$PAGE" -gt 100 ]; then
    echo "WARNING: stopped after 100 pages; package list may be incomplete" >&2
    break
  fi
done

TOTAL=$(jq 'length' "$ALL")
echo "Found $TOTAL package version(s)."

if [ "$TOTAL" = "0" ]; then
  echo "Nothing to do."
  exit 0
fi

# Group by name and process each group.
NAMES=$(jq -r '.[].name' "$ALL" | sort -u)

DEL=0
KEPT=0
FAILED=0
IDX=0

for NAME in $NAMES; do
  # Scratch files are numbered rather than named after the package, because
  # a package name may contain "/" and must not be used as a file name.
  IDX=$((IDX + 1))
  PKG="$WORKDIR/pkg-$IDX"
  # Percent-encode the name so a "/" inside it stays one URL path segment.
  NAME_URL=$(jq -rn --arg n "$NAME" '$n | @uri')

  # All versions of this name, sorted by created_at descending.
  jq --arg n "$NAME" '
    [.[] | select(.name == $n)]
    | sort_by(.created_at) | reverse
  ' "$ALL" > "$PKG.json"

  N_VERSIONS=$(jq 'length' "$PKG.json")
  echo "[$NAME] $N_VERSIONS version(s)"

  # Build the keep set: top $KEEP + anything tagged 'latest'.
  jq -r --argjson keep "$KEEP" '
    [.[0:$keep][].version] + [.[] | select(.version == "latest") | .version]
    | unique
    | .[]
  ' "$PKG.json" > "$PKG.keep"

  # Build the delete set. grep exits 1 when nothing is left to delete, which
  # is a normal outcome, hence the "|| true".
  jq -r '.[].version' "$PKG.json" \
    | grep -vxFf "$PKG.keep" > "$PKG.delete" || true

  D_COUNT=$(wc -l < "$PKG.delete" | tr -d ' ')
  K_COUNT=$(wc -l < "$PKG.keep" | tr -d ' ')
  echo "  keep=$K_COUNT delete=$D_COUNT"
  KEPT=$((KEPT + K_COUNT))

  while IFS= read -r VER; do
    [ -z "$VER" ] && continue
    URL="$BASE/packages/$OWNER/container/$NAME_URL/$VER"
    if [ "$DRY" = "true" ]; then
      echo "  DRY_RUN would DELETE $URL"
    else
      # On a transport error curl itself prints 000 for http_code; "|| true"
      # only keeps `set -e` from aborting the remaining deletions.
      HTTP=$(curl -s -o /dev/null -w '%{http_code}' \
        -X DELETE -H "$AUTH_HEADER" "$URL" || true)
      if [ "$HTTP" = "204" ] || [ "$HTTP" = "200" ]; then
        echo "  deleted $NAME:$VER"
      else
        echo "  FAIL $NAME:$VER HTTP $HTTP"
        FAILED=$((FAILED + 1))
      fi
    fi
    DEL=$((DEL + 1))
  done < "$PKG.delete"
done

echo "Summary: kept=$KEPT to_delete=$DEL dry_run=$DRY"

# Surface rejected deletions through the exit status so the Job is marked
# failed instead of silently succeeding.
if [ "$FAILED" -gt 0 ]; then
  echo "ERROR: $FAILED DELETE call(s) failed" >&2
  exit 1
fi
|
||||
|
|
@ -32,7 +32,7 @@ resource "kubernetes_persistent_volume_claim" "data_encrypted" {
|
|||
annotations = {
|
||||
"resize.topolvm.io/threshold" = "80%"
|
||||
"resize.topolvm.io/increase" = "50%"
|
||||
"resize.topolvm.io/storage_limit" = "20Gi"
|
||||
"resize.topolvm.io/storage_limit" = "50Gi"
|
||||
}
|
||||
}
|
||||
spec {
|
||||
|
|
@ -40,7 +40,7 @@ resource "kubernetes_persistent_volume_claim" "data_encrypted" {
|
|||
storage_class_name = "proxmox-lvm-encrypted"
|
||||
resources {
|
||||
requests = {
|
||||
storage = "5Gi"
|
||||
storage = "15Gi"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -106,6 +106,18 @@ resource "kubernetes_deployment" "forgejo" {
|
|||
name = "FORGEJO__webhook__ALLOWED_HOST_LIST"
|
||||
value = "*.svc.cluster.local"
|
||||
}
|
||||
# OCI registry (container packages). Default-on in Forgejo v11 but
|
||||
# explicit so it can't be silently disabled by an upstream config
|
||||
# change. Chunked-upload path needs a directory inside /data so it
|
||||
# survives pod restarts and shares the same PVC as the registry blobs.
|
||||
env {
|
||||
name = "FORGEJO__packages__ENABLED"
|
||||
value = "true"
|
||||
}
|
||||
env {
|
||||
name = "FORGEJO__packages__CHUNKED_UPLOAD_PATH"
|
||||
value = "/data/tmp/package-upload"
|
||||
}
|
||||
volume_mount {
|
||||
name = "data"
|
||||
mount_path = "/data"
|
||||
|
|
@ -113,10 +125,10 @@ resource "kubernetes_deployment" "forgejo" {
|
|||
resources {
|
||||
requests = {
|
||||
cpu = "15m"
|
||||
memory = "384Mi"
|
||||
memory = "1Gi"
|
||||
}
|
||||
limits = {
|
||||
memory = "384Mi"
|
||||
memory = "1Gi"
|
||||
}
|
||||
}
|
||||
port {
|
||||
|
|
@ -165,6 +177,9 @@ module "ingress" {
|
|||
namespace = kubernetes_namespace.forgejo.metadata[0].name
|
||||
name = "forgejo"
|
||||
tls_secret_name = var.tls_secret_name
|
||||
# OCI registry pushes ship full image layer blobs in one request; default
|
||||
# Traefik buffering chokes on anything past a few hundred MB.
|
||||
max_body_size = "5g"
|
||||
extra_annotations = {
|
||||
"gethomepage.dev/enabled" = "true"
|
||||
"gethomepage.dev/name" = "Forgejo"
|
||||
|
|
|
|||
|
|
@ -83,6 +83,12 @@ module "k8s-node-template" {
|
|||
mkdir -p /etc/containerd/certs.d/registry.viktorbarzin.me
|
||||
printf 'server = "https://registry.viktorbarzin.me"\n\n[host."https://10.0.20.10:5050"]\n capabilities = ["pull", "resolve", "push"]\n skip_verify = true\n' > /etc/containerd/certs.d/registry.viktorbarzin.me/hosts.toml
|
||||
|
||||
# Forgejo OCI registry: redirect to in-cluster Traefik LB (10.0.20.200) so
|
||||
# pulls don't hairpin out through the WAN gateway. Traefik serves the
|
||||
# *.viktorbarzin.me wildcard so SNI verification still passes.
|
||||
mkdir -p /etc/containerd/certs.d/forgejo.viktorbarzin.me
|
||||
printf 'server = "https://forgejo.viktorbarzin.me"\n\n[host."https://10.0.20.200"]\n capabilities = ["pull", "resolve"]\n' > /etc/containerd/certs.d/forgejo.viktorbarzin.me/hosts.toml
|
||||
|
||||
# Low-traffic registries (registry.k8s.io, quay.io, reg.kyverno.io) pull directly.
|
||||
# Pull-through cache removed: caused corrupted images (truncated downloads)
|
||||
# breaking VPA certgen and Kyverno image pulls.
|
||||
|
|
|
|||
|
|
@ -29,6 +29,12 @@ resource "kubernetes_secret" "registry_credentials" {
|
|||
"10.0.20.10:5050" = {
|
||||
auth = base64encode("${data.vault_kv_secret_v2.viktor.data["registry_user"]}:${data.vault_kv_secret_v2.viktor.data["registry_password"]}")
|
||||
}
|
||||
# Forgejo OCI registry — read-only PAT for the cluster-puller service
|
||||
# account user. Pushes go through ci-pusher (separate PAT in Vault
|
||||
# secret/ci/global, surfaced to Woodpecker).
|
||||
"forgejo.viktorbarzin.me" = {
|
||||
auth = base64encode("cluster-puller:${data.vault_kv_secret_v2.viktor.data["forgejo_pull_token"]}")
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
|
|||
|
|
@ -33,5 +33,6 @@ module "monitoring" {
|
|||
kube_config_path = var.kube_config_path
|
||||
registry_user = data.vault_kv_secret_v2.viktor.data["registry_user"]
|
||||
registry_password = data.vault_kv_secret_v2.viktor.data["registry_password"]
|
||||
forgejo_pull_token = data.vault_kv_secret_v2.viktor.data["forgejo_pull_token"]
|
||||
tier = local.tiers.cluster
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,6 +41,11 @@ variable "registry_password" {
|
|||
type = string
|
||||
sensitive = true
|
||||
}
|
||||
# Read-only Forgejo token handed in by the calling stack; marked sensitive so
# it is redacted from plan/apply output.
variable "forgejo_pull_token" {
  description = "PAT for the cluster-puller user, used by the Forgejo registry integrity probe."
  sensitive   = true
  type        = string
}
|
||||
|
||||
resource "kubernetes_namespace" "monitoring" {
|
||||
metadata {
|
||||
|
|
@ -426,6 +431,203 @@ resource "kubernetes_cron_job_v1" "registry_integrity_probe" {
|
|||
}
|
||||
}
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Forgejo registry integrity probe — same algorithm as registry-integrity-probe
|
||||
# above, but targets the Forgejo OCI registry instead of registry-private. Runs
|
||||
# in parallel with the existing probe during the dual-push bake; once Phase 4
|
||||
# decommissions registry-private, the registry-integrity-probe CronJob is
|
||||
# deleted and only this one remains.
|
||||
#
|
||||
# Auth: HTTP Basic with cluster-puller PAT (read:package scope is enough to
|
||||
# walk catalog + manifests). Reaches Forgejo via the in-cluster service so we
|
||||
# don't hairpin out through Traefik for every probe run.
|
||||
# -----------------------------------------------------------------------------
|
||||
# Basic-auth pair (REG_USER / REG_PASS) for the Forgejo integrity probe,
# stored in the monitoring namespace.
resource "kubernetes_secret" "forgejo_probe_credentials" {
  type = "Opaque"

  metadata {
    namespace = kubernetes_namespace.monitoring.metadata[0].name
    name      = "forgejo-probe-credentials"
  }

  data = {
    REG_PASS = var.forgejo_pull_token
    REG_USER = "cluster-puller"
  }
}
|
||||
|
||||
# Every 15 minutes: list the registry catalog, fetch the manifests of the last
# TAGS_PER_REPO tags of each repository (plus every child of an index /
# manifest list), and push the resulting gauges to the Pushgateway labelled
# with instance=$REGISTRY_INSTANCE. The job exits non-zero when the catalog is
# unreachable or any manifest reference is broken.
resource "kubernetes_cron_job_v1" "forgejo_integrity_probe" {
  metadata {
    name      = "forgejo-integrity-probe"
    namespace = kubernetes_namespace.monitoring.metadata[0].name
  }
  spec {
    concurrency_policy            = "Forbid"
    failed_jobs_history_limit     = 3
    successful_jobs_history_limit = 3
    schedule                      = "*/15 * * * *"
    job_template {
      metadata {}
      spec {
        backoff_limit              = 1
        ttl_seconds_after_finished = 600
        template {
          metadata {}
          spec {
            container {
              name  = "forgejo-integrity-probe"
              image = "docker.io/library/alpine:3.20"
              env {
                name = "REG_USER"
                value_from {
                  secret_key_ref {
                    name = kubernetes_secret.forgejo_probe_credentials.metadata[0].name
                    key  = "REG_USER"
                  }
                }
              }
              env {
                name = "REG_PASS"
                value_from {
                  secret_key_ref {
                    name = kubernetes_secret.forgejo_probe_credentials.metadata[0].name
                    key  = "REG_PASS"
                  }
                }
              }
              env {
                name  = "REGISTRY_HOST"
                value = "forgejo.forgejo.svc.cluster.local"
              }
              env {
                name  = "REGISTRY_SCHEME"
                value = "http"
              }
              # Value of the "instance" label on every pushed metric.
              env {
                name  = "REGISTRY_INSTANCE"
                value = "forgejo.viktorbarzin.me"
              }
              env {
                name  = "PUSHGATEWAY"
                value = "http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/forgejo-integrity-probe"
              }
              env {
                name  = "TAGS_PER_REPO"
                value = "5"
              }
              # NOTE: inside this heredoc "$$" + "{" and "%%" + "{" are the
              # Terraform escapes for a literal shell "$" + "{" / curl "%" + "{".
              # The METRICS terminators must stay at the heredoc's base indent
              # so they land in column 0 of the rendered script.
              command = ["/bin/sh", "-c", <<-EOT
                set -eu
                apk add --no-cache curl jq >/dev/null

                REG="$REGISTRY_HOST"
                SCHEME="$${REGISTRY_SCHEME:-https}"
                INSTANCE="$REGISTRY_INSTANCE"
                AUTH="$REG_USER:$REG_PASS"
                ACCEPT='application/vnd.oci.image.index.v1+json,application/vnd.oci.image.manifest.v1+json,application/vnd.docker.distribution.manifest.list.v2+json,application/vnd.docker.distribution.manifest.v2+json'

                # Best-effort push of the metrics read from stdin.
                push() {
                  curl -sf --max-time 10 --data-binary @- "$PUSHGATEWAY" >/dev/null 2>&1 || true
                }

                CATALOG=$(curl -sk -u "$AUTH" --max-time 30 "$SCHEME://$REG/v2/_catalog?n=1000" || echo "")
                REPOS=$(echo "$CATALOG" | jq -r '.repositories[]?' 2>/dev/null || echo "")

                if [ -z "$REPOS" ]; then
                  echo "ERROR: empty catalog or auth failure — cannot probe"
                  NOW=$(date +%s)
                  push <<METRICS
                # TYPE registry_manifest_integrity_catalog_accessible gauge
                registry_manifest_integrity_catalog_accessible{instance="$INSTANCE"} 0
                # TYPE registry_manifest_integrity_last_run_timestamp gauge
                registry_manifest_integrity_last_run_timestamp{instance="$INSTANCE"} $NOW
                METRICS
                  exit 1
                fi

                FAIL=0
                REPOS_N=0
                TAGS_N=0
                INDEXES_N=0

                printf '%s\n' $REPOS > /tmp/repos.txt
                while IFS= read -r repo; do
                  [ -z "$repo" ] && continue
                  REPOS_N=$((REPOS_N + 1))

                  TAGS_JSON=$(curl -sk -u "$AUTH" --max-time 15 "$SCHEME://$REG/v2/$repo/tags/list" || echo "")
                  echo "$TAGS_JSON" | jq -r '.tags[]?' 2>/dev/null | tail -n "$TAGS_PER_REPO" > /tmp/tags.txt || true

                  while IFS= read -r tag; do
                    [ -z "$tag" ] && continue
                    TAGS_N=$((TAGS_N + 1))

                    # "|| true": a timeout or connection error must be counted
                    # as a failure (curl reports HTTP 000), not abort the
                    # script under set -e before any metrics are pushed.
                    HTTP=$(curl -sk -u "$AUTH" -o /tmp/m.json -w '%%{http_code}' \
                      -H "Accept: $ACCEPT" --max-time 15 \
                      "$SCHEME://$REG/v2/$repo/manifests/$tag" || true)
                    if [ "$HTTP" != "200" ]; then
                      echo "FAIL: $repo:$tag manifest HTTP $HTTP"
                      FAIL=$((FAIL + 1))
                      continue
                    fi

                    MT=$(jq -r '.mediaType // empty' /tmp/m.json 2>/dev/null || echo "")
                    if echo "$MT" | grep -Eq 'manifest\.list|image\.index'; then
                      INDEXES_N=$((INDEXES_N + 1))
                      jq -r '.manifests[].digest' /tmp/m.json > /tmp/children.txt 2>/dev/null || true
                      while IFS= read -r d; do
                        [ -z "$d" ] && continue
                        # Same guard as above for the per-child HEAD request.
                        CH=$(curl -sk -u "$AUTH" -o /dev/null -w '%%{http_code}' \
                          -H "Accept: $ACCEPT" --max-time 10 -I \
                          "$SCHEME://$REG/v2/$repo/manifests/$d" || true)
                        if [ "$CH" != "200" ]; then
                          echo "FAIL: $repo:$tag index child $d HTTP $CH"
                          FAIL=$((FAIL + 1))
                        fi
                      done < /tmp/children.txt
                    fi
                  done < /tmp/tags.txt
                done < /tmp/repos.txt

                NOW=$(date +%s)
                push <<METRICS
                # TYPE registry_manifest_integrity_failures gauge
                registry_manifest_integrity_failures{instance="$INSTANCE"} $FAIL
                # TYPE registry_manifest_integrity_catalog_accessible gauge
                registry_manifest_integrity_catalog_accessible{instance="$INSTANCE"} 1
                # TYPE registry_manifest_integrity_repos_checked gauge
                registry_manifest_integrity_repos_checked{instance="$INSTANCE"} $REPOS_N
                # TYPE registry_manifest_integrity_tags_checked gauge
                registry_manifest_integrity_tags_checked{instance="$INSTANCE"} $TAGS_N
                # TYPE registry_manifest_integrity_indexes_checked gauge
                registry_manifest_integrity_indexes_checked{instance="$INSTANCE"} $INDEXES_N
                # TYPE registry_manifest_integrity_last_run_timestamp gauge
                registry_manifest_integrity_last_run_timestamp{instance="$INSTANCE"} $NOW
                METRICS

                echo "Probe complete: $FAIL failures across $REPOS_N repos / $TAGS_N tags / $INDEXES_N indexes"
                if [ "$FAIL" -gt 0 ]; then exit 1; fi
                EOT
              ]
              resources {
                requests = {
                  cpu    = "10m"
                  memory = "48Mi"
                }
                limits = {
                  memory = "96Mi"
                }
              }
            }
            restart_policy = "OnFailure"
          }
        }
      }
    }
  }
  lifecycle {
    # KYVERNO_LIFECYCLE_V1: Kyverno admission webhook mutates dns_config with ndots=2
    ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
  }
}
|
||||
|
||||
# Expose Pushgateway via NodePort so the PVE host can push LVM snapshot metrics
|
||||
resource "kubernetes_service" "pushgateway_nodeport" {
|
||||
metadata {
|
||||
|
|
|
|||
|
|
@ -1656,22 +1656,22 @@ serverFiles:
|
|||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Registry has {{ $value }} broken manifest reference(s) — orphan index or missing blob"
|
||||
description: "The registry-integrity-probe CronJob in the monitoring namespace found {{ $value }} manifest/blob references that return non-200 on the private registry. Almost certainly an orphan OCI-index child from the cleanup-tags.sh+GC race. Rebuild the affected image per docs/runbooks/registry-rebuild-image.md and investigate which tag(s) the probe logs flagged."
|
||||
summary: "{{ $labels.instance }}: {{ $value }} broken manifest reference(s) — orphan index or missing blob"
|
||||
description: "The integrity probe CronJob found {{ $value }} manifest/blob references that return non-200 on {{ $labels.instance }}. For registry.viktorbarzin.me see docs/runbooks/registry-rebuild-image.md (orphan OCI-index child from cleanup-tags.sh+GC race). For forgejo.viktorbarzin.me see docs/runbooks/forgejo-registry-rebuild-image.md."
|
||||
- alert: RegistryIntegrityProbeStale
|
||||
expr: time() - registry_manifest_integrity_last_run_timestamp > 3600
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Registry integrity probe has not reported in >1h — CronJob may be broken"
|
||||
summary: "{{ $labels.instance }} integrity probe has not reported in >1h — CronJob may be broken"
|
||||
- alert: RegistryCatalogInaccessible
|
||||
expr: registry_manifest_integrity_catalog_accessible == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Registry probe cannot fetch /v2/_catalog — auth failure or registry down"
|
||||
summary: "{{ $labels.instance }} probe cannot fetch /v2/_catalog — auth failure or registry down"
|
||||
- alert: NodeHighCPUUsage
|
||||
expr: pve_cpu_usage_ratio * 100 > 60
|
||||
for: 6h
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue