ci-pipeline-health: daily sweep of the off-infra CI chain (ADR-0002)
Viktor asked to monitor the pipelines closely as builds move off-infra (PRD infra#10). New aux stack: daily 07:30 UTC CronJob on the claude-agent-service image running a deterministic shell sweep — GitHub Actions failures/stuck runs across owned repos, Woodpecker pipeline failures, GHA free-tier minutes burn. Healthy = one quiet Slack line; issues = Slack alert + comment on infra#10. In-cluster (not a cloud routine) because Vault + the Woodpecker token are LAN-only. Secrets via ExternalSecret (github_pat deliberately, not the ghcr_pull_token alias — a scoped packages-only rotation couldn't read Actions runs). Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
98f1f7fc24
commit
d02ca4f2db
3 changed files with 280 additions and 0 deletions
112
stacks/ci-pipeline-health/files/sweep.sh
Normal file
112
stacks/ci-pipeline-health/files/sweep.sh
Normal file
|
|
@ -0,0 +1,112 @@
|
||||||
|
#!/bin/sh
|
||||||
|
# ci-pipeline-health — daily sweep of the off-infra CI chain (ADR-0002, PRD infra#10).
|
||||||
|
# Deterministic (no LLM): GitHub Actions runs + Woodpecker pipelines + GHA minutes.
|
||||||
|
# Healthy => one quiet Slack line. Issues => Slack alert + comment on infra#10.
|
||||||
|
# POSIX sh + curl + jq only (runs on the Alpine claude-agent-service image).
|
||||||
|
# Exit 0 = sweep ran (even with findings); exit 2 = the sweep itself errored,
|
||||||
|
# which surfaces through the existing CronJob-failure alerting.
|
||||||
|
|
||||||
|
GH_API="https://api.github.com"
|
||||||
|
WP_API="https://ci.viktorbarzin.me/api"
|
||||||
|
WP_UI="https://ci.viktorbarzin.me"
|
||||||
|
|
||||||
|
NOW_EPOCH=$(date -u +%s)
|
||||||
|
SINCE_EPOCH=$((NOW_EPOCH - 86400))
|
||||||
|
SINCE_ISO=$(date -u -d "@${SINCE_EPOCH}" +%Y-%m-%dT%H:%M:%SZ)
|
||||||
|
PUSH_CUTOFF=$(date -u -d "@$((NOW_EPOCH - 259200))" +%Y-%m-%dT%H:%M:%SZ)
|
||||||
|
|
||||||
|
ISSUES=$(mktemp)
|
||||||
|
NOTES=$(mktemp)
|
||||||
|
trap 'rm -f "$ISSUES" "$NOTES"' EXIT
|
||||||
|
gha_checked=0
|
||||||
|
wp_checked=0
|
||||||
|
sweep_errors=0
|
||||||
|
|
||||||
|
gh_get() { curl -sf --max-time 30 -H "Authorization: Bearer ${GITHUB_PAT}" -H "Accept: application/vnd.github+json" "$1"; }
|
||||||
|
wp_get() { curl -sf --max-time 30 -H "Authorization: Bearer ${WOODPECKER_API_TOKEN}" "$1"; }
|
||||||
|
|
||||||
|
# --- 1) GitHub Actions runs across owned repos with a recent push ---
|
||||||
|
repos=$(gh_get "${GH_API}/user/repos?affiliation=owner&sort=pushed&per_page=60" \
|
||||||
|
| jq -r --arg cutoff "$PUSH_CUTOFF" '.[] | select(.pushed_at >= $cutoff) | .full_name')
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "sweep: failed to list GitHub repos" >>"$ISSUES"; sweep_errors=1; repos=""
|
||||||
|
fi
|
||||||
|
for repo in $repos; do
|
||||||
|
runs=$(gh_get "${GH_API}/repos/${repo}/actions/runs?created=%3E%3D${SINCE_ISO}&per_page=50")
|
||||||
|
if [ $? -ne 0 ]; then echo "sweep: failed to list runs for ${repo}" >>"$ISSUES"; sweep_errors=1; continue; fi
|
||||||
|
n=$(printf '%s' "$runs" | jq '.workflow_runs | length')
|
||||||
|
gha_checked=$((gha_checked + n))
|
||||||
|
printf '%s' "$runs" | jq -r '.workflow_runs[]
|
||||||
|
| select(.conclusion == "failure" or .conclusion == "timed_out" or .conclusion == "cancelled" or .conclusion == "action_required")
|
||||||
|
| "GHA: \(.repository.full_name) #\(.run_number) [\(.name)] \(.conclusion) \(.html_url)"' >>"$ISSUES"
|
||||||
|
printf '%s' "$runs" | jq -r --argjson now "$NOW_EPOCH" '.workflow_runs[]
|
||||||
|
| select(.status == "in_progress" or .status == "queued")
|
||||||
|
| select(($now - ((.run_started_at // .created_at) | fromdateiso8601)) > 7200)
|
||||||
|
| "GHA stuck >2h: \(.repository.full_name) #\(.run_number) [\(.name)] \(.status) \(.html_url)"' >>"$ISSUES"
|
||||||
|
done
|
||||||
|
|
||||||
|
# --- 2) Woodpecker pipelines (deploy chain) ---
|
||||||
|
wrepos=$(wp_get "${WP_API}/repos?perPage=100" | jq -r '.[] | select(.active == true) | "\(.id) \(.full_name)"')
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "sweep: failed to list Woodpecker repos" >>"$ISSUES"; sweep_errors=1; wrepos=""
|
||||||
|
fi
|
||||||
|
printf '%s\n' "$wrepos" | while IFS=' ' read -r id name; do
|
||||||
|
[ -z "$id" ] && continue
|
||||||
|
pls=$(wp_get "${WP_API}/repos/${id}/pipelines?perPage=10")
|
||||||
|
if [ $? -ne 0 ]; then echo "sweep: failed pipelines for ${name}" >>"$ISSUES"; continue; fi
|
||||||
|
printf '%s' "$pls" | jq -r --argjson since "$SINCE_EPOCH" --arg name "$name" --arg ui "$WP_UI" --arg id "$id" '
|
||||||
|
[.[] | select(.created >= $since)][]
|
||||||
|
| select(.status == "failure" or .status == "error" or .status == "killed")
|
||||||
|
| "Woodpecker: \($name) #\(.number) \(.status) (\(.event)) \($ui)/repos/\($id)/pipeline/\(.number)"' >>"$ISSUES"
|
||||||
|
printf '%s' "$pls" | jq --argjson since "$SINCE_EPOCH" '[.[] | select(.created >= $since)] | length' >>"$NOTES.wpcount" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
wp_checked=$(awk '{s+=$1} END {print s+0}' "$NOTES.wpcount" 2>/dev/null || echo 0)
|
||||||
|
rm -f "$NOTES.wpcount"
|
||||||
|
|
||||||
|
# --- 3) GHA minutes vs free tier ---
|
||||||
|
billing=$(gh_get "${GH_API}/users/ViktorBarzin/settings/billing/actions")
|
||||||
|
if [ $? -eq 0 ]; then
|
||||||
|
used=$(printf '%s' "$billing" | jq -r '.total_minutes_used')
|
||||||
|
included=$(printf '%s' "$billing" | jq -r '.included_minutes')
|
||||||
|
if [ "${included:-0}" -gt 0 ] 2>/dev/null; then
|
||||||
|
pct=$((used * 100 / included))
|
||||||
|
echo "GHA minutes: ${used}/${included} (${pct}%)" >>"$NOTES"
|
||||||
|
[ "$pct" -ge 75 ] && echo "GHA minutes at ${pct}% of the free tier (${used}/${included}) — check for runaway workflows or consider Pro" >>"$ISSUES"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "minutes check unavailable" >>"$NOTES"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# v1 scope (deliberate, not silent): Forgejo→GitHub mirror-gap detection (a
|
||||||
|
# Forgejo push that produced no GHA run) is NOT implemented yet — it needs the
|
||||||
|
# per-repo mirror inventory that lands with the offinfra-onboard rollout (#13+).
|
||||||
|
|
||||||
|
# --- Report ---
|
||||||
|
issue_count=$(grep -c . "$ISSUES" || true)
|
||||||
|
summary="ci-pipeline-health: checked ${gha_checked} GHA runs + ${wp_checked} Woodpecker pipelines (24h). $(tr '\n' '; ' <"$NOTES")"
|
||||||
|
|
||||||
|
if [ "$issue_count" -eq 0 ]; then
|
||||||
|
text=":white_check_mark: ${summary}"
|
||||||
|
else
|
||||||
|
text=":rotating_light: ci-pipeline-health: ${issue_count} issue(s)
|
||||||
|
$(sed 's/^/• /' "$ISSUES")
|
||||||
|
${summary}"
|
||||||
|
body="Daily CI sweep found ${issue_count} issue(s):
|
||||||
|
|
||||||
|
$(sed 's/^/- /' "$ISSUES")
|
||||||
|
|
||||||
|
_${summary}_"
|
||||||
|
printf '%s' "$body" | jq -Rs '{body: .}' \
|
||||||
|
| curl -sf --max-time 30 -X POST -H "Authorization: Bearer ${GITHUB_PAT}" \
|
||||||
|
-H "Accept: application/vnd.github+json" \
|
||||||
|
-d @- "${GH_API}/repos/ViktorBarzin/infra/issues/10/comments" >/dev/null \
|
||||||
|
|| { echo "sweep: failed to comment on infra#10"; sweep_errors=1; }
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf '%s' "$text" | jq -Rs '{text: .}' \
|
||||||
|
| curl -sf --max-time 30 -X POST -H 'Content-Type: application/json' -d @- "$SLACK_WEBHOOK" >/dev/null \
|
||||||
|
|| { echo "sweep: failed to post to Slack"; sweep_errors=1; }
|
||||||
|
|
||||||
|
echo "$text"
|
||||||
|
[ "$sweep_errors" -ne 0 ] && exit 2
|
||||||
|
exit 0
|
||||||
159
stacks/ci-pipeline-health/main.tf
Normal file
159
stacks/ci-pipeline-health/main.tf
Normal file
|
|
@ -0,0 +1,159 @@
|
||||||
|
# ci-pipeline-health — daily sweep of the off-infra CI chain (ADR-0002).
|
||||||
|
#
|
||||||
|
# Viktor's standing instruction (2026-06-12): monitor the pipelines closely
|
||||||
|
# during/after the off-infra builds migration (PRD infra#10). Deterministic
|
||||||
|
# shell sweep (files/sweep.sh) on the claude-agent-service image: GitHub
|
||||||
|
# Actions failures/stuck runs across owned repos, Woodpecker pipeline
|
||||||
|
# failures, GHA free-tier minutes burn. Healthy => one quiet Slack line;
|
||||||
|
# issues => Slack alert + a comment on infra#10.
|
||||||
|
#
|
||||||
|
# Runs IN-CLUSTER (not a claude.ai cloud routine) because Vault and the
|
||||||
|
# Woodpecker token are LAN-only — cloud agents can't reach them.
|
||||||
|
|
||||||
|
variable "schedule" {
|
||||||
|
type = string
|
||||||
|
# 07:30 UTC = 08:30 London in summer (07:30 in winter — acceptable drift,
|
||||||
|
# CronJob schedules are UTC-only).
|
||||||
|
default = "30 7 * * *"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Mirrors stacks/claude-agent-service image tag — the image ships curl + jq.
|
||||||
|
variable "image_tag" {
|
||||||
|
type = string
|
||||||
|
default = "2fd7670d"
|
||||||
|
}
|
||||||
|
|
||||||
|
locals {
|
||||||
|
namespace = "ci-pipeline-health"
|
||||||
|
image = "forgejo.viktorbarzin.me/viktor/claude-agent-service:${var.image_tag}"
|
||||||
|
labels = {
|
||||||
|
app = "ci-pipeline-health"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "kubernetes_namespace" "ci_pipeline_health" {
|
||||||
|
metadata {
|
||||||
|
name = local.namespace
|
||||||
|
labels = {
|
||||||
|
tier = local.tiers.aux
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# github_pat (NOT the ghcr_pull_token alias): the sweep reads Actions runs +
|
||||||
|
# billing on PRIVATE mirrors, which a future scoped read:packages rotation of
|
||||||
|
# the alias could not do. Blast radius = this single-CronJob namespace.
|
||||||
|
resource "kubernetes_manifest" "external_secret" {
|
||||||
|
manifest = {
|
||||||
|
apiVersion = "external-secrets.io/v1beta1"
|
||||||
|
kind = "ExternalSecret"
|
||||||
|
metadata = {
|
||||||
|
name = "ci-pipeline-health-creds"
|
||||||
|
namespace = kubernetes_namespace.ci_pipeline_health.metadata[0].name
|
||||||
|
}
|
||||||
|
spec = {
|
||||||
|
refreshInterval = "15m"
|
||||||
|
secretStoreRef = {
|
||||||
|
name = "vault-kv"
|
||||||
|
kind = "ClusterSecretStore"
|
||||||
|
}
|
||||||
|
target = {
|
||||||
|
name = "ci-pipeline-health-creds"
|
||||||
|
}
|
||||||
|
data = [
|
||||||
|
{
|
||||||
|
secretKey = "GITHUB_PAT"
|
||||||
|
remoteRef = { key = "viktor", property = "github_pat" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
secretKey = "WOODPECKER_API_TOKEN"
|
||||||
|
remoteRef = { key = "ci/global", property = "woodpecker_api_token" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
secretKey = "SLACK_WEBHOOK"
|
||||||
|
remoteRef = { key = "ci/global", property = "slack_webhook" }
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "kubernetes_config_map" "sweep_script" {
|
||||||
|
metadata {
|
||||||
|
name = "ci-pipeline-health-sweep"
|
||||||
|
namespace = kubernetes_namespace.ci_pipeline_health.metadata[0].name
|
||||||
|
}
|
||||||
|
data = {
|
||||||
|
"sweep.sh" = file("${path.module}/files/sweep.sh")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
resource "kubernetes_cron_job_v1" "sweep" {
|
||||||
|
metadata {
|
||||||
|
name = "ci-pipeline-health"
|
||||||
|
namespace = kubernetes_namespace.ci_pipeline_health.metadata[0].name
|
||||||
|
labels = local.labels
|
||||||
|
}
|
||||||
|
spec {
|
||||||
|
schedule = var.schedule
|
||||||
|
concurrency_policy = "Forbid"
|
||||||
|
successful_jobs_history_limit = 3
|
||||||
|
failed_jobs_history_limit = 3
|
||||||
|
job_template {
|
||||||
|
metadata {
|
||||||
|
labels = local.labels
|
||||||
|
}
|
||||||
|
spec {
|
||||||
|
backoff_limit = 1
|
||||||
|
active_deadline_seconds = 600
|
||||||
|
ttl_seconds_after_finished = 86400
|
||||||
|
template {
|
||||||
|
metadata {
|
||||||
|
labels = local.labels
|
||||||
|
}
|
||||||
|
spec {
|
||||||
|
restart_policy = "Never"
|
||||||
|
image_pull_secrets {
|
||||||
|
name = "registry-credentials"
|
||||||
|
}
|
||||||
|
container {
|
||||||
|
name = "sweep"
|
||||||
|
image = local.image
|
||||||
|
command = ["/bin/sh", "/scripts/sweep.sh"]
|
||||||
|
env_from {
|
||||||
|
secret_ref {
|
||||||
|
name = "ci-pipeline-health-creds"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
volume_mount {
|
||||||
|
name = "sweep-script"
|
||||||
|
mount_path = "/scripts"
|
||||||
|
read_only = true
|
||||||
|
}
|
||||||
|
resources {
|
||||||
|
requests = {
|
||||||
|
cpu = "50m"
|
||||||
|
memory = "64Mi"
|
||||||
|
}
|
||||||
|
limits = {
|
||||||
|
memory = "128Mi"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
volume {
|
||||||
|
name = "sweep-script"
|
||||||
|
config_map {
|
||||||
|
name = kubernetes_config_map.sweep_script.metadata[0].name
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lifecycle {
|
||||||
|
# KYVERNO_LIFECYCLE_V1
|
||||||
|
ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
|
||||||
|
}
|
||||||
|
depends_on = [kubernetes_manifest.external_secret]
|
||||||
|
}
|
||||||
9
stacks/ci-pipeline-health/terragrunt.hcl
Normal file
9
stacks/ci-pipeline-health/terragrunt.hcl
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
include "root" {
|
||||||
|
path = find_in_parent_folders()
|
||||||
|
}
|
||||||
|
|
||||||
|
# ExternalSecret hits ESO which needs to be alive when the manifest applies.
|
||||||
|
dependency "external_secrets" {
|
||||||
|
config_path = "../external-secrets"
|
||||||
|
skip_outputs = true
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue