infra/stacks/ci-pipeline-health/main.tf
Viktor Barzin 02785987dd
All checks were successful
ci/woodpecker/push/default Pipeline was successful
ci/woodpecker/push/build-cli Pipeline was successful
ci-pipeline-health: image :latest+Always — registry lost the 2fd7670d tag
The sha tag other claude-agent-service CronJobs pin no longer exists in
the Forgejo registry (node caches mask it); fresh pulls 404. Follow the
owned-app CronJob convention until infra#19 moves this image to ghcr.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-12 21:06:20 +00:00

165 lines
4.9 KiB
HCL

# ci-pipeline-health — daily sweep of the off-infra CI chain (ADR-0002).
#
# Viktor's standing instruction (2026-06-12): monitor the pipelines closely
# during/after the off-infra builds migration (PRD infra#10). Deterministic
# shell sweep (files/sweep.sh) on the claude-agent-service image: GitHub
# Actions failures/stuck runs across owned repos, Woodpecker pipeline
# failures, GHA free-tier minutes burn. Healthy => one quiet Slack line;
# issues => Slack alert + a comment on infra#10.
#
# Runs IN-CLUSTER (not a claude.ai cloud routine) because Vault and the
# Woodpecker token are LAN-only — cloud agents can't reach them.
#
# First apply rode the DIFF_BASE fix (pipeline-128 merge-commit detection bug).
variable "schedule" {
type = string
# 07:30 UTC = 08:30 London in summer (07:30 in winter — acceptable drift,
# CronJob schedules are UTC-only).
default = "30 7 * * *"
}
# :latest + Always per the owned-app CronJob convention. NOTE: the registry
# no longer holds the sha tag the other claude-agent-service CronJobs pin
# (2fd7670d) — they survive on node image caches only. When issue infra#19
# migrates claude-agent-service to ghcr, repoint this image too.
variable "image_tag" {
type = string
default = "latest"
}
locals {
namespace = "ci-pipeline-health"
image = "forgejo.viktorbarzin.me/viktor/claude-agent-service:${var.image_tag}"
labels = {
app = "ci-pipeline-health"
}
}
resource "kubernetes_namespace" "ci_pipeline_health" {
metadata {
name = local.namespace
labels = {
tier = local.tiers.aux
}
}
}
# github_pat (NOT the ghcr_pull_token alias): the sweep reads Actions runs +
# billing on PRIVATE mirrors, which a future scoped read:packages rotation of
# the alias could not do. Blast radius = this single-CronJob namespace.
resource "kubernetes_manifest" "external_secret" {
manifest = {
apiVersion = "external-secrets.io/v1beta1"
kind = "ExternalSecret"
metadata = {
name = "ci-pipeline-health-creds"
namespace = kubernetes_namespace.ci_pipeline_health.metadata[0].name
}
spec = {
refreshInterval = "15m"
secretStoreRef = {
name = "vault-kv"
kind = "ClusterSecretStore"
}
target = {
name = "ci-pipeline-health-creds"
}
data = [
{
secretKey = "GITHUB_PAT"
remoteRef = { key = "viktor", property = "github_pat" }
},
{
secretKey = "WOODPECKER_API_TOKEN"
remoteRef = { key = "ci/global", property = "woodpecker_api_token" }
},
{
secretKey = "SLACK_WEBHOOK"
remoteRef = { key = "ci/global", property = "slack_webhook" }
},
]
}
}
}
resource "kubernetes_config_map" "sweep_script" {
metadata {
name = "ci-pipeline-health-sweep"
namespace = kubernetes_namespace.ci_pipeline_health.metadata[0].name
}
data = {
"sweep.sh" = file("${path.module}/files/sweep.sh")
}
}
resource "kubernetes_cron_job_v1" "sweep" {
metadata {
name = "ci-pipeline-health"
namespace = kubernetes_namespace.ci_pipeline_health.metadata[0].name
labels = local.labels
}
spec {
schedule = var.schedule
concurrency_policy = "Forbid"
successful_jobs_history_limit = 3
failed_jobs_history_limit = 3
job_template {
metadata {
labels = local.labels
}
spec {
backoff_limit = 1
active_deadline_seconds = 600
ttl_seconds_after_finished = 86400
template {
metadata {
labels = local.labels
}
spec {
restart_policy = "Never"
image_pull_secrets {
name = "registry-credentials"
}
container {
name = "sweep"
image = local.image
image_pull_policy = "Always"
command = ["/bin/sh", "/scripts/sweep.sh"]
env_from {
secret_ref {
name = "ci-pipeline-health-creds"
}
}
volume_mount {
name = "sweep-script"
mount_path = "/scripts"
read_only = true
}
resources {
requests = {
cpu = "50m"
memory = "64Mi"
}
limits = {
memory = "128Mi"
}
}
}
volume {
name = "sweep-script"
config_map {
name = kubernetes_config_map.sweep_script.metadata[0].name
}
}
}
}
}
}
}
lifecycle {
# KYVERNO_LIFECYCLE_V1
ignore_changes = [spec[0].job_template[0].spec[0].template[0].spec[0].dns_config]
}
depends_on = [kubernetes_manifest.external_secret]
}