From 50dea8f0a799d4159ee4fb28e88f3b6f70a0a77e Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 12:27:11 +0000 Subject: [PATCH] [monitoring] Add Claude OAuth token expiry monitoring + alerts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context The new CLAUDE_CODE_OAUTH_TOKEN mechanism (commit 8a054752) uses long-lived 1-year tokens minted via `claude setup-token`. Tokens don't auto-refresh — at the 1-year mark they expire hard and the upgrade agent stops working. We need to be told 30 days ahead, not find out when DIUN fires and gets 401 again. A cron rotator doesn't make sense here (tokens don't refresh, they just expire) so we alert instead. Two spares at `secret/claude-agent-service-spare-{1,2}` provide failover runway — monitor covers all three. ## This change **CronJob** (`claude-agent` ns, every 6h): reads a ConfigMap containing `<path> → expiry_unix_timestamp` entries, pushes `claude_oauth_token_expiry_timestamp{path="..."}` and `claude_oauth_expiry_monitor_last_push_timestamp` to Pushgateway at `prometheus-prometheus-pushgateway.monitoring:9091`. **ConfigMap** generated from a Terraform local `claude_oauth_token_mint_epochs` — source of truth for mint times. On rotation, update the map + apply. TTL is a shared local (365d). **PrometheusRules** (in prometheus_chart_values.tpl): - `ClaudeOAuthTokenExpiringSoon` — <30d, warning, for 1h - `ClaudeOAuthTokenCritical` — <7d, critical, for 10m - `ClaudeOAuthTokenMonitorStale` — last push >48h, warning - `ClaudeOAuthTokenMonitorNeverRun` — metric absent for 2h, warning Alert summaries include `{{ $labels.path }}` so we know which token is expiring (primary / spare-1 / spare-2). 
## Verification ``` $ kubectl -n claude-agent create job --from=cronjob/claude-oauth-expiry-monitor manual $ curl pushgateway/metrics | grep claude_oauth_token_expiry claude_oauth_token_expiry_timestamp{...,path="primary"} 1.808064429e+09 claude_oauth_token_expiry_timestamp{...,path="spare-1"} 1.80806428e+09 claude_oauth_token_expiry_timestamp{...,path="spare-2"} 1.808064429e+09 $ query: (claude_oauth_token_expiry_timestamp - time()) / 86400 primary: 365.2 days spare-1: 365.2 days spare-2: 365.2 days ``` ## Rotation playbook (future) 1. `kubectl run -it --rm --image=registry.viktorbarzin.me/claude-agent-service:latest tokmint -- claude setup-token` (or harvest via `harvest3.py` pattern in memory for headless flow) 2. `vault kv patch secret/claude-agent-service claude_oauth_token=<new-token>` 3. Update `claude_oauth_token_mint_epochs["primary"]` in `stacks/claude-agent-service/main.tf` with new unix timestamp 4. `scripts/tg apply` claude-agent-service + monitoring 5. Alert clears within 6h (next cron tick) + 1h of the `ClaudeOAuthTokenExpiringSoon` "for:" duration Co-Authored-By: Claude Opus 4.7 (1M context) --- stacks/claude-agent-service/main.tf | 97 +++++++++++++++++++ .../monitoring/prometheus_chart_values.tpl | 32 ++++++ 2 files changed, 129 insertions(+) diff --git a/stacks/claude-agent-service/main.tf b/stacks/claude-agent-service/main.tf index e9868109..b4b0b66c 100644 --- a/stacks/claude-agent-service/main.tf +++ b/stacks/claude-agent-service/main.tf @@ -471,3 +471,100 @@ resource "kubernetes_service" "claude_agent" { type = "ClusterIP" } } + +# ============================================================================= +# Token expiry monitor +# Long-lived CLAUDE_CODE_OAUTH_TOKEN values expire 1y after mint. We track +# mint timestamps here — on rotation, update the map below. A CronJob pushes +# the computed expiry_timestamp to Pushgateway, Prometheus alerts 30d out. 
+# =============================================================================
+locals {
+  claude_oauth_token_mint_epochs = {
+    # unix seconds (UTC) — when `claude setup-token` finished minting
+    "primary" = 1776514029 # 2026-04-18T12:07:09Z (TOKEN2)
+    "spare-1" = 1776513880 # 2026-04-18T12:04:40Z (TOKEN1)
+    "spare-2" = 1776514029 # 2026-04-18T12:07:09Z (TOKEN2 — redundant w/ primary)
+  }
+  claude_oauth_token_ttl_seconds = 365 * 24 * 60 * 60 # 1 year; added to each mint epoch below
+}
+
+resource "kubernetes_config_map" "claude_oauth_expiry" {
+  metadata {
+    name      = "claude-oauth-expiry"
+    namespace = kubernetes_namespace.claude_agent.metadata[0].name
+  }
+  data = { # key = token path label, value = expiry as unix seconds (string)
+    for path, mint in local.claude_oauth_token_mint_epochs :
+    path => tostring(mint + local.claude_oauth_token_ttl_seconds)
+  }
+}
+
+resource "kubernetes_cron_job_v1" "claude_oauth_expiry_monitor" {
+  metadata {
+    name      = "claude-oauth-expiry-monitor"
+    namespace = kubernetes_namespace.claude_agent.metadata[0].name
+  }
+  spec {
+    concurrency_policy            = "Replace"
+    failed_jobs_history_limit     = 3
+    successful_jobs_history_limit = 1
+    schedule                      = "17 */6 * * *" # every 6h at :17 past
+    job_template {
+      metadata {}
+      spec {
+        backoff_limit              = 1
+        ttl_seconds_after_finished = 300
+        template {
+          metadata {}
+          spec {
+            restart_policy = "OnFailure"
+            container {
+              name  = "push-expiry"
+              image = "docker.io/curlimages/curl:8.11.0"
+              command = ["/bin/sh", "-c", <<-EOT
+                set -e
+                PG='http://prometheus-prometheus-pushgateway.monitoring:9091/metrics/job/claude-oauth-expiry-monitor'
+                NOW=$(date +%s)
+                PAYLOAD=''
+                PAYLOAD="$${PAYLOAD}# HELP claude_oauth_token_expiry_timestamp Unix epoch when the CLAUDE_CODE_OAUTH_TOKEN for this path expires
+                "
+                PAYLOAD="$${PAYLOAD}# TYPE claude_oauth_token_expiry_timestamp gauge
+                "
+                for path in /mnt/expiry/*; do # one file per ConfigMap key; content is the expiry epoch
+                  name=$(basename "$path")
+                  exp=$(cat "$path")
+                  PAYLOAD="$${PAYLOAD}claude_oauth_token_expiry_timestamp{path=\"$name\"} $exp
+                "
+                done
+                PAYLOAD="$${PAYLOAD}# HELP claude_oauth_expiry_monitor_last_push_timestamp Last time the expiry monitor pushed metrics
+                "
+                PAYLOAD="$${PAYLOAD}# TYPE claude_oauth_expiry_monitor_last_push_timestamp gauge
+                "
+                PAYLOAD="$${PAYLOAD}claude_oauth_expiry_monitor_last_push_timestamp $NOW
+                "
+                echo "$PAYLOAD"
+                echo "$PAYLOAD" | curl -sS --fail-with-body --data-binary @- "$PG" # HTTP >= 400 exits non-zero, so set -e fails the Job
+                echo "pushed at $NOW"
+              EOT
+              ]
+              volume_mount {
+                name       = "expiry"
+                mount_path = "/mnt/expiry"
+              }
+              resources {
+                requests = { cpu = "10m", memory = "32Mi" }
+                limits   = { memory = "64Mi" }
+              }
+            }
+            volume {
+              name = "expiry"
+              config_map {
+                name = kubernetes_config_map.claude_oauth_expiry.metadata[0].name
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
index fa6ccf94..6e6a6d04 100755
--- a/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
+++ b/stacks/monitoring/modules/monitoring/prometheus_chart_values.tpl
@@ -1741,6 +1741,38 @@ serverFiles:
               severity: warning
             annotations:
               summary: "Email round-trip monitor never reported - check CronJob in mailserver namespace"
+          - alert: ClaudeOAuthTokenExpiringSoon
+            expr: (claude_oauth_token_expiry_timestamp{job="claude-oauth-expiry-monitor"} - time()) < (30 * 86400)
+            for: 1h
+            labels:
+              severity: warning
+            annotations:
+              summary: "Claude OAuth token {{ $labels.path }} expires in <30 days"
+              description: "Run `claude setup-token` to mint a new 1-year token and update the corresponding Vault path + mint_epoch in stacks/claude-agent-service/main.tf."
+          - alert: ClaudeOAuthTokenCritical
+            expr: (claude_oauth_token_expiry_timestamp{job="claude-oauth-expiry-monitor"} - time()) < (7 * 86400)
+            for: 10m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Claude OAuth token {{ $labels.path }} expires in <7 days — rotate NOW"
+              description: "The long-lived CLAUDE_CODE_OAUTH_TOKEN is within 1 week of expiry. Automated upgrades will break when it expires. Harvest via `claude setup-token` and update Vault + TF."
+          - alert: ClaudeOAuthTokenMonitorStale
+            expr: (time() - claude_oauth_expiry_monitor_last_push_timestamp) > (48 * 3600)
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Claude OAuth expiry monitor hasn't pushed in >48h"
+              description: "CronJob claude-oauth-expiry-monitor in claude-agent ns isn't running. Check `kubectl -n claude-agent get cronjob claude-oauth-expiry-monitor`."
+          - alert: ClaudeOAuthTokenMonitorNeverRun
+            expr: absent(claude_oauth_expiry_monitor_last_push_timestamp)
+            for: 2h
+            labels:
+              severity: warning
+            annotations:
+              summary: "Claude OAuth expiry monitor has never pushed — CronJob not running"
+              description: "Expected `claude_oauth_expiry_monitor_last_push_timestamp` to appear once the CronJob runs. Check the CronJob in claude-agent namespace."
           - alert: HackmdDown
             expr: (kube_deployment_status_replicas_available{namespace="hackmd"} or on() vector(0)) < 1
             for: 5m