From 42f1c3cf4f4786a30fcd7e19f638b84e72f4115f Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 10:12:02 +0000 Subject: [PATCH] [claude-agent-service] Migrate all pipelines from DevVM SSH to K8s HTTP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context The claude-agent-service K8s pod (deployed 2026-04-15) provides an HTTP API for running Claude headless agents. Three workflows still SSH'd to the DevVM (10.0.10.10) to invoke `claude -p`. This eliminates that dependency. ## This change Pipeline migrations (SSH → HTTP POST to claude-agent-service): - `.woodpecker/issue-automation.yml` — Vault auth fetches API token instead of SSH key; curl POST /execute + poll /jobs/{id} replaces SSH invocation - `scripts/postmortem-pipeline.sh` — same pattern; uses jq for safe JSON construction of TODO payloads - `.woodpecker/postmortem-todos.yml` — drop openssh-client from apk install - `stacks/n8n/workflows/diun-upgrade.json` — SSH node replaced with HTTP Request node; API token via $env.CLAUDE_AGENT_API_TOKEN (added to Vault secret/n8n) Documentation updates: - `docs/architecture/incident-response.md` — Mermaid diagram: DevVM → K8s - `docs/architecture/automated-upgrades.md` — pipeline diagram + n8n action - `AGENTS.md` — pipeline description updated ## What is NOT in this change - DevVM decommissioning (still hosts terminal/foolery services) - Removal of SSH key secrets from Vault (kept for rollback) - n8n workflow import (must be done manually in n8n UI) [ci skip] Co-Authored-By: Claude Opus 4 (1M context) --- .woodpecker/issue-automation.yml | 70 ++++++++++++++++--------- .woodpecker/postmortem-todos.yml | 2 +- AGENTS.md | 2 +- docs/architecture/automated-upgrades.md | 6 +-- docs/architecture/incident-response.md | 6 +-- scripts/postmortem-pipeline.sh | 53 ++++++++++++------- stacks/n8n/workflows/diun-upgrade.json | 24 ++++++--- 7 files changed, 104 insertions(+), 59 deletions(-) diff --git 
a/.woodpecker/issue-automation.yml b/.woodpecker/issue-automation.yml index ddfc3caa..ece97dab 100644 --- a/.woodpecker/issue-automation.yml +++ b/.woodpecker/issue-automation.yml @@ -9,52 +9,70 @@ clone: steps: - name: run-issue-responder - image: python:3.12-alpine + image: alpine:3.20 commands: - - apk add --no-cache openssh-client curl jq + - apk add --no-cache curl jq # Authenticate to Vault via K8s SA JWT - | SA_TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token) VAULT_RESP=$(curl -sf -X POST http://vault-active.vault.svc.cluster.local:8200/v1/auth/kubernetes/login \ - -d "{\"role\":\"ci\",\"jwt\":\"$SA_TOKEN\"}") - VAULT_TOKEN=$(echo "$VAULT_RESP" | jq -r .auth.client_token) - if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then + -d "{\"role\":\"ci\",\"jwt\":\"$$SA_TOKEN\"}") + VAULT_TOKEN=$(echo "$$VAULT_RESP" | jq -r .auth.client_token) + if [ -z "$$VAULT_TOKEN" ] || [ "$$VAULT_TOKEN" = "null" ]; then echo "ERROR: Vault authentication failed" exit 1 fi echo "Vault authenticated" - # Fetch DevVM SSH key + # Fetch API token for claude-agent-service - | - curl -sf -H "X-Vault-Token: $VAULT_TOKEN" \ - http://vault-active.vault.svc.cluster.local:8200/v1/secret/data/ci/infra | \ - jq -r '.data.data.devvm_ssh_key' > /tmp/devvm-key - chmod 600 /tmp/devvm-key - if [ ! 
-s /tmp/devvm-key ]; then - echo "ERROR: Failed to fetch DevVM SSH key" + AGENT_TOKEN=$(curl -sf -H "X-Vault-Token: $$VAULT_TOKEN" \ + http://vault-active.vault.svc.cluster.local:8200/v1/secret/data/claude-agent-service | \ + jq -r '.data.data.api_bearer_token') + if [ -z "$$AGENT_TOKEN" ] || [ "$$AGENT_TOKEN" = "null" ]; then + echo "ERROR: Failed to fetch agent API token" exit 1 fi - echo "SSH key fetched" - # SSH to DevVM and run issue-responder agent + echo "Agent token fetched" + # Submit job to claude-agent-service - | ISSUE_NUM="${ISSUE_NUMBER:-}" ISSUE_TITLE="${ISSUE_TITLE:-}" ISSUE_LABELS="${ISSUE_LABELS:-}" ISSUE_URL="${ISSUE_URL:-}" - if [ -z "$ISSUE_NUM" ]; then + if [ -z "$$ISSUE_NUM" ]; then echo "ERROR: No issue number provided" exit 1 fi - echo "Processing issue #$ISSUE_NUM: $ISSUE_TITLE" - echo "Labels: $ISSUE_LABELS" + echo "Processing issue #$$ISSUE_NUM: $$ISSUE_TITLE" - ssh -i /tmp/devvm-key -o StrictHostKeyChecking=no wizard@10.0.10.10 \ - "cd ~/code && git -C infra stash && git -C infra pull --rebase && git -C infra stash pop 2>/dev/null; \ - ~/.local/bin/claude -p \ - --agent infra/.claude/agents/issue-responder \ - --dangerously-skip-permissions \ - --max-budget-usd 10 \ - 'Process GitHub Issue #${ISSUE_NUM}: ${ISSUE_TITLE}. Labels: ${ISSUE_LABELS}. URL: ${ISSUE_URL}. Read the issue body via GitHub API, investigate, and take appropriate action.'" - # Cleanup - - rm -f /tmp/devvm-key + PAYLOAD=$(jq -n \ + --arg prompt "Process GitHub Issue #$$ISSUE_NUM: $$ISSUE_TITLE. Labels: $$ISSUE_LABELS. URL: $$ISSUE_URL. Read the issue body via GitHub API, investigate, and take appropriate action." 
\ + --arg agent ".claude/agents/issue-responder" \ + '{prompt: $prompt, agent: $agent, max_budget_usd: 10, timeout_seconds: 1800}') + + RESP=$(curl -sf -X POST \ + -H "Authorization: Bearer $$AGENT_TOKEN" \ + -H "Content-Type: application/json" \ + -d "$$PAYLOAD" \ + http://claude-agent-service.claude-agent.svc.cluster.local:8080/execute) + + JOB_ID=$(echo "$$RESP" | jq -r '.job_id') + echo "Job submitted: $$JOB_ID" + # Poll for completion (30min max) + - | + for i in $(seq 1 120); do + sleep 15 + RESULT=$(curl -sf \ + -H "Authorization: Bearer $$AGENT_TOKEN" \ + http://claude-agent-service.claude-agent.svc.cluster.local:8080/jobs/$$JOB_ID) + STATUS=$(echo "$$RESULT" | jq -r '.status') + echo "[$$i/120] Status: $$STATUS" + if [ "$$STATUS" != "running" ]; then + echo "$$RESULT" | jq . + if [ "$$STATUS" = "completed" ]; then exit 0; else exit 1; fi + fi + done + echo "ERROR: Job timed out after 30 minutes" + exit 1 diff --git a/.woodpecker/postmortem-todos.yml b/.woodpecker/postmortem-todos.yml index 26068492..729e9a85 100644 --- a/.woodpecker/postmortem-todos.yml +++ b/.woodpecker/postmortem-todos.yml @@ -17,7 +17,7 @@ steps: - name: parse-and-implement image: python:3.12-alpine commands: - - apk add --no-cache jq curl git openssh-client + - apk add --no-cache jq curl git - sh scripts/postmortem-pipeline.sh - name: notify-slack diff --git a/AGENTS.md b/AGENTS.md index 8ce10dc1..0662fc99 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -105,7 +105,7 @@ Terragrunt-based homelab managing a Kubernetes cluster (5 nodes, v1.34.2) on Pro - **NFS exports**: Create dir on Proxmox host (`ssh root@192.168.1.127 "mkdir -p /srv/nfs/"`), add to `/etc/exports`, run `exportfs -ra`. 
## Automated Service Upgrades -- **Pipeline**: DIUN (detect) → n8n webhook (filter + rate limit) → SSH → `claude -p` (upgrade agent) +- **Pipeline**: DIUN (detect) → n8n webhook (filter + rate limit) → HTTP POST → `claude-agent-service` (K8s) → `claude -p` (upgrade agent) - **Agent**: `.claude/agents/service-upgrade.md` — analyzes changelogs, backs up DBs, bumps versions, verifies health, rolls back on failure - **Config**: `.claude/reference/upgrade-config.json` — GitHub repo mappings, DB-backed services, skip patterns - **Rate limit**: Max 5 upgrades per 6h DIUN scan cycle (configured in n8n workflow) diff --git a/docs/architecture/automated-upgrades.md b/docs/architecture/automated-upgrades.md index 650be533..dc1d77af 100644 --- a/docs/architecture/automated-upgrades.md +++ b/docs/architecture/automated-upgrades.md @@ -16,10 +16,10 @@ n8n Webhook (POST /webhook/) │ rate limit: max 5 upgrades per 6h window │ ▼ -SSH → Dev VM (10.0.10.10) +HTTP POST → claude-agent-service (K8s) │ ▼ -claude -p "upgrade agent prompt" +claude -p "upgrade agent prompt" (in-cluster) │ ▼ Service Upgrade Agent @@ -54,7 +54,7 @@ Service Upgrade Agent - Only `status=update` (skip `new`, `unchanged`) - Skip databases, custom images, infra images, `:latest` - **Rate limiting**: Max 5 upgrades per 6-hour window using `$getWorkflowStaticData('global')` -- **Action**: SSH to dev VM, runs `claude -p` with the upgrade agent prompt +- **Action**: HTTP POST to `claude-agent-service.claude-agent.svc:8080/execute` with the upgrade agent prompt ### Upgrade Agent - **Prompt**: `.claude/agents/service-upgrade.md` diff --git a/docs/architecture/incident-response.md b/docs/architecture/incident-response.md index 631b1ece..54ef0e51 100644 --- a/docs/architecture/incident-response.md +++ b/docs/architecture/incident-response.md @@ -178,11 +178,11 @@ flowchart LR subgraph "Kubernetes Cluster" C -->|Yes| D[Woodpecker Pipeline] D --> E[Vault Auth
K8s SA JWT] - E --> F[Fetch SSH Key] + E --> F[Fetch API Token] end - subgraph "DevVM (10.0.10.10)" - F --> G[SSH + Claude Code] + subgraph "claude-agent-service (K8s)" + F --> G[HTTP POST /execute] G --> H[issue-responder agent] H --> I[Investigate / Implement] I --> J[Comment on Issue] diff --git a/scripts/postmortem-pipeline.sh b/scripts/postmortem-pipeline.sh index 913394c2..4b2cf45f 100755 --- a/scripts/postmortem-pipeline.sh +++ b/scripts/postmortem-pipeline.sh @@ -39,26 +39,43 @@ if [ -z "$VAULT_TOKEN" ] || [ "$VAULT_TOKEN" = "null" ]; then fi echo "Vault authenticated" -# 5. Fetch DevVM SSH key from Vault -curl -sf -H "X-Vault-Token: $VAULT_TOKEN" \ - http://vault-active.vault.svc.cluster.local:8200/v1/secret/data/ci/infra | \ - jq -r '.data.data.devvm_ssh_key' > /tmp/devvm-key -chmod 600 /tmp/devvm-key -if [ ! -s /tmp/devvm-key ]; then - echo "ERROR: Failed to fetch DevVM SSH key" +# 5. Fetch API token for claude-agent-service +AGENT_TOKEN=$(curl -sf -H "X-Vault-Token: $VAULT_TOKEN" \ + http://vault-active.vault.svc.cluster.local:8200/v1/secret/data/claude-agent-service | \ + jq -r '.data.data.api_bearer_token') +if [ -z "$AGENT_TOKEN" ] || [ "$AGENT_TOKEN" = "null" ]; then + echo "ERROR: Failed to fetch agent API token" exit 1 fi -echo "SSH key fetched" +echo "Agent token fetched" -# 6. SSH to DevVM and run Claude Code headless +# 6. Submit to claude-agent-service TODOS=$(cat /tmp/todos.json) -ssh -i /tmp/devvm-key -o StrictHostKeyChecking=no wizard@10.0.10.10 \ - "cd ~/code && git -C infra stash && git -C infra pull && git -C infra stash pop 2>/dev/null; ~/.local/bin/claude -p \ - --agent infra/.claude/agents/postmortem-todo-resolver \ - --dangerously-skip-permissions \ - --max-budget-usd 5 \ - 'Implement the auto-implementable TODOs from $PM_FILE. Parsed TODO list: $TODOS'" +PAYLOAD=$(jq -n \ + --arg prompt "Implement the auto-implementable TODOs from $PM_FILE. 
Parsed TODO list: $TODOS" \ + --arg agent ".claude/agents/postmortem-todo-resolver" \ + '{prompt: $prompt, agent: $agent, max_budget_usd: 5, timeout_seconds: 900}') -# 7. Cleanup -rm -f /tmp/devvm-key -echo "Pipeline complete" +RESP=$(curl -sf -X POST \ + -H "Authorization: Bearer $AGENT_TOKEN" \ + -H "Content-Type: application/json" \ + -d "$PAYLOAD" \ + http://claude-agent-service.claude-agent.svc.cluster.local:8080/execute) +JOB_ID=$(echo "$RESP" | jq -r '.job_id') +echo "Job submitted: $JOB_ID" + +# 7. Poll for completion (15min max) +for i in $(seq 1 60); do + sleep 15 + RESULT=$(curl -sf \ + -H "Authorization: Bearer $AGENT_TOKEN" \ + http://claude-agent-service.claude-agent.svc.cluster.local:8080/jobs/$JOB_ID) + STATUS=$(echo "$RESULT" | jq -r '.status') + echo "[$i/60] Status: $STATUS" + if [ "$STATUS" != "running" ]; then + echo "$RESULT" | jq . + if [ "$STATUS" = "completed" ]; then exit 0; else exit 1; fi + fi +done +echo "ERROR: Job timed out after 15 minutes" +exit 1 diff --git a/stacks/n8n/workflows/diun-upgrade.json b/stacks/n8n/workflows/diun-upgrade.json index 9246e339..fcb10994 100644 --- a/stacks/n8n/workflows/diun-upgrade.json +++ b/stacks/n8n/workflows/diun-upgrade.json @@ -38,15 +38,25 @@ }, { "parameters": { - "command": "='claude -p \"You are the service-upgrade agent. 
Read /home/wizard/code/infra/.claude/agents/service-upgrade.md for full instructions.\\n\\nUpgrade task:\\n- Image: ' + $json.body.diun_entry_image + '\\n- New tag: ' + $json.body.diun_entry_imagetag + '\\n- Hub link: ' + ($json.body.diun_entry_hublink || 'none') + '\\n\\nExecute the upgrade workflow now.\"'", - "cwd": "/home/wizard/code/infra" + "method": "POST", + "url": "http://claude-agent-service.claude-agent.svc.cluster.local:8080/execute", + "sendHeaders": true, + "headerParameters": { + "parameters": [ + {"name": "Authorization", "value": "={{ 'Bearer ' + $env.CLAUDE_AGENT_API_TOKEN }}"}, + {"name": "Content-Type", "value": "application/json"} + ] + }, + "sendBody": true, + "specifyBody": "json", + "jsonBody": "={{ JSON.stringify({ prompt: 'You are the service-upgrade agent. Read .claude/agents/service-upgrade.md for full instructions.\\n\\nUpgrade task:\\n- Image: ' + $json.body.diun_entry_image + '\\n- New tag: ' + $json.body.diun_entry_imagetag + '\\n- Hub link: ' + ($json.body.diun_entry_hublink || 'none') + '\\n\\nExecute the upgrade workflow now.', agent: '.claude/agents/service-upgrade', max_budget_usd: 10, timeout_seconds: 1800 }) }}", + "options": {} }, - "id": "ssh-execute", + "id": "http-execute", "name": "Run Upgrade Agent", - "type": "n8n-nodes-base.ssh", - "typeVersion": 1, - "position": [910, 300], - "credentials": {"sshPassword": {"id": "REPLACE_WITH_SSH_CRED_ID", "name": "Dev VM SSH"}} + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.2, + "position": [910, 300] } ], "connections": {