claude-agent-service/app/main.py
Viktor Barzin 66104a32ab
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
parallel execution: replace single-flight lock with bounded semaphore + per-job workspace
Multiple agent calls now run concurrently, each in its own isolated git
checkout (local clone of the warm base, hardlinked objects, git-crypt
re-unlocked), so concurrent jobs never share a working tree.

- execution_lock (asyncio.Lock) -> execution_semaphore (default MAX_CONCURRENCY=10);
  excess calls queue FIFO instead of 409/503. MAX_QUEUE_DEPTH safety valve.
- /execute never returns 409; jobs go queued -> running. Timeout covers
  execution only, not queue wait.
- /v1/chat/completions queues for a slot instead of 503-busy.
- /health: busy = at-capacity, plus active/queued/capacity fields.
- per-job workspace prepare/cleanup under a short git lock; the agent run holds none.
- in-memory job registry evicted past JOB_TTL_SECONDS.

Design: docs/2026-06-02-parallel-execution-design.md

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-02 20:57:41 +00:00

512 lines
18 KiB
Python

import asyncio
import hmac
import json
import os
import time
import uuid
from contextlib import asynccontextmanager
from datetime import datetime, timezone
from subprocess import PIPE
from typing import Any, Literal
from fastapi import FastAPI, HTTPException, Header
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field
# ASGI application object; the route decorators further down register on it.
app = FastAPI(title="Claude Agent Service")
# Shared bearer token read from API_BEARER_TOKEN. It defaults to "" when the
# variable is unset, which verify_token() treats as "reject every request".
API_TOKEN = os.environ.get("API_BEARER_TOKEN", "")
# Warm base clone, populated by the init container. Each job clones from this
# into its own dir under JOBS_DIR so concurrent calls never share a working
# tree (no git index.lock contention, no clobbered edits).
BASE_DIR = os.environ.get("WORKSPACE_DIR", "/workspace/infra")
JOBS_DIR = os.environ.get("JOBS_DIR", "/workspace/jobs")
# Path to the git-crypt key file; the unlock step is skipped when this is
# empty or the file does not exist.
GIT_CRYPT_KEY = os.environ.get("GIT_CRYPT_KEY", "/secrets/git-crypt/key")
# Concurrency. MAX_CONCURRENCY caps simultaneous claude runs ("soft-unbounded"
# — a high default rather than a tight limit); excess calls queue FIFO rather
# than being rejected. MAX_QUEUE_DEPTH is a safety valve so a runaway burst
# can't pin unbounded memory: past it, callers are turned away (429/503).
# NOTE: the admission check compares active + queued against MAX_QUEUE_DEPTH,
# so the limit counts running calls as well as waiting ones.
# All int(...) conversions below raise ValueError at import time if the
# environment variable is set to a non-integer string.
MAX_CONCURRENCY = int(os.environ.get("MAX_CONCURRENCY", "10"))
MAX_QUEUE_DEPTH = int(os.environ.get("MAX_QUEUE_DEPTH", "100"))
# Completed jobs are evicted from the in-memory registry past this age so the
# dict doesn't grow without bound.
JOB_TTL_SECONDS = int(os.environ.get("JOB_TTL_SECONDS", "3600"))
# Bursts share one base fetch rather than serialising a network round-trip per
# job behind the git lock.
FETCH_DEBOUNCE_SECONDS = int(os.environ.get("FETCH_DEBOUNCE_SECONDS", "15"))
# OpenAI compat: model selection is per-request so callers can pick
# Haiku/Sonnet/Opus to control cost. The agent is fixed — `recruiter-triage`
# has the broadest tool surface (WebSearch, WebFetch, Read, Grep, Glob, Bash);
# the alternative (`beads-task-runner`) is locked to read-only `bd` verbs which
# would fail arbitrary OpenAI-API callers. The model on the agent's frontmatter
# is overridden by the `--model` CLI flag we pass per-request.
# Bare aliases auto-roll forward to the latest published version of each
# family. The Claude CLI resolves `haiku` → `claude-haiku-4-5-20251001`
# (and bumps it when Anthropic ships a newer Haiku) — letting us avoid
# version bumps on every release. Add a specific date-suffixed string here
# only if a caller needs to pin against an upcoming roll-forward.
SUPPORTED_MODELS: frozenset[str] = frozenset({
    "haiku",
    "sonnet",
    "opus",
    # Legacy date-suffixed forms — kept for callers that pinned before the
    # 2026-06-01 bare-aliases switch (fire-planner < c1c1e22). Drop these
    # once all consumers have been re-imaged.
    "claude-haiku-4-5",
    "claude-sonnet-4-6",
    "claude-opus-4-7",
})
# Model used by /v1/chat/completions when the request omits `model`.
DEFAULT_MODEL = "sonnet"
# Fixed agent, spend cap (passed as --max-budget-usd) and execution timeout
# (seconds, applied to the agent run only) for the OpenAI-compatible endpoint.
OPENAI_COMPAT_AGENT = "recruiter-triage"
OPENAI_COMPAT_BUDGET_USD = 2.0
OPENAI_COMPAT_TIMEOUT_SECONDS = 900
# Job statuses that mean "finished". Only jobs in one of these states are
# considered for eviction from `jobs`.
_TERMINAL_STATUSES = frozenset({"completed", "failed", "timeout", "error"})
# In-memory registry of /execute jobs, keyed by job id. It is a plain dict, so
# its contents do not survive a process restart.
jobs: dict[str, dict] = {}
# Concurrency primitives. The semaphore bounds simultaneous executions; the git
# lock is held only for the fast per-job workspace setup (fetch + local clone
# + unlock), NOT for the agent run itself.
execution_semaphore = asyncio.Semaphore(MAX_CONCURRENCY)
git_lock = asyncio.Lock()
# Counters surfaced by /health: calls currently holding a semaphore permit,
# and admitted calls still waiting for one.
inflight_active = 0
inflight_queued = 0
# time.time() of the most recent base fetch attempt; 0.0 means "never".
_last_fetch_epoch = 0.0
class ExecuteRequest(BaseModel):
    # Request body for POST /execute.
    # Prompt text, passed to the claude CLI as its final positional argument.
    prompt: str
    # Agent name, forwarded to the CLI as `--agent`.
    agent: str
    # Spend cap, forwarded to the CLI as `--max-budget-usd`.
    max_budget_usd: float = 5.0
    # Limit in seconds applied to the agent run only; queue wait and
    # workspace preparation are outside the timed section.
    timeout_seconds: int = 2700
    # Opaque caller data stored on the job record and returned by
    # GET /jobs/{job_id}; not otherwise interpreted here.
    metadata: dict | None = None
class ChatMessage(BaseModel):
    # One OpenAI-style chat message. Only "system" and "user" content is
    # folded into the synthesised prompt; "assistant" messages are accepted
    # but not replayed.
    role: Literal["system", "user", "assistant"]
    content: str
class ChatCompletionsRequest(BaseModel):
    # Request body for POST /v1/chat/completions.
    # `model` is optional: callers that omit it get DEFAULT_MODEL. We still
    # validate the explicit value against SUPPORTED_MODELS at the route level
    # so we can return a structured 400 listing the allowed IDs.
    model: str | None = None
    # At least one message is required (enforced by min_length=1).
    messages: list[ChatMessage] = Field(..., min_length=1)
    # Accepted for OpenAI compatibility; the handler in this file does not
    # read max_tokens or temperature.
    max_tokens: int | None = None
    temperature: float | None = None
    # Streaming is not implemented; the route answers 400 when this is true.
    stream: bool = False
    # Tolerate (and ignore) other OpenAI fields rather than 422-ing on them.
    model_config = {"extra": "allow"}
def verify_token(authorization: str | None):
# Reject everything when the service is unconfigured. compare_digest("", "")
# returns True, so without this guard an empty API_TOKEN would happily
# accept an empty header.
if not API_TOKEN:
raise HTTPException(status_code=401, detail="Service unauthenticated")
if not authorization or not authorization.startswith("Bearer "):
raise HTTPException(status_code=401, detail="Missing bearer token")
token = authorization.removeprefix("Bearer ")
if not hmac.compare_digest(token, API_TOKEN):
raise HTTPException(status_code=401, detail="Invalid token")
def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string (with +00:00 offset)."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
def _reserve_queue_slot() -> bool:
    """Try to admit one call into the queue.

    The call is admitted (and counted as queued) only while the number of
    active plus queued calls is below MAX_QUEUE_DEPTH. A False return means
    the service is saturated and the caller should reject the request
    (429/503).
    """
    global inflight_queued
    has_room = inflight_active + inflight_queued < MAX_QUEUE_DEPTH
    if has_room:
        inflight_queued += 1
    return has_room
@asynccontextmanager
async def _execution_slot():
    """Async context manager that owns one concurrency permit for an agent run.

    Precondition: the caller already reserved a queue slot with
    `_reserve_queue_slot()`. On entry the reservation is converted from
    "queued" to "active"; on exit the permit is always handed back. If the
    wait for a permit is interrupted, the queued reservation is dropped.
    """
    global inflight_active, inflight_queued
    try:
        await execution_semaphore.acquire()
    except BaseException:
        # Interrupted (e.g. cancelled) while still waiting in the queue.
        inflight_queued -= 1
        raise
    # Permit obtained: queued -> active.
    inflight_queued -= 1
    inflight_active += 1
    try:
        yield
    finally:
        inflight_active -= 1
        execution_semaphore.release()
def _evict_old_jobs() -> None:
    """Drop finished jobs older than JOB_TTL_SECONDS from the `jobs` registry.

    Only jobs whose status is terminal are considered. A terminal job with no
    `finished_epoch` is treated as having age zero and is therefore kept.
    """
    cutoff = time.time()
    # Iterate over a snapshot of the keys so entries can be removed in-loop.
    for job_id in list(jobs):
        record = jobs[job_id]
        if record.get("status") not in _TERMINAL_STATUSES:
            continue
        age = cutoff - record.get("finished_epoch", cutoff)
        if age > JOB_TTL_SECONDS:
            jobs.pop(job_id, None)
async def _run(*cmd: str, cwd: str | None = None, timeout: float | None = None,
check: bool = True, capture: bool = False) -> tuple[int, str]:
"""Run a subprocess (no shell), optionally capturing stdout. Raises on
non-zero unless `check=False`. Used for the git/git-crypt/rm steps of
per-job workspace setup."""
proc = await asyncio.create_subprocess_exec(
*cmd, cwd=cwd, stdout=PIPE, stderr=PIPE,
)
try:
out, err = await asyncio.wait_for(proc.communicate(), timeout=timeout)
except asyncio.TimeoutError:
proc.kill()
await proc.wait()
raise
rc = proc.returncode or 0
if check and rc != 0:
raise RuntimeError(f"{cmd[0]} failed ({rc}): {err.decode(errors='replace')[:200]}")
return rc, (out.decode(errors="replace") if capture else "")
async def _refresh_base() -> None:
    """Bring the base clone up to origin/master, at most once per debounce window.

    A burst of jobs arriving within FETCH_DEBOUNCE_SECONDS shares a single
    fetch. The refresh is best-effort: non-zero git exits, a fetch timeout,
    or a failure to spawn git are all tolerated, so jobs run against the last
    good base rather than failing on a transient network blip.
    """
    global _last_fetch_epoch
    now = time.time()
    if now - _last_fetch_epoch < FETCH_DEBOUNCE_SECONDS:
        return
    # Stamp before fetching so a failing remote is not retried by every job
    # in the same window.
    _last_fetch_epoch = now
    try:
        await _run("git", "-C", BASE_DIR, "fetch", "origin", "--prune",
                   timeout=120, check=False)
        await _run("git", "-C", BASE_DIR, "reset", "--hard", "origin/master",
                   check=False)
    except (asyncio.TimeoutError, OSError):
        # check=False only suppresses non-zero exit codes; a timeout or a
        # spawn failure still raises. Swallow those here to honour the
        # "failures are tolerated" contract.
        return
async def prepare_workspace(job_id: str) -> str:
    """Create an isolated git checkout for one job and return its path.

    A local clone of the warm base hardlinks the object store (near-free) and
    carries only tracked files (no stale .terraform). The git lock is held just
    for this fast setup, never for the agent run.

    Args:
        job_id: Identifier used as the directory name under JOBS_DIR.

    Returns:
        Absolute or relative path of the new checkout (JOBS_DIR/job_id).

    Raises:
        RuntimeError: the clone failed.
        Any exception from the setup steps is re-raised after the partially
        created directory has been removed.
    """
    job_dir = os.path.join(JOBS_DIR, job_id)
    async with git_lock:
        # Never delete a directory this call did not create.
        preexisting = os.path.exists(job_dir)
        try:
            await _refresh_base()
            await _run("git", "clone", "--local", BASE_DIR, job_dir)
            # A local clone's origin points at BASE_DIR; repoint it at the
            # base clone's own origin URL when that can be read.
            rc, base_origin = await _run(
                "git", "-C", BASE_DIR, "remote", "get-url", "origin",
                check=False, capture=True,
            )
            if rc == 0 and base_origin.strip():
                await _run("git", "-C", job_dir, "remote", "set-url", "origin",
                           base_origin.strip(), check=False)
            if GIT_CRYPT_KEY and os.path.exists(GIT_CRYPT_KEY):
                await _run("git-crypt", "unlock", GIT_CRYPT_KEY, cwd=job_dir, check=False)
        except BaseException:
            # On failure (or cancellation) the caller never receives job_dir,
            # so it cannot clean it up. Remove the partial checkout here,
            # best-effort, and let the original error propagate.
            if not preexisting:
                try:
                    await _run("rm", "-rf", job_dir, check=False)
                except Exception:
                    pass
            raise
    return job_dir
async def cleanup_workspace(path: str | None) -> None:
    """Recursively delete a job workspace; a falsy `path` is a no-op.

    The removal runs `rm -rf` and ignores its exit status.
    """
    if path:
        await _run("rm", "-rf", path, check=False)
async def _invoke_claude_subprocess(
prompt: str,
agent: str,
max_budget_usd: float,
workspace: str,
model: str | None = None,
) -> dict[str, Any]:
"""Run the claude CLI once in `workspace` and return a result dict.
Holds no lock and does not touch the `jobs` dict, so it is shared by both
the background `/execute` path and the synchronous `/v1/chat/completions`
path. The caller provides an isolated `workspace` (one per job) as cwd.
`model`, when provided, becomes `--model <id>` on the claude CLI. This
overrides whatever `model:` is set in the agent's frontmatter so the
OpenAI-compat path can pick Haiku/Sonnet/Opus per-request.
"""
cmd = [
"claude", "-p",
"--agent", agent,
"--dangerously-skip-permissions",
"--max-budget-usd", str(max_budget_usd),
"--output-format", "json",
]
if model is not None:
cmd.extend(["--model", model])
cmd.append(prompt)
proc = await asyncio.create_subprocess_exec(
*cmd,
cwd=workspace,
stdout=PIPE,
stderr=PIPE,
)
# stdout=PIPE / stderr=PIPE guarantee both streams are present.
assert proc.stdout is not None and proc.stderr is not None
output_lines: list[str] = []
async for line in proc.stdout:
output_lines.append(line.decode())
stderr = await proc.stderr.read()
await proc.wait()
return {
"exit_code": proc.returncode,
"output": output_lines,
"stderr": stderr.decode(),
}
async def _run_execute_job(job_id: str, request: ExecuteRequest):
    """Background worker behind /execute.

    Waits for an execution slot (the job stays "queued" meanwhile), marks the
    job "running", prepares an isolated workspace and runs the agent. Only the
    agent run is bounded by `request.timeout_seconds`; queue wait is not. The
    final state is written to the job record, the workspace is removed, and
    stale jobs are evicted.
    """

    def finish(status: str, **extra: Any) -> None:
        # Record a terminal state plus completion timestamps on the job.
        jobs[job_id].update({
            "status": status,
            **extra,
            "finished_at": _now_iso(),
            "finished_epoch": time.time(),
        })

    job_dir = None
    try:
        async with _execution_slot():
            record = jobs[job_id]
            record["status"] = "running"
            record["started_at"] = _now_iso()
            job_dir = await prepare_workspace(job_id)
            agent_run = _invoke_claude_subprocess(
                request.prompt, request.agent, request.max_budget_usd, job_dir,
            )
            outcome = await asyncio.wait_for(agent_run, timeout=request.timeout_seconds)
            code = outcome["exit_code"]
            finish(
                "completed" if code == 0 else "failed",
                exit_code=code,
                output=outcome["output"],
                stderr=outcome["stderr"],
            )
    except asyncio.TimeoutError:
        finish("timeout")
    except Exception as exc:
        finish("error", error=str(exc))
    finally:
        # Workspace removal is best-effort; a failure here must not mask the
        # job's recorded outcome.
        try:
            await cleanup_workspace(job_dir)
        except Exception:
            pass
        _evict_old_jobs()
def _extract_assistant_text(output_lines: list[str]) -> str:
    """Return the final assistant text from `claude -p --output-format json` output.

    The captured stdout lines are joined and parsed as one JSON document. For
    a JSON object, the first non-empty string found under `result`, `content`
    or `text` (in that order) is returned. In every other case — invalid JSON,
    a non-object document, or no usable field — the stripped raw text is
    returned so the caller always gets something. Empty output yields "".
    """
    joined = "".join(output_lines).strip()
    if not joined:
        return ""
    try:
        payload = json.loads(joined)
    except json.JSONDecodeError:
        return joined
    if not isinstance(payload, dict):
        return joined
    candidates = (payload.get(field) for field in ("result", "content", "text"))
    usable = (value for value in candidates if isinstance(value, str) and value)
    return next(usable, joined)
def _one_line(text: str, limit: int = 200) -> str:
    """Squash all whitespace runs in `text` to single spaces and cut to `limit` chars."""
    words = text.split()
    collapsed = " ".join(words)
    return collapsed[:limit]
def _synthesise_prompt(messages: list[ChatMessage]) -> str:
    """Flatten OpenAI chat messages into one prompt string.

    System message contents form a "System instructions:" section and user
    message contents form a "Request:" section, each joined by blank lines in
    their original order; the two sections are separated by a `---` rule.
    Assistant messages are deliberately dropped: `claude -p` is stateless and
    replaying earlier assistant turns as input would confuse the agent.

    Returns "" when no system or user message is present (for example when
    every message has the assistant role).
    """
    by_role: dict[str, list[str]] = {"system": [], "user": []}
    for message in messages:
        if message.role in by_role:
            by_role[message.role].append(message.content)

    blocks: list[str] = []
    if by_role["system"]:
        blocks.append("System instructions:\n" + "\n\n".join(by_role["system"]))
    if by_role["user"]:
        blocks.append("Request:\n" + "\n\n".join(by_role["user"]))
    # Joining an empty list yields "", which covers the no-content case.
    return "\n\n---\n\n".join(blocks)
# Liveness/load probe. Unauthenticated. "busy" is true once the number of
# running agent calls has reached MAX_CONCURRENCY.
@app.get("/health")
async def health():
    running = inflight_active
    waiting = inflight_queued
    at_capacity = running >= MAX_CONCURRENCY
    return {
        "status": "ok",
        "busy": at_capacity,
        "active": running,
        "queued": waiting,
        "capacity": MAX_CONCURRENCY,
    }
# Strong references to in-flight /execute workers. The event loop keeps only
# weak references to tasks, so a task whose handle is discarded can be
# garbage-collected before it finishes.
_background_tasks: set[asyncio.Task] = set()


@app.post("/execute", status_code=202)
async def execute(
    request: ExecuteRequest,
    authorization: str | None = Header(default=None),
):
    """Queue an agent run and return immediately with its job id.

    The job is registered as "queued" and executed by a background task;
    poll GET /jobs/{job_id} for progress and results.

    Raises:
        HTTPException: 401 on bad credentials, 429 when the queue is full.
    """
    verify_token(authorization)
    if not _reserve_queue_slot():
        raise HTTPException(status_code=429, detail="Queue full")
    job_id = uuid.uuid4().hex[:12]
    jobs[job_id] = {
        "status": "queued",
        "prompt": request.prompt,
        "agent": request.agent,
        "created_at": _now_iso(),
        "metadata": request.metadata,
    }
    task = asyncio.create_task(_run_execute_job(job_id, request))
    # Hold the task until it completes, then drop the reference.
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)
    return {"job_id": job_id, "status": "queued"}
# Return the stored record for one /execute job, or 404 if the id is unknown
# (never created, or already evicted from the registry).
@app.get("/jobs/{job_id}")
async def get_job(
    job_id: str,
    authorization: str | None = Header(default=None),
):
    verify_token(authorization)
    record = jobs.get(job_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Job not found")
    return record
@app.post("/v1/chat/completions")
async def chat_completions(
    request: ChatCompletionsRequest,
    authorization: str | None = Header(default=None),
):
    """OpenAI-compatible, non-streaming chat completion backed by one agent run.

    The system/user messages are flattened into a single prompt, the call
    waits for an execution slot, and the agent runs in its own workspace.

    Responses:
        200: an OpenAI `chat.completion` object (token usage is reported as 0).
        400: streaming requested, unsupported model, or no system/user message.
        401: bad credentials.
        503: queue full, agent timed out, agent exited non-zero, or any other
            execution error.
    """
    verify_token(authorization)
    if request.stream:
        raise HTTPException(status_code=400, detail="streaming not supported")
    model = request.model if request.model is not None else DEFAULT_MODEL
    if model not in SUPPORTED_MODELS:
        return JSONResponse(
            status_code=400,
            content={
                "error": "unsupported model",
                "supported": sorted(SUPPORTED_MODELS),
            },
        )
    prompt = _synthesise_prompt(request.messages)
    if not prompt:
        # Happens when every message has the assistant role: there is nothing
        # to ask. Reject before taking a queue slot or cloning a workspace.
        return JSONResponse(
            status_code=400,
            content={
                "error": "empty prompt",
                "detail": "messages must include at least one system or user message",
            },
        )
    if not _reserve_queue_slot():
        return JSONResponse(
            status_code=503,
            content={"error": "execution failed", "detail": "queue full"},
        )
    chat_id = uuid.uuid4().hex[:12]
    workspace = None
    try:
        async with _execution_slot():
            workspace = await prepare_workspace(chat_id)
            # The timeout bounds the agent run only, not the queue wait.
            result = await asyncio.wait_for(
                _invoke_claude_subprocess(
                    prompt, OPENAI_COMPAT_AGENT, OPENAI_COMPAT_BUDGET_USD,
                    workspace, model=model,
                ),
                timeout=OPENAI_COMPAT_TIMEOUT_SECONDS,
            )
    except asyncio.TimeoutError:
        return JSONResponse(
            status_code=503,
            content={"error": "execution failed", "detail": "agent timed out"},
        )
    except Exception as exc:
        return JSONResponse(
            status_code=503,
            content={"error": "execution failed", "detail": _one_line(str(exc))},
        )
    finally:
        # Best-effort removal; a cleanup failure must not replace the response.
        try:
            await cleanup_workspace(workspace)
        except Exception:
            pass
    if result["exit_code"] != 0:
        detail = _one_line(result.get("stderr") or "") or f"exit {result['exit_code']}"
        return JSONResponse(
            status_code=503,
            content={"error": "execution failed", "detail": detail},
        )
    content = _extract_assistant_text(result["output"])
    completion_id = "chatcmpl-" + uuid.uuid4().hex[:24]
    return {
        "id": completion_id,
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": content},
            "finish_reason": "stop",
        }],
        # Token accounting is not available from the CLI result used here.
        "usage": {
            "prompt_tokens": 0,
            "completion_tokens": 0,
            "total_tokens": 0,
        },
    }