All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
Multiple agent calls now run concurrently, each in its own isolated git checkout (local clone of the warm base, hardlinked objects, git-crypt re-unlocked), so concurrent jobs never share a working tree. - execution_lock (asyncio.Lock) -> execution_semaphore (default MAX_CONCURRENCY=10); excess calls queue FIFO instead of 409/503. MAX_QUEUE_DEPTH safety valve. - /execute never returns 409; jobs go queued -> running. Timeout covers execution only, not queue wait. - /v1/chat/completions queues for a slot instead of 503-busy. - /health: busy = at-capacity, plus active/queued/capacity fields. - per-job workspace prepare/cleanup under a short git lock; the agent run holds none. - in-memory job registry evicted past JOB_TTL_SECONDS. Design: docs/2026-06-02-parallel-execution-design.md Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
512 lines
18 KiB
Python
512 lines
18 KiB
Python
import asyncio
|
|
import hmac
|
|
import json
|
|
import os
|
|
import time
|
|
import uuid
|
|
from contextlib import asynccontextmanager
|
|
from datetime import datetime, timezone
|
|
from subprocess import PIPE
|
|
from typing import Any, Literal
|
|
|
|
from fastapi import FastAPI, HTTPException, Header
|
|
from fastapi.responses import JSONResponse
|
|
from pydantic import BaseModel, Field
|
|
|
|
# ASGI application object; the route decorators further down register their
# handlers on it.
app = FastAPI(title="Claude Agent Service")

# Shared bearer token, read once at import time. Defaults to "" when the
# variable is unset, in which case verify_token() rejects every request.
API_TOKEN = os.environ.get("API_BEARER_TOKEN", "")
|
|
|
|
# Warm base clone, populated by the init container. Each job clones from this
# into its own dir under JOBS_DIR so concurrent calls never share a working
# tree (no git index.lock contention, no clobbered edits).
BASE_DIR = os.environ.get("WORKSPACE_DIR", "/workspace/infra")
JOBS_DIR = os.environ.get("JOBS_DIR", "/workspace/jobs")
# Key file passed to `git-crypt unlock` inside each job checkout. The unlock
# step is skipped when this is empty or the path does not exist.
GIT_CRYPT_KEY = os.environ.get("GIT_CRYPT_KEY", "/secrets/git-crypt/key")

# Concurrency. MAX_CONCURRENCY caps simultaneous claude runs ("soft-unbounded"
# — a high default rather than a tight limit); excess calls queue FIFO rather
# than being rejected. MAX_QUEUE_DEPTH is a safety valve so a runaway burst
# can't pin unbounded memory: past it, callers are turned away (429 from
# /execute, 503 from /v1/chat/completions). Note the depth check counts
# active + queued calls together, not just the waiting ones.
MAX_CONCURRENCY = int(os.environ.get("MAX_CONCURRENCY", "10"))
MAX_QUEUE_DEPTH = int(os.environ.get("MAX_QUEUE_DEPTH", "100"))
# Completed jobs are evicted from the in-memory registry past this age so the
# dict doesn't grow without bound.
JOB_TTL_SECONDS = int(os.environ.get("JOB_TTL_SECONDS", "3600"))
# Bursts share one base fetch rather than serialising a network round-trip per
# job behind the git lock.
FETCH_DEBOUNCE_SECONDS = int(os.environ.get("FETCH_DEBOUNCE_SECONDS", "15"))
|
|
|
|
# OpenAI compat: model selection is per-request so callers can pick
# Haiku/Sonnet/Opus to control cost. The agent is fixed — `recruiter-triage`
# has the broadest tool surface (WebSearch, WebFetch, Read, Grep, Glob, Bash);
# the alternative (`beads-task-runner`) is locked to read-only `bd` verbs which
# would fail arbitrary OpenAI-API callers. The model on the agent's frontmatter
# is overridden by the `--model` CLI flag we pass per-request.
# Bare aliases auto-roll forward to the latest published version of each
# family. The Claude CLI resolves `haiku` → `claude-haiku-4-5-20251001`
# (and bumps it when Anthropic ships a newer Haiku) — letting us avoid
# version bumps on every release. Add a specific date-suffixed string here
# only if a caller needs to pin against an upcoming roll-forward.
SUPPORTED_MODELS: frozenset[str] = frozenset({
    "haiku",
    "sonnet",
    "opus",
    # Legacy date-suffixed forms — kept for callers that pinned before the
    # 2026-06-01 bare-aliases switch (fire-planner < c1c1e22). Drop these
    # once all consumers have been re-imaged.
    "claude-haiku-4-5",
    "claude-sonnet-4-6",
    "claude-opus-4-7",
})
# Model used by /v1/chat/completions when the request omits `model`.
DEFAULT_MODEL = "sonnet"
# Fixed agent, spend cap and execution timeout applied to every
# /v1/chat/completions call (these are not caller-configurable).
OPENAI_COMPAT_AGENT = "recruiter-triage"
OPENAI_COMPAT_BUDGET_USD = 2.0
OPENAI_COMPAT_TIMEOUT_SECONDS = 900
|
|
|
|
# Job statuses that mean "finished"; only jobs in one of these states are
# eligible for eviction from the registry.
_TERMINAL_STATUSES = frozenset({"completed", "failed", "timeout", "error"})

# In-memory job registry for /execute, keyed by job id. Lives only in this
# process.
jobs: dict[str, dict] = {}

# Concurrency primitives. The semaphore bounds simultaneous executions; the git
# lock is held only for the fast per-job workspace setup (fetch + local clone +
# unlock), NOT for the agent run itself.
# NOTE(review): the original comment also listed teardown (`rm`) as running
# under the git lock, but cleanup_workspace() does not acquire it — confirm
# which is intended.
execution_semaphore = asyncio.Semaphore(MAX_CONCURRENCY)
git_lock = asyncio.Lock()
# Counters reported by /health and consulted by the queue-depth check:
# calls currently holding a semaphore permit / calls admitted but still waiting.
inflight_active = 0
inflight_queued = 0
# time.time() of the last base-refresh attempt; drives the fetch debounce.
_last_fetch_epoch = 0.0
|
|
|
|
|
|
# Request body for POST /execute.
class ExecuteRequest(BaseModel):
    # Prompt text handed to the claude CLI as its final positional argument.
    prompt: str
    # Agent name passed to the CLI via `--agent`.
    agent: str
    # Spend cap passed to the CLI via `--max-budget-usd`.
    max_budget_usd: float = 5.0
    # Upper bound (seconds) on the agent run itself; queue wait and workspace
    # setup happen outside this timeout.
    timeout_seconds: int = 2700
    # Caller-supplied data stored on the job record and echoed by /jobs/{id}.
    metadata: dict | None = None
|
|
|
|
|
|
# One entry of the OpenAI-style `messages` array.
class ChatMessage(BaseModel):
    # Only these three roles validate; any other value is rejected by pydantic.
    role: Literal["system", "user", "assistant"]
    # Plain-text message body.
    content: str
|
|
|
|
|
|
# Request body for POST /v1/chat/completions (OpenAI-compatible subset).
class ChatCompletionsRequest(BaseModel):
    # `model` is optional: callers that omit it get DEFAULT_MODEL. We still
    # validate the explicit value against SUPPORTED_MODELS at the route level
    # so we can return a structured 400 listing the allowed IDs.
    model: str | None = None
    # At least one message is required (enforced by min_length=1).
    messages: list[ChatMessage] = Field(..., min_length=1)
    # Accepted for client compatibility; the handler in this file does not
    # read max_tokens or temperature.
    max_tokens: int | None = None
    temperature: float | None = None
    # Streaming is not implemented; the handler answers 400 when this is True.
    stream: bool = False
    # Tolerate (and ignore) other OpenAI fields rather than 422-ing on them.
    model_config = {"extra": "allow"}
|
|
|
|
|
|
def verify_token(authorization: str | None) -> None:
    """Validate an `Authorization` header against the configured bearer token.

    Args:
        authorization: Raw header value (expected form ``Bearer <token>``), or
            None when the header is absent.

    Raises:
        HTTPException: 401 when the service has no token configured, when the
            header is missing or is not a Bearer credential, or when the
            presented token does not match.
    """
    # Reject everything when the service is unconfigured. compare_digest on two
    # empty values returns True, so without this guard an empty API_TOKEN would
    # happily accept an empty token.
    if not API_TOKEN:
        raise HTTPException(status_code=401, detail="Service unauthenticated")
    if not authorization or not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="Missing bearer token")
    token = authorization.removeprefix("Bearer ")
    # Compare as bytes: hmac.compare_digest raises TypeError when given str
    # arguments containing non-ASCII characters, which would turn a malformed
    # header into a 500 instead of a clean 401.
    if not hmac.compare_digest(token.encode("utf-8"), API_TOKEN.encode("utf-8")):
        raise HTTPException(status_code=401, detail="Invalid token")
|
|
|
|
|
|
def _now_iso() -> str:
    """Return the current time in UTC as an ISO-8601 string (with offset)."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
|
|
|
|
|
def _reserve_queue_slot() -> bool:
    """Try to admit one call into the queue.

    The call is counted as queued when there is room. Room means the number
    of active plus queued calls is still below MAX_QUEUE_DEPTH.

    Returns:
        True if the call was admitted (and `inflight_queued` was incremented),
        False if the queue is saturated and the caller should turn the request
        away (429/503).
    """
    global inflight_queued
    has_room = inflight_active + inflight_queued < MAX_QUEUE_DEPTH
    if has_room:
        inflight_queued += 1
    return has_room
|
|
|
|
|
|
@asynccontextmanager
async def _execution_slot():
    """Async context manager that owns one concurrency permit for an agent run.

    Precondition: the caller already reserved a queue slot with
    `_reserve_queue_slot()`. While waiting for the semaphore the call is
    counted as queued; once the permit is obtained it is counted as active.
    On exit the permit is always handed back and the counters are restored.
    """
    global inflight_active, inflight_queued
    try:
        await execution_semaphore.acquire()
    except BaseException:
        # Cancelled (or otherwise interrupted) while still waiting in the
        # queue: give the queue slot back and let the interruption propagate.
        inflight_queued -= 1
        raise

    # Permit obtained: the call moves from queued to active.
    inflight_queued -= 1
    inflight_active += 1
    try:
        yield
    finally:
        inflight_active -= 1
        execution_semaphore.release()
|
|
|
|
|
|
def _evict_old_jobs() -> None:
    """Drop finished jobs older than JOB_TTL_SECONDS from the registry.

    Only jobs whose status is terminal are considered. A terminal job with no
    `finished_epoch` is treated as having just finished.
    """
    reference = time.time()
    # Iterate over a snapshot of the keys so entries can be removed as we go.
    for job_id in list(jobs):
        record = jobs[job_id]
        if record.get("status") not in _TERMINAL_STATUSES:
            continue
        age = reference - record.get("finished_epoch", reference)
        if age > JOB_TTL_SECONDS:
            jobs.pop(job_id, None)
|
|
|
|
|
|
async def _run(*cmd: str, cwd: str | None = None, timeout: float | None = None,
|
|
check: bool = True, capture: bool = False) -> tuple[int, str]:
|
|
"""Run a subprocess (no shell), optionally capturing stdout. Raises on
|
|
non-zero unless `check=False`. Used for the git/git-crypt/rm steps of
|
|
per-job workspace setup."""
|
|
proc = await asyncio.create_subprocess_exec(
|
|
*cmd, cwd=cwd, stdout=PIPE, stderr=PIPE,
|
|
)
|
|
try:
|
|
out, err = await asyncio.wait_for(proc.communicate(), timeout=timeout)
|
|
except asyncio.TimeoutError:
|
|
proc.kill()
|
|
await proc.wait()
|
|
raise
|
|
rc = proc.returncode or 0
|
|
if check and rc != 0:
|
|
raise RuntimeError(f"{cmd[0]} failed ({rc}): {err.decode(errors='replace')[:200]}")
|
|
return rc, (out.decode(errors="replace") if capture else "")
|
|
|
|
|
|
async def _refresh_base() -> None:
    """Pull the base clone up to origin/master, debounced so a burst of jobs
    shares one fetch.

    Failures are tolerated — jobs run against the last good base rather than
    wedging on a transient network blip. That covers non-zero git exits
    (`check=False`), a fetch that exceeds its timeout, and a failure to spawn
    git at all.

    The debounce timestamp is recorded before the fetch starts, so a failed
    attempt also suppresses retries for FETCH_DEBOUNCE_SECONDS.
    """
    global _last_fetch_epoch
    now = time.time()
    if now - _last_fetch_epoch < FETCH_DEBOUNCE_SECONDS:
        return
    _last_fetch_epoch = now
    try:
        await _run("git", "-C", BASE_DIR, "fetch", "origin", "--prune",
                   timeout=120, check=False)
        await _run("git", "-C", BASE_DIR, "reset", "--hard", "origin/master",
                   check=False)
    except (asyncio.TimeoutError, OSError):
        # `check=False` only silences non-zero exits; a timeout or spawn error
        # would otherwise escape and fail the job (and a timeout would be
        # reported by callers as an agent timeout). Keep the last good base.
        return
|
|
|
|
|
|
async def prepare_workspace(job_id: str) -> str:
    """Create an isolated git checkout for one job and return its path.

    A local clone of the warm base hardlinks the object store (near-free) and
    carries only tracked files (no stale .terraform). The git lock is held just
    for this fast setup, never for the agent run.

    Steps, all under the git lock: refresh the base, clone it into
    ``JOBS_DIR/<job_id>``, point the clone's `origin` at the base's own origin
    URL (a local clone would otherwise have the base directory as its origin),
    and unlock git-crypt when a key file is present. Only the clone itself is
    mandatory; the later steps are best-effort.

    Args:
        job_id: Identifier used as the checkout's directory name.

    Returns:
        Path of the new checkout.

    Raises:
        RuntimeError: The clone failed.
        OSError: A required executable could not be started.
    """
    job_dir = os.path.join(JOBS_DIR, job_id)
    try:
        async with git_lock:
            await _refresh_base()
            await _run("git", "clone", "--local", BASE_DIR, job_dir)
            rc, base_origin = await _run(
                "git", "-C", BASE_DIR, "remote", "get-url", "origin",
                check=False, capture=True,
            )
            if rc == 0 and base_origin.strip():
                await _run("git", "-C", job_dir, "remote", "set-url", "origin",
                           base_origin.strip(), check=False)
            if GIT_CRYPT_KEY and os.path.exists(GIT_CRYPT_KEY):
                await _run("git-crypt", "unlock", GIT_CRYPT_KEY, cwd=job_dir, check=False)
    except Exception:
        # Callers only learn the path when this function returns, so a
        # half-built checkout would never be cleaned up by them. Remove it
        # here (outside the lock) and re-raise the original failure.
        try:
            await _run("rm", "-rf", job_dir, check=False)
        except OSError:
            pass  # best-effort: never mask the setup error with a cleanup error
        raise
    return job_dir
|
|
|
|
|
|
async def cleanup_workspace(path: str | None) -> None:
|
|
if not path:
|
|
return
|
|
await _run("rm", "-rf", path, check=False)
|
|
|
|
|
|
async def _invoke_claude_subprocess(
|
|
prompt: str,
|
|
agent: str,
|
|
max_budget_usd: float,
|
|
workspace: str,
|
|
model: str | None = None,
|
|
) -> dict[str, Any]:
|
|
"""Run the claude CLI once in `workspace` and return a result dict.
|
|
|
|
Holds no lock and does not touch the `jobs` dict, so it is shared by both
|
|
the background `/execute` path and the synchronous `/v1/chat/completions`
|
|
path. The caller provides an isolated `workspace` (one per job) as cwd.
|
|
|
|
`model`, when provided, becomes `--model <id>` on the claude CLI. This
|
|
overrides whatever `model:` is set in the agent's frontmatter so the
|
|
OpenAI-compat path can pick Haiku/Sonnet/Opus per-request.
|
|
"""
|
|
cmd = [
|
|
"claude", "-p",
|
|
"--agent", agent,
|
|
"--dangerously-skip-permissions",
|
|
"--max-budget-usd", str(max_budget_usd),
|
|
"--output-format", "json",
|
|
]
|
|
if model is not None:
|
|
cmd.extend(["--model", model])
|
|
cmd.append(prompt)
|
|
|
|
proc = await asyncio.create_subprocess_exec(
|
|
*cmd,
|
|
cwd=workspace,
|
|
stdout=PIPE,
|
|
stderr=PIPE,
|
|
)
|
|
|
|
# stdout=PIPE / stderr=PIPE guarantee both streams are present.
|
|
assert proc.stdout is not None and proc.stderr is not None
|
|
output_lines: list[str] = []
|
|
async for line in proc.stdout:
|
|
output_lines.append(line.decode())
|
|
|
|
stderr = await proc.stderr.read()
|
|
await proc.wait()
|
|
|
|
return {
|
|
"exit_code": proc.returncode,
|
|
"output": output_lines,
|
|
"stderr": stderr.decode(),
|
|
}
|
|
|
|
|
|
async def _run_execute_job(job_id: str, request: ExecuteRequest):
    """Background worker behind /execute.

    Waits (as "queued") for an execution slot, marks the job "running", builds
    an isolated workspace and runs the agent in it. Only the agent run is
    subject to `request.timeout_seconds`; queue wait and workspace setup are
    not. The outcome is written to the job record as "completed", "failed",
    "timeout" or "error". The workspace is always removed afterwards and stale
    jobs are evicted from the registry.
    """

    def _finish(status: str, **extra: Any) -> None:
        # Stamp a terminal status plus the finish time onto the job record.
        jobs[job_id].update({
            "status": status,
            **extra,
            "finished_at": _now_iso(),
            "finished_epoch": time.time(),
        })

    workspace = None
    try:
        async with _execution_slot():
            record = jobs[job_id]
            record["status"] = "running"
            record["started_at"] = _now_iso()
            workspace = await prepare_workspace(job_id)
            agent_run = _invoke_claude_subprocess(
                request.prompt, request.agent, request.max_budget_usd, workspace,
            )
            result = await asyncio.wait_for(agent_run, timeout=request.timeout_seconds)
            exit_code = result["exit_code"]
            _finish(
                "completed" if exit_code == 0 else "failed",
                exit_code=exit_code,
                output=result["output"],
                stderr=result["stderr"],
            )
    except asyncio.TimeoutError:
        _finish("timeout")
    except Exception as exc:
        _finish("error", error=str(exc))
    finally:
        # Teardown is best-effort: a cleanup failure must not change the
        # job's recorded outcome.
        try:
            await cleanup_workspace(workspace)
        except Exception:
            pass
        _evict_old_jobs()
|
|
|
|
|
|
def _extract_assistant_text(output_lines: list[str]) -> str:
    """Recover the final assistant message from `claude -p --output-format json`.

    The captured stdout lines are joined and parsed as one JSON document. When
    it is an object, the first non-empty string found under `result`,
    `content` or `text` (in that order) is returned. In every other case —
    invalid JSON, a non-object document, or no usable key — the stripped raw
    output is returned so callers always get *something*. Empty output yields
    an empty string.
    """
    raw = "".join(output_lines).strip()
    if not raw:
        return ""
    try:
        document = json.loads(raw)
    except json.JSONDecodeError:
        return raw
    if not isinstance(document, dict):
        return raw
    candidates = (document.get(key) for key in ("result", "content", "text"))
    return next(
        (value for value in candidates if isinstance(value, str) and value),
        raw,
    )
|
|
|
|
|
|
def _one_line(text: str, limit: int = 200) -> str:
    """Squash all whitespace runs (newlines included) to single spaces, then
    cut the result to at most `limit` characters for use in response bodies."""
    return " ".join(text.split())[:limit]
|
|
|
|
|
|
def _synthesise_prompt(messages: list[ChatMessage]) -> str:
    """Collapse an OpenAI-style message list into one prompt string.

    System messages form a "System instructions:" preamble and user messages
    form the "Request:" section; within each section the messages keep their
    original order and are separated by blank lines. The two sections are
    joined by a `---` rule.

    Assistant messages are deliberately dropped: claude `-p` is stateless, and
    replaying earlier assistant turns as user text would confuse the agent.

    Returns:
        The combined prompt, or "" when the list contains no system or user
        message (for example, only assistant messages — `min_length=1` on the
        request model does not rule that out).
    """
    by_role: dict[str, list[str]] = {"system": [], "user": []}
    for message in messages:
        if message.role in by_role:
            by_role[message.role].append(message.content)

    sections: list[str] = []
    if by_role["system"]:
        sections.append("System instructions:\n" + "\n\n".join(by_role["system"]))
    if by_role["user"]:
        sections.append("Request:\n" + "\n\n".join(by_role["user"]))
    return "\n\n---\n\n".join(sections) if sections else ""
|
|
|
|
|
|
@app.get("/health")
async def health():
    """Unauthenticated liveness/load probe.

    `busy` is True once every concurrency permit is in use; `active`, `queued`
    and `capacity` expose the raw counters behind it.
    """
    return dict(
        status="ok",
        busy=inflight_active >= MAX_CONCURRENCY,
        active=inflight_active,
        queued=inflight_queued,
        capacity=MAX_CONCURRENCY,
    )
|
|
|
|
|
|
# Strong references to in-flight /execute workers. The event loop only keeps
# weak references to tasks, so a task whose handle is dropped can be
# garbage-collected before it finishes; each task removes itself when done.
_background_tasks: set[asyncio.Task] = set()


@app.post("/execute", status_code=202)
async def execute(
    request: ExecuteRequest,
    authorization: str | None = Header(default=None),
):
    """Queue an agent run and return immediately with its job id.

    The job is registered as "queued" and handed to a background worker, which
    moves it to "running" once an execution slot is free. Poll
    `/jobs/{job_id}` for the outcome.

    Raises:
        HTTPException: 401 on a bad or missing token; 429 when the queue is
            already at MAX_QUEUE_DEPTH.
    """
    verify_token(authorization)

    if not _reserve_queue_slot():
        raise HTTPException(status_code=429, detail="Queue full")

    job_id = uuid.uuid4().hex[:12]
    jobs[job_id] = {
        "status": "queued",
        "prompt": request.prompt,
        "agent": request.agent,
        "created_at": _now_iso(),
        "metadata": request.metadata,
    }

    # Keep a reference until the worker finishes (see _background_tasks).
    task = asyncio.create_task(_run_execute_job(job_id, request))
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)

    return {"job_id": job_id, "status": "queued"}
|
|
|
|
|
|
@app.get("/jobs/{job_id}")
async def get_job(
    job_id: str,
    authorization: str | None = Header(default=None),
):
    """Return the stored record for one /execute job.

    Raises:
        HTTPException: 401 on a bad or missing token; 404 when the id is
            unknown (never issued, or already evicted from the registry).
    """
    verify_token(authorization)
    record = jobs.get(job_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Job not found")
    return record
|
|
|
|
|
|
@app.post("/v1/chat/completions")
async def chat_completions(
    request: ChatCompletionsRequest,
    authorization: str | None = Header(default=None),
):
    """OpenAI-compatible, synchronous chat completion backed by one agent run.

    The messages are flattened into a single prompt and run through the fixed
    OPENAI_COMPAT_AGENT in a fresh workspace, using the requested (or default)
    model. The call waits in the queue for an execution slot rather than
    failing fast when the service is busy.

    Responses:
        200: OpenAI `chat.completion` object. Token usage is not measured and
            is reported as zeros.
        400: streaming requested, unsupported model, or no system/user
            message to build a prompt from.
        401: bad or missing token.
        503: queue full, agent timed out, setup/run raised, or the CLI exited
            non-zero (`detail` carries a one-line reason).
    """
    verify_token(authorization)

    if request.stream:
        raise HTTPException(status_code=400, detail="streaming not supported")

    model = request.model if request.model is not None else DEFAULT_MODEL
    if model not in SUPPORTED_MODELS:
        return JSONResponse(
            status_code=400,
            content={
                "error": "unsupported model",
                "supported": sorted(SUPPORTED_MODELS),
            },
        )

    prompt = _synthesise_prompt(request.messages)
    if not prompt:
        # A conversation made only of assistant messages passes validation but
        # yields an empty prompt. Reject it here instead of spending a slot,
        # a workspace clone and a CLI launch on a request that cannot succeed.
        return JSONResponse(
            status_code=400,
            content={
                "error": "invalid request",
                "detail": "messages must include at least one system or user message",
            },
        )

    if not _reserve_queue_slot():
        return JSONResponse(
            status_code=503,
            content={"error": "execution failed", "detail": "queue full"},
        )

    chat_id = uuid.uuid4().hex[:12]
    workspace = None
    try:
        async with _execution_slot():
            workspace = await prepare_workspace(chat_id)
            # The timeout covers the agent run only, not the queue wait or
            # the workspace setup above.
            result = await asyncio.wait_for(
                _invoke_claude_subprocess(
                    prompt, OPENAI_COMPAT_AGENT, OPENAI_COMPAT_BUDGET_USD,
                    workspace, model=model,
                ),
                timeout=OPENAI_COMPAT_TIMEOUT_SECONDS,
            )
    except asyncio.TimeoutError:
        return JSONResponse(
            status_code=503,
            content={"error": "execution failed", "detail": "agent timed out"},
        )
    except Exception as exc:
        return JSONResponse(
            status_code=503,
            content={"error": "execution failed", "detail": _one_line(str(exc))},
        )
    finally:
        # Teardown is best-effort: a cleanup failure must not replace the
        # response being returned.
        try:
            await cleanup_workspace(workspace)
        except Exception:
            pass

    if result["exit_code"] != 0:
        # Prefer the CLI's own stderr as the reason; fall back to the code.
        detail = _one_line(result.get("stderr") or "") or f"exit {result['exit_code']}"
        return JSONResponse(
            status_code=503,
            content={"error": "execution failed", "detail": detail},
        )

    content = _extract_assistant_text(result["output"])
    completion_id = "chatcmpl-" + uuid.uuid4().hex[:24]

    return {
        "id": completion_id,
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": content},
            "finish_reason": "stop",
        }],
        "usage": {
            "prompt_tokens": 0,
            "completion_tokens": 0,
            "total_tokens": 0,
        },
    }
|