import asyncio import hmac import json import os import time import uuid from contextlib import asynccontextmanager from datetime import datetime, timezone from subprocess import PIPE from typing import Any, Literal from fastapi import FastAPI, HTTPException, Header from fastapi.responses import JSONResponse from pydantic import BaseModel, Field app = FastAPI(title="Claude Agent Service") API_TOKEN = os.environ.get("API_BEARER_TOKEN", "") # Warm base clone, populated by the init container. Each job clones from this # into its own dir under JOBS_DIR so concurrent calls never share a working # tree (no git index.lock contention, no clobbered edits). BASE_DIR = os.environ.get("WORKSPACE_DIR", "/workspace/infra") JOBS_DIR = os.environ.get("JOBS_DIR", "/workspace/jobs") GIT_CRYPT_KEY = os.environ.get("GIT_CRYPT_KEY", "/secrets/git-crypt/key") # Concurrency. MAX_CONCURRENCY caps simultaneous claude runs ("soft-unbounded" # — a high default rather than a tight limit); excess calls queue FIFO rather # than being rejected. MAX_QUEUE_DEPTH is a safety valve so a runaway burst # can't pin unbounded memory: past it, callers are turned away (429/503). MAX_CONCURRENCY = int(os.environ.get("MAX_CONCURRENCY", "10")) MAX_QUEUE_DEPTH = int(os.environ.get("MAX_QUEUE_DEPTH", "100")) # Completed jobs are evicted from the in-memory registry past this age so the # dict doesn't grow without bound. JOB_TTL_SECONDS = int(os.environ.get("JOB_TTL_SECONDS", "3600")) # Bursts share one base fetch rather than serialising a network round-trip per # job behind the git lock. FETCH_DEBOUNCE_SECONDS = int(os.environ.get("FETCH_DEBOUNCE_SECONDS", "15")) # OpenAI compat: model selection is per-request so callers can pick # Haiku/Sonnet/Opus to control cost. The agent is fixed — `recruiter-triage` # has the broadest tool surface (WebSearch, WebFetch, Read, Grep, Glob, Bash); # the alternative (`beads-task-runner`) is locked to read-only `bd` verbs which # would fail arbitrary OpenAI-API callers. The model on the agent's frontmatter # is overridden by the `--model` CLI flag we pass per-request. # Bare aliases auto-roll forward to the latest published version of each # family. The Claude CLI resolves `haiku` → `claude-haiku-4-5-20251001` # (and bumps it when Anthropic ships a newer Haiku) — letting us avoid # version bumps on every release. Add a specific date-suffixed string here # only if a caller needs to pin against an upcoming roll-forward. SUPPORTED_MODELS: frozenset[str] = frozenset({ "haiku", "sonnet", "opus", # Legacy date-suffixed forms — kept for callers that pinned before the # 2026-06-01 bare-aliases switch (fire-planner < c1c1e22). Drop these # once all consumers have been re-imaged. "claude-haiku-4-5", "claude-sonnet-4-6", "claude-opus-4-7", }) DEFAULT_MODEL = "sonnet" OPENAI_COMPAT_AGENT = "recruiter-triage" OPENAI_COMPAT_BUDGET_USD = 2.0 OPENAI_COMPAT_TIMEOUT_SECONDS = 900 _TERMINAL_STATUSES = frozenset({"completed", "failed", "timeout", "error"}) jobs: dict[str, dict] = {} # Concurrency primitives. The semaphore bounds simultaneous executions; the git # lock is held only for the fast per-job workspace setup/teardown (fetch + # local clone + unlock + rm), NOT for the agent run itself. execution_semaphore = asyncio.Semaphore(MAX_CONCURRENCY) git_lock = asyncio.Lock() inflight_active = 0 inflight_queued = 0 _last_fetch_epoch = 0.0 class ExecuteRequest(BaseModel): prompt: str agent: str max_budget_usd: float = 5.0 timeout_seconds: int = 2700 metadata: dict | None = None class ChatMessage(BaseModel): role: Literal["system", "user", "assistant"] content: str class ChatCompletionsRequest(BaseModel): # `model` is optional: callers that omit it get DEFAULT_MODEL. We still # validate the explicit value against SUPPORTED_MODELS at the route level # so we can return a structured 400 listing the allowed IDs. model: str | None = None messages: list[ChatMessage] = Field(..., min_length=1) max_tokens: int | None = None temperature: float | None = None stream: bool = False # Tolerate (and ignore) other OpenAI fields rather than 422-ing on them. model_config = {"extra": "allow"} def verify_token(authorization: str | None): # Reject everything when the service is unconfigured. compare_digest("", "") # returns True, so without this guard an empty API_TOKEN would happily # accept an empty header. if not API_TOKEN: raise HTTPException(status_code=401, detail="Service unauthenticated") if not authorization or not authorization.startswith("Bearer "): raise HTTPException(status_code=401, detail="Missing bearer token") token = authorization.removeprefix("Bearer ") if not hmac.compare_digest(token, API_TOKEN): raise HTTPException(status_code=401, detail="Invalid token") def _now_iso() -> str: return datetime.now(timezone.utc).isoformat() def _reserve_queue_slot() -> bool: """Admit a call into the queue, or refuse it if the queue is saturated. Returns False when active + queued already fills MAX_QUEUE_DEPTH — the caller should then turn the request away (429/503). """ global inflight_queued if inflight_active + inflight_queued >= MAX_QUEUE_DEPTH: return False inflight_queued += 1 return True @asynccontextmanager async def _execution_slot(): """Hold one concurrency permit for the duration of an agent run. The caller must have reserved a queue slot via `_reserve_queue_slot()` first; this moves it from queued -> active on acquire and always releases. """ global inflight_active, inflight_queued acquired = False try: await execution_semaphore.acquire() acquired = True inflight_queued -= 1 inflight_active += 1 yield finally: if acquired: inflight_active -= 1 execution_semaphore.release() else: # Cancelled while still waiting in the queue. inflight_queued -= 1 def _evict_old_jobs() -> None: now = time.time() stale = [ jid for jid, job in jobs.items() if job.get("status") in _TERMINAL_STATUSES and now - job.get("finished_epoch", now) > JOB_TTL_SECONDS ] for jid in stale: jobs.pop(jid, None) async def _run(*cmd: str, cwd: str | None = None, timeout: float | None = None, check: bool = True, capture: bool = False) -> tuple[int, str]: """Run a subprocess (no shell), optionally capturing stdout. Raises on non-zero unless `check=False`. Used for the git/git-crypt/rm steps of per-job workspace setup.""" proc = await asyncio.create_subprocess_exec( *cmd, cwd=cwd, stdout=PIPE, stderr=PIPE, ) try: out, err = await asyncio.wait_for(proc.communicate(), timeout=timeout) except asyncio.TimeoutError: proc.kill() await proc.wait() raise rc = proc.returncode or 0 if check and rc != 0: raise RuntimeError(f"{cmd[0]} failed ({rc}): {err.decode(errors='replace')[:200]}") return rc, (out.decode(errors="replace") if capture else "") async def _refresh_base() -> None: """Pull the base clone up to origin/master, debounced so a burst of jobs shares one fetch. Failures are tolerated — jobs run against the last good base rather than wedging on a transient network blip.""" global _last_fetch_epoch now = time.time() if now - _last_fetch_epoch < FETCH_DEBOUNCE_SECONDS: return _last_fetch_epoch = now await _run("git", "-C", BASE_DIR, "fetch", "origin", "--prune", timeout=120, check=False) await _run("git", "-C", BASE_DIR, "reset", "--hard", "origin/master", check=False) async def prepare_workspace(job_id: str) -> str: """Create an isolated git checkout for one job and return its path. A local clone of the warm base hardlinks the object store (near-free) and carries only tracked files (no stale .terraform). The git lock is held just for this fast setup, never for the agent run. """ job_dir = os.path.join(JOBS_DIR, job_id) async with git_lock: await _refresh_base() await _run("git", "clone", "--local", BASE_DIR, job_dir) rc, base_origin = await _run( "git", "-C", BASE_DIR, "remote", "get-url", "origin", check=False, capture=True, ) if rc == 0 and base_origin.strip(): await _run("git", "-C", job_dir, "remote", "set-url", "origin", base_origin.strip(), check=False) if GIT_CRYPT_KEY and os.path.exists(GIT_CRYPT_KEY): await _run("git-crypt", "unlock", GIT_CRYPT_KEY, cwd=job_dir, check=False) return job_dir async def cleanup_workspace(path: str | None) -> None: if not path: return await _run("rm", "-rf", path, check=False) async def _invoke_claude_subprocess( prompt: str, agent: str, max_budget_usd: float, workspace: str, model: str | None = None, ) -> dict[str, Any]: """Run the claude CLI once in `workspace` and return a result dict. Holds no lock and does not touch the `jobs` dict, so it is shared by both the background `/execute` path and the synchronous `/v1/chat/completions` path. The caller provides an isolated `workspace` (one per job) as cwd. `model`, when provided, becomes `--model ` on the claude CLI. This overrides whatever `model:` is set in the agent's frontmatter so the OpenAI-compat path can pick Haiku/Sonnet/Opus per-request. """ cmd = [ "claude", "-p", "--agent", agent, "--dangerously-skip-permissions", "--max-budget-usd", str(max_budget_usd), "--output-format", "json", ] if model is not None: cmd.extend(["--model", model]) cmd.append(prompt) proc = await asyncio.create_subprocess_exec( *cmd, cwd=workspace, stdout=PIPE, stderr=PIPE, ) # stdout=PIPE / stderr=PIPE guarantee both streams are present. assert proc.stdout is not None and proc.stderr is not None output_lines: list[str] = [] async for line in proc.stdout: output_lines.append(line.decode()) stderr = await proc.stderr.read() await proc.wait() return { "exit_code": proc.returncode, "output": output_lines, "stderr": stderr.decode(), } async def _run_execute_job(job_id: str, request: ExecuteRequest): """Background worker for /execute: waits for a slot (queued), then runs the agent in an isolated workspace. The timeout covers execution only, never the time spent waiting in the queue.""" workspace = None try: async with _execution_slot(): jobs[job_id]["status"] = "running" jobs[job_id]["started_at"] = _now_iso() workspace = await prepare_workspace(job_id) result = await asyncio.wait_for( _invoke_claude_subprocess( request.prompt, request.agent, request.max_budget_usd, workspace, ), timeout=request.timeout_seconds, ) jobs[job_id].update({ "status": "completed" if result["exit_code"] == 0 else "failed", "exit_code": result["exit_code"], "output": result["output"], "stderr": result["stderr"], "finished_at": _now_iso(), "finished_epoch": time.time(), }) except asyncio.TimeoutError: jobs[job_id].update({ "status": "timeout", "finished_at": _now_iso(), "finished_epoch": time.time(), }) except Exception as exc: jobs[job_id].update({ "status": "error", "error": str(exc), "finished_at": _now_iso(), "finished_epoch": time.time(), }) finally: try: await cleanup_workspace(workspace) except Exception: pass _evict_old_jobs() def _extract_assistant_text(output_lines: list[str]) -> str: """Pull the final assistant text out of `claude -p --output-format json`. The CLI emits a single JSON object on stdout (possibly across multiple lines if it pretty-prints) with a `result` field holding the final assistant message. If parsing fails for any reason, fall back to the raw concatenation so callers always get *something* useful. """ raw = "".join(output_lines).strip() if not raw: return "" try: parsed = json.loads(raw) except json.JSONDecodeError: return raw if isinstance(parsed, dict): for key in ("result", "content", "text"): value = parsed.get(key) if isinstance(value, str) and value: return value return raw def _one_line(text: str, limit: int = 200) -> str: """Collapse multi-line text to a single line, truncated for response bodies.""" flat = " ".join(text.split()) return flat[:limit] def _synthesise_prompt(messages: list[ChatMessage]) -> str: """Flatten OpenAI chat messages into a single prompt body. System messages are surfaced as preamble; user messages become the actual request. Multiple user turns are concatenated in order so a short multi-turn back-and-forth still works (this is a stateless completion — we don't replay prior assistant turns). """ system_parts = [m.content for m in messages if m.role == "system"] user_parts = [m.content for m in messages if m.role == "user"] # Assistant messages from prior turns are intentionally NOT injected — # claude `-p` is stateless and replaying them as user text would # confuse the agent. sections: list[str] = [] if system_parts: sections.append("System instructions:\n" + "\n\n".join(system_parts)) if user_parts: sections.append("Request:\n" + "\n\n".join(user_parts)) if not sections: # Defensive — pydantic min_length=1 should already prevent this. return "" return "\n\n---\n\n".join(sections) @app.get("/health") async def health(): return { "status": "ok", "busy": inflight_active >= MAX_CONCURRENCY, "active": inflight_active, "queued": inflight_queued, "capacity": MAX_CONCURRENCY, } @app.post("/execute", status_code=202) async def execute( request: ExecuteRequest, authorization: str | None = Header(default=None), ): verify_token(authorization) if not _reserve_queue_slot(): raise HTTPException(status_code=429, detail="Queue full") job_id = uuid.uuid4().hex[:12] jobs[job_id] = { "status": "queued", "prompt": request.prompt, "agent": request.agent, "created_at": _now_iso(), "metadata": request.metadata, } asyncio.create_task(_run_execute_job(job_id, request)) return {"job_id": job_id, "status": "queued"} @app.get("/jobs/{job_id}") async def get_job( job_id: str, authorization: str | None = Header(default=None), ): verify_token(authorization) if job_id not in jobs: raise HTTPException(status_code=404, detail="Job not found") return jobs[job_id] @app.post("/v1/chat/completions") async def chat_completions( request: ChatCompletionsRequest, authorization: str | None = Header(default=None), ): verify_token(authorization) if request.stream: raise HTTPException(status_code=400, detail="streaming not supported") model = request.model if request.model is not None else DEFAULT_MODEL if model not in SUPPORTED_MODELS: return JSONResponse( status_code=400, content={ "error": "unsupported model", "supported": sorted(SUPPORTED_MODELS), }, ) prompt = _synthesise_prompt(request.messages) if not _reserve_queue_slot(): return JSONResponse( status_code=503, content={"error": "execution failed", "detail": "queue full"}, ) chat_id = uuid.uuid4().hex[:12] workspace = None try: async with _execution_slot(): workspace = await prepare_workspace(chat_id) result = await asyncio.wait_for( _invoke_claude_subprocess( prompt, OPENAI_COMPAT_AGENT, OPENAI_COMPAT_BUDGET_USD, workspace, model=model, ), timeout=OPENAI_COMPAT_TIMEOUT_SECONDS, ) except asyncio.TimeoutError: return JSONResponse( status_code=503, content={"error": "execution failed", "detail": "agent timed out"}, ) except Exception as exc: return JSONResponse( status_code=503, content={"error": "execution failed", "detail": _one_line(str(exc))}, ) finally: try: await cleanup_workspace(workspace) except Exception: pass if result["exit_code"] != 0: detail = _one_line(result.get("stderr") or "") or f"exit {result['exit_code']}" return JSONResponse( status_code=503, content={"error": "execution failed", "detail": detail}, ) content = _extract_assistant_text(result["output"]) completion_id = "chatcmpl-" + uuid.uuid4().hex[:24] return { "id": completion_id, "object": "chat.completion", "created": int(time.time()), "model": model, "choices": [{ "index": 0, "message": {"role": "assistant", "content": content}, "finish_reason": "stop", }], "usage": { "prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, }, }