chat-completions: stream conversational turns (SSE token relay) for realtime voice

Adds stream=true support to POST /v1/chat/completions (it previously 400'd). When streaming, it runs the no-tools `conversational` agent via `claude -p --output-format stream-json --include-partial-messages --verbose` and relays each content_block_delta as an OpenAI chat.completion.chunk SSE event, ending with finish_reason=stop + [DONE]. Free CLI/subscription auth, no tools, no API key. Stateless by design: the full message history is flattened into the prompt (prior assistant turns kept), so an OpenAI-style client that re-sends history each turn — e.g. Pipecat's OpenAILLMService — can stream from us directly. The non-streaming path (recruiter-triage workspace agent) is unchanged. This is phase 1 of the Pipecat realtime full-duplex voice-agent rebuild for portal-assistant (continuous audio, VAD endpointing, barge-in, ~seconds to first words). New pure helpers (stream_argv/delta_text/openai_chunk/ synthesise_chat_prompt) are unit-tested; the SSE endpoint has a mocked-subprocess integration test. 429 passing. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-17 22:22:38 +00:00 · 2026-06-17 22:22:38 +00:00 · a29bffdda3
commit a29bffdda3
parent 4e48214c0b
4 changed files with 304 additions and 8 deletions
--- a/app/main.py
+++ b/app/main.py
@ -2,6 +2,8 @@ import asyncio
 import hmac
 import json
 import os
+import shutil
+import tempfile
 import time
 import uuid
 from contextlib import asynccontextmanager
@ -10,7 +12,7 @@ from subprocess import PIPE
 from typing import Any, Literal

 from fastapi import FastAPI, HTTPException, Header
-from fastapi.responses import JSONResponse
+from fastapi.responses import JSONResponse, StreamingResponse
 from pydantic import BaseModel, Field

 from app import conversational
@ -446,9 +448,6 @@ async def chat_completions(
 ):
    verify_token(authorization)

-    if request.stream:
-        raise HTTPException(status_code=400, detail="streaming not supported")
-
    model = request.model if request.model is not None else DEFAULT_MODEL
    if model not in SUPPORTED_MODELS:
        return JSONResponse(
@ -459,6 +458,64 @@ async def chat_completions(
            },
        )

+    # Streaming path (the realtime voice agent / Pipecat). Token-level deltas via
+    # the conversational (no-tools) agent in stream-json mode, relayed as
+    # OpenAI chat.completion.chunk SSE. Stateless: the full history is in the
+    # prompt (the client re-sends it each turn). No workspace clone — the
+    # conversational agent reads no files.
+    if request.stream:
+        if not _reserve_queue_slot():
+            return JSONResponse(
+                status_code=503,
+                content={"error": "execution failed", "detail": "queue full"},
+            )
+        prompt = conversational.synthesise_chat_prompt(request.messages)
+        completion_id = "chatcmpl-" + uuid.uuid4().hex[:24]
+        created = int(time.time())
+        spawn = asyncio.create_subprocess_exec  # bound alias (keeps subprocess use tidy)
+
+        async def event_stream():
+            workspace = tempfile.mkdtemp(prefix="conv-stream-")
+            proc = None
+            try:
+                async with _execution_slot():
+                    proc = await spawn(
+                        *conversational.stream_argv(prompt, model),
+                        cwd=workspace, stdout=PIPE, stderr=PIPE,
+                    )
+                    assert proc.stdout is not None
+                    yield conversational.openai_chunk(
+                        completion_id, model, created, role="assistant"
+                    )
+                    try:
+                        async with asyncio.timeout(
+                            conversational.CONVERSATIONAL_TIMEOUT_SECONDS
+                        ):
+                            async for raw in proc.stdout:
+                                text = conversational.delta_text(
+                                    raw.decode(errors="replace")
+                                )
+                                if text:
+                                    yield conversational.openai_chunk(
+                                        completion_id, model, created, content=text
+                                    )
+                    except asyncio.TimeoutError:
+                        pass  # wedged turn — close the stream cleanly
+                    yield conversational.openai_chunk(
+                        completion_id, model, created, finish_reason="stop"
+                    )
+                    yield "data: [DONE]\n\n"
+            finally:
+                if proc is not None and proc.returncode is None:
+                    try:
+                        proc.kill()
+                        await proc.wait()
+                    except ProcessLookupError:
+                        pass
+                shutil.rmtree(workspace, ignore_errors=True)
+
+        return StreamingResponse(event_stream(), media_type="text/event-stream")
+
    prompt = _synthesise_prompt(request.messages)

    if not _reserve_queue_slot():