chat-completions: stream conversational turns (SSE token relay) for realtime voice

Adds stream=true support to POST /v1/chat/completions (it previously returned HTTP 400). When streaming, it runs the no-tools `conversational` agent via `claude -p --output-format stream-json --include-partial-messages --verbose` and relays each content_block_delta as an OpenAI chat.completion.chunk SSE event, ending with finish_reason=stop + [DONE]. Free CLI/subscription auth, no tools, no API key. Stateless by design: the full message history is flattened into the prompt (prior assistant turns kept), so an OpenAI-style client that re-sends history each turn — e.g. Pipecat's OpenAILLMService — can stream from us directly. The non-streaming path (recruiter-triage workspace agent) is unchanged. This is phase 1 of the Pipecat realtime full-duplex voice-agent rebuild for portal-assistant (continuous audio, VAD endpointing, barge-in, ~seconds to first words). New pure helpers (stream_argv/delta_text/openai_chunk/synthesise_chat_prompt) are unit-tested; the SSE endpoint has a mocked-subprocess integration test. 429 passing. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-17 22:22:38 +00:00 · 2026-06-17 22:22:38 +00:00 · a29bffdda3
commit a29bffdda3
parent 4e48214c0b
4 changed files with 304 additions and 8 deletions
--- a/app/conversational.py
+++ b/app/conversational.py
@ -96,3 +96,110 @@ async def run_turn(session_id: str, message: str, model: str) -> dict:
        "reply": extract_reply(output_lines),
        "stderr": stderr.decode(errors="replace"),
    }
+
+
+# ---------------------------------------------------------------------------
+# Streaming (OpenAI-compatible) path — token-level deltas for the realtime
+# voice agent. Pipecat's OpenAILLMService streams from /v1/chat/completions and
+# re-sends the FULL history each turn, so this path is STATELESS: the whole
+# dialogue goes in the prompt and we run a fresh CLI with stream-json to relay
+# incremental tokens as OpenAI chat-completion SSE chunks. (run_turn above stays
+# the session-based path for the non-streaming gateway.)
+# ---------------------------------------------------------------------------
+
+
+def stream_argv(prompt: str, model: str) -> list[str]:
+    """Build the `claude` CLI argv for one streaming conversational turn.
+
+    The turn is stateless: the entire dialogue travels in `prompt`, so no
+    --session-id/--resume is passed. `--include-partial-messages` is what makes
+    the CLI emit `content_block_delta` token events, and `--verbose` is
+    required by the CLI for stream-json under --print. The conversational
+    agent has no tools, hence no --dangerously-skip-permissions.
+    """
+    selection = ["--agent", CONVERSATIONAL_AGENT, "--model", model]
+    streaming = [
+        "--output-format", "stream-json",
+        "--include-partial-messages",
+        "--verbose",
+    ]
+    # The prompt is always the trailing positional argument.
+    return ["claude", "-p", *selection, *streaming, prompt]
+
+
+def delta_text(line: str) -> str | None:
+    """Extract the incremental assistant text from one stream-json line.
+
+    Returns the non-empty text of a `stream_event` wrapping a
+    `content_block_delta` / `text_delta`, or None for any other event (system,
+    message_start, content_block_stop, result), a blank or unparseable line, or
+    a payload whose `event` / `delta` / `text` has an unexpected JSON type.
+    Never raises on malformed input, so one odd line cannot abort the relay.
+    """
+    try:
+        event = json.loads(line)
+    except json.JSONDecodeError:  # blank / whitespace-only lines land here too
+        return None
+    if not isinstance(event, dict) or event.get("type") != "stream_event":
+        return None
+    inner = event.get("event")
+    if not isinstance(inner, dict) or inner.get("type") != "content_block_delta":
+        return None
+    delta = inner.get("delta")
+    if not isinstance(delta, dict) or delta.get("type") != "text_delta":
+        return None
+    text = delta.get("text")
+    return text if isinstance(text, str) and text else None
+
+
+def openai_chunk(
+    completion_id: str,
+    model: str,
+    created: int,
+    *,
+    role: str | None = None,
+    content: str | None = None,
+    finish_reason: str | None = None,
+) -> str:
+    """Render one OpenAI `chat.completion.chunk` as a single SSE `data:` event.
+
+    `role` and `content` enter the chunk's `delta` only when not None, while
+    `finish_reason` is always present (JSON null when None). ensure_ascii=False
+    keeps Cyrillic (Bulgarian) readable on the wire instead of escaping it.
+    """
+    optional = {"role": role, "content": content}
+    choice = {
+        "index": 0,
+        "delta": {k: v for k, v in optional.items() if v is not None},
+        "finish_reason": finish_reason,
+    }
+    chunk = {
+        "id": completion_id, "object": "chat.completion.chunk",
+        "created": created, "model": model, "choices": [choice],
+    }
+    return f"data: {json.dumps(chunk, ensure_ascii=False)}\n\n"
+
+
+def synthesise_chat_prompt(messages) -> str:
+    """Flatten OpenAI chat messages into one dialogue prompt, statelessly.
+
+    `messages` is a sequence of duck-typed objects exposing `.role` and
+    `.content`. Non-empty system messages are joined by blank lines into a
+    preamble; non-empty user/assistant messages are replayed in order as
+    `User: ...` / `Assistant: ...` lines (prior assistant turns are KEPT, which
+    is how a client that re-sends its whole history gets multi-turn context).
+    Any other role, and any message with empty content, is dropped.
+    """
+    preamble, dialogue = [], []
+    for msg in messages:
+        role, text = msg.role, msg.content
+        if not text:
+            continue
+        if role == "system":
+            preamble.append(text)
+        elif role == "user":
+            dialogue.append("User: " + text)
+        elif role == "assistant":
+            dialogue.append("Assistant: " + text)
+    sections = ["\n\n".join(preamble), "\n".join(dialogue)]
+    return "\n\n".join(s for s in sections if s).strip()
--- a/app/main.py
+++ b/app/main.py
@ -2,6 +2,8 @@ import asyncio
 import hmac
 import json
 import os
+import shutil
+import tempfile
 import time
 import uuid
 from contextlib import asynccontextmanager
@ -10,7 +12,7 @@ from subprocess import PIPE
 from typing import Any, Literal

 from fastapi import FastAPI, HTTPException, Header
-from fastapi.responses import JSONResponse
+from fastapi.responses import JSONResponse, StreamingResponse
 from pydantic import BaseModel, Field

 from app import conversational
@ -446,9 +448,6 @@ async def chat_completions(
 ):
    verify_token(authorization)

-    if request.stream:
-        raise HTTPException(status_code=400, detail="streaming not supported")
-
    model = request.model if request.model is not None else DEFAULT_MODEL
    if model not in SUPPORTED_MODELS:
        return JSONResponse(
@ -459,6 +458,64 @@ async def chat_completions(
            },
        )

+    # Streaming path (the realtime voice agent / Pipecat). Token-level deltas via
+    # the conversational (no-tools) agent in stream-json mode, relayed as
+    # OpenAI chat.completion.chunk SSE. Stateless: the full history is in the
+    # prompt (the client re-sends it each turn). No workspace clone — the
+    # conversational agent reads no files.
+    if request.stream:
+        if not _reserve_queue_slot():
+            return JSONResponse(
+                status_code=503,
+                content={"error": "execution failed", "detail": "queue full"},
+            )
+        prompt = conversational.synthesise_chat_prompt(request.messages)
+        completion_id = "chatcmpl-" + uuid.uuid4().hex[:24]
+        created = int(time.time())
+        spawn = asyncio.create_subprocess_exec  # bound alias (keeps subprocess use tidy)
+
+        async def event_stream():
+            """Relay one CLI turn as OpenAI SSE: role, text deltas, stop, [DONE].
+
+            Runs inside `_execution_slot()` with a throwaway cwd. However the
+            generator ends (completion, error, or early close), a still-running
+            child is killed and the cwd is removed.
+            """
+            workspace = tempfile.mkdtemp(prefix="conv-stream-")
+            proc = None
+            try:
+                async with _execution_slot():
+                    # stderr is never read on this path, so send it to DEVNULL:
+                    # an undrained PIPE can fill up and block the child
+                    # mid-reply, stalling the stream until the timeout fires.
+                    proc = await spawn(
+                        *conversational.stream_argv(prompt, model),
+                        cwd=workspace,
+                        stdout=PIPE,
+                        stderr=asyncio.subprocess.DEVNULL,
+                    )
+                    assert proc.stdout is not None
+                    # Opening chunk carries only the assistant role.
+                    yield conversational.openai_chunk(
+                        completion_id, model, created, role="assistant"
+                    )
+                    try:
+                        async with asyncio.timeout(
+                            conversational.CONVERSATIONAL_TIMEOUT_SECONDS
+                        ):
+                            # One stream-json event per stdout line; only text
+                            # deltas produce a chunk, everything else is skipped.
+                            async for raw in proc.stdout:
+                                text = conversational.delta_text(
+                                    raw.decode(errors="replace")
+                                )
+                                if text:
+                                    yield conversational.openai_chunk(
+                                        completion_id, model, created, content=text
+                                    )
+                    except asyncio.TimeoutError:
+                        # Wedged turn: stop relaying but still terminate the
+                        # stream normally below (the reply may be truncated).
+                        pass
+                    yield conversational.openai_chunk(
+                        completion_id, model, created, finish_reason="stop"
+                    )
+                    yield "data: [DONE]\n\n"
+            finally:
+                # stdout EOF does not mean the child has been reaped, so kill
+                # and wait whenever no return code has been recorded yet.
+                if proc is not None and proc.returncode is None:
+                    try:
+                        proc.kill()
+                        await proc.wait()
+                    except ProcessLookupError:
+                        pass  # child already exited
+                shutil.rmtree(workspace, ignore_errors=True)
+
+        return StreamingResponse(event_stream(), media_type="text/event-stream")
+
    prompt = _synthesise_prompt(request.messages)

    if not _reserve_queue_slot():