chat-completions: stream conversational turns (SSE token relay) for realtime voice
Some checks failed
Build and Push / lint-and-test (push) Has been cancelled
Build and Push / build (push) Has been cancelled
Build and Push / deploy (push) Has been cancelled
Build and Push / notify-failure (push) Has been cancelled

Adds stream=true support to POST /v1/chat/completions (it previously 400'd).
When streaming, it runs the no-tools `conversational` agent via
`claude -p --output-format stream-json --include-partial-messages --verbose`
and relays each content_block_delta as an OpenAI chat.completion.chunk SSE
event, ending with finish_reason=stop + [DONE]. Free CLI/subscription auth, no
tools, no API key.

Stateless by design: the full message history is flattened into the prompt
(prior assistant turns kept), so an OpenAI-style client that re-sends history
each turn — e.g. Pipecat's OpenAILLMService — can stream from us directly. The
non-streaming path (recruiter-triage workspace agent) is unchanged.

This is phase 1 of the Pipecat realtime full-duplex voice-agent rebuild for
portal-assistant (continuous audio, VAD endpointing, barge-in, ~seconds to
first words). New pure helpers (stream_argv/delta_text/openai_chunk/
synthesise_chat_prompt) are unit-tested; the SSE endpoint has a mocked-subprocess
integration test. 429 tests passing.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Viktor Barzin 2026-06-17 22:22:38 +00:00
parent 4e48214c0b
commit a29bffdda3
4 changed files with 304 additions and 8 deletions

View file

@ -171,3 +171,79 @@ async def test_conversational_returns_503_on_failure(auth_header):
)
assert r.status_code == 503
assert r.json()["error"] == "execution failed"
# --------------------------------------------------------------------------- #
# streaming helpers (OpenAI-compatible token relay for the realtime voice agent)
# --------------------------------------------------------------------------- #
from collections import namedtuple # noqa: E402
# Lightweight message stand-in exposing `role` and `content` attributes.
_Msg = namedtuple("_Msg", ["role", "content"])
def test_stream_argv_uses_stream_json_and_is_stateless():
    """stream_argv builds a stateless, tool-free streaming `claude -p` command."""
    argv = conversational.stream_argv("hello", "sonnet")

    assert argv[:2] == ["claude", "-p"]

    # Each value must directly follow its own flag; plain membership checks
    # would still pass if the values were swapped or attached elsewhere.
    assert "--agent" in argv
    assert argv[argv.index("--agent") + 1] == "conversational"
    assert "--model" in argv
    assert argv[argv.index("--model") + 1] == "sonnet"

    assert "stream-json" in argv
    assert "--include-partial-messages" in argv
    assert "--verbose" in argv

    # The prompt is passed as the final argument.
    assert argv[-1] == "hello"

    # stateless + no tools
    assert "--resume" not in argv
    assert "--session-id" not in argv
    assert "--dangerously-skip-permissions" not in argv
def test_delta_text_extracts_content_block_delta():
    """A text_delta nested in a content_block_delta event yields its text."""
    delta = {"type": "text_delta", "text": "Слон"}
    event = {"type": "content_block_delta", "delta": delta}
    line = json.dumps({"type": "stream_event", "event": event})

    assert conversational.delta_text(line) == "Слон"
def test_delta_text_ignores_non_text_events():
    """Events without a text delta, blank lines and non-JSON all yield None."""
    json_delta = {"type": "input_json_delta", "partial_json": "{"}
    non_text_events = (
        {"type": "system"},
        {"type": "stream_event", "event": {"type": "message_start"}},
        {"type": "stream_event",
         "event": {"type": "content_block_delta", "delta": json_delta}},
        {"type": "result"},
    )

    raw_lines = [json.dumps(ev) for ev in non_text_events]
    # Blank and unparseable lines must be ignored too.
    raw_lines += ["", "not json"]

    for raw in raw_lines:
        assert conversational.delta_text(raw) is None
def test_openai_chunk_valid_sse_and_keeps_cyrillic():
    """openai_chunk emits one SSE `data:` frame with unescaped Cyrillic text."""
    prefix = "data: "
    frame = conversational.openai_chunk(
        "chatcmpl-x", "sonnet", 123, content="две"
    )

    # SSE framing: `data: ` prefix and a blank-line terminator.
    assert frame.startswith(prefix)
    assert frame.endswith("\n\n")

    payload = json.loads(frame[len(prefix):].strip())
    choice = payload["choices"][0]
    assert payload["object"] == "chat.completion.chunk"
    assert choice["delta"]["content"] == "две"
    assert choice["finish_reason"] is None
    assert "две" in frame  # not unicode-escaped
def test_openai_chunk_role_and_finish():
    """Role-only and finish chunks carry exactly the expected delta."""
    # Named prefix instead of a bare `6`, matching the sibling SSE test.
    prefix = "data: "

    role = conversational.openai_chunk("id", "m", 1, role="assistant")
    assert role.startswith(prefix)
    role_choice = json.loads(role[len(prefix):].strip())["choices"][0]
    assert role_choice["delta"] == {"role": "assistant"}

    stop = conversational.openai_chunk("id", "m", 1, finish_reason="stop")
    assert stop.startswith(prefix)
    stop_choice = json.loads(stop[len(prefix):].strip())["choices"][0]
    # Separate asserts so a failure pinpoints which field is wrong.
    assert stop_choice["finish_reason"] == "stop"
    assert stop_choice["delta"] == {}
def test_synthesise_chat_prompt_keeps_assistant_turns():
    """The flattened prompt keeps system text and prior turns, ending on the last user turn."""
    history = [
        _Msg(role="system", content="Be brief."),
        _Msg(role="user", content="Здравей"),
        _Msg(role="assistant", content="Здравей! Как си?"),
        _Msg(role="user", content="Добре, ти?"),
    ]

    prompt = conversational.synthesise_chat_prompt(history)

    for expected in ("Be brief.", "User: Здравей", "Assistant: Здравей! Как си?"):
        assert expected in prompt
    # The newest user message must be the last thing in the prompt.
    assert prompt.strip().endswith("User: Добре, ти?")

View file

@ -98,14 +98,15 @@ async def test_chat_completions_happy_path(auth_header):
@pytest.mark.asyncio
async def test_chat_completions_rejects_streaming(auth_header):
"""stream=true is not supported and must 400 with a clear message."""
async def test_chat_completions_streaming_rejects_unsupported_model(auth_header):
"""Streaming is supported now; model validation still runs first, so an
unsupported model 400s before any CLI is spawned."""
transport = ASGITransport(app=app)
async with AsyncClient(transport=transport, base_url="http://test") as client:
response = await client.post(
"/v1/chat/completions",
json={
"model": "haiku",
"model": "gpt-4",
"messages": [{"role": "user", "content": "hi"}],
"stream": True,
},
@ -113,7 +114,7 @@ async def test_chat_completions_rejects_streaming(auth_header):
)
assert response.status_code == 400
body = response.json()
assert "streaming not supported" in json.dumps(body).lower()
assert "unsupported model" in json.dumps(body).lower()
@pytest.mark.asyncio
@ -370,3 +371,58 @@ async def test_chat_completions_response_model_echoes_default_when_missing(auth_
)
assert status == 200
assert body["model"] == "sonnet"
def _delta_line(text: str) -> str:
    """Return one JSON line shaped like a stream-json text delta event.

    The result is a `stream_event` wrapping a `content_block_delta` whose
    `text_delta` carries *text*.
    """
    delta = {"type": "text_delta", "text": text}
    event = {"type": "content_block_delta", "delta": delta}
    return json.dumps({"type": "stream_event", "event": event})
@pytest.mark.asyncio
async def test_chat_completions_streaming_relays_token_sse(auth_header):
    """stream=true relays CLI stream-json token deltas as OpenAI SSE chunks."""
    # Fake CLI stdout: two text deltas surrounded by non-text events.
    cli_lines = [
        json.dumps({"type": "system"}),
        json.dumps({"type": "stream_event", "event": {"type": "message_start"}}),
        _delta_line("Две"),
        _delta_line(" точки."),
        json.dumps({"type": "result", "subtype": "success"}),
    ]
    mock_proc = _mock_subprocess_returning(
        "\n".join(cli_lines).encode(), returncode=0
    )
    request_body = {
        "model": "sonnet",
        "stream": True,
        "messages": [{"role": "user", "content": "Колко е?"}],
    }

    with patch("app.main.asyncio.create_subprocess_exec", return_value=mock_proc):
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            response = await client.post(
                "/v1/chat/completions",
                json=request_body,
                headers=auth_header,
            )

    assert response.status_code == 200, response.text
    assert response.headers["content-type"].startswith("text/event-stream")

    body = response.text
    assert "chat.completion.chunk" in body
    assert body.rstrip().endswith("data: [DONE]")

    # Reassemble the streamed assistant content from the delta chunks.
    sse_prefix = "data: "
    pieces = []
    saw_role = False
    for line in body.splitlines():
        if not line.startswith(sse_prefix):
            continue
        if line.strip() == "data: [DONE]":
            continue
        payload = json.loads(line[len(sse_prefix):])
        assert payload["object"] == "chat.completion.chunk"
        delta = payload["choices"][0]["delta"]
        if delta.get("role") == "assistant":
            saw_role = True
        pieces.append(delta.get("content", ""))

    assert saw_role
    assert "".join(pieces) == "Две точки."