docs feedback #343

Merged 2 commits on Feb 19, 2025
6 changes: 5 additions & 1 deletion docs/installation.md
@@ -1,8 +1,12 @@
+!!! warning
+
+    Additional steps are required to use the text-to-speech feature. Please see the [Text-to-Speech](/docs/usage/text-to-speech.md#prerequisite) page.
+
 ## Docker Compose (Recommended)

 !!! note

-    I'm using newer Docker Compsose features. If you are using an older version of Docker Compose, you may need need to update.
+    I'm using newer Docker Compose features. If you are using an older version of Docker Compose, you may need to update.

 Download the necessary Docker Compose files
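Since the note above hinges on the Compose version, here is a minimal sketch (not part of the docs) for checking it before continuing; treating v2 as the required major version is an assumption:

```python
import subprocess

# `docker compose version --short` prints a bare version such as "2.33.1".
out = subprocess.run(
    ["docker", "compose", "version", "--short"],
    capture_output=True, text=True, check=True,
).stdout.strip()

# Assumption: the "newer Docker Compose features" require major version >= 2.
major = int(out.lstrip("v").split(".")[0])
if major < 2:
    raise SystemExit("Docker Compose v2 or newer is required; please update.")
```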

2 changes: 1 addition & 1 deletion docs/usage/text-to-speech.md
@@ -120,4 +120,4 @@ curl http://localhost:8000/v1/audio/speech --header "Content-Type: application/json"
 ## Limitations

 - `response_format`: `opus` and `aac` are not supported
-- Maximuam audio generation length is 10 seconds for `rhasspy/piper-voices` and 30 seconds for `hexgrad/Kokoro-82M`
+- Maximum audio generation length is 10 seconds for `rhasspy/piper-voices` and 30 seconds for `hexgrad/Kokoro-82M`
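For reference, a minimal Python sketch of a request that respects these limits; the voice name and API key value are placeholders rather than values taken from this diff:

```python
from openai import OpenAI

# Assumes the server from the curl example above, listening on localhost:8000.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="does-not-matter")

speech = client.audio.speech.create(
    model="hexgrad/Kokoro-82M",
    voice="af",  # placeholder; use a voice the server actually exposes
    input="A short sentence, well under the 30 second limit.",
    response_format="mp3",  # `opus` and `aac` are not supported
)
speech.write_to_file("speech.mp3")
```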
4 changes: 2 additions & 2 deletions mkdocs.yml
@@ -42,9 +42,9 @@ plugins:
 nav:
   - Introduction: index.md
   - Usage / Capabilities:
-      - Voice Chat: usage/voice-chat.md
-      - Speech-to-Text: usage/speech-to-text.md
       - Text-to-Speech: usage/text-to-speech.md
+      - Speech-to-Text: usage/speech-to-text.md
+      - Voice Chat: usage/voice-chat.md
   - Open WebUI Integration: usage/open-webui-integration.md
   - Installation: installation.md
   - Configuration: configuration.md
4 changes: 3 additions & 1 deletion src/speaches/realtime/conversation_event_router.py
@@ -102,7 +102,9 @@ async def handle_conversation_item_input_audio_transcription_completed_event(
     ctx.response = ResponseHandler(
         completion_client=ctx.completion_client,
         model=ctx.session.model,
-        configuration=Response(**ctx.session.model_dump()),
+        configuration=Response(
+            conversation="auto", input=list(ctx.conversation.items.values()), **ctx.session.model_dump()
+        ),
         conversation=ctx.conversation,
         pubsub=ctx.pubsub,
     )
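The effect of this change is that the response configuration now carries the conversation history rather than only the session settings. A toy sketch with a simplified stand-in for `Response` (the real model lives in `speaches.types.realtime`):

```python
from pydantic import BaseModel


class Response(BaseModel):
    """Simplified stand-in for speaches.types.realtime.Response."""

    conversation: str | None = None
    input: list[dict] | None = None
    temperature: float = 0.8


session_settings = {"temperature": 0.7}  # roughly what ctx.session.model_dump() returns
conversation_items = [{"type": "message", "role": "user"}]  # ctx.conversation.items.values()

config = Response(conversation="auto", input=conversation_items, **session_settings)
assert config.input == conversation_items and config.temperature == 0.7
```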
28 changes: 26 additions & 2 deletions src/speaches/realtime/response_event_router.py
@@ -15,6 +15,7 @@
     items_to_chat_messages,
 )
 from speaches.realtime.event_router import EventRouter
+from speaches.realtime.session_event_router import unsupported_field_error, update_dict
 from speaches.realtime.utils import generate_response_id, task_done_callback
 from speaches.types.realtime import (
     ConversationItemContentAudio,

@@ -261,14 +262,37 @@ def stop(self) -> None:


 @event_router.register("response.create")
-async def handle_response_create_event(ctx: SessionContext, _event: ResponseCreateEvent) -> None:
+async def handle_response_create_event(ctx: SessionContext, event: ResponseCreateEvent) -> None:
     if ctx.response is not None:
         ctx.response.stop()

+    configuration = Response(
+        conversation="auto", input=list(ctx.conversation.items.values()), **ctx.session.model_dump()
+    )
+    if event.response is not None:
+        if event.response.conversation is not None:
+            ctx.pubsub.publish_nowait(unsupported_field_error("response.conversation"))
+        if event.response.input is not None:
+            ctx.pubsub.publish_nowait(unsupported_field_error("response.input"))
+        if event.response.output_audio_format is not None:
+            ctx.pubsub.publish_nowait(unsupported_field_error("response.output_audio_format"))
+        if event.response.metadata is not None:
+            ctx.pubsub.publish_nowait(unsupported_field_error("response.metadata"))
+
+        configuration_dict = configuration.model_dump()
+        configuration_update_dict = event.response.model_dump(
+            exclude_none=True, exclude={"conversation", "input", "output_audio_format", "metadata"}
+        )
+        logger.debug(f"Applying response configuration update: {configuration_update_dict}")
+        logger.debug(f"Response configuration before update: {configuration_dict}")
+        updated_configuration = update_dict(configuration_dict, configuration_update_dict)
+        logger.debug(f"Response configuration after update: {updated_configuration}")
+        configuration = Response(**updated_configuration)
+
     ctx.response = ResponseHandler(
         completion_client=ctx.completion_client,
         model=ctx.session.model,
-        configuration=Response(**ctx.session.model_dump()),  # FIXME
+        configuration=configuration,
         conversation=ctx.conversation,
         pubsub=ctx.pubsub,
     )
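`update_dict` comes from `session_event_router` and is not shown in this diff, so its exact semantics here are an assumption. A recursive overlay like the sketch below would produce the before/after transition that the debug logs record:

```python
def update_dict(base: dict, updates: dict) -> dict:
    """Assumed semantics: recursively overlay `updates` onto a copy of `base`."""
    merged = dict(base)
    for key, value in updates.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = update_dict(merged[key], value)
        else:
            merged[key] = value
    return merged


before = {"temperature": 0.8, "modalities": ["audio", "text"]}
assert update_dict(before, {"temperature": 0.5}) == {
    "temperature": 0.5,
    "modalities": ["audio", "text"],
}
```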
108 changes: 108 additions & 0 deletions tests/realtime_vad_test.py
@@ -0,0 +1,108 @@
import asyncio
import base64
import logging
from pathlib import Path

import numpy as np
from openai import AsyncOpenAI
from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection
from openai.types.beta.realtime.conversation_item_content_param import ConversationItemContentParam
from openai.types.beta.realtime.conversation_item_param import ConversationItemParam
from openai.types.beta.realtime.session_update_event_param import Session, SessionTurnDetection
import pytest
import soundfile as sf
import websockets

from speaches.audio import resample_audio

logger = logging.getLogger(__name__)

SAMPLE_RATE = 24000
SAMPLE_WIDTH = 2
BYTERATE = SAMPLE_RATE * SAMPLE_WIDTH # like "bitrate" but in bytes

WS_BASE_URL = "ws://localhost:8000/v1"
MODEL = "gpt-4o-mini"

RESPONSE_SESSION = Session(turn_detection=SessionTurnDetection(create_response=True))
NO_RESPONSE_SESSION = Session(turn_detection=SessionTurnDetection(create_response=False))


async def audio_sender(
conn: AsyncRealtimeConnection, audio_bytes: bytes, chunks_per_second: int = 10, speed: int = 1
) -> None:
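    # With the defaults, chunk_size = 48000 // 10 = 4800 bytes, i.e. ~100 ms of audio per chunk.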
chunk_size = BYTERATE // chunks_per_second
try:
async with asyncio.TaskGroup() as tg:
for i in range(0, len(audio_bytes), chunk_size):
logger.info(f"Sending audio chunk from {i} to {i + chunk_size} of {len(audio_bytes)}")
audio_chunk = audio_bytes[i : i + chunk_size]
tg.create_task(conn.input_audio_buffer.append(audio=base64.b64encode(audio_chunk).decode("utf-8")))
await asyncio.sleep(1 / chunks_per_second / speed)
except* websockets.exceptions.ConnectionClosedError:
logger.info("Connection closed")


async def print_events(conn: AsyncRealtimeConnection, final_event: str | None = None) -> None:
try:
async for event in conn:
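            # Summarize base64 audio deltas so the printed events stay readable.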
if event.type == "response.audio.delta":
size = len(base64.b64decode(event.delta))
event.delta = f"base64 encoded audio of size {size} bytes"
print(event.model_dump_json())
if final_event is not None and event.type == final_event:
break
except websockets.exceptions.ConnectionClosedError:
logger.info("Connection closed")


# Module-level test fixture: load a short counting sample, resample it to the
# 24 kHz PCM the realtime API expects, and append ~3 s of silence so
# server-side VAD can detect the end of speech.
data, samplerate = sf.read(Path("1_2_3_4_5_6_7_8.wav"), dtype="int16")
pcm_audio_bytes = data.tobytes()
audio_bytes = resample_audio(pcm_audio_bytes, samplerate, 24000)
quiet_audio = np.zeros(SAMPLE_RATE * 3, dtype=np.int16).tobytes()
audio_bytes = audio_bytes + quiet_audio


@pytest.mark.asyncio
@pytest.mark.requires_openai
async def test_realtime_vad_openai() -> None:
realtime_client = AsyncOpenAI(websocket_base_url=WS_BASE_URL).beta.realtime
async with asyncio.TaskGroup() as tg, realtime_client.connect(model=MODEL) as conn:
print_events_task = tg.create_task(
print_events(conn, final_event="conversation.item.input_audio_transcription.completed")
)
await conn.session.update(session=NO_RESPONSE_SESSION)
audio_sender_task = tg.create_task(audio_sender(conn, audio_bytes))
await audio_sender_task
await print_events_task
await conn.close()


@pytest.mark.asyncio
@pytest.mark.requires_openai
async def test_realtime_response() -> None:
realtime_client = AsyncOpenAI(websocket_base_url=WS_BASE_URL).beta.realtime
async with asyncio.TaskGroup() as tg, realtime_client.connect(model=MODEL) as conn:
print_events_task = tg.create_task(print_events(conn, final_event=None))
await conn.session.update(session=RESPONSE_SESSION)
audio_sender_task = tg.create_task(audio_sender(conn, audio_bytes))
await audio_sender_task
await print_events_task
await conn.close()


@pytest.mark.asyncio
@pytest.mark.requires_openai
async def test_realtime_create_conversation_item() -> None:
realtime_client = AsyncOpenAI(websocket_base_url=WS_BASE_URL).beta.realtime
async with asyncio.TaskGroup() as tg, realtime_client.connect(model=MODEL) as conn:
print_events_task = tg.create_task(print_events(conn, final_event="response.done"))
await conn.session.update(session=NO_RESPONSE_SESSION)
await conn.conversation.item.create(
item=ConversationItemParam(
role="user", type="message", content=[ConversationItemContentParam(type="input_text", text="Hello")]
)
)
await conn.response.create()
await print_events_task
await conn.close()
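These tests talk to a live server at `localhost:8000` (see `WS_BASE_URL`) and are gated behind the `requires_openai` marker used in the decorators above. One way to run just this file, assuming the marker is registered in the project's pytest configuration:

```python
import pytest

# Equivalent to: pytest tests/realtime_vad_test.py -m requires_openai
raise SystemExit(pytest.main(["tests/realtime_vad_test.py", "-m", "requires_openai"]))
```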