docs feedback #343

Merged 2 commits on Feb 19, 2025
6 changes: 5 additions & 1 deletion docs/installation.md
@@ -1,8 +1,12 @@
+!!! warning
+
+    Additional steps are required to use the text-to-speech feature. Please see the [Text-to-Speech](/docs/usage/text-to-speech.md#prerequisite) page.
+
 ## Docker Compose (Recommended)

 !!! note

-    I'm using newer Docker Compsose features. If you are using an older version of Docker Compose, you may need need to update.
+    I'm using newer Docker Compose features. If you are using an older version of Docker Compose, you may need to update.

 Download the necessary Docker Compose files
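Since the note above hinges on the Compose version, here is a minimal sketch (not part of the docs) for checking it before continuing; treating v2 as the required major version is an assumption:

```python
import subprocess

# `docker compose version --short` prints a bare version such as "2.33.1".
out = subprocess.run(
    ["docker", "compose", "version", "--short"],
    capture_output=True, text=True, check=True,
).stdout.strip()

# Assumption: the "newer Docker Compose features" require major version >= 2.
major = int(out.lstrip("v").split(".")[0])
if major < 2:
    raise SystemExit("Docker Compose v2 or newer is required; please update.")
```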

2 changes: 1 addition & 1 deletion docs/usage/text-to-speech.md
@@ -120,4 +120,4 @@ curl http://localhost:8000/v1/audio/speech --header "Content-Type: application/json"
 ## Limitations

 - `response_format`: `opus` and `aac` are not supported
-- Maximuam audio generation length is 10 seconds for `rhasspy/piper-voices` and 30 seconds for `hexgrad/Kokoro-82M`
+- Maximum audio generation length is 10 seconds for `rhasspy/piper-voices` and 30 seconds for `hexgrad/Kokoro-82M`
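For reference, a minimal Python sketch of a request that respects these limits; the voice name and API key value are placeholders rather than values taken from this diff:

```python
from openai import OpenAI

# Assumes the server from the curl example above, listening on localhost:8000.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="does-not-matter")

speech = client.audio.speech.create(
    model="hexgrad/Kokoro-82M",
    voice="af",  # placeholder; use a voice the server actually exposes
    input="A short sentence, well under the 30 second limit.",
    response_format="mp3",  # `opus` and `aac` are not supported
)
speech.write_to_file("speech.mp3")
```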
4 changes: 2 additions & 2 deletions mkdocs.yml
@@ -42,9 +42,9 @@ plugins:
 nav:
   - Introduction: index.md
   - Usage / Capabilities:
-      - Voice Chat: usage/voice-chat.md
-      - Speech-to-Text: usage/speech-to-text.md
       - Text-to-Speech: usage/text-to-speech.md
+      - Speech-to-Text: usage/speech-to-text.md
+      - Voice Chat: usage/voice-chat.md
   - Open WebUI Integration: usage/open-webui-integration.md
   - Installation: installation.md
   - Configuration: configuration.md
4 changes: 3 additions & 1 deletion src/speaches/realtime/conversation_event_router.py
@@ -102,7 +102,9 @@ async def handle_conversation_item_input_audio_transcription_completed_event(
     ctx.response = ResponseHandler(
         completion_client=ctx.completion_client,
         model=ctx.session.model,
-        configuration=Response(**ctx.session.model_dump()),
+        configuration=Response(
+            conversation="auto", input=list(ctx.conversation.items.values()), **ctx.session.model_dump()
+        ),
         conversation=ctx.conversation,
         pubsub=ctx.pubsub,
     )
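The effect of this change is that the response configuration now carries the conversation history rather than only the session settings. A toy sketch with a simplified stand-in for `Response` (the real model lives in `speaches.types.realtime`):

```python
from pydantic import BaseModel


class Response(BaseModel):
    """Simplified stand-in for speaches.types.realtime.Response."""

    conversation: str | None = None
    input: list[dict] | None = None
    temperature: float = 0.8


session_settings = {"temperature": 0.7}  # roughly what ctx.session.model_dump() returns
conversation_items = [{"type": "message", "role": "user"}]  # ctx.conversation.items.values()

config = Response(conversation="auto", input=conversation_items, **session_settings)
assert config.input == conversation_items and config.temperature == 0.7
```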
28 changes: 26 additions & 2 deletions src/speaches/realtime/response_event_router.py
@@ -15,6 +15,7 @@
     items_to_chat_messages,
 )
 from speaches.realtime.event_router import EventRouter
+from speaches.realtime.session_event_router import unsupported_field_error, update_dict
 from speaches.realtime.utils import generate_response_id, task_done_callback
 from speaches.types.realtime import (
     ConversationItemContentAudio,

@@ -261,14 +262,37 @@ def stop(self) -> None:


 @event_router.register("response.create")
-async def handle_response_create_event(ctx: SessionContext, _event: ResponseCreateEvent) -> None:
+async def handle_response_create_event(ctx: SessionContext, event: ResponseCreateEvent) -> None:
     if ctx.response is not None:
         ctx.response.stop()

+    configuration = Response(
+        conversation="auto", input=list(ctx.conversation.items.values()), **ctx.session.model_dump()
+    )
+    if event.response is not None:
+        if event.response.conversation is not None:
+            ctx.pubsub.publish_nowait(unsupported_field_error("response.conversation"))
+        if event.response.input is not None:
+            ctx.pubsub.publish_nowait(unsupported_field_error("response.input"))
+        if event.response.output_audio_format is not None:
+            ctx.pubsub.publish_nowait(unsupported_field_error("response.output_audio_format"))
+        if event.response.metadata is not None:
+            ctx.pubsub.publish_nowait(unsupported_field_error("response.metadata"))
+
+        configuration_dict = configuration.model_dump()
+        configuration_update_dict = event.response.model_dump(
+            exclude_none=True, exclude={"conversation", "input", "output_audio_format", "metadata"}
+        )
+        logger.debug(f"Applying response configuration update: {configuration_update_dict}")
+        logger.debug(f"Response configuration before update: {configuration_dict}")
+        updated_configuration = update_dict(configuration_dict, configuration_update_dict)
+        logger.debug(f"Response configuration after update: {updated_configuration}")
+        configuration = Response(**updated_configuration)
+
     ctx.response = ResponseHandler(
         completion_client=ctx.completion_client,
         model=ctx.session.model,
-        configuration=Response(**ctx.session.model_dump()),  # FIXME
+        configuration=configuration,
         conversation=ctx.conversation,
         pubsub=ctx.pubsub,
     )
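`update_dict` comes from `session_event_router` and is not shown in this diff, so its exact semantics here are an assumption. A recursive overlay like the sketch below would produce the before/after transition that the debug logs record:

```python
def update_dict(base: dict, updates: dict) -> dict:
    """Assumed semantics: recursively overlay `updates` onto a copy of `base`."""
    merged = dict(base)
    for key, value in updates.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = update_dict(merged[key], value)
        else:
            merged[key] = value
    return merged


before = {"temperature": 0.8, "modalities": ["audio", "text"]}
assert update_dict(before, {"temperature": 0.5}) == {
    "temperature": 0.5,
    "modalities": ["audio", "text"],
}
```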
108 changes: 108 additions & 0 deletions tests/realtime_vad_test.py
@@ -0,0 +1,108 @@
import asyncio
import base64
import logging
from pathlib import Path

import numpy as np
from openai import AsyncOpenAI
from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection
from openai.types.beta.realtime.conversation_item_content_param import ConversationItemContentParam
from openai.types.beta.realtime.conversation_item_param import ConversationItemParam
from openai.types.beta.realtime.session_update_event_param import Session, SessionTurnDetection
import pytest
import soundfile as sf
import websockets

from speaches.audio import resample_audio

logger = logging.getLogger(__name__)

SAMPLE_RATE = 24000
SAMPLE_WIDTH = 2
BYTERATE = SAMPLE_RATE * SAMPLE_WIDTH # like "bitrate" but in bytes

WS_BASE_URL = "ws://localhost:8000/v1"
MODEL = "gpt-4o-mini"

RESPONSE_SESSION = Session(turn_detection=SessionTurnDetection(create_response=True))
NO_RESPONSE_SESSION = Session(turn_detection=SessionTurnDetection(create_response=False))


async def audio_sender(
conn: AsyncRealtimeConnection, audio_bytes: bytes, chunks_per_second: int = 10, speed: int = 1
) -> None:
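    # With the defaults, chunk_size = 48000 // 10 = 4800 bytes, i.e. ~100 ms of audio per chunk.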
chunk_size = BYTERATE // chunks_per_second
try:
async with asyncio.TaskGroup() as tg:
for i in range(0, len(audio_bytes), chunk_size):
logger.info(f"Sending audio chunk from {i} to {i + chunk_size} of {len(audio_bytes)}")
audio_chunk = audio_bytes[i : i + chunk_size]
tg.create_task(conn.input_audio_buffer.append(audio=base64.b64encode(audio_chunk).decode("utf-8")))
await asyncio.sleep(1 / chunks_per_second / speed)
except* websockets.exceptions.ConnectionClosedError:
logger.info("Connection closed")


async def print_events(conn: AsyncRealtimeConnection, final_event: str | None = None) -> None:
try:
async for event in conn:
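            # Summarize base64 audio deltas so the printed events stay readable.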
if event.type == "response.audio.delta":
size = len(base64.b64decode(event.delta))
event.delta = f"base64 encoded audio of size {size} bytes"
print(event.model_dump_json())
if final_event is not None and event.type == final_event:
break
except websockets.exceptions.ConnectionClosedError:
logger.info("Connection closed")


# Module-level test fixture: load a short counting sample, resample it to the
# 24 kHz PCM the realtime API expects, and append ~3 s of silence so
# server-side VAD can detect the end of speech.
data, samplerate = sf.read(Path("1_2_3_4_5_6_7_8.wav"), dtype="int16")
pcm_audio_bytes = data.tobytes()
audio_bytes = resample_audio(pcm_audio_bytes, samplerate, 24000)
quiet_audio = np.zeros(SAMPLE_RATE * 3, dtype=np.int16).tobytes()
audio_bytes = audio_bytes + quiet_audio


@pytest.mark.asyncio
@pytest.mark.requires_openai
async def test_realtime_vad_openai() -> None:
realtime_client = AsyncOpenAI(websocket_base_url=WS_BASE_URL).beta.realtime
async with asyncio.TaskGroup() as tg, realtime_client.connect(model=MODEL) as conn:
print_events_task = tg.create_task(
print_events(conn, final_event="conversation.item.input_audio_transcription.completed")
)
await conn.session.update(session=NO_RESPONSE_SESSION)
audio_sender_task = tg.create_task(audio_sender(conn, audio_bytes))
await audio_sender_task
await print_events_task
await conn.close()


@pytest.mark.asyncio
@pytest.mark.requires_openai
async def test_realtime_response() -> None:
realtime_client = AsyncOpenAI(websocket_base_url=WS_BASE_URL).beta.realtime
async with asyncio.TaskGroup() as tg, realtime_client.connect(model=MODEL) as conn:
print_events_task = tg.create_task(print_events(conn, final_event=None))
await conn.session.update(session=RESPONSE_SESSION)
audio_sender_task = tg.create_task(audio_sender(conn, audio_bytes))
await audio_sender_task
await print_events_task
await conn.close()


@pytest.mark.asyncio
@pytest.mark.requires_openai
async def test_realtime_create_conversation_item() -> None:
realtime_client = AsyncOpenAI(websocket_base_url=WS_BASE_URL).beta.realtime
async with asyncio.TaskGroup() as tg, realtime_client.connect(model=MODEL) as conn:
print_events_task = tg.create_task(print_events(conn, final_event="response.done"))
await conn.session.update(session=NO_RESPONSE_SESSION)
await conn.conversation.item.create(
item=ConversationItemParam(
role="user", type="message", content=[ConversationItemContentParam(type="input_text", text="Hello")]
)
)
await conn.response.create()
await print_events_task
await conn.close()
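These tests talk to a live server at `localhost:8000` (see `WS_BASE_URL`) and are gated behind the `requires_openai` marker used in the decorators above. One way to run just this file, assuming the marker is registered in the project's pytest configuration:

```python
import pytest

# Equivalent to: pytest tests/realtime_vad_test.py -m requires_openai
raise SystemExit(pytest.main(["tests/realtime_vad_test.py", "-m", "requires_openai"]))
```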