From c244dbb5bebc95a6b199a5d89a572c6be1adbfba Mon Sep 17 00:00:00 2001 From: Allen Nikka Date: Wed, 13 Nov 2024 13:23:05 -0800 Subject: [PATCH 1/3] Init commit of local piper docker compose container with asyncio and semaphore based locking to limit local concurrency --- .gitignore | 5 +- .../tts_providers/base_tts_provider.py | 27 +- .../piper_docker_tts_provider.py | 252 ++++++++++++++++++ docker-compose.yml | 48 ++++ requirements.txt | 8 +- 5 files changed, 333 insertions(+), 7 deletions(-) create mode 100644 audiobook_generator/tts_providers/piper_docker_tts_provider.py create mode 100644 docker-compose.yml diff --git a/.gitignore b/.gitignore index 1081772..7bbd194 100644 --- a/.gitignore +++ b/.gitignore @@ -32,4 +32,7 @@ private_examples/ # custom scripts/ *.onnx -*.onnx.json \ No newline at end of file +*.onnx.json + +# Models +piper_models/* diff --git a/audiobook_generator/tts_providers/base_tts_provider.py b/audiobook_generator/tts_providers/base_tts_provider.py index 3fed376..ab0d402 100644 --- a/audiobook_generator/tts_providers/base_tts_provider.py +++ b/audiobook_generator/tts_providers/base_tts_provider.py @@ -5,7 +5,8 @@ TTS_AZURE = "azure" TTS_OPENAI = "openai" TTS_EDGE = "edge" -TTS_PIPER = 'piper' +TTS_PIPER = "piper" +TTS_PIPER_DOCKER = "piper_docker" class BaseTTSProvider: # Base interface for TTS providers @@ -35,21 +36,37 @@ def get_output_file_extension(self): # Common support methods for all TTS providers def get_supported_tts_providers() -> List[str]: - return [TTS_AZURE, TTS_OPENAI, TTS_EDGE, TTS_PIPER] + return [TTS_AZURE, TTS_OPENAI, TTS_EDGE, TTS_PIPER, TTS_PIPER_DOCKER] def get_tts_provider(config) -> BaseTTSProvider: if config.tts == TTS_AZURE: - from audiobook_generator.tts_providers.azure_tts_provider import AzureTTSProvider + from audiobook_generator.tts_providers.azure_tts_provider import ( + AzureTTSProvider, + ) + return AzureTTSProvider(config) elif config.tts == TTS_OPENAI: - from 
audiobook_generator.tts_providers.openai_tts_provider import OpenAITTSProvider + from audiobook_generator.tts_providers.openai_tts_provider import ( + OpenAITTSProvider, + ) + return OpenAITTSProvider(config) elif config.tts == TTS_EDGE: from audiobook_generator.tts_providers.edge_tts_provider import EdgeTTSProvider + return EdgeTTSProvider(config) elif config.tts == TTS_PIPER: - from audiobook_generator.tts_providers.piper_tts_provider import PiperTTSProvider + from audiobook_generator.tts_providers.piper_tts_provider import ( + PiperTTSProvider, + ) + return PiperTTSProvider(config) + elif config.tts == TTS_PIPER_DOCKER: + from audiobook_generator.tts_providers.piper_docker_tts_provider import ( + PiperDockerTTSProvider, + ) + + return PiperDockerTTSProvider(config) else: raise ValueError(f"Invalid TTS provider: {config.tts}") diff --git a/audiobook_generator/tts_providers/piper_docker_tts_provider.py b/audiobook_generator/tts_providers/piper_docker_tts_provider.py new file mode 100644 index 0000000..7d71c01 --- /dev/null +++ b/audiobook_generator/tts_providers/piper_docker_tts_provider.py @@ -0,0 +1,252 @@ +from math import e +import os +import asyncio +import logging +import timeit +from typing import Optional, Union, List, Tuple + + +from pydub import AudioSegment +from wyoming.client import AsyncTcpClient +from wyoming.tts import Synthesize + +from audiobook_generator.config.general_config import GeneralConfig +from audiobook_generator.core.audio_tags import AudioTags +from audiobook_generator.core.utils import set_audio_tags +from audiobook_generator.tts_providers.base_tts_provider import BaseTTSProvider + +logger = logging.getLogger(__name__) + +__all__ = ["PiperDockerTTSProvider"] + + +class PiperCommWithPauses: + def __init__( + self, + text: str, + break_string: str = " ", + break_duration: int = 1250, + output_format: str = "mp3", + **kwargs, + ): + self.full_text = text + self.host = os.getenv("PIPER_HOST", "piper") + self.port = 
int(os.getenv("PIPER_PORT", 10200)) + self.break_string = break_string + self.break_duration = int(break_duration) + self.output_format = output_format + self.client: Optional[AsyncTcpClient] = None + + self.parsed = self.parse_text() + + def parse_text(self) -> List[str]: + logger.debug( + f"Parsing the text, looking for breaks/pauses using break string: '{self.break_string}'" + ) + if self.break_string not in self.full_text or not self.break_string: + logger.debug("No breaks/pauses found in the text") + return [self.full_text] + + parts = self.full_text.split(self.break_string) + parts = [part for part in parts if part.strip() != ""] + new_parts = [ + self.break_string.join(parts[i : i + 10]) for i in range(0, len(parts), 10) + ] + logger.debug(f"Split into {len(new_parts)} parts") + return new_parts + + def generate_pause(self, duration_ms: int) -> AudioSegment: + logger.debug(f"Generating pause of {duration_ms} ms") + # Generate a silent AudioSegment as a pause + silent = AudioSegment.silent(duration=duration_ms) + return silent + + async def synthesize_and_convert_with_semaphore( + self, idx_text: Tuple[int, str], sem: asyncio.Semaphore + ) -> Tuple[int, AudioSegment]: + async with sem: + return await self.synthesize_and_convert(idx_text) + + async def synthesize(self, text: str) -> Tuple[bytes, int, int, int]: + """Sends a synthesis request to the Piper TTS server and returns the audio data and metadata.""" + + audio_data, sample_rate, sample_width, channels = await self.synthesize_speech( + text, host=self.host, port=self.port + ) + if not audio_data: + logger.error("No audio data received") + return b"", 0, 0, 0 + return audio_data, sample_rate, sample_width, channels + + async def synthesize_and_convert( + self, idx_text: Tuple[int, str] + ) -> Tuple[int, AudioSegment]: + """Asynchronously synthesizes text and returns a tuple of index and AudioSegment.""" + idx, text = idx_text + audio_data, rate, width, channels = await self.synthesize(text) + if 
audio_data == b"": + raise ValueError("No audio data received") + # Ensure sample_width is in bytes per sample + if width > 4: # Assume width is in bits + width = width // 8 + # Convert audio data (bytes) to AudioSegment + audio_segment = AudioSegment( + data=audio_data, + sample_width=width, + frame_rate=rate, + channels=channels, + ) + return idx, audio_segment + + async def chunkify(self) -> AudioSegment: + """Old perf: 11x realtime + + Returns: + AudioSegment: _description_ + """ + logger.debug("Starting chunkify process") + # Prepare the list of texts with their indices + + indexed_texts = list(enumerate(self.parsed)) + max_concurrent_tasks = 5 + sem = asyncio.Semaphore(max_concurrent_tasks) + + tasks = [ + self.synthesize_and_convert_with_semaphore(idx_text, sem) + for idx_text in indexed_texts + ] + + results = [] + start = timeit.default_timer() + for task in asyncio.as_completed(tasks): + result = await task + results.append(result) + now = timeit.default_timer() + elapsed = now - start + total_seconds_remaining = (len(tasks) - len(results)) * ( + elapsed / max(1, len(results)) + ) + estimated_remaining_time_m = total_seconds_remaining // 60 + estimated_remaining_time_s = total_seconds_remaining % 60 + print( + f"Processed {len(results)} of {len(tasks)} chunks in chapter. 
Estimated time remaining for chapter: {round(estimated_remaining_time_m)} min, {round(estimated_remaining_time_s)} sec", + end="\r", + flush=True, + ) + + # results = await asyncio.gather(*tasks, return_exceptions=True) + + audio_segments = [] + # Collect results and reconstruct the audio segments in order + results_dict = {} + for result in results: + if isinstance(result, Exception): + logger.error(f"An error occurred during synthesis: {result}") + continue + if not isinstance(result, tuple): + logger.error(f"Unexpected result: {result}") + continue + idx, audio_segment = result + results_dict[idx] = audio_segment + + for idx in range(len(self.parsed)): + audio_segment = results_dict.get(idx) + if audio_segment: + audio_segments.append(audio_segment) + if idx < len(self.parsed) - 1 and self.break_duration > 0: + # Insert pause + pause_segment = self.generate_pause(self.break_duration) + audio_segments.append(pause_segment) + else: + logger.error(f"Missing audio segment at index {idx}") + + # Stitch the audio segments together + combined = sum(audio_segments, AudioSegment.empty()) + logger.debug("Chunkify process completed") + return combined + + def save(self, audio_fname: Union[str, bytes]) -> None: + combined = asyncio.run(self.chunkify()) + # Export the combined audio to the desired format + combined.export(audio_fname, format=self.output_format) + logger.info(f"Audio saved to: {audio_fname}") + + def get_client(self, host: str, port: int) -> AsyncTcpClient: + # if not self.client: + # self.client = AsyncTcpClient(host, port) + # return self.client + return AsyncTcpClient(host, port) + + async def synthesize_speech(self, text: str, host: str, port: int): + client = self.get_client(host, port) + synthesize = Synthesize(text=text) + request_event = synthesize.event() + + audio_data = bytearray() + sample_rate = 22050 # Default sample rate + sample_width = 2 # Default to 16-bit audio + channels = 1 # Default to mono + + async with client: + await 
client.write_event(request_event) + + while True: + response_event = await client.read_event() + if response_event is None: + break + + if response_event.type == "audio-start": + # Extract audio metadata if available + sample_rate = response_event.data.get("rate", sample_rate) + sample_width = response_event.data.get("width", sample_width) + channels = response_event.data.get("channels", channels) + elif response_event.type == "audio-chunk" and response_event.payload: + audio_data.extend(response_event.payload) + elif response_event.type == "audio-stop": + return bytes(audio_data), sample_rate, sample_width, channels + else: + raise ValueError(f"Unexpected event type: {response_event.type}") + return None, sample_rate, sample_width, channels + + +class PiperDockerTTSProvider(BaseTTSProvider): + def __init__(self, config: GeneralConfig): + # TTS provider specific config + config.output_format = config.output_format or "mp3" + config.break_duration = int(config.break_duration or 1250) # in milliseconds + + self.price = 0.000 # Piper is free to use + super().__init__(config) + + def __str__(self) -> str: + return f"PiperDockerTTSProvider(config={self.config})" + + def validate_config(self): + # Add any necessary validation for the config here + pass + + def text_to_speech( + self, + text: str, + output_file: str, + audio_tags: AudioTags, + ): + piper_comm = PiperCommWithPauses( + text=text, + break_string=self.get_break_string().strip(), + break_duration=self.config.break_duration, + output_format=self.config.output_format, + ) + + piper_comm.save(output_file) + + set_audio_tags(output_file, audio_tags) + + def estimate_cost(self, total_chars): + return 0 # Piper is free + + def get_break_string(self): + return "." 
# Sentence-ending period: the text is split into chunks on this string + + def get_output_file_extension(self): + return self.config.output_format diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..75d985a --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,48 @@ +# Be sure to replace and with your actual +# Azure Text-to-Speech API credentials. Also, replace your_book.epub with the +# name of your EPUB file, and audiobook_output with the name of the directory +# where you want to save the output files. + +# After creating and saving the `docker-compose.yml` file, run the +# `docker-compose up` command in the same directory to pull the image and start +# the conversion process. + +# You can then try to modify volumes to fit your need. + +services: + piper: + image: lscr.io/linuxserver/piper:latest + container_name: piper + environment: + - PUID=1000 + - PGID=1000 + - TZ=Etc/UTC + - PIPER_VOICE=en_US-norman-medium + - PIPER_LENGTH=1.0 # optional + - PIPER_NOISE=0.667 # optional + - PIPER_NOISEW=0.333 # optional + - PIPER_SPEAKER=0 # optional + - PIPER_PROCS=1 # optional + volumes: + # - /path/to/piper/data:/config + - /Users/allennikka/Developer/epub_to_audiobook/piper_models:/usr/share/piper_models # Volume for Piper models + ports: + - 10200:10200 + restart: unless-stopped + + epub_to_audiobook: + build: + context: ./ # Directory containing the Dockerfile for epub_to_audiobook + dockerfile: Dockerfile # Name of the Dockerfile (if it's not the default 'Dockerfile') + container_name: epub_to_audiobook + environment: + - TTS_PROVIDER=piper_docker + - PIPER_HOST=piper + - PIPER_PORT=10200 + volumes: + - ./:/app + - /Users/allennikka/Calibre Library:/calibre_library # Map the Calibre Library directory + + command: "--tts piper_docker --no_prompt '/calibre_library/Erick Ries/The Lean Startup Erick Ries (15)/The Lean Startup Erick Ries - Erick Ries.epub' audiobook_output --chapter_start 1 --chapter_end 2" + depends_on: + - piper diff --git a/requirements.txt
b/requirements.txt index 001d104..2ca822b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,10 @@ openai==1.35.7 requests==2.32.3 socksio==1.0.0 edge-tts==6.1.12 -pydub==0.25.1 \ No newline at end of file +<<<<<<< HEAD +pydub==0.25.1 +======= +pydub==0.25.1 +python-dotenv==1.0.1 +wyoming==1.6.0 +>>>>>>> 4679f9c (Init commit of local piper docker compose container) From a1745ade82faf68f65d1cac1b5629653943dcefd Mon Sep 17 00:00:00 2001 From: Allen Nikka Date: Wed, 13 Nov 2024 17:38:24 -0800 Subject: [PATCH 2/3] Minor updates to piper voice, ingnoring env files, and updating the example docker compose file Remove unused import --- .gitignore | 3 ++ Dockerfile | 2 +- .../piper_docker_tts_provider.py | 7 +-- docker-compose.piper-example.yml | 42 ++++++++++++++++ docker-compose.yml | 48 ------------------- requirements.txt | 5 -- 6 files changed, 47 insertions(+), 60 deletions(-) create mode 100644 docker-compose.piper-example.yml delete mode 100644 docker-compose.yml diff --git a/.gitignore b/.gitignore index 7bbd194..ef231d9 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,6 @@ scripts/ # Models piper_models/* + +# Env files +.env diff --git a/Dockerfile b/Dockerfile index 897ca2e..e372ce6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,4 +17,4 @@ RUN pip install --no-cache-dir -r requirements.txt WORKDIR /app # Set this as the default command -ENTRYPOINT [ "python", "/app_src/main.py" ] \ No newline at end of file +# ENTRYPOINT [ "python", "/app_src/main.py" ] \ No newline at end of file diff --git a/audiobook_generator/tts_providers/piper_docker_tts_provider.py b/audiobook_generator/tts_providers/piper_docker_tts_provider.py index 7d71c01..c55f0a6 100644 --- a/audiobook_generator/tts_providers/piper_docker_tts_provider.py +++ b/audiobook_generator/tts_providers/piper_docker_tts_provider.py @@ -1,4 +1,3 @@ -from math import e import os import asyncio import logging @@ -128,14 +127,10 @@ async def chunkify(self) -> AudioSegment: ) 
estimated_remaining_time_m = total_seconds_remaining // 60 estimated_remaining_time_s = total_seconds_remaining % 60 - print( + logger.info( f"Processed {len(results)} of {len(tasks)} chunks in chapter. Estimated time remaining for chapter: {round(estimated_remaining_time_m)} min, {round(estimated_remaining_time_s)} sec", - end="\r", - flush=True, ) - # results = await asyncio.gather(*tasks, return_exceptions=True) - audio_segments = [] # Collect results and reconstruct the audio segments in order results_dict = {} diff --git a/docker-compose.piper-example.yml b/docker-compose.piper-example.yml new file mode 100644 index 0000000..4047056 --- /dev/null +++ b/docker-compose.piper-example.yml @@ -0,0 +1,42 @@ +services: + piper: + image: lscr.io/linuxserver/piper:latest + container_name: piper + environment: + - PUID=1000 + - PGID=1000 + - TZ=Etc/UTC + - PIPER_VOICE=en_US-hfc_male-medium + - PIPER_LENGTH=1.0 # optional + - PIPER_NOISE=0.667 # optional + - PIPER_NOISEW=0.333 # optional + - PIPER_SPEAKER=0 # optional + - PIPER_PROCS=1 # optional + volumes: + # - /path/to/piper/data:/config # Optional volume for Piper config + - ./piper_models:/usr/share/piper_models # Volume for Piper models, find models at https://github.com/rhasspy/piper/ + ports: + - 10200:10200 + restart: unless-stopped + healthcheck: + test: ['CMD-SHELL', 'nc -z localhost 10200'] + interval: 10s + timeout: 5s + retries: 5 + + epub_to_audiobook: + build: + context: ./ # Directory containing the Dockerfile for epub_to_audiobook + dockerfile: Dockerfile # Name of the Dockerfile (if it's not the default 'Dockerfile') + container_name: epub_to_audiobook + environment: + - PIPER_HOST=piper + - PIPER_PORT=10200 + volumes: + - ./:/app + - :/epub_src # Map the top level epub directory on the host machine + # command: tail -f /dev/null # Uncomment this line to keep the container running, and run via connecting to it with `docker exec -it epub_to_audiobook /bin/bash` + command: "python main.py --tts 
piper_docker --no_prompt '/epub_src/' audiobook_output" # Run command directly when the containers start up + depends_on: + piper: + condition: service_healthy diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index 75d985a..0000000 --- a/docker-compose.yml +++ /dev/null @@ -1,48 +0,0 @@ -# Be sure to replace and with your actual -# Azure Text-to-Speech API credentials. Also, replace your_book.epub with the -# name of your EPUB file, and audiobook_output with the name of the directory -# where you want to save the output files. - -# After creating and saving the `docker-compose.yml` file, run the -# `docker-compose up` command in the same directory to pull the image and start -# the conversion process. - -# You can then try to modify volumes to fit your need. - -services: - piper: - image: lscr.io/linuxserver/piper:latest - container_name: piper - environment: - - PUID=1000 - - PGID=1000 - - TZ=Etc/UTC - - PIPER_VOICE=en_US-norman-medium - - PIPER_LENGTH=1.0 # optional - - PIPER_NOISE=0.667 # optional - - PIPER_NOISEW=0.333 # optional - - PIPER_SPEAKER=0 # optional - - PIPER_PROCS=1 # optional - volumes: - # - /path/to/piper/data:/config - - /Users/allennikka/Developer/epub_to_audiobook/piper_models:/usr/share/piper_models # Volume for Piper models - ports: - - 10200:10200 - restart: unless-stopped - - epub_to_audiobook: - build: - context: ./ # Directory containing the Dockerfile for epub_to_audiobook - dockerfile: Dockerfile # Name of the Dockerfile (if it's not the default 'Dockerfile') - container_name: epub_to_audiobook - environment: - - TTS_PROVIDER=piper_docker - - PIPER_HOST=piper - - PIPER_PORT=10200 - volumes: - - ./:/app - - /Users/allennikka/Calibre Library:/calibre_library # Map the Calibre Library directory - - command: "--tts piper_docker --no_prompt '/calibre_library/Erick Ries/The Lean Startup Erick Ries (15)/The Lean Startup Erick Ries - Erick Ries.epub' audiobook_output --chapter_start 1 --chapter_end 2" - depends_on: 
- - piper diff --git a/requirements.txt b/requirements.txt index 2ca822b..8165165 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,10 +5,5 @@ openai==1.35.7 requests==2.32.3 socksio==1.0.0 edge-tts==6.1.12 -<<<<<<< HEAD pydub==0.25.1 -======= -pydub==0.25.1 -python-dotenv==1.0.1 wyoming==1.6.0 ->>>>>>> 4679f9c (Init commit of local piper docker compose container) From d8030c0dd1b86fae9f327035572f81cd55f4178c Mon Sep 17 00:00:00 2001 From: Allen Nikka Date: Thu, 12 Dec 2024 11:08:43 -0800 Subject: [PATCH 3/3] Updated readme --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 10519a9..55adab2 100644 --- a/README.md +++ b/README.md @@ -438,6 +438,17 @@ Piper TTS outputs `wav` format files (or raw) by default you should be able to s python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name /en_US-libritts_r-medium.onnx --piper_speaker 256 --piper_length_scale 1.5 --piper_sentence_silence 0.5 --output_format opus ``` +*Alternatively, you can use the following procedure to use piper in a docker container, which simplifies the process of running everything locally.* + +1. Ensure you have docker desktop installed on your system. See [Docker](https://www.docker.com/) to install (or use the [homebrew](https://formulae.brew.sh/formula/docker) formula). +2. Download a Piper model & config file (see the [piper repo](https://github.com/rhasspy/piper) for details) and place them in the [piper_models](./piper_models/) directory at the top level of this project. +3. Edit the [docker compose file](./docker-compose.piper-example.yml) to: + - In the `piper` container, set the `PIPER_VOICE` environment variable to the name of the model file you downloaded. + - In the `piper` container, map the `volumes` to the location of the piper models on your system (if you used the provided directory described in step 2, you can leave this as is). 
+ - In the `epub_to_audiobook` container, update the `volumes` mapping by replacing the host-side placeholder (the part before `:/epub_src`) with the actual path to the directory containing the epub on your host machine. + - In the `epub_to_audiobook` container, modify the `command` to match the parameters you want to use for the conversion. Make sure to keep `python main.py --tts piper_docker --no_prompt` at the beginning of the command. +4. Run `docker-compose -f docker-compose.piper-example.yml up` to start the conversion process. Note that the current config in the docker compose file will automatically start the process, entirely in the container. If you would rather start the conversion by hand instead of at container startup, uncomment `command: tail -f /dev/null` (and comment out the other `command:` line) to keep the container running, then use `docker exec -it epub_to_audiobook /bin/bash` to connect to the container and run the Python script manually (see comments in the [docker compose file](./docker-compose.piper-example.yml) for more details). + ## Troubleshooting ### ModuleNotFoundError: No module named 'importlib_metadata'