From b998076d7d74cb10b6ea6ac5dc536c615447c73b Mon Sep 17 00:00:00 2001 From: vcalv <66543651+vcalv@users.noreply.github.com> Date: Wed, 24 Jul 2024 01:30:11 +0000 Subject: [PATCH 1/3] piper tts minimal working version --- README.md | 4 +- .../tts_providers/base_tts_provider.py | 6 +- .../tts_providers/piper_tts_provider.py | 69 +++++++++++++++++++ 3 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 audiobook_generator/tts_providers/piper_tts_provider.py diff --git a/README.md b/README.md index ce023cb..7de376c 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ python3 main.py -h ``` ```bash -usage: main.py [-h] [--tts {azure,openai,edge}] +usage: main.py [-h] [--tts {azure,openai,edge,piper}] [--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [--preview] [--no_prompt] [--language LANGUAGE] [--newline_mode {single,double,none}] @@ -101,7 +101,7 @@ positional arguments: options: -h, --help show this help message and exit - --tts {azure,openai,edge} + --tts {azure,openai,edge,piper} Choose TTS provider (default: azure). azure: Azure Cognitive Services, openai: OpenAI TTS API. 
When using azure, environment variables MS_TTS_KEY and diff --git a/audiobook_generator/tts_providers/base_tts_provider.py b/audiobook_generator/tts_providers/base_tts_provider.py index 7cbb061..3fed376 100644 --- a/audiobook_generator/tts_providers/base_tts_provider.py +++ b/audiobook_generator/tts_providers/base_tts_provider.py @@ -5,6 +5,7 @@ TTS_AZURE = "azure" TTS_OPENAI = "openai" TTS_EDGE = "edge" +TTS_PIPER = 'piper' class BaseTTSProvider: # Base interface for TTS providers @@ -34,7 +35,7 @@ def get_output_file_extension(self): # Common support methods for all TTS providers def get_supported_tts_providers() -> List[str]: - return [TTS_AZURE, TTS_OPENAI, TTS_EDGE] + return [TTS_AZURE, TTS_OPENAI, TTS_EDGE, TTS_PIPER] def get_tts_provider(config) -> BaseTTSProvider: @@ -47,5 +48,8 @@ def get_tts_provider(config) -> BaseTTSProvider: elif config.tts == TTS_EDGE: from audiobook_generator.tts_providers.edge_tts_provider import EdgeTTSProvider return EdgeTTSProvider(config) + elif config.tts == TTS_PIPER: + from audiobook_generator.tts_providers.piper_tts_provider import PiperTTSProvider + return PiperTTSProvider(config) else: raise ValueError(f"Invalid TTS provider: {config.tts}") diff --git a/audiobook_generator/tts_providers/piper_tts_provider.py b/audiobook_generator/tts_providers/piper_tts_provider.py new file mode 100644 index 0000000..f16cc50 --- /dev/null +++ b/audiobook_generator/tts_providers/piper_tts_provider.py @@ -0,0 +1,69 @@ +import tempfile +from subprocess import run +from pathlib import Path +import logging + + +from pydub import AudioSegment + +from audiobook_generator.config.general_config import GeneralConfig +from audiobook_generator.core.audio_tags import AudioTags +from audiobook_generator.core.utils import set_audio_tags +from audiobook_generator.tts_providers.base_tts_provider import BaseTTSProvider + +logger = logging.getLogger(__name__) + +__all__ = ["PiperTTSProvider"] + + +class PiperTTSProvider(BaseTTSProvider): + def __init__(self, 
config: GeneralConfig): + logger.setLevel(config.log) + + # TTS provider specific config + config.output_format = config.output_format or "opus" + config.voice_rate = config.voice_rate or "1.0" + + # 0.000$ per 1 million characters + # or 0.000$ per 1000 characters + self.price = 0.000 + super().__init__(config) + + def __str__(self) -> str: + return f"{self.config}" + + def validate_config(self): + pass + + def text_to_speech( + self, + text: str, + output_file: str, + audio_tags: AudioTags, + ): + + with tempfile.TemporaryDirectory() as tmpdirname: + logger.debug("created temporary directory %r", tmpdirname) + + tmpfilename = Path(tmpdirname) / "piper.wav" + + run( + ["piper-tts", "--model", self.config.voice_name, "-f", tmpfilename], + input=text.encode("utf-8"), + ) + + # Convert the wav file to the desired format + AudioSegment.from_wav(tmpfilename).export( + output_file, format=self.config.output_format + ) + + set_audio_tags(output_file, audio_tags) + + def estimate_cost(self, total_chars): + return 0 + + def get_break_string(self): + return " " + + def get_output_file_extension(self): + return self.config.output_format From e8cae8bc8478decd4d304b88b9afe98c7627f022 Mon Sep 17 00:00:00 2001 From: vcalv <66543651+vcalv@users.noreply.github.com> Date: Wed, 24 Jul 2024 02:26:51 +0000 Subject: [PATCH 2/3] support all(?) 
parameters that piper accepts --- .../tts_providers/piper_tts_provider.py | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/audiobook_generator/tts_providers/piper_tts_provider.py b/audiobook_generator/tts_providers/piper_tts_provider.py index f16cc50..1f885ab 100644 --- a/audiobook_generator/tts_providers/piper_tts_provider.py +++ b/audiobook_generator/tts_providers/piper_tts_provider.py @@ -22,7 +22,17 @@ def __init__(self, config: GeneralConfig): # TTS provider specific config config.output_format = config.output_format or "opus" - config.voice_rate = config.voice_rate or "1.0" + + if config.voice_rate is None: + config.voice_rate = 1.0 + else: + try: + config.voice_rate = float(config.voice_rate) + except ValueError: + logger.error("Invalid voice_rate %r", config.voice_rate) + config.voice_rate = 1.0 + config.voice_name = config.voice_name or "0" + config.break_duration = config.break_duration or 0.2 # 0.000$ per 1 million characters # or 0.000$ per 1000 characters @@ -48,7 +58,19 @@ def text_to_speech( tmpfilename = Path(tmpdirname) / "piper.wav" run( - ["piper-tts", "--model", self.config.voice_name, "-f", tmpfilename], + [ + "piper-tts", + "--model", + self.config.model_name, + "--speaker", + self.config.voice_name, + "--sentence_silence", + str(self.config.break_duration), + "--length_scale", + str(1.0 / self.config.voice_rate), + "-f", + tmpfilename, + ], input=text.encode("utf-8"), ) From 32a750dda4b9f528ca66c3c5af66874485c8f2ea Mon Sep 17 00:00:00 2001 From: vcalv <66543651+vcalv@users.noreply.github.com> Date: Wed, 24 Jul 2024 02:34:52 +0000 Subject: [PATCH 3/3] piper documentation in README --- README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/README.md b/README.md index 7de376c..f5ea418 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ If you're interested in hearing a sample of the audiobook generated by this tool - [Azure TTS 
Sample](https://audio.com/paudi/audio/0008-chapter-vii-agricultural-experience) - [OpenAI TTS Sample](https://audio.com/paudi/audio/openai-0008-chapter-vii-agricultural-experience-i-had-now-been-in) - Edge TTS Sample: the voice is almost the same as Azure TTS +- [Piper TTS](https://rhasspy.github.io/piper-samples/) ## Requirements @@ -20,6 +21,7 @@ If you're interested in hearing a sample of the audiobook generated by this tool - For using *Azure TTS*, A Microsoft Azure account with access to the [Microsoft Cognitive Services Speech Services](https://portal.azure.com/#create/Microsoft.CognitiveServicesSpeechServices) is required. - For using *OpenAI TTS*, OpenAI [API Key](https://platform.openai.com/api-keys) is required. - For using *Edge TTS*, no API Key is required. +- For using *Piper TTS*, the Piper TTS executable and a voice model file are required. ## Audiobookshelf Integration @@ -351,6 +353,29 @@ Here are some examples that demonstrate various option combinations: python3 main.py "path/to/book.epub" "path/to/output/folder" --tts edge --chapter_start 5 --chapter_end 10 --break_duration "1500" ``` +### Examples Using Piper TTS + This command will convert an EPUB file to an audiobook using Piper TTS with the bare minimum parameters. + You always need to specify an ONNX model file, and the `piper-tts` executable needs to be on your `PATH`. + + ```sh + python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name /en_US-libritts_r-medium.onnx + ``` + + Some models support multiple voices; the voice can be selected with the `--voice_name` parameter. 
+ + ```sh + python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name /en_US-libritts_r-medium.onnx --voice_name 256 + ``` + + You can also specify speed and pause duration: + + ```sh + python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name /en_US-libritts_r-medium.onnx --voice_name 256 --voice_rate 1.5 --break_duration 0.75 + ``` + + Piper TTS outputs `wav` (or raw) files by default, but you should be able to specify any reasonable format via the `--output_format` parameter. + `opus` and `mp3` are good choices for size and compatibility. + ## Troubleshooting ### ModuleNotFoundError: No module named 'importlib_metadata'