Merge pull request #77 from vcalv/piper

Piper TTS support
p0n1 · Aug 24, 2024 · 25d9e52 · 25d9e52
2 parents 4ea2e25 + 32a750d
commit 25d9e52
Show file tree

Hide file tree

Showing 3 changed files with 123 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -13,13 +13,15 @@ If you're interested in hearing a sample of the audiobook generated by this tool
 - [Azure TTS Sample](https://audio.com/paudi/audio/0008-chapter-vii-agricultural-experience)
 - [OpenAI TTS Sample](https://audio.com/paudi/audio/openai-0008-chapter-vii-agricultural-experience-i-had-now-been-in)
 - Edge TTS Sample: the voice is almost the same as Azure TTS
+- [Piper TTS](https://rhasspy.github.io/piper-samples/)
 
 ## Requirements
 
 - Python 3.6+ Or ***Docker***
 - For using *Azure TTS*, A Microsoft Azure account with access to the [Microsoft Cognitive Services Speech Services](https://portal.azure.com/#create/Microsoft.CognitiveServicesSpeechServices) is required.
 - For using *OpenAI TTS*, OpenAI [API Key](https://platform.openai.com/api-keys) is required.
 - For using *Edge TTS*, no API Key is required.
+- For using *Piper TTS*, the Piper TTS executable (`piper-tts`) and at least one voice model (`.onnx` file) are required. No API Key is required.
 
 ## Audiobookshelf Integration
 
@@ -80,7 +82,7 @@ python3 main.py -h
 ```
 
 ```bash
-usage: main.py [-h] [--tts {azure,openai,edge}]
+usage: main.py [-h] [--tts {azure,openai,edge,piper}]
                [--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [--preview]
                [--no_prompt] [--language LANGUAGE]
                [--newline_mode {single,double,none}]
@@ -101,7 +103,7 @@ positional arguments:
 
 options:
   -h, --help            show this help message and exit
-  --tts {azure,openai,edge}
+  --tts {azure,openai,edge,piper}
                         Choose TTS provider (default: azure). azure: Azure
                         Cognitive Services, openai: OpenAI TTS API. When using
                         azure, environment variables MS_TTS_KEY and
@@ -351,6 +353,29 @@ Here are some examples that demonstrate various option combinations:
    python3 main.py "path/to/book.epub" "path/to/output/folder" --tts edge --chapter_start 5 --chapter_end 10 --break_duration "1500"
    ```
 
+### Examples Using Piper TTS
+   This command will convert an EPUB file to an audiobook with Piper TTS using the bare minimum of parameters.
+   You always need to specify an onnx model file via `--model_name`, and the `piper-tts` executable needs to be on your `PATH`.
+
+   ```sh
+   python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx
+   ```
+
+   Some models support multiple voices (speakers); the speaker can be selected with the `--voice_name` parameter.
+
+   ```sh
+   python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx --voice_name 256
+   ```
+
+   You can also specify the speed (`--voice_rate`) and the pause duration (`--break_duration`):
+
+   ```sh
+   python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx --voice_name 256 --voice_rate 1.5 --break_duration 0.75
+   ```
+
+   Piper TTS itself outputs `wav` (or raw) audio, which is then converted, so you should be able to specify any reasonable format via the `--output_format` parameter (the default is `opus`).
+   `opus` and `mp3` are good choices for size and compatibility.
+
 ## Troubleshooting
 
 ### ModuleNotFoundError: No module named 'importlib_metadata'

diff --git a/audiobook_generator/tts_providers/base_tts_provider.py b/audiobook_generator/tts_providers/base_tts_provider.py
@@ -5,6 +5,7 @@
 TTS_AZURE = "azure"
 TTS_OPENAI = "openai"
 TTS_EDGE = "edge"
+TTS_PIPER = 'piper'
 
 
 class BaseTTSProvider:  # Base interface for TTS providers
@@ -34,7 +35,7 @@ def get_output_file_extension(self):
 
 # Common support methods for all TTS providers
 def get_supported_tts_providers() -> List[str]:
-    return [TTS_AZURE, TTS_OPENAI, TTS_EDGE]
+    return [TTS_AZURE, TTS_OPENAI, TTS_EDGE, TTS_PIPER]
 
 
 def get_tts_provider(config) -> BaseTTSProvider:
@@ -47,5 +48,8 @@ def get_tts_provider(config) -> BaseTTSProvider:
     elif config.tts == TTS_EDGE:
         from audiobook_generator.tts_providers.edge_tts_provider import EdgeTTSProvider
         return EdgeTTSProvider(config)
+    elif config.tts == TTS_PIPER:
+        from audiobook_generator.tts_providers.piper_tts_provider import PiperTTSProvider
+        return PiperTTSProvider(config)
     else:
         raise ValueError(f"Invalid TTS provider: {config.tts}")
diff --git a/audiobook_generator/tts_providers/piper_tts_provider.py b/audiobook_generator/tts_providers/piper_tts_provider.py
@@ -0,0 +1,91 @@
+import tempfile
+from subprocess import run
+from pathlib import Path
+import logging
+
+
+from pydub import AudioSegment
+
+from audiobook_generator.config.general_config import GeneralConfig
+from audiobook_generator.core.audio_tags import AudioTags
+from audiobook_generator.core.utils import set_audio_tags
+from audiobook_generator.tts_providers.base_tts_provider import BaseTTSProvider
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["PiperTTSProvider"]
+
+
+class PiperTTSProvider(BaseTTSProvider):
+    """TTS provider that synthesizes speech with the local ``piper-tts`` executable.
+
+    Text is piped to ``piper-tts`` on stdin, the resulting WAV file is written
+    to a temporary directory and then converted with pydub to the configured
+    output format.
+    """
+
+    def __init__(self, config: GeneralConfig):
+        """Fill in Piper-specific defaults on *config* and initialize the base class.
+
+        Defaults applied when the value is unset/falsy: ``output_format`` ->
+        ``"opus"``, ``voice_rate`` -> ``1.0``, ``voice_name`` -> ``"0"``,
+        ``break_duration`` -> ``0.2``. An unusable ``voice_rate`` is logged
+        and replaced by ``1.0`` rather than aborting.
+        """
+        logger.setLevel(config.log)
+
+        # TTS provider specific config
+        config.output_format = config.output_format or "opus"
+
+        if config.voice_rate is None:
+            config.voice_rate = 1.0
+        else:
+            try:
+                rate = float(config.voice_rate)
+            except (TypeError, ValueError):
+                logger.error("Invalid voice_rate %r", config.voice_rate)
+                rate = 1.0
+            else:
+                # The rate is later used as a divisor (1.0 / voice_rate), so it
+                # must be a finite, strictly positive number. NaN fails both
+                # comparisons and is therefore rejected here as well.
+                if not 0 < rate < float("inf"):
+                    logger.error("Invalid voice_rate %r", config.voice_rate)
+                    rate = 1.0
+            config.voice_rate = rate
+        config.voice_name = config.voice_name or "0"
+        config.break_duration = config.break_duration or 0.2
+
+        # 0.000$ per 1 million characters
+        # or 0.000$ per 1000 characters
+        self.price = 0.000
+        super().__init__(config)
+
+    def __str__(self) -> str:
+        """Return the string form of the underlying config."""
+        return f"{self.config}"
+
+    def validate_config(self):
+        """No-op: nothing is validated at construction time.
+
+        The required model path is checked in ``text_to_speech`` instead.
+        """
+        pass
+
+    def text_to_speech(
+        self,
+        text: str,
+        output_file: str,
+        audio_tags: AudioTags,
+    ):
+        """Synthesize *text* into *output_file* and tag the result.
+
+        Args:
+            text: Text to speak; sent UTF-8 encoded to ``piper-tts`` on stdin.
+            output_file: Destination path, written in ``config.output_format``.
+            audio_tags: Tags applied to the exported file via ``set_audio_tags``.
+
+        Raises:
+            ValueError: If no model was configured (``config.model_name``).
+            subprocess.CalledProcessError: If ``piper-tts`` exits with a
+                non-zero status.
+        """
+        if not self.config.model_name:
+            # Fail with a clear message instead of an opaque TypeError from
+            # subprocess when None ends up in the argument list.
+            raise ValueError(
+                "Piper TTS requires a model file: pass --model_name <path to .onnx>"
+            )
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            logger.debug("created temporary directory %r", tmpdirname)
+
+            # Plain str paths keep the argument list portable across
+            # platforms and Python versions.
+            tmpfilename = str(Path(tmpdirname) / "piper.wav")
+
+            run(
+                [
+                    "piper-tts",
+                    "--model",
+                    str(self.config.model_name),
+                    "--speaker",
+                    str(self.config.voice_name),
+                    "--sentence_silence",
+                    str(self.config.break_duration),
+                    # Piper takes a length scale, so the rate is passed as its
+                    # reciprocal (presumably a larger scale means slower speech).
+                    "--length_scale",
+                    str(1.0 / self.config.voice_rate),
+                    "-f",
+                    tmpfilename,
+                ],
+                input=text.encode("utf-8"),
+                # Surface a failed synthesis here rather than as a confusing
+                # WAV-loading error in the conversion step below.
+                check=True,
+            )
+
+            # Convert the wav file to the desired format
+            AudioSegment.from_wav(tmpfilename).export(
+                output_file, format=self.config.output_format
+            )
+
+        set_audio_tags(output_file, audio_tags)
+
+    def estimate_cost(self, total_chars):
+        """Return 0: this provider reports no cost regardless of *total_chars*."""
+        return 0
+
+    def get_break_string(self):
+        """Return the string used to mark a break in the text (four spaces)."""
+        return "    "
+
+    def get_output_file_extension(self):
+        """Return the output file extension, which is the configured output format."""
+        return self.config.output_format