From b998076d7d74cb10b6ea6ac5dc536c615447c73b Mon Sep 17 00:00:00 2001
From: vcalv <66543651+vcalv@users.noreply.github.com>
Date: Wed, 24 Jul 2024 01:30:11 +0000
Subject: [PATCH 1/3] piper tts minimal working version

---
 README.md                                     |  4 +-
 .../tts_providers/base_tts_provider.py        |  6 +-
 .../tts_providers/piper_tts_provider.py       | 69 +++++++++++++++++++
 3 files changed, 76 insertions(+), 3 deletions(-)
 create mode 100644 audiobook_generator/tts_providers/piper_tts_provider.py

diff --git a/README.md b/README.md
index ce023cb..7de376c 100644
--- a/README.md
+++ b/README.md
@@ -80,7 +80,7 @@ python3 main.py -h
 ```
 
 ```bash
-usage: main.py [-h] [--tts {azure,openai,edge}]
+usage: main.py [-h] [--tts {azure,openai,edge,piper}]
                [--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [--preview]
                [--no_prompt] [--language LANGUAGE]
                [--newline_mode {single,double,none}]
@@ -101,7 +101,7 @@ positional arguments:
 
 options:
   -h, --help            show this help message and exit
-  --tts {azure,openai,edge}
+  --tts {azure,openai,edge,piper}
                         Choose TTS provider (default: azure). azure: Azure
                         Cognitive Services, openai: OpenAI TTS API. When using
                         azure, environment variables MS_TTS_KEY and
diff --git a/audiobook_generator/tts_providers/base_tts_provider.py b/audiobook_generator/tts_providers/base_tts_provider.py
index 7cbb061..3fed376 100644
--- a/audiobook_generator/tts_providers/base_tts_provider.py
+++ b/audiobook_generator/tts_providers/base_tts_provider.py
@@ -5,6 +5,7 @@
 TTS_AZURE = "azure"
 TTS_OPENAI = "openai"
 TTS_EDGE = "edge"
+TTS_PIPER = 'piper'
 
 
 class BaseTTSProvider:  # Base interface for TTS providers
@@ -34,7 +35,7 @@ def get_output_file_extension(self):
 
 # Common support methods for all TTS providers
 def get_supported_tts_providers() -> List[str]:
-    return [TTS_AZURE, TTS_OPENAI, TTS_EDGE]
+    return [TTS_AZURE, TTS_OPENAI, TTS_EDGE, TTS_PIPER]
 
 
 def get_tts_provider(config) -> BaseTTSProvider:
@@ -47,5 +48,8 @@ def get_tts_provider(config) -> BaseTTSProvider:
     elif config.tts == TTS_EDGE:
         from audiobook_generator.tts_providers.edge_tts_provider import EdgeTTSProvider
         return EdgeTTSProvider(config)
+    elif config.tts == TTS_PIPER:
+        from audiobook_generator.tts_providers.piper_tts_provider import PiperTTSProvider
+        return PiperTTSProvider(config)
     else:
         raise ValueError(f"Invalid TTS provider: {config.tts}")
diff --git a/audiobook_generator/tts_providers/piper_tts_provider.py b/audiobook_generator/tts_providers/piper_tts_provider.py
new file mode 100644
index 0000000..f16cc50
--- /dev/null
+++ b/audiobook_generator/tts_providers/piper_tts_provider.py
@@ -0,0 +1,69 @@
+import tempfile
+from subprocess import run
+from pathlib import Path
+import logging
+
+
+from pydub import AudioSegment
+
+from audiobook_generator.config.general_config import GeneralConfig
+from audiobook_generator.core.audio_tags import AudioTags
+from audiobook_generator.core.utils import set_audio_tags
+from audiobook_generator.tts_providers.base_tts_provider import BaseTTSProvider
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["PiperTTSProvider"]
+
+
+class PiperTTSProvider(BaseTTSProvider):
+    def __init__(self, config: GeneralConfig):
+        logger.setLevel(config.log)
+
+        # TTS provider specific config
+        config.output_format = config.output_format or "opus"
+        config.voice_rate = config.voice_rate or "1.0"
+
+        # 0.000$ per 1 million characters
+        # or 0.000$ per 1000 characters
+        self.price = 0.000
+        super().__init__(config)
+
+    def __str__(self) -> str:
+        return f"{self.config}"
+
+    def validate_config(self):
+        pass
+
+    def text_to_speech(
+        self,
+        text: str,
+        output_file: str,
+        audio_tags: AudioTags,
+    ):
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            logger.debug("created temporary directory %r", tmpdirname)
+
+            tmpfilename = Path(tmpdirname) / "piper.wav"
+
+            run(
+                ["piper-tts", "--model", self.config.voice_name, "-f", tmpfilename],
+                input=text.encode("utf-8"),
+            )
+
+            # Convert the wav file to the desired format
+            AudioSegment.from_wav(tmpfilename).export(
+                output_file, format=self.config.output_format
+            )
+
+        set_audio_tags(output_file, audio_tags)
+
+    def estimate_cost(self, total_chars):
+        return 0
+
+    def get_break_string(self):
+        return "    "
+
+    def get_output_file_extension(self):
+        return self.config.output_format

From e8cae8bc8478decd4d304b88b9afe98c7627f022 Mon Sep 17 00:00:00 2001
From: vcalv <66543651+vcalv@users.noreply.github.com>
Date: Wed, 24 Jul 2024 02:26:51 +0000
Subject: [PATCH 2/3] support  all(?) parameters that piper accepts

---
 .../tts_providers/piper_tts_provider.py       | 26 +++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/audiobook_generator/tts_providers/piper_tts_provider.py b/audiobook_generator/tts_providers/piper_tts_provider.py
index f16cc50..1f885ab 100644
--- a/audiobook_generator/tts_providers/piper_tts_provider.py
+++ b/audiobook_generator/tts_providers/piper_tts_provider.py
@@ -22,7 +22,17 @@ def __init__(self, config: GeneralConfig):
 
         # TTS provider specific config
         config.output_format = config.output_format or "opus"
-        config.voice_rate = config.voice_rate or "1.0"
+
+        if config.voice_rate is None:
+            config.voice_rate = 1.0
+        else:
+            try:
+                config.voice_rate = float(config.voice_rate)
+            except ValueError:
+                logger.error("Invalid voice_rate %r", config.voice_rate)
+                config.voice_rate = 1.0
+        config.voice_name = config.voice_name or "0"
+        config.break_duration = config.break_duration or 0.2
 
         # 0.000$ per 1 million characters
         # or 0.000$ per 1000 characters
@@ -48,7 +58,19 @@ def text_to_speech(
             tmpfilename = Path(tmpdirname) / "piper.wav"
 
             run(
-                ["piper-tts", "--model", self.config.voice_name, "-f", tmpfilename],
+                [
+                    "piper-tts",
+                    "--model",
+                    self.config.model_name,
+                    "--speaker",
+                    self.config.voice_name,
+                    "--sentence_silence",
+                    str(self.config.break_duration),
+                    "--length_scale",
+                    str(1.0 / self.config.voice_rate),
+                    "-f",
+                    tmpfilename,
+                ],
                 input=text.encode("utf-8"),
             )
 

From 32a750dda4b9f528ca66c3c5af66874485c8f2ea Mon Sep 17 00:00:00 2001
From: vcalv <66543651+vcalv@users.noreply.github.com>
Date: Wed, 24 Jul 2024 02:34:52 +0000
Subject: [PATCH 3/3] piper documentation in README

---
 README.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/README.md b/README.md
index 7de376c..f5ea418 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@ If you're interested in hearing a sample of the audiobook generated by this tool
 - [Azure TTS Sample](https://audio.com/paudi/audio/0008-chapter-vii-agricultural-experience)
 - [OpenAI TTS Sample](https://audio.com/paudi/audio/openai-0008-chapter-vii-agricultural-experience-i-had-now-been-in)
 - Edge TTS Sample: the voice is almost the same as Azure TTS
+- [Piper TTS Samples](https://rhasspy.github.io/piper-samples/)
 
 ## Requirements
 
@@ -20,6 +21,7 @@ If you're interested in hearing a sample of the audiobook generated by this tool
 - For using *Azure TTS*, A Microsoft Azure account with access to the [Microsoft Cognitive Services Speech Services](https://portal.azure.com/#create/Microsoft.CognitiveServicesSpeechServices) is required.
 - For using *OpenAI TTS*, OpenAI [API Key](https://platform.openai.com/api-keys) is required.
 - For using *Edge TTS*, no API Key is required.
+- For using *Piper TTS*, the `piper-tts` executable and at least one Piper voice model (`.onnx` file) are required; no API Key is needed.
 
 ## Audiobookshelf Integration
 
@@ -351,6 +353,29 @@ Here are some examples that demonstrate various option combinations:
    python3 main.py "path/to/book.epub" "path/to/output/folder" --tts edge --chapter_start 5 --chapter_end 10 --break_duration "1500"
    ```
 
+### Examples Using Piper TTS
+   This command converts an EPUB file to an audiobook with Piper TTS, using the bare minimum of parameters.
+   You always need to specify an onnx model file, and the `piper-tts` executable needs to be on your `PATH`.
+
+   ```sh
+   python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx
+   ```
+
+   Some models support multiple voices (speakers); select one with the `--voice_name` parameter.
+
+   ```sh
+   python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx --voice_name 256
+   ```
+
+   You can also specify the speech rate (`--voice_rate`) and the pause inserted between sentences (`--break_duration`):
+
+   ```sh
+   python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx --voice_name 256 --voice_rate 1.5 --break_duration 0.75
+   ```
+
+   Piper TTS itself outputs `wav` (or raw) audio, which is then converted, so you should be able to specify any reasonable format via the `--output_format` parameter.
+   `opus` (the default) and `mp3` are good choices for size and compatibility.
+
 ## Troubleshooting
 
 ### ModuleNotFoundError: No module named 'importlib_metadata'