From 17f489641bc06ebba8b84c5c4335d66bf75dc5ae Mon Sep 17 00:00:00 2001
From: p0n1 <zp@zerok.io>
Date: Thu, 5 Sep 2024 06:23:25 +0000
Subject: [PATCH] fix: split and change piper default settings

also fix logging and opus tagging
---
 .gitignore                                    |  4 +-
 README.md                                     | 39 +++++++-----
 .../book_parsers/epub_book_parser.py          |  1 -
 audiobook_generator/config/general_config.py  |  6 ++
 .../core/audiobook_generator.py               |  1 -
 audiobook_generator/core/utils.py             |  2 +-
 .../tts_providers/azure_tts_provider.py       |  1 -
 .../tts_providers/edge_tts_provider.py        |  1 -
 .../tts_providers/openai_tts_provider.py      |  1 -
 .../tts_providers/piper_tts_provider.py       | 60 +++++++++----------
 main.py                                       | 49 ++++++++++++---
 11 files changed, 104 insertions(+), 61 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2ff201d..1081772 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,4 +30,6 @@ audiobook_output/
 private_examples/
 
 # custom
-scripts/
\ No newline at end of file
+scripts/
+*.onnx
+*.onnx.json
\ No newline at end of file
diff --git a/README.md b/README.md
index 20086b3..c640ec2 100644
--- a/README.md
+++ b/README.md
@@ -390,27 +390,33 @@ Here are some examples that demonstrate various option combinations:
    ```
 
 ### Examples Using Piper TTS
-   This command will convert an EPUB file to an audiobook using Piper TTS using the bare minimum parameters.
-   You always need to specify an onnx model file and the `piper-tts` executable needs to be in the current path. 
 
-   ```sh
-   python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx
-   ```
+*Make sure you have installed Piper TTS and have an onnx model file and corresponding config file. Check [Piper TTS](https://github.com/rhasspy/piper) for more details. You can follow their instructions to install Piper TTS, download the models and config files, play with it and then come back to try the examples below.*
 
-   Some models support multiple voices and that can be specified by using the voice_name parameter.
+This command converts an EPUB file to an audiobook with Piper TTS using the bare minimum parameters.
+You always need to specify an onnx model file, and the `piper` executable needs to be in your $PATH (or be given explicitly via `--piper_path`).
 
-   ```sh
-   python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx --voice_name 256
-   ```
+```sh
+python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx
+```
 
-   You can also specify speed and pause duration
+Some models support multiple voices; select one with the `--piper_speaker` parameter.
 
-   ```sh
-   python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx --voice_name 256 --voice_rate 1.5 --break_duration 0.75
-   ```
+```sh
+python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx --piper_speaker 256
+```
+
+You can also specify the speaking speed (`--piper_length_scale`, where values above 1.0 slow the speech down) and the pause after each sentence in seconds (`--piper_sentence_silence`).
+
+```sh
+python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx --piper_speaker 256 --piper_length_scale 1.5 --piper_sentence_silence 0.5
+```
 
-   Piper TTS outputs `wav` format files (or raw) by default you should be able to specify any reasonable format via the `--output_format` parameter.
-   `opus` and `mp3` are good choices for size and compatibility.
+Piper TTS outputs `wav` format files (or raw) by default, but you should be able to specify any reasonable format via the `--output_format` parameter. `opus` and `mp3` are good choices for size and compatibility.
+
+```sh
+python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx --piper_speaker 256 --piper_length_scale 1.5 --piper_sentence_silence 0.5 --output_format opus
+```
 
 ## Troubleshooting
 
@@ -422,6 +428,9 @@ This may be because the Python version you are using is [less than 3.8](https://
 
 Make sure ffmpeg binary is accessible from your path. If you are on a mac and use homebrew, you can do `brew install ffmpeg`, On Ubuntu you can do `sudo apt install ffmpeg`
 
+### Piper TTS
+
+For installation-related issues, please refer to the [Piper TTS](https://github.com/rhasspy/piper) repository. It's important to note that if you're installing `piper-tts` via pip, [only Python 3.10](https://github.com/rhasspy/piper/issues/509) is currently supported. Mac users may encounter additional challenges when using the downloaded [binary](https://github.com/rhasspy/piper/issues/523). For more information on Mac-specific issues, please check [this issue](https://github.com/rhasspy/piper/issues/395) and [this pull request](https://github.com/rhasspy/piper/pull/412).
 
 ## Related Projects
 
diff --git a/audiobook_generator/book_parsers/epub_book_parser.py b/audiobook_generator/book_parsers/epub_book_parser.py
index 95e98bb..fd8b7dc 100644
--- a/audiobook_generator/book_parsers/epub_book_parser.py
+++ b/audiobook_generator/book_parsers/epub_book_parser.py
@@ -15,7 +15,6 @@
 class EpubBookParser(BaseBookParser):
     def __init__(self, config: GeneralConfig):
         super().__init__(config)
-        logger.setLevel(config.log)
         self.book = epub.read_epub(self.config.input_file, {"ignore_ncx": True})
 
     def __str__(self) -> str:
diff --git a/audiobook_generator/config/general_config.py b/audiobook_generator/config/general_config.py
index d577393..e81df8d 100644
--- a/audiobook_generator/config/general_config.py
+++ b/audiobook_generator/config/general_config.py
@@ -32,5 +32,11 @@ def __init__(self, args):
         self.voice_pitch = args.voice_pitch
         self.proxy = args.proxy
 
+        # TTS provider: Piper specific arguments
+        self.piper_path = args.piper_path
+        self.piper_speaker = args.piper_speaker
+        self.piper_sentence_silence = args.piper_sentence_silence
+        self.piper_length_scale = args.piper_length_scale
+
     def __str__(self):
         return ', '.join(f"{key}={value}" for key, value in self.__dict__.items())
diff --git a/audiobook_generator/core/audiobook_generator.py b/audiobook_generator/core/audiobook_generator.py
index d5bd0d1..74fb980 100644
--- a/audiobook_generator/core/audiobook_generator.py
+++ b/audiobook_generator/core/audiobook_generator.py
@@ -27,7 +27,6 @@ def get_total_chars(chapters):
 class AudiobookGenerator:
     def __init__(self, config: GeneralConfig):
         self.config = config
-        logger.setLevel(config.log)
 
     def __str__(self) -> str:
         return f"{self.config}"
diff --git a/audiobook_generator/core/utils.py b/audiobook_generator/core/utils.py
index 5b49ddc..935afbd 100644
--- a/audiobook_generator/core/utils.py
+++ b/audiobook_generator/core/utils.py
@@ -49,7 +49,7 @@ def set_audio_tags(output_file, audio_tags):
     try:
         try:
             tags = ID3(output_file)
-            print(tags)
+            logger.debug(f"tags: {tags}")
         except ID3NoHeaderError:
             logger.debug(f"handling ID3NoHeaderError: {output_file}")
             tags = ID3()
diff --git a/audiobook_generator/tts_providers/azure_tts_provider.py b/audiobook_generator/tts_providers/azure_tts_provider.py
index ca9aa91..c2ac47c 100644
--- a/audiobook_generator/tts_providers/azure_tts_provider.py
+++ b/audiobook_generator/tts_providers/azure_tts_provider.py
@@ -19,7 +19,6 @@
 
 class AzureTTSProvider(BaseTTSProvider):
     def __init__(self, config: GeneralConfig):
-        logger.setLevel(config.log)
         # TTS provider specific config
         config.voice_name = config.voice_name or "en-US-GuyNeural"
         config.output_format = config.output_format or "audio-24khz-48kbitrate-mono-mp3"
diff --git a/audiobook_generator/tts_providers/edge_tts_provider.py b/audiobook_generator/tts_providers/edge_tts_provider.py
index cfcd8d7..07acb7e 100644
--- a/audiobook_generator/tts_providers/edge_tts_provider.py
+++ b/audiobook_generator/tts_providers/edge_tts_provider.py
@@ -124,7 +124,6 @@ async def save(
 
 class EdgeTTSProvider(BaseTTSProvider):
     def __init__(self, config: GeneralConfig):
-        logger.setLevel(config.log)
         # TTS provider specific config
         config.voice_name = config.voice_name or "en-US-GuyNeural"
         config.output_format = config.output_format or "audio-24khz-48kbitrate-mono-mp3"
diff --git a/audiobook_generator/tts_providers/openai_tts_provider.py b/audiobook_generator/tts_providers/openai_tts_provider.py
index d80c42d..61ccad5 100644
--- a/audiobook_generator/tts_providers/openai_tts_provider.py
+++ b/audiobook_generator/tts_providers/openai_tts_provider.py
@@ -27,7 +27,6 @@ def get_supported_formats():
 
 class OpenAITTSProvider(BaseTTSProvider):
     def __init__(self, config: GeneralConfig):
-        logger.setLevel(config.log)
         config.model_name = config.model_name or "tts-1"
         config.voice_name = config.voice_name or "alloy"
         config.output_format = config.output_format or "mp3"
diff --git a/audiobook_generator/tts_providers/piper_tts_provider.py b/audiobook_generator/tts_providers/piper_tts_provider.py
index 1f885ab..c1bfb1c 100644
--- a/audiobook_generator/tts_providers/piper_tts_provider.py
+++ b/audiobook_generator/tts_providers/piper_tts_provider.py
@@ -3,7 +3,6 @@
 from pathlib import Path
 import logging
 
-
 from pydub import AudioSegment
 
 from audiobook_generator.config.general_config import GeneralConfig
@@ -18,24 +17,10 @@
 
 class PiperTTSProvider(BaseTTSProvider):
     def __init__(self, config: GeneralConfig):
-        logger.setLevel(config.log)
 
         # TTS provider specific config
-        config.output_format = config.output_format or "opus"
-
-        if config.voice_rate is None:
-            config.voice_rate = 1.0
-        else:
-            try:
-                config.voice_rate = float(config.voice_rate)
-            except ValueError:
-                logger.error("Invalid voice_rate %r", config.voice_rate)
-                config.voice_rate = 1.0
-        config.voice_name = config.voice_name or "0"
-        config.break_duration = config.break_duration or 0.2
-
-        # 0.000$ per 1 million characters
-        # or 0.000$ per 1000 characters
+        config.output_format = config.output_format or "mp3"
+
         self.price = 0.000
         super().__init__(config)
 
@@ -57,29 +42,42 @@ def text_to_speech(
 
             tmpfilename = Path(tmpdirname) / "piper.wav"
 
+            cmd = [
+                self.config.piper_path,
+                "--model",
+                self.config.model_name,
+                "--speaker",
+                str(self.config.piper_speaker),
+                "--sentence_silence",
+                str(self.config.piper_sentence_silence),
+                "--length_scale",
+                str(self.config.piper_length_scale),
+                "-f",
+                tmpfilename,
+                "--debug",
+            ]
+
+            logger.info(
+                f"Running Piper TTS command: {' '.join(str(arg) for arg in cmd)}"
+            )
             run(
-                [
-                    "piper-tts",
-                    "--model",
-                    self.config.model_name,
-                    "--speaker",
-                    self.config.voice_name,
-                    "--sentence_silence",
-                    str(self.config.break_duration),
-                    "--length_scale",
-                    str(1.0 / self.config.voice_rate),
-                    "-f",
-                    tmpfilename,
-                ],
+                cmd,
                 input=text.encode("utf-8"),
             )
 
+            # set audio tags, need to be done before conversion or opus won't work, not sure why
+            set_audio_tags(tmpfilename, audio_tags)
+
+            logger.info(
+                f"Piper TTS command completed, converting {tmpfilename} to {self.config.output_format} format"
+            )
+
             # Convert the wav file to the desired format
             AudioSegment.from_wav(tmpfilename).export(
                 output_file, format=self.config.output_format
             )
 
-        set_audio_tags(output_file, audio_tags)
+            logger.info(f"Conversion completed, output file: {output_file}")
 
     def estimate_cost(self, total_chars):
         return 0
diff --git a/main.py b/main.py
index 368d8e1..bd28422 100644
--- a/main.py
+++ b/main.py
@@ -7,13 +7,6 @@
     get_supported_tts_providers,
 )
 
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s [%(levelname)s] %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-)
-logger = logging.getLogger(__name__)
-
 
 def handle_args():
     parser = argparse.ArgumentParser(description="Convert text book to audiobook")
@@ -142,13 +135,53 @@ def handle_args():
         help="Break duration in milliseconds for the different paragraphs or sections (default: 1250, means 1.25 s). Valid values range from 0 to 5000 milliseconds for Azure TTS.",
     )
 
+    piper_tts_group = parser.add_argument_group(title="piper specific")
+    piper_tts_group.add_argument(
+        "--piper_path",
+        default="piper",
+        help="Path to the Piper TTS executable",
+    )
+    piper_tts_group.add_argument(
+        "--piper_speaker",
+        default=0,
+        help="Piper speaker id, used for multi-speaker models",
+    )
+    piper_tts_group.add_argument(
+        "--piper_sentence_silence",
+        default=0.2,
+        help="Seconds of silence after each sentence",
+    )
+    piper_tts_group.add_argument(
+        "--piper_length_scale",
+        default=1.0,
+        help="Phoneme length, a.k.a. speaking rate",
+    )
+
     args = parser.parse_args()
     return GeneralConfig(args)
 
 
+def setup_logging(log_level):
+    # Create a custom formatter
+    formatter = logging.Formatter(
+        "%(asctime)s - %(filename)s:%(lineno)d - %(funcName)s - %(levelname)s - %(message)s"
+    )
+
+    # Create a stream handler (prints to console)
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(formatter)
+
+    # Configure the root logger
+    root_logger = logging.getLogger()
+    root_logger.setLevel(log_level)
+    root_logger.addHandler(console_handler)
+
+
 def main():
     config = handle_args()
-    logger.setLevel(config.log)
+
+    setup_logging(config.log)
+
     AudiobookGenerator(config).run()