From 17f489641bc06ebba8b84c5c4335d66bf75dc5ae Mon Sep 17 00:00:00 2001 From: p0n1 Date: Thu, 5 Sep 2024 06:23:25 +0000 Subject: [PATCH] fix: split and change piper default settings also fix logging and opus tagging --- .gitignore | 4 +- README.md | 39 +++++++----- .../book_parsers/epub_book_parser.py | 1 - audiobook_generator/config/general_config.py | 6 ++ .../core/audiobook_generator.py | 1 - audiobook_generator/core/utils.py | 2 +- .../tts_providers/azure_tts_provider.py | 1 - .../tts_providers/edge_tts_provider.py | 1 - .../tts_providers/openai_tts_provider.py | 1 - .../tts_providers/piper_tts_provider.py | 60 +++++++++---------- main.py | 49 ++++++++++++--- 11 files changed, 104 insertions(+), 61 deletions(-) diff --git a/.gitignore b/.gitignore index 2ff201d..1081772 100644 --- a/.gitignore +++ b/.gitignore @@ -30,4 +30,6 @@ audiobook_output/ private_examples/ # custom -scripts/ \ No newline at end of file +scripts/ +*.onnx +*.onnx.json \ No newline at end of file diff --git a/README.md b/README.md index 20086b3..c640ec2 100644 --- a/README.md +++ b/README.md @@ -390,27 +390,33 @@ Here are some examples that demonstrate various option combinations: ``` ### Examples Using Piper TTS - This command will convert an EPUB file to an audiobook using Piper TTS using the bare minimum parameters. - You always need to specify an onnx model file and the `piper-tts` executable needs to be in the current path. - ```sh - python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name /en_US-libritts_r-medium.onnx - ``` +*Make sure you have installed Piper TTS and have an onnx model file and corresponding config file. Check [Piper TTS](https://github.com/rhasspy/piper) for more details. You can follow their instructions to install Piper TTS, download the models and config files, play with it and then come back to try the examples below.* - Some models support multiple voices and that can be specified by using the voice_name parameter. 
+This command will convert an EPUB file to an audiobook using Piper TTS with the bare minimum parameters. +You always need to specify an onnx model file, and the `piper` executable needs to be on your $PATH. - ```sh - python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name /en_US-libritts_r-medium.onnx --voice_name 256 - ``` +```sh +python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name /en_US-libritts_r-medium.onnx +``` - You can also specify speed and pause duration +Some models support multiple voices; select one with the `--piper_speaker` parameter. - ```sh - python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name /en_US-libritts_r-medium.onnx --voice_name 256 --voice_rate 1.5 --break_duration 0.75 - ``` +```sh +python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name /en_US-libritts_r-medium.onnx --piper_speaker 256 +``` + +You can also specify speed (`--piper_length_scale`) and pause duration (`--piper_sentence_silence`). + +```sh +python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name /en_US-libritts_r-medium.onnx --piper_speaker 256 --piper_length_scale 1.5 --piper_sentence_silence 0.5 +``` - Piper TTS outputs `wav` format files (or raw) by default you should be able to specify any reasonable format via the `--output_format` parameter. - `opus` and `mp3` are good choices for size and compatibility. +Piper TTS outputs `wav` format files (or raw) by default, but you should be able to specify any other reasonable format via the `--output_format` parameter. `opus` and `mp3` are good choices for size and compatibility. 
+ +```sh +python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name /en_US-libritts_r-medium.onnx --piper_speaker 256 --piper_length_scale 1.5 --piper_sentence_silence 0.5 --output_format opus +``` ## Troubleshooting @@ -422,6 +428,9 @@ This may be because the Python version you are using is [less than 3.8](https:// Make sure ffmpeg binary is accessible from your path. If you are on a mac and use homebrew, you can do `brew install ffmpeg`, On Ubuntu you can do `sudo apt install ffmpeg` +### Piper TTS + +For installation-related issues, please refer to the [Piper TTS](https://github.com/rhasspy/piper) repository. It's important to note that if you're installing `piper-tts` via pip, [only Python 3.10](https://github.com/rhasspy/piper/issues/509) is currently supported. Mac users may encounter additional challenges when using the downloaded [binary](https://github.com/rhasspy/piper/issues/523). For more information on Mac-specific issues, please check [this issue](https://github.com/rhasspy/piper/issues/395) and [this pull request](https://github.com/rhasspy/piper/pull/412). 
## Related Projects diff --git a/audiobook_generator/book_parsers/epub_book_parser.py b/audiobook_generator/book_parsers/epub_book_parser.py index 95e98bb..fd8b7dc 100644 --- a/audiobook_generator/book_parsers/epub_book_parser.py +++ b/audiobook_generator/book_parsers/epub_book_parser.py @@ -15,7 +15,6 @@ class EpubBookParser(BaseBookParser): def __init__(self, config: GeneralConfig): super().__init__(config) - logger.setLevel(config.log) self.book = epub.read_epub(self.config.input_file, {"ignore_ncx": True}) def __str__(self) -> str: diff --git a/audiobook_generator/config/general_config.py b/audiobook_generator/config/general_config.py index d577393..e81df8d 100644 --- a/audiobook_generator/config/general_config.py +++ b/audiobook_generator/config/general_config.py @@ -32,5 +32,11 @@ def __init__(self, args): self.voice_pitch = args.voice_pitch self.proxy = args.proxy + # TTS provider: Piper specific arguments + self.piper_path = args.piper_path + self.piper_speaker = args.piper_speaker + self.piper_sentence_silence = args.piper_sentence_silence + self.piper_length_scale = args.piper_length_scale + def __str__(self): return ', '.join(f"{key}={value}" for key, value in self.__dict__.items()) diff --git a/audiobook_generator/core/audiobook_generator.py b/audiobook_generator/core/audiobook_generator.py index d5bd0d1..74fb980 100644 --- a/audiobook_generator/core/audiobook_generator.py +++ b/audiobook_generator/core/audiobook_generator.py @@ -27,7 +27,6 @@ def get_total_chars(chapters): class AudiobookGenerator: def __init__(self, config: GeneralConfig): self.config = config - logger.setLevel(config.log) def __str__(self) -> str: return f"{self.config}" diff --git a/audiobook_generator/core/utils.py b/audiobook_generator/core/utils.py index 5b49ddc..935afbd 100644 --- a/audiobook_generator/core/utils.py +++ b/audiobook_generator/core/utils.py @@ -49,7 +49,7 @@ def set_audio_tags(output_file, audio_tags): try: try: tags = ID3(output_file) - print(tags) + 
logger.debug(f"tags: {tags}") except ID3NoHeaderError: logger.debug(f"handling ID3NoHeaderError: {output_file}") tags = ID3() diff --git a/audiobook_generator/tts_providers/azure_tts_provider.py b/audiobook_generator/tts_providers/azure_tts_provider.py index ca9aa91..c2ac47c 100644 --- a/audiobook_generator/tts_providers/azure_tts_provider.py +++ b/audiobook_generator/tts_providers/azure_tts_provider.py @@ -19,7 +19,6 @@ class AzureTTSProvider(BaseTTSProvider): def __init__(self, config: GeneralConfig): - logger.setLevel(config.log) # TTS provider specific config config.voice_name = config.voice_name or "en-US-GuyNeural" config.output_format = config.output_format or "audio-24khz-48kbitrate-mono-mp3" diff --git a/audiobook_generator/tts_providers/edge_tts_provider.py b/audiobook_generator/tts_providers/edge_tts_provider.py index cfcd8d7..07acb7e 100644 --- a/audiobook_generator/tts_providers/edge_tts_provider.py +++ b/audiobook_generator/tts_providers/edge_tts_provider.py @@ -124,7 +124,6 @@ async def save( class EdgeTTSProvider(BaseTTSProvider): def __init__(self, config: GeneralConfig): - logger.setLevel(config.log) # TTS provider specific config config.voice_name = config.voice_name or "en-US-GuyNeural" config.output_format = config.output_format or "audio-24khz-48kbitrate-mono-mp3" diff --git a/audiobook_generator/tts_providers/openai_tts_provider.py b/audiobook_generator/tts_providers/openai_tts_provider.py index d80c42d..61ccad5 100644 --- a/audiobook_generator/tts_providers/openai_tts_provider.py +++ b/audiobook_generator/tts_providers/openai_tts_provider.py @@ -27,7 +27,6 @@ def get_supported_formats(): class OpenAITTSProvider(BaseTTSProvider): def __init__(self, config: GeneralConfig): - logger.setLevel(config.log) config.model_name = config.model_name or "tts-1" config.voice_name = config.voice_name or "alloy" config.output_format = config.output_format or "mp3" diff --git a/audiobook_generator/tts_providers/piper_tts_provider.py 
b/audiobook_generator/tts_providers/piper_tts_provider.py index 1f885ab..c1bfb1c 100644 --- a/audiobook_generator/tts_providers/piper_tts_provider.py +++ b/audiobook_generator/tts_providers/piper_tts_provider.py @@ -3,7 +3,6 @@ from pathlib import Path import logging - from pydub import AudioSegment from audiobook_generator.config.general_config import GeneralConfig @@ -18,24 +17,10 @@ class PiperTTSProvider(BaseTTSProvider): def __init__(self, config: GeneralConfig): - logger.setLevel(config.log) # TTS provider specific config - config.output_format = config.output_format or "opus" - - if config.voice_rate is None: - config.voice_rate = 1.0 - else: - try: - config.voice_rate = float(config.voice_rate) - except ValueError: - logger.error("Invalid voice_rate %r", config.voice_rate) - config.voice_rate = 1.0 - config.voice_name = config.voice_name or "0" - config.break_duration = config.break_duration or 0.2 - - # 0.000$ per 1 million characters - # or 0.000$ per 1000 characters + config.output_format = config.output_format or "mp3" + self.price = 0.000 super().__init__(config) @@ -57,29 +42,42 @@ def text_to_speech( tmpfilename = Path(tmpdirname) / "piper.wav" + cmd = [ + self.config.piper_path, + "--model", + self.config.model_name, + "--speaker", + str(self.config.piper_speaker), + "--sentence_silence", + str(self.config.piper_sentence_silence), + "--length_scale", + str(self.config.piper_length_scale), + "-f", + tmpfilename, + "--debug", + ] + + logger.info( + f"Running Piper TTS command: {' '.join(str(arg) for arg in cmd)}" + ) run( - [ - "piper-tts", - "--model", - self.config.model_name, - "--speaker", - self.config.voice_name, - "--sentence_silence", - str(self.config.break_duration), - "--length_scale", - str(1.0 / self.config.voice_rate), - "-f", - tmpfilename, - ], + cmd, input=text.encode("utf-8"), ) + # set audio tags, need to be done before conversion or opus won't work, not sure why + set_audio_tags(tmpfilename, audio_tags) + + logger.info( + f"Piper 
TTS command completed, converting {tmpfilename} to {self.config.output_format} format" + ) + # Convert the wav file to the desired format AudioSegment.from_wav(tmpfilename).export( output_file, format=self.config.output_format ) - set_audio_tags(output_file, audio_tags) + logger.info(f"Conversion completed, output file: {output_file}") def estimate_cost(self, total_chars): return 0 diff --git a/main.py b/main.py index 368d8e1..bd28422 100644 --- a/main.py +++ b/main.py @@ -7,13 +7,6 @@ get_supported_tts_providers, ) -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(levelname)s] %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", -) -logger = logging.getLogger(__name__) - def handle_args(): parser = argparse.ArgumentParser(description="Convert text book to audiobook") @@ -142,13 +135,53 @@ def handle_args(): help="Break duration in milliseconds for the different paragraphs or sections (default: 1250, means 1.25 s). Valid values range from 0 to 5000 milliseconds for Azure TTS.", ) + piper_tts_group = parser.add_argument_group(title="piper specific") + piper_tts_group.add_argument( + "--piper_path", + default="piper", + help="Path to the Piper TTS executable", + ) + piper_tts_group.add_argument( + "--piper_speaker", + default=0, + help="Piper speaker id, used for multi-speaker models", + ) + piper_tts_group.add_argument( + "--piper_sentence_silence", + default=0.2, + help="Seconds of silence after each sentence", + ) + piper_tts_group.add_argument( + "--piper_length_scale", + default=1.0, + help="Phoneme length, a.k.a. 
speaking rate", + ) + args = parser.parse_args() return GeneralConfig(args) +def setup_logging(log_level): + # Create a custom formatter + formatter = logging.Formatter( + "%(asctime)s - %(filename)s:%(lineno)d - %(funcName)s - %(levelname)s - %(message)s" + ) + + # Create a stream handler (prints to console) + console_handler = logging.StreamHandler() + console_handler.setFormatter(formatter) + + # Configure the root logger + root_logger = logging.getLogger() + root_logger.setLevel(log_level) + root_logger.addHandler(console_handler) + + def main(): config = handle_args() - logger.setLevel(config.log) + + setup_logging(config.log) + AudiobookGenerator(config).run()