From 993c982c9c258d306895d89653cfb04817db9697 Mon Sep 17 00:00:00 2001 From: Allen Nikka Date: Tue, 12 Nov 2024 17:55:02 -0800 Subject: [PATCH 1/5] Added dotenv parsing and default azure voice --- .env-example | 3 +++ .gitignore | 5 ++++- main.py | 13 +++++++++++-- requirements.txt | 3 ++- 4 files changed, 20 insertions(+), 4 deletions(-) create mode 100644 .env-example diff --git a/.env-example b/.env-example new file mode 100644 index 0000000..ab376df --- /dev/null +++ b/.env-example @@ -0,0 +1,3 @@ +MS_TTS_KEY= # for Azure +MS_TTS_REGION= # for Azure +OPENAI_API_KEY= # for OpenAI \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1081772..36c237e 100644 --- a/.gitignore +++ b/.gitignore @@ -32,4 +32,7 @@ private_examples/ # custom scripts/ *.onnx -*.onnx.json \ No newline at end of file +*.onnx.json + +# Environment variables +.env \ No newline at end of file diff --git a/main.py b/main.py index bd28422..e375551 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,7 @@ import argparse +from ast import parse import logging +import os from audiobook_generator.config.general_config import GeneralConfig from audiobook_generator.core.audiobook_generator import AudiobookGenerator @@ -7,8 +9,13 @@ get_supported_tts_providers, ) +from dotenv import load_dotenv + +load_dotenv() + def handle_args(): + parser = argparse.ArgumentParser(description="Convert text book to audiobook") parser.add_argument("input_file", help="Path to the EPUB file") parser.add_argument("output_folder", help="Path to the output folder") @@ -85,7 +92,8 @@ def handle_args(): parser.add_argument( "--voice_name", - help="Various TTS providers has different voice names, look up for your provider settings.", + default="en-US-DavisMultilingualNeural", + help="Various TTS providers has different voice names, look up for your provider settings. Default is for Azure TTS.", ) parser.add_argument( @@ -179,7 +187,8 @@ def setup_logging(log_level): def main(): config = handle_args() - + config.tts + print(f"TTS Key is{os.environ.get('MS_TTS_KEY')}") setup_logging(config.log) AudiobookGenerator(config).run() diff --git a/requirements.txt b/requirements.txt index 001d104..336ffef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ openai==1.35.7 requests==2.32.3 socksio==1.0.0 edge-tts==6.1.12 -pydub==0.25.1 \ No newline at end of file +pydub==0.25.1 +python-dotenv==1.0.1 \ No newline at end of file From 295580cb54221133384786a01f49855c3ed6ecee Mon Sep 17 00:00:00 2001 From: Allen Nikka Date: Tue, 12 Nov 2024 18:18:44 -0800 Subject: [PATCH 2/5] Move defaults into azure provider --- .../tts_providers/azure_tts_provider.py | 22 ++++++++++--------- main.py | 4 +--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/audiobook_generator/tts_providers/azure_tts_provider.py b/audiobook_generator/tts_providers/azure_tts_provider.py index c2ac47c..02a0cc1 100644 --- a/audiobook_generator/tts_providers/azure_tts_provider.py +++ b/audiobook_generator/tts_providers/azure_tts_provider.py @@ -20,7 +20,7 @@ class AzureTTSProvider(BaseTTSProvider): def __init__(self, config: GeneralConfig): # TTS provider specific config - config.voice_name = config.voice_name or "en-US-GuyNeural" + config.voice_name = config.voice_name or "en-US-DavisMultilingualNeural" config.output_format = config.output_format or "audio-24khz-48kbitrate-mono-mp3" # 16$ per 1 million characters @@ -47,8 +47,8 @@ def __init__(self, config: GeneralConfig): def __str__(self) -> str: return ( - super().__str__() - + f", voice_name={self.config.voice_name}, language={self.config.language}, break_duration={self.config.break_duration}, output_format={self.config.output_format}" + super().__str__() + + f", voice_name={self.config.voice_name}, language={self.config.language}, break_duration={self.config.break_duration}, output_format={self.config.output_format}" ) def is_access_token_expired(self) -> bool: @@ -77,16 +77,16 @@ def get_access_token(self) -> str: f"Network error while getting access token (attempt {retry + 1}/{MAX_RETRIES}): {e}" ) if retry < MAX_RETRIES - 1: - sleep(2 ** retry) + sleep(2**retry) else: raise e raise Exception("Failed to get access token") def text_to_speech( - self, - text: str, - output_file: str, - audio_tags: AudioTags, + self, + text: str, + output_file: str, + audio_tags: AudioTags, ): # Adjust this value based on your testing max_chars = 1800 if self.config.language.startswith("zh") else 3000 @@ -139,7 +139,7 @@ def text_to_speech( f"Error while converting text to speech (attempt {retry + 1}): {e}" ) if retry < MAX_RETRIES - 1: - sleep(2 ** retry) + sleep(2**retry) else: raise e @@ -171,7 +171,9 @@ def get_output_file_extension(self): elif self.config.output_format.endswith("mp3"): return "mp3" else: - raise NotImplementedError(f"Unknown file extension for output format: {self.config.output_format}") + raise NotImplementedError( + f"Unknown file extension for output format: {self.config.output_format}" + ) def validate_config(self): # TODO: Need to dig into Azure properties, im not familiar with them, but look at OpenAI as ref example diff --git a/main.py b/main.py index e375551..21f3067 100644 --- a/main.py +++ b/main.py @@ -92,8 +92,7 @@ def handle_args(): parser.add_argument( "--voice_name", - default="en-US-DavisMultilingualNeural", - help="Various TTS providers has different voice names, look up for your provider settings. Default is for Azure TTS.", + help="Various TTS providers has different voice names, look up for your provider settings.", ) parser.add_argument( @@ -188,7 +187,6 @@ def setup_logging(log_level): def main(): config = handle_args() config.tts - print(f"TTS Key is{os.environ.get('MS_TTS_KEY')}") setup_logging(config.log) AudiobookGenerator(config).run() From 9ce4ab5a15acd0605bcef1775d3aef3b9637f12c Mon Sep 17 00:00:00 2001 From: Allen Nikka Date: Thu, 14 Nov 2024 14:59:48 -0800 Subject: [PATCH 3/5] Ignoring custom models --- .gitignore | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 36c237e..ef231d9 100644 --- a/.gitignore +++ b/.gitignore @@ -34,5 +34,8 @@ scripts/ *.onnx *.onnx.json -# Environment variables -.env \ No newline at end of file +# Models +piper_models/* + +# Env files +.env From 32734b712b88d1041adcdf6418f4d473f8f8fb25 Mon Sep 17 00:00:00 2001 From: Allen Nikka Date: Thu, 14 Nov 2024 15:20:36 -0800 Subject: [PATCH 4/5] Added multithreading --- .../tts_providers/azure_tts_provider.py | 156 +++++++++++------- 1 file changed, 95 insertions(+), 61 deletions(-) diff --git a/audiobook_generator/tts_providers/azure_tts_provider.py b/audiobook_generator/tts_providers/azure_tts_provider.py index 02a0cc1..d2073c7 100644 --- a/audiobook_generator/tts_providers/azure_tts_provider.py +++ b/audiobook_generator/tts_providers/azure_tts_provider.py @@ -1,15 +1,21 @@ +import concurrent +import concurrent.futures import html import io import logging import math +import multiprocessing import os +import threading from datetime import datetime, timedelta from time import sleep +from typing import Optional + import requests -from audiobook_generator.core.audio_tags import AudioTags from audiobook_generator.config.general_config import GeneralConfig -from audiobook_generator.core.utils import split_text, set_audio_tags +from audiobook_generator.core.audio_tags import AudioTags +from audiobook_generator.core.utils import set_audio_tags, split_text from audiobook_generator.tts_providers.base_tts_provider import BaseTTSProvider logger = logging.getLogger(__name__) @@ -29,6 +35,7 @@ def __init__(self, config: GeneralConfig): # access token and expiry time self.access_token = None self.token_expiry_time = datetime.utcnow() + self.token_lock = threading.Lock() super().__init__(config) subscription_key = os.environ.get("MS_TTS_KEY") @@ -52,15 +59,21 @@ def __str__(self) -> str: ) def is_access_token_expired(self) -> bool: - return self.access_token is None or datetime.utcnow() >= self.token_expiry_time + with self.token_lock: + return ( + self.access_token is None or datetime.utcnow() >= self.token_expiry_time + ) def auto_renew_access_token(self) -> str: - if self.access_token is None or self.is_access_token_expired(): - logger.info( - f"azure tts access_token doesn't exist or is expired, getting new one" - ) - self.access_token = self.get_access_token() - self.token_expiry_time = datetime.utcnow() + timedelta(minutes=9, seconds=1) + with self.token_lock: + if self.access_token is None or self.is_access_token_expired(): + logger.info( + f"azure tts access_token doesn't exist or is expired, getting new one" + ) + self.access_token = self.get_access_token() + self.token_expiry_time = datetime.utcnow() + timedelta( + minutes=9, seconds=1 + ) return self.access_token def get_access_token(self) -> str: @@ -82,6 +95,57 @@ def get_access_token(self) -> str: raise e raise Exception("Failed to get access token") + def process_chunk( + self, chunk: str, audio_tags: AudioTags, i: int, total_chunks: int + ) -> Optional[tuple[int, io.BytesIO]]: + logger.debug( + f"Processing chunk {i} of {total_chunks}, length={len(chunk)}, text=[{chunk}]" + ) + escaped_text = html.escape(chunk) + logger.debug(f"Escaped text: [{escaped_text}]") + # replace MAGIC_BREAK_STRING with a break tag for section/paragraph break + escaped_text = escaped_text.replace( + self.get_break_string().strip(), + f" ", + ) # strip in case leading bank is missing + logger.info( + f"Processing chapter-{audio_tags.idx} <{audio_tags.title}>, chunk {i} of {total_chunks}" + ) + ssml = f"{escaped_text}" + logger.debug(f"SSML: [{ssml}]") + + for retry in range(MAX_RETRIES): + self.auto_renew_access_token() + headers = { + "Authorization": f"Bearer {self.access_token}", + "Content-Type": "application/ssml+xml", + "X-Microsoft-OutputFormat": self.config.output_format, + "User-Agent": "Python", + } + try: + logger.info( + "Sending request to Azure TTS, data length: " + str(len(ssml)) + ) + response = requests.post( + self.TTS_URL, headers=headers, data=ssml.encode("utf-8") + ) + response.raise_for_status() # Will raise HTTPError for 4XX or 5XX status + logger.info( + "Got response from Azure TTS, response length: " + + str(len(response.content)) + ) + return (i, io.BytesIO(response.content)) + except requests.exceptions.RequestException as e: + logger.warning( + f"Error while converting text to speech (attempt {retry + 1}): {e}" + ) + if retry < MAX_RETRIES - 1: + sleep(2**retry) + else: + raise e + finally: + response.close() + def text_to_speech( self, text: str, @@ -92,59 +156,29 @@ def text_to_speech( max_chars = 1800 if self.config.language.startswith("zh") else 3000 text_chunks = split_text(text, max_chars, self.config.language) - - audio_segments = [] - - for i, chunk in enumerate(text_chunks, 1): - logger.debug( - f"Processing chunk {i} of {len(text_chunks)}, length={len(chunk)}, text=[{chunk}]" - ) - escaped_text = html.escape(chunk) - logger.debug(f"Escaped text: [{escaped_text}]") - # replace MAGIC_BREAK_STRING with a break tag for section/paragraph break - escaped_text = escaped_text.replace( - self.get_break_string().strip(), - f" ", - ) # strip in case leading bank is missing - logger.info( - f"Processing chapter-{audio_tags.idx} <{audio_tags.title}>, chunk {i} of {len(text_chunks)}" - ) - ssml = f"{escaped_text}" - logger.debug(f"SSML: [{ssml}]") - - for retry in range(MAX_RETRIES): - self.auto_renew_access_token() - headers = { - "Authorization": f"Bearer {self.access_token}", - "Content-Type": "application/ssml+xml", - "X-Microsoft-OutputFormat": self.config.output_format, - "User-Agent": "Python", - } - try: - logger.info( - "Sending request to Azure TTS, data length: " + str(len(ssml)) - ) - response = requests.post( - self.TTS_URL, headers=headers, data=ssml.encode("utf-8") - ) - response.raise_for_status() # Will raise HTTPError for 4XX or 5XX status - logger.info( - "Got response from Azure TTS, response length: " - + str(len(response.content)) - ) - audio_segments.append(io.BytesIO(response.content)) - break - except requests.exceptions.RequestException as e: - logger.warning( - f"Error while converting text to speech (attempt {retry + 1}): {e}" - ) - if retry < MAX_RETRIES - 1: - sleep(2**retry) - else: - raise e - + total_chunks = len(text_chunks) + + audio_segments: list[tuple[int, io.BytesIO]] = [] + with concurrent.futures.ThreadPoolExecutor( + max_workers=multiprocessing.cpu_count() + ) as executor: + futures = { + executor.submit( + self.process_chunk, chunk, audio_tags, i, total_chunks + ): i + for i, chunk in enumerate(text_chunks, 1) + } + for i, chunk in enumerate(text_chunks, 1): + logger.debug( + f"Processing chunk {i} of {len(text_chunks)}, length={len(chunk)}, text=[{chunk}]" + ) + for future in concurrent.futures.as_completed(futures): + result = future.result() + if result: + audio_segments.append(result) with open(output_file, "wb") as outfile: - for segment in audio_segments: + + for _, segment in sorted(audio_segments, key=lambda x: x[0]): segment.seek(0) outfile.write(segment.read()) From eabba42a73922494c351f50a20018f4c98e04d7f Mon Sep 17 00:00:00 2001 From: Allen Nikka Date: Thu, 14 Nov 2024 15:37:25 -0800 Subject: [PATCH 5/5] Limit concurrency and add error handling --- .../tts_providers/azure_tts_provider.py | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/audiobook_generator/tts_providers/azure_tts_provider.py b/audiobook_generator/tts_providers/azure_tts_provider.py index d2073c7..f589262 100644 --- a/audiobook_generator/tts_providers/azure_tts_provider.py +++ b/audiobook_generator/tts_providers/azure_tts_provider.py @@ -35,7 +35,7 @@ def __init__(self, config: GeneralConfig): # access token and expiry time self.access_token = None self.token_expiry_time = datetime.utcnow() - self.token_lock = threading.Lock() + self.token_lock = threading.RLock() super().__init__(config) subscription_key = os.environ.get("MS_TTS_KEY") @@ -78,6 +78,7 @@ def auto_renew_access_token(self) -> str: def get_access_token(self) -> str: for retry in range(MAX_RETRIES): + response = None try: logger.info("Getting new access token") response = requests.post(self.TOKEN_URL, headers=self.TOKEN_HEADERS) @@ -93,6 +94,9 @@ def get_access_token(self) -> str: sleep(2**retry) else: raise e + finally: + if response is not None: + response.close() raise Exception("Failed to get access token") def process_chunk( @@ -115,13 +119,14 @@ def process_chunk( logger.debug(f"SSML: [{ssml}]") for retry in range(MAX_RETRIES): - self.auto_renew_access_token() + access_token = self.auto_renew_access_token() headers = { - "Authorization": f"Bearer {self.access_token}", + "Authorization": f"Bearer {access_token}", "Content-Type": "application/ssml+xml", "X-Microsoft-OutputFormat": self.config.output_format, "User-Agent": "Python", } + response = None # Initialize response try: logger.info( "Sending request to Azure TTS, data length: " + str(len(ssml)) @@ -129,7 +134,7 @@ def process_chunk( response = requests.post( self.TTS_URL, headers=headers, data=ssml.encode("utf-8") ) - response.raise_for_status() # Will raise HTTPError for 4XX or 5XX status + response.raise_for_status() logger.info( "Got response from Azure TTS, response length: " + str(len(response.content)) @@ -137,14 +142,15 @@ def process_chunk( return (i, io.BytesIO(response.content)) except requests.exceptions.RequestException as e: logger.warning( - f"Error while converting text to speech (attempt {retry + 1}): {e}" + f"Error while converting text to speech (attempt {retry + 1}/{MAX_RETRIES}): {e}" ) if retry < MAX_RETRIES - 1: sleep(2**retry) else: raise e finally: - response.close() + if response is not None: + response.close() def text_to_speech( self, @@ -160,7 +166,7 @@ def text_to_speech( audio_segments: list[tuple[int, io.BytesIO]] = [] with concurrent.futures.ThreadPoolExecutor( - max_workers=multiprocessing.cpu_count() + max_workers=4 # multiprocessing.cpu_count() ) as executor: futures = { executor.submit( @@ -173,9 +179,12 @@ def text_to_speech( f"Processing chunk {i} of {len(text_chunks)}, length={len(chunk)}, text=[{chunk}]" ) for future in concurrent.futures.as_completed(futures): - result = future.result() - if result: - audio_segments.append(result) + try: + result = future.result() + if result: + audio_segments.append(result) + except Exception as e: + logger.error(f"Error processing chunk: {e}") with open(output_file, "wb") as outfile: for _, segment in sorted(audio_segments, key=lambda x: x[0]):