From 9ecd3d03851ea300a04d3bbb7e28a8215333ff66 Mon Sep 17 00:00:00 2001 From: p0n1 Date: Wed, 26 Jun 2024 18:19:14 +0800 Subject: [PATCH 1/5] fix: #56 bad audio for edge tts pause feature --- .../tts_providers/edge_tts_provider.py | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/audiobook_generator/tts_providers/edge_tts_provider.py b/audiobook_generator/tts_providers/edge_tts_provider.py index 6bb71a3..825e04c 100644 --- a/audiobook_generator/tts_providers/edge_tts_provider.py +++ b/audiobook_generator/tts_providers/edge_tts_provider.py @@ -14,9 +14,6 @@ logger = logging.getLogger(__name__) -MAX_RETRIES = 12 # Max_retries constant for network errors - - async def get_supported_voices(): # List all available voices and their attributes. # This pulls data from the URL used by Microsoft Edge to return a list of @@ -33,6 +30,9 @@ async def get_supported_voices(): return result + +# Credit: https://gist.github.com/moha-abdi/8ddbcb206c38f592c65ada1e5479f2bf +# @phuchoang2603 contributed pause support in https://github.com/p0n1/epub_to_audiobook/pull/45 class CommWithPauses(Communicate): # This class uses edge_tts to generate text # but with pauses for example:- text: 'Hello @@ -48,22 +48,30 @@ def __init__( self.file = io.BytesIO() def parse_text(self): + logger.debug(f"Parsing the text, looking for pauses in text: {self.text}") if not "[pause:" in self.text: - return [(0, self.text)] + logger.debug(f"No pauses found in the text") + yield 0, self.text parts = self.text.split("[pause:") + logger.debug(f"split into parts: {parts}") for part in parts: if "]" in part: pause_time, content = part.split("]", 1) + logger.debug(f"Pause time: {pause_time}, Content: {content.strip()}") yield int(pause_time), content.strip() else: content = part + logger.debug(f"No pause time, Content: {content.strip()}") yield 0, content.strip() async def chunkify(self): + logger.debug(f"Chunkifying the text") for pause_time, content in self.parsed: - if pause_time: + logger.debug(f"pause_time: {pause_time}") + logger.debug(f"content: {content}") + if pause_time > 0: pause_bytes = self.generate_pause(pause_time) self.file.write(pause_bytes) @@ -77,6 +85,7 @@ def generate_pause(self, time: int) -> bytes: return silent.raw_data async def generate_audio(self, text: str) -> bytes: + logger.debug(f"Generating audio for: {text}") # this genertes the real TTS using edge_tts for this part. temp_chunk = io.BytesIO() self.text = text @@ -87,8 +96,10 @@ async def generate_audio(self, text: str) -> bytes: temp_chunk.seek(0) # handle the case where the chunk is empty try: + logger.debug(f"Decoding the chunk") decoded_chunk = AudioSegment.from_mp3(temp_chunk) except: + logger.debug(f"Empty chunk") decoded_chunk = AudioSegment.silent(0, 24000) return decoded_chunk.raw_data @@ -139,12 +150,15 @@ def text_to_speech( output_file: str, audio_tags: AudioTags, ): + # Replace break string with pause tag text = text.replace( self.get_break_string().strip(), f"[pause: {self.config.break_duration}]" ) + logger.debug(f"Text to speech, adding pause mark: {text}") + communicate = CommWithPauses( text=text, voice_name=self.config.voice_name, From 59cd02e2f92991a7fa196142cea6d17154824146 Mon Sep 17 00:00:00 2001 From: p0n1 Date: Thu, 27 Jun 2024 17:47:30 +0800 Subject: [PATCH 2/5] fix: simplify pause feature --- audiobook_generator/config/general_config.py | 2 +- .../core/audiobook_generator.py | 5 + .../tts_providers/edge_tts_provider.py | 128 ++++++++---------- main.py | 6 +- 4 files changed, 64 insertions(+), 77 deletions(-) diff --git a/audiobook_generator/config/general_config.py b/audiobook_generator/config/general_config.py index b0d68ff..513acb9 100644 --- a/audiobook_generator/config/general_config.py +++ b/audiobook_generator/config/general_config.py @@ -22,7 +22,7 @@ def __init__(self, args): self.output_format = args.output_format self.model_name = args.model_name - # TTS provider: Azure specific arguments + # TTS provider: Azure & Edge TTS specific arguments self.break_duration = args.break_duration # TTS provider: Edge specific arguments diff --git a/audiobook_generator/core/audiobook_generator.py b/audiobook_generator/core/audiobook_generator.py index 80f3207..fd84622 100644 --- a/audiobook_generator/core/audiobook_generator.py +++ b/audiobook_generator/core/audiobook_generator.py @@ -103,6 +103,11 @@ def run(self): output_file, audio_tags, ) + logger.info( + f"✅ Converted chapter {idx}/{len(chapters)}: {title}" + ) + logger.info(f"All chapters converted. 🎉🎉🎉") + except KeyboardInterrupt: logger.info("Job stopped by user.") exit() diff --git a/audiobook_generator/tts_providers/edge_tts_provider.py b/audiobook_generator/tts_providers/edge_tts_provider.py index 825e04c..bc83fed 100644 --- a/audiobook_generator/tts_providers/edge_tts_provider.py +++ b/audiobook_generator/tts_providers/edge_tts_provider.py @@ -14,6 +14,7 @@ logger = logging.getLogger(__name__) + async def get_supported_voices(): # List all available voices and their attributes. # This pulls data from the URL used by Microsoft Edge to return a list of @@ -27,7 +28,7 @@ async def get_supported_voices(): for voice in voices: result[voice["ShortName"]] = voice["Locale"] - + return result @@ -36,72 +37,71 @@ async def get_supported_voices(): class CommWithPauses(Communicate): # This class uses edge_tts to generate text # but with pauses for example:- text: 'Hello - # this is simple text. [pause: 2s] Paused 2s' + # this is simple text. [pause: 1000] Paused 1000ms' def __init__( self, text: str, voice_name: str, - **kwargs + break_string: str, + break_duration: int = 1250, + **kwargs, ) -> None: super().__init__(text, voice_name, **kwargs) + self.break_string = break_string + self.break_duration = int(break_duration) self.parsed = self.parse_text() self.file = io.BytesIO() def parse_text(self): - logger.debug(f"Parsing the text, looking for pauses in text: {self.text}") - if not "[pause:" in self.text: - logger.debug(f"No pauses found in the text") - yield 0, self.text - - parts = self.text.split("[pause:") - logger.debug(f"split into parts: {parts}") - for part in parts: - if "]" in part: - pause_time, content = part.split("]", 1) - logger.debug(f"Pause time: {pause_time}, Content: {content.strip()}") - yield int(pause_time), content.strip() - - else: - content = part - logger.debug(f"No pause time, Content: {content.strip()}") - yield 0, content.strip() + logger.debug( + f"Parsing the text, looking for break/pauses in text: <{self.text}>" + ) + if self.break_string not in self.text: + logger.debug(f"No break/pauses found in the text") + return [self.text] + + parts = self.text.split(self.break_string) + logger.debug(f"split into <{len(parts)}> parts: {parts}") + return parts async def chunkify(self): logger.debug(f"Chunkifying the text") - for pause_time, content in self.parsed: - logger.debug(f"pause_time: {pause_time}") - logger.debug(f"content: {content}") - if pause_time > 0: - pause_bytes = self.generate_pause(pause_time) + for content in self.parsed: + audio_bytes = await self.generate_audio(content) + self.file.write(audio_bytes) + if content != self.parsed[-1]: + # only same break duration for all breaks is supported now + pause_bytes = self.generate_pause(self.break_duration) self.file.write(pause_bytes) - - if content: - audio_bytes = await self.generate_audio(content) - self.file.write(audio_bytes) + logger.debug(f"Chunkifying done") def generate_pause(self, time: int) -> bytes: + logger.debug(f"Generating pause") # pause time should be provided in ms silent: AudioSegment = AudioSegment.silent(time, 24000) - return silent.raw_data + return silent.raw_data # type: ignore async def generate_audio(self, text: str) -> bytes: - logger.debug(f"Generating audio for: {text}") + logger.debug(f"Generating audio for: <{text}>") # this genertes the real TTS using edge_tts for this part. temp_chunk = io.BytesIO() self.text = text async for chunk in self.stream(): - if chunk['type'] == 'audio': - temp_chunk.write(chunk['data']) + if chunk["type"] == "audio": + temp_chunk.write(chunk["data"]) temp_chunk.seek(0) # handle the case where the chunk is empty try: logger.debug(f"Decoding the chunk") decoded_chunk = AudioSegment.from_mp3(temp_chunk) - except: - logger.debug(f"Empty chunk") + except Exception as e: + logger.warning( + f"Failed to decode the chunk, reason: {e}, returning a silent chunk." + ) decoded_chunk = AudioSegment.silent(0, 24000) - return decoded_chunk.raw_data + logger.debug(f"Returning the decoded chunk") + return decoded_chunk.raw_data # type: ignore async def save( self, @@ -114,12 +114,11 @@ async def save( self.file.seek(0) audio: AudioSegment = AudioSegment.from_raw( - self.file, - sample_width=2, - frame_rate=24000, - channels=1 + self.file, sample_width=2, frame_rate=24000, channels=1 ) audio.export(audio_fname) + logger.info(f"Saved the audio to: {audio_fname}") + class EdgeTTSProvider(BaseTTSProvider): def __init__(self, config: GeneralConfig): @@ -142,35 +141,29 @@ def __str__(self) -> str: async def validate_config(self): if self.config.voice_name not in await get_supported_voices(): - raise ValueError(f"EdgeTTS: Unsupported voice name: {self.config.voice_name}") + raise ValueError( + f"EdgeTTS: Unsupported voice name: {self.config.voice_name}" + ) def text_to_speech( - self, - text: str, - output_file: str, - audio_tags: AudioTags, + self, + text: str, + output_file: str, + audio_tags: AudioTags, ): - - # Replace break string with pause tag - text = text.replace( - self.get_break_string().strip(), - f"[pause: {self.config.break_duration}]" - ) - - logger.debug(f"Text to speech, adding pause mark: {text}") communicate = CommWithPauses( text=text, voice_name=self.config.voice_name, + break_string=self.get_break_string().strip(), + break_duration=int(self.config.break_duration), rate=self.config.voice_rate, volume=self.config.voice_volume, pitch=self.config.voice_pitch, - proxy=self.config.proxy + proxy=self.config.proxy, ) - asyncio.run( - communicate.save(output_file) - ) + asyncio.run(communicate.save(output_file)) set_audio_tags(output_file, audio_tags) @@ -181,21 +174,10 @@ def get_break_string(self): return " @BRK#" def get_output_file_extension(self): - if self.config.output_format.startswith("amr"): - return "amr" - elif self.config.output_format.startswith("ogg"): - return "ogg" - elif self.config.output_format.endswith("truesilk"): - return "silk" - elif self.config.output_format.endswith("pcm"): - return "pcm" - elif self.config.output_format.startswith("raw"): - return "wav" - elif self.config.output_format.startswith("webm"): - return "webm" - elif self.config.output_format.endswith("opus"): - return "opus" - elif self.config.output_format.endswith("mp3"): + if self.config.output_format.endswith("mp3"): return "mp3" else: - raise NotImplementedError(f"Unknown file extension for output format: {self.config.output_format}") + # Only mp3 supported in edge-tts https://github.com/rany2/edge-tts/issues/179 + raise NotImplementedError( + f"Unknown file extension for output format: {self.config.output_format}. Only mp3 supported in edge-tts. See https://github.com/rany2/edge-tts/issues/179." + ) diff --git a/main.py b/main.py index b47de8b..6a40e17 100644 --- a/main.py +++ b/main.py @@ -124,11 +124,11 @@ def handle_args(): help="Proxy server for the TTS provider. Format: http://[username:password@]proxy.server:port", ) - azure_tts_group = parser.add_argument_group(title="azure specific") - azure_tts_group.add_argument( + azure_edge_tts_group = parser.add_argument_group(title="azure/edge specific") + azure_edge_tts_group.add_argument( "--break_duration", default="1250", - help="Break duration in milliseconds for the different paragraphs or sections (default: 1250). Valid values range from 0 to 5000 milliseconds.", + help="Break duration in milliseconds for the different paragraphs or sections (default: 1250, means 1.25 s). Valid values range from 0 to 5000 milliseconds for Azure TTS.", ) args = parser.parse_args() From 31e5eb3390285f1d327ecb8f30772991a668326c Mon Sep 17 00:00:00 2001 From: p0n1 Date: Thu, 27 Jun 2024 18:23:56 +0800 Subject: [PATCH 3/5] fix: remove useless save --- audiobook_generator/tts_providers/edge_tts_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audiobook_generator/tts_providers/edge_tts_provider.py b/audiobook_generator/tts_providers/edge_tts_provider.py index bc83fed..b05f247 100644 --- a/audiobook_generator/tts_providers/edge_tts_provider.py +++ b/audiobook_generator/tts_providers/edge_tts_provider.py @@ -110,12 +110,12 @@ async def save( ) -> None: # Save the audio and metadata to the specified files. await self.chunkify() - await super().save(audio_fname, metadata_fname) self.file.seek(0) audio: AudioSegment = AudioSegment.from_raw( self.file, sample_width=2, frame_rate=24000, channels=1 ) + logger.debug(f"Exporting the audio") audio.export(audio_fname) logger.info(f"Saved the audio to: {audio_fname}") From f6bf9749ed9379cd1a4d308e24f0928414d63e33 Mon Sep 17 00:00:00 2001 From: p0n1 Date: Fri, 28 Jun 2024 00:12:31 +0800 Subject: [PATCH 4/5] feat: add 'none' option for newline_mode to ignore wrong paragraphs in some case --- .../book_parsers/epub_book_parser.py | 2 ++ .../tts_providers/edge_tts_provider.py | 2 +- main.py | 22 ++++++++++--------- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/audiobook_generator/book_parsers/epub_book_parser.py b/audiobook_generator/book_parsers/epub_book_parser.py index ea58d7e..9e14072 100644 --- a/audiobook_generator/book_parsers/epub_book_parser.py +++ b/audiobook_generator/book_parsers/epub_book_parser.py @@ -53,6 +53,8 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]: cleaned_text = re.sub(r"[\n]+", break_string, raw.strip()) elif self.config.newline_mode == "double": cleaned_text = re.sub(r"[\n]{2,}", break_string, raw.strip()) + elif self.config.newline_mode == "none": + cleaned_text = re.sub(r"[\n]+", " ", raw.strip()) else: raise ValueError(f"Invalid newline mode: {self.config.newline_mode}") diff --git a/audiobook_generator/tts_providers/edge_tts_provider.py b/audiobook_generator/tts_providers/edge_tts_provider.py index b05f247..ab67ad2 100644 --- a/audiobook_generator/tts_providers/edge_tts_provider.py +++ b/audiobook_generator/tts_providers/edge_tts_provider.py @@ -69,7 +69,7 @@ async def chunkify(self): for content in self.parsed: audio_bytes = await self.generate_audio(content) self.file.write(audio_bytes) - if content != self.parsed[-1]: + if content != self.parsed[-1] and self.break_duration > 0: # only same break duration for all breaks is supported now pause_bytes = self.generate_pause(self.break_duration) self.file.write(pause_bytes) diff --git a/main.py b/main.py index 6a40e17..d4677f7 100644 --- a/main.py +++ b/main.py @@ -3,7 +3,9 @@ from audiobook_generator.config.general_config import GeneralConfig from audiobook_generator.core.audiobook_generator import AudiobookGenerator -from audiobook_generator.tts_providers.base_tts_provider import get_supported_tts_providers +from audiobook_generator.tts_providers.base_tts_provider import ( + get_supported_tts_providers, +) logging.basicConfig( level=logging.INFO, @@ -25,7 +27,7 @@ def handle_args(): ) parser.add_argument( "--log", - choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Log level (default: INFO), can be DEBUG, INFO, WARNING, ERROR, CRITICAL", ) @@ -46,9 +48,9 @@ def handle_args(): ) parser.add_argument( "--newline_mode", - choices=["single", "double"], + choices=["single", "double", "none"], default="double", - help="Choose the mode of detecting new paragraphs: 'single' or 'double'. 'single' means a single newline character, while 'double' means two consecutive newline characters. (default: double, works for most ebooks but will detect less paragraphs for some ebooks)", + help="Choose the mode of detecting new paragraphs: 'single', 'double', or 'none'. 'single' means a single newline character, while 'double' means two consecutive newline characters. 'none' means all newline characters will be replace with blank so paragraphs will not be detected. (default: double, works for most ebooks but will detect less paragraphs for some ebooks)", ) parser.add_argument( "--title_mode", @@ -97,26 +99,26 @@ def handle_args(): edge_tts_group = parser.add_argument_group(title="edge specific") edge_tts_group.add_argument( "--voice_rate", - help=''' + help=""" Speaking rate of the text. Valid relative values range from -50%%(--xxx='-50%%') to +100%%. For negative value use format --arg=value, - ''' + """, ) edge_tts_group.add_argument( "--voice_volume", - help=''' + help=""" Volume level of the speaking voice. Valid relative values floor to -100%%. For negative value use format --arg=value, - ''' + """, ) edge_tts_group.add_argument( "--voice_pitch", - help=''' + help=""" Baseline pitch for the text.Valid relative values like -80Hz,+50Hz, pitch changes should be within 0.5 to 1.5 times the original audio. For negative value use format --arg=value, - ''' + """, ) edge_tts_group.add_argument( From d16d571fc91e91c718bbdb8754d33b41dea10601 Mon Sep 17 00:00:00 2001 From: p0n1 Date: Fri, 28 Jun 2024 11:23:14 +0800 Subject: [PATCH 5/5] README: update help msg --- README.md | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 8e1a74d..ce023cb 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,8 @@ python3 main.py -h usage: main.py [-h] [--tts {azure,openai,edge}] [--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [--preview] [--no_prompt] [--language LANGUAGE] - [--newline_mode {single,double}] + [--newline_mode {single,double,none}] + [--title_mode {auto,tag_text,first_few}] [--chapter_start CHAPTER_START] [--chapter_end CHAPTER_END] [--output_text] [--remove_endnotes] [--voice_name VOICE_NAME] [--output_format OUTPUT_FORMAT] [--model_name MODEL_NAME] @@ -126,13 +127,20 @@ options: different strategies in this tool, especially for Chinese characters. For Chinese books, use zh-CN, zh- TW, or zh-HK. - --newline_mode {single,double} - Choose the mode of detecting new paragraphs: 'single' - or 'double'. 'single' means a single newline + --newline_mode {single,double,none} + Choose the mode of detecting new paragraphs: 'single', + 'double', or 'none'. 'single' means a single newline character, while 'double' means two consecutive - newline characters. (default: double, works for most + newline characters. 'none' means all newline + characters will be replace with blank so paragraphs + will not be detected. (default: double, works for most ebooks but will detect less paragraphs for some ebooks) + --title_mode {auto,tag_text,first_few} + Choose the parse mode for chapter title, 'tag_text' + search 'title','h1','h2','h3' tag for title, + 'first_few' set first 60 characters as title, 'auto' + auto apply the best mode for current chapter. --chapter_start CHAPTER_START Chapter start index (default: 1, starting from 1) --chapter_end CHAPTER_END @@ -170,11 +178,12 @@ edge specific: --proxy PROXY Proxy server for the TTS provider. Format: http://[username:password@]proxy.server:port -azure specific: +azure/edge specific: --break_duration BREAK_DURATION Break duration in milliseconds for the different - paragraphs or sections (default: 1250). Valid values - range from 0 to 5000 milliseconds. + paragraphs or sections (default: 1250, means 1.25 s). + Valid values range from 0 to 5000 milliseconds for + Azure TTS. ``` **Example**: @@ -249,7 +258,7 @@ Check https://platform.openai.com/docs/quickstart/account-setup. Make sure you c Edge TTS and Azure TTS are almost same, the difference is that Edge TTS don't require API Key because it's based on Edge read aloud functionality, and parameters are restricted a bit, like [custom ssml](https://github.com/rany2/edge-tts#custom-ssml). -Check https://github.com/p0n1/epub_to_audiobook/blob/main/audiobook_generator/tts_providers/edge_tts_provider.py#L17 for supported voices. +Check https://gist.github.com/BettyJJ/17cbaa1de96235a7f5773b8690a20462 for supported voices. **If you want to try this project quickly, Edge TTS is highly recommended.**