From adff0dd29d5d638c722a85239886cafa75ca344b Mon Sep 17 00:00:00 2001
From: timz06 <91061595+phuchoang2603@users.noreply.github.com>
Date: Thu, 15 Feb 2024 14:12:55 +0000
Subject: [PATCH 1/7] feat: add edge-tts-pause support for paragraph/section break

---
 .../tts_providers/edge_tts_provider.py | 115 +++++++++++++++++-
 requirements.txt                       |   3 +-
 2 files changed, 113 insertions(+), 5 deletions(-)

diff --git a/audiobook_generator/tts_providers/edge_tts_provider.py b/audiobook_generator/tts_providers/edge_tts_provider.py
index 6e89b8c..1aba929 100644
--- a/audiobook_generator/tts_providers/edge_tts_provider.py
+++ b/audiobook_generator/tts_providers/edge_tts_provider.py
@@ -1,8 +1,11 @@
 import asyncio
 import logging
 import math
+import io
 
 from edge_tts.communicate import Communicate
+from typing import Union, Optional
+from pydub import AudioSegment
 
 from audiobook_generator.config.general_config import GeneralConfig
 from audiobook_generator.core.audio_tags import AudioTags
@@ -317,6 +320,104 @@ def get_supported_voices():
         'zu-ZA-ThembaNeural': 'zu-ZA',
     }
 
+class NoPausesFound(Exception):
+    def __init__(self, description = None) -> None:
+        self.description = (f'No pauses were found in the text. Please '
+            + f'consider using `edge_tts.Communicate` instead.')
+
+        super().__init__(self.description)
+
+class CommWithPauses(Communicate):
+    # This class uses edge_tts to generate text
+    # but with pauses for example:- text: 'Hello
+    # this is simple text. [pause: 2s] Paused 2s'
+    def __init__(
+        self,
+        text: str,
+        voice_name: str,
+        **kwargs
+    ) -> None:
+        super().__init__(text, voice_name, **kwargs)
+        self.parsed = self.parse_text()
+        self.file = io.BytesIO()
+
+    def parse_text(self):
+        if not "[pause:" in self.text:
+            raise NoPausesFound
+
+        parts = self.text.split("[pause:")
+        for part in parts:
+            if "]" in part:
+                pause_time, content = part.split("]", 1)
+                pause_time = self.parse_time(pause_time)
+
+                yield pause_time, content.strip()
+
+            else:
+                content = part
+                yield 0, content.strip()
+
+    def parse_time(self, time_str: str) -> int:
+        if time_str[-2:] == 'ms':
+            unit = 'ms'
+            time_value = int(time_str[:-2])
+            return time_value
+        else:
+            raise ValueError(f"Invalid time unit! only ms are allowed")
+
+    async def chunkify(self):
+        for pause_time, content in self.parsed:
+            if not pause_time and not content:
+                pass
+
+            elif not pause_time and content:
+                audio_bytes = await self.generate_audio(content)
+                self.file.write(audio_bytes)
+
+            elif not content and pause_time:
+                pause_bytes = self.generate_pause(pause_time)
+                self.file.write(pause_bytes)
+
+            else:
+                pause_bytes = self.generate_pause(pause_time)
+                audio_bytes = await self.generate_audio(content)
+                self.file.write(pause_bytes)
+                self.file.write(audio_bytes)
+
+    def generate_pause(self, time: int) -> bytes:
+        # pause time should be provided in ms
+        silent: AudioSegment = AudioSegment.silent(time, 24000)
+        return silent.raw_data
+
+    async def generate_audio(self, text: str) -> bytes:
+        # this genertes the real TTS using edge_tts for this part.
+        temp_chunk = io.BytesIO()
+        self.text = text
+        async for chunk in self.stream():
+            if chunk['type'] == 'audio':
+                temp_chunk.write(chunk['data'])
+
+        temp_chunk.seek(0)
+        decoded_chunk = AudioSegment.from_mp3(temp_chunk)
+        return decoded_chunk.raw_data
+
+    async def save(
+        self,
+        audio_fname: Union[str, bytes],
+        metadata_fname: Optional[Union[str, bytes]] = None,
+    ) -> None:
+        # Save the audio and metadata to the specified files.
+        await self.chunkify()
+        await super().save(audio_fname, metadata_fname)
+
+        self.file.seek(0)
+        audio: AudioSegment = AudioSegment.from_raw(
+            self.file,
+            sample_width=2,
+            frame_rate=24000,
+            channels=1
+        )
+        audio.export(audio_fname)
 
 class EdgeTTSProvider(BaseTTSProvider):
     def __init__(self, config: GeneralConfig):
@@ -347,15 +448,21 @@ def text_to_speech(
         output_file: str,
         audio_tags: AudioTags,
     ):
+        # Replace break string with pause tag
+        text = text.replace(
+            self.get_break_string().strip(),
+            f"[pause: {self.config.break_duration}ms]"
+        )
 
-        communicate = Communicate(
-            text,
-            self.config.voice_name,
+        communicate = CommWithPauses(
+            text=text,
+            voice_name=self.config.voice_name,
             rate=self.config.voice_rate,
             volume=self.config.voice_volume,
             pitch=self.config.voice_pitch,
             proxy=self.config.proxy
         )
+
         asyncio.run(
            communicate.save(output_file)
        )
@@ -366,7 +473,7 @@ def estimate_cost(self, total_chars):
         return math.ceil(total_chars / 1000) * self.price
 
     def get_break_string(self):
-        return " "
+        return " @BRK#"
 
     def get_output_file_extension(self):
         if self.config.output_format.startswith("amr"):
diff --git a/requirements.txt b/requirements.txt
index 096ba69..77e7b43 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,5 @@ mutagen==1.47.0
 openai==1.2.2
 requests==2.31.0
 socksio==1.0.0
-edge-tts==6.1.9
\ No newline at end of file
+edge-tts==6.1.9
+pydub==0.25.1
\ No newline at end of file

From 4b96ca53955cd818962249895fb18023b0ca68ab Mon Sep 17 00:00:00 2001
From: timz06 <91061595+phuchoang2603@users.noreply.github.com>
Date: Thu, 15 Feb 2024 14:57:37 +0000
Subject: [PATCH 2/7] perf: change the get_supported_voices method for edge-tts

---
 .../tts_providers/edge_tts_provider.py | 320 +-----------------
 1 file changed, 18 insertions(+), 302 deletions(-)

diff --git a/audiobook_generator/tts_providers/edge_tts_provider.py b/audiobook_generator/tts_providers/edge_tts_provider.py
index 1aba929..2686da4 100644
--- a/audiobook_generator/tts_providers/edge_tts_provider.py
+++ b/audiobook_generator/tts_providers/edge_tts_provider.py
@@ -2,6 +2,7 @@
 import logging
 import math
 import io
+import requests
 
 from edge_tts.communicate import Communicate
 from typing import Union, Optional
@@ -18,308 +19,23 @@
 
 def get_supported_voices():
-    # https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4
-    return {
-        'zh-CN-XiaoxiaoNeural': 'zh-CN',
-        'zh-CN-XiaoyiNeural': 'zh-CN',
-        'zh-CN-YunjianNeural': 'zh-CN',
-        'zh-CN-YunxiNeural': 'zh-CN',
-        'zh-CN-YunxiaNeural': 'zh-CN',
-        'zh-CN-YunyangNeural': 'zh-CN',
-        'zh-HK-HiuGaaiNeural': 'zh-HK',
-        'zh-HK-HiuMaanNeural': 'zh-HK',
-        'zh-HK-WanLungNeural': 'zh-HK',
-        'zh-TW-HsiaoChenNeural': 'zh-TW',
-        'zh-TW-YunJheNeural': 'zh-TW',
-        'zh-TW-HsiaoYuNeural': 'zh-TW',
-        'af-ZA-AdriNeural': 'af-ZA',
-        'af-ZA-WillemNeural': 'af-ZA',
-        'am-ET-AmehaNeural': 'am-ET',
-        'am-ET-MekdesNeural': 'am-ET',
-        'ar-AE-FatimaNeural': 'ar-AE',
-        'ar-AE-HamdanNeural': 'ar-AE',
-        'ar-BH-AliNeural': 'ar-BH',
-        'ar-BH-LailaNeural': 'ar-BH',
-        'ar-DZ-AminaNeural': 'ar-DZ',
-        'ar-DZ-IsmaelNeural': 'ar-DZ',
-        'ar-EG-SalmaNeural': 'ar-EG',
-        'ar-EG-ShakirNeural': 'ar-EG',
-        'ar-IQ-BasselNeural': 'ar-IQ',
-        'ar-IQ-RanaNeural': 'ar-IQ',
-        'ar-JO-SanaNeural': 'ar-JO',
-        'ar-JO-TaimNeural': 'ar-JO',
-        'ar-KW-FahedNeural': 'ar-KW',
-        'ar-KW-NouraNeural': 'ar-KW',
-        'ar-LB-LaylaNeural': 'ar-LB',
-        'ar-LB-RamiNeural': 'ar-LB',
-        'ar-LY-ImanNeural': 'ar-LY',
-        'ar-LY-OmarNeural': 'ar-LY',
-        'ar-MA-JamalNeural': 'ar-MA',
-        'ar-MA-MounaNeural': 'ar-MA',
-        'ar-OM-AbdullahNeural': 'ar-OM',
-        'ar-OM-AyshaNeural': 'ar-OM',
-        'ar-QA-AmalNeural': 'ar-QA',
-        'ar-QA-MoazNeural': 'ar-QA',
-        'ar-SA-HamedNeural': 'ar-SA',
-        'ar-SA-ZariyahNeural': 'ar-SA',
-        'ar-SY-AmanyNeural': 'ar-SY',
-        'ar-SY-LaithNeural': 'ar-SY',
-        'ar-TN-HediNeural': 'ar-TN',
-        'ar-TN-ReemNeural': 'ar-TN',
-        'ar-YE-MaryamNeural': 'ar-YE',
-        'ar-YE-SalehNeural': 'ar-YE',
-        'az-AZ-BabekNeural': 'az-AZ',
-        'az-AZ-BanuNeural': 'az-AZ',
-        'bg-BG-BorislavNeural': 'bg-BG',
-        'bg-BG-KalinaNeural': 'bg-BG',
-        'bn-BD-NabanitaNeural': 'bn-BD',
-        'bn-BD-PradeepNeural': 'bn-BD',
-        'bn-IN-BashkarNeural': 'bn-IN',
-        'bn-IN-TanishaaNeural': 'bn-IN',
-        'bs-BA-GoranNeural': 'bs-BA',
-        'bs-BA-VesnaNeural': 'bs-BA',
-        'ca-ES-EnricNeural': 'ca-ES',
-        'ca-ES-JoanaNeural': 'ca-ES',
-        'cs-CZ-AntoninNeural': 'cs-CZ',
-        'cs-CZ-VlastaNeural': 'cs-CZ',
-        'cy-GB-AledNeural': 'cy-GB',
-        'cy-GB-NiaNeural': 'cy-GB',
-        'da-DK-ChristelNeural': 'da-DK',
-        'da-DK-JeppeNeural': 'da-DK',
-        'de-AT-IngridNeural': 'de-AT',
-        'de-AT-JonasNeural': 'de-AT',
-        'de-CH-JanNeural': 'de-CH',
-        'de-CH-LeniNeural': 'de-CH',
-        'de-DE-AmalaNeural': 'de-DE',
-        'de-DE-ConradNeural': 'de-DE',
-        'de-DE-KatjaNeural': 'de-DE',
-        'de-DE-KillianNeural': 'de-DE',
-        'el-GR-AthinaNeural': 'el-GR',
-        'el-GR-NestorasNeural': 'el-GR',
-        'en-AU-NatashaNeural': 'en-AU',
-        'en-AU-WilliamNeural': 'en-AU',
-        'en-CA-ClaraNeural': 'en-CA',
-        'en-CA-LiamNeural': 'en-CA',
-        'en-GB-LibbyNeural': 'en-GB',
-        'en-GB-MaisieNeural': 'en-GB',
-        'en-GB-RyanNeural': 'en-GB',
-        'en-GB-SoniaNeural': 'en-GB',
-        'en-GB-ThomasNeural': 'en-GB',
-        'en-HK-SamNeural': 'en-HK',
-        'en-HK-YanNeural': 'en-HK',
-        'en-IE-ConnorNeural': 'en-IE',
-        'en-IE-EmilyNeural': 'en-IE',
-        'en-IN-NeerjaNeural': 'en-IN',
-        'en-IN-PrabhatNeural': 'en-IN',
-        'en-KE-AsiliaNeural': 'en-KE',
-        'en-KE-ChilembaNeural': 'en-KE',
-        'en-NG-AbeoNeural': 'en-NG',
-        'en-NG-EzinneNeural': 'en-NG',
-        'en-NZ-MitchellNeural': 'en-NZ',
-        'en-NZ-MollyNeural': 'en-NZ',
-        'en-PH-JamesNeural': 'en-PH',
-        'en-PH-RosaNeural': 'en-PH',
-        'en-SG-LunaNeural': 'en-SG',
-        'en-SG-WayneNeural': 'en-SG',
-        'en-TZ-ElimuNeural': 'en-TZ',
-        'en-TZ-ImaniNeural': 'en-TZ',
-        'en-US-AnaNeural': 'en-US',
-        'en-US-AriaNeural': 'en-US',
-        'en-US-ChristopherNeural': 'en-US',
-        'en-US-EricNeural': 'en-US',
-        'en-US-GuyNeural': 'en-US',
-        'en-US-JennyNeural': 'en-US',
-        'en-US-MichelleNeural': 'en-US',
-        'en-ZA-LeahNeural': 'en-ZA',
-        'en-ZA-LukeNeural': 'en-ZA',
-        'es-AR-ElenaNeural': 'es-AR',
-        'es-AR-TomasNeural': 'es-AR',
-        'es-BO-MarceloNeural': 'es-BO',
-        'es-BO-SofiaNeural': 'es-BO',
-        'es-CL-CatalinaNeural': 'es-CL',
-        'es-CL-LorenzoNeural': 'es-CL',
-        'es-CO-GonzaloNeural': 'es-CO',
-        'es-CO-SalomeNeural': 'es-CO',
-        'es-CR-JuanNeural': 'es-CR',
-        'es-CR-MariaNeural': 'es-CR',
-        'es-CU-BelkysNeural': 'es-CU',
-        'es-CU-ManuelNeural': 'es-CU',
-        'es-DO-EmilioNeural': 'es-DO',
-        'es-DO-RamonaNeural': 'es-DO',
-        'es-EC-AndreaNeural': 'es-EC',
-        'es-EC-LuisNeural': 'es-EC',
-        'es-ES-AlvaroNeural': 'es-ES',
-        'es-ES-ElviraNeural': 'es-ES',
-        'es-ES-ManuelEsCUNeural': 'es-ES',
-        'es-GQ-JavierNeural': 'es-GQ',
-        'es-GQ-TeresaNeural': 'es-GQ',
-        'es-GT-AndresNeural': 'es-GT',
-        'es-GT-MartaNeural': 'es-GT',
-        'es-HN-CarlosNeural': 'es-HN',
-        'es-HN-KarlaNeural': 'es-HN',
-        'es-MX-DaliaNeural': 'es-MX',
-        'es-MX-JorgeNeural': 'es-MX',
-        'es-MX-LorenzoEsCLNeural': 'es-MX',
-        'es-NI-FedericoNeural': 'es-NI',
-        'es-NI-YolandaNeural': 'es-NI',
-        'es-PA-MargaritaNeural': 'es-PA',
-        'es-PA-RobertoNeural': 'es-PA',
-        'es-PE-AlexNeural': 'es-PE',
-        'es-PE-CamilaNeural': 'es-PE',
-        'es-PR-KarinaNeural': 'es-PR',
-        'es-PR-VictorNeural': 'es-PR',
-        'es-PY-MarioNeural': 'es-PY',
-        'es-PY-TaniaNeural': 'es-PY',
-        'es-SV-LorenaNeural': 'es-SV',
-        'es-SV-RodrigoNeural': 'es-SV',
-        'es-US-AlonsoNeural': 'es-US',
-        'es-US-PalomaNeural': 'es-US',
-        'es-UY-MateoNeural': 'es-UY',
-        'es-UY-ValentinaNeural': 'es-UY',
-        'es-VE-PaolaNeural': 'es-VE',
-        'es-VE-SebastianNeural': 'es-VE',
-        'et-EE-AnuNeural': 'et-EE',
-        'et-EE-KertNeural': 'et-EE',
-        'fa-IR-DilaraNeural': 'fa-IR',
-        'fa-IR-FaridNeural': 'fa-IR',
-        'fi-FI-HarriNeural': 'fi-FI',
-        'fi-FI-NooraNeural': 'fi-FI',
-        'fil-PH-AngeloNeural': 'fil-PH',
-        'fil-PH-BlessicaNeural': 'fil-PH',
-        'fr-BE-CharlineNeural': 'fr-BE',
-        'fr-BE-GerardNeural': 'fr-BE',
-        'fr-CA-AntoineNeural': 'fr-CA',
-        'fr-CA-JeanNeural': 'fr-CA',
-        'fr-CA-SylvieNeural': 'fr-CA',
-        'fr-CH-ArianeNeural': 'fr-CH',
-        'fr-CH-FabriceNeural': 'fr-CH',
-        'fr-FR-DeniseNeural': 'fr-FR',
-        'fr-FR-EloiseNeural': 'fr-FR',
-        'fr-FR-HenriNeural': 'fr-FR',
-        'ga-IE-ColmNeural': 'ga-IE',
-        'ga-IE-OrlaNeural': 'ga-IE',
-        'gl-ES-RoiNeural': 'gl-ES',
-        'gl-ES-SabelaNeural': 'gl-ES',
-        'gu-IN-DhwaniNeural': 'gu-IN',
-        'gu-IN-NiranjanNeural': 'gu-IN',
-        'he-IL-AvriNeural': 'he-IL',
-        'he-IL-HilaNeural': 'he-IL',
-        'hi-IN-MadhurNeural': 'hi-IN',
-        'hi-IN-SwaraNeural': 'hi-IN',
-        'hr-HR-GabrijelaNeural': 'hr-HR',
-        'hr-HR-SreckoNeural': 'hr-HR',
-        'hu-HU-NoemiNeural': 'hu-HU',
-        'hu-HU-TamasNeural': 'hu-HU',
-        'id-ID-ArdiNeural': 'id-ID',
-        'id-ID-GadisNeural': 'id-ID',
-        'is-IS-GudrunNeural': 'is-IS',
-        'is-IS-GunnarNeural': 'is-IS',
-        'it-IT-DiegoNeural': 'it-IT',
-        'it-IT-ElsaNeural': 'it-IT',
-        'it-IT-IsabellaNeural': 'it-IT',
-        'ja-JP-KeitaNeural': 'ja-JP',
-        'ja-JP-NanamiNeural': 'ja-JP',
-        'jv-ID-DimasNeural': 'jv-ID',
-        'jv-ID-SitiNeural': 'jv-ID',
-        'ka-GE-EkaNeural': 'ka-GE',
-        'ka-GE-GiorgiNeural': 'ka-GE',
-        'kk-KZ-AigulNeural': 'kk-KZ',
-        'kk-KZ-DauletNeural': 'kk-KZ',
-        'km-KH-PisethNeural': 'km-KH',
-        'km-KH-SreymomNeural': 'km-KH',
-        'kn-IN-GaganNeural': 'kn-IN',
-        'kn-IN-SapnaNeural': 'kn-IN',
-        'ko-KR-InJoonNeural': 'ko-KR',
-        'ko-KR-SunHiNeural': 'ko-KR',
-        'lo-LA-ChanthavongNeural': 'lo-LA',
-        'lo-LA-KeomanyNeural': 'lo-LA',
-        'lt-LT-LeonasNeural': 'lt-LT',
-        'lt-LT-OnaNeural': 'lt-LT',
-        'lv-LV-EveritaNeural': 'lv-LV',
-        'lv-LV-NilsNeural': 'lv-LV',
-        'mk-MK-AleksandarNeural': 'mk-MK',
-        'mk-MK-MarijaNeural': 'mk-MK',
-        'ml-IN-MidhunNeural': 'ml-IN',
-        'ml-IN-SobhanaNeural': 'ml-IN',
-        'mn-MN-BataaNeural': 'mn-MN',
-        'mn-MN-YesuiNeural': 'mn-MN',
-        'mr-IN-AarohiNeural': 'mr-IN',
-        'mr-IN-ManoharNeural': 'mr-IN',
-        'ms-MY-OsmanNeural': 'ms-MY',
-        'ms-MY-YasminNeural': 'ms-MY',
-        'mt-MT-GraceNeural': 'mt-MT',
-        'mt-MT-JosephNeural': 'mt-MT',
-        'my-MM-NilarNeural': 'my-MM',
-        'my-MM-ThihaNeural': 'my-MM',
-        'nb-NO-FinnNeural': 'nb-NO',
-        'nb-NO-PernilleNeural': 'nb-NO',
-        'ne-NP-HemkalaNeural': 'ne-NP',
-        'ne-NP-SagarNeural': 'ne-NP',
-        'nl-BE-ArnaudNeural': 'nl-BE',
-        'nl-BE-DenaNeural': 'nl-BE',
-        'nl-NL-ColetteNeural': 'nl-NL',
-        'nl-NL-FennaNeural': 'nl-NL',
-        'nl-NL-MaartenNeural': 'nl-NL',
-        'pl-PL-MarekNeural': 'pl-PL',
-        'pl-PL-ZofiaNeural': 'pl-PL',
-        'ps-AF-GulNawazNeural': 'ps-AF',
-        'ps-AF-LatifaNeural': 'ps-AF',
-        'pt-BR-AntonioNeural': 'pt-BR',
-        'pt-BR-FranciscaNeural': 'pt-BR',
-        'pt-PT-DuarteNeural': 'pt-PT',
-        'pt-PT-RaquelNeural': 'pt-PT',
-        'ro-RO-AlinaNeural': 'ro-RO',
-        'ro-RO-EmilNeural': 'ro-RO',
-        'ru-RU-DmitryNeural': 'ru-RU',
-        'ru-RU-SvetlanaNeural': 'ru-RU',
-        'si-LK-SameeraNeural': 'si-LK',
-        'si-LK-ThiliniNeural': 'si-LK',
-        'sk-SK-LukasNeural': 'sk-SK',
-        'sk-SK-ViktoriaNeural': 'sk-SK',
-        'sl-SI-PetraNeural': 'sl-SI',
-        'sl-SI-RokNeural': 'sl-SI',
-        'so-SO-MuuseNeural': 'so-SO',
-        'so-SO-UbaxNeural': 'so-SO',
-        'sq-AL-AnilaNeural': 'sq-AL',
-        'sq-AL-IlirNeural': 'sq-AL',
-        'sr-RS-NicholasNeural': 'sr-RS',
-        'sr-RS-SophieNeural': 'sr-RS',
-        'su-ID-JajangNeural': 'su-ID',
-        'su-ID-TutiNeural': 'su-ID',
-        'sv-SE-MattiasNeural': 'sv-SE',
-        'sv-SE-SofieNeural': 'sv-SE',
-        'sw-KE-RafikiNeural': 'sw-KE',
-        'sw-KE-ZuriNeural': 'sw-KE',
-        'sw-TZ-DaudiNeural': 'sw-TZ',
-        'sw-TZ-RehemaNeural': 'sw-TZ',
-        'ta-IN-PallaviNeural': 'ta-IN',
-        'ta-IN-ValluvarNeural': 'ta-IN',
-        'ta-LK-KumarNeural': 'ta-LK',
-        'ta-LK-SaranyaNeural': 'ta-LK',
-        'ta-MY-KaniNeural': 'ta-MY',
-        'ta-MY-SuryaNeural': 'ta-MY',
-        'ta-SG-AnbuNeural': 'ta-SG',
-        'ta-SG-VenbaNeural': 'ta-SG',
-        'te-IN-MohanNeural': 'te-IN',
-        'te-IN-ShrutiNeural': 'te-IN',
-        'th-TH-NiwatNeural': 'th-TH',
-        'th-TH-PremwadeeNeural': 'th-TH',
-        'tr-TR-AhmetNeural': 'tr-TR',
-        'tr-TR-EmelNeural': 'tr-TR',
-        'uk-UA-OstapNeural': 'uk-UA',
-        'uk-UA-PolinaNeural': 'uk-UA',
-        'ur-IN-GulNeural': 'ur-IN',
-        'ur-IN-SalmanNeural': 'ur-IN',
-        'ur-PK-AsadNeural': 'ur-PK',
-        'ur-PK-UzmaNeural': 'ur-PK',
-        'uz-UZ-MadinaNeural': 'uz-UZ',
-        'uz-UZ-SardorNeural': 'uz-UZ',
-        'vi-VN-HoaiMyNeural': 'vi-VN',
-        'vi-VN-NamMinhNeural': 'vi-VN',
-        'zu-ZA-ThandoNeural': 'zu-ZA',
-        'zu-ZA-ThembaNeural': 'zu-ZA',
-    }
-
+    # List all available voices and their attributes.
+    # This pulls data from the URL used by Microsoft Edge to return a list of
+    # all available voices.
+    # Returns:
+    #     dict: A dictionary of voice attributes.
+    trusted_client_token = "6A5AA1D4EAFF4E9FB37E23D68491D6F4"
+
+    voice_list_url = "https://speech.platform.bing.com/consumer/speech/synthesize/" \
+        + "readaloud/voices/list?trustedclienttoken=" \
+        + trusted_client_token
+
+    voice_list = requests.get(voice_list_url).json()
+    result = {}
+    for item in voice_list:
+        result[item['ShortName']] = item['Locale']
+    return result
+
 class NoPausesFound(Exception):
     def __init__(self, description = None) -> None:
         self.description = (f'No pauses were found in the text. Please '
             + f'consider using `edge_tts.Communicate` instead.')

From a53153c59d6f9ea1b58588b59c09bef43077e141 Mon Sep 17 00:00:00 2001
From: timz06 <91061595+phuchoang2603@users.noreply.github.com>
Date: Sat, 17 Feb 2024 10:27:04 +0000
Subject: [PATCH 3/7] feat: enhance edge-tts get voices list method

---
 .../tts_providers/edge_tts_provider.py | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/audiobook_generator/tts_providers/edge_tts_provider.py b/audiobook_generator/tts_providers/edge_tts_provider.py
index 2686da4..39b665e 100644
--- a/audiobook_generator/tts_providers/edge_tts_provider.py
+++ b/audiobook_generator/tts_providers/edge_tts_provider.py
@@ -2,7 +2,6 @@
 import logging
 import math
 import io
-import requests
 
 from edge_tts.communicate import Communicate
 from typing import Union, Optional
@@ -18,22 +17,20 @@
 MAX_RETRIES = 12  # Max_retries constant for network errors
 
-def get_supported_voices():
+async def get_supported_voices():
     # List all available voices and their attributes.
     # This pulls data from the URL used by Microsoft Edge to return a list of
     # all available voices.
     # Returns:
     #     dict: A dictionary of voice attributes.
-    trusted_client_token = "6A5AA1D4EAFF4E9FB37E23D68491D6F4"
-
-    voice_list_url = "https://speech.platform.bing.com/consumer/speech/synthesize/" \
-        + "readaloud/voices/list?trustedclienttoken=" \
-        + trusted_client_token
-
-    voice_list = requests.get(voice_list_url).json()
+    voices = await list_voices()
+    voices = sorted(voices, key=lambda voice: voice["ShortName"])
+
     result = {}
-    for item in voice_list:
-        result[item['ShortName']] = item['Locale']
+
+    for voice in voices:
+        result[voice["ShortName"]] = voice["Locale"]
+
     return result
 
 class NoPausesFound(Exception):
@@ -154,8 +151,8 @@ def __init__(self, config: GeneralConfig):
     def __str__(self) -> str:
         return f"{self.config}"
 
-    def validate_config(self):
-        if self.config.voice_name not in get_supported_voices():
+    async def validate_config(self):
+        if self.config.voice_name not in await get_supported_voices():
             raise ValueError(f"EdgeTTS: Unsupported voice name: {self.config.voice_name}")
 
     def text_to_speech(

From 4983f89bc9acd70ab777d473ffac897a57e0ee94 Mon Sep 17 00:00:00 2001
From: timz06 <91061595+phuchoang2603@users.noreply.github.com>
Date: Sat, 17 Feb 2024 10:39:25 +0000
Subject: [PATCH 4/7] refactor: simplify generate audio and pause logic and fix minor bugs

---
 .../tts_providers/edge_tts_provider.py | 33 +++++--------------
 1 file changed, 9 insertions(+), 24 deletions(-)

diff --git a/audiobook_generator/tts_providers/edge_tts_provider.py b/audiobook_generator/tts_providers/edge_tts_provider.py
index 39b665e..3f4379e 100644
--- a/audiobook_generator/tts_providers/edge_tts_provider.py
+++ b/audiobook_generator/tts_providers/edge_tts_provider.py
@@ -62,39 +62,20 @@ def parse_text(self):
         for part in parts:
             if "]" in part:
                 pause_time, content = part.split("]", 1)
-                pause_time = self.parse_time(pause_time)
-
-                yield pause_time, content.strip()
+                yield int(pause_time), content.strip()
 
             else:
                 content = part
                 yield 0, content.strip()
 
-    def parse_time(self, time_str: str) -> int:
-        if time_str[-2:] == 'ms':
-            unit = 'ms'
-            time_value = int(time_str[:-2])
-            return time_value
-        else:
-            raise ValueError(f"Invalid time unit! only ms are allowed")
-
     async def chunkify(self):
         for pause_time, content in self.parsed:
-            if not pause_time and not content:
-                pass
-
-            elif not pause_time and content:
-                audio_bytes = await self.generate_audio(content)
-                self.file.write(audio_bytes)
-
-            elif not content and pause_time:
+            if pause_time:
                 pause_bytes = self.generate_pause(pause_time)
                 self.file.write(pause_bytes)
 
-            else:
-                pause_bytes = self.generate_pause(pause_time)
+            if content:
                 audio_bytes = await self.generate_audio(content)
-                self.file.write(pause_bytes)
                 self.file.write(audio_bytes)
 
     def generate_pause(self, time: int) -> bytes:
         # pause time should be provided in ms
@@ -111,7 +92,11 @@ async def generate_audio(self, text: str) -> bytes:
                 temp_chunk.write(chunk['data'])
 
         temp_chunk.seek(0)
-        decoded_chunk = AudioSegment.from_mp3(temp_chunk)
+        # handle the case where the chunk is empty
+        try:
+            decoded_chunk = AudioSegment.mp3(temp_chunk)
+        except Exception as e:
+            decoded_chunk = AudioSegment.silent(0, 24000)
         return decoded_chunk.raw_data
 
     async def save(
@@ -164,7 +149,7 @@ def text_to_speech(
         # Replace break string with pause tag
         text = text.replace(
             self.get_break_string().strip(),
-            f"[pause: {self.config.break_duration}ms]"
+            f"[pause: {self.config.break_duration}]"
         )
 
         communicate = CommWithPauses(

From 02ed32d05118039d34481adc21731ca33b364fff Mon Sep 17 00:00:00 2001
From: timz06 <91061595+phuchoang2603@users.noreply.github.com>
Date: Sat, 17 Feb 2024 10:39:25 +0000
Subject: [PATCH 5/7] refactor: simplify generate audio and pause logic and fix minor bugs

---
 audiobook_generator/tts_providers/edge_tts_provider.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/audiobook_generator/tts_providers/edge_tts_provider.py b/audiobook_generator/tts_providers/edge_tts_provider.py
index 3f4379e..0b10d4d 100644
--- a/audiobook_generator/tts_providers/edge_tts_provider.py
+++ b/audiobook_generator/tts_providers/edge_tts_provider.py
@@ -3,7 +3,7 @@
 import math
 import io
 
-from edge_tts.communicate import Communicate
+from edge_tts import Communicate, list_voices
 from typing import Union, Optional
 from pydub import AudioSegment

From cc88eb92f7964633b0ec6ed438ada6b6ef76cefe Mon Sep 17 00:00:00 2001
From: timz06 <91061595+phuchoang2603@users.noreply.github.com>
Date: Sat, 17 Feb 2024 10:39:25 +0000
Subject: [PATCH 6/7] refactor: simplify generate audio and pause logic and fix minor bugs

---
 audiobook_generator/tts_providers/edge_tts_provider.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/audiobook_generator/tts_providers/edge_tts_provider.py b/audiobook_generator/tts_providers/edge_tts_provider.py
index 0b10d4d..61a95c7 100644
--- a/audiobook_generator/tts_providers/edge_tts_provider.py
+++ b/audiobook_generator/tts_providers/edge_tts_provider.py
@@ -94,8 +94,8 @@ async def generate_audio(self, text: str) -> bytes:
         temp_chunk.seek(0)
         # handle the case where the chunk is empty
         try:
-            decoded_chunk = AudioSegment.mp3(temp_chunk)
-        except Exception as e:
+            decoded_chunk = AudioSegment.from_mp3(temp_chunk)
+        except:
             decoded_chunk = AudioSegment.silent(0, 24000)
         return decoded_chunk.raw_data

From 393794745206dd937d1c31e65b25086785d6caf4 Mon Sep 17 00:00:00 2001
From: timz06 <91061595+phuchoang2603@users.noreply.github.com>
Date: Wed, 21 Feb 2024 12:39:00 +0000
Subject: [PATCH 7/7] fix: remove no pause marker error raise

---
 .../tts_providers/edge_tts_provider.py | 11 ++---------
 requirements.txt                       |  2 +-
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/audiobook_generator/tts_providers/edge_tts_provider.py b/audiobook_generator/tts_providers/edge_tts_provider.py
index 61a95c7..6bb71a3 100644
--- a/audiobook_generator/tts_providers/edge_tts_provider.py
+++ b/audiobook_generator/tts_providers/edge_tts_provider.py
@@ -32,13 +32,6 @@ async def get_supported_voices():
         result[voice["ShortName"]] = voice["Locale"]
 
     return result
-
-class NoPausesFound(Exception):
-    def __init__(self, description = None) -> None:
-        self.description = (f'No pauses were found in the text. Please '
-            + f'consider using `edge_tts.Communicate` instead.')
-
-        super().__init__(self.description)
 
 class CommWithPauses(Communicate):
     # This class uses edge_tts to generate text
@@ -56,8 +49,8 @@ def __init__(
 
     def parse_text(self):
         if not "[pause:" in self.text:
-            raise NoPausesFound
-
+            return [(0, self.text)]
+
         parts = self.text.split("[pause:")
         for part in parts:
             if "]" in part:
diff --git a/requirements.txt b/requirements.txt
index 77e7b43..2b2781e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,5 +4,5 @@ mutagen==1.47.0
 openai==1.2.2
 requests==2.31.0
 socksio==1.0.0
-edge-tts==6.1.9
+edge-tts==6.1.10
 pydub==0.25.1
\ No newline at end of file
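
The series above leaves edge_tts_provider.py exposing an async get_supported_voices() helper and a CommWithPauses class that stitches synthesized speech and pydub-generated silence into one audio stream. Below is a minimal sketch of driving those two pieces directly; it is illustrative only, and assumes the repository root is on PYTHONPATH, that edge-tts 6.1.10 and pydub 0.25.1 are installed with ffmpeg available for MP3 handling, and that the voice name, sample text, and output file name are placeholders rather than anything mandated by the patches.

    import asyncio

    from audiobook_generator.tts_providers.edge_tts_provider import (
        CommWithPauses,
        get_supported_voices,
    )

    async def main() -> None:
        # After PATCH 3, get_supported_voices() is async and maps ShortName -> Locale.
        voices = await get_supported_voices()
        print(len(voices), "voices available")

        # Pause tags use the bare-millisecond form that text_to_speech() emits
        # after PATCH 4; the provider itself rewrites its " @BRK#" break marker
        # into these tags before synthesis.
        text = "First paragraph. [pause: 1250] Second paragraph, after a 1.25 s pause."

        comm = CommWithPauses(text=text, voice_name="en-US-GuyNeural")
        await comm.save("sample.mp3")

    if __name__ == "__main__":
        asyncio.run(main())

Because generate_pause() produces raw 24 kHz mono silence and generate_audio() decodes each synthesized MP3 chunk back to matching raw PCM, save() exports one continuous re-encoded segment, so the pauses are baked into the output file rather than depending on player-side gap handling.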