From 8a078be6957bc169de12826f2fdd85a45085b8f0 Mon Sep 17 00:00:00 2001 From: Moctar Date: Sun, 7 Apr 2024 18:57:53 +0200 Subject: [PATCH 01/10] added speed as argument --- TTS/api.py | 6 +++--- TTS/tts/models/xtts.py | 29 ++++++++++++++++------------- TTS/utils/synthesizer.py | 4 +++- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index 7abc188e74..c58ab76fc9 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -168,9 +168,7 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False): self.synthesizer = None self.model_name = model_name - model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( - model_name - ) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(model_name) # init synthesizer # None values are fetch from the model @@ -283,6 +281,7 @@ def tts( style_text=None, reference_speaker_name=None, split_sentences=split_sentences, + speed=1.0, **kwargs, ) return wav @@ -337,6 +336,7 @@ def tts_to_file( language=language, speaker_wav=speaker_wav, split_sentences=split_sentences, + speed=1.0, **kwargs, ) self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 8e9d6bd382..7ca6ff85d0 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -274,7 +274,7 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = for i in range(0, audio.shape[1], 22050 * chunk_length): audio_chunk = audio[:, i : i + 22050 * chunk_length] - # if the chunk is too short ignore it + # if the chunk is too short ignore it if audio_chunk.size(-1) < 22050 * 0.33: continue @@ -379,7 +379,7 @@ def get_conditioning_latents( return gpt_cond_latents, speaker_embedding - def synthesize(self, text, config, speaker_wav, language, speaker_id=None, **kwargs): + def synthesize(self, text, config, speaker_wav, language, speaker_id=None, speed=1.0, **kwargs): """Synthesize speech with the given input text. 
Args: @@ -410,13 +410,15 @@ def synthesize(self, text, config, speaker_wav, language, speaker_id=None, **kwa if speaker_id is not None: gpt_cond_latent, speaker_embedding = self.speaker_manager.speakers[speaker_id].values() return self.inference(text, language, gpt_cond_latent, speaker_embedding, **settings) - settings.update({ - "gpt_cond_len": config.gpt_cond_len, - "gpt_cond_chunk_len": config.gpt_cond_chunk_len, - "max_ref_len": config.max_ref_len, - "sound_norm_refs": config.sound_norm_refs, - }) - return self.full_inference(text, speaker_wav, language, **settings) + settings.update( + { + "gpt_cond_len": config.gpt_cond_len, + "gpt_cond_chunk_len": config.gpt_cond_chunk_len, + "max_ref_len": config.max_ref_len, + "sound_norm_refs": config.sound_norm_refs, + } + ) + return self.full_inference(text, speaker_wav, language, speed, **settings) @torch.inference_mode() def full_inference( @@ -424,6 +426,7 @@ def full_inference( text, ref_audio_path, language, + speed, # GPT inference temperature=0.75, length_penalty=1.0, @@ -484,6 +487,7 @@ def full_inference( max_ref_length=max_ref_len, sound_norm_refs=sound_norm_refs, ) + self.speed = speed return self.inference( text, @@ -518,6 +522,7 @@ def inference( enable_text_splitting=False, **hf_generate_kwargs, ): + speed = self.speed language = language.split("-")[0] # remove the country code length_scale = 1.0 / max(speed, 0.05) gpt_cond_latent = gpt_cond_latent.to(self.device) @@ -756,13 +761,11 @@ def load_checkpoint( model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth") vocab_path = vocab_path or os.path.join(checkpoint_dir, "vocab.json") - - if speaker_file_path is None and checkpoint_dir is not None: - speaker_file_path = os.path.join(checkpoint_dir, "speakers_xtts.pth") + speaker_file_path = speaker_file_path or os.path.join(checkpoint_dir, "speakers_xtts.pth") self.language_manager = LanguageManager(config) self.speaker_manager = None - if speaker_file_path is not None and os.path.exists(speaker_file_path): + if os.path.exists(speaker_file_path): self.speaker_manager = SpeakerManager(speaker_file_path) if os.path.exists(vocab_path): diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index b98647c30c..824b69b5b1 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -265,6 +265,7 @@ def tts( reference_wav=None, reference_speaker_name=None, split_sentences: bool = True, + speed=1.0, **kwargs, ) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. 
@@ -335,7 +336,7 @@ def tts( # handle multi-lingual language_id = None if self.tts_languages_file or ( - hasattr(self.tts_model, "language_manager") + hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None and not self.tts_config.model == "xtts" ): @@ -391,6 +392,7 @@ def tts( d_vector=speaker_embedding, speaker_wav=speaker_wav, language=language_name, + speed=1.0, **kwargs, ) else: From 4b0b9068176af754a7e0d0071ba640e41540dfd4 Mon Sep 17 00:00:00 2001 From: Moctar Haiz Date: Sun, 7 Apr 2024 19:08:21 +0200 Subject: [PATCH 02/10] Update xtts.py --- TTS/tts/models/xtts.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 7ca6ff85d0..aec1a09815 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -410,15 +410,13 @@ def synthesize(self, text, config, speaker_wav, language, speaker_id=None, speed if speaker_id is not None: gpt_cond_latent, speaker_embedding = self.speaker_manager.speakers[speaker_id].values() return self.inference(text, language, gpt_cond_latent, speaker_embedding, **settings) - settings.update( - { - "gpt_cond_len": config.gpt_cond_len, - "gpt_cond_chunk_len": config.gpt_cond_chunk_len, - "max_ref_len": config.max_ref_len, - "sound_norm_refs": config.sound_norm_refs, - } - ) - return self.full_inference(text, speaker_wav, language, speed, **settings) + settings.update({ + "gpt_cond_len": config.gpt_cond_len, + "gpt_cond_chunk_len": config.gpt_cond_chunk_len, + "max_ref_len": config.max_ref_len, + "sound_norm_refs": config.sound_norm_refs, + }) + return self.full_inference(text, speaker_wav, language, **settings) @torch.inference_mode() def full_inference( @@ -761,12 +759,13 @@ def load_checkpoint( model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth") vocab_path = vocab_path or os.path.join(checkpoint_dir, "vocab.json") - speaker_file_path = speaker_file_path or os.path.join(checkpoint_dir, "speakers_xtts.pth") + + if speaker_file_path is None and checkpoint_dir is not None: + speaker_file_path = os.path.join(checkpoint_dir, "speakers_xtts.pth") self.language_manager = LanguageManager(config) self.speaker_manager = None - if os.path.exists(speaker_file_path): - self.speaker_manager = SpeakerManager(speaker_file_path) + if speaker_file_path is not None and os.path.exists(speaker_file_path): if os.path.exists(vocab_path): self.tokenizer = VoiceBpeTokenizer(vocab_file=vocab_path) From f0880d61f610c8ad3b4d3019a949912ffd61feab Mon Sep 17 00:00:00 2001 From: Moctar Haiz Date: Sun, 7 Apr 2024 19:11:05 +0200 Subject: [PATCH 03/10] Update synthesizer.py --- TTS/utils/synthesizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 824b69b5b1..960b20f40e 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -336,7 +336,7 @@ def tts( # handle multi-lingual language_id = None if self.tts_languages_file or ( - hasattr(self.tts_model, "language_manager") + hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None and not self.tts_config.model == "xtts" ): From ffe4b8efeb510b0da6b43d25f5b36d9250694fc0 Mon Sep 17 00:00:00 2001 From: Moctar Haiz Date: Sun, 7 Apr 2024 19:11:55 +0200 Subject: [PATCH 04/10] Update api.py --- TTS/api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/TTS/api.py b/TTS/api.py index c58ab76fc9..5881138933 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -168,7 
+168,9 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False): self.synthesizer = None self.model_name = model_name - model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(model_name) + model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name( + model_name + ) # init synthesizer # None values are fetch from the model From 24e8486715b3c88d92a0a93a8ff747836eb0325b Mon Sep 17 00:00:00 2001 From: Moctar Haiz Date: Mon, 8 Apr 2024 21:16:49 +0200 Subject: [PATCH 05/10] Update xtts.py --- TTS/tts/models/xtts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index aec1a09815..f95ca8da41 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -274,7 +274,7 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int = for i in range(0, audio.shape[1], 22050 * chunk_length): audio_chunk = audio[:, i : i + 22050 * chunk_length] - # if the chunk is too short ignore it + # if the chunk is too short ignore it if audio_chunk.size(-1) < 22050 * 0.33: continue From 729463cd62aaeafe161e7138550a405acc40e61b Mon Sep 17 00:00:00 2001 From: Moctar Haiz Date: Mon, 8 Apr 2024 21:17:36 +0200 Subject: [PATCH 06/10] Update xtts.py --- TTS/tts/models/xtts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index f95ca8da41..b33c45a2ea 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -759,7 +759,7 @@ def load_checkpoint( model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth") vocab_path = vocab_path or os.path.join(checkpoint_dir, "vocab.json") - + if speaker_file_path is None and checkpoint_dir is not None: speaker_file_path = os.path.join(checkpoint_dir, "speakers_xtts.pth") From 9e6788bf8061ab2b0e3ccc59eb7e38299af3c40b Mon Sep 17 00:00:00 2001 From: Moctar Haiz Date: Mon, 8 Apr 2024 21:18:39 +0200 Subject: [PATCH 07/10] Update xtts.py --- TTS/tts/models/xtts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index b33c45a2ea..db13d50149 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -766,6 +766,7 @@ def load_checkpoint( self.language_manager = LanguageManager(config) self.speaker_manager = None if speaker_file_path is not None and os.path.exists(speaker_file_path): + self.speaker_manager = SpeakerManager(speaker_file_path) if os.path.exists(vocab_path): self.tokenizer = VoiceBpeTokenizer(vocab_file=vocab_path) From 784f5f5e8ffb33a247a65169bf5b6c13fa1de77d Mon Sep 17 00:00:00 2001 From: Moctar Haiz Date: Mon, 8 Apr 2024 21:30:22 +0200 Subject: [PATCH 08/10] Update api.py --- TTS/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/api.py b/TTS/api.py index 5881138933..f3a21acbe7 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -338,7 +338,7 @@ def tts_to_file( language=language, speaker_wav=speaker_wav, split_sentences=split_sentences, - speed=1.0, + speed=speed, **kwargs, ) self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out) From 22017bbd6ae732e82de9e3f75afdeefe8d4e6dda Mon Sep 17 00:00:00 2001 From: Moctar Haiz Date: Mon, 8 Apr 2024 21:31:44 +0200 Subject: [PATCH 09/10] Update synthesizer.py --- TTS/utils/synthesizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 960b20f40e..acfbfb4c61 100644 --- a/TTS/utils/synthesizer.py +++ 
b/TTS/utils/synthesizer.py @@ -392,7 +392,7 @@ def tts( d_vector=speaker_embedding, speaker_wav=speaker_wav, language=language_name, - speed=1.0, + speed=speed, **kwargs, ) else: From 03e73dea98ba7a712e8b075178a13383e3ad24f8 Mon Sep 17 00:00:00 2001 From: Moctar Haiz Date: Mon, 8 Apr 2024 22:16:09 +0200 Subject: [PATCH 10/10] Update xtts.py --- TTS/tts/models/xtts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index db13d50149..c8118550f8 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -424,7 +424,6 @@ def full_inference( text, ref_audio_path, language, - speed, # GPT inference temperature=0.75, length_penalty=1.0, @@ -437,6 +436,7 @@ def full_inference( gpt_cond_chunk_len=6, max_ref_len=10, sound_norm_refs=False, + speed=1.0, **hf_generate_kwargs, ): """
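
Usage sketch for the speed argument introduced by this patch series. This is illustrative only: the model name and file paths are placeholders, and it assumes tts_to_file() ends up exposing a speed keyword — the later patches forward speed=speed at the call sites, but the corresponding signature change is not shown in the hunks above.

    from TTS.api import TTS

    # Load an XTTS v2 checkpoint; any XTTS model with voice cloning
    # support is assumed to take the same path through full_inference().
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)

    # speed > 1.0 speaks faster, speed < 1.0 slower; per the patched
    # Xtts.inference() it maps to length_scale = 1.0 / max(speed, 0.05).
    tts.tts_to_file(
        text="This sentence is rendered thirty percent faster than normal.",
        speaker_wav="reference_speaker.wav",  # hypothetical reference clip
        language="en",
        file_path="output_fast.wav",
        speed=1.3,
    )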