Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the ability to use language subfolders in xtts voicesets #481

Open
wants to merge 18 commits into
base: alltalkbeta
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Add the ability to use language subfolders in xtts voicesets
* allows a better selection of sample if the voiceset consists of samples in differents languages
* if the language subfolder is not detected, relies on randoms samples of the root and others languages subfolders
SilyNoMeta committed Jan 5, 2025
commit 9527c5d800c7ad106e1836d8bb12e0f7a538b7ea
60 changes: 51 additions & 9 deletions system/tts_engines/xtts/model_engine.py
Original file line number Diff line number Diff line change
@@ -735,6 +735,12 @@ def voices_file_list(self):
self.print_message("Scanning for individual voice files", message_type="debug_tts")
voices.extend([f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))
and f.endswith(".wav")])

# Valid language codes
language_codes = {
"en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs",
"ar", "zh-cn", "ja", "hu", "ko", "hi"
}

# Scan for voice sets
if os.path.exists(multi_voice_dir):
@@ -745,7 +751,20 @@ def voices_file_list(self):
if any(f.endswith(".wav") for f in os.listdir(voice_set_path)):
voices.append(f"voiceset:{voice_set}")
self.print_message(f"Added voice set: {voice_set}", message_type="debug_tts_variables")

continue # Skip checking subdirectories if .wav files are found at root

# Check for .wav files in valid language subdirectories
language_folders = [
sub for sub in os.listdir(voice_set_path)
if os.path.isdir(voice_set_path / sub) and sub in language_codes
]
if any(
any(f.endswith(".wav") for f in os.listdir(voice_set_path / lang))
for lang in language_folders
):
voices.append(f"voiceset:{voice_set}")
self.print_message(f"Added voice set: {voice_set}", message_type="debug_tts_variables")

# Scan for JSON latents
if not self.current_model_loaded.startswith("apitts"): # APITTS doesnt support latents
if os.path.exists(json_latents_dir):
@@ -973,32 +992,55 @@ async def handle_tts_method_change(self, tts_method):
self.print_message(f"\033[94mModel Loadtime: \033[93m{generate_elapsed_time:.2f}\033[94m seconds\033[0m")
return True

async def prepare_voice_inputs(self, voice):
"""Prepares latents and embeddings based on the voice input."""
async def prepare_voice_inputs(self, voice, language):
"""Prepares latents and embeddings based on the voice input and language parameter."""
gpt_cond_latent = None
speaker_embedding = None

# Handle latent voices
if voice.startswith('latent:'):
if self.current_model_loaded.startswith("xtts"):
gpt_cond_latent, speaker_embedding = self._load_latents(voice)

# Handle voiceset
elif voice.startswith('voiceset:'):
voice_set = voice.replace("voiceset:", "")
voice_set_path = os.path.join(self.main_dir, "voices", "xtts_multi_voice_sets", voice_set)
self.print_message(f"Processing voice set from: {voice_set_path}", message_type="debug_tts")

wavs_files = glob.glob(os.path.join(voice_set_path, "*.wav"))
wavs_files = []

# If a language is specified, try to find WAV files in the language subfolder
if language:
language_path = os.path.join(voice_set_path, language)
if os.path.exists(language_path) and os.path.isdir(language_path):
wavs_files = glob.glob(os.path.join(language_path, "*.wav"))
# If there are more than 5 files, pick 5 randomly
if len(wavs_files) > 5:
wavs_files = random.sample(wavs_files, 5)

# Fallback: Collect files from the root and all subfolders if the language folder is missing or empty
if not wavs_files:
all_wav_files = glob.glob(os.path.join(voice_set_path, "*.wav")) # Files at the root
for subdir in os.listdir(voice_set_path):
subdir_path = os.path.join(voice_set_path, subdir)
if os.path.isdir(subdir_path):
all_wav_files.extend(glob.glob(os.path.join(subdir_path, "*.wav")))

# Select up to 5 random files from the combined list
if all_wav_files:
wavs_files = random.sample(all_wav_files, min(len(all_wav_files), 5))

# Raise an error if no WAV files are found at all
if not wavs_files:
self.print_message(f"No WAV files found in voice set: {voice_set}", message_type="error")
raise HTTPException(status_code=400, detail=f"No WAV files found in voice set: {voice_set}")

if len(wavs_files) > 5:
wavs_files = random.sample(wavs_files, 5)
self.print_message(f"Using 5 random samples from voice set", message_type="debug_tts")

# Generate conditioning latents for the WAV files
if self.current_model_loaded.startswith("xtts"):
gpt_cond_latent, speaker_embedding = self._generate_conditioning_latents(wavs_files)

# Handle single voice files
else:
normalized_path = os.path.normpath(os.path.join(self.main_dir, "voices", voice))
wavs_files = [normalized_path]
@@ -1061,7 +1103,7 @@ async def generate_tts(self, text, voice, language, temperature, repetition_pena

try:
# Preparation of latents and embeddings
gpt_cond_latent, speaker_embedding = await self.prepare_voice_inputs(voice)
gpt_cond_latent, speaker_embedding = await self.prepare_voice_inputs(voice, language)

common_args = {
"text": text,
18 changes: 18 additions & 0 deletions system/tts_engines/xtts/xtts_settings_page.py
Original file line number Diff line number Diff line change
@@ -60,13 +60,31 @@ def xtts_voices_file_list():
[f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f)) and f.endswith(".wav")]
)

# Valid language codes
language_codes = {
"en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs",
"ar", "zh-cn", "ja", "hu", "ko", "hi"
}

# Scan for voice sets
if os.path.exists(multi_voice_dir):
for voice_set in os.listdir(multi_voice_dir):
voice_set_path = multi_voice_dir / voice_set
if os.path.isdir(voice_set_path):
if any(f.endswith(".wav") for f in os.listdir(voice_set_path)):
voices.append(f"voiceset:{voice_set}")
continue # Skip checking subdirectories if .wav files are found at root

# Check for .wav files in valid language subdirectories
language_folders = [
sub for sub in os.listdir(voice_set_path)
if os.path.isdir(voice_set_path / sub) and sub in language_codes
]
if any(
any(f.endswith(".wav") for f in os.listdir(voice_set_path / lang))
for lang in language_folders
):
voices.append(f"voiceset:{voice_set}")

# Scan for JSON latents
if os.path.exists(json_latents_dir):