Skip to content

Commit

Permalink
Merge branch 'coqui-ai:dev' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
Mrw33554432 authored Nov 20, 2023
2 parents c910a0e + 29dede2 commit b94db72
Show file tree
Hide file tree
Showing 10 changed files with 63 additions and 55 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pypi-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
build-sdist:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Verify tag matches version
run: |
set -ex
Expand Down Expand Up @@ -38,7 +38,7 @@ jobs:
matrix:
python-version: ["3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
Expand Down
4 changes: 2 additions & 2 deletions TTS/.models.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
"multilingual": {
"multi-dataset": {
"xtts_v2": {
"description": "XTTS-v2 by Coqui with 16 languages.",
"description": "XTTS-v2.0.2 by Coqui with 16 languages.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
],
"model_hash": "6a09d1ad43896f06041ed8195956c9698f13b6189dc80f1c74bdc2b8e8d15324",
"model_hash": "5ce0502bfe3bc88dc8d9312b12a7558c",
"default_vocoder": null,
"commit": "480a6cdf7",
"license": "CPML",
Expand Down
12 changes: 10 additions & 2 deletions TTS/tts/layers/tortoise/diffusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,18 @@
import numpy as np
import torch
import torch as th
from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral
from tqdm import tqdm

from TTS.tts.layers.tortoise.dpm_solver import DPM_Solver, NoiseScheduleVP, model_wrapper

K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
try:
from k_diffusion.sampling import sample_dpmpp_2m, sample_euler_ancestral

K_DIFFUSION_SAMPLERS = {"k_euler_a": sample_euler_ancestral, "dpm++2m": sample_dpmpp_2m}
except ImportError:
K_DIFFUSION_SAMPLERS = None


SAMPLERS = ["dpm++2m", "p", "ddim"]


Expand Down Expand Up @@ -531,6 +537,8 @@ def sample_loop(self, *args, **kwargs):
if self.conditioning_free is not True:
raise RuntimeError("cond_free must be true")
with tqdm(total=self.num_timesteps) as pbar:
if K_DIFFUSION_SAMPLERS is None:
raise ModuleNotFoundError("Install k_diffusion for using k_diffusion samplers")
return self.k_diffusion_sample_loop(K_DIFFUSION_SAMPLERS[s], pbar, *args, **kwargs)
else:
raise RuntimeError("sampler not impl")
Expand Down
4 changes: 3 additions & 1 deletion TTS/tts/layers/xtts/gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,9 @@ def forward(
audio_codes = F.pad(audio_codes[:, :max_mel_len], (0, 1), value=self.stop_audio_token)

# Pad mel codes with stop_audio_token
audio_codes = self.set_mel_padding(audio_codes, code_lengths - 3) # -3 to get the real code lengths, not counting the start and stop tokens that have not been added yet
audio_codes = self.set_mel_padding(
audio_codes, code_lengths - 3
) # -3 to get the real code lengths, not counting the start and stop tokens that have not been added yet

# Build input and target tensors
# Prepend start token to inputs and append stop token to targets
Expand Down
23 changes: 12 additions & 11 deletions TTS/tts/layers/xtts/tokenizer.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,22 @@
import os
import re
import torch
import pypinyin
import textwrap

from functools import cached_property

import pypinyin
import torch
from hangul_romanize import Transliter
from hangul_romanize.rule import academic
from num2words import num2words
from spacy.lang.ar import Arabic
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.ja import Japanese
from spacy.lang.zh import Chinese
from tokenizers import Tokenizer

from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words

from spacy.lang.en import English
from spacy.lang.zh import Chinese
from spacy.lang.ja import Japanese
from spacy.lang.ar import Arabic
from spacy.lang.es import Spanish


def get_spacy_lang(lang):
if lang == "zh":
Expand All @@ -32,6 +31,7 @@ def get_spacy_lang(lang):
# For most languages, English does the job
return English()


def split_sentence(text, lang, text_split_length=250):
"""Preprocess the input text"""
text_splits = []
Expand Down Expand Up @@ -67,6 +67,7 @@ def split_sentence(text, lang, text_split_length=250):

return text_splits


_whitespace_re = re.compile(r"\s+")

# List of (regular expression, replacement) pairs for abbreviations:
Expand Down Expand Up @@ -619,7 +620,7 @@ def katsu(self):
return cutlet.Cutlet()

def check_input_length(self, txt, lang):
lang = lang.split("-")[0] # remove the region
lang = lang.split("-")[0] # remove the region
limit = self.char_limits.get(lang, 250)
if len(txt) > limit:
print(
Expand All @@ -640,7 +641,7 @@ def preprocess_text(self, txt, lang):
return txt

def encode(self, txt, lang):
lang = lang.split("-")[0] # remove the region
lang = lang.split("-")[0] # remove the region
self.check_input_length(txt, lang)
txt = self.preprocess_text(txt, lang)
lang = "zh-cn" if lang == "zh" else lang
Expand Down
7 changes: 4 additions & 3 deletions TTS/tts/layers/xtts/trainer/gpt_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,9 +318,10 @@ def eval_step(self, batch, criterion):
batch["cond_idxs"] = None
return self.train_step(batch, criterion)

def on_epoch_start(self, trainer): # pylint: disable=W0613
# guarantee that dvae will be in eval mode after .train() on evaluation end
self.dvae = self.dvae.eval()
def on_train_epoch_start(self, trainer):
trainer.model.eval() # the whole model to eval
# put gpt model in training mode
trainer.model.xtts.gpt.train()

def on_init_end(self, trainer): # pylint: disable=W0613
# ignore similarities.pth on clearml save/upload
Expand Down
14 changes: 5 additions & 9 deletions TTS/tts/models/xtts.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,13 +513,13 @@ def inference(
enable_text_splitting=False,
**hf_generate_kwargs,
):
language = language.split("-")[0] # remove the country code
language = language.split("-")[0] # remove the country code
length_scale = 1.0 / max(speed, 0.05)
if enable_text_splitting:
text = split_sentence(text, language, self.tokenizer.char_limits[language])
else:
text = [text]

wavs = []
gpt_latents_list = []
for sent in text:
Expand Down Expand Up @@ -563,9 +563,7 @@ def inference(

if length_scale != 1.0:
gpt_latents = F.interpolate(
gpt_latents.transpose(1, 2),
scale_factor=length_scale,
mode="linear"
gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
).transpose(1, 2)

gpt_latents_list.append(gpt_latents.cpu())
Expand Down Expand Up @@ -623,7 +621,7 @@ def inference_stream(
enable_text_splitting=False,
**hf_generate_kwargs,
):
language = language.split("-")[0] # remove the country code
language = language.split("-")[0] # remove the country code
length_scale = 1.0 / max(speed, 0.05)
if enable_text_splitting:
text = split_sentence(text, language, self.tokenizer.char_limits[language])
Expand Down Expand Up @@ -675,9 +673,7 @@ def inference_stream(
gpt_latents = torch.cat(all_latents, dim=0)[None, :]
if length_scale != 1.0:
gpt_latents = F.interpolate(
gpt_latents.transpose(1, 2),
scale_factor=length_scale,
mode="linear"
gpt_latents.transpose(1, 2), scale_factor=length_scale, mode="linear"
).transpose(1, 2)
wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device))
wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks(
Expand Down
1 change: 1 addition & 0 deletions TTS/vocoder/configs/parallel_wavegan_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ class ParallelWaveganConfig(BaseGANVocoderConfig):
use_noise_augment: bool = False
use_cache: bool = True
steps_to_start_discriminator: int = 200000
target_loss: str = "loss_1"

# LOSS PARAMETERS - overrides
use_stft_loss: bool = True
Expand Down
45 changes: 22 additions & 23 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,33 +1,33 @@
# core deps
numpy==1.22.0;python_version<="3.10"
numpy==1.24.3;python_version>"3.10"
cython==0.29.30
numpy>=1.24.3;python_version>"3.10"
cython>=0.29.30
scipy>=1.11.2
torch>=2.1
torchaudio
soundfile==0.12.*
librosa==0.10.*
scikit-learn==1.3.0
soundfile>=0.12.0
librosa>=0.10.0
scikit-learn>=1.3.0
numba==0.55.1;python_version<"3.9"
numba==0.57.0;python_version>="3.9"
inflect==5.6.*
tqdm==4.64.*
anyascii==0.3.*
pyyaml==6.*
fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail
aiohttp==3.8.*
packaging==23.1
numba>=0.57.0;python_version>="3.9"
inflect>=5.6.0
tqdm>=4.64.1
anyascii>=0.3.0
pyyaml>=6.0
fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
aiohttp>=3.8.1
packaging>=23.1
# deps for examples
flask==2.*
flask>=2.0.1
# deps for inference
pysbd==0.3.4
pysbd>=0.3.4
# deps for notebooks
umap-learn==0.5.*
umap-learn>=0.5.1
pandas>=1.4,<2.0
# deps for training
matplotlib==3.7.*
matplotlib>=3.7.0
# coqui stack
trainer
trainer>=0.0.32
# config management
coqpit>=0.0.16
# chinese g2p deps
Expand All @@ -46,12 +46,11 @@ bangla
bnnumerizer
bnunicodenormalizer
#deps for tortoise
k_diffusion
einops==0.6.*
transformers==4.33.*
einops>=0.6.0
transformers>=4.33.0
#deps for bark
encodec==0.1.*
encodec>=0.1.1
# deps for XTTS
unidecode==1.3.*
unidecode>=1.3.2
num2words
spacy[ja]>=3
4 changes: 2 additions & 2 deletions tests/zoo_tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def test_xtts_v2_streaming():
"en",
gpt_cond_latent,
speaker_embedding,
speed=1.5
speed=1.5,
)
wav_chuncks = []
for i, chunk in enumerate(chunks):
Expand All @@ -198,7 +198,7 @@ def test_xtts_v2_streaming():
"en",
gpt_cond_latent,
speaker_embedding,
speed=0.66
speed=0.66,
)
wav_chuncks = []
for i, chunk in enumerate(chunks):
Expand Down

0 comments on commit b94db72

Please sign in to comment.