Merge pull request #102 from EveryVoiceTTS/dev.ej/ras-output
Add .readalong format output to synthesize
joanise authored Dec 10, 2024
2 parents 2fd8eb4 + 823bb67 commit 2afc610
Showing 4 changed files with 199 additions and 97 deletions.
7 changes: 4 additions & 3 deletions fs2/cli/synthesize.py
@@ -342,9 +342,10 @@ def synthesize( # noqa: C901
"--output-type",
help="""Which format(s) to synthesize to.
Multiple formats can be provided by repeating `--output-type`.
**wav** is the default and will synthesize to a playable audio file;
**spec** will generate predicted Mel spectrograms. Tensors are time-oriented (T, K) where T is equal to the number of frames and K is equal to the number of Mel bands.
**textgrid** will generate a Praat TextGrid with alignment labels. This can be helpful for evaluation.
'**wav**' is the default and will synthesize to a playable audio file;
'**spec**' will generate predicted Mel spectrograms. Tensors are time-oriented (T, K) where T is equal to the number of frames and K is equal to the number of Mel bands.
'**textgrid**' will generate a Praat TextGrid with alignment labels. This can be helpful for evaluation.
'**readalong**' will generate a ReadAlong from the given text and synthesized audio (see https://github.com/ReadAlongs).
""",
),
teacher_forcing_directory: Path = typer.Option(
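Note on usage: `--output-type` is repeatable, so the new format can be requested alongside audio. A hypothetical invocation sketch (all other arguments elided, since the rest of the CLI surface is not shown in this diff):

    synthesize ... --output-type wav --output-type readalong

Pairing readalong with wav is the intended combination: the ReadAlongs viewer needs both the alignment file and the audio.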
219 changes: 137 additions & 82 deletions fs2/prediction_writing_callback.py
@@ -9,6 +9,7 @@
from loguru import logger
from pympi import TextGrid
from pytorch_lightning.callbacks import Callback
from readalongs.api import Token, convert_to_readalong

from .config import FastSpeech2Config
from .type_definitions import SynthesizeOutputFormats
@@ -24,7 +25,7 @@ def get_synthesis_output_callbacks(
vocoder_model: Optional[HiFiGAN] = None,
vocoder_config: Optional[HiFiGANConfig] = None,
vocoder_global_step: Optional[int] = None,
):
) -> list[Callback]:
"""
Given a list of desired output file formats, return the proper callbacks
that will generate those files.
@@ -48,6 +49,15 @@
output_key=output_key,
)
)
if SynthesizeOutputFormats.readalong in output_type:
callbacks.append(
PredictionWritingReadAlongCallback(
config=config,
global_step=global_step,
output_dir=output_dir,
output_key=output_key,
)
)
if SynthesizeOutputFormats.wav in output_type:
if (
vocoder_model is None
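A hedged sketch of how the dispatcher above is called (the remainder of the wav branch and the leading parameters are collapsed in this view, so everything except the names visible in the body — output_type, config, global_step, output_dir, output_key — is an assumption):

    callbacks = get_synthesis_output_callbacks(
        output_type=[SynthesizeOutputFormats.textgrid, SynthesizeOutputFormats.readalong],
        config=config,                     # a FastSpeech2Config
        global_step=global_step,
        output_dir=Path("synthesis_out"),  # hypothetical directory
        output_key="postnet_output",       # hypothetical key into the model outputs
    )
    # one callback per requested format; wav additionally requires the
    # vocoder_model / vocoder_config / vocoder_global_step arguments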
@@ -76,33 +86,35 @@
class PredictionWritingCallbackBase(Callback):
def __init__(
self,
config: FastSpeech2Config,
file_extension: str,
global_step: int,
save_dir: Path,
) -> None:
super().__init__()
self.config = config
self.file_extension = file_extension
self.global_step = f"ckpt={global_step}"
self.save_dir = save_dir
self.sep = "--"

self.save_dir.mkdir(parents=True, exist_ok=True)

def _get_filename(self, basename: str, speaker: str, language: str) -> Path:
def get_filename(
self,
basename: str,
speaker: str,
language: str,
include_global_step: bool = False,
) -> Path:
# We don't truncate or alter the filename here because the basename is
# already truncated/cleaned in cli/synthesize.py
path = self.save_dir / self.sep.join(
[
basename,
speaker,
language,
self.global_step,
self.file_extension,
]
)
path.parent.mkdir(
parents=True, exist_ok=True
) # synthesizing spec allows nested outputs
name_parts = [basename, speaker, language, self.file_extension]
if include_global_step:
name_parts.insert(-1, self.global_step)
path = self.save_dir / self.sep.join(name_parts)
# synthesizing spec allows nested outputs so we may need to make subdirs
path.parent.mkdir(parents=True, exist_ok=True)
return path
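To make the naming scheme concrete, a couple of hypothetical results (basename "sent0", speaker "spk1", language "eng" are made up; the parts are joined with self.sep, i.e. "--"):

    # wav callback, which passes include_global_step=True, at checkpoint 1000:
    #   save_dir / "sent0--spk1--eng--ckpt=1000--pred.wav"
    # spec callback, which omits the step (the fine-tuning dataloader does not
    # expect one in the filename), assuming a 22050 Hz mel config:
    #   save_dir / "sent0--spk1--eng--spec-pred-22050-mel.pt"

When requested, the global step is inserted just before the file extension.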


@@ -119,33 +131,15 @@ def __init__(
output_key: str,
):
super().__init__(
config=config,
global_step=global_step,
file_extension=f"spec-pred-{config.preprocessing.audio.input_sampling_rate}-{config.preprocessing.audio.spec_type}.pt",
save_dir=output_dir / "synthesized_spec",
)

self.output_key = output_key
self.config = config
logger.info(f"Saving pytorch output to {self.save_dir}")

def _get_filename(self, basename: str, speaker: str, language: str) -> Path:
# We don't truncate or alter the filename here because the basename is
# already truncated/cleaned in cli/synthesize.py
# the spec should not have the global step printed because it is used to fine-tune
# and the dataloader does not expect a global step in the filename
path = self.save_dir / self.sep.join(
[
basename,
speaker,
language,
self.file_extension,
]
)
path.parent.mkdir(
parents=True, exist_ok=True
) # synthesizing spec allows nested outputs
return path

def on_predict_batch_end( # pyright: ignore [reportIncompatibleMethodOverride]
self,
_trainer,
@@ -166,53 +160,52 @@ def on_predict_batch_end( # pyright: ignore [reportIncompatibleMethodOverride]
):
torch.save(
data[:unmasked_len].cpu(),
self._get_filename(
basename=basename,
speaker=speaker,
language=language,
),
self.get_filename(basename, speaker, language),
)


class PredictionWritingTextGridCallback(PredictionWritingCallbackBase):
class PredictionWritingAlignedTextCallback(PredictionWritingCallbackBase):
"""
This callback runs inference on a provided text-to-spec model and saves the resulting textgrid of the predicted durations to disk. This can be used for evaluation.
This callback runs inference on a provided text-to-spec model and saves the
resulting time-aligned text to file. The output format depends on the subclass's
implementation of save_aligned_text_to_file.
"""

def __init__(
self,
config: FastSpeech2Config,
global_step: int,
output_dir: Path,
output_key: str,
file_extension: str,
save_dir: Path,
):
super().__init__(
config=config,
global_step=global_step,
file_extension=f"{config.preprocessing.audio.input_sampling_rate}-{config.preprocessing.audio.spec_type}.TextGrid",
save_dir=output_dir / "textgrids",
file_extension=file_extension,
save_dir=save_dir,
)
self.text_processor = TextProcessor(config.text)
self.output_key = output_key
self.config = config
logger.info(f"Saving pytorch output to {self.save_dir}")

def _get_filename(self, basename: str, speaker: str, language: str) -> Path:
# We don't truncate or alter the filename here because the basename is
# already truncated/cleaned in cli/synthesize.py
# the textgrid should not have the global step printed because it is used to fine-tune
# and the dataloader does not expect a global step in the filename
path = self.save_dir / self.sep.join(
[
basename,
speaker,
language,
self.file_extension,
]
)
path.parent.mkdir(
parents=True, exist_ok=True
) # synthesizing spec allows nested outputs
return path
def save_aligned_text_to_file(
self,
max_seconds: float,
phones: list[tuple[float, float, str]],
words: list[tuple[float, float, str]],
language: str,
filename: Path,
): # pragma: no cover
"""
Subclasses must implement this function to save the aligned text to file
in the desired format.
See for example PredictionWritingTextGridCallback.save_aligned_text_to_file
and PredictionWritingReadAlongCallback.save_aligned_text_to_file which save
the results to TextGrid and ReadAlong formats, respectively.
"""
raise NotImplementedError
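As a hedged illustration of this contract (not part of the PR), a minimal subclass only has to choose an extension and directory, then format the (start_seconds, end_seconds, label) tuples it is handed, e.g. as tab-separated word timings:

    class PredictionWritingTsvCallback(PredictionWritingAlignedTextCallback):  # hypothetical
        def __init__(self, config, global_step, output_dir, output_key):
            super().__init__(
                config=config,
                global_step=global_step,
                output_key=output_key,
                file_extension="words.tsv",  # made-up extension
                save_dir=output_dir / "tsv",
            )

        def save_aligned_text_to_file(self, max_seconds, phones, words, language, filename):
            # one line per word: start<TAB>end<TAB>label
            with open(filename, "w", encoding="utf8") as f:
                for start, end, label in words:
                    f.write(f"{start:.3f}\t{end:.3f}\t{label}\n")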

def frames_to_seconds(self, frames: int) -> float:
return (
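(The body is collapsed here, but the comment further down spells out the conversion: seconds = frames × hop_size ÷ sampling_rate. As a worked example under assumed audio settings, a 256-sample hop at 22050 Hz turns 100 frames into 100 × 256 / 22050 ≈ 1.16 s.)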
@@ -253,19 +246,13 @@
), f"can't synthesize {raw_text} because the number of predicted duration steps ({len(duration_frames)}) doesn't equal the number of input text labels ({len(text_labels)})"
# get the duration of the audio: (sum_of_frames * hop_size) / sample_rate
xmax_seconds = self.frames_to_seconds(sum(duration_frames))
# create new textgrid
new_tg = TextGrid(xmax=xmax_seconds)
# create the tiers
words: list[tuple[float, float, str]] = []
phones: list[tuple[float, float, str]] = []
raw_text_words = raw_text.split()
current_word_duration = 0.0
last_phone_end = 0.0
last_word_end = 0.0
phone_tier = new_tg.add_tier("phones")
phone_annotation_tier = new_tg.add_tier("phone annotations")
word_tier = new_tg.add_tier("words")
word_annotation_tier = new_tg.add_tier("word annotations")
# skip padding
text_labels_no_padding = [tl for tl in text_labels if tl != "\x80"]
duration_frames_no_padding = duration_frames[: len(text_labels_no_padding)]
@@ -277,8 +264,6 @@
current_phone_end = last_phone_end + phone_duration
interval = (last_phone_end, current_phone_end, label)
phones.append(interval)
phone_annotation_tier.add_interval(interval[0], interval[1], "")
phone_tier.add_interval(*interval)
last_phone_end = current_phone_end
# accumulate phone to word label
current_word_duration += phone_duration
@@ -291,18 +276,90 @@
raw_text_words[len(words)],
)
words.append(interval)
word_tier.add_interval(*interval)
word_annotation_tier.add_interval(interval[0], interval[1], "")
last_word_end = current_word_end
current_word_duration = 0

# get the filename
filename = self._get_filename(
basename=basename,
speaker=speaker,
language=language,
filename = self.get_filename(basename, speaker, language)
# Save the output (the subclass has to implement this)
self.save_aligned_text_to_file(
xmax_seconds, phones, words, language, filename
)
# write the file
new_tg.to_file(filename)


class PredictionWritingTextGridCallback(PredictionWritingAlignedTextCallback):
"""
This callback runs inference on a provided text-to-spec model and saves the resulting textgrid of the predicted durations to disk. This can be used for evaluation.
"""

def __init__(
self,
config: FastSpeech2Config,
global_step: int,
output_dir: Path,
output_key: str,
):
super().__init__(
config=config,
global_step=global_step,
output_key=output_key,
file_extension=f"{config.preprocessing.audio.input_sampling_rate}-{config.preprocessing.audio.spec_type}.TextGrid",
save_dir=output_dir / "textgrids",
)

def save_aligned_text_to_file(self, max_seconds, phones, words, language, filename):
"""Save the aligned text as a TextGrid with phones and words layers"""
new_tg = TextGrid(xmax=max_seconds)
phone_tier = new_tg.add_tier("phones")
phone_annotation_tier = new_tg.add_tier("phone annotations")
for interval in phones:
phone_annotation_tier.add_interval(interval[0], interval[1], "")
phone_tier.add_interval(*interval)

word_tier = new_tg.add_tier("words")
word_annotation_tier = new_tg.add_tier("word annotations")
for interval in words:
word_tier.add_interval(*interval)
word_annotation_tier.add_interval(interval[0], interval[1], "")

new_tg.to_file(filename)
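A quick way to eyeball the result is to read it back with the same pympi library imported above; a sketch, with method names taken from pympi's documented API (treat them as assumptions if your version differs):

    tg = TextGrid(file_path="sent0--spk1--eng--22050-mel.TextGrid")  # hypothetical filename
    print(tg.get_tier_name_num())  # expect the four tiers created above
    for start, end, label in tg.get_tier("words").get_intervals():
        print(f"{start:.2f}  {end:.2f}  {label}")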


class PredictionWritingReadAlongCallback(PredictionWritingAlignedTextCallback):
"""
This callback runs inference on a provided text-to-spec model and saves the resulting readalong of the predicted durations to disk. Combined with the .wav output, this can be loaded in the ReadAlongs Web-Component for viewing.
"""

def __init__(
self,
config: FastSpeech2Config,
global_step: int,
output_dir: Path,
output_key: str,
):
super().__init__(
config=config,
global_step=global_step,
output_key=output_key,
file_extension=f"{config.preprocessing.audio.input_sampling_rate}-{config.preprocessing.audio.spec_type}.readalong",
save_dir=output_dir / "readalongs",
)
self.text_processor = TextProcessor(config.text)
self.output_key = output_key
logger.info(f"Saving pytorch output to {self.save_dir}")

def save_aligned_text_to_file(self, max_seconds, phones, words, language, filename):
"""Save the aligned text as a .readalong file"""

ras_tokens: list[Token] = []
for start, end, label in words:
if ras_tokens:
ras_tokens.append(Token(text=" ", is_word=False))
ras_tokens.append(Token(text=label, time=start, dur=end - start))

readalong = convert_to_readalong([ras_tokens], [language])
with open(filename, "w", encoding="utf8") as f:
f.write(readalong)
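For a concrete picture of the token stream (timings hypothetical): two words aligned at 0.00–0.45 s and 0.45–0.90 s become

    ras_tokens = [
        Token(text="hello", time=0.0, dur=0.45),
        Token(text=" ", is_word=False),  # separator tokens carry no timing
        Token(text="world", time=0.45, dur=0.45),
    ]
    readalong = convert_to_readalong([ras_tokens], [language])  # returns the document as a string

Together with the matching --output-type wav audio, the written .readalong file is what the ReadAlongs Web-Component loads.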


class PredictionWritingWavCallback(PredictionWritingCallbackBase):
@@ -322,14 +379,14 @@ def __init__(
vocoder_global_step: int,
):
super().__init__(
config=config,
file_extension="pred.wav",
global_step=global_step,
save_dir=output_dir / "wav",
)

self.output_key = output_key
self.device = device
self.config = config
self.vocoder_model = vocoder_model
self.vocoder_config = vocoder_config
sampling_rate_change = (
@@ -403,10 +460,8 @@ def on_predict_batch_end( # pyright: ignore [reportIncompatibleMethodOverride]
outputs["tgt_lens"],
):
write(
self._get_filename(
basename=basename,
speaker=speaker,
language=language,
self.get_filename(
basename, speaker, language, include_global_step=True
),
sr,
# the vocoder output includes padding so we have to remove that