-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Sourcery Starbot ⭐ refactored ikrom96git/Real-Time-Voice-Cloning #1
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,21 +32,21 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], | |
this argument will be ignored. | ||
""" | ||
# Load the wav from disk if needed | ||
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): | ||
if isinstance(fpath_or_wav, (str, Path)): | ||
wav, source_sr = librosa.load(str(fpath_or_wav), sr=None) | ||
else: | ||
wav = fpath_or_wav | ||
|
||
# Resample the wav if needed | ||
if source_sr is not None and source_sr != sampling_rate: | ||
wav = librosa.resample(wav, source_sr, sampling_rate) | ||
|
||
# Apply the preprocessing: normalize volume and shorten long silences | ||
if normalize: | ||
wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) | ||
if webrtcvad and trim_silence: | ||
wav = trim_long_silences(wav) | ||
|
||
Comment on lines
-35
to
+49
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
return wav | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,7 +13,7 @@ def __init__(self, root: Path): | |
def _load_utterances(self): | ||
with self.root.joinpath("_sources.txt").open("r") as sources_file: | ||
sources = [l.split(",") for l in sources_file] | ||
sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources} | ||
sources = dict(sources) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()] | ||
self.utterance_cycler = RandomCycler(self.utterances) | ||
|
||
|
@@ -35,6 +35,4 @@ def random_partial(self, count, n_frames): | |
|
||
utterances = self.utterance_cycler.sample(count) | ||
|
||
a = [(u,) + u.random_partial(n_frames) for u in utterances] | ||
|
||
return a | ||
return [(u,) + u.random_partial(n_frames) for u in utterances] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,7 +11,7 @@ class SpeakerVerificationDataset(Dataset): | |
def __init__(self, datasets_root: Path): | ||
self.root = datasets_root | ||
speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()] | ||
if len(speaker_dirs) == 0: | ||
if not speaker_dirs: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
raise Exception("No speakers found. Make sure you are pointing to the directory " | ||
"containing all preprocessed speaker directories.") | ||
self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,8 +51,7 @@ def embed_frames_batch(frames_batch): | |
raise Exception("Model was not loaded. Call load_model() before inference.") | ||
|
||
frames = torch.from_numpy(frames_batch).to(_device) | ||
embed = _model.forward(frames).detach().cpu().numpy() | ||
return embed | ||
return _model.forward(frames).detach().cpu().numpy() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
|
||
def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, | ||
|
@@ -130,10 +129,7 @@ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): | |
if not using_partials: | ||
frames = audio.wav_to_mel_spectrogram(wav) | ||
embed = embed_frames_batch(frames[None, ...])[0] | ||
if return_partials: | ||
return embed, None, None | ||
return embed | ||
|
||
return (embed, None, None) if return_partials else embed | ||
Comment on lines
-133
to
+132
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
# Compute where to split the utterance into partials and pad if necessary | ||
wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) | ||
max_wave_length = wave_slices[-1].stop | ||
|
@@ -149,9 +145,7 @@ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): | |
raw_embed = np.mean(partial_embeds, axis=0) | ||
embed = raw_embed / np.linalg.norm(raw_embed, 2) | ||
|
||
if return_partials: | ||
return embed, partial_embeds, wave_slices | ||
return embed | ||
return (embed, partial_embeds, wave_slices) if return_partials else embed | ||
|
||
|
||
def embed_speaker(wavs, **kwargs): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -51,14 +51,11 @@ def forward(self, utterances, hidden_init=None): | |
# Pass the input through the LSTM layers and retrieve all outputs, the final hidden state | ||
# and the final cell state. | ||
out, (hidden, cell) = self.lstm(utterances, hidden_init) | ||
|
||
# We take only the hidden state of the last layer | ||
embeds_raw = self.relu(self.linear(hidden[-1])) | ||
|
||
# L2-normalize it | ||
embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5) | ||
|
||
return embeds | ||
return embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5) | ||
Comment on lines
-54
to
+58
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
This removes the following comments (why?):
|
||
|
||
def similarity_matrix(self, embeds): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,11 +18,11 @@ class DatasetLog: | |
Registers metadata about the dataset in a text file. | ||
""" | ||
def __init__(self, root, name): | ||
self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") | ||
self.text_file = open(Path(root, f'Log_{name.replace("/", "_")}.txt'), "w") | ||
self.sample_data = dict() | ||
|
||
start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) | ||
self.write_line("Creating dataset %s on %s" % (name, start_time)) | ||
self.write_line(f"Creating dataset {name} on {start_time}") | ||
Comment on lines
-21
to
+25
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
self.write_line("-----") | ||
self._log_params() | ||
|
||
|
@@ -39,7 +39,7 @@ def write_line(self, line): | |
|
||
def add_sample(self, **kwargs): | ||
for param_name, value in kwargs.items(): | ||
if not param_name in self.sample_data: | ||
if param_name not in self.sample_data: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
self.sample_data[param_name] = [] | ||
self.sample_data[param_name].append(value) | ||
|
||
|
@@ -51,7 +51,7 @@ def finalize(self): | |
self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values))) | ||
self.write_line("-----") | ||
end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) | ||
self.write_line("Finished on %s" % end_time) | ||
self.write_line(f"Finished on {end_time}") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
self.text_file.close() | ||
|
||
|
||
|
@@ -88,10 +88,10 @@ def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, s | |
sources_file = sources_fpath.open("a" if skip_existing else "w") | ||
audio_durs = [] | ||
for extension in _AUDIO_EXTENSIONS: | ||
for in_fpath in speaker_dir.glob("**/*.%s" % extension): | ||
for in_fpath in speaker_dir.glob(f"**/*.{extension}"): | ||
# Check if the target output file already exists | ||
out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) | ||
out_fname = out_fname.replace(".%s" % extension, ".npy") | ||
out_fname = out_fname.replace(f".{extension}", ".npy") | ||
Comment on lines
-91
to
+94
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
if skip_existing and out_fname in existing_fnames: | ||
continue | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,18 +45,17 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, | |
state_fpath = model_dir / "encoder.pt" | ||
|
||
# Load any existing model | ||
if not force_restart: | ||
if state_fpath.exists(): | ||
print("Found existing model \"%s\", loading it and resuming training." % run_id) | ||
checkpoint = torch.load(state_fpath) | ||
init_step = checkpoint["step"] | ||
model.load_state_dict(checkpoint["model_state"]) | ||
optimizer.load_state_dict(checkpoint["optimizer_state"]) | ||
optimizer.param_groups[0]["lr"] = learning_rate_init | ||
else: | ||
print("No model \"%s\" found, starting training from scratch." % run_id) | ||
else: | ||
if force_restart: | ||
print("Starting the training from scratch.") | ||
elif state_fpath.exists(): | ||
print("Found existing model \"%s\", loading it and resuming training." % run_id) | ||
checkpoint = torch.load(state_fpath) | ||
init_step = checkpoint["step"] | ||
model.load_state_dict(checkpoint["model_state"]) | ||
optimizer.load_state_dict(checkpoint["optimizer_state"]) | ||
optimizer.param_groups[0]["lr"] = learning_rate_init | ||
else: | ||
print("No model \"%s\" found, starting training from scratch." % run_id) | ||
Comment on lines
-48
to
+58
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
model.train() | ||
|
||
# Initialize the visualization environment | ||
|
@@ -74,7 +73,7 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, | |
# Forward pass | ||
inputs = torch.from_numpy(speaker_batch.data).to(device) | ||
sync(device) | ||
profiler.tick("Data to %s" % device) | ||
profiler.tick(f"Data to {device}") | ||
embeds = model(inputs) | ||
sync(device) | ||
profiler.tick("Forward pass") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,11 +42,7 @@ def __init__(self, env_name=None, update_every=10, server="http://localhost", di | |
|
||
# Set the environment name | ||
now = str(datetime.now().strftime("%d-%m %Hh%M")) | ||
if env_name is None: | ||
self.env_name = now | ||
else: | ||
self.env_name = "%s (%s)" % (env_name, now) | ||
|
||
self.env_name = now if env_name is None else f"{env_name} ({now})" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
# Connect to visdom and open the corresponding window in the browser | ||
try: | ||
self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) | ||
|
@@ -113,7 +109,7 @@ def update(self, loss, eer, step): | |
if step % self.update_every != 0: | ||
return | ||
time_string = "Step time: mean: %5dms std: %5dms" % \ | ||
(int(np.mean(self.step_times)), int(np.std(self.step_times))) | ||
(int(np.mean(self.step_times)), int(np.std(self.step_times))) | ||
Comment on lines
-116
to
+112
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
print("\nStep %6d Loss: %.4f EER: %.4f %s" % | ||
(step, np.mean(self.losses), np.mean(self.eers), time_string)) | ||
if not self.disabled: | ||
|
@@ -143,7 +139,7 @@ def update(self, loss, eer, step): | |
) | ||
if self.implementation_win is not None: | ||
self.vis.text( | ||
self.implementation_string + ("<b>%s</b>" % time_string), | ||
f"{self.implementation_string}<b>{time_string}</b>", | ||
win=self.implementation_win, | ||
opts={"title": "Training implementation"}, | ||
) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
|
||
|
||
if __name__ == "__main__": | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Lines
|
||
class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): | ||
pass | ||
|
||
|
@@ -67,5 +68,5 @@ class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptio | |
} | ||
args = vars(args) | ||
for dataset in args.pop("datasets"): | ||
print("Preprocessing %s" % dataset) | ||
print(f"Preprocessing {dataset}") | ||
preprocess_func[dataset](**args) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,14 +18,10 @@ def save_wavenet_wav(wav, path, sr): | |
sf.write(path, wav.astype(np.float32), sr) | ||
|
||
def preemphasis(wav, k, preemphasize=True): | ||
if preemphasize: | ||
return signal.lfilter([1, -k], [1], wav) | ||
return wav | ||
return signal.lfilter([1, -k], [1], wav) if preemphasize else wav | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
def inv_preemphasis(wav, k, inv_preemphasize=True): | ||
if inv_preemphasize: | ||
return signal.lfilter([1], [1, -k], wav) | ||
return wav | ||
return signal.lfilter([1], [1, -k], wav) if inv_preemphasize else wav | ||
Comment on lines
-26
to
+24
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py | ||
def start_and_end_indices(quantized, silence_threshold=2): | ||
|
@@ -51,52 +47,46 @@ def get_hop_size(hparams): | |
def linearspectrogram(wav, hparams): | ||
D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) | ||
S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db | ||
|
||
if hparams.signal_normalization: | ||
return _normalize(S, hparams) | ||
return S | ||
|
||
return _normalize(S, hparams) if hparams.signal_normalization else S | ||
Comment on lines
-54
to
+51
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
def melspectrogram(wav, hparams): | ||
D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams) | ||
S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db | ||
|
||
if hparams.signal_normalization: | ||
return _normalize(S, hparams) | ||
return S | ||
|
||
return _normalize(S, hparams) if hparams.signal_normalization else S | ||
Comment on lines
-62
to
+57
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
def inv_linear_spectrogram(linear_spectrogram, hparams): | ||
"""Converts linear spectrogram to waveform using librosa""" | ||
if hparams.signal_normalization: | ||
D = _denormalize(linear_spectrogram, hparams) | ||
else: | ||
D = linear_spectrogram | ||
|
||
S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear | ||
|
||
if hparams.use_lws: | ||
processor = _lws_processor(hparams) | ||
D = processor.run_lws(S.astype(np.float64).T ** hparams.power) | ||
y = processor.istft(D).astype(np.float32) | ||
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) | ||
else: | ||
|
||
if not hparams.use_lws: | ||
return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) | ||
processor = _lws_processor(hparams) | ||
D = processor.run_lws(S.astype(np.float64).T ** hparams.power) | ||
y = processor.istft(D).astype(np.float32) | ||
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
def inv_mel_spectrogram(mel_spectrogram, hparams): | ||
"""Converts mel spectrogram to waveform using librosa""" | ||
if hparams.signal_normalization: | ||
D = _denormalize(mel_spectrogram, hparams) | ||
else: | ||
D = mel_spectrogram | ||
|
||
S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams) # Convert back to linear | ||
|
||
if hparams.use_lws: | ||
processor = _lws_processor(hparams) | ||
D = processor.run_lws(S.astype(np.float64).T ** hparams.power) | ||
y = processor.istft(D).astype(np.float32) | ||
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) | ||
else: | ||
|
||
if not hparams.use_lws: | ||
return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize) | ||
processor = _lws_processor(hparams) | ||
D = processor.run_lws(S.astype(np.float64).T ** hparams.power) | ||
y = processor.istft(D).astype(np.float32) | ||
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize) | ||
Comment on lines
-90
to
+89
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
def _lws_processor(hparams): | ||
import lws | ||
|
@@ -109,7 +99,7 @@ def _griffin_lim(S, hparams): | |
angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) | ||
S_complex = np.abs(S).astype(np.complex) | ||
y = _istft(S_complex * angles, hparams) | ||
for i in range(hparams.griffin_lim_iters): | ||
for _ in range(hparams.griffin_lim_iters): | ||
Comment on lines
-112
to
+102
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
angles = np.exp(1j * np.angle(_stft(y, hparams))) | ||
y = _istft(S_complex * angles, hparams) | ||
return y | ||
|
@@ -129,11 +119,11 @@ def num_frames(length, fsize, fshift): | |
"""Compute number of time frames of spectrogram | ||
""" | ||
pad = (fsize - fshift) | ||
if length % fshift == 0: | ||
M = (length + pad * 2 - fsize) // fshift + 1 | ||
else: | ||
M = (length + pad * 2 - fsize) // fshift + 2 | ||
return M | ||
return ( | ||
(length + pad * 2 - fsize) // fshift + 1 | ||
if length % fshift == 0 | ||
else (length + pad * 2 - fsize) // fshift + 2 | ||
) | ||
Comment on lines
-132
to
+126
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
|
||
def pad_lr(x, fsize, fshift): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -144,13 +144,12 @@ def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]): | |
Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that | ||
were fed to the synthesizer when training. | ||
""" | ||
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): | ||
if isinstance(fpath_or_wav, (str, Path)): | ||
wav = Synthesizer.load_preprocess_wav(fpath_or_wav) | ||
else: | ||
wav = fpath_or_wav | ||
|
||
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) | ||
return mel_spectrogram | ||
return audio.melspectrogram(wav, hparams).astype(np.float32) | ||
Comment on lines
-147
to
+152
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
@staticmethod | ||
def griffin_lim(mel): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lines 207-207 refactored with the following changes: replace-interpolation-with-fstring