Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sourcery Starbot ⭐ refactored ikrom96git/Real-Time-Voice-Cloning #1

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion demo_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,5 +204,5 @@


except Exception as e:
print("Caught exception: %s" % repr(e))
print(f"Caught exception: {repr(e)}")
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line 207 refactored with the following changes:

  • Replace %-style string formatting with an f-string

print("Restarting\n")
8 changes: 4 additions & 4 deletions encoder/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,21 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
this argument will be ignored.
"""
# Load the wav from disk if needed
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
if isinstance(fpath_or_wav, (str, Path)):
wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
else:
wav = fpath_or_wav

# Resample the wav if needed
if source_sr is not None and source_sr != sampling_rate:
wav = librosa.resample(wav, source_sr, sampling_rate)

# Apply the preprocessing: normalize volume and shorten long silences
if normalize:
wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
if webrtcvad and trim_silence:
wav = trim_long_silences(wav)

Comment on lines -35 to +49
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function preprocess_wav refactored with the following changes:

return wav


Expand Down
6 changes: 2 additions & 4 deletions encoder/data_objects/speaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def __init__(self, root: Path):
def _load_utterances(self):
with self.root.joinpath("_sources.txt").open("r") as sources_file:
sources = [l.split(",") for l in sources_file]
sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
sources = dict(sources)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function Speaker._load_utterances refactored with the following changes:

self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
self.utterance_cycler = RandomCycler(self.utterances)

Expand All @@ -35,6 +35,4 @@ def random_partial(self, count, n_frames):

utterances = self.utterance_cycler.sample(count)

a = [(u,) + u.random_partial(n_frames) for u in utterances]

return a
return [(u,) + u.random_partial(n_frames) for u in utterances]
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function Speaker.random_partial refactored with the following changes:

2 changes: 1 addition & 1 deletion encoder/data_objects/speaker_verification_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class SpeakerVerificationDataset(Dataset):
def __init__(self, datasets_root: Path):
self.root = datasets_root
speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
if len(speaker_dirs) == 0:
if not speaker_dirs:
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function SpeakerVerificationDataset.__init__ refactored with the following changes:

raise Exception("No speakers found. Make sure you are pointing to the directory "
"containing all preprocessed speaker directories.")
self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
Expand Down
12 changes: 3 additions & 9 deletions encoder/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,7 @@ def embed_frames_batch(frames_batch):
raise Exception("Model was not loaded. Call load_model() before inference.")

frames = torch.from_numpy(frames_batch).to(_device)
embed = _model.forward(frames).detach().cpu().numpy()
return embed
return _model.forward(frames).detach().cpu().numpy()
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function embed_frames_batch refactored with the following changes:



def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
Expand Down Expand Up @@ -130,10 +129,7 @@ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
if not using_partials:
frames = audio.wav_to_mel_spectrogram(wav)
embed = embed_frames_batch(frames[None, ...])[0]
if return_partials:
return embed, None, None
return embed

return (embed, None, None) if return_partials else embed
Comment on lines -133 to +132
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function embed_utterance refactored with the following changes:

# Compute where to split the utterance into partials and pad if necessary
wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
max_wave_length = wave_slices[-1].stop
Expand All @@ -149,9 +145,7 @@ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
raw_embed = np.mean(partial_embeds, axis=0)
embed = raw_embed / np.linalg.norm(raw_embed, 2)

if return_partials:
return embed, partial_embeds, wave_slices
return embed
return (embed, partial_embeds, wave_slices) if return_partials else embed


def embed_speaker(wavs, **kwargs):
Expand Down
7 changes: 2 additions & 5 deletions encoder/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,11 @@ def forward(self, utterances, hidden_init=None):
# Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
# and the final cell state.
out, (hidden, cell) = self.lstm(utterances, hidden_init)

# We take only the hidden state of the last layer
embeds_raw = self.relu(self.linear(hidden[-1]))

# L2-normalize it
embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)

return embeds
return embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)
Comment on lines -54 to +58
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function SpeakerEncoder.forward refactored with the following changes:

This removes the following comments (why?):

# L2-normalize it


def similarity_matrix(self, embeds):
"""
Expand Down
12 changes: 6 additions & 6 deletions encoder/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ class DatasetLog:
Registers metadata about the dataset in a text file.
"""
def __init__(self, root, name):
self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
self.text_file = open(Path(root, f'Log_{name.replace("/", "_")}.txt'), "w")
self.sample_data = dict()

start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
self.write_line("Creating dataset %s on %s" % (name, start_time))
self.write_line(f"Creating dataset {name} on {start_time}")
Comment on lines -21 to +25
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function DatasetLog.__init__ refactored with the following changes:

self.write_line("-----")
self._log_params()

Expand All @@ -39,7 +39,7 @@ def write_line(self, line):

def add_sample(self, **kwargs):
for param_name, value in kwargs.items():
if not param_name in self.sample_data:
if param_name not in self.sample_data:
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function DatasetLog.add_sample refactored with the following changes:

  • Simplify logical expression using De Morgan identities (de-morgan)

self.sample_data[param_name] = []
self.sample_data[param_name].append(value)

Expand All @@ -51,7 +51,7 @@ def finalize(self):
self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
self.write_line("-----")
end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
self.write_line("Finished on %s" % end_time)
self.write_line(f"Finished on {end_time}")
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function DatasetLog.finalize refactored with the following changes:

self.text_file.close()


Expand Down Expand Up @@ -88,10 +88,10 @@ def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, s
sources_file = sources_fpath.open("a" if skip_existing else "w")
audio_durs = []
for extension in _AUDIO_EXTENSIONS:
for in_fpath in speaker_dir.glob("**/*.%s" % extension):
for in_fpath in speaker_dir.glob(f"**/*.{extension}"):
# Check if the target output file already exists
out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
out_fname = out_fname.replace(".%s" % extension, ".npy")
out_fname = out_fname.replace(f".{extension}", ".npy")
Comment on lines -91 to +94
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function _preprocess_speaker refactored with the following changes:

if skip_existing and out_fname in existing_fnames:
continue

Expand Down
23 changes: 11 additions & 12 deletions encoder/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,18 +45,17 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int,
state_fpath = model_dir / "encoder.pt"

# Load any existing model
if not force_restart:
if state_fpath.exists():
print("Found existing model \"%s\", loading it and resuming training." % run_id)
checkpoint = torch.load(state_fpath)
init_step = checkpoint["step"]
model.load_state_dict(checkpoint["model_state"])
optimizer.load_state_dict(checkpoint["optimizer_state"])
optimizer.param_groups[0]["lr"] = learning_rate_init
else:
print("No model \"%s\" found, starting training from scratch." % run_id)
else:
if force_restart:
print("Starting the training from scratch.")
elif state_fpath.exists():
print("Found existing model \"%s\", loading it and resuming training." % run_id)
checkpoint = torch.load(state_fpath)
init_step = checkpoint["step"]
model.load_state_dict(checkpoint["model_state"])
optimizer.load_state_dict(checkpoint["optimizer_state"])
optimizer.param_groups[0]["lr"] = learning_rate_init
else:
print("No model \"%s\" found, starting training from scratch." % run_id)
Comment on lines -48 to +58
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function train refactored with the following changes:

model.train()

# Initialize the visualization environment
Expand All @@ -74,7 +73,7 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int,
# Forward pass
inputs = torch.from_numpy(speaker_batch.data).to(device)
sync(device)
profiler.tick("Data to %s" % device)
profiler.tick(f"Data to {device}")
embeds = model(inputs)
sync(device)
profiler.tick("Forward pass")
Expand Down
10 changes: 3 additions & 7 deletions encoder/visualizations.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,7 @@ def __init__(self, env_name=None, update_every=10, server="http://localhost", di

# Set the environment name
now = str(datetime.now().strftime("%d-%m %Hh%M"))
if env_name is None:
self.env_name = now
else:
self.env_name = "%s (%s)" % (env_name, now)

self.env_name = now if env_name is None else f"{env_name} ({now})"
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function Visualizations.__init__ refactored with the following changes:

# Connect to visdom and open the corresponding window in the browser
try:
self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
Expand Down Expand Up @@ -113,7 +109,7 @@ def update(self, loss, eer, step):
if step % self.update_every != 0:
return
time_string = "Step time: mean: %5dms std: %5dms" % \
(int(np.mean(self.step_times)), int(np.std(self.step_times)))
(int(np.mean(self.step_times)), int(np.std(self.step_times)))
Comment on lines -116 to +112
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function Visualizations.update refactored with the following changes:

print("\nStep %6d Loss: %.4f EER: %.4f %s" %
(step, np.mean(self.losses), np.mean(self.eers), time_string))
if not self.disabled:
Expand Down Expand Up @@ -143,7 +139,7 @@ def update(self, loss, eer, step):
)
if self.implementation_win is not None:
self.vis.text(
self.implementation_string + ("<b>%s</b>" % time_string),
f"{self.implementation_string}<b>{time_string}</b>",
win=self.implementation_win,
opts={"title": "Training implementation"},
)
Expand Down
3 changes: 2 additions & 1 deletion encoder_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@


if __name__ == "__main__":

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line 70 refactored with the following changes:

  • Replace %-style string formatting with an f-string

class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
pass

Expand Down Expand Up @@ -67,5 +68,5 @@ class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptio
}
args = vars(args)
for dataset in args.pop("datasets"):
print("Preprocessing %s" % dataset)
print(f"Preprocessing {dataset}")
preprocess_func[dataset](**args)
62 changes: 26 additions & 36 deletions synthesizer/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,10 @@ def save_wavenet_wav(wav, path, sr):
sf.write(path, wav.astype(np.float32), sr)

def preemphasis(wav, k, preemphasize=True):
if preemphasize:
return signal.lfilter([1, -k], [1], wav)
return wav
return signal.lfilter([1, -k], [1], wav) if preemphasize else wav
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function preemphasis refactored with the following changes:


def inv_preemphasis(wav, k, inv_preemphasize=True):
if inv_preemphasize:
return signal.lfilter([1], [1, -k], wav)
return wav
return signal.lfilter([1], [1, -k], wav) if inv_preemphasize else wav
Comment on lines -26 to +24
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function inv_preemphasis refactored with the following changes:


#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
def start_and_end_indices(quantized, silence_threshold=2):
Expand All @@ -51,52 +47,46 @@ def get_hop_size(hparams):
def linearspectrogram(wav, hparams):
D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db

if hparams.signal_normalization:
return _normalize(S, hparams)
return S

return _normalize(S, hparams) if hparams.signal_normalization else S
Comment on lines -54 to +51
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function linearspectrogram refactored with the following changes:


def melspectrogram(wav, hparams):
D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db

if hparams.signal_normalization:
return _normalize(S, hparams)
return S

return _normalize(S, hparams) if hparams.signal_normalization else S
Comment on lines -62 to +57
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function melspectrogram refactored with the following changes:


def inv_linear_spectrogram(linear_spectrogram, hparams):
"""Converts linear spectrogram to waveform using librosa"""
if hparams.signal_normalization:
D = _denormalize(linear_spectrogram, hparams)
else:
D = linear_spectrogram

S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear

if hparams.use_lws:
processor = _lws_processor(hparams)
D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
else:

if not hparams.use_lws:
return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
processor = _lws_processor(hparams)
D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function inv_linear_spectrogram refactored with the following changes:


def inv_mel_spectrogram(mel_spectrogram, hparams):
"""Converts mel spectrogram to waveform using librosa"""
if hparams.signal_normalization:
D = _denormalize(mel_spectrogram, hparams)
else:
D = mel_spectrogram

S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams) # Convert back to linear

if hparams.use_lws:
processor = _lws_processor(hparams)
D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
else:

if not hparams.use_lws:
return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
processor = _lws_processor(hparams)
D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
y = processor.istft(D).astype(np.float32)
return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
Comment on lines -90 to +89
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function inv_mel_spectrogram refactored with the following changes:


def _lws_processor(hparams):
import lws
Expand All @@ -109,7 +99,7 @@ def _griffin_lim(S, hparams):
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
y = _istft(S_complex * angles, hparams)
for i in range(hparams.griffin_lim_iters):
for _ in range(hparams.griffin_lim_iters):
Comment on lines -112 to +102
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function _griffin_lim refactored with the following changes:

angles = np.exp(1j * np.angle(_stft(y, hparams)))
y = _istft(S_complex * angles, hparams)
return y
Expand All @@ -129,11 +119,11 @@ def num_frames(length, fsize, fshift):
"""Compute number of time frames of spectrogram
"""
pad = (fsize - fshift)
if length % fshift == 0:
M = (length + pad * 2 - fsize) // fshift + 1
else:
M = (length + pad * 2 - fsize) // fshift + 2
return M
return (
(length + pad * 2 - fsize) // fshift + 1
if length % fshift == 0
else (length + pad * 2 - fsize) // fshift + 2
)
Comment on lines -132 to +126
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function num_frames refactored with the following changes:



def pad_lr(x, fsize, fshift):
Expand Down
5 changes: 2 additions & 3 deletions synthesizer/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,13 +144,12 @@ def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
were fed to the synthesizer when training.
"""
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
if isinstance(fpath_or_wav, (str, Path)):
wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
else:
wav = fpath_or_wav

mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
return mel_spectrogram
return audio.melspectrogram(wav, hparams).astype(np.float32)
Comment on lines -147 to +152
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function Synthesizer.make_spectrogram refactored with the following changes:


@staticmethod
def griffin_lim(mel):
Expand Down
Loading