ikrom96git · SourceryAI · Feb 29, 2024 · SourceryAI · Feb 29, 2024 · SourceryAI
diff --git a/demo_cli.py b/demo_cli.py
@@ -204,5 +204,5 @@
 
 
         except Exception as e:
-            print("Caught exception: %s" % repr(e))
+            print(f"Caught exception: {e!r}")
             print("Restarting\n")
diff --git a/encoder/audio.py b/encoder/audio.py
@@ -32,21 +32,21 @@ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
     this argument will be ignored.
     """
     # Load the wav from disk if needed
-    if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
+    if isinstance(fpath_or_wav, (str, Path)):
         wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
     else:
         wav = fpath_or_wav
-    
+
     # Resample the wav if needed
     if source_sr is not None and source_sr != sampling_rate:
-        wav = librosa.resample(wav, source_sr, sampling_rate)
-        
+        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
+
     # Apply the preprocessing: normalize volume and shorten long silences 
     if normalize:
         wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
     if webrtcvad and trim_silence:
         wav = trim_long_silences(wav)
-    
+
     return wav
 
 

diff --git a/encoder/data_objects/speaker.py b/encoder/data_objects/speaker.py
@@ -13,7 +13,7 @@ def __init__(self, root: Path):
     def _load_utterances(self):
         with self.root.joinpath("_sources.txt").open("r") as sources_file:
             sources = [l.split(",") for l in sources_file]
-        sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
+        sources = dict(sources)
         self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
         self.utterance_cycler = RandomCycler(self.utterances)
 
@@ -35,6 +35,4 @@ def random_partial(self, count, n_frames):
 
         utterances = self.utterance_cycler.sample(count)
 
-        a = [(u,) + u.random_partial(n_frames) for u in utterances]
-
-        return a
+        return [(u,) + u.random_partial(n_frames) for u in utterances]
diff --git a/encoder/data_objects/speaker_verification_dataset.py b/encoder/data_objects/speaker_verification_dataset.py
@@ -11,7 +11,7 @@ class SpeakerVerificationDataset(Dataset):
     def __init__(self, datasets_root: Path):
         self.root = datasets_root
         speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
-        if len(speaker_dirs) == 0:
+        if not speaker_dirs:
             raise Exception("No speakers found. Make sure you are pointing to the directory "
                             "containing all preprocessed speaker directories.")
         self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]

diff --git a/encoder/inference.py b/encoder/inference.py
@@ -51,8 +51,7 @@ def embed_frames_batch(frames_batch):
         raise Exception("Model was not loaded. Call load_model() before inference.")
 
     frames = torch.from_numpy(frames_batch).to(_device)
-    embed = _model.forward(frames).detach().cpu().numpy()
-    return embed
+    return _model.forward(frames).detach().cpu().numpy()
 
 
 def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
@@ -130,10 +129,7 @@ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
     if not using_partials:
         frames = audio.wav_to_mel_spectrogram(wav)
         embed = embed_frames_batch(frames[None, ...])[0]
-        if return_partials:
-            return embed, None, None
-        return embed
-
+        return (embed, None, None) if return_partials else embed
     # Compute where to split the utterance into partials and pad if necessary
     wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
     max_wave_length = wave_slices[-1].stop
@@ -149,9 +145,7 @@ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
     raw_embed = np.mean(partial_embeds, axis=0)
     embed = raw_embed / np.linalg.norm(raw_embed, 2)
 
-    if return_partials:
-        return embed, partial_embeds, wave_slices
-    return embed
+    return (embed, partial_embeds, wave_slices) if return_partials else embed
 
 
 def embed_speaker(wavs, **kwargs):

diff --git a/encoder/model.py b/encoder/model.py
@@ -51,14 +51,12 @@ def forward(self, utterances, hidden_init=None):
         # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
         # and the final cell state.
         out, (hidden, cell) = self.lstm(utterances, hidden_init)
-        
+
         # We take only the hidden state of the last layer
         embeds_raw = self.relu(self.linear(hidden[-1]))
-
-        # L2-normalize it
-        embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)        
 
-        return embeds
+        # L2-normalize it (the 1e-5 avoids dividing by zero for an all-zero vector)
+        return embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)
 
     def similarity_matrix(self, embeds):
         """

diff --git a/encoder/preprocess.py b/encoder/preprocess.py
@@ -18,11 +18,11 @@ class DatasetLog:
     Registers metadata about the dataset in a text file.
     """
     def __init__(self, root, name):
-        self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
+        self.text_file = open(Path(root, f'Log_{name.replace("/", "_")}.txt'), "w")
         self.sample_data = dict()
 
         start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
-        self.write_line("Creating dataset %s on %s" % (name, start_time))
+        self.write_line(f"Creating dataset {name} on {start_time}")
         self.write_line("-----")
         self._log_params()
 
@@ -39,7 +39,7 @@ def write_line(self, line):
 
     def add_sample(self, **kwargs):
         for param_name, value in kwargs.items():
-            if not param_name in self.sample_data:
+            if param_name not in self.sample_data:
                 self.sample_data[param_name] = []
             self.sample_data[param_name].append(value)
 
@@ -51,7 +51,7 @@ def finalize(self):
             self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
         self.write_line("-----")
         end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
-        self.write_line("Finished on %s" % end_time)
+        self.write_line(f"Finished on {end_time}")
         self.text_file.close()
 
 
@@ -88,10 +88,10 @@ def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, s
     sources_file = sources_fpath.open("a" if skip_existing else "w")
     audio_durs = []
     for extension in _AUDIO_EXTENSIONS:
-        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
+        for in_fpath in speaker_dir.glob(f"**/*.{extension}"):
             # Check if the target output file already exists
             out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
-            out_fname = out_fname.replace(".%s" % extension, ".npy")
+            out_fname = out_fname.replace(f".{extension}", ".npy")
             if skip_existing and out_fname in existing_fnames:
                 continue
 

diff --git a/encoder/train.py b/encoder/train.py
@@ -45,18 +45,17 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int,
     state_fpath = model_dir / "encoder.pt"
 
     # Load any existing model
-    if not force_restart:
-        if state_fpath.exists():
-            print("Found existing model \"%s\", loading it and resuming training." % run_id)
-            checkpoint = torch.load(state_fpath)
-            init_step = checkpoint["step"]
-            model.load_state_dict(checkpoint["model_state"])
-            optimizer.load_state_dict(checkpoint["optimizer_state"])
-            optimizer.param_groups[0]["lr"] = learning_rate_init
-        else:
-            print("No model \"%s\" found, starting training from scratch." % run_id)
-    else:
+    if force_restart:
         print("Starting the training from scratch.")
+    elif state_fpath.exists():
+        print(f'Found existing model "{run_id}", loading it and resuming training.')
+        checkpoint = torch.load(state_fpath)
+        init_step = checkpoint["step"]
+        model.load_state_dict(checkpoint["model_state"])
+        optimizer.load_state_dict(checkpoint["optimizer_state"])
+        optimizer.param_groups[0]["lr"] = learning_rate_init
+    else:
+        print(f'No model "{run_id}" found, starting training from scratch.')
     model.train()
 
     # Initialize the visualization environment
@@ -74,7 +73,7 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int,
         # Forward pass
         inputs = torch.from_numpy(speaker_batch.data).to(device)
         sync(device)
-        profiler.tick("Data to %s" % device)
+        profiler.tick(f"Data to {device}")
         embeds = model(inputs)
         sync(device)
         profiler.tick("Forward pass")

diff --git a/encoder/visualizations.py b/encoder/visualizations.py
@@ -42,11 +42,7 @@ def __init__(self, env_name=None, update_every=10, server="http://localhost", di
 
         # Set the environment name
         now = str(datetime.now().strftime("%d-%m %Hh%M"))
-        if env_name is None:
-            self.env_name = now
-        else:
-            self.env_name = "%s (%s)" % (env_name, now)
-
+        self.env_name = now if env_name is None else f"{env_name} ({now})"
         # Connect to visdom and open the corresponding window in the browser
         try:
             self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
@@ -113,7 +109,7 @@ def update(self, loss, eer, step):
         if step % self.update_every != 0:
             return
-        time_string = "Step time:  mean: %5dms  std: %5dms" % \
-                      (int(np.mean(self.step_times)), int(np.std(self.step_times)))
+        time_string = (f"Step time:  mean: {int(np.mean(self.step_times)):5d}ms  "
+                       f"std: {int(np.std(self.step_times)):5d}ms")
         print("\nStep %6d   Loss: %.4f   EER: %.4f   %s" %
               (step, np.mean(self.losses), np.mean(self.eers), time_string))
         if not self.disabled:
@@ -143,7 +139,7 @@ def update(self, loss, eer, step):
             )
             if self.implementation_win is not None:
                 self.vis.text(
-                    self.implementation_string + ("<b>%s</b>" % time_string),
+                    f"{self.implementation_string}<b>{time_string}</b>",
                     win=self.implementation_win,
                     opts={"title": "Training implementation"},
                 )

diff --git a/encoder_preprocess.py b/encoder_preprocess.py
@@ -5,6 +5,7 @@
 
 
 if __name__ == "__main__":
+
     class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
         pass
 
@@ -67,5 +68,5 @@ class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptio
     }
     args = vars(args)
     for dataset in args.pop("datasets"):
-        print("Preprocessing %s" % dataset)
+        print(f"Preprocessing {dataset}")
         preprocess_func[dataset](**args)
diff --git a/synthesizer/audio.py b/synthesizer/audio.py
@@ -18,14 +18,10 @@ def save_wavenet_wav(wav, path, sr):
     sf.write(path, wav.astype(np.float32), sr)
 
 def preemphasis(wav, k, preemphasize=True):
-    if preemphasize:
-        return signal.lfilter([1, -k], [1], wav)
-    return wav
+    return signal.lfilter([1, -k], [1], wav) if preemphasize else wav
 
 def inv_preemphasis(wav, k, inv_preemphasize=True):
-    if inv_preemphasize:
-        return signal.lfilter([1], [1, -k], wav)
-    return wav
+    return signal.lfilter([1], [1, -k], wav) if inv_preemphasize else wav
 
 #From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
 def start_and_end_indices(quantized, silence_threshold=2):
@@ -51,52 +47,46 @@ def get_hop_size(hparams):
 def linearspectrogram(wav, hparams):
     D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
     S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db
-
-    if hparams.signal_normalization:
-        return _normalize(S, hparams)
-    return S
+
+    return _normalize(S, hparams) if hparams.signal_normalization else S
 
 def melspectrogram(wav, hparams):
     D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
     S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db
-
-    if hparams.signal_normalization:
-        return _normalize(S, hparams)
-    return S
+
+    return _normalize(S, hparams) if hparams.signal_normalization else S
 
 def inv_linear_spectrogram(linear_spectrogram, hparams):
     """Converts linear spectrogram to waveform using librosa"""
     if hparams.signal_normalization:
         D = _denormalize(linear_spectrogram, hparams)
     else:
         D = linear_spectrogram
-    
+
     S = _db_to_amp(D + hparams.ref_level_db) #Convert back to linear
-
-    if hparams.use_lws:
-        processor = _lws_processor(hparams)
-        D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
-        y = processor.istft(D).astype(np.float32)
-        return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
-    else:
+
+    if not hparams.use_lws:
         return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
+    processor = _lws_processor(hparams)
+    D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
+    y = processor.istft(D).astype(np.float32)
+    return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
 
 def inv_mel_spectrogram(mel_spectrogram, hparams):
     """Converts mel spectrogram to waveform using librosa"""
     if hparams.signal_normalization:
         D = _denormalize(mel_spectrogram, hparams)
     else:
         D = mel_spectrogram
-    
+
     S = _mel_to_linear(_db_to_amp(D + hparams.ref_level_db), hparams)  # Convert back to linear
-
-    if hparams.use_lws:
-        processor = _lws_processor(hparams)
-        D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
-        y = processor.istft(D).astype(np.float32)
-        return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
-    else:
+
+    if not hparams.use_lws:
         return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)
+    processor = _lws_processor(hparams)
+    D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
+    y = processor.istft(D).astype(np.float32)
+    return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
 
 def _lws_processor(hparams):
     import lws
@@ -109,7 +99,7 @@ def _griffin_lim(S, hparams):
     angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
-    S_complex = np.abs(S).astype(np.complex)
+    S_complex = np.abs(S).astype(complex)  # np.complex alias was removed in NumPy 1.24
     y = _istft(S_complex * angles, hparams)
-    for i in range(hparams.griffin_lim_iters):
+    for _ in range(hparams.griffin_lim_iters):
         angles = np.exp(1j * np.angle(_stft(y, hparams)))
         y = _istft(S_complex * angles, hparams)
     return y
@@ -129,11 +119,11 @@ def num_frames(length, fsize, fshift):
     """Compute number of time frames of spectrogram
     """
     pad = (fsize - fshift)
-    if length % fshift == 0:
-        M = (length + pad * 2 - fsize) // fshift + 1
-    else:
-        M = (length + pad * 2 - fsize) // fshift + 2
-    return M
+    return (
+        (length + pad * 2 - fsize) // fshift + 1
+        if length % fshift == 0
+        else (length + pad * 2 - fsize) // fshift + 2
+    )
 
 
 def pad_lr(x, fsize, fshift):

diff --git a/synthesizer/inference.py b/synthesizer/inference.py
@@ -144,13 +144,12 @@ def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
         Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
         were fed to the synthesizer when training.
         """
-        if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
+        if isinstance(fpath_or_wav, (str, Path)):
             wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
         else:
             wav = fpath_or_wav
 
-        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
-        return mel_spectrogram
+        return audio.melspectrogram(wav, hparams).astype(np.float32)
 
     @staticmethod
     def griffin_lim(mel):