Fix build dataset bug & update document
leng-yue committed Dec 21, 2023
1 parent e650753 commit 4d6f6d0
Showing 3 changed files with 13 additions and 14 deletions.
6 changes: 3 additions & 3 deletions docs/en/finetune.md
@@ -37,7 +37,7 @@ python tools/vqgan/create_train_split.py data/demo
This command will create `data/demo/vq_train_filelist.txt` and `data/demo/vq_val_filelist.txt` in the `data/demo` directory, to be used for training and validation respectively.

!!!info
-For the VITS format, you can specify a file list using `--filelist xxx.list`.
+    For the VITS format, you can specify a file list using `--filelist xxx.list`.
    Please note that the audio files in `filelist` must also be located in the `data/demo` folder.

### 3. Start Training
@@ -100,7 +100,8 @@ python tools/vqgan/extract_vq.py data/demo \
```

!!! note
-You can adjust `--num-workers` and `--batch-size` to increase extraction speed, but please make sure not to exceed your GPU memory limit.
+    You can adjust `--num-workers` and `--batch-size` to increase extraction speed, but please make sure not to exceed your GPU memory limit.
+    For the VITS format, you can specify a file list using `--filelist xxx.list`.

This command will create `.npy` files in the `data/demo` directory, as shown below:

@@ -134,7 +135,6 @@ After the command finishes executing, you should see the `quantized-dataset-ft.p

!!!info
    For the VITS format, you can specify a file list using `--filelist xxx.list`.
-    Please note that the audio files referenced in `filelist` must also be located in the `data/demo` folder.

### 4. Start the Rust data server

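Both finetune documents point at `--filelist xxx.list` without showing what the file looks like. Judging from the four-field unpacking in `task_generator_filelist` below (`filename, speaker, languages, text`), a VITS-style filelist is presumably one pipe-separated record per line. The sketch below parses that assumed layout; the `|` separator, the field order, and the sample path are illustrative assumptions, not taken from this commit.

```python
# A minimal parsing sketch, assuming the pipe-separated four-field layout
# (filename|speaker|languages|text) implied by how build_dataset.py unpacks
# load_filelist's records. Separator and field order are assumptions.
from pathlib import Path


def parse_filelist(path: str):
    """Yield (filename, speaker, languages, text) for each non-empty line."""
    for line in Path(path).read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue  # skip blank lines
        filename, speaker, languages, text = line.split("|", maxsplit=3)
        yield Path(filename), speaker, languages, text


# Hypothetical line this would accept:
#   data/demo/speaker0/0001.wav|speaker0|ZH|你好, 世界.
```

Using `maxsplit=3` keeps any `|` characters inside the transcript from splitting the text field.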
4 changes: 2 additions & 2 deletions docs/zh/finetune.md
@@ -37,7 +37,7 @@ python tools/vqgan/create_train_split.py data/demo
This command will create `data/demo/vq_train_filelist.txt` and `data/demo/vq_val_filelist.txt` in the `data/demo` directory, to be used for training and validation respectively.

!!!info
-For the VITS format, you can specify a file list using `--filelist xxx.list`.
+    For the VITS format, you can specify a file list using `--filelist xxx.list`.
    Please note that the audio files referenced in `filelist` must also be located in the `data/demo` folder.

### 3. Start Training
@@ -112,7 +112,7 @@ python tools/vqgan/extract_vq.py data/demo \
```

!!! note
-You can adjust `--num-workers` and `--batch-size` to increase extraction speed, but please make sure not to exceed your GPU memory limit.
+    You can adjust `--num-workers` and `--batch-size` to increase extraction speed, but please make sure not to exceed your GPU memory limit.
    For the VITS format, you can specify a file list using `--filelist xxx.list`.

This command will create `.npy` files in the `data/demo` directory, as shown below:
17 changes: 8 additions & 9 deletions tools/llama/build_dataset.py
@@ -44,31 +44,30 @@ def task_generator_yaml(config):

    logger.info(f"Found {len(grouped_files)} groups in {root}")
    for name, subset in grouped_files.items():
-        yield name, subset, source, languages, extension, None
+        yield name, subset, source, languages, extension


def task_generator_filelist(filelist):
    grouped_files = defaultdict(list)
    for filename, speaker, languages, text in load_filelist(filelist):
        if speaker in grouped_files:
            assert (
                languages == grouped_files[speaker][0][2]
            ), f"Speaker {speaker} has different languages"

        grouped_files[speaker].append((Path(filename), text, languages))

    logger.info(f"Found {len(grouped_files)} groups in {filelist}")
    for speaker, values in grouped_files.items():
-        for filename, txt, languages in values:
-            yield speaker, filename, "filelist", languages, None, txt
+        yield speaker, values, "filelist", languages, None


def run_task(task):
-    name, subset, source, languages, extension, text = task
+    name, subset, source, languages, extension = task

    # Parse the files
    sentences = []
    for file in subset:
+        if isinstance(file, tuple):
+            file, text, languages = file
+        else:
+            text = None

        np_file = file.with_suffix(".npy")
        if np_file.exists() is False:
            logger.warning(f"Can't find {np_file}")
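The substance of the bug fix is a change to the task contract: `task_generator_filelist` now yields one task per speaker whose `subset` is a list of `(Path, text, languages)` tuples, matching `task_generator_yaml`, which yields bare paths, and `run_task` tells the two apart with `isinstance`. Below is a minimal runnable sketch of that contract; the unpacking and the `isinstance` branch mirror the diff above, while the sample tasks and the sentence collection are invented scaffolding for illustration.

```python
# A minimal sketch of the fixed task contract; only run_task's unpacking
# and isinstance branch come from the diff, the rest is illustrative.
from pathlib import Path


def run_task(task):
    name, subset, source, languages, extension = task
    sentences = []
    for file in subset:
        # Filelist tasks carry (path, text, languages) tuples;
        # YAML tasks carry bare paths, so no transcript is attached.
        if isinstance(file, tuple):
            file, text, languages = file
        else:
            text = None
        sentences.append((file, text))
    return name, sentences


# Before the fix, the filelist generator yielded a 6-tuple per *file*,
# so run_task's `for file in subset:` tried to iterate a single Path.
# Yielding the whole per-speaker list restores one task per group.
yaml_task = ("demo", [Path("a.wav"), Path("b.wav")], "yaml", ["ZH"], ".wav")
filelist_task = ("spk0", [(Path("a.wav"), "你好", ["ZH"])], "filelist", ["ZH"], None)
print(run_task(yaml_task))
print(run_task(filelist_task))
```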
