From b8473a9ee5e07ac98e7638abd17ea566d717cea4 Mon Sep 17 00:00:00 2001
From: Lengyue
Date: Wed, 1 May 2024 00:37:57 -0400
Subject: [PATCH] Add i18n engine

---
 .gitignore                         |   1 +
 fish_speech/i18n/__init__.py       |   3 +
 fish_speech/i18n/core.py           |  40 ++++++++++
 fish_speech/i18n/locale/en_US.json |  25 ++++++
 fish_speech/i18n/locale/zh_CN.json |  25 ++++++
 fish_speech/i18n/scan.py           | 122 +++++++++++++++++++++++++++++
 fish_speech/webui/manage.py        |  10 ++-
 tools/webui.py                     |  55 ++++++-------
 8 files changed, 251 insertions(+), 30 deletions(-)
 create mode 100644 fish_speech/i18n/__init__.py
 create mode 100644 fish_speech/i18n/core.py
 create mode 100644 fish_speech/i18n/locale/en_US.json
 create mode 100644 fish_speech/i18n/locale/zh_CN.json
 create mode 100644 fish_speech/i18n/scan.py

diff --git a/.gitignore b/.gitignore
index a97f3eda..ee2826f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ ffmpeg.exe
 asr-label-win-x64.exe
 /.cache
 /fishenv
+/.locale
diff --git a/fish_speech/i18n/__init__.py b/fish_speech/i18n/__init__.py
new file mode 100644
index 00000000..981dbb3b
--- /dev/null
+++ b/fish_speech/i18n/__init__.py
@@ -0,0 +1,3 @@
+from .core import i18n
+
+__all__ = ["i18n"]
diff --git a/fish_speech/i18n/core.py b/fish_speech/i18n/core.py
new file mode 100644
index 00000000..9f793ec9
--- /dev/null
+++ b/fish_speech/i18n/core.py
@@ -0,0 +1,40 @@
+import json
+import locale
+from pathlib import Path
+
+I18N_FILE_PATH = Path(__file__).parent / "locale"
+DEFAULT_LANGUAGE = "en_US"
+
+
+def load_language_list(language):
+    with open(I18N_FILE_PATH / f"{language}.json", "r", encoding="utf-8") as f:
+        language_list = json.load(f)
+
+    return language_list
+
+
+class I18nAuto:
+    def __init__(self):
+        i18n_file = Path(".locale")
+
+        if i18n_file.exists():
+            with open(i18n_file, "r", encoding="utf-8") as f:
+                language = f.read().strip()
+        else:
+            # getlocale can't identify the system's language ((None, None))
+            language = locale.getdefaultlocale()[0]
+
+        if (I18N_FILE_PATH / f"{language}.json").exists() is False:
+            language = DEFAULT_LANGUAGE
+
+        self.language = language
+        self.language_map = load_language_list(language)
+
+    def __call__(self, key):
+        return self.language_map.get(key, key)
+
+    def __repr__(self):
+        return "Use Language: " + self.language
+
+
+i18n = I18nAuto()
diff --git a/fish_speech/i18n/locale/en_US.json b/fish_speech/i18n/locale/en_US.json
new file mode 100644
index 00000000..a5510e62
--- /dev/null
+++ b/fish_speech/i18n/locale/en_US.json
@@ -0,0 +1,25 @@
+{
+    "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 to 10 seconds of reference audio, useful for specifying speaker.",
+    "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).",
+    "Advanced Config": "Advanced Config",
+    "Enable Reference Audio": "Enable Reference Audio",
+    "Error Message": "Error Message",
+    "Generate": "Generate",
+    "Generated Audio": "Generated Audio",
+    "Infer interface is closed": "Infer interface is closed",
+    "Inferring interface is launched at {}": "Inferring interface is launched at {}",
+    "Input Text": "Input Text",
+    "Iterative Prompt Length, 0 means off": "Iterative Prompt Length, 0 means off",
+    "Maximum tokens per batch, 0 means no limit": "Maximum tokens per batch, 0 means no limit",
+    "Opened labeler in browser": "Opened labeler in browser",
+    "Put your text here.": "Put your text here.",
+    "Reference Audio": "Reference Audio",
Text": "Reference Text", + "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.", + "Repetition Penalty": "Repetition Penalty", + "Speaker": "Speaker", + "Text is too long, please keep it under {} characters.": "Text is too long, please keep it under {} characters.", + "Type name of the speaker": "Type name of the speaker", + "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.", + "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1)." +} diff --git a/fish_speech/i18n/locale/zh_CN.json b/fish_speech/i18n/locale/zh_CN.json new file mode 100644 index 00000000..034b45c5 --- /dev/null +++ b/fish_speech/i18n/locale/zh_CN.json @@ -0,0 +1,25 @@ +{ + "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 到 10 秒的参考音频,适用于指定音色。", + "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.", + "Advanced Config": "高级参数", + "Enable Reference Audio": "启用参考音频", + "Error Message": "错误信息", + "Generate": "生成", + "Generated Audio": "音频", + "Infer interface is closed": "推理界面已关闭", + "Inferring interface is launched at {}": "推理界面已在 {} 上启动", + "Input Text": "输入文本", + "Iterative Prompt Length, 0 means off": "迭代提示长度,0 表示关闭", + "Maximum tokens per batch, 0 means no limit": "每批最大令牌数,0 表示无限制", + "Opened labeler in browser": "在浏览器中打开标注工具", + "Put your text here.": "在此处输入文本.", + "Reference Audio": "参考音频", + "Reference Text": "参考文本", + "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "相关代码使用 BSD-3-Clause 许可证发布,权重使用 CC BY-NC-SA 4.0 许可证发布.", + "Repetition Penalty": "重复惩罚", + "Speaker": "说话人", + "Text is too long, please keep it under {} characters.": "文本太长,请保持在 {} 个字符以内.", + "Type name of the speaker": "输入说话人的名称", + "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.", + "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型." 
+}
diff --git a/fish_speech/i18n/scan.py b/fish_speech/i18n/scan.py
new file mode 100644
index 00000000..d0194c0f
--- /dev/null
+++ b/fish_speech/i18n/scan.py
@@ -0,0 +1,122 @@
+import ast
+import glob
+import json
+from collections import OrderedDict
+from pathlib import Path
+
+from loguru import logger
+
+from .core import DEFAULT_LANGUAGE, I18N_FILE_PATH
+
+
+def extract_i18n_strings(node):
+    i18n_strings = []
+
+    if (
+        isinstance(node, ast.Call)
+        and isinstance(node.func, ast.Name)
+        and node.func.id == "i18n"
+    ):
+        for arg in node.args:
+            if isinstance(arg, ast.Str):
+                i18n_strings.append(arg.s)
+
+    for child_node in ast.iter_child_nodes(node):
+        i18n_strings.extend(extract_i18n_strings(child_node))
+
+    return i18n_strings
+
+
+# scan the directory for all .py files (recursively)
+# for each file, parse the code into an AST
+# for each AST, extract the i18n strings
+
+strings = []
+folders = ["fish_speech", "tools"]
+# for filename in glob.iglob("**/*.py", recursive=True):
+for folder in folders:
+    for f in Path(folder).rglob("*.py"):
+        code = f.read_text(encoding="utf-8")
+        if "i18n(" in code:
+            tree = ast.parse(code)
+            i18n_strings = extract_i18n_strings(tree)
+            logger.info(f"Found {len(i18n_strings)} i18n strings in {f}")
+            strings.extend(i18n_strings)
+
+code_keys = set(strings)
+logger.info(f"Total unique: {len(code_keys)}")
+
+
+standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
+with open(standard_file, "r", encoding="utf-8") as f:
+    standard_data = json.load(f, object_pairs_hook=OrderedDict)
+standard_keys = set(standard_data.keys())
+
+# Define the standard file name
+unused_keys = standard_keys - code_keys
+logger.info(f"Found {len(unused_keys)} unused keys in {standard_file}")
+for unused_key in unused_keys:
+    logger.info(f"\t{unused_key}")
+
+missing_keys = code_keys - standard_keys
+logger.info(f"Found {len(missing_keys)} missing keys in {standard_file}")
+for missing_key in missing_keys:
+    logger.info(f"\t{missing_key}")
+
+code_keys_dict = OrderedDict()
+for s in strings:
+    code_keys_dict[s] = s
+
+# write back
+with open(standard_file, "w", encoding="utf-8") as f:
+    json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
+    f.write("\n")
+
+logger.info(f"Updated {standard_file}")
+
+
+# Define the standard file name
+standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
+
+# Find all JSON files in the directory
+dir_path = I18N_FILE_PATH
+languages = [f for f in dir_path.glob("*.json") if f.stem != DEFAULT_LANGUAGE]
+
+# Load the standard file
+with open(standard_file, "r", encoding="utf-8") as f:
+    standard_data = json.load(f, object_pairs_hook=OrderedDict)
+
+# Loop through each language file
+for lang_file in languages:
+    # Load the language file
+    with open(lang_file, "r", encoding="utf-8") as f:
+        lang_data = json.load(f, object_pairs_hook=OrderedDict)
+
+    # Find the difference between the language file and the standard file
+    diff = set(standard_data.keys()) - set(lang_data.keys())
+
+    miss = set(lang_data.keys()) - set(standard_data.keys())
+
+    # Add any missing keys to the language file
+    for key in diff:
+        lang_data[key] = "#!" + key
+        logger.info(f"Added missing key: {key} to {lang_file}")
+
+    # Del any extra keys to the language file
+    for key in miss:
+        del lang_data[key]
+        logger.info(f"Del extra key: {key} from {lang_file}")
+
+    # Sort the keys of the language file to match the order of the standard file
+    lang_data = OrderedDict(
+        sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
+    )
+
+    # Save the updated language file
+    with open(lang_file, "w", encoding="utf-8") as f:
+        json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)
+        f.write("\n")
+
+    logger.info(f"Updated {lang_file}")
+
+logger.info("Done")
diff --git a/fish_speech/webui/manage.py b/fish_speech/webui/manage.py
index 3fde3a7f..051c393c 100644
--- a/fish_speech/webui/manage.py
+++ b/fish_speech/webui/manage.py
@@ -17,6 +17,7 @@
 from loguru import logger
 from tqdm import tqdm
 
+from fish_speech.i18n import i18n
 from fish_speech.webui.launch_utils import Seafoam, versions_html
 
 PYTHON = os.path.join(os.environ.get("PYTHON_FOLDERPATH", ""), "python")
@@ -97,7 +98,7 @@ def change_label(if_label):
         # 设置要访问的URL
         url = "https://text-labeler.pages.dev/"
         webbrowser.open(url)
-        yield f"已打开网址"
+        yield i18n("Opened labeler in browser")
     elif if_label == False:
         p_label = None
         yield "Nothing"
@@ -119,7 +120,10 @@ def change_infer(
         env["GRADIO_SERVER_NAME"] = host
         env["GRADIO_SERVER_PORT"] = port
         # 启动第二个进程
-        yield build_html_ok_message(f"推理界面已开启, 访问 http://{host}:{port}")
+        url = f"http://{host}:{port}"
+        yield build_html_ok_message(
+            i18n("Inferring interface is launched at {}").format(url)
+        )
         p_infer = subprocess.Popen(
             [
                 PYTHON,
@@ -140,7 +144,7 @@ def change_infer(
     elif if_infer == False and p_infer != None:
         kill_process(p_infer.pid)
         p_infer = None
-        yield build_html_error_message("推理界面已关闭")
+        yield build_html_error_message(i18n("Infer interface is closed"))
 
 
 js = load_data_in_raw("fish_speech/webui/js/animate.js")
diff --git a/tools/webui.py b/tools/webui.py
index 829dd7fd..17b8b4fb 100644
--- a/tools/webui.py
+++ b/tools/webui.py
@@ -12,6 +12,7 @@
 from loguru import logger
 from transformers import AutoTokenizer
 
+from fish_speech.i18n import i18n
 from tools.llama.generate import launch_thread_safe_queue
 from tools.vqgan.inference import load_model as load_vqgan_model
 
@@ -19,22 +20,18 @@
 
 os.environ["EINX_FILTER_TRACEBACK"] = "false"
 
-HEADER_MD = """# Fish Speech
+HEADER_MD = f"""# Fish Speech
 
-A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).
-由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.
+{i18n("A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).")}
 
-You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).
-你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.
+{i18n("You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).")}
 
-Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.
-相关代码使用 BSD-3-Clause 许可证发布,权重使用 CC BY-NC-SA 4.0 许可证发布.
+{i18n("Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.")}
 
-We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.
-我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.
+{i18n("We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.")} """ -TEXTBOX_PLACEHOLDER = """Put your text here. 在此处输入文本.""" +TEXTBOX_PLACEHOLDER = i18n("Put your text here.") try: import spaces @@ -76,7 +73,9 @@ def inference( if args.max_gradio_length > 0 and len(text) > args.max_gradio_length: return ( None, - f"Text is too long, please keep it under {args.max_gradio_length} characters.", + i18n("Text is too long, please keep it under {} characters.").format( + args.max_gradio_length + ), ) # Parse reference audio aka prompt @@ -171,13 +170,13 @@ def build_app(): with gr.Row(): with gr.Column(scale=3): text = gr.Textbox( - label="Input Text / 输入文本", placeholder=TEXTBOX_PLACEHOLDER, lines=15 + label=i18n("Input Text"), placeholder=TEXTBOX_PLACEHOLDER, lines=15 ) with gr.Row(): - with gr.Tab(label="Advanced Config / 高级参数"): + with gr.Tab(label=i18n("Advanced Config")): chunk_length = gr.Slider( - label="Iterative Prompt Length, 0 means off / 迭代提示长度,0 表示关闭", + label=i18n("Iterative Prompt Length, 0 means off"), minimum=0, maximum=500, value=30, @@ -185,7 +184,7 @@ def build_app(): ) max_new_tokens = gr.Slider( - label="Maximum tokens per batch, 0 means no limit / 每批最大令牌数,0 表示无限制", + label=i18n("Maximum tokens per batch, 0 means no limit"), minimum=0, maximum=args.max_length, value=0, # 0 means no limit @@ -201,7 +200,7 @@ def build_app(): ) repetition_penalty = gr.Slider( - label="Repetition Penalty", + label=i18n("Repetition Penalty"), minimum=0, maximum=2, value=1.5, @@ -217,40 +216,42 @@ def build_app(): ) speaker = gr.Textbox( - label="Speaker / 说话人", - placeholder="Type name of the speaker / 输入说话人的名称", + label=i18n("Speaker"), + placeholder=i18n("Type name of the speaker"), lines=1, ) - with gr.Tab(label="Reference Audio / 参考音频"): + with gr.Tab(label=i18n("Reference Audio")): gr.Markdown( - "5 to 10 seconds of reference audio, useful for specifying speaker. \n5 到 10 秒的参考音频,适用于指定音色。" + i18n( + "5 to 10 seconds of reference audio, useful for specifying speaker." + ) ) enable_reference_audio = gr.Checkbox( - label="Enable Reference Audio / 启用参考音频", + label=i18n("Enable Reference Audio"), ) reference_audio = gr.Audio( - label="Reference Audio / 参考音频", + label=i18n("Reference Audio"), type="filepath", ) reference_text = gr.Textbox( - label="Reference Text / 参考文本", - placeholder="参考文本", + label=i18n("Reference Text"), + placeholder=i18n("Reference Text"), lines=1, value="在一无所知中,梦里的一天结束了,一个新的「轮回」便会开始。", ) with gr.Column(scale=3): with gr.Row(): - error = gr.HTML(label="Error Message / 错误信息") + error = gr.HTML(label=i18n("Error Message")) with gr.Row(): - audio = gr.Audio(label="Generated Audio / 音频", type="numpy") + audio = gr.Audio(label=i18n("Generated Audio"), type="numpy") with gr.Row(): with gr.Column(scale=3): generate = gr.Button( - value="\U0001F3A7 Generate / 合成", variant="primary" + value="\U0001F3A7 " + i18n("Generate"), variant="primary" ) # # Submit