Cumulative translation #148

Merged
merged 37 commits on Mar 16, 2023
Changes from 30 commits
Commits
37 commits
502d909
support config tags to translate
hleft Mar 8, 2023
7505d59
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 10, 2023
c31b01c
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 10, 2023
3277eef
support system message in environment
hleft Mar 11, 2023
c137fec
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 11, 2023
f39b926
cumulative translation
hleft Mar 11, 2023
58d7939
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 11, 2023
59ed6d9
fix
hleft Mar 11, 2023
7c91331
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 11, 2023
2baf359
fix
hleft Mar 11, 2023
21478d9
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 11, 2023
3523469
clean
hleft Mar 11, 2023
9aade4b
prompt and retry
hleft Mar 11, 2023
8badb41
improve prompt and fix <sup>
hleft Mar 12, 2023
b0d4f86
clean, fix link translate
hleft Mar 12, 2023
46320b4
clean
hleft Mar 12, 2023
0546647
more prompt, exclude Listing, change output
hleft Mar 12, 2023
4be8dc4
deal exception: ["finish_reason"] == "length"
hleft Mar 13, 2023
8174ebe
shorter prompt
hleft Mar 13, 2023
b424b3f
Cumulative tokens instead of characters
hleft Mar 13, 2023
b93fd44
reduce err
hleft Mar 13, 2023
8baf990
deal figure, change output for test
hleft Mar 14, 2023
e7b6c27
If there will be errors in the end, choose the least erroneous
hleft Mar 14, 2023
97da3fe
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 14, 2023
b325621
revert write book
hleft Mar 14, 2023
226da38
clean
hleft Mar 15, 2023
6e714d3
refactor epub_loader by gpt4
hleft Mar 15, 2023
4e42860
refactor
hleft Mar 15, 2023
cc85d6b
update readme and help
hleft Mar 15, 2023
c859c69
fix
hleft Mar 15, 2023
173b756
improve exception
hleft Mar 15, 2023
e24a1c5
improve exception
hleft Mar 15, 2023
32cadfa
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 16, 2023
cf92918
Merge branch 'yihong0618:main' into cumulative-translation
hleft Mar 16, 2023
c535041
use ordinals to ensure order instead of prompts
hleft Mar 16, 2023
34ed9dc
Merge branch 'cumulative-translation' of https://github.com/hleft/bil…
hleft Mar 16, 2023
b73240a
comment debug output for merge
hleft Mar 16, 2023
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -131,4 +131,5 @@ dmypy.json
# Pyre type checker
.pyre/

/test_books/*.epub
/test_books/*.epub
log/
15 changes: 9 additions & 6 deletions README.md
@@ -16,33 +16,36 @@ The bilingual_book_maker is an AI translation tool that uses ChatGPT to assist u
## Use

- `pip install -r requirements.txt` or `pip install -U bbook_maker` (either works)
- Use `--openai_key` option to specify OpenAI API key. If you have multiple keys, separate them by commas (xxx,xxx,xxx) to reduce errors caused by API call limits.
- Use `--openai_key` option to specify OpenAI API key. If you have multiple keys, separate them by commas (xxx,xxx,xxx) to reduce errors caused by API call limits.
Or, just set environment variable `BMM_OPENAI_API_KEY` instead.
- A sample book, `test_books/animal_farm.epub`, is provided for testing purposes.
- The default underlying model is [GPT-3.5-turbo](https://openai.com/blog/introducing-chatgpt-and-whisper-apis), which is used by ChatGPT currently. Use `--model gpt3` to change the underlying model to `GPT3`
- Support for the [DeepL Translator](https://rapidapi.com/splintPRO/api/deepl-translator) model (a paid token is required): use `--model deepl --deepl_key ${deepl_key}`
- Use `--test` option to preview the result if you haven't paid for the service. Note that there is a limit and it may take some time.
- Set the target language like `--language "Simplified Chinese"`. Default target language is `"Simplified Chinese"`.
- Set the target language like `--language "Simplified Chinese"`. Default target language is `"Simplified Chinese"`.
Read available languages by helper message: `python make_book.py --help`
- Use `--proxy` option to specify proxy server for internet access. Enter a string such as `http://127.0.0.1:7890`.
- Use `--resume` option to manually resume the process after an interruption.
- epub is made of html files. By default, we only translate contents in `<p>`.
Use `--translate-tags` to specify the tags that need translation. Use commas to separate multiple tags. For example:
`--translate-tags h1,h2,h3,p,div`
- Use `--book_from` option to specify e-reader type (Now only `kobo` is available), and use `--device_path` to specify the mounting point.
- If you want to change api_base like using Cloudflare Workers, use `--api_base <URL>` to support it.
- If you want to change api_base like using Cloudflare Workers, use `--api_base <URL>` to support it.
**Note: the api url should be '`https://xxxx/v1`'. Quotation marks are required.**
- Once the translation is complete, a bilingual book named `${book_name}_bilingual.epub` would be generated.
- If any errors occur, or you interrupt the translation by pressing `CTRL+C`, a book named `${book_name}_bilingual_temp.epub` will be generated. You can simply rename it to any desired name.
- If you want to translate strings in an e-book that aren't labeled with any tags, you can use the `--allow_navigable_strings` parameter. This will add the strings to the translation queue. **Note that it's best to look for e-books that are more standardized if possible.**
- To tweak the prompt, use the `--prompt` parameter. Valid placeholders for the `user` role template include `{text}` and `{language}`. It supports a few ways to configure the prompt:
If you don't need to set the `system` role content, you can simply set it up like this: `--prompt "Translate {text} to {language}."` or `--prompt prompt_template_sample.txt` (example of a text file can be found at [./prompt_template_sample.txt](./prompt_template_sample.txt)).
If you need to set the `system` role content, you can use the following format: `--prompt '{"user":"Translate {text} to {language}", "system": "You are a professional translator."}'` or `--prompt prompt_template_sample.json` (example of a JSON file can be found at [./prompt_template_sample.json](./prompt_template_sample.json)).
- To tweak the prompt, use the `--prompt` parameter. Valid placeholders for the `user` role template include `{text}` and `{language}`. It supports a few ways to configure the prompt:
If you don't need to set the `system` role content, you can simply set it up like this: `--prompt "Translate {text} to {language}."` or `--prompt prompt_template_sample.txt` (example of a text file can be found at [./prompt_template_sample.txt](./prompt_template_sample.txt)).
If you need to set the `system` role content, you can use the following format: `--prompt '{"user":"Translate {text} to {language}", "system": "You are a professional translator."}'` or `--prompt prompt_template_sample.json` (example of a JSON file can be found at [./prompt_template_sample.json](./prompt_template_sample.json)).
You can also set the `user` and `system` role prompt by setting environment variables: `BBM_CHATGPTAPI_USER_MSG_TEMPLATE` and `BBM_CHATGPTAPI_SYS_MSG`.
- Once the translation is complete, a bilingual book named `${book_name}_bilingual.epub` would be generated.
- If any errors occur, or you interrupt the translation by pressing `CTRL+C`, a book named `${book_name}_bilingual_temp.epub` will be generated. You can simply rename it to any desired name.
- If you want to translate strings in an e-book that aren't labeled with any tags, you can use the `--allow_navigable_strings` parameter. This will add the strings to the translation queue. **Note that it's best to look for e-books that are more standardized if possible.**
- Use the `--batch_size` parameter to specify the number of lines for batch translation (default is 10, currently only effective for txt files).
- `--accumulated_num` How many tokens to accumulate before starting a translation request. gpt3.5 limits the total_token to 4090. For example, with `--accumulated_num 1600`, OpenAI might return 2200 tokens, and the system and user messages might take another 200: 1600+2200+200=4000, so you are close to the limit. You have to choose the value yourself; there is no way to know whether the limit will be reached before sending.
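The accumulation arithmetic above can be sketched as a simple greedy batcher — a toy model of what this PR's `translate_paragraphs_acc` does. The token counts below are hypothetical; the real tool measures them with tiktoken, and it also special-cases single paragraphs that exceed the budget on their own.

```python
def batch_paragraphs(token_lengths, send_num):
    """Greedily group paragraphs so each batch stays under send_num tokens.

    A paragraph longer than send_num by itself still gets its own batch
    (the real code translates such paragraphs individually instead).
    """
    batches, current, count = [], [], 0
    for length in token_lengths:
        if count + length < send_num:
            current.append(length)
            count += length
        else:
            if current:
                batches.append(current)
            current, count = [length], length
    if current:
        batches.append(current)
    return batches

# Five paragraphs with made-up token counts, batched under a 1600-token budget.
print(batch_paragraphs([500, 700, 300, 900, 200], 1600))
# [[500, 700, 300], [900, 200]]
```

Each batch sums to less than the budget, leaving headroom for the model's reply and the message overhead, which is exactly the margin the `1600+2200+200` example describes.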

### Examples

13 changes: 13 additions & 0 deletions book_maker/cli.py
@@ -170,6 +170,18 @@ def main():
metavar="PROMPT_ARG",
help="used for customizing the prompt. It can be the prompt template string, or a path to the template file. The valid placeholders are `{text}` and `{language}`.",
)
parser.add_argument(
"--accumulated_num",
dest="accumulated_num",
type=int,
default=1,
help="""How many tokens to accumulate before starting a translation request.
gpt3.5 limits the total_token to 4090.
For example, with --accumulated_num 1600, OpenAI might return 2200 tokens,
and the system and user messages might take another 200: 1600+2200+200=4000,
close to the limit. You have to choose the value yourself; there is no way
to know whether the limit will be reached before sending.
""",
)
parser.add_argument(
"--batch_size",
dest="batch_size",
@@ -250,6 +262,7 @@ def main():
test_num=options.test_num,
translate_tags=options.translate_tags,
allow_navigable_strings=options.allow_navigable_strings,
accumulated_num=options.accumulated_num,
prompt_config=parse_prompt_arg(options.prompt_arg),
batch_size=options.batch_size,
)
227 changes: 198 additions & 29 deletions book_maker/loader/epub_loader.py
@@ -1,5 +1,7 @@
import os
import re
import pickle
import tiktoken
import sys
from copy import copy
from pathlib import Path
@@ -15,6 +17,95 @@
from .base_loader import BaseBookLoader


class EPUBBookLoaderHelper:
def __init__(self, translate_model, accumulated_num):
self.translate_model = translate_model
self.accumulated_num = accumulated_num

def deal_new(self, p, wait_p_list):
self.deal_old(wait_p_list)
new_p = copy(p)
new_p.string = self.translate_model.translate(p.text)
p.insert_after(new_p)

def deal_old(self, wait_p_list):
if len(wait_p_list) == 0:
return

result_txt_list = self.translate_model.translate_list(wait_p_list)

for i in range(len(wait_p_list)):
if i < len(result_txt_list):
p = wait_p_list[i]
new_p = copy(p)
new_p.string = result_txt_list[i]
p.insert_after(new_p)

wait_p_list.clear()


# ref: https://platform.openai.com/docs/guides/chat/introduction
def num_tokens_from_text(text, model="gpt-3.5-turbo-0301"):
"""Returns the number of tokens used by a list of messages."""
messages = (
{
"role": "user",
"content": text,
},
)
try:
encoding = tiktoken.encoding_for_model(model)
except KeyError:
encoding = tiktoken.get_encoding("cl100k_base")
if model == "gpt-3.5-turbo-0301": # note: future models may deviate from this
num_tokens = 0
for message in messages:
num_tokens += (
4 # every message follows <im_start>{role/name}\n{content}<im_end>\n
)
for key, value in message.items():
num_tokens += len(encoding.encode(value))
if key == "name": # if there's a name, the role is omitted
num_tokens += -1 # role is always required and always 1 token
num_tokens += 2 # every reply is primed with <im_start>assistant
return num_tokens
else:
raise NotImplementedError(
f"""num_tokens_from_messages() is not presently implemented for model {model}.
See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
)


def is_link(text):
url_pattern = re.compile(
r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)
return bool(url_pattern.match(text.strip()))


def is_tail_Link(text, num=100):
text = text.strip()
url_pattern = re.compile(
r".*http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$"
)
return bool(url_pattern.match(text)) and len(text) < num


def is_source(text):
return text.strip().startswith("Source: ")


def is_list(text, num=80):
text = text.strip()
return re.match(r"^Listing\s*\d+", text) and len(text) < num


def is_figure(text, num=80):
text = text.strip()
return re.match(r"^Figure\s*\d+", text) and len(text) < num
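Together these guards skip text that is not worth sending to the model: bare URLs, source attributions, and short "Listing N"/"Figure N" captions. A quick check of two of them — the regexes are copied from the diff above (with an explicit `bool()` wrapper for a clean True/False return); the sample strings are made up.

```python
import re

def is_link(text):
    url_pattern = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    return bool(url_pattern.match(text.strip()))

def is_figure(text, num=80):
    # Short "Figure N ..." captions are skipped; long prose mentioning a
    # figure is not (re.match only anchors at the start of the string).
    text = text.strip()
    return bool(re.match(r"^Figure\s*\d+", text)) and len(text) < num

print(is_link("https://example.com/page"))         # True
print(is_link("See the appendix"))                 # False
print(is_figure("Figure 3. Training loss"))        # True
print(is_figure("The figure shows a loss curve"))  # False
```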


class EPUBBookLoader(BaseBookLoader):
def __init__(
self,
@@ -29,6 +120,8 @@ def __init__(
test_num=5,
translate_tags="p",
allow_navigable_strings=False,
accumulated_num=1,
prompt_template=None,
prompt_config=None,
):
self.epub_name = epub_name
@@ -43,6 +136,8 @@ def __init__(
self.test_num = test_num
self.translate_tags = translate_tags
self.allow_navigable_strings = allow_navigable_strings
self.accumulated_num = accumulated_num
self.helper = EPUBBookLoaderHelper(self.translate_model, self.accumulated_num)

try:
self.origin_book = epub.read_epub(self.epub_name)
@@ -70,7 +165,7 @@ def _load_spine(self):

@staticmethod
def _is_special_text(text):
return text.isdigit() or text.isspace()
return text.isdigit() or text.isspace() or is_link(text)

def _make_new_book(self, book):
new_book = epub.EpubBook()
@@ -79,6 +174,70 @@ def _make_new_book(self, book):
new_book.toc = book.toc
return new_book

def _process_paragraph(self, p, index, p_to_save_len):
if not p.text or self._is_special_text(p.text):
return index

new_p = copy(p)

if self.resume and index < p_to_save_len:
new_p.string = self.p_to_save[index]
else:
if type(p) == NavigableString:
new_p = self.translate_model.translate(p.text)
self.p_to_save.append(new_p)
else:
new_p.string = self.translate_model.translate(p.text)
self.p_to_save.append(new_p.text)

p.insert_after(new_p)
index += 1

if index % 20 == 0:
self._save_progress()

return index

def translate_paragraphs_acc(self, p_list, send_num):
count = 0
wait_p_list = []
for i in range(len(p_list)):
p = p_list[i]
temp_p = copy(p)
for sup in temp_p.find_all("sup"):
sup.extract()
if (
not p.text
or self._is_special_text(temp_p.text)
or is_source(temp_p.text)
or is_list(temp_p.text)
or is_figure(temp_p.text)
or is_tail_Link(temp_p.text)
):
continue
length = num_tokens_from_text(temp_p.text)
if length > send_num:
self.helper.deal_new(p, wait_p_list)
continue
if i == len(p_list) - 1:
if count + length < send_num:
wait_p_list.append(p)
self.helper.deal_old(wait_p_list)
else:
self.helper.deal_new(p, wait_p_list)
break
if count + length < send_num:
count += length
wait_p_list.append(p)
# The more paragraphs in a batch, the more likely the model returns a different number of paragraphs than it was sent; 15 and 2 are heuristic values that may need tuning
if len(wait_p_list) > 15 and count > send_num / 2:
self.helper.deal_old(wait_p_list)
count = 0
else:
self.helper.deal_old(wait_p_list)
wait_p_list.append(p)
count = length

def make_bilingual_book(self):
new_book = self._make_new_book(self.origin_book)
all_items = list(self.origin_book.get_items())
@@ -99,46 +258,56 @@ def make_bilingual_book(self):
index = 0
p_to_save_len = len(self.p_to_save)
try:
# Add the items that don't need translation first, so images are still visible after an interruption
for item in self.origin_book.get_items():
if item.get_type() == ITEM_DOCUMENT:
soup = bs(item.content, "html.parser")
p_list = soup.findAll(trans_taglist)
if self.allow_navigable_strings:
p_list.extend(soup.findAll(text=True))
if item.get_type() != ITEM_DOCUMENT:
new_book.add_item(item)

for item in self.origin_book.get_items_of_type(ITEM_DOCUMENT):
# if item.file_name != "OEBPS/ch01.xhtml":
# continue
if not os.path.exists("log"):
os.makedirs("log")

soup = bs(item.content, "html.parser")
p_list = soup.findAll(trans_taglist)
if self.allow_navigable_strings:
p_list.extend(soup.findAll(text=True))

send_num = self.accumulated_num
if send_num > 1:
with open("log/buglog.txt", "a") as f:
print(f"------------- {item.file_name} -------------", file=f)

print("------------------------------------------------------")
print(f"dealing {item.file_name} ...")
self.translate_paragraphs_acc(p_list, send_num)
else:
is_test_done = self.is_test and index > self.test_num
for p in p_list:
if is_test_done or not p.text or self._is_special_text(p.text):
continue
new_p = copy(p)
# TODO batch paragraphs to translate, then combine
# PR welcome here
if self.resume and index < p_to_save_len:
new_p.string = self.p_to_save[index]
else:
if type(p) == NavigableString:
new_p = self.translate_model.translate(p.text)
self.p_to_save.append(new_p)
else:
new_p.string = self.translate_model.translate(p.text)
self.p_to_save.append(new_p.text)
p.insert_after(new_p)
index += 1
if index % 20 == 0:
self._save_progress()
if is_test_done:
break
index = self._process_paragraph(p, index, p_to_save_len)
# pbar.update(delta) not pbar.update(index)?
pbar.update(1)
if self.is_test and index >= self.test_num:
break
item.content = soup.prettify().encode()

item.content = soup.prettify().encode()
new_book.add_item(item)
if self.accumulated_num > 1:
name, _ = os.path.splitext(self.epub_name)
epub.write_epub(f"{name}_bilingual.epub", new_book, {})
name, _ = os.path.splitext(self.epub_name)
epub.write_epub(f"{name}_bilingual.epub", new_book, {})
pbar.close()
if self.accumulated_num == 1:
pbar.close()
except (KeyboardInterrupt, Exception) as e:
print(e)
print("you can resume it next time")
self._save_progress()
self._save_temp_book()
if self.accumulated_num == 1:
print("you can resume it next time")
self._save_progress()
self._save_temp_book()
sys.exit(0)

def load_state(self):
4 changes: 3 additions & 1 deletion book_maker/loader/txt_loader.py
@@ -20,6 +20,8 @@ def __init__(
model_api_base=None,
is_test=False,
test_num=5,
accumulated_num=1,
prompt_template=None,
prompt_config=None,
):
self.txt_name = txt_name
@@ -102,7 +104,7 @@ def _save_temp_book(self):
for i in range(0, len(self.origin_book), self.batch_size)
]

for i in range(0, len(sliced_list)):
for i in range(len(sliced_list)):
batch_text = "".join(sliced_list[i])
self.bilingual_temp_result.append(batch_text)
if self._is_special_text(self.origin_book[i]):
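The slicing comprehension in `_save_temp_book` above cuts the book into fixed-size chunks, with the last chunk possibly shorter. In isolation it behaves like this (sample data made up; the tool's default `batch_size` is 10):

```python
lines = ["l1", "l2", "l3", "l4", "l5"]
batch_size = 2

# Same comprehension as in txt_loader.py: step through the list in
# batch_size strides and take a slice at each position.
sliced_list = [
    lines[i : i + batch_size] for i in range(0, len(lines), batch_size)
]
print(sliced_list)  # [['l1', 'l2'], ['l3', 'l4'], ['l5']]
```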