Skip to content

Commit

Permalink
improve prompt and fix <sup>
Browse files (browse the repository at this point in the history)
change output
  • Loading branch information
hleft committed Mar 12, 2023
1 parent 9aade4b commit 8badb41
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 26 deletions.
3 changes: 3 additions & 0 deletions book_maker/loader/epub_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ def deal_old(wait_p_list):
new_book.add_item(item)

for item in self.origin_book.get_items_of_type(ITEM_DOCUMENT):
# if item.file_name != "OEBPS/ch01.xhtml":
# continue

soup = bs(item.content, "html.parser")
p_list = soup.findAll(trans_taglist)
if self.allow_navigable_strings:
Expand Down
74 changes: 48 additions & 26 deletions book_maker/translator/chatgptapi_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def __init__(self, key, language, api_base=None, prompt_template=None):
prompt_template
or "Please help me to translate,`{text}` to {language}, please return only translated content not include the origin text"
)
self.system_content = environ.get("OPENAI_API_SYS_MSG") or ""

max_num_token = -1

Expand All @@ -31,7 +32,7 @@ def get_translation(self, text):
messages=[
{
"role": "system",
"content": environ.get("OPENAI_API_SYS_MSG") or "",
"content": self.system_content,
},
{
"role": "user",
Expand All @@ -47,11 +48,12 @@ def get_translation(self, text):
.decode()
)
print("=================================================")
print(f'Total tokens used this time: {completion["usage"]["total_tokens"]}')
self.max_num_token = max(
self.max_num_token, int(completion["usage"]["total_tokens"])
)
print(f"The maximum number of tokens used at one time: {self.max_num_token}")
print(
f"{completion['usage']['total_tokens']} {completion['usage']['prompt_tokens']} {completion['usage']['completion_tokens']} {self.max_num_token} (total_token, prompt_token, completion_tokens, max_history_total_token)"
)
return t_text

def translate(self, text, needprint=True):
Expand Down Expand Up @@ -85,42 +87,62 @@ def translate_and_split_lines(self, text):

def translate_list(self, plist):
sep = "\n\n\n\n\n"
new_str = sep.join([item.text for item in plist])
# new_str = sep.join([item.text for item in plist])

new_str = ""
for p in plist:
for sup in p.find_all("sup"):
sup.extract() # 提取并删除<sup>标签及其内容, 但不影响p
new_str += p.get_text().strip() + "\n\n"

if new_str.endswith(sep):
new_str = new_str[: -len(sep)]

retry_count = 0
plist_len = len(plist)
self.system_content += f"""Please translate the following paragraphs individually while preserving their original structure(This time it should be exactly {plist_len} paragraphs, no more or less). Only translate the paragraphs provided below:
# supplement_prompt = f"Translated result should have {plist_len} paragraphs"
# supplement_prompt = "Each paragraph in the source text should be translated into a separate and complete paragraph, and each paragraph should be separated"
supplement_prompt = "Each paragraph in the source text should be translated into a separate and complete paragraph, and each translated paragraph should be separated by a blank line"
[Insert first paragraph here]
self.prompt_template = (
"Please help me to translate,`{text}` to {language}, please return only translated content not include the origin text. "
+ supplement_prompt
)
[Insert second paragraph here]
[Insert third paragraph here]"""

retry_count = 0
lines = self.translate_and_split_lines(new_str)

while len(lines) != plist_len and retry_count < 15:
while len(lines) != plist_len and retry_count < 3:
print(
f"bug: {plist_len} paragraphs of text translated into {len(lines)} paragraphs"
f"bug: {plist_len} -> {len(lines)} : Number of paragraphs before and after translation"
)
num = 6
print(f"sleep for {num}s and try again")
time.sleep(num)
print(f"retry {retry_count+1} ...")
sleep_dur = 6
print(f"sleep for {sleep_dur}s and retry {retry_count+1} ...")
time.sleep(sleep_dur)
lines = self.translate_and_split_lines(new_str)
retry_count += 1
if len(lines) == plist_len:
print("retry success")

state = "success"
if len(lines) != plist_len:
state = "fail"

if retry_count > 0:
print(f"retry {state}")
with open("buglog.txt", "a") as f:
print(
f"retry {state}, count = {retry_count}",
file=f,
)

if len(lines) != plist_len:
for i in range(0, plist_len):
print(plist[i].text)
print()
if i < len(lines):
print(lines[i])
print()
newlist = new_str.split(sep)
for i in range(0, len(newlist)):
with open("buglog.txt", "a") as f:
print(newlist[i], file=f)
print(file=f)
if i < len(lines):
print(lines[i], file=f)
print(file=f)
print("=============================", file=f)
print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")

print(
f"bug: {plist_len} paragraphs of text translated into {len(lines)} paragraphs"
Expand Down

0 comments on commit 8badb41

Please sign in to comment.