Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cumulative translation #148

Merged
merged 37 commits into from
Mar 16, 2023
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
502d909
support config tags to translate
hleft Mar 8, 2023
7505d59
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 10, 2023
c31b01c
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 10, 2023
3277eef
support system message in environment
hleft Mar 11, 2023
c137fec
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 11, 2023
f39b926
cumulative translation
hleft Mar 11, 2023
58d7939
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 11, 2023
59ed6d9
fix
hleft Mar 11, 2023
7c91331
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 11, 2023
2baf359
fix
hleft Mar 11, 2023
21478d9
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 11, 2023
3523469
clean
hleft Mar 11, 2023
9aade4b
prompt and retry
hleft Mar 11, 2023
8badb41
improve prompt and fix <sup>
hleft Mar 12, 2023
b0d4f86
clean, fix link translate
hleft Mar 12, 2023
46320b4
clean
hleft Mar 12, 2023
0546647
more prompt, exclude Listing, change output
hleft Mar 12, 2023
4be8dc4
deal exception: ["finish_reason"] == "length"
hleft Mar 13, 2023
8174ebe
shorter prompt
hleft Mar 13, 2023
b424b3f
Cumulative tokens instead of characters
hleft Mar 13, 2023
b93fd44
reduce err
hleft Mar 13, 2023
8baf990
deal figure, change output for test
hleft Mar 14, 2023
e7b6c27
If there will be errors in the end, choose the least erroneous
hleft Mar 14, 2023
97da3fe
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 14, 2023
b325621
revert write book
hleft Mar 14, 2023
226da38
clean
hleft Mar 15, 2023
6e714d3
refactor epub_loader by gpt4
hleft Mar 15, 2023
4e42860
refactor
hleft Mar 15, 2023
cc85d6b
update readme and help
hleft Mar 15, 2023
c859c69
fix
hleft Mar 15, 2023
173b756
improve exception
hleft Mar 15, 2023
e24a1c5
improve exception
hleft Mar 15, 2023
32cadfa
Merge branch 'main' of https://github.com/yihong0618/bilingual_book_m…
hleft Mar 16, 2023
cf92918
Merge branch 'yihong0618:main' into cumulative-translation
hleft Mar 16, 2023
c535041
use ordinals to ensure order instead of prompts
hleft Mar 16, 2023
34ed9dc
Merge branch 'cumulative-translation' of https://github.com/hleft/bil…
hleft Mar 16, 2023
b73240a
comment debug output for merge
hleft Mar 16, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions book_maker/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,13 @@ def main():
metavar="PROMPT_TEMPLATE",
help="used for customizing the prompt. It can be the prompt template string, or a path to the template file. The valid placeholders are `{text}` and `{language}`.",
)
parser.add_argument(
"--accumulated_num",
dest="accumulated_num",
type=int,
default=1,
help="Wait for how many characters have been accumulated before starting the translation",
)

options = parser.parse_args()
PROXY = options.proxy
Expand Down Expand Up @@ -183,6 +190,7 @@ def main():
test_num=options.test_num,
translate_tags=options.translate_tags,
allow_navigable_strings=options.allow_navigable_strings,
accumulated_num=options.accumulated_num,
prompt_template=parse_prompt_arg(options.prompt_template),
)
e.make_bilingual_book()
Expand Down
113 changes: 100 additions & 13 deletions book_maker/loader/epub_loader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import re
import pickle
import sys
from copy import copy
Expand All @@ -12,6 +13,21 @@
from .base_loader import BaseBookLoader


# Compiled once at import time; the original re-compiled the pattern on
# every call (re's internal cache made that cheap but not free, and the
# hoisted constant also documents the pattern in one place).
_URL_PATTERN = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)


def isLink(text):
    """Return True if *text*, after stripping whitespace, begins with an
    http(s) URL.

    Used to mark paragraphs that are bare links so the loader can skip
    translating them.
    """
    return bool(_URL_PATTERN.match(text.strip()))


def isSourceLink(text):
    """Return True if *text* is a "Source: <url>" attribution line.

    Fix: the original returned ``text.startswith(...) and re.search(...)``,
    i.e. either ``False`` or an ``re.Match`` object — an inconsistent type
    for a predicate. Callers only use it in boolean context, so wrapping in
    ``bool`` is backward-compatible.
    """
    text = text.strip()
    if not text.startswith("Source: "):
        return False
    return bool(
        re.search(
            r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            text,
        )
    )


class EPUBBookLoader(BaseBookLoader):
def __init__(
self,
Expand All @@ -25,6 +41,7 @@ def __init__(
test_num=5,
translate_tags="p",
allow_navigable_strings=False,
accumulated_num=1,
prompt_template=None,
):
self.epub_name = epub_name
Expand All @@ -36,6 +53,7 @@ def __init__(
self.test_num = test_num
self.translate_tags = translate_tags
self.allow_navigable_strings = allow_navigable_strings
self.accumulated_num = accumulated_num

try:
self.origin_book = epub.read_epub(self.epub_name)
Expand Down Expand Up @@ -63,7 +81,7 @@ def _load_spine(self):

@staticmethod
def _is_special_text(text):
    """Return True for text that needs no translation: pure digits,
    pure whitespace, or a bare URL (see ``isLink``)."""
    # The pasted diff showed both the pre- and post-change return lines;
    # this is the post-change version, which additionally skips links.
    return text.isdigit() or text.isspace() or isLink(text)

def _make_new_book(self, book):
new_book = epub.EpubBook()
Expand All @@ -73,6 +91,28 @@ def _make_new_book(self, book):
return new_book

def make_bilingual_book(self):
def deal_new(p, wait_p_list):
    """Handle a paragraph too long to batch: first flush every queued
    paragraph, then translate *p* on its own and insert the translation
    right after it. Returns the (now empty) queue."""
    emptied = deal_old(wait_p_list)
    clone = copy(p)
    clone.string = self.translate_model.translate(p.text)
    p.insert_after(clone)
    return emptied

def deal_old(wait_p_list):
    """Translate all queued paragraphs in a single batched request and
    insert each translation after its source paragraph. Always returns
    an empty list so the caller can reset its queue."""
    if not wait_p_list:
        return []

    translated_texts = self.translate_model.translate_list(wait_p_list)

    # zip stops at the shorter sequence, which reproduces the original
    # "i < len(result_txt_list)" bounds check: a short translation list
    # simply leaves the trailing paragraphs untranslated.
    for original, new_text in zip(wait_p_list, translated_texts):
        clone = copy(original)
        clone.string = new_text
        original.insert_after(clone)

    return []

new_book = self._make_new_book(self.origin_book)
all_items = list(self.origin_book.get_items())
trans_taglist = self.translate_tags.split(",")
Expand All @@ -92,12 +132,56 @@ def make_bilingual_book(self):
index = 0
p_to_save_len = len(self.p_to_save)
try:
# Add the things that don't need to be translated first, so that you can see the img after the interruption
for item in self.origin_book.get_items():
if item.get_type() == ITEM_DOCUMENT:
soup = bs(item.content, "html.parser")
p_list = soup.findAll(trans_taglist)
if self.allow_navigable_strings:
p_list.extend(soup.findAll(text=True))
if item.get_type() != ITEM_DOCUMENT:
new_book.add_item(item)

for item in self.origin_book.get_items_of_type(ITEM_DOCUMENT):
# if item.file_name != "OEBPS/ch01.xhtml":
# continue

soup = bs(item.content, "html.parser")
p_list = soup.findAll(trans_taglist)
if self.allow_navigable_strings:
p_list.extend(soup.findAll(text=True))

send_num = self.accumulated_num
if send_num > 1:
print("------------------------------------------------------")
print(f"dealing {item.file_name} ...")
count = 0
wait_p_list = []
for i in range(0, len(p_list)):
hleft marked this conversation as resolved.
Show resolved Hide resolved
p = p_list[i]
temp_p = copy(p)
for sup in temp_p.find_all("sup"):
sup.extract()
if (
not p.text
or self._is_special_text(temp_p.text)
or isSourceLink(temp_p.text)
):
continue
length = len(p.text)
if length > send_num:
wait_p_list = deal_new(p, wait_p_list)
continue
if i == len(p_list) - 1:
if count + length < send_num:
wait_p_list.append(p)
wait_p_list = deal_old(wait_p_list)
else:
wait_p_list = deal_new(p, wait_p_list)
break
if count + length < send_num:
count += length
wait_p_list.append(p)
else:
wait_p_list = deal_old(wait_p_list)
wait_p_list.append(p)
count = len(p.text)
else:
is_test_done = self.is_test and index > self.test_num
for p in p_list:
if is_test_done or not p.text or self._is_special_text(p.text):
Expand All @@ -118,16 +202,19 @@ def make_bilingual_book(self):
pbar.update(1)
if self.is_test and index >= self.test_num:
break
item.content = soup.prettify().encode()

item.content = soup.prettify().encode()
new_book.add_item(item)
name, _ = os.path.splitext(self.epub_name)
epub.write_epub(f"{name}_bilingual.epub", new_book, {})
pbar.close()
name, _ = os.path.splitext(self.epub_name)
epub.write_epub(f"{name}_bilingual.epub", new_book, {})
if self.accumulated_num == 1:
pbar.close()
except (KeyboardInterrupt, Exception) as e:
print(e)
print("you can resume it next time")
self._save_progress()
self._save_temp_book()
if self.accumulated_num == 1:
print("you can resume it next time")
self._save_progress()
self._save_temp_book()
sys.exit(0)

def load_state(self):
Expand Down
1 change: 1 addition & 0 deletions book_maker/loader/txt_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def __init__(
model_api_base=None,
is_test=False,
test_num=5,
accumulated_num=1,
prompt_template=None,
):
self.txt_name = txt_name
Expand Down
100 changes: 93 additions & 7 deletions book_maker/translator/chatgptapi_translator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import time
import re
from copy import copy

import openai
from os import environ
Expand All @@ -16,24 +18,26 @@ def __init__(self, key, language, api_base=None, prompt_template=None):
prompt_template
or "Please help me to translate,`{text}` to {language}, please return only translated content not include the origin text"
)
self.system_content = environ.get("OPENAI_API_SYS_MSG") or ""

max_num_token = -1

def rotate_key(self):
    """Switch the global OpenAI API key to the next one in the cycle.

    ``self.keys`` is an iterator (presumably an endless ``itertools.cycle``
    over the user-supplied keys — confirm against ``__init__``), so each
    request can round-robin across multiple keys.
    """
    openai.api_key = next(self.keys)

def get_translation(self, text):
self.rotate_key()
content = self.prompt_template.format(text=text, language=self.language)
completion = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{
"role": "system",
"content": environ.get("OPENAI_API_SYS_MSG") or "",
"content": self.system_content,
},
{
"role": "user",
"content": self.prompt_template.format(
text=text, language=self.language
),
"content": content,
},
],
)
Expand All @@ -44,11 +48,19 @@ def get_translation(self, text):
.encode("utf8")
.decode()
)
print("=================================================")
self.max_num_token = max(
self.max_num_token, int(completion["usage"]["total_tokens"])
)
print(
f"{completion['usage']['total_tokens']} {completion['usage']['prompt_tokens']} {completion['usage']['completion_tokens']} {self.max_num_token} (total_token, prompt_token, completion_tokens, max_history_total_token)"
)
return t_text

def translate(self, text):
def translate(self, text, needprint=True):
# todo: Determine whether to print according to the cli option
print(text)
if needprint:
print(re.sub("\n{3,}", "\n\n", text))

try:
t_text = self.get_translation(text)
Expand All @@ -64,5 +76,79 @@ def translate(self, text):
t_text = self.get_translation(text)

# todo: Determine whether to print according to the cli option
print(t_text.strip())
if needprint:
print(re.sub("\n{3,}", "\n\n", t_text))
return t_text

def translate_and_split_lines(self, text):
    """Translate *text* (without echoing it) and return the translated
    result as a list of non-empty, stripped lines."""
    translated = self.translate(text, False)
    return [line.strip() for line in translated.split("\n") if line.strip()]

def translate_list(self, plist):
    """Translate a batch of paragraph tags in one request.

    The paragraphs' texts (with ``<sup>`` footnote markers removed) are
    joined with a 5-newline separator, sent as a single prompt, and the
    reply is split back into lines. If the reply does not contain exactly
    ``len(plist)`` paragraphs, the request is retried up to 3 times and
    mismatches are appended to ``buglog.txt``.

    Returns the list of translated paragraph strings (possibly shorter or
    longer than ``plist`` on persistent failure — the caller tolerates
    that).
    """
    sep = "\n\n\n\n\n"

    # Build the payload with join instead of repeated "+=" (the original
    # string concatenation loop was quadratic and needed a trailing-sep trim).
    chunks = []
    for p in plist:
        temp_p = copy(p)
        for sup in temp_p.find_all("sup"):
            sup.extract()
        chunks.append(temp_p.get_text().strip())
    new_str = sep.join(chunks)

    plist_len = len(plist)

    # BUG FIX: the original did ``self.system_content += instruction`` on
    # every call, so the batch instruction piled up without bound across
    # chunks (and stale paragraph counts lingered in the system message).
    # Cache the pristine base once and rebuild the message from it each call.
    base_system = getattr(self, "_base_system_content", None)
    if base_system is None:
        base_system = self._base_system_content = self.system_content
    self.system_content = (
        base_system
        + f"""Please translate the following paragraphs individually while preserving their original structure(This time it should be exactly {plist_len} paragraphs, no more or less). Only translate the paragraphs provided below:

[Insert first paragraph here]

[Insert second paragraph here]

[Insert third paragraph here]"""
    )

    retry_count = 0
    result_list = self.translate_and_split_lines(new_str)

    # Retry while the paragraph count round-trips wrong, at most 3 times.
    while len(result_list) != plist_len and retry_count < 3:
        print(
            f"bug: {plist_len} -> {len(result_list)} : Number of paragraphs before and after translation"
        )
        sleep_dur = 6
        print(f"sleep for {sleep_dur}s and retry {retry_count+1} ...")
        time.sleep(sleep_dur)
        result_list = self.translate_and_split_lines(new_str)
        retry_count += 1

    state = "success" if len(result_list) == plist_len else "fail"

    if retry_count > 0:
        print(f"retry {state}")
        with open("buglog.txt", "a") as f:
            print(
                f"retry {state}, count = {retry_count}",
                file=f,
            )

    if len(result_list) != plist_len:
        # Persistent mismatch: dump source/translated paragraph pairs to
        # buglog.txt for post-mortem, then continue with what we got.
        newlist = new_str.split(sep)
        with open("buglog.txt", "a") as f:
            for i, src_para in enumerate(newlist):
                print(src_para, file=f)
                print(file=f)
                if i < len(result_list):
                    print(result_list[i], file=f)
                    print(file=f)
                print("=============================", file=f)
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")

        print(
            f"bug: {plist_len} paragraphs of text translated into {len(result_list)} paragraphs"
        )
        print("continue")

    return result_list