Skip to content

Commit

Permalink
translate in splits for token counts larger than 4096
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffery9 committed Mar 8, 2023
1 parent 30b34a2 commit f7b3daa
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 43 deletions.
127 changes: 87 additions & 40 deletions book_maker/translator/chatgptapi_translator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import time

import openai
from ..utils import num_tokens_from_messages

from .base_translator import Base

Expand All @@ -18,45 +19,91 @@ def rotate_key(self):
def translate(self, text):
    """Translate *text* into ``self.language`` via the OpenAI chat API.

    Inputs whose prompt exceeds ~4000 tokens (near the gpt-3.5-turbo 4096
    context limit) are split on sentence boundaries (".") and translated
    chunk by chunk, then the translated chunks are concatenated.

    Returns the translated text as a str.
    """
    print(text)
    self.rotate_key()
    message_log = [
        {
            "role": "user",
            # english prompt here to save tokens
            "content": f"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text",
        }
    ]
    count_tokens = num_tokens_from_messages(message_log)
    t_text = ""
    if count_tokens > 4000:
        print("too long!")
        splits = count_tokens // 4000 + 1
        text_list = text.split(".")
        # BUG FIX: the original sliced text_list[n*splits:(n+1)*splits],
        # using the *number of chunks* as the chunk size, so only
        # splits*splits sentences were ever translated and the rest of the
        # text was silently dropped. Step by the per-chunk sentence count
        # (ceiling division) so every sentence is covered exactly once.
        chunk_size = -(-len(text_list) // splits)
        for start in range(0, len(text_list), chunk_size):
            sub_text = ".".join(text_list[start : start + chunk_size])
            print(sub_text)
            t_sub_text = self._translate_with_retry(sub_text)
            print(t_sub_text)
            t_text = t_text + t_sub_text
    else:
        t_text = self._translate_with_retry(text)
    print(t_text)
    return t_text

def _translate_with_retry(self, content):
    """Call the chat API once; on failure sleep out the rate limit,
    rotate to the next API key, and retry once.

    The original code only retried in the short-text branch; factoring the
    retry here makes long-text chunks equally resilient.
    """
    try:
        return self._chat_translate(content)
    except Exception as e:
        # TIME LIMIT for open api please pay
        key_len = self.key.count(",") + 1
        sleep_time = int(60 / key_len)
        time.sleep(sleep_time)
        print(e, f"will sleep {sleep_time} seconds")
        self.rotate_key()
        return self._chat_translate(content)

def _chat_translate(self, content):
    """Send one translation request for *content* and return the reply text."""
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                # english prompt here to save tokens
                "content": f"Please help me to translate,`{content}` to {self.language}, please return only translated content not include the origin text",
            }
        ],
    )
    return (
        completion["choices"][0]
        .get("message")
        .get("content")
        .encode("utf8")
        .decode()
    )
11 changes: 8 additions & 3 deletions book_maker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@

import tiktoken


def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
"""Returns the number of tokens used by a list of messages."""
try:
Expand All @@ -129,13 +130,17 @@ def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
if model == "gpt-3.5-turbo": # note: future models may deviate from this
num_tokens = 0
for message in messages:
num_tokens += 4 # every message follows <im_start>{role/name}\n{content}<im_end>\n
num_tokens += (
4 # every message follows <im_start>{role/name}\n{content}<im_end>\n
)
for key, value in message.items():
num_tokens += len(encoding.encode(value))
if key == "name": # if there's a name, the role is omitted
num_tokens += -1 # role is always required and always 1 token
num_tokens += 2 # every reply is primed with <im_start>assistant
return num_tokens
else:
raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}.
See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
raise NotImplementedError(
f"""num_tokens_from_messages() is not presently implemented for model {model}.
See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
)

0 comments on commit f7b3daa

Please sign in to comment.