Skip to content

Commit

Permalink
translate in splits for token counts larger than 4096
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffery9 committed Mar 8, 2023
1 parent 30b34a2 commit f7b3daa
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 43 deletions.
127 changes: 87 additions & 40 deletions book_maker/translator/chatgptapi_translator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import time

import openai
from ..utils import num_tokens_from_messages

from .base_translator import Base

Expand All @@ -18,45 +19,91 @@ def rotate_key(self):
def translate(self, text):
    """Translate *text* into ``self.language`` via the OpenAI chat API.

    Inputs whose prompt exceeds ~4000 tokens (near the gpt-3.5-turbo 4096
    context limit) are split on sentence boundaries (".") and translated
    chunk by chunk, then the translated chunks are concatenated.

    Returns the translated text as a str.
    """
    print(text)
    self.rotate_key()
    message_log = [
        {
            "role": "user",
            # english prompt here to save tokens
            "content": f"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text",
        }
    ]
    count_tokens = num_tokens_from_messages(message_log)
    t_text = ""
    if count_tokens > 4000:
        print("too long!")
        splits = count_tokens // 4000 + 1
        text_list = text.split(".")
        # BUG FIX: the original sliced text_list[n*splits:(n+1)*splits],
        # using the *number of chunks* as the chunk size, so only
        # splits*splits sentences were ever translated and the rest of the
        # text was silently dropped. Step by the per-chunk sentence count
        # (ceiling division) so every sentence is covered exactly once.
        chunk_size = -(-len(text_list) // splits)
        for start in range(0, len(text_list), chunk_size):
            sub_text = ".".join(text_list[start : start + chunk_size])
            print(sub_text)
            t_sub_text = self._translate_with_retry(sub_text)
            print(t_sub_text)
            t_text = t_text + t_sub_text
    else:
        t_text = self._translate_with_retry(text)
    print(t_text)
    return t_text

def _translate_with_retry(self, content):
    """Call the chat API once; on failure sleep out the rate limit,
    rotate to the next API key, and retry once.

    The original code only retried in the short-text branch; factoring the
    retry here makes long-text chunks equally resilient.
    """
    try:
        return self._chat_translate(content)
    except Exception as e:
        # TIME LIMIT for open api please pay
        key_len = self.key.count(",") + 1
        sleep_time = int(60 / key_len)
        time.sleep(sleep_time)
        print(e, f"will sleep {sleep_time} seconds")
        self.rotate_key()
        return self._chat_translate(content)

def _chat_translate(self, content):
    """Send one translation request for *content* and return the reply text."""
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                # english prompt here to save tokens
                "content": f"Please help me to translate,`{content}` to {self.language}, please return only translated content not include the origin text",
            }
        ],
    )
    return (
        completion["choices"][0]
        .get("message")
        .get("content")
        .encode("utf8")
        .decode()
    )
11 changes: 8 additions & 3 deletions book_maker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@

import tiktoken


def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
"""Returns the number of tokens used by a list of messages."""
try:
Expand All @@ -129,13 +130,17 @@ def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
if model == "gpt-3.5-turbo": # note: future models may deviate from this
num_tokens = 0
for message in messages:
num_tokens += 4 # every message follows <im_start>{role/name}\n{content}<im_end>\n
num_tokens += (
4 # every message follows <im_start>{role/name}\n{content}<im_end>\n
)
for key, value in message.items():
num_tokens += len(encoding.encode(value))
if key == "name": # if there's a name, the role is omitted
num_tokens += -1 # role is always required and always 1 token
num_tokens += 2 # every reply is primed with <im_start>assistant
return num_tokens
else:
raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}.
See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
raise NotImplementedError(
f"""num_tokens_from_messages() is not presently implemented for model {model}.
See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
)

0 comments on commit f7b3daa

Please sign in to comment.