fix (#90) split text if token larger than 4096 #106

Open. Wants to merge 6 commits into base: main.
Changes from 5 commits
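This PR counts the prompt's tokens with tiktoken before each request; when the count exceeds 4000 (gpt-3.5-turbo's 4096-token context is shared between prompt and completion), it splits the source text on "." and translates the chunks one at a time, concatenating the results. Below is a minimal standalone sketch of the same idea, greedy rather than fixed-split; the function name and its parameters are illustrative, not part of this PR:

import tiktoken

def split_by_token_budget(text, budget=4000, model="gpt-3.5-turbo"):
    """Split text on sentence boundaries so each chunk stays under `budget` tokens."""
    encoding = tiktoken.encoding_for_model(model)
    sentences = text.split(".")
    chunks, current, used = [], [], 0
    for sentence in sentences:
        n = len(encoding.encode(sentence))
        # Close the current chunk once adding this sentence would bust the budget.
        if current and used + n > budget:
            chunks.append(".".join(current))
            current, used = [], 0
        current.append(sentence)
        used += n
    if current:
        chunks.append(".".join(current))
    return chunks

The PR itself takes a simpler fixed-count route: it derives the number of requests from the total token count and slices the sentence list evenly, which avoids re-encoding every sentence.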
53 changes: 50 additions & 3 deletions book_maker/translator/chatgptapi_translator.py
@@ -4,6 +4,7 @@
from os import environ

from .base_translator import Base
from ..utils import num_tokens_from_messages


PROMPT_ENV_MAP = {
@@ -60,9 +61,54 @@ def get_translation(self, text):
            }
        )

        count_tokens = num_tokens_from_messages(messages)
        consumed_tokens = 0
        t_text = ""
        if count_tokens > 4000:
            print("too long!")

            # Number of roughly-4000-token requests needed for this text.
            splits = count_tokens // 4000 + 1

            # Split on sentence boundaries and size each chunk so that all
            # sentences are covered across `splits` requests.
            text_list = text.split(".")
            chunk_size = len(text_list) // splits + 1
            sub_text = ""
            t_sub_text = ""
            for n in range(splits):
                text_segment = text_list[n * chunk_size : (n + 1) * chunk_size]
                sub_text = ".".join(text_segment)
                print(sub_text)
                message_log = []

                if self.prompt_sys_msg:
                    message_log.append(
                        {"role": "system", "content": self.prompt_sys_msg},
                    )

                message_log.append(
                    {
                        "role": "user",
                        "content": self.prompt_template.format(
                            text=sub_text, language=self.language
                        ),
                    }
                )

                t_sub_text, completion = self.call_chatgpt(message_log)
                print(t_sub_text)
                consumed_tokens += completion["usage"]["prompt_tokens"]

                t_text = t_text + t_sub_text

        else:
            t_text, completion = self.call_chatgpt(messages)
            consumed_tokens += completion["usage"]["prompt_tokens"]

        print(f"{consumed_tokens} prompt tokens used.")
        return t_text

    def call_chatgpt(self, message_log):
        completion = openai.ChatCompletion.create(
-            model="gpt-3.5-turbo",
-            messages=messages,
+            model="gpt-3.5-turbo", messages=message_log
        )
        t_text = (
            completion["choices"][0]
@@ -71,7 +117,8 @@ def get_translation(self, text):
            .encode("utf8")
            .decode()
        )
-        return t_text
+        return t_text, completion

    def translate(self, text):
        # todo: Determine whether to print according to the cli option
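With call_chatgpt now returning the completion object alongside the translated text, callers can tally billed tokens from the response's usage block. A usage sketch, where `translator` stands in for a ChatGPTAPI instance and the usage fields follow the OpenAI chat completion response format:

t_text, completion = translator.call_chatgpt(message_log)
prompt_tokens = completion["usage"]["prompt_tokens"]
completion_tokens = completion["usage"]["completion_tokens"]
print(f"{prompt_tokens + completion_tokens} tokens billed for this call.")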
27 changes: 27 additions & 0 deletions book_maker/utils.py
@@ -118,6 +118,33 @@
"castilian": "es",
}

import tiktoken


def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
    """Returns the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo":  # note: future models may deviate from this
        num_tokens = 0
        for message in messages:
            num_tokens += (
                4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
            )
            for key, value in message.items():
                num_tokens += len(encoding.encode(value))
                if key == "name":  # if there's a name, the role is omitted
                    num_tokens += -1  # role is always required and always 1 token
        num_tokens += 2  # every reply is primed with <im_start>assistant
        return num_tokens
    else:
        # Raise instead of printing and returning None, so callers never
        # compare None against the token limit.
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not presently implemented for model {model}.
See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )


def prompt_config_to_kwargs(prompt_config):
    prompt_config = prompt_config or {}
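num_tokens_from_messages follows OpenAI's published counting recipe for gpt-3.5-turbo: roughly 4 tokens of framing per message, plus the encoded content, plus 2 tokens priming the assistant's reply. A quick check of the helper as added here:

from book_maker.utils import num_tokens_from_messages

messages = [
    {"role": "system", "content": "You are a translator."},
    {"role": "user", "content": "Please translate: Hello, world."},
]
print(num_tokens_from_messages(messages))  # small prompts land well under 4000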
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,5 +1,6 @@
bs4
openai
tiktoken
requests
ebooklib
rich