Skip to content

Commit

Permalink
feat: refactor and we have no type hint never (#97)
Browse files Browse the repository at this point in the history
  • Loading branch information
yihong0618 authored Mar 7, 2023
1 parent 0bed995 commit cdeaaea
Show file tree
Hide file tree
Showing 15 changed files with 481 additions and 394 deletions.
Empty file added book_maker/__init__.py
Empty file.
1 change: 1 addition & 0 deletions book_maker/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from cli import main
127 changes: 127 additions & 0 deletions book_maker/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import argparse
import os
from os import environ as env

from book_maker.loader import BOOK_LOADER_DICT
from book_maker.translator import MODEL_DICT
from book_maker.utils import LANGUAGES, TO_LANGUAGE_CODE


def main():
    """Parse the command line and produce a bilingual copy of the given book.

    The book's file extension selects a loader from ``BOOK_LOADER_DICT`` and
    ``--model`` selects a translator class from ``MODEL_DICT``. The loader is
    then constructed and its ``make_bilingual_book()`` method is called.

    Side effects:
        When ``--proxy`` is given, ``http_proxy`` and ``https_proxy`` are set
        in ``os.environ`` for the rest of the process.

    Raises:
        Exception: if no OpenAI API key is supplied (neither ``--openai_key``
            nor the ``OPENAI_API_KEY`` environment variable), if the book's
            extension has no registered loader, or if the selected model or
            loader cannot be resolved.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--book_name",
        dest="book_name",
        type=str,
        # Without a book path the extension lookup below would fail with an
        # AttributeError on None; let argparse report the problem instead.
        required=True,
        help="your epub book file path",
    )
    parser.add_argument(
        "--openai_key",
        dest="openai_key",
        type=str,
        default="",
        help="openai api key,if you have more than one key,you can use comma"
        " to split them and you can break through the limitation",
    )
    # NOTE(review): --no_limit is parsed but never read in this function.
    parser.add_argument(
        "--no_limit",
        dest="no_limit",
        action="store_true",
        help="If you are a paying customer you can add it",
    )
    parser.add_argument(
        "--test",
        dest="test",
        action="store_true",
        help="if test we only translat 10 contents you can easily check",
    )
    parser.add_argument(
        "--test_num",
        dest="test_num",
        type=int,
        default=10,
        help="test num for the test",
    )
    parser.add_argument(
        "-m",
        "--model",
        dest="model",
        type=str,
        default="chatgptapi",
        choices=["chatgptapi", "gpt3"],  # support DeepL later
        metavar="MODEL",
        help="Which model to use, available: {%(choices)s}",
    )
    parser.add_argument(
        "--language",
        type=str,
        # Both the LANGUAGES keys and the title-cased TO_LANGUAGE_CODE keys
        # are accepted on the command line.
        choices=sorted(LANGUAGES) + sorted(k.title() for k in TO_LANGUAGE_CODE),
        default="zh-hans",
        metavar="LANGUAGE",
        help="language to translate to, available: {%(choices)s}",
    )
    parser.add_argument(
        "--resume",
        dest="resume",
        action="store_true",
        help="if program accidentally stop you can use this to resume",
    )
    parser.add_argument(
        "-p",
        "--proxy",
        dest="proxy",
        type=str,
        default="",
        help="use proxy like http://127.0.0.1:7890",
    )
    # args to change api_base
    parser.add_argument(
        "--api_base",
        dest="api_base",
        type=str,
        help="replace base url from openapi",
    )

    options = parser.parse_args()

    proxy = options.proxy
    if proxy != "":
        os.environ["http_proxy"] = proxy
        os.environ["https_proxy"] = proxy

    # The command-line key wins; the environment variable is the fallback.
    openai_api_key = options.openai_key or env.get("OPENAI_API_KEY")
    if not openai_api_key:
        raise Exception("Need openai API key, please google how to")

    # Lower-case the extension so "Book.EPUB" selects the same loader as
    # "book.epub"; the path handed to the loader is left untouched.
    book_type = options.book_name.split(".")[-1].lower()
    support_type_list = list(BOOK_LOADER_DICT)
    if book_type not in support_type_list:
        raise Exception(f"now only support {','.join(support_type_list)} files")

    # Explicit raises instead of ``assert``: assertions are removed when
    # Python runs with -O, which would let a None slip through to the call.
    translate_model = MODEL_DICT.get(options.model)
    if translate_model is None:
        raise Exception("Not support model")

    book_loader = BOOK_LOADER_DICT.get(book_type)
    if book_loader is None:
        raise Exception("Not support loader")

    # When the option is a LANGUAGES key, use the mapped value for the
    # prompt; otherwise pass the option through unchanged.
    language = LANGUAGES.get(options.language, options.language)

    # change api_base for issue #42
    model_api_base = options.api_base

    e = book_loader(
        options.book_name,
        translate_model,
        openai_api_key,
        options.resume,
        language=language,
        model_api_base=model_api_base,
        is_test=options.test,
        test_num=options.test_num,
    )
    e.make_bilingual_book()


if __name__ == "__main__":
    main()
6 changes: 6 additions & 0 deletions book_maker/loader/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from book_maker.loader.epub_loader import EPUBBookLoader

# Registry of book loaders, keyed by the book's file extension (without the dot).
BOOK_LOADER_DICT = {
    "epub": EPUBBookLoader,
    # TODO add more here
}
40 changes: 40 additions & 0 deletions book_maker/loader/base_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from abc import ABC, abstractmethod


class BaseBookLoader(ABC):
    """Abstract interface that every book loader has to implement.

    Deriving from ``ABC`` makes the ``@abstractmethod`` markers effective:
    the base class itself, and any subclass that leaves one of the abstract
    methods undefined, cannot be instantiated.
    """

    def __init__(
        self,
        epub_name,
        model,
        key,
        resume,
        language,
        model_api_base=None,
        is_test=False,
        test_num=5,
    ):
        """Declare the constructor signature shared by all loaders.

        The base implementation stores nothing; subclasses are expected to
        keep whatever they need.

        Args:
            epub_name: path of the book file to translate.
            model: translator class to instantiate.
            key: API key string handed to the translator.
            resume: whether to continue from previously saved progress.
            language: target language for the translation.
            model_api_base: optional alternative API base URL.
            is_test: whether to translate only a limited number of items.
            test_num: item limit used when ``is_test`` is true.
        """
        pass

    @staticmethod
    def _is_special_text(text):
        """Return True when ``text`` is all digits or all whitespace."""
        return text.isdigit() or text.isspace()

    @abstractmethod
    def _make_new_book(self, book):
        """Create the output book object from the original ``book``."""
        pass

    @abstractmethod
    def make_bilingual_book(self):
        """Translate the book and write the bilingual result."""
        pass

    @abstractmethod
    def load_state(self):
        """Restore translation progress saved by an earlier run."""
        pass

    @abstractmethod
    def _save_temp_book(self):
        """Write a partial book from the progress made so far."""
        pass

    @abstractmethod
    def _save_progress(self):
        """Persist translation progress so a later run can resume."""
        pass
168 changes: 168 additions & 0 deletions book_maker/loader/epub_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import argparse
import os
import pickle
import sys
from copy import copy
from pathlib import Path

from bs4 import BeautifulSoup as bs
from ebooklib import ITEM_DOCUMENT, epub
from rich import print
from tqdm import tqdm

from .base_loader import BaseBookLoader


class EPUBBookLoader(BaseBookLoader):
    """Translate an EPUB paragraph by paragraph into a bilingual EPUB.

    Every ``<p>`` element of every document item gets a copy inserted right
    after it whose text is the translation. Translated texts are collected in
    ``p_to_save`` and pickled to ``bin_path`` so an interrupted run can be
    resumed.
    """

    def __init__(
        self,
        epub_name,
        model,
        key,
        resume,
        language,
        model_api_base=None,
        is_test=False,
        test_num=5,
    ):
        """Open the book and prepare the translator.

        Args:
            epub_name: path of the EPUB file to translate.
            model: translator class; called as ``model(key, language,
                model_api_base)``.
            key: API key string handed to the translator.
            resume: when true, previously saved progress is loaded.
            language: target language handed to the translator.
            model_api_base: optional API base URL handed to the translator.
            is_test: when true, stop after ``test_num`` paragraphs.
            test_num: paragraph limit used in test mode.
        """
        self.epub_name = epub_name
        self.new_epub = epub.EpubBook()
        self.translate_model = model(key, language, model_api_base)
        self.is_test = is_test
        self.test_num = test_num

        try:
            self.origin_book = epub.read_epub(self.epub_name)
        except Exception:
            # tricky for #71 if you don't know why please check the issue and ignore this
            # when upstream change will TODO fix this
            # The reader's spine loader is replaced for the whole process and
            # the read is retried once.
            def _load_spine(self):
                spine = self.container.find(
                    "{%s}%s" % (epub.NAMESPACES["OPF"], "spine")
                )

                self.book.spine = [
                    (t.get("idref"), t.get("linear", "yes")) for t in spine
                ]
                self.book.set_direction(spine.get("page-progression-direction", None))

            epub.EpubReader._load_spine = _load_spine
            self.origin_book = epub.read_epub(self.epub_name)

        # Translated paragraph texts, in document order.
        self.p_to_save = []
        self.resume = resume
        # Hidden progress file stored next to the book.
        self.bin_path = f"{Path(epub_name).parent}/.{Path(epub_name).stem}.temp.bin"
        if self.resume:
            self.load_state()

    @staticmethod
    def _is_special_text(text):
        """Return True when ``text`` is all digits or all whitespace."""
        return text.isdigit() or text.isspace()

    def _make_new_book(self, book):
        """Return a new ``EpubBook`` sharing metadata, spine and toc with ``book``."""
        new_book = epub.EpubBook()
        new_book.metadata = book.metadata
        new_book.spine = book.spine
        new_book.toc = book.toc
        return new_book

    def make_bilingual_book(self):
        """Translate the book and write ``<name>_bilingual.epub``.

        Progress is pickled every 20 translated paragraphs. On any exception
        or Ctrl-C the error is printed, progress and a temporary bilingual
        book are saved, and the process exits with status 0.
        """
        new_book = self._make_new_book(self.origin_book)
        all_items = list(self.origin_book.get_items())
        all_p_length = sum(
            len(bs(i.content, "html.parser").find_all("p"))
            for i in all_items
            if i.get_type() == ITEM_DOCUMENT
        )
        pbar = tqdm(total=self.test_num if self.is_test else all_p_length)
        index = 0
        p_to_save_len = len(self.p_to_save)
        try:
            for item in all_items:
                if item.get_type() == ITEM_DOCUMENT:
                    soup = bs(item.content, "html.parser")
                    # Must be ``>=`` like the break below: with ``>`` the
                    # first paragraph of the next document was still
                    # translated after the limit had been reached.
                    is_test_done = self.is_test and index >= self.test_num
                    for p in soup.find_all("p"):
                        if is_test_done or not p.text or self._is_special_text(p.text):
                            continue
                        new_p = copy(p)
                        # TODO: translate a batch of paragraphs, then combine
                        # PR welcome here
                        if self.resume and index < p_to_save_len:
                            # Reuse the translation stored by an earlier run.
                            new_p.string = self.p_to_save[index]
                        else:
                            new_p.string = self.translate_model.translate(p.text)
                            self.p_to_save.append(new_p.text)
                        p.insert_after(new_p)
                        index += 1
                        if index % 20 == 0:
                            self._save_progress()
                        # pbar.update(delta) not pbar.update(index)?
                        pbar.update(1)
                        if self.is_test and index >= self.test_num:
                            break
                    item.content = soup.prettify().encode()
                # Non-document items are copied over unchanged.
                new_book.add_item(item)
            name, _ = os.path.splitext(self.epub_name)
            epub.write_epub(f"{name}_bilingual.epub", new_book, {})
        except (KeyboardInterrupt, Exception) as e:
            print(e)
            print("you can resume it next time")
            self._save_progress()
            self._save_temp_book()
            # NOTE(review): exits with status 0 even though the run failed;
            # kept as-is because callers may rely on it.
            sys.exit(0)
        finally:
            # Close the progress bar on the error path as well.
            pbar.close()

    def load_state(self):
        """Load saved translations from ``bin_path`` into ``p_to_save``.

        Raises:
            Exception: if the progress file cannot be read or unpickled.
        """
        try:
            with open(self.bin_path, "rb") as f:
                # NOTE(security): unpickling runs arbitrary code from the
                # file; only load progress files written by this program.
                self.p_to_save = pickle.load(f)
        except Exception as e:
            raise Exception("can not load resume file") from e

    def _save_temp_book(self):
        """Write ``<name>_bilingual_temp.epub`` from the translations so far.

        The book is re-read from disk so the saved translations are inserted
        into unmodified content. Errors while building or writing the
        temporary book are printed and otherwise ignored.
        """
        origin_book_temp = epub.read_epub(
            self.epub_name
        )  # we need a new instance for temp save
        new_temp_book = self._make_new_book(origin_book_temp)
        p_to_save_len = len(self.p_to_save)
        index = 0
        try:
            # Iterate the fresh copy: items of ``self.origin_book`` may
            # already contain inserted translations, which would be counted
            # as source paragraphs and shift every later translation.
            for item in origin_book_temp.get_items():
                if item.get_type() == ITEM_DOCUMENT:
                    soup = (
                        bs(item.content, "xml")
                        if item.file_name.endswith(".xhtml")
                        else bs(item.content, "html.parser")
                    )
                    for p in soup.find_all("p"):
                        if not p.text or self._is_special_text(p.text):
                            continue
                        # TODO: translate a batch of paragraphs, then combine
                        # PR welcome here
                        if index < p_to_save_len:
                            new_p = copy(p)
                            new_p.string = self.p_to_save[index]
                            print(new_p.string)
                            p.insert_after(new_p)
                            index += 1
                        else:
                            # No saved translations left for this document.
                            break
                    # for save temp book
                    item.content = soup.prettify().encode()
                new_temp_book.add_item(item)
            name, _ = os.path.splitext(self.epub_name)
            epub.write_epub(f"{name}_bilingual_temp.epub", new_temp_book, {})
        except Exception as e:
            # TODO handle it
            print(e)

    def _save_progress(self):
        """Pickle ``p_to_save`` to ``bin_path``.

        Raises:
            Exception: if the progress file cannot be written.
        """
        try:
            with open(self.bin_path, "wb") as f:
                pickle.dump(self.p_to_save, f)
        except Exception as e:
            raise Exception("can not save resume file") from e
1 change: 1 addition & 0 deletions book_maker/loader/srt_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""TODO"""
1 change: 1 addition & 0 deletions book_maker/loader/txt_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""TODO"""
8 changes: 8 additions & 0 deletions book_maker/translator/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from book_maker.translator.chatgptapi_translator import ChatGPTAPI
from book_maker.translator.gpt3_translator import GPT3

# Registry of translator classes, keyed by the model name used to look them up.
MODEL_DICT = {
"chatgptapi": ChatGPTAPI,
"gpt3": GPT3,
# register additional translator classes here
}
18 changes: 18 additions & 0 deletions book_maker/translator/base_translator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from abc import abstractmethod


class Base:
    """Common base for translators: stores the key and language, rotates keys."""

    def __init__(self, key, language, api_base=None):
        """Store the API key string and target language.

        Args:
            key: API key string; may hold several comma-separated keys.
            language: target language for translation.
            api_base: accepted for signature compatibility; not used by this
                base class.
        """
        self.key = key
        self.language = language
        # Position of the key that the next get_key() call will return.
        self.current_key_index = 0

    def get_key(self, key_str):
        """Return the next key from a comma-separated list, round-robin.

        Whitespace around each key is stripped and empty entries (for example
        from a trailing comma) are skipped, so ``"k1, k2,"`` rotates between
        ``"k1"`` and ``"k2"``.

        Args:
            key_str: one or more API keys separated by commas.

        Returns:
            The selected key, or ``""`` when ``key_str`` has no usable key.
        """
        keys = [k.strip() for k in key_str.split(",")]
        # Keep a single empty key when nothing usable is left, so an empty
        # input still returns "" as it always did.
        keys = [k for k in keys if k] or [""]
        # The modulo guards against an index left over from a longer key list.
        key = keys[self.current_key_index % len(keys)]
        self.current_key_index = (self.current_key_index + 1) % len(keys)
        return key

    @abstractmethod
    def translate(self, text):
        """Translate ``text``; to be implemented by subclasses."""
        pass
Loading

0 comments on commit cdeaaea

Please sign in to comment.