From 059a37266d2042bfbf9aab4ced5c9f644853801c Mon Sep 17 00:00:00 2001 From: Weirenlan Date: Sat, 4 Mar 2023 00:06:19 +0800 Subject: [PATCH] Feat/add target lang choices (#12) * Feat: add multiple language choices and a new argument `language` to set the target language Co-authored-by: yihong0618 --- .github/workflows/make_test_ebook.yaml | 2 +- README.md | 9 +- make_book.py | 35 ++++++-- utils.py | 119 +++++++++++++++++++++++++ 4 files changed, 152 insertions(+), 13 deletions(-) create mode 100644 utils.py diff --git a/.github/workflows/make_test_ebook.yaml b/.github/workflows/make_test_ebook.yaml index 1ff50200..23b31ba6 100644 --- a/.github/workflows/make_test_ebook.yaml +++ b/.github/workflows/make_test_ebook.yaml @@ -26,4 +26,4 @@ jobs: - name: make test ebook env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - run: python3 make_book.py --book_name test_books/animal_farm.epub --no_limit --test --test_num 2 + run: python3 make_book.py --book_name test_books/animal_farm.epub --no_limit --test --test_num 2 --language zh-hans diff --git a/README.md b/README.md index 100f0604..09d01c80 100644 --- a/README.md +++ b/README.md @@ -19,16 +19,19 @@ Make bilingual epub books Using AI translate 3. 本地放了一个 animal_farm.epub 给大家测试 4. 默认用了 ChatGPT 模型,用 `--model gpt3` 来使用 gpt3 模型 5. 加了 `--test` 命令如果大家没付费可以加上这个先看看效果(有 limit 稍微有些慢) +6. Set the target language like `--language "Simplified Chinese"`. + Supports ` "Japanese" / "Traditional Chinese" / "German" / "French" / "Korean"`. + Default target language is `"Simplified Chinese"`. For the full list of supported languages, see `LANGUAGES` in [utils.py](./utils.py). e.g. 
```shell # 如果你想快速测一下 -python3 make_book.py --book_name test_books/animal_farm.epub --openai_key ${openai_key} --no_limit --test +python3 make_book.py --book_name test_books/animal_farm.epub --openai_key ${openai_key} --no_limit --test --language "Simplified Chinese" # or do it -python3 make_book.py --book_name test_books/animal_farm.epub --openai_key ${openai_key} +python3 make_book.py --book_name test_books/animal_farm.epub --openai_key ${openai_key} --language "Simplified Chinese" # or 用 gpt3 模型 export OPENAI_API_KEY=${your_api_key} -python3 make_book.py --book_name test_books/animal_farm.epub --model gpt3 --no_limit +python3 make_book.py --book_name test_books/animal_farm.epub --model gpt3 --no_limit --language "Simplified Chinese" ``` ## 注意 diff --git a/make_book.py b/make_book.py index aa77b02a..719cb0e3 100644 --- a/make_book.py +++ b/make_book.py @@ -11,6 +11,7 @@ from bs4 import BeautifulSoup as bs from ebooklib import epub from rich import print +from utils import LANGUAGES, TO_LANGUAGE_CODE NO_LIMIT = False IS_TEST = False @@ -18,7 +19,7 @@ class Base: - def __init__(self, key): + def __init__(self, key, language): pass @abstractmethod @@ -27,7 +28,7 @@ def translate(self, text): class GPT3(Base): - def __init__(self, key): + def __init__(self, key, language): self.api_key = key self.api_url = "https://api.openai.com/v1/completions" self.headers = { @@ -43,10 +44,11 @@ def __init__(self, key): "top_p": 1, } self.session = requests.session() + self.language = language def translate(self, text): print(text) - self.data["prompt"] = f"Please help me to translate,`{text}` to Chinese" + self.data["prompt"] = f"Please help me to translate,`{text}` to {self.language}" r = self.session.post(self.api_url, headers=self.headers, json=self.data) if not r.ok: return text @@ -64,12 +66,14 @@ def translate(self, text): class ChatGPT(Base): - def __init__(self, key): - super().__init__(key) + def __init__(self, key, language): + super().__init__(key, language) self.key = 
key + self.language = language def translate(self, text): print(text) + print(self.language, "!!!") openai.api_key = self.key try: completion = openai.ChatCompletion.create( @@ -78,7 +82,7 @@ def translate(self, text): { "role": "user", # english prompt here to save tokens - "content": f"Please help me to translate,`{text}` to Chinese, please return only translated content not include the origin text", + "content": f"Please help me to translate,`{text}` to {self.language}, please return only translated content not include the origin text", } ], ) @@ -117,10 +121,10 @@ def translate(self, text): class BEPUB: - def __init__(self, epub_name, model, key, resume): + def __init__(self, epub_name, model, key, resume, language): self.epub_name = epub_name self.new_epub = epub.EpubBook() - self.translate_model = model(key) + self.translate_model = model(key, language) self.origin_book = epub.read_epub(self.epub_name) self.p_to_save = [] self.resume = resume @@ -236,6 +240,14 @@ def save_progress(self): choices=["chatgpt", "gpt3"], # support DeepL later help="Use which model", ) + parser.add_argument( + "--language", + type=str, + choices=sorted(LANGUAGES.keys()) + + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), + default="zh-hans", + help="language to translate to", + ) parser.add_argument( "--resume", dest="resume", @@ -253,5 +265,10 @@ def save_progress(self): if not options.book_name.endswith(".epub"): raise Exception("please use epub file") model = MODEL_DICT.get(options.model, "chatgpt") - e = BEPUB(options.book_name, model, OPENAI_API_KEY, RESUME) + language = options.language + if options.language in LANGUAGES: + # use the value for prompt + language = LANGUAGES.get(language, language) + + e = BEPUB(options.book_name, model, OPENAI_API_KEY, RESUME, language=language) e.make_bilingual_book() diff --git a/utils.py b/utils.py new file mode 100644 index 00000000..acf46264 --- /dev/null +++ b/utils.py @@ -0,0 +1,119 @@ +# Borrowed from : 
https://github.com/openai/whisper +LANGUAGES = { + "en": "english", + "zh-hans": "simplified chinese", + "zh-hant": "traditional chinese", + "de": "german", + "es": "spanish", + "ru": "russian", + "ko": "korean", + "fr": "french", + "ja": "japanese", + "pt": "portuguese", + "tr": "turkish", + "pl": "polish", + "ca": "catalan", + "nl": "dutch", + "ar": "arabic", + "sv": "swedish", + "it": "italian", + "id": "indonesian", + "hi": "hindi", + "fi": "finnish", + "vi": "vietnamese", + "he": "hebrew", + "uk": "ukrainian", + "el": "greek", + "ms": "malay", + "cs": "czech", + "ro": "romanian", + "da": "danish", + "hu": "hungarian", + "ta": "tamil", + "no": "norwegian", + "th": "thai", + "ur": "urdu", + "hr": "croatian", + "bg": "bulgarian", + "lt": "lithuanian", + "la": "latin", + "mi": "maori", + "ml": "malayalam", + "cy": "welsh", + "sk": "slovak", + "te": "telugu", + "fa": "persian", + "lv": "latvian", + "bn": "bengali", + "sr": "serbian", + "az": "azerbaijani", + "sl": "slovenian", + "kn": "kannada", + "et": "estonian", + "mk": "macedonian", + "br": "breton", + "eu": "basque", + "is": "icelandic", + "hy": "armenian", + "ne": "nepali", + "mn": "mongolian", + "bs": "bosnian", + "kk": "kazakh", + "sq": "albanian", + "sw": "swahili", + "gl": "galician", + "mr": "marathi", + "pa": "punjabi", + "si": "sinhala", + "km": "khmer", + "sn": "shona", + "yo": "yoruba", + "so": "somali", + "af": "afrikaans", + "oc": "occitan", + "ka": "georgian", + "be": "belarusian", + "tg": "tajik", + "sd": "sindhi", + "gu": "gujarati", + "am": "amharic", + "yi": "yiddish", + "lo": "lao", + "uz": "uzbek", + "fo": "faroese", + "ht": "haitian creole", + "ps": "pashto", + "tk": "turkmen", + "nn": "nynorsk", + "mt": "maltese", + "sa": "sanskrit", + "lb": "luxembourgish", + "my": "myanmar", + "bo": "tibetan", + "tl": "tagalog", + "mg": "malagasy", + "as": "assamese", + "tt": "tatar", + "haw": "hawaiian", + "ln": "lingala", + "ha": "hausa", + "ba": "bashkir", + "jw": "javanese", + "su": "sundanese", +} + 
+# language code lookup by name, with a few language aliases +TO_LANGUAGE_CODE = { + **{language: code for code, language in LANGUAGES.items()}, + "burmese": "my", + "valencian": "ca", + "flemish": "nl", + "haitian": "ht", + "letzeburgesch": "lb", + "pushto": "ps", + "panjabi": "pa", + "moldavian": "ro", + "moldovan": "ro", + "sinhalese": "si", + "castilian": "es", +}