From 5df47e92bd90d42a6552034a4136cdea4755ae44 Mon Sep 17 00:00:00 2001 From: miro Date: Mon, 25 Mar 2024 22:46:18 +0000 Subject: [PATCH] reformat code --- audiobooker/__init__.py | 242 +---------------- audiobooker/base.py | 245 ++++++++++++++++++ audiobooker/scrappers/__init__.py | 11 +- audiobooker/scrappers/audioanarchy.py | 12 +- audiobooker/scrappers/darkerprojects.py | 2 +- audiobooker/scrappers/goldenaudiobooks.py | 5 +- audiobooker/scrappers/librivox.py | 3 +- audiobooker/scrappers/loyalbooks.py | 49 ++-- .../scrappers/stephenkingaudiobooks.py | 3 +- audiobooker/scrappers/storynory.py | 18 +- audiobooker/scrappers/thoughtaudio.py | 8 +- 11 files changed, 303 insertions(+), 295 deletions(-) create mode 100644 audiobooker/base.py diff --git a/audiobooker/__init__.py b/audiobooker/__init__.py index 2e9f5ed..769d9f9 100644 --- a/audiobooker/__init__.py +++ b/audiobooker/__init__.py @@ -1,245 +1,5 @@ -import json -import subprocess -from bs4 import BeautifulSoup -from requests_cache import CachedSession -from datetime import timedelta - +from audiobooker.base import BookTag, BookAuthor, AudioBook from audiobooker.exceptions import UnknownAuthorIdException, \ UnknownBookIdException, UnknownDurationError, ScrappingError, \ UnknownGenreIdException, UnknownAuthorException, UnknownBookException, \ UnknownGenreException, ParseErrorException - -expire_after = timedelta(hours=1) -session = CachedSession(backend='memory', expire_after=expire_after) - - -class BookTag: - def __init__(self, name="", tag_id="", url="", from_data=None): - self.name = name - self.tag_id = tag_id or name - self.url = url - if from_data: - self.from_json(from_data) - - @property - def as_json(self): - return {"name": self.name, - "id": self.tag_id, - "url": self.url} - - def from_json(self, json_data): - if isinstance(json_data, str): - json_data = json.loads(json_data) - if isinstance(json_data, BookTag): - json_data = json_data.as_json - if not isinstance(json_data, dict): - raise TypeError - self.name = json_data.get("name", self.name) - self.tag_id = json_data.get("id", self.tag_id) or self.name - self.url = json_data.get("url", self.url) - - def __str__(self): - return self.name - - def __repr__(self): - return "BookGenre(" + str(self) + ", " + self.tag_id + ")" - - -class BookAuthor: - def __init__(self, first_name="", last_name="", author_id="", url="", - from_data=None): - self.first_name = first_name - self.last_name = last_name - self.first_name, self.last_name = self.normalize_name() - self.author_id = author_id - self.url = url - if from_data: - self.from_json(from_data) - - def normalize_name(self): - author = " ".join([self.first_name, self.last_name]) - names = author.split(" ") - last_name = " ".join(names[1:]) - first_name = names[0] - return first_name, last_name - - def from_json(self, json_data): - if isinstance(json_data, str): - try: - json_data = json.loads(json_data) - except: - json_data = {"last_name": json_data} - if isinstance(json_data, BookAuthor): - json_data = json_data.as_json - if not isinstance(json_data, dict): - print(json_data, type(json_data)) - raise TypeError - self.first_name = json_data.get("first_name", self.first_name) - self.last_name = json_data.get("last_name", self.last_name) - self.first_name, self.last_name = self.normalize_name() - self.author_id = json_data.get("id", self.author_id) - self.url = json_data.get("url", self.url) - - @property - def as_json(self): - return {"first_name": self.first_name, "last_name": self.last_name, - "id": self.author_id, "url": self.url} - - def __str__(self): - return (self.first_name + " " + self.last_name).strip() - - def __repr__(self): - return "BookAuthor(" + str(self) + ", " + self.author_id + ")" - - -class AudioBook: - def __init__(self, title="", authors=None, description="", tags=None, - book_id="", runtime=0, url="", img="", language='english', - from_data=None, stream_list=None, parse=False): - self.img = img - self.url = url - self.title = title - self._authors = authors or [] - self._description = description - self._tags = tags or [] - self.book_id = book_id - self.runtime = runtime - self.lang = language.lower() - self._stream_list = stream_list or [] - if not self.book_id and "/" in self.url: - self.book_id = self.url.split("/")[-1] - elif not self.book_id: - self.book_id = title - if from_data: - self.from_json(from_data) - self.raw = from_data or {} - if parse: - try: - self.from_page() - except: - pass - - def calc_runtime(self, data=None): - raise UnknownDurationError - - def parse_page(self): - raise ParseErrorException - - def from_page(self): - self.raw = self.parse_page() - - @property - def html(self): - try: - return session.get(self.url).text - except Exception as e: - try: - return session.get(self.url, verify=False).text - except: - return None - - @property - def soup(self): - return BeautifulSoup(self.html, "html.parser") - - @property - def description(self): - return self._description.strip() - - @property - def streamer(self): - for s in self._stream_list: - yield s - - @property - def streams(self): - return [s for s in self.streamer] - - def play_sox(self): - self.play("play %1") - - def play_mplayer(self): - self.play("mplayer %1") - - def play_vlc(self): - self.play("cvlc %1 --play-and-exit") - - def play(self, cmd="cvlc %1 --play-and-exit"): - for stream_url in self.streamer: - print("playing", stream_url) - if isinstance(cmd, str): - cmd = cmd.split(" ") - if isinstance(cmd, list): - play_cmd = cmd - for idx, c in enumerate(cmd): - if c == "%1": - play_cmd[idx] = stream_url - subprocess.call(" ".join(play_cmd), shell=True) - else: - raise TypeError - - @property - def authors(self): - authors = [] - for a in self._authors: - if isinstance(a, str): - try: - a = json.loads(a) - except Exception as e: - a = {"last_name": a} - if isinstance(a, dict): - authors += [a] - return [BookAuthor(from_data=a) for a in authors] - - @property - def tags(self): - return [BookTag(from_data=a) for a in self._tags] - - @property - def as_json(self): - bucket = self.raw - bucket["url"] = self.url - bucket["img"] = self.img - bucket["title"] = self.title - bucket["authors"] = self._authors - bucket["description"] = self._description - bucket["tags"] = self._tags - bucket["id"] = self.book_id - bucket["runtime"] = self.runtime - bucket["language"] = self.lang - bucket["streams"] = self.streams - return bucket - - def from_json(self, json_data): - if isinstance(json_data, str): - json_data = json.loads(json_data) - if not isinstance(json_data, dict): - raise TypeError - json_data = json_data or {} - self.url = json_data.get("url", self.url) - self.img = json_data.get("img", - json_data.get("pic", - json_data.get("image", - self.img))) - self.title = json_data.get("title", json_data.get("name", self.title)) - self._authors = json_data.get("authors", self._authors) - self._authors = self._authors or [json_data.get("author", "")] - self._description = json_data.get("description", self._description) - self._tags = json_data.get("tags", self._tags) - self.book_id = json_data.get("id") - self.runtime = json_data.get("runtime", self.runtime) - self.lang = json_data.get('language', - json_data.get('lang', self.lang)).lower() - self._stream_list = json_data.get("streams", self._stream_list) - self.raw = json_data - if not self.book_id and "/" in self.url: - self.book_id = self.url.split("/")[-1] - - def __str__(self): - return self.title - - def __repr__(self): - return "AudioBook(" + str(self) + ", " + self.book_id + ")" - - - diff --git a/audiobooker/base.py b/audiobooker/base.py new file mode 100644 index 0000000..2e9f5ed --- /dev/null +++ b/audiobooker/base.py @@ -0,0 +1,245 @@ +import json +import subprocess +from bs4 import BeautifulSoup +from requests_cache import CachedSession +from datetime import timedelta + +from audiobooker.exceptions import UnknownAuthorIdException, \ + UnknownBookIdException, UnknownDurationError, ScrappingError, \ + UnknownGenreIdException, UnknownAuthorException, UnknownBookException, \ + UnknownGenreException, ParseErrorException + +expire_after = timedelta(hours=1) +session = CachedSession(backend='memory', expire_after=expire_after) + + +class BookTag: + def __init__(self, name="", tag_id="", url="", from_data=None): + self.name = name + self.tag_id = tag_id or name + self.url = url + if from_data: + self.from_json(from_data) + + @property + def as_json(self): + return {"name": self.name, + "id": self.tag_id, + "url": self.url} + + def from_json(self, json_data): + if isinstance(json_data, str): + json_data = json.loads(json_data) + if isinstance(json_data, BookTag): + json_data = json_data.as_json + if not isinstance(json_data, dict): + raise TypeError + self.name = json_data.get("name", self.name) + self.tag_id = json_data.get("id", self.tag_id) or self.name + self.url = json_data.get("url", self.url) + + def __str__(self): + return self.name + + def __repr__(self): + return "BookGenre(" + str(self) + ", " + self.tag_id + ")" + + +class BookAuthor: + def __init__(self, first_name="", last_name="", author_id="", url="", + from_data=None): + self.first_name = first_name + self.last_name = last_name + self.first_name, self.last_name = self.normalize_name() + self.author_id = author_id + self.url = url + if from_data: + self.from_json(from_data) + + def normalize_name(self): + author = " ".join([self.first_name, self.last_name]) + names = author.split(" ") + last_name = " ".join(names[1:]) + first_name = names[0] + return first_name, last_name + + def from_json(self, json_data): + if isinstance(json_data, str): + try: + json_data = json.loads(json_data) + except: + json_data = {"last_name": json_data} + if isinstance(json_data, BookAuthor): + json_data = json_data.as_json + if not isinstance(json_data, dict): + print(json_data, type(json_data)) + raise TypeError + self.first_name = json_data.get("first_name", self.first_name) + self.last_name = json_data.get("last_name", self.last_name) + self.first_name, self.last_name = self.normalize_name() + self.author_id = json_data.get("id", self.author_id) + self.url = json_data.get("url", self.url) + + @property + def as_json(self): + return {"first_name": self.first_name, "last_name": self.last_name, + "id": self.author_id, "url": self.url} + + def __str__(self): + return (self.first_name + " " + self.last_name).strip() + + def __repr__(self): + return "BookAuthor(" + str(self) + ", " + self.author_id + ")" + + +class AudioBook: + def __init__(self, title="", authors=None, description="", tags=None, + book_id="", runtime=0, url="", img="", language='english', + from_data=None, stream_list=None, parse=False): + self.img = img + self.url = url + self.title = title + self._authors = authors or [] + self._description = description + self._tags = tags or [] + self.book_id = book_id + self.runtime = runtime + self.lang = language.lower() + self._stream_list = stream_list or [] + if not self.book_id and "/" in self.url: + self.book_id = self.url.split("/")[-1] + elif not self.book_id: + self.book_id = title + if from_data: + self.from_json(from_data) + self.raw = from_data or {} + if parse: + try: + self.from_page() + except: + pass + + def calc_runtime(self, data=None): + raise UnknownDurationError + + def parse_page(self): + raise ParseErrorException + + def from_page(self): + self.raw = self.parse_page() + + @property + def html(self): + try: + return session.get(self.url).text + except Exception as e: + try: + return session.get(self.url, verify=False).text + except: + return None + + @property + def soup(self): + return BeautifulSoup(self.html, "html.parser") + + @property + def description(self): + return self._description.strip() + + @property + def streamer(self): + for s in self._stream_list: + yield s + + @property + def streams(self): + return [s for s in self.streamer] + + def play_sox(self): + self.play("play %1") + + def play_mplayer(self): + self.play("mplayer %1") + + def play_vlc(self): + self.play("cvlc %1 --play-and-exit") + + def play(self, cmd="cvlc %1 --play-and-exit"): + for stream_url in self.streamer: + print("playing", stream_url) + if isinstance(cmd, str): + cmd = cmd.split(" ") + if isinstance(cmd, list): + play_cmd = cmd + for idx, c in enumerate(cmd): + if c == "%1": + play_cmd[idx] = stream_url + subprocess.call(" ".join(play_cmd), shell=True) + else: + raise TypeError + + @property + def authors(self): + authors = [] + for a in self._authors: + if isinstance(a, str): + try: + a = json.loads(a) + except Exception as e: + a = {"last_name": a} + if isinstance(a, dict): + authors += [a] + return [BookAuthor(from_data=a) for a in authors] + + @property + def tags(self): + return [BookTag(from_data=a) for a in self._tags] + + @property + def as_json(self): + bucket = self.raw + bucket["url"] = self.url + bucket["img"] = self.img + bucket["title"] = self.title + bucket["authors"] = self._authors + bucket["description"] = self._description + bucket["tags"] = self._tags + bucket["id"] = self.book_id + bucket["runtime"] = self.runtime + bucket["language"] = self.lang + bucket["streams"] = self.streams + return bucket + + def from_json(self, json_data): + if isinstance(json_data, str): + json_data = json.loads(json_data) + if not isinstance(json_data, dict): + raise TypeError + json_data = json_data or {} + self.url = json_data.get("url", self.url) + self.img = json_data.get("img", + json_data.get("pic", + json_data.get("image", + self.img))) + self.title = json_data.get("title", json_data.get("name", self.title)) + self._authors = json_data.get("authors", self._authors) + self._authors = self._authors or [json_data.get("author", "")] + self._description = json_data.get("description", self._description) + self._tags = json_data.get("tags", self._tags) + self.book_id = json_data.get("id") + self.runtime = json_data.get("runtime", self.runtime) + self.lang = json_data.get('language', + json_data.get('lang', self.lang)).lower() + self._stream_list = json_data.get("streams", self._stream_list) + self.raw = json_data + if not self.book_id and "/" in self.url: + self.book_id = self.url.split("/")[-1] + + def __str__(self): + return self.title + + def __repr__(self): + return "AudioBook(" + str(self) + ", " + self.book_id + ")" + + + diff --git a/audiobooker/scrappers/__init__.py b/audiobooker/scrappers/__init__.py index aa7eae0..a14d02b 100644 --- a/audiobooker/scrappers/__init__.py +++ b/audiobooker/scrappers/__init__.py @@ -1,10 +1,11 @@ -from bs4 import BeautifulSoup from threading import Thread + +from bs4 import BeautifulSoup from rapidfuzz import process + +from audiobooker.base import session, BookTag from audiobooker.exceptions import UnknownAuthorIdException, \ - UnknownBookIdException, ScrappingError, UnknownGenreIdException, \ - UnknownAuthorException, UnknownBookException, UnknownGenreException -from audiobooker import AudioBook, BookAuthor, session, BookTag + UnknownBookIdException, ScrappingError, UnknownAuthorException, UnknownBookException from audiobooker.utils import random_user_agent @@ -43,7 +44,7 @@ def _get_html(url): return session.get(url, headers={'User-Agent': user_agent}).text except Exception as e: return session.get(url, verify=False, - headers={'User-Agent': user_agent}).text + headers={'User-Agent': user_agent}).text @staticmethod def _get_soup(html): diff --git a/audiobooker/scrappers/audioanarchy.py b/audiobooker/scrappers/audioanarchy.py index 3acf166..43d3b40 100644 --- a/audiobooker/scrappers/audioanarchy.py +++ b/audiobooker/scrappers/audioanarchy.py @@ -1,5 +1,6 @@ import requests -from audiobooker import AudioBook, BookTag, BookAuthor + +from audiobooker.base import AudioBook, BookAuthor from audiobooker.scrappers import AudioBookSource @@ -50,7 +51,7 @@ def __repr__(self): class AudioAnarchy(AudioBookSource): base_url = "http://www.audioanarchy.org" _tags = ["Anarchy"] - _tag_pages = {"Anarchy":'http://www.audioanarchy.org'} + _tag_pages = {"Anarchy": 'http://www.audioanarchy.org'} @staticmethod def _parse_page(html, limit=-1): @@ -92,9 +93,10 @@ def scrap_all_audiobooks(self, limit=-1, offset=0): if __name__ == "__main__": from pprint import pprint - # for book in AudioAnarchy.search_audiobooks(title="Dark Tower"): - # pprint(book.as_json) + + # for book in AudioAnarchy.search_audiobooks(title="Dark Tower"): + # pprint(book.as_json) scraper = AudioAnarchy() - for book in scraper.scrap_popular(): + for book in scraper.scrap_all_audiobooks(): pprint(book.as_json) diff --git a/audiobooker/scrappers/darkerprojects.py b/audiobooker/scrappers/darkerprojects.py index 913da4b..d1de86b 100644 --- a/audiobooker/scrappers/darkerprojects.py +++ b/audiobooker/scrappers/darkerprojects.py @@ -1,7 +1,7 @@ import requests from sitemapparser import SiteMapParser -from audiobooker import AudioBook +from audiobooker.base import AudioBook from audiobooker.scrappers import AudioBookSource diff --git a/audiobooker/scrappers/goldenaudiobooks.py b/audiobooker/scrappers/goldenaudiobooks.py index 1104bfa..5f5b3f8 100644 --- a/audiobooker/scrappers/goldenaudiobooks.py +++ b/audiobooker/scrappers/goldenaudiobooks.py @@ -1,8 +1,9 @@ import requests -from audiobooker import AudioBook, BookTag, BookAuthor -from audiobooker.scrappers import AudioBookSource from sitemapparser import SiteMapParser +from audiobooker.base import AudioBook, BookAuthor +from audiobooker.scrappers import AudioBookSource + class GoldenAudioBooksAudioBook(AudioBook): base_url = "https://goldenaudiobooks.com" diff --git a/audiobooker/scrappers/librivox.py b/audiobooker/scrappers/librivox.py index 2d079ea..2352a29 100644 --- a/audiobooker/scrappers/librivox.py +++ b/audiobooker/scrappers/librivox.py @@ -1,5 +1,6 @@ import feedparser -from audiobooker import AudioBook, BookTag, BookAuthor, session + +from audiobooker.base import AudioBook, BookAuthor, session from audiobooker.scrappers import AudioBookSource diff --git a/audiobooker/scrappers/loyalbooks.py b/audiobooker/scrappers/loyalbooks.py index ec55c76..d9c0c90 100644 --- a/audiobooker/scrappers/loyalbooks.py +++ b/audiobooker/scrappers/loyalbooks.py @@ -1,8 +1,9 @@ import feedparser -from audiobooker import AudioBook, BookTag, BookAuthor -from audiobooker.scrappers import AudioBookSource from sitemapparser import SiteMapParser +from audiobooker.base import AudioBook, BookTag, BookAuthor +from audiobooker.scrappers import AudioBookSource + class LoyalBooksAudioBook(AudioBook): base_url = "http://www.loyalbooks.com" @@ -62,7 +63,7 @@ def parse_page(self): tag = a.text.strip() tag_id = LoyalBooks.get_tag_id(tag) tags.append(BookTag(name=tag, url=url, - tag_id=tag_id)) + tag_id=tag_id)) img = self.soup.find("img", {"itemprop": "image", "class": "cover"}) if img: @@ -232,26 +233,26 @@ def tags(self): LoyalBooks._tags = list(self.tag_pages.keys()) except Exception as e: LoyalBooks._tags = ['Advice', 'Instruction', - 'Ancient Texts', - 'Biography', 'Memoirs', 'Languages', - 'Myths/Legends', 'Holiday', 'Art', - 'Politics', 'Short stories', 'Romance', - 'Essay/Short nonfiction', 'Fiction', - 'Epistolary fiction', 'Science', - 'Nature', 'Dramatic Works', - 'Spy stories', 'History', 'Non-fiction', - 'Historical Fiction', 'Play', 'Children', - 'Satire', 'Humor', - 'Classics (antiquity)', 'Travel', - 'Religion', 'Adventure', 'Animals', - 'Psychology', 'Sea stories', - 'Horror/Ghost stories', 'Fantasy', - 'Cookery', 'Poetry', 'Self Published', - 'Westerns', 'Comedy', 'Music', - 'Economics', 'Fairy tales', 'Tragedy', - 'Teen/Young adult', 'Literature', - 'War stories', 'Science fiction', - 'Philosophy', 'Mystery'] + 'Ancient Texts', + 'Biography', 'Memoirs', 'Languages', + 'Myths/Legends', 'Holiday', 'Art', + 'Politics', 'Short stories', 'Romance', + 'Essay/Short nonfiction', 'Fiction', + 'Epistolary fiction', 'Science', + 'Nature', 'Dramatic Works', + 'Spy stories', 'History', 'Non-fiction', + 'Historical Fiction', 'Play', 'Children', + 'Satire', 'Humor', + 'Classics (antiquity)', 'Travel', + 'Religion', 'Adventure', 'Animals', + 'Psychology', 'Sea stories', + 'Horror/Ghost stories', 'Fantasy', + 'Cookery', 'Poetry', 'Self Published', + 'Westerns', 'Comedy', 'Music', + 'Economics', 'Fairy tales', 'Tragedy', + 'Teen/Young adult', 'Literature', + 'War stories', 'Science fiction', + 'Philosophy', 'Mystery'] return sorted(self._tags) or [] @classmethod @@ -326,7 +327,7 @@ def scrap_by_tag(cls, tag, limit=-1, offset=0): if book is None: continue book._tags = [BookTag(name=tag, url=cls._tag_pages[tag], - tag_id=cls.get_tag_id(tag)).as_json], + tag_id=cls.get_tag_id(tag)).as_json], yield book # check if last page reached diff --git a/audiobooker/scrappers/stephenkingaudiobooks.py b/audiobooker/scrappers/stephenkingaudiobooks.py index 84bf40a..a9b32a1 100644 --- a/audiobooker/scrappers/stephenkingaudiobooks.py +++ b/audiobooker/scrappers/stephenkingaudiobooks.py @@ -1,5 +1,6 @@ import requests -from audiobooker import AudioBook, BookAuthor + +from audiobooker.base import AudioBook, BookAuthor from audiobooker.scrappers import AudioBookSource diff --git a/audiobooker/scrappers/storynory.py b/audiobooker/scrappers/storynory.py index 851b634..f065ada 100644 --- a/audiobooker/scrappers/storynory.py +++ b/audiobooker/scrappers/storynory.py @@ -1,5 +1,6 @@ import requests -from audiobooker import AudioBook, BookTag, BookAuthor + +from audiobooker.base import AudioBook from audiobooker.scrappers import AudioBookSource @@ -104,13 +105,13 @@ def scrap_popular(cls, limit=-1, offset=0): desc = p.text try: book = StoryNoryAudioBook(description=desc, - url=url, - title=img["alt"], - img=img["data-ezsrc"]) + url=url, + title=img["alt"], + img=img["data-ezsrc"]) except: book = StoryNoryAudioBook(description=desc, - url=url, - img=img["src"]) + url=url, + img=img["src"]) book.from_page() # parse book url for streams yield book @@ -139,11 +140,6 @@ def scrap_all_audiobooks(cls, limit=-1, offset=0): if __name__ == "__main__": - from pprint import pprint - # for book in StoryNory.search_audiobooks(title="Dark Tower"): - # pprint(book.as_json) - scraper = StoryNory() for book in scraper.scrap_popular(): print(book.as_json) - diff --git a/audiobooker/scrappers/thoughtaudio.py b/audiobooker/scrappers/thoughtaudio.py index 02c4a15..74361a0 100644 --- a/audiobooker/scrappers/thoughtaudio.py +++ b/audiobooker/scrappers/thoughtaudio.py @@ -1,8 +1,9 @@ import requests -from audiobooker import AudioBook, BookTag, BookAuthor -from audiobooker.scrappers import AudioBookSource from sitemapparser import SiteMapParser +from audiobooker.base import AudioBook +from audiobooker.scrappers import AudioBookSource + class ThoughtAudioAudioBook(AudioBook): base_url = "http://thoughtaudio.com/" @@ -96,7 +97,7 @@ def search_audiobooks(cls, since=None, author=None, title=None, tag=None, return cls._parse_search_page(html) @classmethod - def get_audiobook(cls,book_id): + def get_audiobook(cls, book_id): url = cls.base_url + '/' + book_id book = ThoughtAudioAudioBook(url=url) return book @@ -120,4 +121,3 @@ def scrap_all_audiobooks(cls, limit=-1, offset=0): for book in scraper.scrap_all_audiobooks(): pprint(book.as_json) -