From d599a54e42509b96e2ac540c3f08edd51d97c76d Mon Sep 17 00:00:00 2001 From: miro Date: Wed, 27 Mar 2024 06:38:06 +0000 Subject: [PATCH] v2 --- audiobooker/__init__.py | 2 +- audiobooker/base.py | 251 +--------- audiobooker/scrappers/__init__.py | 200 +++----- audiobooker/scrappers/audioanarchy.py | 101 ++-- audiobooker/scrappers/darkerprojects.py | 146 ++---- audiobooker/scrappers/goldenaudiobooks.py | 256 ++-------- audiobooker/scrappers/hpaudiotales.py | 48 ++ audiobooker/scrappers/librivox.py | 208 ++++---- audiobooker/scrappers/loyalbooks.py | 446 +++--------------- audiobooker/scrappers/sharedaudiobooks.py | 69 +++ .../scrappers/stephenkingaudiobooks.py | 169 +++---- audiobooker/scrappers/storynory.py | 178 +++---- audiobooker/scrappers/thoughtaudio.py | 148 +++--- audiobooker/utils.py | 58 +++ examples/search_librivox.py | 2 +- setup.py | 2 +- test/test_utils.py | 75 +++ 17 files changed, 765 insertions(+), 1594 deletions(-) create mode 100644 audiobooker/scrappers/hpaudiotales.py create mode 100644 audiobooker/scrappers/sharedaudiobooks.py create mode 100644 test/test_utils.py diff --git a/audiobooker/__init__.py b/audiobooker/__init__.py index 769d9f9..7a28859 100644 --- a/audiobooker/__init__.py +++ b/audiobooker/__init__.py @@ -1,4 +1,4 @@ -from audiobooker.base import BookTag, BookAuthor, AudioBook +from audiobooker.base import BookAuthor, AudioBook from audiobooker.exceptions import UnknownAuthorIdException, \ UnknownBookIdException, UnknownDurationError, ScrappingError, \ UnknownGenreIdException, UnknownAuthorException, UnknownBookException, \ diff --git a/audiobooker/base.py b/audiobooker/base.py index 2e9f5ed..a2fc96a 100644 --- a/audiobooker/base.py +++ b/audiobooker/base.py @@ -1,245 +1,34 @@ -import json -import subprocess -from bs4 import BeautifulSoup -from requests_cache import CachedSession -from datetime import timedelta - +from typing import List +from dataclasses import dataclass, field from audiobooker.exceptions import UnknownAuthorIdException, \ UnknownBookIdException, UnknownDurationError, ScrappingError, \ UnknownGenreIdException, UnknownAuthorException, UnknownBookException, \ UnknownGenreException, ParseErrorException -expire_after = timedelta(hours=1) -session = CachedSession(backend='memory', expire_after=expire_after) - - -class BookTag: - def __init__(self, name="", tag_id="", url="", from_data=None): - self.name = name - self.tag_id = tag_id or name - self.url = url - if from_data: - self.from_json(from_data) - - @property - def as_json(self): - return {"name": self.name, - "id": self.tag_id, - "url": self.url} - - def from_json(self, json_data): - if isinstance(json_data, str): - json_data = json.loads(json_data) - if isinstance(json_data, BookTag): - json_data = json_data.as_json - if not isinstance(json_data, dict): - raise TypeError - self.name = json_data.get("name", self.name) - self.tag_id = json_data.get("id", self.tag_id) or self.name - self.url = json_data.get("url", self.url) - - def __str__(self): - return self.name - - def __repr__(self): - return "BookGenre(" + str(self) + ", " + self.tag_id + ")" - +@dataclass class BookAuthor: - def __init__(self, first_name="", last_name="", author_id="", url="", - from_data=None): - self.first_name = first_name - self.last_name = last_name - self.first_name, self.last_name = self.normalize_name() - self.author_id = author_id - self.url = url - if from_data: - self.from_json(from_data) + first_name: str = "" + last_name: str = "" - def normalize_name(self): - author = " ".join([self.first_name, self.last_name]) - names = author.split(" ") - last_name = " ".join(names[1:]) - first_name = names[0] - return first_name, last_name - def from_json(self, json_data): - if isinstance(json_data, str): - try: - json_data = json.loads(json_data) - except: - json_data = {"last_name": json_data} - if isinstance(json_data, BookAuthor): - json_data = json_data.as_json - if not isinstance(json_data, dict): - print(json_data, type(json_data)) - raise TypeError - self.first_name = json_data.get("first_name", self.first_name) - self.last_name = json_data.get("last_name", self.last_name) - self.first_name, self.last_name = self.normalize_name() - self.author_id = json_data.get("id", self.author_id) - self.url = json_data.get("url", self.url) - - @property - def as_json(self): - return {"first_name": self.first_name, "last_name": self.last_name, - "id": self.author_id, "url": self.url} - - def __str__(self): - return (self.first_name + " " + self.last_name).strip() - - def __repr__(self): - return "BookAuthor(" + str(self) + ", " + self.author_id + ")" +@dataclass +class AudiobookNarrator: + first_name: str = "" + last_name: str = "" +@dataclass class AudioBook: - def __init__(self, title="", authors=None, description="", tags=None, - book_id="", runtime=0, url="", img="", language='english', - from_data=None, stream_list=None, parse=False): - self.img = img - self.url = url - self.title = title - self._authors = authors or [] - self._description = description - self._tags = tags or [] - self.book_id = book_id - self.runtime = runtime - self.lang = language.lower() - self._stream_list = stream_list or [] - if not self.book_id and "/" in self.url: - self.book_id = self.url.split("/")[-1] - elif not self.book_id: - self.book_id = title - if from_data: - self.from_json(from_data) - self.raw = from_data or {} - if parse: - try: - self.from_page() - except: - pass - - def calc_runtime(self, data=None): - raise UnknownDurationError - - def parse_page(self): - raise ParseErrorException - - def from_page(self): - self.raw = self.parse_page() - - @property - def html(self): - try: - return session.get(self.url).text - except Exception as e: - try: - return session.get(self.url, verify=False).text - except: - return None - - @property - def soup(self): - return BeautifulSoup(self.html, "html.parser") - - @property - def description(self): - return self._description.strip() - - @property - def streamer(self): - for s in self._stream_list: - yield s - - @property - def streams(self): - return [s for s in self.streamer] - - def play_sox(self): - self.play("play %1") - - def play_mplayer(self): - self.play("mplayer %1") - - def play_vlc(self): - self.play("cvlc %1 --play-and-exit") - - def play(self, cmd="cvlc %1 --play-and-exit"): - for stream_url in self.streamer: - print("playing", stream_url) - if isinstance(cmd, str): - cmd = cmd.split(" ") - if isinstance(cmd, list): - play_cmd = cmd - for idx, c in enumerate(cmd): - if c == "%1": - play_cmd[idx] = stream_url - subprocess.call(" ".join(play_cmd), shell=True) - else: - raise TypeError - - @property - def authors(self): - authors = [] - for a in self._authors: - if isinstance(a, str): - try: - a = json.loads(a) - except Exception as e: - a = {"last_name": a} - if isinstance(a, dict): - authors += [a] - return [BookAuthor(from_data=a) for a in authors] - - @property - def tags(self): - return [BookTag(from_data=a) for a in self._tags] - - @property - def as_json(self): - bucket = self.raw - bucket["url"] = self.url - bucket["img"] = self.img - bucket["title"] = self.title - bucket["authors"] = self._authors - bucket["description"] = self._description - bucket["tags"] = self._tags - bucket["id"] = self.book_id - bucket["runtime"] = self.runtime - bucket["language"] = self.lang - bucket["streams"] = self.streams - return bucket - - def from_json(self, json_data): - if isinstance(json_data, str): - json_data = json.loads(json_data) - if not isinstance(json_data, dict): - raise TypeError - json_data = json_data or {} - self.url = json_data.get("url", self.url) - self.img = json_data.get("img", - json_data.get("pic", - json_data.get("image", - self.img))) - self.title = json_data.get("title", json_data.get("name", self.title)) - self._authors = json_data.get("authors", self._authors) - self._authors = self._authors or [json_data.get("author", "")] - self._description = json_data.get("description", self._description) - self._tags = json_data.get("tags", self._tags) - self.book_id = json_data.get("id") - self.runtime = json_data.get("runtime", self.runtime) - self.lang = json_data.get('language', - json_data.get('lang', self.lang)).lower() - self._stream_list = json_data.get("streams", self._stream_list) - self.raw = json_data - if not self.book_id and "/" in self.url: - self.book_id = self.url.split("/")[-1] - - def __str__(self): - return self.title - - def __repr__(self): - return "AudioBook(" + str(self) + ", " + self.book_id + ")" - + title: str = "" + description: str = "" + image: str = "" + language: str = "" + authors: List[BookAuthor] = field(default_factory=list) + tags: List[str] = field(default_factory=list) + streams: List[str] = field(default_factory=list) + narrator: AudiobookNarrator = None + year: int = 0 + runtime: int = 0 diff --git a/audiobooker/scrappers/__init__.py b/audiobooker/scrappers/__init__.py index a14d02b..e293003 100644 --- a/audiobooker/scrappers/__init__.py +++ b/audiobooker/scrappers/__init__.py @@ -1,149 +1,63 @@ -from threading import Thread - -from bs4 import BeautifulSoup -from rapidfuzz import process - -from audiobooker.base import session, BookTag +import abc +from requests_cache import CachedSession +from datetime import timedelta from audiobooker.exceptions import UnknownAuthorIdException, \ UnknownBookIdException, ScrappingError, UnknownAuthorException, UnknownBookException from audiobooker.utils import random_user_agent +from typing import List, Iterable +from audiobooker.base import AudioBook, BookAuthor, AudiobookNarrator class AudioBookSource: - base_url = "" - popular_url = "" - tags_url = "" - authors_url = "" - search_url = "" - _cache = None - _tags = [] - _tag_pages = {} - - @classmethod - def populate_cache(self, books=None, threaded=False): - if self._cache is None: - if books: - self._cache = books - return - if threaded: - t = Thread(target=self.get_all_audiobooks, - daemon=True).start() - else: - self._cache = self.get_all_audiobooks() - elif books: - self._cache += books - - @property - def tags(self): - return sorted(self._tags) or [] - - @staticmethod - def _get_html(url): - user_agent = random_user_agent() - try: - return session.get(url, headers={'User-Agent': user_agent}).text - except Exception as e: - return session.get(url, verify=False, - headers={'User-Agent': user_agent}).text - - @staticmethod - def _get_soup(html): - return BeautifulSoup(html, "html.parser") - - @classmethod - def scrap_popular(cls, limit=-1, offset=0): - raise ScrappingError - - @property - def tag_pages(self): - return self._tag_pages or {} - - @classmethod - def scrap_tags(cls): - return cls._tag_pages - - @staticmethod - def scrap_all_audiobooks(limit=-1, offset=0): - raise ScrappingError - - @classmethod - def scrap_by_tag(cls, tag, limit=-1, offset=0): - for book in cls.search_audiobooks(tag=tag): - yield book - - @classmethod - def get_all_audiobooks(self, limit=2000, offset=0): - if self._cache is not None: - return self._cache - self._cache = [book for book in self.scrap_all_audiobooks(limit, - offset)] - return self._cache - - @classmethod - def get_tag_id(cls, tag): - if tag in cls._tags: - return str(cls._tags.index(tag)) - tags = [] - for gen in cls.scrap_tags(): - tags.append(gen) - tags = sorted(tags) - return str(tags.index(tag)) - - @classmethod - def get_tag(cls, tag_id): - if tag_id <= len(cls._tags): - tag = cls._tags[tag_id] - else: - tags = [] - for tag in cls.scrap_tags(): - tags.append(tag) - tags = sorted(tags) - tag = tags[tag_id] - return BookTag(tag_id=tag_id, name=tag) - - @staticmethod - def get_audiobook(book_id): - raise UnknownBookIdException - - @staticmethod - def get_author(author_id): - raise UnknownAuthorIdException - - @staticmethod - def get_audiobook_id(book): - raise UnknownBookException - - @staticmethod - def get_author_id(author): - raise UnknownAuthorException - - @classmethod - def search_audiobooks(self, since=None, author=None, title=None, - tag=None, limit=25): - """ - Args: - since: a UNIX timestamp; returns all projects cataloged since that time - author: all records by that author last name - title: all matching titles - tag: all projects of the matching tag - limit: max entries to return (int) - - Returns: - list : list of AudioBook objects - """ - # priority for title matches - alll = self.get_all_audiobooks() - if title: - for res in process.extract(title, alll, limit=limit): - match, score = res - yield match - alll.remove(match) - - # second author matches - if author: - choices = [" ".join([str(a) for a in b.authors]) for b in alll] - for res in process.extract(author, choices, limit=limit): - match, score = res - match = alll[choices.index(match)] - yield match - alll.remove(match) + expire_after = timedelta(hours=1) + session = CachedSession(backend='memory', expire_after=expire_after) + + def search(self, query) -> Iterable[AudioBook]: + # TODO fuzzy match instead + for b in self.search_by_title(query): + yield b + for b in self.search_by_author(query): + yield b + for b in self.search_by_tag(query): + yield b + + def search_by_narrator(self, query) -> Iterable[AudioBook]: + for b in self.iterate_all(): + if b.narrator: + if b.narrator.last_name.lower() in query.lower(): + yield b + + def search_by_author(self, query) -> Iterable[AudioBook]: + for b in self.iterate_all(): + for a in b.authors: + if (a.last_name and a.last_name.lower() in query.lower()) or \ + (a.first_name and a.first_name.lower() in query.lower()): + yield b + + def search_by_title(self, query) -> Iterable[AudioBook]: + for b in self.iterate_all(): + if query.lower() in b.title.lower(): + yield b + + def search_by_tag(self, query) -> Iterable[AudioBook]: + for b in self.iterate_all(): + if query.lower() in [t.lower() for t in b.tags]: + yield b + + @abc.abstractmethod + def iterate_all(self) -> Iterable[AudioBook]: + pass + + def iterate_popular(self) -> Iterable[AudioBook]: + return self.iterate_all() + + def iterate_by_author(self, author) -> Iterable[AudioBook]: + for b in self.iterate_all(): + for a in b.authors: + if a.last_name.lower() in author.lower(): + yield b + + def iterate_by_tag(self, tag) -> Iterable[AudioBook]: + for b in self.iterate_all(): + if tag in b.tags: + yield b diff --git a/audiobooker/scrappers/audioanarchy.py b/audiobooker/scrappers/audioanarchy.py index 43d3b40..50135d5 100644 --- a/audiobooker/scrappers/audioanarchy.py +++ b/audiobooker/scrappers/audioanarchy.py @@ -1,102 +1,57 @@ -import requests +from dataclasses import dataclass from audiobooker.base import AudioBook, BookAuthor from audiobooker.scrappers import AudioBookSource +from audiobooker.utils import get_soup -class AudioAnarchyAudioBook(AudioBook): - base_url = "http://www.audioanarchy.org/" +@dataclass +class AudioAnarchyAudioBook: + url: str + image: str = "" - def parse_page(self): + def parse_page(self) -> AudioBook: + base_url = "http://www.audioanarchy.org/" + soup = get_soup(self.url) streams = [] - for url in self.soup.find_all("a"): + for url in soup.find_all("a"): try: if not url["href"].endswith(".mp3"): continue - streams.append(self.base_url + url["href"]) + streams.append(base_url + url["href"]) except: continue - title = self.soup.find("title").text - author_name = "Audio Anarchy" - authors = [BookAuthor(first_name=author_name)] - img = self.img - return {"authors": authors, - "title": title.strip(), - "streams": streams, - "rating": 0, - "tags": [], - "img": img} - - def from_page(self): - data = self.parse_page() - if not self.title: - self.title = data["title"] - if not self._description: - self._description = data.get("description") or self.title - self.img = data.get("img", self.img) - for tag in data["tags"]: - if tag.as_json not in self._tags: - self._tags.append(tag.as_json) - for author in data["authors"]: - if author.as_json not in self._authors: - self._authors.append(author.as_json) - self._stream_list = data["streams"] - self.raw.update(data) - - def __repr__(self): - return "AudioAnarchyAudioBook(" + str( - self) + ", " + self.book_id + ")" + title = soup.find("title").text.split(" - ")[-1].split(" :: ")[-1] + return AudioBook( + title=title, + streams=streams, + image=self.image, + tags=["Anarchy"], + authors=[BookAuthor(last_name="Audio Anarchy")], + language="en" + ) class AudioAnarchy(AudioBookSource): base_url = "http://www.audioanarchy.org" - _tags = ["Anarchy"] - _tag_pages = {"Anarchy": 'http://www.audioanarchy.org'} - @staticmethod - def _parse_page(html, limit=-1): - soup = AudioAnarchy._get_soup(html) + def iterate_all(self): + soup = get_soup(self.base_url) for entry in soup.find_all("div", {"id": "album"}): try: a = entry.find("a") img = entry.find("img") - book = AudioAnarchyAudioBook(from_data={ - "title": img["alt"], - "url": "https://www.audioanarchy.org/" + a["href"], - "img": "https://www.audioanarchy.org/" + img["src"] - }) - book.from_page() # parse url - yield book + yield AudioAnarchyAudioBook( + url="https://www.audioanarchy.org/" + a["href"], + image="https://www.audioanarchy.org/" + img["src"] + ).parse_page() except: - raise continue - def scrap_popular(self, limit=-1, offset=0): - html = requests.get(self.base_url).text - return AudioAnarchy._parse_page(html) - - @classmethod - def search_audiobooks(cls, since=None, author=None, title=None, tag=None, - limit=25): - html = requests.get(AudioAnarchy.base_url).text - return AudioAnarchy._parse_page(html) - - @classmethod - def get_audiobook(cls, book_id): - url = cls.base_url + '/' + book_id - book = AudioAnarchyAudioBook(url=url) - return book - - def scrap_all_audiobooks(self, limit=-1, offset=0): - return self.scrap_popular() - if __name__ == "__main__": from pprint import pprint - # for book in AudioAnarchy.search_audiobooks(title="Dark Tower"): - # pprint(book.as_json) - scraper = AudioAnarchy() - for book in scraper.scrap_all_audiobooks(): - pprint(book.as_json) + for book in scraper.iterate_all(): + pprint(book) diff --git a/audiobooker/scrappers/darkerprojects.py b/audiobooker/scrappers/darkerprojects.py index b838100..5a12867 100644 --- a/audiobooker/scrappers/darkerprojects.py +++ b/audiobooker/scrappers/darkerprojects.py @@ -1,146 +1,56 @@ -import requests from sitemapparser import SiteMapParser - -from audiobooker.base import AudioBook +from dataclasses import dataclass +from audiobooker.base import AudioBook, BookAuthor from audiobooker.scrappers import AudioBookSource +from audiobooker.utils import get_soup -class DarkerProjectsAudioBook(AudioBook): - base_url = "http://darkerprojects.com" +@dataclass +class DarkerProjectsAudioBook: + url: str def parse_page(self): streams = [] - img = self.img - desc = self.description + soup = get_soup(self.url) + img = "" + desc = "" - for d in self.soup.find("div", {"class": "inner-entry-content"}).find_all("i"): + for d in soup.find("div", {"class": "inner-entry-content"}).find_all("i"): desc = d.text break - for url in self.soup.find_all("img")[1:]: + for url in soup.find_all("img")[1:]: img = url["src"] break - for url in self.soup.find_all("a"): + for url in soup.find_all("a"): if not url.get("href"): continue if url["href"].endswith(".mp3"): if url["href"] not in streams: streams.append(url["href"]) - title = self.soup.find("title").text - - return {"title": title.strip(), - "streams": streams, - "description": desc, - "img": img} + title = soup.find("title").text - def from_page(self): - data = self.parse_page() - self.title = data["title"] - self.img = data.get("img", self.img) - self._stream_list = data["streams"] - self.raw.update(data) - - def __repr__(self): - return "DarkerProjectsAudioBook(" + str( - self) + ", " + self.book_id + ")" + return AudioBook( + title=title, + streams=streams, + image=img, + tags=["audio drama"], + description=desc, + authors=[BookAuthor(last_name="Darker Projects")], + language="en" + ) class DarkerProjects(AudioBookSource): - base_url = "http://darkerprojects.com" - - @classmethod - def _parse_page(cls, html, limit=-1): - soup = cls._get_soup(html) - for entry in soup.find_all("article"): - try: - if not entry.find("div", {"class": "powerpress_player"}): - continue # no audio streams, text only post - a = entry.find("a") - desc = "" - for p in entry.find_all("p"): - desc = p.text - - tags = [] - try: - cat = entry.find("span", {"class": "cat-links"}).find("a") - tags.append({"name": cat.text, "url": cat["href"]}) - except: - pass - dl = entry.find("a", {"class": "powerpress_link_d"}) - yield DarkerProjectsAudioBook( - title=a.text, - description=desc, - stream_list=[dl["href"]], - tags=tags, - url=a["href"] - ) - except: - continue - - @classmethod - def scrap_popular(cls, limit=-1, offset=0): - html = requests.get(cls.base_url).text - return cls._parse_page(html) - - @classmethod - def scrap_tags(cls): - bucket = {} - sm = SiteMapParser('https://darkerprojects.com/wp-sitemap-taxonomies-category-1.xml') # reads /sitemap.xml - urls = sm.get_urls() # returns iterator of sitemapper.Url instances - for url in urls: - url = str(url) - title = url.strip("/").split("/")[-1].replace("-", " ").title() - bucket[title] = url - return bucket - - @classmethod - def scrap_collections(cls, limit=-1, offset=0): - for tag in cls.scrap_tags(): - yield cls.get_collection(tag) - @classmethod - def get_collection(cls, collection): - for tag, url in cls.scrap_tags().items(): - if tag == collection: - html = requests.get(url).text - streams = [] - for book in cls._parse_page(html): - streams += book.streams - streams.reverse() - return DarkerProjectsAudioBook(title=tag, - stream_list=streams, - url=url) - - @classmethod - def search_audiobooks(cls, since=None, author=None, title=None, tag=None, - limit=25): - query = "" - if title: - query += title + " " - if tag: - query += tag + " " - if author: - query += author + " " - html = requests.get(cls.base_url, params={"s": query}).text - return cls._parse_page(html) - - @classmethod - def get_audiobook(cls, book_id): - url = cls.base_url + '/' + book_id - book = DarkerProjectsAudioBook(url=url) - return book - - @classmethod - def scrap_all_audiobooks(cls, limit=-1, offset=0): + def iterate_all(self): sm = SiteMapParser('https://darkerprojects.com/wp-sitemap-posts-post-1.xml') # reads /sitemap.xml urls = sm.get_urls() # returns iterator of sitemapper.Url instances for url in urls: url = str(url) - title = url.strip("/").split("/")[-1].replace("-", " ").title() - book = DarkerProjectsAudioBook(url=url, title=title) - book.from_page() - yield book + book = DarkerProjectsAudioBook(url=url) + yield book.parse_page() if __name__ == "__main__": @@ -148,7 +58,5 @@ def scrap_all_audiobooks(cls, limit=-1, offset=0): scraper = DarkerProjects() - print(scraper.scrap_tags()) - - for book in scraper.scrap_all_audiobooks(): - pprint(book.as_json) + for book in scraper.iterate_all(): + pprint(book) diff --git a/audiobooker/scrappers/goldenaudiobooks.py b/audiobooker/scrappers/goldenaudiobooks.py index 5f5b3f8..6c5a8df 100644 --- a/audiobooker/scrappers/goldenaudiobooks.py +++ b/audiobooker/scrappers/goldenaudiobooks.py @@ -1,240 +1,60 @@ -import requests +from dataclasses import dataclass + from sitemapparser import SiteMapParser from audiobooker.base import AudioBook, BookAuthor from audiobooker.scrappers import AudioBookSource +from audiobooker.utils import get_soup, normalize_name -class GoldenAudioBooksAudioBook(AudioBook): - base_url = "https://goldenaudiobooks.com" +@dataclass +class GoldenAudioBooksAudioBook: + url: str def parse_page(self): - author_name = "goldenaudiobooks" - title = self.soup.find("h1", {"class": "entry-title"}).text - content = self.soup.find("div", {"class": "entry-content"}) - desc = content.find("p").text + soup = get_soup(self.url) + title = soup.find("h1", {"class": "title-page"}).text.replace(" Audiobook", "") + tags = [t for t in soup.find("span", {"class": "post-meta-category"}).text.split(" ") if len(t) > 2] + img = soup.find("figure").find("img")["src"] + + authors = [] + if "–" in title: pts = title.split("–") author_name = pts[0] title = " ".join(pts[1:]) - img = content.find("img")["data-src"] - names = author_name.strip().split(" ") - if len(names): - first_name = names[0].strip() - last_name = " ".join(names[1:]).strip() - if not last_name: - last_name = first_name - first_name = "" - else: - first_name = "" - last_name = author_name.strip() - - authors = [BookAuthor(first_name=first_name, last_name=last_name)] - - streams = [s.find("a").text for s in content.find_all("audio")] - - return {"description": desc, - "authors": authors, - "title": title.strip(), - "streams": streams, - "rating": 0, - "tags": [], - "img": img} - - def from_page(self): - data = self.parse_page() - if not self.title: - self.title = data["title"] - if not self._description: - self._description = data["description"] - - self.img = data.get("img", self.img) - for tag in data["tags"]: - if tag.as_json not in self._tags: - self._tags.append(tag.as_json) - for author in data["authors"]: - if author.as_json not in self._authors: - self._authors.append(author.as_json) - self._stream_list = data["streams"] - self.raw.update(data) - - def __repr__(self): - return "GoldenAudioBooksAudioBook(" + str( - self) + ", " + self.book_id + ")" + f, l = normalize_name(author_name) + authors = [BookAuthor(first_name=f, last_name=l)] -class GoldenAudioBooks(AudioBookSource): - base_url = "https://goldenaudiobooks.com" - popular_url = "https://goldenaudiobooks.com/category/bestsellers" - - @classmethod - def scrap_tags(cls): - bucket = {} - sm = SiteMapParser('https://goldenaudiobook.co/category-sitemap.xml') # reads /sitemap.xml - urls = sm.get_urls() # returns iterator of sitemapper.Url instances - for url in urls: - url = str(url) - title = url.strip("/").split("/")[-1].replace("-", " ").title() - - bucket[title] = url - - return bucket - - @property - def tag_pages(self): - if self._tag_pages is None: - try: - self._tag_pages = self.scrap_tags() - except Exception as e: - self._tag_pages = { - 'Action': 'https://goldenaudiobooks.com/category/action/', - 'Adults': 'https://goldenaudiobooks.com/category/adults-audios/', - 'Adventure': 'https://goldenaudiobooks.com/category/adventure/', - 'Autobiography & Biographies': 'https://goldenaudiobooks.com/category/autobiography-biographies/', - 'Bestsellers': 'https://goldenaudiobooks.com/category/bestsellers/', - 'Business': 'https://goldenaudiobooks.com/category/business/', - 'Children': 'https://goldenaudiobooks.com/category/children/', - 'Classic': 'https://goldenaudiobooks.com/category/classic/', - 'Crime': 'https://goldenaudiobooks.com/category/crime/', - 'Fantasy': 'https://goldenaudiobooks.com/category/audio-fantasy/', - 'General Fiction': 'https://goldenaudiobooks.com/category/general-fiction/', - 'Historical Fiction': 'https://goldenaudiobooks.com/category/historical-fiction/', - 'History': 'https://goldenaudiobooks.com/category/history/', - 'Horror': 'https://goldenaudiobooks.com/category/horror/', - 'Humor': 'https://goldenaudiobooks.com/category/humors/', - 'Literary': 'https://goldenaudiobooks.com/category/literary/', - 'Literature & Fiction': 'https://goldenaudiobooks.com/category/literature-fiction/', - 'Mystery': 'https://goldenaudiobooks.com/category/mystery/', - 'Nonfiction': 'https://goldenaudiobooks.com/category/nonfiction/', - 'Novel': 'https://goldenaudiobooks.com/category/novel/', - 'Other': 'https://goldenaudiobooks.com/category/other/', - 'Paranormal': 'https://goldenaudiobooks.com/category/paranormal-audiobooks/', - 'Philosophy': 'https://goldenaudiobooks.com/category/philosophy/', - 'Romance': 'https://goldenaudiobooks.com/category/audiobooks-romance/', - 'Sci-Fi': 'https://goldenaudiobooks.com/category/science-fiction-audiobooks/', - 'Science': 'https://goldenaudiobooks.com/category/science/', - 'Self-help': 'https://goldenaudiobooks.com/category/self-help/', - 'Short Story': 'https://goldenaudiobooks.com/category/short-story/', - 'Spiritual & Religious': 'https://goldenaudiobooks.com/category/spiritual-religious/', - 'Sports': 'https://goldenaudiobooks.com/category/sports/', - 'Suspense': 'https://goldenaudiobooks.com/category/suspense/', - 'Teen & Young Adult': 'https://goldenaudiobooks.com/category/teen-and-young-adult/', - 'Thriller': 'https://goldenaudiobooks.com/category/thriller/', - 'Uncategorized': 'https://goldenaudiobooks.com/category/uncategorized/', - 'Westerns': 'https://goldenaudiobooks.com/category/westerns/'} - return self._tag_pages or {} - - @classmethod - def _parse_page(cls, html, limit=-1): - soup = cls._get_soup(html) - for entry in soup.find_all("div", {"class": "columns postbox"}): - a = entry.find("a") - img = entry.find("img")["data-src"] - url = a["href"] - title = a["title"] - tags = [] - for a in entry.find("span", {"class": "cat-links"}). \ - find_all("a"): - tags.append({"name": a.text, "url": a["href"]}) - yield GoldenAudioBooksAudioBook(from_data={ - "title": title, - "url": url, - "img": img, - "tags": tags - }) - if limit == -1 or limit > 0: - limit -= 1 - next_page = soup.find("a", {"class": "next page-numbers"}) - if next_page: - html = requests.get(next_page["href"]).text - for ntry in cls._parse_page(html, limit=limit): - yield ntry - - @classmethod - def scrap_by_tag(cls, tag, limit=-1, offset=0): - if tag in cls._tag_pages: - url = cls._tag_pages[tag] - html = requests.get(url).text - for book in cls._parse_page(html): - # TODO inject tag in book obj - yield book - else: - for book in cls.search_audiobooks(tag=tag): - yield book - - @classmethod - def scrap_popular(cls, limit=-1, offset=0): - html = requests.get(cls.popular_url).text - return cls._parse_page(html) - - @classmethod - def search_audiobooks(cls, since=None, author=None, title=None, tag=None, - limit=25): - """ - Args: - since: a UNIX timestamp; returns all projects cataloged since that time - author: all records by that author last name - title: all matching titles - tag: all projects of the matching tag - Yields: - AudioBook objects - """ - query = "" - if title: - query += title + " " - if tag: - query += tag + " " - if author: - query += author + " " - html = requests.get(cls.base_url, - params={"s": query}).text - return cls._parse_page(html) - - @classmethod - def get_audiobook(cls, book_id): - url = cls.base_url + '/' + book_id - book = GoldenAudioBooksAudioBook(url=url) - return book - - @classmethod - def scrap_all_audiobooks(cls, limit=-1, offset=0): - sm = SiteMapParser('https://goldenaudiobook.co/post-sitemap.xml') # reads /sitemap.xml - urls = sm.get_urls() # returns iterator of sitemapper.Url instances - for url in urls: - url = str(url) - title = url.strip("/").split("/")[-1].replace("-", " ").title() - yield GoldenAudioBooksAudioBook(url=url, title=title) - - sm = SiteMapParser('https://goldenaudiobook.co/post-sitemap2.xml') # reads /sitemap.xml - urls = sm.get_urls() # returns iterator of sitemapper.Url instances - for url in urls: - url = str(url) - title = url.strip("/").split("/")[-1].replace("-", " ").title() - yield GoldenAudioBooksAudioBook(url=url, title=title) - + streams = [s.find("a").text for s in soup.find_all("audio")] -if __name__ == "__main__": - from pprint import pprint + return AudioBook( + title=title.strip(), + streams=streams, + image=img, + tags=tags, + authors=authors, + language="en" + ) - book = GoldenAudioBooks.get_audiobook('andy-weir-artemis-audiobook/') - # pprint(book.parse_page()) - for a in book.authors: - # print(a.as_json) - pass - tags = GoldenAudioBooks.scrap_tags() - print(tags) +class GoldenAudioBooks(AudioBookSource): + base_url = "https://goldenaudiobook.co/" - for book in GoldenAudioBooks.search_audiobooks(author="Lovecraft"): - pprint(book.as_json) + def iterate_all(self): + for u in ['https://goldenaudiobook.co/post-sitemap.xml', + 'https://goldenaudiobook.co/post-sitemap2.xml']: + sm = SiteMapParser(u) # reads /sitemap.xml + urls = sm.get_urls() # returns iterator of sitemapper.Url instances + for url in urls: + yield GoldenAudioBooksAudioBook(url=str(url)).parse_page() - scraper = GoldenAudioBooks() - for book in scraper.scrap_popular(): - pprint(book.as_json) - for book in scraper.scrap_by_tag("science-fiction-audiobooks"): - pprint(book.as_json) +if __name__ == "__main__": + from pprint import pprint - for book in scraper.scrap_all_audiobooks(): - pprint(book.as_json) + scraper = GoldenAudioBooks() + for book in scraper.iterate_all(): + pprint(book) diff --git a/audiobooker/scrappers/hpaudiotales.py b/audiobooker/scrappers/hpaudiotales.py new file mode 100644 index 0000000..9d55043 --- /dev/null +++ b/audiobooker/scrappers/hpaudiotales.py @@ -0,0 +1,48 @@ +from dataclasses import dataclass + +from sitemapparser import SiteMapParser + +from audiobooker.base import AudioBook +from audiobooker.scrappers import AudioBookSource +from audiobooker.utils import get_soup + + +@dataclass +class HPTalesAudioBook: + url: str + + def parse_page(self): + soup = get_soup(self.url) + title = soup.find("h1", {"class": "entry-title"}).text + tags = ["Harry Potter", "Fantasy", "Magic"] + + d = soup.find("div", {"class": "audioigniter-root"})["data-tracks-url"] + data = AudioBookSource.session.get(d).json() + tags += list(set(s["subtitle"] for s in data)) + + streams = [s["audio"] for s in data] + + return AudioBook( + title=title.strip(), + streams=streams, + tags=tags, + language="en" + ) + + +class HPTalesAudioBooks(AudioBookSource): + + @classmethod + def iterate_all(cls, limit=-1, offset=0): + sm = SiteMapParser('https://hpaudiotales.com/wp-sitemap-posts-post-1.xml') # reads /sitemap.xml + urls = sm.get_urls() # returns iterator of sitemapper.Url instances + for url in urls: + yield HPTalesAudioBook(url=str(url)).parse_page() + + +if __name__ == "__main__": + from pprint import pprint + + scraper = HPTalesAudioBooks() + for book in scraper.iterate_all(): + pprint(book) diff --git a/audiobooker/scrappers/librivox.py b/audiobooker/scrappers/librivox.py index 2352a29..bdcd7cf 100644 --- a/audiobooker/scrappers/librivox.py +++ b/audiobooker/scrappers/librivox.py @@ -1,116 +1,116 @@ +from typing import Iterable + import feedparser -from audiobooker.base import AudioBook, BookAuthor, session +from audiobooker.base import AudioBook, BookAuthor, AudiobookNarrator from audiobooker.scrappers import AudioBookSource - - -class LibrivoxAudioBook(AudioBook): - def __init__(self, title="", authors=None, description="", tags=None, - book_id="", runtime=0, url="", img="", rss_url="", - copyright_year=0, language='english', from_data=None): - self.rss_url = rss_url - self.copyright_year = copyright_year - AudioBook.__init__(self, title, authors, description, tags, - book_id, runtime, url, img, language, from_data=from_data) - - @property - def description(self): - return self._description.replace("

", "").replace("

", "") \ - .replace("(summary from Wikipedia)", "").strip().rstrip("\"") \ - .lstrip("\"") - - @property - def rss_data(self): - return feedparser.parse(self.rss_url) - - @property - def streamer(self): - for stream in self.rss_data["entries"]: - try: - yield stream['media_content'][0]["url"] - except Exception as e: - print(e) - continue - - def from_json(self, json_data): - AudioBook.from_json(self, json_data) - self.url = json_data.get("url_librivox", self.url) - self.runtime = json_data.get("totaltimesecs", self.runtime) - self.copyright_year = json_data.get("copyright_year", - self.copyright_year) - self.rss_url = json_data.get("url_rss", self.rss_url) - - def __repr__(self): - return "LibrivoxAudioBook(" + str(self) + ", " + self.book_id + ")" +from audiobooker.utils import normalize_name class Librivox(AudioBookSource): base_url = "https://librivox.org/api/feed/audiobooks/?%s&format=json" authors_url = "https://librivox.org/api/feed/authors/?%s&format=json" - @classmethod - def scrap_all_audiobooks(cls, limit=2000, offset=0): - """ - Generator, yields LibrivoxAudioBook objects - Args: - limit: - offset: - """ - url = cls.base_url % \ - ("limit=" + str(limit) + "offset=" + str(offset) + "&extended=1") - json_data = session.get(url).json()['books'] - for k in json_data: - yield LibrivoxAudioBook(from_data=json_data[k]) - - @classmethod - def get_audiobook(cls, book_id): - url = cls.base_url % ("id=" + str(book_id),) - json_data = session.get(url).json()['books'] - return LibrivoxAudioBook(from_data=json_data[0]) - - @classmethod - def get_author(cls, author_id): - url = cls.authors_url % ("id=" + str(author_id),) - json_data = session.get(url).json()["authors"] - return BookAuthor(from_data=json_data[0]) - - @classmethod - def search_audiobooks(cls, since=None, author=None, title=None, tag=None, - limit=25): - """ - Args: - since: a UNIX timestamp; returns all projects cataloged since that time - author: all records by that author last name - title: all matching titles - tag: all projects of the matching tag - - Returns: - list : list of LibrivoxAudioBook objects - """ - searchterm = [] - if limit: - # TODO validate - searchterm.append("limit=" + str(limit)) - if since: - # TODO validate - searchterm.append("since=" + since) - if author: - searchterm.append("author=" + author) - if title: - searchterm.append("title=" + title) - if tag: - # TODO validate - searchterm.append("tag=" + tag) - if not searchterm: - raise TypeError - searchterm = "&".join(searchterm) - url = cls.base_url % (searchterm,) - json_data = session.get(url).json() - if "error" in json_data: - return [] - return [LibrivoxAudioBook(from_data=a) for a in json_data["books"]] + def iterate_all(self, offset=0, max_offset=100000): + url = "https://librivox.org/api/feed/audiobooks" + params = { + "limit": 50, + "offset": offset, + "extended": 1, + "format": "json" + } + json_data = AudioBookSource.session.get(url, params=params).json() + for k in json_data['books']: + for book in self._parse_res(k): + yield book + if offset < max_offset: + offset += 50 + for k in self.iterate_all(offset): + yield k + + def search_by_author(self, query) -> Iterable[AudioBook]: + url = "https://librivox.org/api/feed/audiobooks" + params = { + "author": query, + "limit": 50, + "extended": 1, + "format": "json" + } + json_data = AudioBookSource.session.get(url, params=params).json() + for k in json_data['books']: + for book in self._parse_res(k): + yield book + + def search_by_narrator(self, query) -> Iterable[AudioBook]: + url = "https://librivox.org/api/feed/audiobooks" + params = { + "reader": query, + "limit": 50, + "extended": 1, + "format": "json" + } + json_data = AudioBookSource.session.get(url, params=params).json() + for k in json_data['books']: + for book in self._parse_res(k): + yield book + + def search_by_tag(self, query) -> Iterable[AudioBook]: + url = "https://librivox.org/api/feed/audiobooks" + params = { + "tag": query, + "limit": 50, + "extended": 1, + "format": "json" + } + json_data = AudioBookSource.session.get(url, params=params).json() + for k in json_data['books']: + for book in self._parse_res(k): + yield book + + def search_by_title(self, query) -> Iterable[AudioBook]: + url = "https://librivox.org/api/feed/audiobooks" + params = { + "title": query, + "limit": 50, + "extended": 1, + "format": "json" + } + json_data = AudioBookSource.session.get(url, params=params).json() + for k in json_data['books']: + for book in self._parse_res(k): + yield book + + def _parse_res(self, k): + rss = feedparser.parse(k['url_rss']) + streams = [stream['media_content'][0]["url"] + for stream in rss["entries"]] + + for idx, s in enumerate(k["sections"]): + + if len(s["readers"]) > 1: + narrator = AudiobookNarrator(last_name="Various") + else: + f, l = normalize_name(s["readers"][0]['display_name']) + narrator = AudiobookNarrator(last_name=l, first_name=f) + + yield AudioBook( + streams=[streams[idx]], + narrator=narrator, + tags=[g["name"] for g in k["genres"]], + authors=[BookAuthor(first_name=a["first_name"], + last_name=a["last_name"]) + for a in k["authors"]], + title=k["title"] + " | " + s["title"], + description=k["description"], + year=int(k['copyright_year']), + runtime=s['playtime'], + language=k["language"] # TODO - convert to lang code + ) if __name__ == "__main__": - book = Librivox.search_audiobooks(title="War of the worlds")[0] - book.play_mplayer() + l = Librivox() + for book in l.search_by_title("Art of War"): + print(book) + for book in l.search_by_author("Lovecraft"): + print(book) diff --git a/audiobooker/scrappers/loyalbooks.py b/audiobooker/scrappers/loyalbooks.py index d9c0c90..5d633ed 100644 --- a/audiobooker/scrappers/loyalbooks.py +++ b/audiobooker/scrappers/loyalbooks.py @@ -1,417 +1,83 @@ import feedparser from sitemapparser import SiteMapParser -from audiobooker.base import AudioBook, BookTag, BookAuthor +from audiobooker.base import AudioBook, BookAuthor from audiobooker.scrappers import AudioBookSource - - -class LoyalBooksAudioBook(AudioBook): - base_url = "http://www.loyalbooks.com" - - def __init__(self, title="", authors=None, description="", tags=None, - book_id="", runtime=0, url="", rss_url="", img="", rating=0, - language='english', from_data=None): - self.rss_url = rss_url or url + "/feed" - self.rating = rating - AudioBook.__init__(self, title, authors, description, tags, - book_id, runtime, url, img, language) - self.from_rss() - - def parse_page(self): - title = self.soup.find("span", {"itemprop": "name"}).text - description = self.soup.find("font", - {"class": "book-description"}).text - if self.soup.find(id="star1") is not None: - rating = 1 - elif self.soup.find(id="star2") is not None: - rating = 2 - elif self.soup.find(id="star3") is not None: - rating = 3 - elif self.soup.find(id="star4") is not None: - rating = 4 - elif self.soup.find(id="star5") is not None: - rating = 5 - else: - rating = 0 - author = self.soup.find("font", {"class": "book-author"}) - author_name = author.text.replace("By: ", "") - - names = author_name.split(" ") - if len(names): - first_name = names[0].strip() - last_name = " ".join(names[1:]).strip() - if not last_name: - last_name = first_name - first_name = "" - else: - first_name = "" - last_name = author_name.strip() - - author_url = author.find("a") - if author_url: - author_url = self.base_url + author_url["href"] - - authors = [BookAuthor(url=author_url, first_name=first_name, - last_name=last_name)] - - tags = [] - tags_table = self.soup.find(summary="Genres for this book") - if tags_table: - tags_urls = tags_table.find_all("a") - for a in tags_urls: - url = self.base_url + a["href"] - tag = a.text.strip() - tag_id = LoyalBooks.get_tag_id(tag) - tags.append(BookTag(name=tag, url=url, - tag_id=tag_id)) - - img = self.soup.find("img", {"itemprop": "image", "class": "cover"}) - if img: - img = self.base_url + img["src"] - return {"description": description, "rating": rating, "tags": tags, - "authors": authors, "title": title, "img": img} - - @property - def rss_data(self): - return feedparser.parse(self.rss_url) - - @property - def streamer(self): - for stream in self.rss_data["entries"]: - try: - for url in stream["links"]: - if url["type"] == 'audio/mpeg': - yield url["href"] - except Exception as e: +from audiobooker.utils import normalize_name + + +def calc_runtime(rss_data): + runtime = rss_data["itunes_duration"].split(":") + if len(runtime) == 1: # seconds + return int(runtime[0]) + elif len(runtime) == 2: # minutes : seconds + return int(runtime[1]) + (int(runtime[0]) * 60) + elif len(runtime) == 3: # hours : minutes : seconds + return int(runtime[2]) + (int(runtime[1]) * 60) + \ + (int(runtime[0]) * 120) + return 0 + + +def from_rss(rss_url): + data = feedparser.parse(rss_url) + for rss in data["entries"]: + authors = [] + streams = [s['href'] for s in rss["links"] + if "audio" in s["type"]] + for rss_data in rss["authors"]: + if not rss_data: continue - - def from_json(self, json_data): - AudioBook.from_json(self, json_data) - self.rss_url = json_data.get("url_rss", self.rss_url) - self.rating = json_data.get("rating", self.rating) - - def calc_runtime(self, data=None): - data = data or self.rss_data["entries"] - for rss_data in data: - runtime = rss_data["itunes_duration"].split(":") - if len(runtime) == 1: # seconds - self.runtime += int(runtime[0]) - elif len(runtime) == 2: # minutes : seconds - self.runtime += int(runtime[1]) + (int(runtime[0]) * 60) - elif len(runtime) == 3: # hours : minutes : seconds - self.runtime += int(runtime[2]) + (int(runtime[1]) * 60) + \ - (int(runtime[0]) * 120) - - def from_rss(self): - rss = self.rss_data["entries"] - - if self.runtime < 1: - self.calc_runtime() - - if not self.url: - self.url = rss[0]["link"] - - for rss_data in rss: - first_name = "" - last_name = rss_data["author"] - names = last_name.split(" ") - if len(names) > 1: - first_name = names[0].strip() - last_name = " ".join(names[1:]).strip() - if not last_name: - last_name = first_name - first_name = "" - author = BookAuthor(from_data={"first_name": first_name, - "last_name": last_name}) - if author.as_json not in self._authors: - self._authors.append(author.as_json) - - def from_page(self): - data = self.parse_page() - if self.rating < 1: - self.rating = data["rating"] - if not self.title: - self.title = data["title"] - if not self._description: - self._description = data["description"] - - self.img = data.get("img", self.img) - for tag in data["tags"]: - if tag.as_json not in self._tags: - self._tags.append(tag.as_json) - for author in data["authors"]: - if author.as_json not in self._authors: - self._authors.append(author.as_json) - - def __repr__(self): - return "LoyalBooksAudioBook(" + str(self) + ", " + self.book_id + ")" + f, l = normalize_name(rss_data["name"]) + author = BookAuthor(first_name=f, last_name=l) + authors.append(author) + yield AudioBook( + language=data["feed"]["language"], + description=data["feed"]["summary"], + tags=[t['term'] for t in data["feed"]["tags"]], + image=data["feed"]["image"]["href"], + streams=streams, + title=data["feed"]["title"] + " | " + rss["title"], + runtime=calc_runtime(rss), + authors=authors + ) class LoyalBooks(AudioBookSource): - base_url = "https://www.loyalbooks.com" - popular_url = "https://www.loyalbooks.com" - tags_url = "https://www.loyalbooks.com/tag-menu" - search_url = "https://www.loyalbooks.com/search?q=%s" - - @classmethod - def scrap_tags(cls): - soup = cls._get_soup(cls._get_html(cls.tags_url)) - urls = soup.find("div", {"class": "left"}).find_all("a") - bucket = {} - for url in urls: - tag = url.text - url = url["href"] - if url.startswith("/tag"): - url = "http://www.loyalbooks.com" + url - bucket[tag] = url - cls._tags = list(bucket.keys()) - return bucket - - @property - def tag_pages(self): - if LoyalBooks._tag_pages is None: - try: - LoyalBooks._tag_pages = LoyalBooks.scrap_tags() - except Exception as e: - LoyalBooks._tag_pages = { - 'Adventure': 'http://www.loyalbooks.com/tag/Adventure', - 'Advice': 'http://www.loyalbooks.com/tag/Advice', - 'Ancient Texts': 'http://www.loyalbooks.com/tag/Ancient_Texts', - 'Animals': 'http://www.loyalbooks.com/tag/Animals', - 'Art': 'http://www.loyalbooks.com/tag/Art', - 'Biography': 'http://www.loyalbooks.com/tag/Biography', - 'Children': 'http://www.loyalbooks.com/tag/Children', - 'Classics (antiquity)': 'http://www.loyalbooks.com/tag/Classics_antiquity', - 'Comedy': 'http://www.loyalbooks.com/tag/Comedy', - 'Cookery': 'http://www.loyalbooks.com/tag/Cookery', - 'Dramatic Works': 'http://www.loyalbooks.com/tag/Dramatic_Works', - 'Economics': 'http://www.loyalbooks.com/tag/Economics_Political_Economy', - 'Epistolary fiction': 'http://www.loyalbooks.com/tag/Epistolary_fiction', - 'Essay/Short nonfiction': 'http://www.loyalbooks.com/tag/Essay_Short_nonfiction', - 'Fairy tales': 'http://www.loyalbooks.com/tag/Fairy_tales', - 'Fantasy': 'http://www.loyalbooks.com/tag/Fantasy', - 'Fiction': 'http://www.loyalbooks.com/tag/Fiction', - 'Historical Fiction': 'http://www.loyalbooks.com/tag/Historical_Fiction', - 'History': 'http://www.loyalbooks.com/tag/History', - 'Holiday': 'http://www.loyalbooks.com/tag/Holiday', - 'Horror/Ghost stories': 'http://www.loyalbooks.com/tag/Horror_Ghost_stories', - 'Humor': 'http://www.loyalbooks.com/tag/Humor', - 'Instruction': 'http://www.loyalbooks.com/tag/Instruction', - 'Languages': 'http://www.loyalbooks.com/tag/Languages', - 'Literature': 'http://www.loyalbooks.com/tag/Literature', - 'Memoirs': 'http://www.loyalbooks.com/tag/Memoirs', - 'Music': 'http://www.loyalbooks.com/tag/Music', - 'Mystery': 'http://www.loyalbooks.com/tag/Mystery', - 'Myths/Legends': 'http://www.loyalbooks.com/tag/Myths_Legends', - 'Nature': 'http://www.loyalbooks.com/tag/Nature', - 'Non-fiction': 'http://www.loyalbooks.com/tag/Non-fiction', - 'Philosophy': 'http://www.loyalbooks.com/tag/Philosophy', - 'Play': 'http://www.loyalbooks.com/tag/Play', - 'Poetry': 'http://www.loyalbooks.com/tag/Poetry', - 'Politics': 'http://www.loyalbooks.com/tag/Politics', - 'Psychology': 'http://www.loyalbooks.com/tag/Psychology', - 'Religion': 'http://www.loyalbooks.com/tag/Religion', - 'Romance': 'http://www.loyalbooks.com/tag/Romance', - 'Satire': 'http://www.loyalbooks.com/tag/Satire', - 'Science': 'http://www.loyalbooks.com/tag/Science', - 'Science fiction': 'http://www.loyalbooks.com/tag/Science_fiction', - 'Sea stories': 'http://www.loyalbooks.com/tag/Sea_stories', - 'Self Published': 'http://www.loyalbooks.com/tag/Self-Published', - 'Short stories': 'http://www.loyalbooks.com/tag/Short_stories', - 'Spy stories': 'http://www.loyalbooks.com/tag/Spy_stories', - 'Teen/Young adult': 'http://www.loyalbooks.com/tag/Teen_Young_adult', - 'Tragedy': 'http://www.loyalbooks.com/tag/Tragedy', - 'Travel': 'http://www.loyalbooks.com/tag/Travel', - 'War stories': 'http://www.loyalbooks.com/tag/War_stories', - 'Westerns': 'http://www.loyalbooks.com/tag/Westerns'} - return self._tag_pages or {} - - @property - def tags(self): - if LoyalBooks._tags is None: - try: - LoyalBooks._tags = list(self.tag_pages.keys()) - except Exception as e: - LoyalBooks._tags = ['Advice', 'Instruction', - 'Ancient Texts', - 'Biography', 'Memoirs', 'Languages', - 'Myths/Legends', 'Holiday', 'Art', - 'Politics', 'Short stories', 'Romance', - 'Essay/Short nonfiction', 'Fiction', - 'Epistolary fiction', 'Science', - 'Nature', 'Dramatic Works', - 'Spy stories', 'History', 'Non-fiction', - 'Historical Fiction', 'Play', 'Children', - 'Satire', 'Humor', - 'Classics (antiquity)', 'Travel', - 'Religion', 'Adventure', 'Animals', - 'Psychology', 'Sea stories', - 'Horror/Ghost stories', 'Fantasy', - 'Cookery', 'Poetry', 'Self Published', - 'Westerns', 'Comedy', 'Music', - 'Economics', 'Fairy tales', 'Tragedy', - 'Teen/Young adult', 'Literature', - 'War stories', 'Science fiction', - 'Philosophy', 'Mystery'] - return sorted(self._tags) or [] - - @classmethod - def _parse_book_div(cls, book): - try: - url = cls.base_url + book.find("a")[ - "href"].strip() - img = book.find("img") - if img: - img = cls.base_url + img["src"].strip() - name = book.find("b") - if name: - name = name.text.strip() - author = book.text.replace(name, "").strip() - else: - name, author = book.find("div", {"class": "s-left"}) \ - .text.split(" By: ") - if book.find(id="star1") is not None: - rating = 1 - elif book.find(id="star2") is not None: - rating = 2 - elif book.find(id="star3") is not None: - rating = 3 - elif book.find(id="star4") is not None: - rating = 4 - elif book.find(id="star5") is not None: - rating = 5 - else: - rating = 0 - names = author.split(" ") - if len(names): - first_name = names[0].strip() - last_name = " ".join(names[1:]).strip() - if not last_name: - last_name = first_name - first_name = "" - else: - first_name = "" - last_name = author.strip() - return LoyalBooksAudioBook(title=name.strip(), url=url, - img=img or "", rating=rating, - authors=[BookAuthor( - first_name=first_name, - last_name=last_name).as_json]) - except Exception as e: - pass # probably an add - return None - @classmethod - def scrap_by_tag(cls, tag, limit=-1, offset=0): - """ - Generator, yields AudioBook objects - """ - if tag not in cls._tag_pages: - cls._tag_pages = cls.scrap_tags() - if tag not in cls._tag_pages: - return - - url = cls._tag_pages[tag] + "?page=" + str(offset) - limit = int(limit) - soup = cls._get_soup(cls._get_html(url)) - el = soup.find("table", {"class": "layout2-blue"}) - if el is None: - el = soup.find("table", {"class": "layout3"}) - - books = el.find_all("td", {"class": "layout2-blue"}) - if not len(books): - books = el.find_all("td", {"class": "layout3"}) - - for book in books: - book = cls._parse_book_div(book) - if book is None: - continue - book._tags = [BookTag(name=tag, url=cls._tag_pages[tag], - tag_id=cls.get_tag_id(tag)).as_json], - yield book - - # check if last page reached - pages = soup.find("div", {"class": "result-pages"}).text - if ">" not in pages: - return - - # check if limit crawled - if limit > 0 and int(offset) > limit: - return - - # crawl next page - for book in cls.scrap_by_tag(tag, offset + 1, limit): - yield book - - @classmethod - def scrap_popular(cls, limit=-1, offset=0): - """ - Generator, yields AudioBook objects - """ - soup = cls._get_soup(cls._get_html(cls.popular_url)) - books = soup.find(summary="Audio books").find_all("td") - for b in books: - b = cls._parse_book_div(b) - if b is not None: - yield b - - @classmethod - def search_audiobooks(cls, since=None, author=None, title=None, tag=None, - limit=25): - """ - Args: - since: a UNIX timestamp; returns all projects cataloged since that time - author: all records by that author last name - title: all matching titles - tag: all projects of the matching tag - - Yields: - AudioBook objects - """ - sm = SiteMapParser(f"{LoyalBooks.base_url}/sitemap.xml") # reads /sitemap.xml + def search(self, query): + sm = SiteMapParser("https://www.loyalbooks.com/sitemap.xml") # reads /sitemap.xml for url in sm.get_urls(): url = str(url) - if not url.startswith(f"{LoyalBooks.base_url}/book/"): + if not url.startswith("https://www.loyalbooks.com/book/"): continue t = url.split("/")[-1].replace("-", " ").lower() - if author and author.lower() in t: - yield LoyalBooksAudioBook(url=url, title=t) - elif title and title.lower() in t: - yield LoyalBooksAudioBook(url=url, title=t) + if query.lower() in t: + yield from from_rss(url + "/feed") + + def search_by_narrator(self, query): + return [] # narrator info unavailable + + def search_by_title(self, query): + return self.search(query) - @classmethod - def get_audiobook(cls, book_id): - url = cls.base_url + '/book/' + book_id - return LoyalBooksAudioBook(url=url) + def search_by_author(self, query): + return self.search(query) - def scrap_all_audiobooks(self, limit=-1, offset=0): + def iterate_all(self): """ Generator, yields AudioBook objects """ - sm = SiteMapParser('https://www.loyalbooks.com/sitemap.xml') # reads /sitemap.xml - urls = sm.get_urls() # returns iterator of sitemapper.Url instances - for url in urls: + sm = SiteMapParser("https://www.loyalbooks.com/sitemap.xml") # reads /sitemap.xml + for url in sm.get_urls(): url = str(url) if not url.startswith("https://www.loyalbooks.com/book/"): continue - title = url.split("/")[-1].replace("-", " ").title() - yield LoyalBooksAudioBook(url=url, title=title) + yield from from_rss(url + "/feed") if __name__ == "__main__": from pprint import pprint - for book in LoyalBooks.search_audiobooks(author="Lovecraft"): - pprint(book.as_json) - - scraper = LoyalBooks() - for book in scraper.scrap_popular(): - pprint(book.as_json) - - for book in scraper.scrap_by_tag("Science fiction"): - pprint(book.as_json) + for book in LoyalBooks().search_by_author("lovecraft"): + print(book) - for book in scraper.scrap_all_audiobooks(): - pprint(book.as_json) - pprint(scraper.scrap_tags()) - pprint(scraper.tags) diff --git a/audiobooker/scrappers/sharedaudiobooks.py b/audiobooker/scrappers/sharedaudiobooks.py new file mode 100644 index 0000000..cea4e09 --- /dev/null +++ b/audiobooker/scrappers/sharedaudiobooks.py @@ -0,0 +1,69 @@ +from dataclasses import dataclass + +from sitemapparser import SiteMapParser + +from audiobooker.base import AudioBook, BookAuthor +from audiobooker.scrappers import AudioBookSource +from audiobooker.utils import get_soup, normalize_name + + +@dataclass +class SharedAudioBook: + url: str + image: str = "" + + def parse_page(self): + soup = get_soup(self.url) + title = soup.find("h1", {"class": "entry-title"}).text + tags = [t for t in soup.find("ul", {"class": "post-categories"}).text.split("\n") if t.strip()] + + img = soup.find_all("img")[-1]["src"] + + authors = [] + + if "–" in title: + pts = title.split("–") + author_name = pts[0] + title = " ".join(pts[1:]) + + f, l = normalize_name(author_name) + + authors = [BookAuthor(first_name=f, last_name=l)] + + streams = [s.find("a").text for s in soup.find_all("audio")] + + return AudioBook( + title=title.strip(), + streams=streams, + image=img, + tags=tags, + authors=authors, + language="en" + ) + + +class SharedAudioBooks(AudioBookSource): + + @classmethod + def iterate_all(cls, limit=-1, offset=0): + sm = SiteMapParser('https://sharedaudiobooks.com/post-sitemap.xml') # reads /sitemap.xml + urls = sm.get_urls() # returns iterator of sitemapper.Url instances + for url in urls: + url = str(url) + if url == "https://sharedaudiobooks.com/": + continue + yield SharedAudioBook(url=str(url)).parse_page() + + for i in range(2, 10): + sm = SiteMapParser(f'https://sharedaudiobooks.com/post-sitemap{i}.xml') # reads /sitemap.xml + urls = sm.get_urls() # returns iterator of sitemapper.Url instances + for url in urls: + yield SharedAudioBook(url=str(url)).parse_page() + + +if __name__ == "__main__": + from pprint import pprint + + scraper = SharedAudioBooks() + for book in scraper.iterate_all(): + pprint(book) diff --git a/audiobooker/scrappers/stephenkingaudiobooks.py b/audiobooker/scrappers/stephenkingaudiobooks.py index a9b32a1..fbd8e40 100644 --- a/audiobooker/scrappers/stephenkingaudiobooks.py +++ b/audiobooker/scrappers/stephenkingaudiobooks.py @@ -1,146 +1,97 @@ -import requests +from dataclasses import dataclass +from typing import Iterable -from audiobooker.base import AudioBook, BookAuthor +from sitemapparser import SiteMapParser + +from audiobooker.exceptions import ParseErrorException +from audiobooker.base import AudioBook, BookAuthor, AudiobookNarrator from audiobooker.scrappers import AudioBookSource +from audiobooker.utils import get_soup, extractor_narrator, extract_year -class StephenKingAudioBook(AudioBook): - base_url = "https://stephenkingaudiobooks.com/" +@dataclass +class StephenKingAudioBook: + url: str def parse_page(self): - author_name = "Stephen King" - title = self.soup.find("h1", {"class": "title-page"}).text - content = self.soup.find("div", {"class": "post-single clearfix"}) - desc = content.find("p").text - if "–" in title: - pts = title.split("–") - author_name = pts[0] - title = " ".join(pts[1:]).strip().lstrip(",") + soup = get_soup(self.url) + tags = soup.find("span", {"class": "post-meta-category"}) + title = soup.find("h1", {"class": "title-page"}).text.replace("\xa0", " ") + content = soup.find("div", {"class": "post-single clearfix"}) + desc = content.find("p").text.replace("\xa0", " ") img = content.find("img")["src"] - names = author_name.strip().split(" ") - if len(names): - first_name = names[0].strip() - last_name = " ".join(names[1:]).strip() - if not last_name: - last_name = first_name - first_name = "" + + if "Harry Potter" not in tags.text: + authors = [BookAuthor(first_name="Stephen", last_name="King")] else: - first_name = "" - last_name = author_name.strip() + authors = [BookAuthor(first_name="J.K.", last_name="Rowling")] - authors = [BookAuthor(first_name=first_name, last_name=last_name)] + if "Stephen Fry" in title and "Harry Potter" in tags.text: + narrator = AudiobookNarrator(first_name="Stephen", + last_name="Fry") + else: + narrator = (extractor_narrator(title) or + extractor_narrator(desc)) streams = [s.find("a").text for s in content.find_all("audio")] - return {"description": desc, - "authors": authors, - "title": title.strip(), - "streams": streams, - "rating": 0, - "tags": [], - "img": img} - - def from_page(self): - data = self.parse_page() - if not self.title: - self.title = data["title"] - if not self._description: - self._description = data["description"] - - self.img = data.get("img", self.img) - for tag in data["tags"]: - if tag.as_json not in self._tags: - self._tags.append(tag.as_json) - for author in data["authors"]: - if author.as_json not in self._authors: - self._authors.append(author.as_json) - self._stream_list = data["streams"] - self.raw.update(data) - - def __repr__(self): - return "StephenKingAudioBook(" + str( - self) + ", " + self.book_id + ")" + if not streams: + raise ParseErrorException("No streams found") + return AudioBook( + title=title.replace(" Audiobook", ""), + streams=streams, + description=desc, + narrator=narrator, + image=img, + tags=[], + authors=authors, + year=extract_year(title) or + extract_year(desc), + language="en" + ) class StephenKingAudioBooks(AudioBookSource): base_url = "https://stephenkingaudiobooks.com" - _tags = ["Harry Potter", 'Stephen King'] - _tag_pages = { - "Harry Potter": "https://stephenkingaudiobooks.com/category/harry-potter/", - 'Stephen King': 'https://stephenkingaudiobooks.com/category/stephen-king/'} - @classmethod - def _parse_page(cls, html, limit=-1): - soup = cls._get_soup(html) + def _parse_page(cls,url = "https://stephenkingaudiobooks.com", limit=-1, **params): + soup = get_soup(url, **params) for entry in soup.find_all("article"): try: a = entry.find("a") - img = entry.find("img")["src"] url = a["href"] - title = a["title"] - book = StephenKingAudioBook(title=title, url=url, img=img) - book.from_page() # parse url - yield book + yield StephenKingAudioBook(url=url).parse_page() except: continue if limit == -1 or limit > 0: limit -= 1 next_page = soup.find("div", {"class": "nav-previous"}) if next_page: - html = requests.get(next_page.find("a")["href"]).text - for ntry in cls._parse_page(html, limit=limit): + url = next_page.find("a")["href"] + for ntry in cls._parse_page(url=url, limit=limit, **params): yield ntry - @classmethod - def scrap_by_tag(cls, tag, limit=-1, offset=0): - for book in cls.search_audiobooks(tag=tag): - yield book - - @classmethod - def scrap_popular(cls, limit=-1, offset=0): - html = requests.get(cls.base_url).text - return cls._parse_page(html) - - @classmethod - def search_audiobooks(cls, since=None, author=None, title=None, tag=None, - limit=25): - """ - Args: - since: a UNIX timestamp; returns all projects cataloged since that time - author: all records by that author last name - title: all matching titles - tag: all projects of the matching tag - Yields: - AudioBook objects - """ - query = "" - if title: - query += title + " " - if tag: - query += tag + " " - if author: - query += author + " " - html = requests.get(cls.base_url, params={"s": query}).text - return cls._parse_page(html) - - @classmethod - def get_audiobook(cls, book_id): - url = cls.base_url + '/' + book_id - book = StephenKingAudioBook(url=url) - return book + def search(self, query): + return self._parse_page(params={"s": query}) - @classmethod - def scrap_all_audiobooks(cls, limit=-1, offset=0): - return cls.scrap_popular() + def iterate_all(self): + sm = SiteMapParser('https://stephenkingaudiobook.net/wp-sitemap-posts-post-1.xml') # reads /sitemap.xml + urls = sm.get_urls() # returns iterator of sitemapper.Url instances + for url in urls: + try: + yield StephenKingAudioBook(url=str(url)).parse_page() + except: + pass if __name__ == "__main__": from pprint import pprint - # for book in StephenKingAudioBooks.search_audiobooks(title="Dark Tower"): - # pprint(book.as_json) - scraper = StephenKingAudioBooks() - for book in scraper.scrap_popular(): - pprint(book.as_json) + for book in scraper.search("Dark Tower"): + pprint(book) + + exit() + for book in scraper.iterate_all(): + pprint(book) diff --git a/audiobooker/scrappers/storynory.py b/audiobooker/scrappers/storynory.py index f065ada..97eecfc 100644 --- a/audiobooker/scrappers/storynory.py +++ b/audiobooker/scrappers/storynory.py @@ -1,82 +1,71 @@ +from dataclasses import dataclass + import requests +from sitemapparser import SiteMapParser from audiobooker.base import AudioBook +from audiobooker.exceptions import ParseErrorException from audiobooker.scrappers import AudioBookSource +from audiobooker.utils import get_soup, extractor_narrator -class StoryNoryAudioBook(AudioBook): - base_url = "https://www.storynory.com/" +@dataclass +class StoryNoryAudioBook: + url: str + image: str = "" def parse_page(self): + soup = get_soup(self.url) streams = [] - for url in self.soup.find_all("a"): - if url["href"].endswith(".mp3"): - if url["href"] not in streams: - streams.append(url["href"]) - - title = self.soup.find("title").text - img = self.soup.find("img") - if img.get("data-ezsrc"): - img = img["data-ezsrc"] - elif img.get("src"): - img = img["src"] + for url in soup.find_all("a"): + if not url.get("href"): + continue + if url.get("download") or url["href"].endswith(".mp3"): + url = url["href"] + if url.startswith("//"): + url = "https:" + url + streams.append(url.strip()) + + title = soup.find("title").text.strip().replace(" - Storynory", "") + img = soup.find("img") + if img and img.get("src"): + img = img["src"].strip() + if img.startswith("//"): + img = "https:" + img else: - img = self.img - print(streams) - return {"title": title.strip(), - "streams": streams, - "img": img} - - def from_page(self): - data = self.parse_page() - self.title = data["title"] - self.img = data.get("img", self.img) - self.raw.update(data) - self._stream_list = data["streams"] - - def __repr__(self): - return "StoryNoryAudioBook(" + str( - self) + ", " + self.book_id + ")" + img = self.image + if not streams: + raise ParseErrorException("No streams found") + for d in soup.find_all("p"): + if d.text.lower().startswith("download"): + continue + desc = d.text.split("\n")[0][:100] + break + else: + desc = "" + return AudioBook( + title=title.strip(), + description=desc, + streams=streams, + narrator=extractor_narrator(desc), + image=img, + language="en" + ) class StoryNory(AudioBookSource): - # TODO categories / tags - base_url = "https://www.storynory.com" @classmethod - def _parse_page(cls, html, limit=-1): - soup = cls._get_soup(html) - for entry in soup.find_all("div", {"class": "bf-item"}): - try: - a = entry.find("a") - img = entry.find("img") - book = StoryNoryAudioBook(from_data={ - "title": entry.text, - "url": a["href"], - "img": img["src"] - }) - book.from_page() # parse url - yield book - except: - continue + def _parse_search_page(cls, url="https://www.storynory.com", + limit=-1, **params): + soup = get_soup(url, **params) - @classmethod - def _parse_search_page(cls, html, limit=-1): - soup = cls._get_soup(html) for entry in soup.find_all("div", {"class": "panel-body"}): try: a = entry.find("a") img = entry.find("img") - book = StoryNoryAudioBook(from_data={ - "title": a.text, - "description": entry.find("p").text, - "url": a["href"], - "img": img["src"] if img else "" - }) - print(book) - book.from_page() # parse url - print(book) - yield book + yield StoryNoryAudioBook(url=a["href"], + image= img["src"] if img else "").parse_page() except: continue @@ -84,62 +73,29 @@ def _parse_search_page(cls, html, limit=-1): limit -= 1 next_page = soup.find("li", {"class": "bpn-next-link"}) if next_page: - html = requests.get(next_page.find("a")["href"]).text - for ntry in cls._parse_search_page(html, limit=limit): + url = next_page.find("a")["href"] + for ntry in cls._parse_search_page(url=url, limit=limit, **params): yield ntry - @classmethod - def scrap_popular(cls, limit=-1, offset=0): - html = requests.get(cls.base_url).text - soup = cls._get_soup(html) - for a in soup.find_all("a"): - url = a["href"] - if not url.startswith("https://www.storynory.com/"): - continue - img = a.find("img") - if not img: - continue - p = a.find("p") - desc = "" - if p: - desc = p.text - try: - book = StoryNoryAudioBook(description=desc, - url=url, - title=img["alt"], - img=img["data-ezsrc"]) - except: - book = StoryNoryAudioBook(description=desc, - url=url, - img=img["src"]) - book.from_page() # parse book url for streams - yield book - - @classmethod - def search_audiobooks(cls, since=None, author=None, title=None, tag=None, - limit=25): - query = "" - if title: - query += title + " " - if tag: - query += tag + " " - if author: - query += author + " " - html = requests.get(cls.base_url, params={"s": query}).text - return cls._parse_search_page(html) - - @classmethod - def get_audiobook(cls, book_id): - url = cls.base_url + '/' + book_id - book = StoryNoryAudioBook(url=url) - return book + def search(self, query): + return self._parse_search_page(params={"s": query}) - @classmethod - def scrap_all_audiobooks(cls, limit=-1, offset=0): - return cls.scrap_popular() + def iterate_all(self): + for u in [ + 'https://www.storynory.com/post-sitemap1.xml', + 'https://www.storynory.com/post-sitemap2.xml' + ]: + sm = SiteMapParser(u) # reads /sitemap.xml + urls = sm.get_urls() # returns iterator of sitemapper.Url instances + for url in urls: + try: + yield StoryNoryAudioBook(url=str(url)).parse_page() + except ParseErrorException: + # not a book, just a blog post + continue if __name__ == "__main__": scraper = StoryNory() - for book in scraper.scrap_popular(): - print(book.as_json) + for book in scraper.search("snow white"): + print(book) diff --git a/audiobooker/scrappers/thoughtaudio.py b/audiobooker/scrappers/thoughtaudio.py index 74361a0..77c449f 100644 --- a/audiobooker/scrappers/thoughtaudio.py +++ b/audiobooker/scrappers/thoughtaudio.py @@ -1,123 +1,85 @@ -import requests +from dataclasses import dataclass + from sitemapparser import SiteMapParser -from audiobooker.base import AudioBook +from audiobooker.base import AudioBook, BookAuthor, AudiobookNarrator from audiobooker.scrappers import AudioBookSource +from audiobooker.utils import get_soup, extract_year, normalize_name -class ThoughtAudioAudioBook(AudioBook): - base_url = "http://thoughtaudio.com/" +@dataclass +class ThoughtAudioAudioBook: + url: str def parse_page(self): + soup = get_soup(self.url) streams = [] - for url in self.soup.find_all("a"): + img = None + for url in soup.find_all("a"): if url["href"].endswith(".mp3"): streams.append(url["href"]) - for url in self.soup.find_all("iframe"): + for url in soup.find_all("iframe"): if "youtube" not in url["src"]: continue + vid = url["src"].split("/")[-1].split("?")[0] + img = f"https://img.youtube.com/vi/{vid}/0.jpg" streams.append( url["src"].split("?feature=oembed")[0]. replace("https://www.youtube.com/embed/", "https://www.youtube.com/watch?v=") ) - title = self.soup.find("title").text - img = self.img - - return {"title": title.strip(), - "streams": streams, - "img": img} - - def from_page(self): - data = self.parse_page() - self.title = data["title"] - self.img = data.get("img", self.img) - self._stream_list = data["streams"] - self.raw.update(data) - - def __repr__(self): - return "ThoughtAudioAudioBook(" + str( - self) + ", " + self.book_id + ")" + title = soup.find("title").text.split(" – ThoughtAudio")[0].split(": ")[-1] + + if not title: + title = soup.find("span", {"class": "Text-Head"}).text + + narrator = None + author = None + desc = "" + for s in soup.find_all("p"): + if "WRITTEN BY:" in s.text: + name = s.text.split("WRITTEN BY:")[-1] + f, l = normalize_name(name) + author = BookAuthor(first_name=f, last_name=l) + + elif "NARRATED BY:" in s.text: + name = s.text.split("NARRATED BY:")[-1] + f, l = normalize_name(name) + narrator = AudiobookNarrator(first_name=f, last_name=l) + elif s.text.strip() and narrator and author: + desc = s.text.split("\n")[0] + break + if not img: + pics = soup.find_all("img") + if len(pics) > 1: + img = pics[1] + else: + img = pics[0] + return AudioBook( + title=title.strip(), + streams=streams, + image=img or "", + description=desc, + narrator=narrator, + year=extract_year(desc), + authors=[author] if author else [], + tags=["ThoughtAudio"], + language="en" + ) class ThoughtAudio(AudioBookSource): - base_url = "http://thoughtaudio.com" - _tags = ["Philosophy"] - _tag_pages = {"Philosophy": 'http://thoughtaudio.com'} - - @classmethod - def _parse_page(cls, html, limit=-1): - soup = cls._get_soup(html) - for entry in soup.find_all("div", {"class": "bf-item"}): - try: - a = entry.find("a") - img = entry.find("img") - book = ThoughtAudioAudioBook(from_data={ - "title": entry.text, - "url": a["href"], - "img": img["src"] - }) - book.from_page() # parse url - yield book - except: - continue - @classmethod - def _parse_search_page(cls, html, limit=-1): - soup = cls._get_soup(html) - for entry in soup.find_all("article"): - try: - a = entry.find("a") - img = entry.find("img") - book = ThoughtAudioAudioBook(from_data={ - "title": a.text, - "url": a["href"], - "img": img["src"] - }) - book.from_page() # parse url - yield book - except: - continue - - @classmethod - def scrap_popular(cls, limit=-1, offset=0): - html = requests.get(cls.base_url).text - return cls._parse_page(html) - - @classmethod - def search_audiobooks(cls, since=None, author=None, title=None, tag=None, - limit=25): - query = "" - if title: - query += title + " " - if tag: - query += tag + " " - if author: - query += author + " " - html = requests.get(cls.base_url, params={"s": query}).text - return cls._parse_search_page(html) - - @classmethod - def get_audiobook(cls, book_id): - url = cls.base_url + '/' + book_id - book = ThoughtAudioAudioBook(url=url) - return book - - @classmethod - def scrap_all_audiobooks(cls, limit=-1, offset=0): + def iterate_all(self): sm = SiteMapParser('http://thoughtaudio.com/wp-sitemap-posts-post-1.xml') # reads /sitemap.xml urls = sm.get_urls() # returns iterator of sitemapper.Url instances for url in urls: url = str(url) - title = url.strip("/").split("/")[-1].replace("-", " ").title() - yield ThoughtAudioAudioBook(url=url, title=title) + yield ThoughtAudioAudioBook(url=url).parse_page() if __name__ == "__main__": from pprint import pprint scraper = ThoughtAudio() - for book in scraper.search_audiobooks(title="machine"): - pprint(book.as_json) - - for book in scraper.scrap_all_audiobooks(): - pprint(book.as_json) + for book in scraper.iterate_all(): + pprint(book) diff --git a/audiobooker/utils.py b/audiobooker/utils.py index 6405502..56179a8 100644 --- a/audiobooker/utils.py +++ b/audiobooker/utils.py @@ -1,4 +1,7 @@ import random +import re + +from bs4 import BeautifulSoup USER_AGENTS = [ ('Mozilla/5.0 (X11; Linux x86_64) ' @@ -32,3 +35,58 @@ def random_user_agent(): return random.choice(USER_AGENTS) + + +def get_html(url, **kwargs): + from audiobooker.scrappers import AudioBookSource + try: + return AudioBookSource.session.get(url, **kwargs).text + except Exception as e: + try: + return AudioBookSource.session.get(url, verify=False, **kwargs).text + except: + return None + + +def get_soup(url, **kwargs): + html = get_html(url, **kwargs) + if html: + return BeautifulSoup(html, "html.parser") + + +def extract_year(title: str) -> int: + match = re.search(r'\b\d{4}\b', title) + if match: + return int(match.group()) + return 0 + + +def extractor_narrator(title): + from audiobooker.base import AudiobookNarrator + narrator = None + title = title.replace("\xa0", " ").strip() + matches = re.findall(r'\b(?:read by|audiobook by|narrated by)\b\s*(.*?)(?:\s*–|$)', title, flags=re.IGNORECASE) + + if matches: + narrator_str = matches[0].strip() # Consider only the first "read by" occurrence + # Split the narrator's name using a regex pattern + names = re.findall(r'(?:[A-Z]\.)+|\S+', narrator_str) + # Ensure we only take up to two words for the narrator's name + names = names[:2] + if len(names) > 0: + first_name = names[0].strip() + last_name = " ".join(names[1:]).strip() if len(names) > 1 else "" + if last_name and first_name[0].isupper() and not last_name[0].isupper(): + last_name = "" # not part of the name + narrator = AudiobookNarrator(first_name=first_name.title(), + last_name=last_name.title()) + return narrator + + +def normalize_name(name): + """convert a name string to first and last name""" + name = name.replace("(", "").replace(")", "").title().strip() + if " " in name: + return name.split(" ", 1) + else: + return name, "" diff --git a/examples/search_librivox.py b/examples/search_librivox.py index 4fe3274..281d3d2 100644 --- a/examples/search_librivox.py +++ b/examples/search_librivox.py @@ -17,7 +17,7 @@ pprint(book.url) pprint(book.streams) pprint(book.runtime) -pprint(book.rss_data) +#pprint(book.rss_data) #book.play() a = ", ".join([au.first_name + au.last_name for au in book.authors]) pprint(a) diff --git a/setup.py b/setup.py index bbddb4a..a0a12b5 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='audiobooker', - version='0.3.1', + version='0.5.0', packages=['audiobooker', 'audiobooker.scrappers'], install_requires=["requests", "bs4", "feedparser", "rapidfuzz", "requests-cache", "site-map-parser"], diff --git a/test/test_utils.py b/test/test_utils.py new file mode 100644 index 0000000..692ec70 --- /dev/null +++ b/test/test_utils.py @@ -0,0 +1,75 @@ +import unittest + +from audiobooker.utils import extract_year +from audiobooker.utils import extractor_narrator + + +class TestExtractorNarrator(unittest.TestCase): + def test_single_read_by(self): + title1 = "1996 Stephen King – The Regulators Audiobook read by Frank Muller" + narrator1 = extractor_narrator(title1) + self.assertEqual(narrator1.first_name, "Frank") + self.assertEqual(narrator1.last_name, "Muller") + + title2 = "The Shining Audiobook read by Campbell Scott" + narrator2 = extractor_narrator(title2) + self.assertEqual(narrator2.first_name, "Campbell") + self.assertEqual(narrator2.last_name, "Scott") + + title3 = "Alice In Wonderland read by Natasha now has its own podcast" + narrator3 = extractor_narrator(title3) + self.assertEqual(narrator3.first_name, "Natasha") + self.assertEqual(narrator3.last_name, "") + + def test_audiobook_by(self): + title1 = "Harry Potter and the Chamber of Secrets Audiobook by Jim Dale" + narrator1 = extractor_narrator(title1) + self.assertEqual(narrator1.first_name, "Jim") + self.assertEqual(narrator1.last_name, "Dale") + + title2 = "Pride and Prejudice Audiobook by Jane Austen" + narrator2 = extractor_narrator(title2) + self.assertEqual(narrator2.first_name, "Jane") + self.assertEqual(narrator2.last_name, "Austen") + + def test_narrated_by(self): + title1 = "The shadow over innsmouth by H.P. Lovecraft, narrated by Wayne June" + narrator1 = extractor_narrator(title1) + self.assertEqual(narrator1.first_name, "Wayne") + self.assertEqual(narrator1.last_name, "June") + + title2 = "The Catcher in the Rye by J.D. Salinger, narrated by Matt Damon" + narrator2 = extractor_narrator(title2) + self.assertEqual(narrator2.first_name, "Matt") + self.assertEqual(narrator2.last_name, "Damon") + + def test_no_narrator(self): + title = "The Great Gatsby" + narrator = extractor_narrator(title) + self.assertIsNone(narrator) + + +class TestExtractYear(unittest.TestCase): + def test_year_present(self): + title1 = "1996 Stephen King – The Regulators Audiobook read by Frank Muller" + self.assertEqual(extract_year(title1), 1996) + + title2 = "Harry Potter and the Chamber of Secrets (1998) Audiobook by Jim Dale" + self.assertEqual(extract_year(title2), 1998) + + def test_no_year(self): + title = "The Great Gatsby" + self.assertEqual(extract_year(title), 0) + + def test_multiple_years(self): + title = "The Odyssey (2001) and Moby Dick (1954) Audiobook by Some Narrator" + # Only the first year should be extracted + self.assertEqual(extract_year(title), 2001) + + def test_year_in_sentence(self): + title = "This is a sentence with the year 2022 in it." + self.assertEqual(extract_year(title), 2022) + + +if __name__ == '__main__': + unittest.main()