Skip to content

Commit

Permalink
Jackett rewrite (#139)
Browse files Browse the repository at this point in the history
* Add TorBox scraper

* Add is_anime attribute to item

* Rework Jackett to Keyword Queries. Added categories. Removed Torbox

* Remove audio from parsing; it removed a lot of good hits

* fix movie scraping and modify response parsing logic to be more readable

* fix: remove torbox module

* remove audio from being parsed

* remove more audio from parser

* fix typo

* fix: tidy audio and networks

* small tweaks

---------

Co-authored-by: Spoked <Spoked@localhost>
Co-authored-by: Gaisberg <None>
  • Loading branch information
dreulavelle and Spoked authored Jan 18, 2024
1 parent 943b098 commit a93050a
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 74 deletions.
7 changes: 6 additions & 1 deletion backend/program/content/overseerr.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from utils.logger import logger
from utils.request import get, ping
from program.media.container import MediaItemContainer
from program.updaters.trakt import Updater as Trakt
from program.updaters.trakt import Updater as Trakt, get_imdbid_from_tvdb


class OverseerrConfig(BaseModel):
Expand Down Expand Up @@ -102,6 +102,11 @@ def get_imdb_id(self, overseerr_item):
imdb_id = response.data.externalIds.imdbId
if imdb_id:
return imdb_id
if not imdb_id:
# I've seen a case where no imdbId was returned but a tvdbId was
imdb_id = get_imdbid_from_tvdb(response.data.externalIds.tvdbId)
if imdb_id:
return imdb_id
self.not_found_ids.append(f"{id_extension}{external_id}")
title = getattr(response.data, "title", None) or getattr(
response.data, "originalName", None
Expand Down
10 changes: 0 additions & 10 deletions backend/program/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,3 @@ def _needs_new_scrape(self, item) -> bool:
> scrape_time
or item.scraped_times == 0
)
def _check_for_title_match(self, item, string) -> bool:
    """Return True when the PTN-parsed title of `string` equals the item's own title."""
    parsed_title = parser.get_title(string)
    # Resolve which stored title to compare against, based on the item's type:
    # movies carry their own title; seasons/episodes compare against the show's.
    if item.type == "movie":
        expected = item.title
    elif item.type == "season":
        expected = item.parent.title
    elif item.type == "episode":
        expected = item.parent.parent.title
    else:
        # Unknown item types never match.
        return False
    return parsed_title == expected
59 changes: 30 additions & 29 deletions backend/program/scrapers/jackett.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ def __init__(self, _):
logger.info("Jackett initialized!")

def validate_settings(self) -> bool:
"""Validate the Jackett settings."""
"""Validate Jackett settings."""
if not self.settings.enabled:
logger.debug("Jackett is set to disabled.")
return False
if self.settings.url:
try:
Expand All @@ -46,8 +47,7 @@ def validate_settings(self) -> bool:
return False

def run(self, item):
"""Scrape the Jackett API for the given media items
and update the object with scraped streams"""
"""Scrape Jackett for the given media items"""
try:
self._scrape_item(item)
except RequestException:
Expand All @@ -67,29 +67,30 @@ def _scrape_item(self, item):
logger.debug("Could not find streams for %s", item.log_string)

def api_scrape(self, item):
    """Scrape Jackett's Torznab endpoint for streams matching `item`.

    Builds a keyword query (title + year for movies, title + season[/episode]
    for TV) with the appropriate Torznab category IDs, then keeps every result
    whose title passes the fuzzy title match and the parser's quality filter.

    Returns a dict of {infohash: {"name": title}} ordered by the parser's
    stream preferences, or an empty dict when nothing usable was found.
    """
    # Category IDs: https://github.com/Jackett/Jackett/wiki/Jackett-Categories
    with self.minute_limiter:
        query = ""
        if item.type == "movie":
            query = f"&cat=2010,2020,2030,2040,2045,2050,2080&t=movie&q={item.title} {item.aired_at.year}"
        if item.type == "season":
            query = f"&cat=5010,5020,5030,5040,5045,5050,5060,5070,5080&t=tvsearch&q={item.parent.title}&season={item.number}"
        if item.type == "episode":
            query = f"&cat=5010,5020,5030,5040,5045,5050,5060,5070,5080&t=tvsearch&q={item.parent.parent.title}&season={item.parent.number}&ep={item.number}"
        # Only query indexers that are healthy and have passed their Jackett test.
        url = (f"{self.settings.url}/api/v2.0/indexers/!status:failing,test:passed/results/torznab?apikey={self.api_key}{query}")
        with self.second_limiter:
            response = get(url=url, retry_if_failed=False, timeout=60)
        if response.is_ok:
            data = {}
            streams = response.data['rss']['channel'].get('item', [])
            # NOTE(review): XML-to-dict conversions typically collapse a
            # single <item> into a dict instead of a one-element list —
            # normalize so the loop below iterates result objects either way.
            if isinstance(streams, dict):
                streams = [streams]
            for stream in streams:
                title = stream.get('title')
                if not title:
                    # Nothing to match or parse without a title.
                    continue
                if parser.check_for_title_match(item, title) and parser.parse(title):
                    attrs = stream.get('torznab:attr', [])
                    # Results without an infohash attribute are unusable — skip them.
                    infohash_attr = next((a for a in attrs if a.get('@name') == 'infohash'), None)
                    if infohash_attr:
                        data[infohash_attr.get('@value')] = {"name": title}
            if len(data) > 0:
                return parser.sort_streams(data)
        return {}
7 changes: 5 additions & 2 deletions backend/program/updaters/trakt.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ def __init__(self):
self.pkl_file = path.join(get_data_path(), "trakt_data.pkl")
self.ids = []


def create_items(self, imdb_ids):
"""Update media items to state where they can start downloading"""
self.trakt_data.load(self.pkl_file)
Expand Down Expand Up @@ -85,11 +84,13 @@ def _map_item_from_data(data, item_type):
if getattr(data, "released", None):
released_at = data.released
formatted_aired_at = datetime.strptime(released_at, "%Y-%m-%d")
is_anime = "anime" in getattr(data, "genres", [])
item = {
"title": getattr(data, "title", None), # 'Game of Thrones'
"year": getattr(data, "year", None), # 2011
"status": getattr(data, "status", None), # 'ended', 'released', 'returning series'
"aired_at": formatted_aired_at, # datetime.datetime(2011, 4, 17, 0, 0)
"is_anime": is_anime, # True"
"imdb_id": getattr(data.ids, "imdb", None), # 'tt0496424'
"tvdb_id": getattr(data.ids, "tvdb", None), # 79488
"tmdb_id": getattr(data.ids, "tmdb", None), # 1399
Expand Down Expand Up @@ -155,7 +156,9 @@ def get_imdbid_from_tvdb(tvdb_id: str) -> str:
additional_headers={"trakt-api-version": "2", "trakt-api-key": CLIENT_ID},
)
if response.is_ok and len(response.data) > 0:
return response.data[0].show.ids.imdb
# noticing there are multiple results for some TVDB IDs
# TODO: Need to check item.type and compare to the resulting types..
return response.data[0].show.ids.imdb
return None

def get_imdbid_from_tmdb(tmdb_id: str) -> str:
Expand Down
6 changes: 2 additions & 4 deletions backend/utils/default_settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "0.4.0",
"version": "0.4.3",
"debug": true,
"log": true,
"symlink": {
Expand Down Expand Up @@ -59,8 +59,6 @@
"language": ["English"],
"include_4k": false,
"highest_quality": false,
"repack_proper": true,
"dual_audio": true,
"av1_audio": true
"repack_proper": true
}
}
54 changes: 27 additions & 27 deletions backend/utils/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@
from typing import List
from pydantic import BaseModel
from utils.settings import settings_manager
from thefuzz import fuzz


class ParserConfig(BaseModel):
language: List[str]
include_4k: bool
highest_quality: bool
repack_proper: bool
dual_audio: bool # This sometimes doesnt work depending on if other audio is in the title
av1_audio: bool


class Parser:
Expand All @@ -26,33 +25,15 @@ def __init__(self):
"VODRip", "DVD-R", "DSRip", "BRRip"]
self.quality = [None, "Blu-ray", "WEB-DL", "WEBRip", "HDRip",
"HDTVRip", "BDRip", "Pay-Per-View Rip"]
self.audio = [None, "AAC", "AAC 2.0", "AAC 5.1", "FLAC", "AVC", "Custom"]
self.network = ["Apple TV+", "Amazon Studios", "Netflix",
"Nickelodeon", "YouTube Premium", "Disney Plus",
"DisneyNOW", "HBO Max", "HBO", "Hulu Networks",
"DC Universe", "Adult Swim", "Comedy Central",
"Peacock", "AMC", "PBS", "Crunchyroll",
"Syndication", "Hallmark", "BBC", "VICE",
"MSNBC", "Crave"] # Will probably be used later in `Versions`
self.validate_settings()

def validate_settings(self):
if self.settings.highest_quality:
self.resolution = ["UHD", "2160p", "4K", "1080p", "720p"]
self.audio += ["Dolby TrueHD", "Dolby Atmos",
"Dolby Digital EX", "Dolby Digital Plus",
"Dolby Digital 5.1", "Dolby Digital 7.1",
"Dolby Digital Plus 5.1", "Dolby Digital Plus 7.1"
"DTS-HD MA", "DTS-HD MA", "DTS-HD", "DTS-HD MA 5.1"
"DTS-EX", "DTS:X", "DTS", "5.1", "7.1"]
elif self.settings.include_4k:
self.resolution = ["2160p", "4K", "1080p", "720p"]
else:
self.resolution = ["1080p", "720p"]
if self.settings.dual_audio:
self.audio += ["Dual"]
if not self.settings.av1_audio:
self.unwanted_codec += ["AV1"] # Not all devices support this codec

def _parse(self, string):
parse = PTN.parse(string)
Expand Down Expand Up @@ -127,7 +108,6 @@ def _is_highest_quality(self, string) -> bool:
return any([
parsed.get("hdr", False),
parsed.get("remux", False),
parsed.get("audio", False) in self.audio,
parsed.get("resolution", False) in ["UHD", "2160p", "4K"],
parsed.get("upscaled", False)
])
Expand All @@ -143,15 +123,21 @@ def _is_repack_or_proper(self, string) -> bool:

def _is_dual_audio(self, string) -> bool:
    """Check if content is `dual audio`.

    True when PTN tags the release audio as "Dual", or when the raw title
    mentions dual audio / an English dub. Always returns a bool (the older
    settings-gated variant fell through and returned None when disabled).
    """
    parsed = self._parse(string)
    if parsed.get("audio") == "Dual":
        return True
    # Fall back to scanning the raw title: "dual audio", "english dub",
    # "eng audio", etc.
    return re.search(
        r"((dual.audio)|(english|eng)\W+(dub|audio))", string, flags=re.IGNORECASE
    ) is not None

def _is_network(self, string) -> bool:
    """Check if content is from a known streaming/TV `network` release."""
    parsed = self._parse(string)
    # Set membership is O(1); `False` (the missing-key sentinel) can never
    # be a member, so absent network info correctly yields False.
    networks = {
        "Apple TV+", "Amazon Studios", "Netflix",
        "Nickelodeon", "YouTube Premium", "Disney Plus",
        "DisneyNOW", "HBO Max", "HBO", "Hulu Networks",
        "DC Universe", "Adult Swim", "Comedy Central",
        "Peacock", "AMC", "PBS", "Crunchyroll",
        "Syndication", "Hallmark", "BBC", "VICE",
        "MSNBC", "Crave",
    }  # Will probably be used later in `Versions`
    return parsed.get("network", False) in networks

def sort_streams(self, streams: dict) -> dict:
"""Sorts streams based on user preferences."""
Expand All @@ -174,7 +160,6 @@ def parse(self, string) -> bool:
return (
parse["resolution"] in self.resolution
and parse["language"] in self.language
and parse["audio"] in self.audio
and not parse["quality"] in self.unwanted_quality
and not parse["codec"] in self.unwanted_codec
)
Expand All @@ -184,4 +169,19 @@ def get_title(self, string) -> str:
parse = self._parse(string)
return parse["title"]

def check_for_title_match(self, item, string, threshold=94) -> bool:
    """Check if the title matches PTN title using fuzzy matching.

    Compares the PTN-parsed title of `string` against the item's own title
    (or the parent/show title for seasons and episodes) with
    `thefuzz.fuzz.ratio`; a score at or above `threshold` counts as a match.
    """
    # TODO1: remove special chars from parsed_title and target_title. Could improve matching.
    # TODO2: We should be checking aliases as well for titles. Anime only probably?
    parsed_title = self.get_title(string)
    if item.type == "movie":
        target_title = item.title
    elif item.type == "season":
        target_title = item.parent.title
    elif item.type == "episode":
        target_title = item.parent.parent.title
    else:
        return False
    if not parsed_title or not target_title:
        # PTN can fail to extract a title (and item titles may be unset);
        # guard so we never call .lower() on None.
        return False
    return fuzz.ratio(parsed_title.lower(), target_title.lower()) >= threshold

parser = Parser()
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ pathos
pydantic
fastapi
uvicorn[standard]
parse-torrent-title
parse-torrent-title
thefuzz

0 comments on commit a93050a

Please sign in to comment.