Commit b25658d

fix: more tweaks for scrapers and fine tuning.
dreulavelle committed Nov 9, 2024
1 parent 73c0bcc commit b25658d
Showing 10 changed files with 29 additions and 22 deletions.
12 changes: 8 additions & 4 deletions src/program/services/scrapers/__init__.py
@@ -1,11 +1,10 @@
 import threading
 from datetime import datetime
-from typing import Dict, Generator, List, Union
+from typing import Dict, Generator, List

 from loguru import logger

-from program.media.item import Episode, MediaItem, Movie, Season, Show
-from program.media.state import States
+from program.media.item import MediaItem
 from program.media.stream import Stream
 from program.services.scrapers.comet import Comet
 from program.services.scrapers.jackett import Jackett
@@ -98,7 +97,10 @@ def run_service(service, item,):
     if total_results != len(results):
         logger.debug(f"Scraped {item.log_string} with {total_results} results, removed {total_results - len(results)} duplicate hashes")

-    sorted_streams: Dict[str, Stream] = _parse_results(item, results, log)
+    sorted_streams: Dict[str, Stream] = {}
+
+    if results:
+        sorted_streams = _parse_results(item, results, log)

     if sorted_streams and (log and settings_manager.settings.debug):
         item_type = item.type.title()
@@ -110,6 +112,8 @@ def run_service(service, item,):
             elif item.type == "episode":
                 item_info = f"[{item_type} {item.parent.number}:{item.number}]"
             logger.debug(f"{item_info} Parsed '{sorted_tor.parsed_title}' with rank {sorted_tor.rank} ({sorted_tor.infohash}): '{sorted_tor.raw_title}'")
+    else:
+        logger.log("NOT_FOUND", f"No streams to process for {item.log_string}")

     return sorted_streams

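Note on the new else branch: logger.log("NOT_FOUND", ...) targets a custom loguru level, which must be registered before first use. A minimal sketch of such a registration, assuming a severity between INFO and WARNING (the level number and color are illustrative, not taken from this repo):

    from loguru import logger

    # Hypothetical registration of the custom "NOT_FOUND" level used above;
    # the severity number (25) and the color are assumptions.
    logger.level("NOT_FOUND", no=25, color="<yellow>")

    # Once registered, it can be emitted like any built-in level:
    logger.log("NOT_FOUND", "No streams to process for Example Movie (2024)")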
2 changes: 1 addition & 1 deletion src/program/services/scrapers/comet.py
@@ -77,7 +77,7 @@ def run(self, item: MediaItem) -> Dict[str, str]:
         try:
             return self.scrape(item)
         except RateLimitExceeded:
-            logger.warning(f"Comet ratelimit exceeded for item: {item.log_string}")
+            logger.debug(f"Comet ratelimit exceeded for item: {item.log_string}")
         except ConnectTimeout:
             logger.warning(f"Comet connection timeout for item: {item.log_string}")
         except ReadTimeout:
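The RateLimitExceeded demotion from warning to debug repeats in every scraper below: hitting a rate limit is expected back-pressure rather than an actionable fault, so it no longer belongs at WARNING. A minimal sketch of the shared run() shape after this commit (ExampleScraper, scrape(), and item.log_string stand in for the repo's real names):

    from loguru import logger


    class RateLimitExceeded(Exception):
        """Stand-in for the project's rate-limit exception; its real import path is not shown in this diff."""


    class ExampleScraper:
        def scrape(self, item) -> dict:
            raise RateLimitExceeded  # placeholder body

        def run(self, item) -> dict:
            try:
                return self.scrape(item)
            except RateLimitExceeded:
                # Expected back-pressure: log quietly.
                logger.debug(f"Example ratelimit exceeded for item: {item.log_string}")
            except Exception as e:
                # Unexpected failure: keep the full traceback.
                logger.exception(f"Example exception thrown: {e}")
            return {}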
2 changes: 1 addition & 1 deletion src/program/services/scrapers/jackett.py
@@ -89,7 +89,7 @@ def run(self, item: MediaItem) -> Generator[MediaItem, None, None]:
         try:
             return self.scrape(item)
         except RateLimitExceeded:
-            logger.warning(f"Jackett ratelimit exceeded for item: {item.log_string}")
+            logger.debug(f"Jackett ratelimit exceeded for item: {item.log_string}")
         except RequestException as e:
             logger.error(f"Jackett request exception: {e}")
         except Exception as e:
2 changes: 1 addition & 1 deletion src/program/services/scrapers/knightcrawler.py
@@ -66,7 +66,7 @@ def run(self, item: MediaItem) -> Dict[str, str]:
         try:
             return self.scrape(item)
         except RateLimitExceeded:
-            logger.warning(f"Knightcrawler rate limit exceeded for item: {item.log_string}")
+            logger.debug(f"Knightcrawler rate limit exceeded for item: {item.log_string}")
         except ConnectTimeout:
             logger.warning(f"Knightcrawler connection timeout for item: {item.log_string}")
         except ReadTimeout:
13 changes: 8 additions & 5 deletions src/program/services/scrapers/mediafusion.py
@@ -34,7 +34,8 @@ def __init__(self):
         self.settings = self.app_settings.scraping.mediafusion
         self.timeout = self.settings.timeout
         self.encrypted_string = None
-        rate_limit_params = get_rate_limit_params(max_calls=1, period=2) if self.settings.ratelimit else None
+        # https://github.com/elfhosted/infra/blob/ci/mediafusion/middleware-ratelimit-stream.yaml
+        rate_limit_params = get_rate_limit_params(max_calls=1, period=10) if self.settings.ratelimit else None
         session = create_service_session(rate_limit_params=rate_limit_params)
         self.request_handler = ScraperRequestHandler(session)
         self.initialized = self.validate()
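The linked elfhosted middleware config is why the MediaFusion budget drops to one call every 10 seconds. As a rough illustration of what a max_calls/period pair means (this is a sketch of the semantics, not the repo's get_rate_limit_params implementation):

    import time


    class SimpleRateLimiter:
        """Allow at most max_calls within any rolling window of `period` seconds."""

        def __init__(self, max_calls: int, period: float) -> None:
            self.max_calls = max_calls
            self.period = period
            self.calls: list[float] = []

        def acquire(self) -> None:
            now = time.monotonic()
            # Drop timestamps that have fallen out of the window.
            self.calls = [t for t in self.calls if now - t < self.period]
            if len(self.calls) >= self.max_calls:
                # Sleep until the oldest call ages out of the window.
                time.sleep(self.period - (now - self.calls[0]))
            self.calls.append(time.monotonic())


    limiter = SimpleRateLimiter(max_calls=1, period=10)  # mirrors the new setting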
@@ -112,15 +113,15 @@ def run(self, item: MediaItem) -> Dict[str, str]:
         try:
             return self.scrape(item)
         except RateLimitExceeded:
-            logger.warning(f"Mediafusion ratelimit exceeded for item: {item.log_string}")
+            logger.debug(f"Mediafusion ratelimit exceeded for item: {item.log_string}")
         except ConnectTimeout:
             logger.warning(f"Mediafusion connection timeout for item: {item.log_string}")
         except ReadTimeout:
             logger.warning(f"Mediafusion read timeout for item: {item.log_string}")
         except RequestException as e:
             logger.error(f"Mediafusion request exception: {e}")
         except Exception as e:
-            logger.error(f"Mediafusion exception thrown: {e}")
+            logger.exception(f"Mediafusion exception thrown: {e}")
         return {}

     def scrape(self, item: MediaItem) -> tuple[Dict[str, str], int]:
@@ -139,8 +140,10 @@ def scrape(self, item: MediaItem) -> tuple[Dict[str, str], int]:
         torrents: Dict[str, str] = {}

         for stream in response.data.streams:
-            description_split = stream.description.replace("📂 ", "").replace("/", "")
-            raw_title = description_split.split("\n")[0]
+            if not hasattr(stream, "description") and hasattr(stream, "title") and "rate-limit exceeded" in stream.title:
+                raise RateLimitExceeded
+            description_split = stream.description.replace("📂 ", "")
+            raw_title = description_split.split("/")[0] or description_split.split("\n")[0]  # we want the torrent name if possible
             info_hash = re.search(r"info_hash=([A-Za-z0-9]+)", stream.url).group(1)
             if info_hash and info_hash not in torrents:
                 torrents[info_hash] = raw_title
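A worked example of the new title extraction (the sample description and URL are invented):

    import re

    description = "📂 Some.Show.S01.1080p.WEB-DL/Some.Show.S01E01.mkv\n💾 2.4 GB"
    description_split = description.replace("📂 ", "")
    raw_title = description_split.split("/")[0] or description_split.split("\n")[0]
    assert raw_title == "Some.Show.S01.1080p.WEB-DL"

    # Caveat: split("/")[0] is falsy only when the description *starts* with "/",
    # so the "\n" fallback fires just in that edge case; when no "/" is present
    # at all, split("/")[0] returns the whole multi-line string unchanged.

    url = "https://mediafusion.example/stream?info_hash=0123456789abcdef0123456789abcdef01234567"
    info_hash = re.search(r"info_hash=([A-Za-z0-9]+)", url).group(1)
    assert info_hash == "0123456789abcdef0123456789abcdef01234567"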
4 changes: 2 additions & 2 deletions src/program/services/scrapers/orionoid.py
@@ -109,9 +109,9 @@ def run(self, item: MediaItem) -> Dict[str, str]:
         try:
             return self.scrape(item)
         except RateLimitExceeded:
-            logger.warning(f"Orionoid ratelimit exceeded for item: {item.log_string}")
+            logger.debug(f"Orionoid ratelimit exceeded for item: {item.log_string}")
         except Exception as e:
-            logger.opt(exception=True).error(f"Orionoid exception for item: {item.log_string} - Exception: {e}")
+            logger.exception(f"Orionoid exception for item: {item.log_string} - Exception: {e}")
         return {}

     def _build_query_params(self, item: MediaItem) -> dict:
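On the logging change here: in loguru, logger.exception(msg) is shorthand for logger.opt(exception=True).error(msg); both emit at ERROR and attach the active traceback, so the rewrite is purely cosmetic. A quick demonstration:

    from loguru import logger

    try:
        1 / 0
    except Exception as e:
        # These two calls produce equivalent records: ERROR level plus the
        # current exception's traceback.
        logger.opt(exception=True).error(f"Orionoid-style failure: {e}")
        logger.exception(f"Orionoid-style failure: {e}")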
4 changes: 2 additions & 2 deletions src/program/services/scrapers/prowlarr.py
@@ -92,11 +92,11 @@ def run(self, item: MediaItem) -> Dict[str, str]:
         try:
             return self.scrape(item)
         except RateLimitExceeded:
-            logger.warning(f"Prowlarr ratelimit exceeded for item: {item.log_string}")
+            logger.debug(f"Prowlarr ratelimit exceeded for item: {item.log_string}")
         except RequestException as e:
             logger.error(f"Prowlarr request exception: {e}")
         except Exception as e:
-            logger.error(f"Prowlarr failed to scrape item with error: {e}")
+            logger.exception(f"Prowlarr failed to scrape item with error: {e}")
         return {}

     def scrape(self, item: MediaItem) -> Dict[str, str]:
4 changes: 2 additions & 2 deletions src/program/services/scrapers/torbox.py
@@ -46,7 +46,7 @@ def run(self, item: MediaItem) -> Dict[str, str]:
         try:
             return self.scrape(item)
         except RateLimitExceeded:
-            logger.warning(f"TorBox rate limit exceeded for item: {item.log_string}")
+            logger.debug(f"TorBox rate limit exceeded for item: {item.log_string}")
         except ConnectTimeout:
             logger.log("NOT_FOUND", f"TorBox is caching request for {item.log_string}, will retry later")
         except RequestException as e:
@@ -55,7 +55,7 @@ def run(self, item: MediaItem) -> Dict[str, str]:
             elif e.response and e.response.status_code == 500:
                 logger.log("NOT_FOUND", f"TorBox is caching request for {item.log_string}, will retry later")
         except Exception as e:
-            logger.error(f"TorBox exception thrown: {e}")
+            logger.exception(f"TorBox exception thrown: {e}")
         return {}

     def _build_query_params(self, item: MediaItem) -> str:
4 changes: 2 additions & 2 deletions src/program/services/scrapers/torrentio.py
@@ -58,9 +58,9 @@ def run(self, item: MediaItem) -> Dict[str, str]:
         try:
             return self.scrape(item)
         except RateLimitExceeded:
-            logger.warning(f"Torrentio rate limit exceeded for item: {item.log_string}")
+            logger.debug(f"Torrentio rate limit exceeded for item: {item.log_string}")
         except Exception as e:
-            logger.error(f"Torrentio exception thrown: {str(e)}")
+            logger.exception(f"Torrentio exception thrown: {str(e)}")
         return {}

     def scrape(self, item: MediaItem) -> tuple[Dict[str, str], int]:
4 changes: 2 additions & 2 deletions src/program/services/scrapers/zilean.py
@@ -53,9 +53,9 @@ def run(self, item: MediaItem) -> Dict[str, str]:
         try:
             return self.scrape(item)
         except RateLimitExceeded:
-            logger.warning(f"Zilean rate limit exceeded for item: {item.log_string}")
+            logger.debug(f"Zilean rate limit exceeded for item: {item.log_string}")
         except Exception as e:
-            logger.error(f"Zilean exception thrown: {e}")
+            logger.exception(f"Zilean exception thrown: {e}")
         return {}

     def _build_query_params(self, item: MediaItem) -> Dict[str, str]:
