diff --git a/requirements.txt b/requirements.txt index f35e753..ed00c5a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ requests bs4 -pytube ytmusicapi \ No newline at end of file diff --git a/tutubo/models.py b/tutubo/models.py index 4f395f2..412882d 100644 --- a/tutubo/models.py +++ b/tutubo/models.py @@ -1,102 +1,4 @@ -import json -import re -from typing import List, Tuple, Optional, Iterable - -# Patches since upstream is abandoned (TODO time to fork ?) -import pytube.cipher -import pytube.innertube -from pytube import YouTube as _Yt, Channel as _Ch, Playlist as _Pl -from pytube import extract -from pytube.exceptions import VideoUnavailable -from pytube.helpers import uniqueify, cache, DeferredGeneratorList - - -def get_throttling_function_name(js: str) -> str: - """Extract the name of the function that computes the throttling parameter. - - :param str js: - The contents of the base.js asset file. - :rtype: str - :returns: - The name of the function used to compute the throttling parameter. - """ - function_patterns = [ - # https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-865985377 - # https://github.com/yt-dlp/yt-dlp/commit/48416bc4a8f1d5ff07d5977659cb8ece7640dcd8 - # var Bpa = [iha]; - # ... 
- # a.C && (b = a.get("n")) && (b = Bpa[0](b), a.set("n", b), - # Bpa.length || iha("")) }}; - # In the above case, `iha` is the relevant function name - r'a\.[a-zA-Z]\s*&&\s*\([a-z]\s*=\s*a\.get\("n"\)\)\s*&&.*?\|\|\s*([a-z]+)', - r'\([a-z]\s*=\s*([a-zA-Z0-9$]+)(\[\d+\])\([a-z]\)', - ] - for pattern in function_patterns: - regex = re.compile(pattern) - function_match = regex.search(js) - if function_match: - if len(function_match.groups()) == 1: - return function_match.group(1) - idx = function_match.group(2) - if idx: - idx = idx.strip("[]") - array = re.search( - r'var {nfunc}\s*=\s*(\[.+?\]);'.format( - nfunc=re.escape(function_match.group(1))), - js - ) - if array: - array = array.group(1).strip("[]").split(",") - array = [x.strip() for x in array] - return array[int(idx)] - - raise extract.RegexMatchError( - caller="get_throttling_function_name", pattern="multiple" - ) - - -def extract_channel_name(url: str) -> str: - """Extract the ``channel_name`` or ``channel_id`` from a YouTube url. - - This function supports the following patterns: - - - :samp:`https://youtube.com/{channel_name}/*` - - :samp:`https://youtube.com/@{channel_name}/*` - - :samp:`https://youtube.com/c/{channel_name}/*` - - :samp:`https://youtube.com/channel/{channel_id}/* - - :samp:`https://youtube.com/c/@{channel_name}/*` - - :samp:`https://youtube.com/channel/@{channel_id}/* - - :samp:`https://youtube.com/u/{channel_name}/*` - - :samp:`https://youtube.com/user/{channel_id}/* - - :param str url: - A YouTube url containing a channel name. - :rtype: str - :returns: - YouTube channel name. 
- """ - pattern = r"(?:https?:\/\/)?(?:www\.)?youtube\.com\/(?:(user|channel|c)(?:\/))?\@?([%\d\w_\-]+)" - regex = re.compile(pattern) - function_match = regex.search(url) - if function_match: - uri_style = function_match.group(1) - uri_style = uri_style if uri_style else "c" - uri_identifier = function_match.group(2) - return f'/{uri_style}/{uri_identifier}' - - raise extract.RegexMatchError( - caller="channel_name", pattern="patterns" - ) - - -extract.channel_name = extract_channel_name -pytube.cipher.get_throttling_function_name = get_throttling_function_name -pytube.innertube._default_clients["ANDROID"]["context"]["client"]["clientVersion"] = "19.08.35" -pytube.innertube._default_clients["IOS"]["context"]["client"]["clientVersion"] = "19.08.35" -pytube.innertube._default_clients["ANDROID_EMBED"]["context"]["client"]["clientVersion"] = "19.08.35" -pytube.innertube._default_clients["IOS_EMBED"]["context"]["client"]["clientVersion"] = "19.08.35" -pytube.innertube._default_clients["IOS_MUSIC"]["context"]["client"]["clientVersion"] = "6.41" -pytube.innertube._default_clients["ANDROID_MUSIC"] = pytube.innertube._default_clients["ANDROID_CREATOR"] +from tutubo.pytube import YouTube as Video, Channel, Playlist class YoutubePreview: @@ -353,242 +255,5 @@ def as_dict(self): 'image': self.thumbnail_url} -class Playlist(_Pl): - def __init__(self, url, *args, **kwargs): - super().__init__(url, *args, **kwargs) - self._metadata = None - self._microformat = None - - @property - def metadata(self): - if self._metadata: - return self._metadata - else: - self._metadata = self.initial_data['metadata'][ - 'channelMetadataRenderer'] - return self._metadata - - @property - def microformat(self): - if self._microformat: - return self._microformat - else: - self._microformat = self.initial_data['metadata'][ - 'microformatDataRenderer'] - return self._microformat - - @property - def title(self): - """Extract playlist title - - :return: playlist title (name) - :rtype: Optional[str] - 
""" - try: - return self.sidebar_info[0]['playlistSidebarPrimaryInfoRenderer'][ - 'title']['runs'][0]['text'] - except: # sidebar not available - pass - try: - return self.microformat['title'] - except: - pass - try: - return self.metadata['title'] - except: - pass - - @property - def description(self) -> str: - try: - return self.sidebar_info[0]['playlistSidebarPrimaryInfoRenderer'][ - 'description']['simpleText'].strip() - except: # sometimes description is an empty dict - return "" - - @property - def featured_videos(self): - videos = [] - idx = 0 - for vid in self.videos: - if idx > 5: - break - try: - videos.append({ - "videoId": vid.video_id, - "url": vid.watch_url, - "image": vid.thumbnail_url, - "title": vid.title - }) - idx += 1 - except VideoUnavailable: - continue - return videos - - @property - def thumbnail_url(self): - try: - return self.featured_videos[0]["image"] - except: - return None - - @property - def as_dict(self): - return {'playlistId': self.playlist_id, - 'title': self.title, - 'url': self.playlist_url, - "image": self.thumbnail_url, - 'featured_videos': self.featured_videos} - - -class Channel(Playlist, _Ch): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # see https://github.com/pytube/pytube/pull/1019 - @property - def playlists(self) -> Iterable[Playlist]: - """Yields Playlist objects of playlists in this channel - :rtype: List[Playlist] - :returns: List of Playlist - """ - return DeferredGeneratorList(self.playlist_generator()) - - @staticmethod - def _extract_playlists(raw_json: str) -> Tuple[List[str], Optional[str]]: - """Extracts playlists from a raw json page - :param str raw_json: Input json extracted from the page or the last - server response - :rtype: Tuple[List[str], Optional[str]] - :returns: Tuple containing a list of up to 100 video watch ids and - a continuation token, if more videos are available - """ - initial_data = json.loads(raw_json) - - # this is the json tree structure, if the json 
was extracted from - # html - playlists = [] - try: - tabs = initial_data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"] - for t in tabs: - if "content" not in t["tabRenderer"]: - continue - data = t["tabRenderer"]["content"] - if "sectionListRenderer" not in t["tabRenderer"]["content"]: - continue - for c in data["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"]: - if 'shelfRenderer' in c: - playlists = c['shelfRenderer']["content"]['horizontalListRenderer']["items"] - break - elif 'gridRenderer' in c: - playlists = c['gridRenderer']["items"] - break - if playlists: - break - except (KeyError, IndexError, TypeError): - pass - - try: - continuation = playlists[-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand'][ - 'token'] - playlists = playlists[:-1] - except (KeyError, IndexError): - # if there is an error, no continuation is available - continuation = None - - for p in playlists: - if 'gridPlaylistRenderer' in p: - p['playlistId'] = p['gridPlaylistRenderer']['playlistId'] - elif 'lockupViewModel' in p: - # p["title"] = p['lockupViewModel']['metadata']['lockupMetadataViewModel']['title']['content'] - p['playlistId'] = p['lockupViewModel']['contentId'] - # remove duplicates - return ( - uniqueify(list(map(lambda x: f"/playlist?list={x['playlistId']}", playlists))), - continuation, - ) - - def playlist_generator(self): - for url in self.playlist_urls: - yield Playlist(url) - - def playlist_url_generator(self): - """Generator that yields video URLs. - :Yields: Video URLs - """ - for page in self._paginate_playlists(): - for playlist in page: - yield self._playlist_url(playlist) - - def _paginate_playlists( - self, until_watch_id: Optional[str] = None - ) -> Iterable[List[str]]: - """Parse the playlist links from the page source, yields the /watch?v= - part from video link - :param until_watch_id Optional[str]: YouTube Video watch id until - which the playlist should be read. 
- :rtype: Iterable[List[str]] - :returns: Iterable of lists of YouTube playlist ids - """ - playlist_urls, continuation = self._extract_playlists( - json.dumps(extract.initial_data(self.playlists_html)) - ) - yield playlist_urls - - @staticmethod - def _playlist_url(playlist_path: str): - return f"https://www.youtube.com{playlist_path}" - - @property # type: ignore - @cache - def playlist_urls(self) -> DeferredGeneratorList: - """Complete links of all the playlists in channel - :rtype: List[str] - :returns: List of playlist URLs - """ - return DeferredGeneratorList(self.playlist_url_generator()) - - @property - def as_dict(self): - return {'channelId': self.channel_id, - 'title': self.title, - 'image': self.thumbnail_url, - 'url': self.channel_url} - - -def video_description_info(watch_html: str): - try: - yt_description_result = extract.regex_search(r'"(?<=description":{"simpleText":")([^}]+)', watch_html, group=0) - except extract.RegexMatchError: - yt_description_result = None - return yt_description_result - - -class Video(_Yt): - @property - def description(self) -> str: - """Get the video description.""" - return self.vid_info.get("videoDetails", {}).get("shortDescription") or \ - video_description_info(self.watch_html).replace('\\n', '\n') - - @property - def as_dict(self): - return {'length': self.length, - 'keywords': self.keywords, - 'image': self.thumbnail_url, - 'title': self.title, - "author": self.author, - 'url': self.watch_url, - 'videoId': self.video_id} - - class RelatedVideo(Video): - @property - def as_dict(self): - return {'length': self.length, - 'keywords': self.keywords, - 'image': self.thumbnail_url, - 'title': self.title, - "author": self.author, - 'url': self.watch_url, - 'videoId': self.video_id} + """""" diff --git a/tutubo/pytube/__init__.py b/tutubo/pytube/__init__.py new file mode 100644 index 0000000..2823580 --- /dev/null +++ b/tutubo/pytube/__init__.py @@ -0,0 +1,19 @@ +# flake8: noqa: F401 +# noreorder +""" +Pytube: a very 
serious Python library for downloading YouTube Videos. +""" +__title__ = "pytube" +__author__ = "Ronnie Ghose, Taylor Fox Dahlin, Nick Ficano" +__license__ = "The Unlicense (Unlicense)" +__js__ = None +__js_url__ = None + +from tutubo.pytube.version import __version__ +from tutubo.pytube.streams import Stream +from tutubo.pytube.captions import Caption +from tutubo.pytube.query import CaptionQuery, StreamQuery +from tutubo.pytube.__main__ import YouTube +from tutubo.pytube.contrib.playlist import Playlist +from tutubo.pytube.contrib.channel import Channel +from tutubo.pytube.contrib.search import Search diff --git a/tutubo/pytube/__main__.py b/tutubo/pytube/__main__.py new file mode 100644 index 0000000..a9e3e79 --- /dev/null +++ b/tutubo/pytube/__main__.py @@ -0,0 +1,487 @@ +""" +This module implements the core developer interface for pytube. + +The problem domain of the :class:`YouTube class focuses almost +exclusively on the developer interface. Pytube offloads the heavy lifting to +smaller peripheral modules and functions. + +""" +import logging +from typing import Any, Callable, Dict, List, Optional + +import pytube +import pytube.exceptions as exceptions +from tutubo.pytube import extract, request +from tutubo.pytube import Stream, StreamQuery +from tutubo.pytube.helpers import install_proxy +from tutubo.pytube.innertube import InnerTube +from tutubo.pytube.metadata import YouTubeMetadata +from tutubo.pytube.monostate import Monostate + +logger = logging.getLogger(__name__) + + +class YouTube: + """Core developer interface for pytube.""" + + def __init__( + self, + url: str, + on_progress_callback: Optional[Callable[[Any, bytes, int], None]] = None, + on_complete_callback: Optional[Callable[[Any, Optional[str]], None]] = None, + proxies: Dict[str, str] = None, + use_oauth: bool = False, + allow_oauth_cache: bool = True + ): + """Construct a :class:`YouTube `. + + :param str url: + A valid YouTube watch URL. 
+ :param func on_progress_callback: + (Optional) User defined callback function for stream download + progress events. + :param func on_complete_callback: + (Optional) User defined callback function for stream download + complete events. + :param dict proxies: + (Optional) A dict mapping protocol to proxy address which will be used by pytube. + :param bool use_oauth: + (Optional) Prompt the user to authenticate to YouTube. + If allow_oauth_cache is set to True, the user should only be prompted once. + :param bool allow_oauth_cache: + (Optional) Cache OAuth tokens locally on the machine. Defaults to True. + These tokens are only generated if use_oauth is set to True as well. + """ + self._js: Optional[str] = None # js fetched by js_url + self._js_url: Optional[str] = None # the url to the js, parsed from watch html + + self._vid_info: Optional[Dict] = None # content fetched from innertube/player + + self._watch_html: Optional[str] = None # the html of /watch?v= + self._embed_html: Optional[str] = None + self._player_config_args: Optional[Dict] = None # inline js in the html containing + self._age_restricted: Optional[bool] = None + + self._fmt_streams: Optional[List[Stream]] = None + + self._initial_data = None + self._metadata: Optional[YouTubeMetadata] = None + + # video_id part of /watch?v= + self.video_id = extract.video_id(url) + + self.watch_url = f"https://youtube.com/watch?v={self.video_id}" + self.embed_url = f"https://www.youtube.com/embed/{self.video_id}" + + # Shared between all instances of `Stream` (Borg pattern). 
+ self.stream_monostate = Monostate( + on_progress=on_progress_callback, on_complete=on_complete_callback + ) + + if proxies: + install_proxy(proxies) + + self._author = None + self._title = None + self._publish_date = None + + self.use_oauth = use_oauth + self.allow_oauth_cache = allow_oauth_cache + + def __repr__(self): + return f'' + + def __eq__(self, o: object) -> bool: + # Compare types and urls, if they're same return true, else return false. + return type(o) == type(self) and o.watch_url == self.watch_url + + @property + def watch_html(self): + if self._watch_html: + return self._watch_html + self._watch_html = request.get(url=self.watch_url) + return self._watch_html + + @property + def embed_html(self): + if self._embed_html: + return self._embed_html + self._embed_html = request.get(url=self.embed_url) + return self._embed_html + + @property + def age_restricted(self): + if self._age_restricted: + return self._age_restricted + self._age_restricted = extract.is_age_restricted(self.watch_html) + return self._age_restricted + + @property + def js_url(self): + if self._js_url: + return self._js_url + + if self.age_restricted: + self._js_url = extract.js_url(self.embed_html) + else: + self._js_url = extract.js_url(self.watch_html) + + return self._js_url + + @property + def js(self): + if self._js: + return self._js + + # If the js_url doesn't match the cached url, fetch the new js and update + # the cache; otherwise, load the cache. 
+ if pytube.__js_url__ != self.js_url: + self._js = request.get(self.js_url) + pytube.__js__ = self._js + pytube.__js_url__ = self.js_url + else: + self._js = pytube.__js__ + + return self._js + + @property + def initial_data(self): + if self._initial_data: + return self._initial_data + self._initial_data = extract.initial_data(self.watch_html) + return self._initial_data + + @property + def streaming_data(self): + """Return streamingData from video info.""" + if 'streamingData' in self.vid_info: + return self.vid_info['streamingData'] + else: + self.bypass_age_gate() + return self.vid_info['streamingData'] + + @property + def fmt_streams(self): + """Returns a list of streams if they have been initialized. + + If the streams have not been initialized, finds all relevant + streams and initializes them. + """ + self.check_availability() + if self._fmt_streams: + return self._fmt_streams + + self._fmt_streams = [] + + stream_manifest = extract.apply_descrambler(self.streaming_data) + + # If the cached js doesn't work, try fetching a new js file + # https://github.com/pytube/pytube/issues/1054 + try: + extract.apply_signature(stream_manifest, self.vid_info, self.js) + except exceptions.ExtractError: + # To force an update to the js file, we clear the cache and retry + self._js = None + self._js_url = None + pytube.__js__ = None + pytube.__js_url__ = None + extract.apply_signature(stream_manifest, self.vid_info, self.js) + + # build instances of :class:`Stream ` + # Initialize stream objects + for stream in stream_manifest: + video = Stream( + stream=stream, + monostate=self.stream_monostate, + ) + self._fmt_streams.append(video) + + self.stream_monostate.title = self.title + self.stream_monostate.duration = self.length + + return self._fmt_streams + + def check_availability(self): + """Check whether the video is available. + + Raises different exceptions based on why the video is unavailable, + otherwise does nothing. 
+ """ + status, messages = extract.playability_status(self.watch_html) + + for reason in messages: + if status == 'UNPLAYABLE': + if reason == ( + 'Join this channel to get access to members-only content ' + 'like this video, and other exclusive perks.' + ): + raise exceptions.MembersOnly(video_id=self.video_id) + elif reason == 'This live stream recording is not available.': + raise exceptions.RecordingUnavailable(video_id=self.video_id) + else: + raise exceptions.VideoUnavailable(video_id=self.video_id) + elif status == 'LOGIN_REQUIRED': + if reason == ( + 'This is a private video. ' + 'Please sign in to verify that you may see it.' + ): + raise exceptions.VideoPrivate(video_id=self.video_id) + elif status == 'ERROR': + if reason == 'Video unavailable': + raise exceptions.VideoUnavailable(video_id=self.video_id) + elif status == 'LIVE_STREAM': + raise exceptions.LiveStreamError(video_id=self.video_id) + + @property + def vid_info(self): + """Parse the raw vid info and return the parsed result. + + :rtype: Dict[Any, Any] + """ + if self._vid_info: + return self._vid_info + + innertube = InnerTube(use_oauth=self.use_oauth, allow_cache=self.allow_oauth_cache) + + innertube_response = innertube.player(self.video_id) + self._vid_info = innertube_response + return self._vid_info + + def bypass_age_gate(self): + """Attempt to update the vid_info by bypassing the age gate.""" + innertube = InnerTube( + client='ANDROID_EMBED', + use_oauth=self.use_oauth, + allow_cache=self.allow_oauth_cache + ) + innertube_response = innertube.player(self.video_id) + + playability_status = innertube_response['playabilityStatus'].get('status', None) + + # If we still can't access the video, raise an exception + # (tier 3 age restriction) + if playability_status == 'UNPLAYABLE': + raise exceptions.AgeRestrictedError(self.video_id) + + self._vid_info = innertube_response + + @property + def caption_tracks(self) -> List[pytube.Caption]: + """Get a list of :class:`Caption `. 
+ + :rtype: List[Caption] + """ + raw_tracks = ( + self.vid_info.get("captions", {}) + .get("playerCaptionsTracklistRenderer", {}) + .get("captionTracks", []) + ) + return [pytube.Caption(track) for track in raw_tracks] + + @property + def captions(self) -> pytube.CaptionQuery: + """Interface to query caption tracks. + + :rtype: :class:`CaptionQuery `. + """ + return pytube.CaptionQuery(self.caption_tracks) + + @property + def streams(self) -> StreamQuery: + """Interface to query both adaptive (DASH) and progressive streams. + + :rtype: :class:`StreamQuery `. + """ + self.check_availability() + return StreamQuery(self.fmt_streams) + + @property + def thumbnail_url(self) -> str: + """Get the thumbnail url image. + + :rtype: str + """ + thumbnail_details = ( + self.vid_info.get("videoDetails", {}) + .get("thumbnail", {}) + .get("thumbnails") + ) + if thumbnail_details: + thumbnail_details = thumbnail_details[-1] # last item has max size + return thumbnail_details["url"] + + return f"https://img.youtube.com/vi/{self.video_id}/maxresdefault.jpg" + + @property + def publish_date(self): + """Get the publish date. + + :rtype: datetime + """ + if self._publish_date: + return self._publish_date + self._publish_date = extract.publish_date(self.watch_html) + return self._publish_date + + @publish_date.setter + def publish_date(self, value): + """Sets the publish date.""" + self._publish_date = value + + @property + def title(self) -> str: + """Get the video title. + + :rtype: str + """ + if self._title: + return self._title + + try: + self._title = self.vid_info['videoDetails']['title'] + except KeyError: + # Check_availability will raise the correct exception in most cases + # if it doesn't, ask for a report. + self.check_availability() + raise exceptions.PytubeError( + ( + f'Exception while accessing title of {self.watch_url}. 
' + 'Please file a bug report at https://github.com/pytube/pytube' + ) + ) + + return self._title + + @title.setter + def title(self, value): + """Sets the title value.""" + self._title = value + + @property + def description(self) -> str: + """Get the video description.""" + return self.vid_info.get("videoDetails", {}).get("shortDescription") or \ + extract.video_description_info(self.watch_html).replace('\\n', '\n') + + @property + def rating(self) -> float: + """Get the video average rating. + + :rtype: float + + """ + return self.vid_info.get("videoDetails", {}).get("averageRating") + + @property + def length(self) -> int: + """Get the video length in seconds. + + :rtype: int + """ + return int(self.vid_info.get('videoDetails', {}).get('lengthSeconds')) + + @property + def views(self) -> int: + """Get the number of the times the video has been viewed. + + :rtype: int + """ + return int(self.vid_info.get("videoDetails", {}).get("viewCount")) + + @property + def author(self) -> str: + """Get the video author. + :rtype: str + """ + if self._author: + return self._author + self._author = self.vid_info.get("videoDetails", {}).get( + "author", "unknown" + ) + return self._author + + @author.setter + def author(self, value): + """Set the video author.""" + self._author = value + + @property + def keywords(self) -> List[str]: + """Get the video keywords. + + :rtype: List[str] + """ + return self.vid_info.get('videoDetails', {}).get('keywords', []) + + @property + def channel_id(self) -> str: + """Get the video poster's channel id. + + :rtype: str + """ + return self.vid_info.get('videoDetails', {}).get('channelId', None) + + @property + def channel_url(self) -> str: + """Construct the channel url for the video's poster from the channel id. + + :rtype: str + """ + return f'https://www.youtube.com/channel/{self.channel_id}' + + @property + def metadata(self) -> Optional[YouTubeMetadata]: + """Get the metadata for the video. 
+ + :rtype: YouTubeMetadata + """ + if self._metadata: + return self._metadata + else: + self._metadata = extract.metadata(self.initial_data) + return self._metadata + + def register_on_progress_callback(self, func: Callable[[Any, bytes, int], None]): + """Register a download progress callback function post initialization. + + :param callable func: + A callback function that takes ``stream``, ``chunk``, + and ``bytes_remaining`` as parameters. + + :rtype: None + + """ + self.stream_monostate.on_progress = func + + def register_on_complete_callback(self, func: Callable[[Any, Optional[str]], None]): + """Register a download complete callback function post initialization. + + :param callable func: + A callback function that takes ``stream`` and ``file_path``. + + :rtype: None + + """ + self.stream_monostate.on_complete = func + + @staticmethod + def from_id(video_id: str) -> "YouTube": + """Construct a :class:`YouTube ` object from a video id. + + :param str video_id: + The video id of the YouTube video. + + :rtype: :class:`YouTube ` + + """ + return YouTube(f"https://www.youtube.com/watch?v={video_id}") + + @property + def as_dict(self): + return {'length': self.length, + 'keywords': self.keywords, + 'image': self.thumbnail_url, + 'title': self.title, + "author": self.author, + 'url': self.watch_url, + 'videoId': self.video_id} diff --git a/tutubo/pytube/captions.py b/tutubo/pytube/captions.py new file mode 100644 index 0000000..ee4cd78 --- /dev/null +++ b/tutubo/pytube/captions.py @@ -0,0 +1,164 @@ +import math +import os +import time +import json +import xml.etree.ElementTree as ElementTree +from html import unescape +from typing import Dict, Optional + +from tutubo.pytube import request +from tutubo.pytube.helpers import safe_filename, target_directory + + +class Caption: + """Container for caption tracks.""" + + def __init__(self, caption_track: Dict): + """Construct a :class:`Caption `. 
+ + :param dict caption_track: + Caption track data extracted from ``watch_html``. + """ + self.url = caption_track.get("baseUrl") + + # Certain videos have runs instead of simpleText + # this handles that edge case + name_dict = caption_track['name'] + if 'simpleText' in name_dict: + self.name = name_dict['simpleText'] + else: + for el in name_dict['runs']: + if 'text' in el: + self.name = el['text'] + + # Use "vssId" instead of "languageCode", fix issue #779 + self.code = caption_track["vssId"] + # Remove preceding '.' for backwards compatibility, e.g.: + # English -> vssId: .en, languageCode: en + # English (auto-generated) -> vssId: a.en, languageCode: en + self.code = self.code.strip('.') + + @property + def xml_captions(self) -> str: + """Download the xml caption tracks.""" + return request.get(self.url) + + @property + def json_captions(self) -> dict: + """Download and parse the json caption tracks.""" + json_captions_url = self.url.replace('fmt=srv3','fmt=json3') + text = request.get(json_captions_url) + parsed = json.loads(text) + assert parsed['wireMagic'] == 'pb3', 'Unexpected captions format' + return parsed + + def generate_srt_captions(self) -> str: + """Generate "SubRip Subtitle" captions. + + Takes the xml captions from :meth:`~pytube.Caption.xml_captions` and + recompiles them into the "SubRip Subtitle" format. + """ + return self.xml_caption_to_srt(self.xml_captions) + + @staticmethod + def float_to_srt_time_format(d: float) -> str: + """Convert decimal durations into proper srt format. + + :rtype: str + :returns: + SubRip Subtitle (str) formatted time duration. + + float_to_srt_time_format(3.89) -> '00:00:03,890' + """ + fraction, whole = math.modf(d) + time_fmt = time.strftime("%H:%M:%S,", time.gmtime(whole)) + ms = f"{fraction:.3f}".replace("0.", "") + return time_fmt + ms + + def xml_caption_to_srt(self, xml_captions: str) -> str: + """Convert xml caption tracks to "SubRip Subtitle (srt)". 
+ + :param str xml_captions: + XML formatted caption tracks. + """ + segments = [] + root = ElementTree.fromstring(xml_captions) + for i, child in enumerate(list(root)): + text = child.text or "" + caption = unescape(text.replace("\n", " ").replace(" ", " "),) + try: + duration = float(child.attrib["dur"]) + except KeyError: + duration = 0.0 + start = float(child.attrib["start"]) + end = start + duration + sequence_number = i + 1 # convert from 0-indexed to 1. + line = "{seq}\n{start} --> {end}\n{text}\n".format( + seq=sequence_number, + start=self.float_to_srt_time_format(start), + end=self.float_to_srt_time_format(end), + text=caption, + ) + segments.append(line) + return "\n".join(segments).strip() + + def download( + self, + title: str, + srt: bool = True, + output_path: Optional[str] = None, + filename_prefix: Optional[str] = None, + ) -> str: + """Write the media stream to disk. + + :param title: + Output filename (stem only) for writing media file. + If one is not specified, the default filename is used. + :type title: str + :param srt: + Set to True to download srt, false to download xml. Defaults to True. + :type srt bool + :param output_path: + (optional) Output path for writing media file. If one is not + specified, defaults to the current working directory. + :type output_path: str or None + :param filename_prefix: + (optional) A string that will be prepended to the filename. + For example a number in a playlist or the name of a series. + If one is not specified, nothing will be prepended + This is separate from filename so you can use the default + filename but still add a prefix. 
+ :type filename_prefix: str or None + + :rtype: str + """ + if title.endswith(".srt") or title.endswith(".xml"): + filename = ".".join(title.split(".")[:-1]) + else: + filename = title + + if filename_prefix: + filename = f"{safe_filename(filename_prefix)}{filename}" + + filename = safe_filename(filename) + + filename += f" ({self.code})" + + if srt: + filename += ".srt" + else: + filename += ".xml" + + file_path = os.path.join(target_directory(output_path), filename) + + with open(file_path, "w", encoding="utf-8") as file_handle: + if srt: + file_handle.write(self.generate_srt_captions()) + else: + file_handle.write(self.xml_captions) + + return file_path + + def __repr__(self): + """Printable object representation.""" + return ''.format(s=self) diff --git a/tutubo/pytube/cipher.py b/tutubo/pytube/cipher.py new file mode 100644 index 0000000..4f02f2e --- /dev/null +++ b/tutubo/pytube/cipher.py @@ -0,0 +1,696 @@ +""" +This module contains all logic necessary to decipher the signature. + +YouTube's strategy to restrict downloading videos is to send a ciphered version +of the signature to the client, along with the decryption algorithm obfuscated +in JavaScript. For the clients to play the videos, JavaScript must take the +ciphered version, cycle it through a series of "transform functions," and then +signs the media URL with the output. + +This module is responsible for (1) finding and extracting those "transform +functions" (2) maps them to Python equivalents and (3) taking the ciphered +signature and decoding it. 
+ +""" +import logging +import re +from itertools import chain +from typing import Any, Callable, Dict, List, Optional, Tuple + +from tutubo.pytube.exceptions import ExtractError, RegexMatchError +from tutubo.pytube.helpers import cache, regex_search +from tutubo.pytube.parser import find_object_from_startpoint, throttling_array_split + +logger = logging.getLogger(__name__) + + +class Cipher: + def __init__(self, js: str): + self.transform_plan: List[str] = get_transform_plan(js) + var_regex = re.compile(r"^\w+\W") + var_match = var_regex.search(self.transform_plan[0]) + if not var_match: + raise RegexMatchError( + caller="__init__", pattern=var_regex.pattern + ) + var = var_match.group(0)[:-1] + self.transform_map = get_transform_map(js, var) + self.js_func_patterns = [ + r"\w+\.(\w+)\(\w,(\d+)\)", + r"\w+\[(\"\w+\")\]\(\w,(\d+)\)" + ] + + self.throttling_plan = get_throttling_plan(js) + self.throttling_array = get_throttling_function_array(js) + + self.calculated_n = None + + def calculate_n(self, initial_n: list): + """Converts n to the correct value to prevent throttling.""" + if self.calculated_n: + return self.calculated_n + + # First, update all instances of 'b' with the list(initial_n) + for i in range(len(self.throttling_array)): + if self.throttling_array[i] == 'b': + self.throttling_array[i] = initial_n + + for step in self.throttling_plan: + curr_func = self.throttling_array[int(step[0])] + if not callable(curr_func): + logger.debug(f'{curr_func} is not callable.') + logger.debug(f'Throttling array:\n{self.throttling_array}\n') + raise ExtractError(f'{curr_func} is not callable.') + + first_arg = self.throttling_array[int(step[1])] + + if len(step) == 2: + curr_func(first_arg) + elif len(step) == 3: + second_arg = self.throttling_array[int(step[2])] + curr_func(first_arg, second_arg) + + self.calculated_n = ''.join(initial_n) + return self.calculated_n + + def get_signature(self, ciphered_signature: str) -> str: + """Decipher the signature. 
+ + Taking the ciphered signature, applies the transform functions. + + :param str ciphered_signature: + The ciphered signature sent in the ``player_config``. + :rtype: str + :returns: + Decrypted signature required to download the media content. + """ + signature = list(ciphered_signature) + + for js_func in self.transform_plan: + name, argument = self.parse_function(js_func) # type: ignore + signature = self.transform_map[name](signature, argument) + logger.debug( + "applied transform function\n" + "output: %s\n" + "js_function: %s\n" + "argument: %d\n" + "function: %s", + "".join(signature), + name, + argument, + self.transform_map[name], + ) + + return "".join(signature) + + @cache + def parse_function(self, js_func: str) -> Tuple[str, int]: + """Parse the Javascript transform function. + + Break a JavaScript transform function down into a two element ``tuple`` + containing the function name and some integer-based argument. + + :param str js_func: + The JavaScript version of the transform function. + :rtype: tuple + :returns: + two element tuple containing the function name and an argument. + + **Example**: + + parse_function('DE.AJ(a,15)') + ('AJ', 15) + + """ + logger.debug("parsing transform function") + for pattern in self.js_func_patterns: + regex = re.compile(pattern) + parse_match = regex.search(js_func) + if parse_match: + fn_name, fn_arg = parse_match.groups() + return fn_name, int(fn_arg) + + raise RegexMatchError( + caller="parse_function", pattern="js_func_patterns" + ) + + +def get_initial_function_name(js: str) -> str: + """Extract the name of the function responsible for computing the signature. + :param str js: + The contents of the base.js asset file. 
+ :rtype: str + :returns: + Function name from regex match + """ + + function_patterns = [ + r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 + r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 + r'(?:\b|[^a-zA-Z0-9$])(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # noqa: E501 + r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # noqa: E501 + r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r"\.sig\|\|(?P[a-zA-Z0-9$]+)\(", + r"yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 + r"\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 + r"\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 + r"\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 + r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 + r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(", # noqa: E501 + ] + logger.debug("finding initial function name") + for pattern in function_patterns: + regex = re.compile(pattern) + function_match = regex.search(js) + if function_match: + logger.debug("finished regex search, matched: %s", pattern) + return function_match.group(1) + + raise RegexMatchError( + caller="get_initial_function_name", pattern="multiple" + ) + + +def get_transform_plan(js: str) -> List[str]: + """Extract the "transform plan". + + The "transform plan" is the functions that the ciphered signature is + cycled through to obtain the actual signature. + + :param str js: + The contents of the base.js asset file. 
+ + **Example**: + + ['DE.AJ(a,15)', + 'DE.VR(a,3)', + 'DE.AJ(a,51)', + 'DE.VR(a,3)', + 'DE.kT(a,51)', + 'DE.kT(a,8)', + 'DE.VR(a,3)', + 'DE.kT(a,21)'] + """ + name = re.escape(get_initial_function_name(js)) + pattern = r"%s=function\(\w\){[a-z=\.\(\"\)]*;(.*);(?:.+)}" % name + logger.debug("getting transform plan") + return regex_search(pattern, js, group=1).split(";") + + +def get_transform_object(js: str, var: str) -> List[str]: + """Extract the "transform object". + + The "transform object" contains the function definitions referenced in the + "transform plan". The ``var`` argument is the obfuscated variable name + which contains these functions, for example, given the function call + ``DE.AJ(a,15)`` returned by the transform plan, "DE" would be the var. + + :param str js: + The contents of the base.js asset file. + :param str var: + The obfuscated variable name that stores an object with all functions + that descrambles the signature. + + **Example**: + + >>> get_transform_object(js, 'DE') + ['AJ:function(a){a.reverse()}', + 'VR:function(a,b){a.splice(0,b)}', + 'kT:function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c}'] + + """ + pattern = r"var %s={(.*?)};" % re.escape(var) + logger.debug("getting transform object") + regex = re.compile(pattern, flags=re.DOTALL) + transform_match = regex.search(js) + if not transform_match: + raise RegexMatchError(caller="get_transform_object", pattern=pattern) + + return transform_match.group(1).replace("\n", " ").split(", ") + + +def get_transform_map(js: str, var: str) -> Dict: + """Build a transform function lookup. + + Build a lookup table of obfuscated JavaScript function names to the + Python equivalents. + + :param str js: + The contents of the base.js asset file. + :param str var: + The obfuscated variable name that stores an object with all functions + that descrambles the signature. 
+ + """ + transform_object = get_transform_object(js, var) + mapper = {} + for obj in transform_object: + # AJ:function(a){a.reverse()} => AJ, function(a){a.reverse()} + name, function = obj.split(":", 1) + fn = map_functions(function) + mapper[name] = fn + return mapper + + +def get_throttling_function_name(js: str) -> str: + """Extract the name of the function that computes the throttling parameter. + + :param str js: + The contents of the base.js asset file. + :rtype: str + :returns: + The name of the function used to compute the throttling parameter. + """ + function_patterns = [ + # https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-865985377 + # https://github.com/yt-dlp/yt-dlp/commit/48416bc4a8f1d5ff07d5977659cb8ece7640dcd8 + # var Bpa = [iha]; + # ... + # a.C && (b = a.get("n")) && (b = Bpa[0](b), a.set("n", b), + # Bpa.length || iha("")) }}; + # In the above case, `iha` is the relevant function name + r'a\.[a-zA-Z]\s*&&\s*\([a-z]\s*=\s*a\.get\("n"\)\)\s*&&.*?\|\|\s*([a-z]+)', + r'\([a-z]\s*=\s*([a-zA-Z0-9$]+)(\[\d+\])\([a-z]\)', + ] + for pattern in function_patterns: + regex = re.compile(pattern) + function_match = regex.search(js) + if function_match: + if len(function_match.groups()) == 1: + return function_match.group(1) + idx = function_match.group(2) + if idx: + idx = idx.strip("[]") + array = re.search( + r'var {nfunc}\s*=\s*(\[.+?\]);'.format( + nfunc=re.escape(function_match.group(1))), + js + ) + if array: + array = array.group(1).strip("[]").split(",") + array = [x.strip() for x in array] + return array[int(idx)] + + raise RegexMatchError( + caller="get_throttling_function_name", pattern="multiple" + ) + + + +def get_throttling_function_code(js: str) -> str: + """Extract the raw code for the throttling function. + + :param str js: + The contents of the base.js asset file. + :rtype: str + :returns: + The name of the function used to compute the throttling parameter. 
+ """ + # Begin by extracting the correct function name + name = re.escape(get_throttling_function_name(js)) + + # Identify where the function is defined + pattern_start = r"%s=function\(\w\)" % name + regex = re.compile(pattern_start) + match = regex.search(js) + + # Extract the code within curly braces for the function itself, and merge any split lines + code_lines_list = find_object_from_startpoint(js, match.span()[1]).split('\n') + joined_lines = "".join(code_lines_list) + + # Prepend function definition (e.g. `Dea=function(a)`) + return match.group(0) + joined_lines + + +def get_throttling_function_array(js: str) -> List[Any]: + """Extract the "c" array. + + :param str js: + The contents of the base.js asset file. + :returns: + The array of various integers, arrays, and functions. + """ + raw_code = get_throttling_function_code(js) + + array_start = r",c=\[" + array_regex = re.compile(array_start) + match = array_regex.search(raw_code) + + array_raw = find_object_from_startpoint(raw_code, match.span()[1] - 1) + str_array = throttling_array_split(array_raw) + + converted_array = [] + for el in str_array: + try: + converted_array.append(int(el)) + continue + except ValueError: + # Not an integer value. + pass + + if el == 'null': + converted_array.append(None) + continue + + if el.startswith('"') and el.endswith('"'): + # Convert e.g. 
'"abcdef"' to string without quotation marks, 'abcdef' + converted_array.append(el[1:-1]) + continue + + if el.startswith('function'): + mapper = ( + (r"{for\(\w=\(\w%\w\.length\+\w\.length\)%\w\.length;\w--;\)\w\.unshift\(\w.pop\(\)\)}", throttling_unshift), # noqa:E501 + (r"{\w\.reverse\(\)}", throttling_reverse), + (r"{\w\.push\(\w\)}", throttling_push), + (r";var\s\w=\w\[0\];\w\[0\]=\w\[\w\];\w\[\w\]=\w}", throttling_swap), + (r"case\s\d+", throttling_cipher_function), + (r"\w\.splice\(0,1,\w\.splice\(\w,1,\w\[0\]\)\[0\]\)", throttling_nested_splice), # noqa:E501 + (r";\w\.splice\(\w,1\)}", js_splice), + (r"\w\.splice\(-\w\)\.reverse\(\)\.forEach\(function\(\w\){\w\.unshift\(\w\)}\)", throttling_prepend), # noqa:E501 + (r"for\(var \w=\w\.length;\w;\)\w\.push\(\w\.splice\(--\w,1\)\[0\]\)}", throttling_reverse), # noqa:E501 + ) + + found = False + for pattern, fn in mapper: + if re.search(pattern, el): + converted_array.append(fn) + found = True + if found: + continue + + converted_array.append(el) + + # Replace null elements with array itself + for i in range(len(converted_array)): + if converted_array[i] is None: + converted_array[i] = converted_array + + return converted_array + + +def get_throttling_plan(js: str): + """Extract the "throttling plan". + + The "throttling plan" is a list of tuples used for calling functions + in the c array. The first element of the tuple is the index of the + function to call, and any remaining elements of the tuple are arguments + to pass to that function. + + :param str js: + The contents of the base.js asset file. + :returns: + The full function code for computing the throttlign parameter. 
+ """ + raw_code = get_throttling_function_code(js) + + transform_start = r"try{" + plan_regex = re.compile(transform_start) + match = plan_regex.search(raw_code) + + transform_plan_raw = find_object_from_startpoint(raw_code, match.span()[1] - 1) + + # Steps are either c[x](c[y]) or c[x](c[y],c[z]) + step_start = r"c\[(\d+)\]\(c\[(\d+)\](,c(\[(\d+)\]))?\)" + step_regex = re.compile(step_start) + matches = step_regex.findall(transform_plan_raw) + transform_steps = [] + for match in matches: + if match[4] != '': + transform_steps.append((match[0],match[1],match[4])) + else: + transform_steps.append((match[0],match[1])) + + return transform_steps + + +def reverse(arr: List, _: Optional[Any]): + """Reverse elements in a list. + + This function is equivalent to: + + .. code-block:: javascript + + function(a, b) { a.reverse() } + + This method takes an unused ``b`` variable as their transform functions + universally sent two arguments. + + **Example**: + + >>> reverse([1, 2, 3, 4]) + [4, 3, 2, 1] + """ + return arr[::-1] + + +def splice(arr: List, b: int): + """Add/remove items to/from a list. + + This function is equivalent to: + + .. code-block:: javascript + + function(a, b) { a.splice(0, b) } + + **Example**: + + >>> splice([1, 2, 3, 4], 2) + [1, 2] + """ + return arr[b:] + + +def swap(arr: List, b: int): + """Swap positions at b modulus the list length. + + This function is equivalent to: + + .. code-block:: javascript + + function(a, b) { var c=a[0];a[0]=a[b%a.length];a[b]=c } + + **Example**: + + >>> swap([1, 2, 3, 4], 2) + [3, 2, 1, 4] + """ + r = b % len(arr) + return list(chain([arr[r]], arr[1:r], [arr[0]], arr[r + 1 :])) + + +def throttling_reverse(arr: list): + """Reverses the input list. + + Needs to do an in-place reversal so that the passed list gets changed. + To accomplish this, we create a reversed copy, and then change each + indvidual element. 
+ """ + reverse_copy = arr.copy()[::-1] + for i in range(len(reverse_copy)): + arr[i] = reverse_copy[i] + + +def throttling_push(d: list, e: Any): + """Pushes an element onto a list.""" + d.append(e) + + +def throttling_mod_func(d: list, e: int): + """Perform the modular function from the throttling array functions. + + In the javascript, the modular operation is as follows: + e = (e % d.length + d.length) % d.length + + We simply translate this to python here. + """ + return (e % len(d) + len(d)) % len(d) + + +def throttling_unshift(d: list, e: int): + """Rotates the elements of the list to the right. + + In the javascript, the operation is as follows: + for(e=(e%d.length+d.length)%d.length;e--;)d.unshift(d.pop()) + """ + e = throttling_mod_func(d, e) + new_arr = d[-e:] + d[:-e] + d.clear() + for el in new_arr: + d.append(el) + + +def throttling_cipher_function(d: list, e: str): + """This ciphers d with e to generate a new list. + + In the javascript, the operation is as follows: + var h = [A-Za-z0-9-_], f = 96; // simplified from switch-case loop + d.forEach( + function(l,m,n){ + this.push( + n[m]=h[ + (h.indexOf(l)-h.indexOf(this[m])+m-32+f--)%h.length + ] + ) + }, + e.split("") + ) + """ + h = list('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_') + f = 96 + # by naming it "this" we can more closely reflect the js + this = list(e) + + # This is so we don't run into weirdness with enumerate while + # we change the input list + copied_list = d.copy() + + for m, l in enumerate(copied_list): + bracket_val = (h.index(l) - h.index(this[m]) + m - 32 + f) % len(h) + this.append( + h[bracket_val] + ) + d[m] = h[bracket_val] + f -= 1 + + +def throttling_nested_splice(d: list, e: int): + """Nested splice function in throttling js. 
+ + In the javascript, the operation is as follows: + function(d,e){ + e=(e%d.length+d.length)%d.length; + d.splice( + 0, + 1, + d.splice( + e, + 1, + d[0] + )[0] + ) + } + + While testing, all this seemed to do is swap element 0 and e, + but the actual process is preserved in case there was an edge + case that was not considered. + """ + e = throttling_mod_func(d, e) + inner_splice = js_splice( + d, + e, + 1, + d[0] + ) + js_splice( + d, + 0, + 1, + inner_splice[0] + ) + + +def throttling_prepend(d: list, e: int): + """ + + In the javascript, the operation is as follows: + function(d,e){ + e=(e%d.length+d.length)%d.length; + d.splice(-e).reverse().forEach( + function(f){ + d.unshift(f) + } + ) + } + + Effectively, this moves the last e elements of d to the beginning. + """ + start_len = len(d) + # First, calculate e + e = throttling_mod_func(d, e) + + # Then do the prepending + new_arr = d[-e:] + d[:-e] + + # And update the input list + d.clear() + for el in new_arr: + d.append(el) + + end_len = len(d) + assert start_len == end_len + + +def throttling_swap(d: list, e: int): + """Swap positions of the 0'th and e'th elements in-place.""" + e = throttling_mod_func(d, e) + f = d[0] + d[0] = d[e] + d[e] = f + + +def js_splice(arr: list, start: int, delete_count=None, *items): + """Implementation of javascript's splice function. 
+ + :param list arr: + Array to splice + :param int start: + Index at which to start changing the array + :param int delete_count: + Number of elements to delete from the array + :param *items: + Items to add to the array + + Reference: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/splice # noqa:E501 + """ + # Special conditions for start value + try: + if start > len(arr): + start = len(arr) + # If start is negative, count backwards from end + if start < 0: + start = len(arr) - start + except TypeError: + # Non-integer start values are treated as 0 in js + start = 0 + + # Special condition when delete_count is greater than remaining elements + if not delete_count or delete_count >= len(arr) - start: + delete_count = len(arr) - start # noqa: N806 + + deleted_elements = arr[start:start + delete_count] + + # Splice appropriately. + new_arr = arr[:start] + list(items) + arr[start + delete_count:] + + # Replace contents of input array + arr.clear() + for el in new_arr: + arr.append(el) + + return deleted_elements + + +def map_functions(js_func: str) -> Callable: + """For a given JavaScript transform function, return the Python equivalent. + + :param str js_func: + The JavaScript version of the transform function. 
+ """ + mapper = ( + # function(a){a.reverse()} + (r"{\w\.reverse\(\)}", reverse), + # function(a,b){a.splice(0,b)} + (r"{\w\.splice\(0,\w\)}", splice), + # function(a,b){var c=a[0];a[0]=a[b%a.length];a[b]=c} + (r"{var\s\w=\w\[0\];\w\[0\]=\w\[\w\%\w.length\];\w\[\w\]=\w}", swap), + # function(a,b){var c=a[0];a[0]=a[b%a.length];a[b%a.length]=c} + ( + r"{var\s\w=\w\[0\];\w\[0\]=\w\[\w\%\w.length\];\w\[\w\%\w.length\]=\w}", + swap, + ), + ) + + for pattern, fn in mapper: + if re.search(pattern, js_func): + return fn + raise RegexMatchError(caller="map_functions", pattern="multiple") diff --git a/tutubo/pytube/cli.py b/tutubo/pytube/cli.py new file mode 100755 index 0000000..7e120e5 --- /dev/null +++ b/tutubo/pytube/cli.py @@ -0,0 +1,560 @@ +#!/usr/bin/env python3 +"""A simple command line application to download youtube videos.""" +import argparse +import gzip +import json +import logging +import os +import shutil +import sys +import datetime as dt +import subprocess # nosec +from typing import List, Optional + +import pytube.exceptions as exceptions +from tutubo.pytube import __version__ +from tutubo.pytube import CaptionQuery, Playlist, Stream, YouTube +from tutubo.pytube.helpers import safe_filename, setup_logger + + +logger = logging.getLogger(__name__) + + +def main(): + """Command line application to download youtube videos.""" + # noinspection PyTypeChecker + parser = argparse.ArgumentParser(description=main.__doc__) + args = _parse_args(parser) + if args.verbose: + log_filename = None + if args.logfile: + log_filename = args.logfile + setup_logger(logging.DEBUG, log_filename=log_filename) + logger.debug(f'Pytube version: {__version__}') + + if not args.url or "youtu" not in args.url: + parser.print_help() + sys.exit(1) + + if "/playlist" in args.url: + print("Loading playlist...") + playlist = Playlist(args.url) + if not args.target: + args.target = safe_filename(playlist.title) + for youtube_video in playlist.videos: + try: + 
_perform_args_on_youtube(youtube_video, args) + except exceptions.PytubeError as e: + print(f"There was an error with video: {youtube_video}") + print(e) + else: + print("Loading video...") + youtube = YouTube(args.url) + _perform_args_on_youtube(youtube, args) + + +def _perform_args_on_youtube( + youtube: YouTube, args: argparse.Namespace +) -> None: + if len(sys.argv) == 2 : # no arguments parsed + download_highest_resolution_progressive( + youtube=youtube, resolution="highest", target=args.target + ) + if args.list_captions: + _print_available_captions(youtube.captions) + if args.list: + display_streams(youtube) + if args.build_playback_report: + build_playback_report(youtube) + if args.itag: + download_by_itag(youtube=youtube, itag=args.itag, target=args.target) + if args.caption_code: + download_caption( + youtube=youtube, lang_code=args.caption_code, target=args.target + ) + if args.resolution: + download_by_resolution( + youtube=youtube, resolution=args.resolution, target=args.target + ) + if args.audio: + download_audio( + youtube=youtube, filetype=args.audio, target=args.target + ) + if args.ffmpeg: + ffmpeg_process( + youtube=youtube, resolution=args.ffmpeg, target=args.target + ) + + +def _parse_args( + parser: argparse.ArgumentParser, args: Optional[List] = None +) -> argparse.Namespace: + parser.add_argument( + "url", help="The YouTube /watch or /playlist url", nargs="?" 
+ ) + parser.add_argument( + "--version", action="version", version="%(prog)s " + __version__, + ) + parser.add_argument( + "--itag", type=int, help="The itag for the desired stream", + ) + parser.add_argument( + "-r", + "--resolution", + type=str, + help="The resolution for the desired stream", + ) + parser.add_argument( + "-l", + "--list", + action="store_true", + help=( + "The list option causes pytube cli to return a list of streams " + "available to download" + ), + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + dest="verbose", + help="Set logger output to verbose output.", + ) + parser.add_argument( + "--logfile", + action="store", + help="logging debug and error messages into a log file", + ) + parser.add_argument( + "--build-playback-report", + action="store_true", + help="Save the html and js to disk", + ) + parser.add_argument( + "-c", + "--caption-code", + type=str, + help=( + "Download srt captions for given language code. " + "Prints available language codes if no argument given" + ), + ) + parser.add_argument( + '-lc', + '--list-captions', + action='store_true', + help=( + "List available caption codes for a video" + ) + ) + parser.add_argument( + "-t", + "--target", + help=( + "The output directory for the downloaded stream. " + "Default is current working directory" + ), + ) + parser.add_argument( + "-a", + "--audio", + const="mp4", + nargs="?", + help=( + "Download the audio for a given URL at the highest bitrate available. " + "Defaults to mp4 format if none is specified" + ), + ) + parser.add_argument( + "-f", + "--ffmpeg", + const="best", + nargs="?", + help=( + "Downloads the audio and video stream for resolution provided. " + "If no resolution is provided, downloads the best resolution. 
" + "Runs the command line program ffmpeg to combine the audio and video" + ), + ) + + return parser.parse_args(args) + + +def build_playback_report(youtube: YouTube) -> None: + """Serialize the request data to json for offline debugging. + + :param YouTube youtube: + A YouTube object. + """ + ts = int(dt.datetime.utcnow().timestamp()) + fp = os.path.join(os.getcwd(), f"yt-video-{youtube.video_id}-{ts}.json.gz") + + js = youtube.js + watch_html = youtube.watch_html + vid_info = youtube.vid_info + + with gzip.open(fp, "wb") as fh: + fh.write( + json.dumps( + { + "url": youtube.watch_url, + "js": js, + "watch_html": watch_html, + "video_info": vid_info, + } + ).encode("utf8"), + ) + + +def display_progress_bar( + bytes_received: int, filesize: int, ch: str = "█", scale: float = 0.55 +) -> None: + """Display a simple, pretty progress bar. + + Example: + ~~~~~~~~ + PSY - GANGNAM STYLE(강남스타일) MV.mp4 + ↳ |███████████████████████████████████████| 100.0% + + :param int bytes_received: + The delta between the total file size (bytes) and bytes already + written to disk. + :param int filesize: + File size of the media stream in bytes. + :param str ch: + Character to use for presenting progress segment. + :param float scale: + Scale multiplier to reduce progress bar size. 
+ + """ + columns = shutil.get_terminal_size().columns + max_width = int(columns * scale) + + filled = int(round(max_width * bytes_received / float(filesize))) + remaining = max_width - filled + progress_bar = ch * filled + " " * remaining + percent = round(100.0 * bytes_received / float(filesize), 1) + text = f" ↳ |{progress_bar}| {percent}%\r" + sys.stdout.write(text) + sys.stdout.flush() + + +# noinspection PyUnusedLocal +def on_progress( + stream: Stream, chunk: bytes, bytes_remaining: int +) -> None: # pylint: disable=W0613 + filesize = stream.filesize + bytes_received = filesize - bytes_remaining + display_progress_bar(bytes_received, filesize) + + +def _download( + stream: Stream, + target: Optional[str] = None, + filename: Optional[str] = None, +) -> None: + filesize_megabytes = stream.filesize // 1048576 + print(f"{filename or stream.default_filename} | {filesize_megabytes} MB") + file_path = stream.get_file_path(filename=filename, output_path=target) + if stream.exists_at_path(file_path): + print(f"Already downloaded at:\n{file_path}") + return + + stream.download(output_path=target, filename=filename) + sys.stdout.write("\n") + + +def _unique_name(base: str, subtype: str, media_type: str, target: str) -> str: + """ + Given a base name, the file format, and the target directory, will generate + a filename unique for that directory and file format. + :param str base: + The given base-name. + :param str subtype: + The filetype of the video which will be downloaded. + :param str media_type: + The media_type of the file, ie. "audio" or "video" + :param Path target: + Target directory for download. 
+ """ + counter = 0 + while True: + file_name = f"{base}_{media_type}_{counter}" + file_path = os.path.join(target, f"{file_name}.{subtype}") + if not os.path.exists(file_path): + return file_name + counter += 1 + + +def ffmpeg_process( + youtube: YouTube, resolution: str, target: Optional[str] = None +) -> None: + """ + Decides the correct video stream to download, then calls _ffmpeg_downloader. + + :param YouTube youtube: + A valid YouTube object. + :param str resolution: + YouTube video resolution. + :param str target: + Target directory for download + """ + youtube.register_on_progress_callback(on_progress) + target = target or os.getcwd() + + if resolution == "best": + highest_quality_stream = ( + youtube.streams.filter(progressive=False) + .order_by("resolution") + .last() + ) + mp4_stream = ( + youtube.streams.filter(progressive=False, subtype="mp4") + .order_by("resolution") + .last() + ) + if highest_quality_stream.resolution == mp4_stream.resolution: + video_stream = mp4_stream + else: + video_stream = highest_quality_stream + else: + video_stream = youtube.streams.filter( + progressive=False, resolution=resolution, subtype="mp4" + ).first() + if not video_stream: + video_stream = youtube.streams.filter( + progressive=False, resolution=resolution + ).first() + if video_stream is None: + print(f"Could not find a stream with resolution: {resolution}") + print("Try one of these:") + display_streams(youtube) + sys.exit() + + audio_stream = youtube.streams.get_audio_only(video_stream.subtype) + if not audio_stream: + audio_stream = ( + youtube.streams.filter(only_audio=True).order_by("abr").last() + ) + if not audio_stream: + print("Could not find an audio only stream") + sys.exit() + _ffmpeg_downloader( + audio_stream=audio_stream, video_stream=video_stream, target=target + ) + + +def _ffmpeg_downloader( + audio_stream: Stream, video_stream: Stream, target: str +) -> None: + """ + Given a YouTube Stream object, finds the correct audio stream, downloads them 
both + giving them a unique name, them uses ffmpeg to create a new file with the audio + and video from the previously downloaded files. Then deletes the original adaptive + streams, leaving the combination. + + :param Stream audio_stream: + A valid Stream object representing the audio to download + :param Stream video_stream: + A valid Stream object representing the video to download + :param Path target: + A valid Path object + """ + video_unique_name = _unique_name( + safe_filename(video_stream.title), + video_stream.subtype, + "video", + target=target, + ) + audio_unique_name = _unique_name( + safe_filename(video_stream.title), + audio_stream.subtype, + "audio", + target=target, + ) + _download(stream=video_stream, target=target, filename=video_unique_name) + print("Loading audio...") + _download(stream=audio_stream, target=target, filename=audio_unique_name) + + video_path = os.path.join( + target, f"{video_unique_name}.{video_stream.subtype}" + ) + audio_path = os.path.join( + target, f"{audio_unique_name}.{audio_stream.subtype}" + ) + final_path = os.path.join( + target, f"{safe_filename(video_stream.title)}.{video_stream.subtype}" + ) + + subprocess.run( # nosec + [ + "ffmpeg", + "-i", + video_path, + "-i", + audio_path, + "-codec", + "copy", + final_path, + ] + ) + os.unlink(video_path) + os.unlink(audio_path) + + +def download_by_itag( + youtube: YouTube, itag: int, target: Optional[str] = None +) -> None: + """Start downloading a YouTube video. + + :param YouTube youtube: + A valid YouTube object. + :param int itag: + YouTube format identifier code. 
+ :param str target: + Target directory for download + """ + stream = youtube.streams.get_by_itag(itag) + if stream is None: + print(f"Could not find a stream with itag: {itag}") + print("Try one of these:") + display_streams(youtube) + sys.exit() + + youtube.register_on_progress_callback(on_progress) + + try: + _download(stream, target=target) + except KeyboardInterrupt: + sys.exit() + + +def download_by_resolution( + youtube: YouTube, resolution: str, target: Optional[str] = None +) -> None: + """Start downloading a YouTube video. + + :param YouTube youtube: + A valid YouTube object. + :param str resolution: + YouTube video resolution. + :param str target: + Target directory for download + """ + # TODO(nficano): allow dash itags to be selected + stream = youtube.streams.get_by_resolution(resolution) + if stream is None: + print(f"Could not find a stream with resolution: {resolution}") + print("Try one of these:") + display_streams(youtube) + sys.exit() + + youtube.register_on_progress_callback(on_progress) + + try: + _download(stream, target=target) + except KeyboardInterrupt: + sys.exit() + + +def download_highest_resolution_progressive( + youtube: YouTube, resolution: str, target: Optional[str] = None +) -> None: + """Start downloading the highest resolution progressive stream. + + :param YouTube youtube: + A valid YouTube object. + :param str resolution: + YouTube video resolution. + :param str target: + Target directory for download + """ + youtube.register_on_progress_callback(on_progress) + try: + stream = youtube.streams.get_highest_resolution() + except exceptions.VideoUnavailable as err: + print(f"No video streams available: {err}") + else: + try: + _download(stream, target=target) + except KeyboardInterrupt: + sys.exit() + + +def display_streams(youtube: YouTube) -> None: + """Probe YouTube video and lists its available formats. + + :param YouTube youtube: + A valid YouTube watch URL. 
+ + """ + for stream in youtube.streams: + print(stream) + + +def _print_available_captions(captions: CaptionQuery) -> None: + print( + f"Available caption codes are: {', '.join(c.code for c in captions)}" + ) + + +def download_caption( + youtube: YouTube, lang_code: Optional[str], target: Optional[str] = None +) -> None: + """Download a caption for the YouTube video. + + :param YouTube youtube: + A valid YouTube object. + :param str lang_code: + Language code desired for caption file. + Prints available codes if the value is None + or the desired code is not available. + :param str target: + Target directory for download + """ + try: + caption = youtube.captions[lang_code] + downloaded_path = caption.download( + title=youtube.title, output_path=target + ) + print(f"Saved caption file to: {downloaded_path}") + except KeyError: + print(f"Unable to find caption with code: {lang_code}") + _print_available_captions(youtube.captions) + + +def download_audio( + youtube: YouTube, filetype: str, target: Optional[str] = None +) -> None: + """ + Given a filetype, downloads the highest quality available audio stream for a + YouTube video. + + :param YouTube youtube: + A valid YouTube object. + :param str filetype: + Desired file format to download. + :param str target: + Target directory for download + """ + audio = ( + youtube.streams.filter(only_audio=True, subtype=filetype) + .order_by("abr") + .last() + ) + + if audio is None: + print("No audio only stream found. 
Try one of these:") + display_streams(youtube) + sys.exit() + + youtube.register_on_progress_callback(on_progress) + + try: + _download(audio, target=target) + except KeyboardInterrupt: + sys.exit() + + +if __name__ == "__main__": + main() diff --git a/tutubo/pytube/contrib/__init__.py b/tutubo/pytube/contrib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tutubo/pytube/contrib/channel.py b/tutubo/pytube/contrib/channel.py new file mode 100644 index 0000000..aee126f --- /dev/null +++ b/tutubo/pytube/contrib/channel.py @@ -0,0 +1,315 @@ +# -*- coding: utf-8 -*- +"""Module for interacting with a user's youtube channel.""" +import json +import logging +from typing import Dict, List, Tuple, Optional, Iterable + +from tutubo.pytube import extract, Playlist, request +from tutubo.pytube.helpers import uniqueify, cache, DeferredGeneratorList + +logger = logging.getLogger(__name__) + + +class Channel(Playlist): + def __init__(self, url: str, proxies: Optional[Dict[str, str]] = None): + """Construct a :class:`Channel `. + + :param str url: + A valid YouTube channel URL. + :param proxies: + (Optional) A dictionary of proxies to use for web requests. 
@property
def playlists_html(self):
    """Return the html of the channel's /playlists page.

    The page is requested from ``self.playlists_url`` the first time it is
    needed and the response is kept on the instance.  A falsy cached value
    triggers a new request.  ``_paginate_playlists`` reads this property to
    discover the channel's playlists.

    :rtype: str
    """
    if not self._playlists_html:
        self._playlists_html = request.get(self.playlists_url)
    return self._playlists_html
@staticmethod
def _extract_playlists(raw_json: str) -> Tuple[List[str], Optional[str]]:
    """Extract playlist links from a raw json page.

    The tabs of the browse page are scanned in order and the items of the
    first ``shelfRenderer`` / ``gridRenderer`` found are used.  Items that
    carry no recognisable playlist id are skipped instead of raising.

    :param str raw_json: Input json extracted from the page or the last
        server response
    :rtype: Tuple[List[str], Optional[str]]
    :returns: Tuple containing a de-duplicated list of ``/playlist?list=``
        paths and a continuation token, if more playlists are available
    """
    initial_data = json.loads(raw_json)

    # this is the json tree structure, if the json was extracted from
    # html
    items = []
    try:
        tabs = initial_data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"]
        for tab in tabs:
            # A tab without a "tabRenderer" (or without content) is skipped
            # so it cannot abort the scan of the remaining tabs.
            content = tab.get("tabRenderer", {}).get("content")
            if not content or "sectionListRenderer" not in content:
                continue
            sections = content["sectionListRenderer"]["contents"][0][
                "itemSectionRenderer"]["contents"]
            for section in sections:
                if "shelfRenderer" in section:
                    items = section["shelfRenderer"]["content"][
                        "horizontalListRenderer"]["items"]
                    break
                if "gridRenderer" in section:
                    items = section["gridRenderer"]["items"]
                    break
            if items:
                break
    except (KeyError, IndexError, TypeError, AttributeError):
        # Unknown page layout: treated as "no playlists found".
        pass

    try:
        continuation = items[-1]["continuationItemRenderer"][
            "continuationEndpoint"]["continuationCommand"]["token"]
        items = items[:-1]
    except (KeyError, IndexError, TypeError):
        # if there is an error, no continuation is available
        continuation = None

    paths = []
    for item in items:
        # Read the id without mutating the parsed page data.
        if "gridPlaylistRenderer" in item:
            playlist_id = item["gridPlaylistRenderer"]["playlistId"]
        elif "lockupViewModel" in item:
            playlist_id = item["lockupViewModel"]["contentId"]
        elif "playlistId" in item:
            playlist_id = item["playlistId"]
        else:
            logger.debug("skipping unrecognised playlist item: %s", list(item))
            continue
        paths.append(f"/playlist?list={playlist_id}")

    # remove duplicates
    return uniqueify(paths), continuation
@property
def playlist_id(self):
    """Return the playlist id, extracting it from the input url on first use.

    The value produced by ``extract.playlist_id`` for ``self._input_url`` is
    stored on the instance and reused while it is truthy.

    :rtype: str
    """
    if not self._playlist_id:
        self._playlist_id = extract.playlist_id(self._input_url)
    return self._playlist_id
+ + :rtype: str + """ + return self.ytcfg['INNERTUBE_API_KEY'] + + def _paginate( + self, until_watch_id: Optional[str] = None + ) -> Iterable[List[str]]: + """Parse the video links from the page source, yields the /watch?v= + part from video link + + :param until_watch_id Optional[str]: YouTube Video watch id until + which the playlist should be read. + + :rtype: Iterable[List[str]] + :returns: Iterable of lists of YouTube watch ids + """ + videos_urls, continuation = self._extract_videos( + json.dumps(extract.initial_data(self.html)) + ) + if until_watch_id: + try: + trim_index = videos_urls.index(f"/watch?v={until_watch_id}") + yield videos_urls[:trim_index] + return + except ValueError: + pass + yield videos_urls + + # Extraction from a playlist only returns 100 videos at a time + # if self._extract_videos returns a continuation there are more + # than 100 songs inside a playlist, so we need to add further requests + # to gather all of them + if continuation: + load_more_url, headers, data = self._build_continuation_url(continuation) + else: + load_more_url, headers, data = None, None, None + + while load_more_url and headers and data: # there is an url found + logger.debug("load more url: %s", load_more_url) + # requesting the next page of videos with the url generated from the + # previous page, needs to be a post + req = request.post(load_more_url, extra_headers=headers, data=data) + # extract up to 100 songs from the page loaded + # returns another continuation if more videos are available + videos_urls, continuation = self._extract_videos(req) + if until_watch_id: + try: + trim_index = videos_urls.index(f"/watch?v={until_watch_id}") + yield videos_urls[:trim_index] + return + except ValueError: + pass + yield videos_urls + + if continuation: + load_more_url, headers, data = self._build_continuation_url( + continuation + ) + else: + load_more_url, headers, data = None, None, None + + def _build_continuation_url(self, continuation: str) -> Tuple[str, dict, 
dict]: + """Helper method to build the url and headers required to request + the next page of videos + + :param str continuation: Continuation extracted from the json response + of the last page + :rtype: Tuple[str, dict, dict] + :returns: Tuple of an url and required headers for the next http + request + """ + return ( + ( + # was changed to this format (and post requests) + # between 2021.03.02 and 2021.03.03 + "https://www.youtube.com/youtubei/v1/browse?key=" + f"{self.yt_api_key}" + ), + { + "X-YouTube-Client-Name": "1", + "X-YouTube-Client-Version": "2.20200720.00.02", + }, + # extra data required for post request + { + "continuation": continuation, + "context": { + "client": { + "clientName": "WEB", + "clientVersion": "2.20200720.00.02" + } + } + } + ) + + @staticmethod + def _extract_videos(raw_json: str) -> Tuple[List[str], Optional[str]]: + """Extracts videos from a raw json page + + :param str raw_json: Input json extracted from the page or the last + server response + :rtype: Tuple[List[str], Optional[str]] + :returns: Tuple containing a list of up to 100 video watch ids and + a continuation token, if more videos are available + """ + initial_data = json.loads(raw_json) + try: + # this is the json tree structure, if the json was extracted from + # html + section_contents = initial_data["contents"][ + "twoColumnBrowseResultsRenderer"][ + "tabs"][0]["tabRenderer"]["content"][ + "sectionListRenderer"]["contents"] + try: + # Playlist without submenus + important_content = section_contents[ + 0]["itemSectionRenderer"][ + "contents"][0]["playlistVideoListRenderer"] + except (KeyError, IndexError, TypeError): + # Playlist with submenus + important_content = section_contents[ + 1]["itemSectionRenderer"][ + "contents"][0]["playlistVideoListRenderer"] + videos = important_content["contents"] + except (KeyError, IndexError, TypeError): + try: + # this is the json tree structure, if the json was directly sent + # by the server in a continuation response + # no 
def trimmed(self, video_id: str) -> Iterable[str]:
    """Yield the playlist's video URLs up to, but excluding, a given video.

    i.e. if the playlist has video IDs 1,2,3,4 calling trimmed(3) yields
    the URLs of 1 and 2.

    :param str video_id:
        video ID at which the yielded playlist URLs stop
    :rtype: Iterable[str]
    :returns:
        Generator of video URLs from the playlist trimmed at the given ID
    """
    for page in self._paginate(until_watch_id=video_id):
        for watch_path in page:
            yield self._video_url(watch_path)
@property
@cache
def last_updated(self) -> Union[date, str]:
    """Extract the date that the playlist was last updated.

    For some playlists the sidebar holds a specific date (e.g.
    ``"Jan 5, 2021"``), which is returned as a ``datetime.date``.  For
    others it holds an estimate such as "1 week ago"; that text cannot be
    parsed and is returned unchanged.

    :return: Date of last playlist update where possible, else the string provided
    :rtype: Union[datetime.date, str]
    """
    last_updated_text = self.sidebar_info[0]['playlistSidebarPrimaryInfoRenderer'][
        'stats'][2]['runs'][1]['text']
    try:
        date_components = last_updated_text.split()
        month = date_components[0]
        day = date_components[1].strip(',')
        year = date_components[2]
        # NOTE(review): "%b" is matched against the current locale's month
        # abbreviations - presumably the page text is English; confirm.
        return datetime.strptime(
            f"{month} {day:0>2} {year}", "%b %d %Y"
        ).date()
    except (IndexError, ValueError):
        # IndexError: fewer than three words.  ValueError: the words are
        # not a "<month> <day> <year>" date (strptime rejects them).
        return last_updated_text
@property
def owner(self):
    """Extract the owner of the playlist.

    The name is the text of the first title run of the video owner entry
    in the second sidebar item.

    :return: Playlist owner name.
    :rtype: str
    """
    secondary_info = self.sidebar_info[1]['playlistSidebarSecondaryInfoRenderer']
    title_runs = secondary_info['videoOwner']['videoOwnerRenderer']['title']['runs']
    return title_runs[0]['text']
@property
def featured_videos(self):
    """Return summary dicts for the first videos of the playlist.

    Videos are taken from ``self.videos`` in order until six entries have
    been collected.  A video whose attributes raise ``VideoUnavailable`` is
    skipped and does not count towards that limit.

    :return: list of dicts with ``videoId``, ``url``, ``image`` and ``title``
    :rtype: list
    """
    collected = []
    for vid in self.videos:
        if len(collected) > 5:
            break
        try:
            entry = {
                "videoId": vid.video_id,
                "url": vid.watch_url,
                "image": vid.thumbnail_url,
                "title": vid.title
            }
        except VideoUnavailable:
            continue
        collected.append(entry)
    return collected
def get_next_results(self):
    """Use the stored continuation string to fetch the next set of results.

    This method does not return the results, but instead updates the results
    property.  ``fetch_and_parse`` yields ``None`` instead of a list when a
    response has no item section; such a page adds nothing, and a ``None``
    result store is replaced by an empty list before extending it.

    :raises IndexError:
        If no continuation is stored, i.e. there is nothing more to fetch.
    """
    if not self._current_continuation:
        raise IndexError("no further search results are available")

    videos, continuation = self.fetch_and_parse(self._current_continuation)
    if self._results is None:
        # The first page may have stored None as the result list.
        self._results = []
    if videos:
        self._results.extend(videos)
    self._current_continuation = continuation
+ """ + # Begin by executing the query and identifying the relevant sections + # of the results + raw_results = self.fetch_query(continuation) + + # Initial result is handled by try block, continuations by except block + try: + sections = raw_results['contents']['twoColumnSearchResultsRenderer'][ + 'primaryContents']['sectionListRenderer']['contents'] + except KeyError: + sections = raw_results['onResponseReceivedCommands'][0][ + 'appendContinuationItemsAction']['continuationItems'] + item_renderer = None + continuation_renderer = None + for s in sections: + if 'itemSectionRenderer' in s: + item_renderer = s['itemSectionRenderer'] + if 'continuationItemRenderer' in s: + continuation_renderer = s['continuationItemRenderer'] + + # If the continuationItemRenderer doesn't exist, assume no further results + if continuation_renderer: + next_continuation = continuation_renderer['continuationEndpoint'][ + 'continuationCommand']['token'] + else: + next_continuation = None + + # If the itemSectionRenderer doesn't exist, assume no results. + if item_renderer: + videos = [] + raw_video_list = item_renderer['contents'] + for video_details in raw_video_list: + # Skip over ads + if video_details.get('searchPyvRenderer', {}).get('ads', None): + continue + + # Skip "recommended" type videos e.g. 
"people also watched" and "popular X" + # that break up the search results + if 'shelfRenderer' in video_details: + continue + + # Skip auto-generated "mix" playlist results + if 'radioRenderer' in video_details: + continue + + # Skip playlist results + if 'playlistRenderer' in video_details: + continue + + # Skip channel results + if 'channelRenderer' in video_details: + continue + + # Skip 'people also searched for' results + if 'horizontalCardListRenderer' in video_details: + continue + + # Can't seem to reproduce, probably related to typo fix suggestions + if 'didYouMeanRenderer' in video_details: + continue + + # Seems to be the renderer used for the image shown on a no results page + if 'backgroundPromoRenderer' in video_details: + continue + + if 'videoRenderer' not in video_details: + logger.warning('Unexpected renderer encountered.') + logger.warning(f'Renderer name: {video_details.keys()}') + logger.warning(f'Search term: {self.query}') + logger.warning( + 'Please open an issue at ' + 'https://github.com/pytube/pytube/issues ' + 'and provide this log output.' + ) + continue + + # Extract relevant video information from the details. + # Some of this can be used to pre-populate attributes of the + # YouTube object. 
+ vid_renderer = video_details['videoRenderer'] + vid_id = vid_renderer['videoId'] + vid_url = f'https://www.youtube.com/watch?v={vid_id}' + vid_title = vid_renderer['title']['runs'][0]['text'] + vid_channel_name = vid_renderer['ownerText']['runs'][0]['text'] + vid_channel_uri = vid_renderer['ownerText']['runs'][0][ + 'navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'] + # Livestreams have "runs", non-livestreams have "simpleText", + # and scheduled releases do not have 'viewCountText' + if 'viewCountText' in vid_renderer: + if 'runs' in vid_renderer['viewCountText']: + vid_view_count_text = vid_renderer['viewCountText']['runs'][0]['text'] + else: + vid_view_count_text = vid_renderer['viewCountText']['simpleText'] + # Strip ' views' text, then remove commas + stripped_text = vid_view_count_text.split()[0].replace(',','') + if stripped_text == 'No': + vid_view_count = 0 + else: + vid_view_count = int(stripped_text) + else: + vid_view_count = 0 + if 'lengthText' in vid_renderer: + vid_length = vid_renderer['lengthText']['simpleText'] + else: + vid_length = None + + vid_metadata = { + 'id': vid_id, + 'url': vid_url, + 'title': vid_title, + 'channel_name': vid_channel_name, + 'channel_url': vid_channel_uri, + 'view_count': vid_view_count, + 'length': vid_length + } + + # Construct YouTube object from metadata and append to results + vid = YouTube(vid_metadata['url']) + vid.author = vid_metadata['channel_name'] + vid.title = vid_metadata['title'] + videos.append(vid) + else: + videos = None + + return videos, next_continuation + + def fetch_query(self, continuation=None): + """Fetch raw results from the innertube API. + + :param str continuation: + Continuation string for fetching results. + :rtype: dict + :returns: + The raw json object returned by the innertube API. 
+ """ + query_results = self._innertube_client.search(self.query, continuation) + if not self._initial_results: + self._initial_results = query_results + return query_results # noqa:R504 diff --git a/tutubo/pytube/exceptions.py b/tutubo/pytube/exceptions.py new file mode 100644 index 0000000..ec44d2a --- /dev/null +++ b/tutubo/pytube/exceptions.py @@ -0,0 +1,145 @@ +"""Library specific exception definitions.""" +from typing import Pattern, Union + + +class PytubeError(Exception): + """Base pytube exception that all others inherit. + + This is done to not pollute the built-in exceptions, which *could* result + in unintended errors being unexpectedly and incorrectly handled within + implementers code. + """ + + +class MaxRetriesExceeded(PytubeError): + """Maximum number of retries exceeded.""" + + +class HTMLParseError(PytubeError): + """HTML could not be parsed""" + + +class ExtractError(PytubeError): + """Data extraction based exception.""" + + +class RegexMatchError(ExtractError): + """Regex pattern did not return any matches.""" + + def __init__(self, caller: str, pattern: Union[str, Pattern]): + """ + :param str caller: + Calling function + :param str pattern: + Pattern that failed to match + """ + super().__init__(f"{caller}: could not find match for {pattern}") + self.caller = caller + self.pattern = pattern + + +class VideoUnavailable(PytubeError): + """Base video unavailable error.""" + def __init__(self, video_id: str): + """ + :param str video_id: + A YouTube video identifier. + """ + self.video_id = video_id + super().__init__(self.error_string) + + @property + def error_string(self): + return f'{self.video_id} is unavailable' + + +class AgeRestrictedError(VideoUnavailable): + """Video is age restricted, and cannot be accessed without OAuth.""" + def __init__(self, video_id: str): + """ + :param str video_id: + A YouTube video identifier. 
+ """ + self.video_id = video_id + super().__init__(self.video_id) + + @property + def error_string(self): + return f"{self.video_id} is age restricted, and can't be accessed without logging in." + + +class LiveStreamError(VideoUnavailable): + """Video is a live stream.""" + def __init__(self, video_id: str): + """ + :param str video_id: + A YouTube video identifier. + """ + self.video_id = video_id + super().__init__(self.video_id) + + @property + def error_string(self): + return f'{self.video_id} is streaming live and cannot be loaded' + + +class VideoPrivate(VideoUnavailable): + def __init__(self, video_id: str): + """ + :param str video_id: + A YouTube video identifier. + """ + self.video_id = video_id + super().__init__(self.video_id) + + @property + def error_string(self): + return f'{self.video_id} is a private video' + + +class RecordingUnavailable(VideoUnavailable): + def __init__(self, video_id: str): + """ + :param str video_id: + A YouTube video identifier. + """ + self.video_id = video_id + super().__init__(self.video_id) + + @property + def error_string(self): + return f'{self.video_id} does not have a live stream recording available' + + +class MembersOnly(VideoUnavailable): + """Video is members-only. + + YouTube has special videos that are only viewable to users who have + subscribed to a content creator. + ref: https://support.google.com/youtube/answer/7544492?hl=en + """ + def __init__(self, video_id: str): + """ + :param str video_id: + A YouTube video identifier. + """ + self.video_id = video_id + super().__init__(self.video_id) + + @property + def error_string(self): + return f'{self.video_id} is a members-only video' + + +class VideoRegionBlocked(VideoUnavailable): + def __init__(self, video_id: str): + """ + :param str video_id: + A YouTube video identifier. 
def recording_available(watch_html):
    """Tell whether a live stream recording can be watched.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: bool
    :returns:
        ``False`` when the page contains the "recording is not available"
        notice, ``True`` otherwise.
    """
    missing_markers = (
        'This live stream recording is not available.',
    )
    return not any(marker in watch_html for marker in missing_markers)
def playability_status(watch_html: str) -> Tuple[Optional[str], Any]:
    """Return the playability status and status explanation of a video.

    For example, a video may have a status of LOGIN_REQUIRED, and an explanation
    of "This is a private video. Please sign in to verify that you may see it."

    This explanation is what gets incorporated into the media player overlay.

    :param str watch_html:
        The html contents of the watch page.
    :rtype: tuple
    :returns:
        ``(status, explanation)``:

        - ``('LIVE_STREAM', 'Video is a live stream.')`` when the player
          response has a ``liveStreamability`` entry (the explanation is a
          plain string in this case);
        - ``(status, [reason])`` when a ``reason`` is present;
        - ``(status, messages)`` when only ``messages`` is present;
        - ``(None, [None])`` otherwise.
    """
    player_response = initial_player_response(watch_html)
    status_dict = player_response.get('playabilityStatus', {})
    if 'liveStreamability' in status_dict:
        return 'LIVE_STREAM', 'Video is a live stream.'
    if 'status' in status_dict:
        # ``reason`` takes precedence over ``messages`` when both exist.
        if 'reason' in status_dict:
            return status_dict['status'], [status_dict['reason']]
        if 'messages' in status_dict:
            return status_dict['status'], status_dict['messages']
    return None, [None]
def playlist_id(url: str) -> str:
    """Extract the ``playlist_id`` from a YouTube url.

    The id is read from the ``list`` query parameter, which covers:

    - :samp:`https://youtube.com/playlist?list={playlist_id}`
    - :samp:`https://youtube.com/watch?v={video_id}&list={playlist_id}`

    :param str url:
        A YouTube url containing a playlist id.
    :rtype: str
    :returns:
        YouTube playlist id (first ``list`` value in the query string).
    :raises KeyError:
        If the url has no ``list`` query parameter.
    """
    query_string = urlparse(url).query
    list_values = parse_qs(query_string)['list']
    return list_values[0]
def video_description_info(watch_html: str):
    """Search the watch page for the ``description`` ``simpleText`` entry.

    :param str watch_html:
        The html contents of the watch page.
    :returns:
        The full match (group 0) of the description pattern, or ``None``
        when the pattern does not match.
    """
    description_pattern = r'"(?<=description":{"simpleText":")([^}]+)'
    try:
        return regex_search(description_pattern, watch_html, group=0)
    except RegexMatchError:
        return None
def mime_type_codec(mime_type_codec: str) -> Tuple[str, List[str]]:
    """Split a manifest ``type`` value into mime type and codecs.

    The ``type`` key of the manifest serializes the mime type and the
    codecs together; this separates the two.

    **Example**:

    mime_type_codec('audio/webm; codecs="opus"') -> ('audio/webm', ['opus'])

    :param str mime_type_codec:
        String containing mime type and codecs.
    :rtype: tuple
    :returns:
        The mime type and a list of codecs.
    :raises RegexMatchError:
        If the string does not have the expected layout.
    """
    pattern = r"(\w+\/\w+)\;\scodecs=\"([a-zA-Z-0-9.,\s]*)\""
    match = re.search(pattern, mime_type_codec)
    if match is None:
        raise RegexMatchError(caller="mime_type_codec", pattern=pattern)
    mime_type = match.group(1)
    codec_field = match.group(2)
    # Codecs are comma separated, optionally padded with whitespace.
    codecs = [part.strip() for part in codec_field.split(",")]
    return mime_type, codecs
def get_ytplayer_config(html: str) -> Any:
    """Get the YouTube player configuration data from the watch html.

    Several known assignment prefixes are tried in order; the first one
    for which ``parse_for_object`` succeeds provides the result.

    :param str html:
        The html contents of the watch page.
    :returns:
        Whatever ``parse_for_object`` returns for the first prefix that
        can be parsed.
    :raises RegexMatchError:
        If none of the prefixes leads to a parseable object.
    """
    logger.debug("finding initial function name")
    for pattern in (
        r"ytplayer\.config\s*=\s*",
        r"ytInitialPlayerResponse\s*=\s*",
    ):
        try:
            return parse_for_object(html, pattern)
        except HTMLParseError as e:
            # Record the failure and fall through to the next prefix.
            logger.debug(f'Pattern failed: {pattern}')
            logger.debug(e)

    # Last resort: the PLAYER_CONFIG value inside a yt.setConfig(...) call.
    setconfig_pattern = r"yt\.setConfig\(.*['\"]PLAYER_CONFIG['\"]:\s*"
    try:
        return parse_for_object(html, setconfig_pattern)
    except HTMLParseError:
        pass

    raise RegexMatchError(
        caller="get_ytplayer_config", pattern="config_patterns, setconfig_patterns"
    )
def apply_signature(stream_manifest: Dict, vid_info: Dict, js: str) -> None:
    """Apply the decrypted signature to the stream manifest.

    Every stream whose url is not already signed gets its ``s`` value
    deciphered and written into the url as ``sig``. Unless the url carries
    ``ratebypass``, the ``n`` query parameter is recomputed as well. The
    manifest entries are updated in place.

    :param dict stream_manifest:
        Details of the media streams available.
    :param dict vid_info:
        Player response data; only
        ``playabilityStatus.liveStreamability`` is consulted, to recognise
        live streams.
    :param str js:
        The contents of the base.js asset file.
    :raises LiveStreamError:
        If a stream has no ``url`` and the video is a live stream.
    """
    cipher = Cipher(js=js)

    for i, stream in enumerate(stream_manifest):
        try:
            url: str = stream["url"]
        except KeyError:
            live_stream = (
                vid_info.get("playabilityStatus", {})
                .get("liveStreamability")
            )
            if live_stream:
                raise LiveStreamError("UNKNOWN")
            # Without a url there is nothing to sign. Falling through here
            # would either hit an unbound ``url`` or reuse the url of the
            # previous stream, so skip this entry instead.
            logger.debug("stream has no url, skipping signature")
            continue

        # 403 Forbidden fix.
        if "signature" in url or (
            "s" not in stream and ("&sig=" in url or "&lsig=" in url)
        ):
            # For certain videos, YouTube will just provide them pre-signed, in
            # which case there's no real magic to download them and we can skip
            # the whole signature descrambling entirely.
            logger.debug("signature found, skip decipher")
            continue

        signature = cipher.get_signature(ciphered_signature=stream["s"])

        logger.debug(
            "finished descrambling signature for itag=%s", stream["itag"]
        )
        parsed_url = urlparse(url)

        # parse_qs maps each key to a list of values; keep the first one.
        query_params = {
            k: v[0] for k, v in parse_qs(parsed_url.query).items()
        }
        query_params['sig'] = signature
        if 'ratebypass' not in query_params and 'n' in query_params:
            # Cipher n to get the updated value. Urls without an ``n``
            # parameter have nothing to transform.
            initial_n = list(query_params['n'])
            query_params['n'] = cipher.calculate_n(initial_n)

        url = f'{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}?{urlencode(query_params)}'  # noqa:E501

        # 403 forbidden fix
        stream_manifest[i]["url"] = url
def initial_data(watch_html: str) -> str:
    """Extract the ytInitialData json from the watch_html page.

    Two assignment forms are tried in order: ``window["ytInitialData"] =``
    and a plain ``ytInitialData =``.

    @param watch_html: Html of the watch page
    @return: the result of ``parse_for_object`` for the first form found
    @raise RegexMatchError: when neither form can be parsed
    """
    for candidate in (
        r"window\[['\"]ytInitialData['\"]]\s*=\s*",
        r"ytInitialData\s*=\s*",
    ):
        try:
            parsed = parse_for_object(watch_html, candidate)
        except HTMLParseError:
            continue
        return parsed

    raise RegexMatchError(caller='initial_data', pattern='initial_data_pattern')
+ + @param watch_html: Html of the watch page + @return: + """ + patterns = [ + r"window\[['\"]ytInitialPlayerResponse['\"]]\s*=\s*", + r"ytInitialPlayerResponse\s*=\s*" + ] + for pattern in patterns: + try: + return parse_for_object(watch_html, pattern) + except HTMLParseError: + pass + + raise RegexMatchError( + caller='initial_player_response', + pattern='initial_player_response_pattern' + ) + + +def metadata(initial_data) -> Optional[YouTubeMetadata]: + """Get the informational metadata for the video. + + e.g.: + [ + { + 'Song': '강남스타일(Gangnam Style)', + 'Artist': 'PSY', + 'Album': 'PSY SIX RULES Pt.1', + 'Licensed to YouTube by': 'YG Entertainment Inc. [...]' + } + ] + + :rtype: YouTubeMetadata + """ + try: + metadata_rows: List = initial_data["contents"]["twoColumnWatchNextResults"][ + "results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"][ + "metadataRowContainer"]["metadataRowContainerRenderer"]["rows"] + except (KeyError, IndexError): + # If there's an exception accessing this data, it probably doesn't exist. 
class DeferredGeneratorList:
    """A wrapper class for deferring list generation.

    Pytube has some continuation generators that create web calls, which means
    that any time a full list is requested, all of those web calls must be
    made at once, which could lead to slowdowns. This will allow individual
    elements to be queried, so that slowdowns only happen as necessary. For
    example, you can iterate over elements in the list without accessing them
    all simultaneously. This should allow for speed improvements for playlist
    and channel interactions.
    """

    def __init__(self, generator):
        """Construct a :class:`DeferredGeneratorList`.

        :param generator generator:
            The deferrable generator to create a wrapper for.
        """
        self.gen = generator
        # Items pulled from the generator so far, in order.
        self._elements = []
        # Cursor used by ``__next__``.
        self.iter_index = 0

    def __eq__(self, other):
        """We want to mimic list behavior for comparison."""
        return list(self) == other

    def __getitem__(self, key) -> Any:
        """Only generate items as they're asked for.

        :param key:
            An ``int`` index or a ``slice``.
        :raises TypeError:
            If ``key`` is neither an ``int`` nor a ``slice``.
        :raises IndexError:
            If the generator runs out before the requested position.
        """
        # We only allow querying with indexes.
        if not isinstance(key, (int, slice)):
            raise TypeError('Key must be either a slice or int.')

        # Work out how many leading items must exist to answer the request.
        # ``None`` means the answer depends on the total length (negative
        # positions, open-ended or reversed slices), so everything is needed.
        if isinstance(key, int):
            required = key + 1 if key >= 0 else None
        else:
            forward = key.step is None or key.step > 0
            has_negative = any(
                bound is not None and bound < 0
                for bound in (key.start, key.stop)
            )
            if key.stop is None or not forward or has_negative:
                required = None
            else:
                required = key.stop

        if required is None:
            self.generate_all()
        else:
            # Generate all elements up to the final item
            while len(self._elements) < required:
                try:
                    next_item = next(self.gen)
                except StopIteration:
                    # Not enough elements to satisfy the request.
                    raise IndexError from None
                else:
                    self._elements.append(next_item)

        return self._elements[key]

    def __iter__(self):
        """Custom iterator for dynamically generated list."""
        iter_index = 0
        while True:
            try:
                curr_item = self[iter_index]
            except IndexError:
                return
            else:
                yield curr_item
                iter_index += 1

    def __next__(self) -> Any:
        """Fetch next element in iterator."""
        try:
            curr_element = self[self.iter_index]
        except IndexError:
            raise StopIteration
        self.iter_index += 1
        return curr_element  # noqa:R504

    def __len__(self) -> int:
        """Return length of list of all items."""
        self.generate_all()
        return len(self._elements)

    def __repr__(self) -> str:
        """String representation of all items."""
        self.generate_all()
        return str(self._elements)

    def __reversed__(self):
        """Return all items as a list in reverse order."""
        self.generate_all()
        return self._elements[::-1]

    def generate_all(self):
        """Generate all items."""
        self._elements.extend(self.gen)
def safe_filename(s: str, max_length: int = 255) -> str:
    """Sanitize a string making it safe to use as a filename.

    This function was based off the limitations outlined here:
    https://en.wikipedia.org/wiki/Filename.

    Control characters and a fixed set of punctuation are deleted, then
    the result is truncated to ``max_length`` characters.

    :param str s:
        A string to make safe for use as a file name.
    :param int max_length:
        The maximum filename character length.
    :rtype: str
    :returns:
        A sanitized string.
    """
    # Characters in range 0-31 (0x00-0x1F) are not allowed in ntfs
    # filenames; ``range(32)`` includes 0x1F itself.
    control_characters = "".join(chr(i) for i in range(32))
    reserved_characters = "\"#$%'*,./:;<>?\\^|~"
    # Mapping a code point to None makes ``str.translate`` delete it.
    removal_table = dict.fromkeys(
        map(ord, control_characters + reserved_characters)
    )
    filename = s.translate(removal_table)
    return filename[:max_length]
def deprecated(reason: str) -> Callable:
    """Build a decorator that flags a function as deprecated.

    Each call of the decorated function emits a ``DeprecationWarning``
    naming the function and the given reason, then runs the function.

    :param str reason:
        Text placed in parentheses at the end of the warning message.
    :returns:
        A decorator to apply to the deprecated function.
    """
    template = "Call to deprecated function {name} ({reason})."

    def decorator(func1):
        @functools.wraps(func1)
        def new_func1(*args, **kwargs):
            text = template.format(name=func1.__name__, reason=reason)
            # Force the warning to be shown for this call, then put the
            # DeprecationWarning filter back to "default".
            warnings.simplefilter("always", DeprecationWarning)
            warnings.warn(text, category=DeprecationWarning, stacklevel=2)
            warnings.simplefilter("default", DeprecationWarning)
            return func1(*args, **kwargs)

        return new_func1

    return decorator
def uniqueify(duped_list: List) -> List:
    """Drop repeated items from a list, keeping first occurrences in order.

    :param List duped_list
        List to remove duplicates from

    :return List result
        De-duplicated list
    """
    # dict keys are unique and keep insertion order, so the keys are
    # exactly the first occurrence of every item.
    return list(dict.fromkeys(duped_list))
+ + :param str vid_id + YouTube video id + + :return dict data + Dict used to generate the json.gz file + """ + from tutubo.pytube import YouTube + gzip_filename = 'yt-video-%s-html.json.gz' % vid_id + + # Get the pytube directory in order to navigate to /tests/mocks + pytube_dir_path = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + os.path.pardir + ) + ) + pytube_mocks_path = os.path.join(pytube_dir_path, 'tests', 'mocks') + gzip_filepath = os.path.join(pytube_mocks_path, gzip_filename) + + yt = YouTube(f'https://www.youtube.com/watch?v={vid_id}') + html_data = { + 'url': yt.watch_url, + 'js': yt.js, + 'embed_html': yt.embed_html, + 'watch_html': yt.watch_html, + 'vid_info': yt.vid_info + } + + logger.info(f'Outputing json.gz file to {gzip_filepath}') + with gzip.open(gzip_filepath, 'wb') as f: + f.write(json.dumps(html_data).encode('utf-8')) + + return html_data diff --git a/tutubo/pytube/innertube.py b/tutubo/pytube/innertube.py new file mode 100644 index 0000000..21ad720 --- /dev/null +++ b/tutubo/pytube/innertube.py @@ -0,0 +1,514 @@ +"""This module is designed to interact with the innertube API. + +This module is NOT intended to be used directly by end users, as each of the +interfaces returns raw results. These should instead be parsed to extract +the useful information for the end user. +""" +# Native python imports +import json +import os +import pathlib +import time +from urllib import parse + +# Local imports +from tutubo.pytube import request + +# YouTube on TV client secrets +_client_id = '861556708454-d6dlm3lh05idd8npek18k6be8ba3oc68.apps.googleusercontent.com' +_client_secret = 'SboVhoG9s0rNafixCSGGKXAT' + +# Extracted API keys -- unclear what these are linked to. 
+_api_keys = [ + 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', + 'AIzaSyCtkvNIR1HCEwzsqK6JuE6KqpyjusIRI30', + 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', + 'AIzaSyC8UYZpvA2eknNex0Pjid0_eTLJoDu6los', + 'AIzaSyCjc_pVEDi4qsv5MtC2dMXzpIaDoRFLsxw', + 'AIzaSyDHQ9ipnphqTzDqZsbtd8_Ru4_kiKVQe2k' +] + +_default_clients = { + 'WEB': { + 'context': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20200720.00.02' + } + }, + 'header': { + 'User-Agent': 'Mozilla/5.0' + }, + 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + }, + 'ANDROID': { + 'context': { + 'client': { + 'clientName': 'ANDROID', + 'clientVersion': '17.31.35', + 'androidSdkVersion': 30 + } + }, + 'header': { + 'User-Agent': 'com.google.android.youtube/', + }, + 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + }, + 'IOS': { + 'context': { + 'client': { + 'clientName': 'IOS', + 'clientVersion': '17.33.2', + 'deviceModel': 'iPhone14,3' + } + }, + 'header': { + 'User-Agent': 'com.google.ios.youtube/' + }, + 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + }, + + 'WEB_EMBED': { + 'context': { + 'client': { + 'clientName': 'WEB_EMBEDDED_PLAYER', + 'clientVersion': '2.20210721.00.00', + 'clientScreen': 'EMBED' + } + }, + 'header': { + 'User-Agent': 'Mozilla/5.0' + }, + 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + }, + 'ANDROID_EMBED': { + 'context': { + 'client': { + 'clientName': 'ANDROID_EMBEDDED_PLAYER', + 'clientVersion': '17.31.35', + 'clientScreen': 'EMBED', + 'androidSdkVersion': 30, + } + }, + 'header': { + 'User-Agent': 'com.google.android.youtube/' + }, + 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + }, + 'IOS_EMBED': { + 'context': { + 'client': { + 'clientName': 'IOS_MESSAGES_EXTENSION', + 'clientVersion': '17.33.2', + 'deviceModel': 'iPhone14,3' + } + }, + 'header': { + 'User-Agent': 'com.google.ios.youtube/' + }, + 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + }, + + 'WEB_MUSIC': { + 'context': { + 'client': { + 'clientName': 'WEB_REMIX', + 
'clientVersion': '1.20220727.01.00', + } + }, + 'header': { + 'User-Agent': 'Mozilla/5.0' + }, + 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + }, + 'ANDROID_MUSIC': { + 'context': { + 'client': { + 'clientName': 'ANDROID_MUSIC', + 'clientVersion': '5.16.51', + 'androidSdkVersion': 30 + } + }, + 'header': { + 'User-Agent': 'com.google.android.apps.youtube.music/' + }, + 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + }, + 'IOS_MUSIC': { + 'context': { + 'client': { + 'clientName': 'IOS_MUSIC', + 'clientVersion': '5.21', + 'deviceModel': 'iPhone14,3' + } + }, + 'header': { + 'User-Agent': 'com.google.ios.youtubemusic/' + }, + 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + }, + + 'WEB_CREATOR': { + 'context': { + 'client': { + 'clientName': 'WEB_CREATOR', + 'clientVersion': '1.20220726.00.00', + } + }, + 'header': { + 'User-Agent': 'Mozilla/5.0' + }, + 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + }, + 'ANDROID_CREATOR': { + 'context': { + 'client': { + 'clientName': 'ANDROID_CREATOR', + 'clientVersion': '22.30.100', + 'androidSdkVersion': 30, + } + }, + 'header': { + 'User-Agent': 'com.google.android.apps.youtube.creator/', + }, + 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + }, + 'IOS_CREATOR': { + 'context': { + 'client': { + 'clientName': 'IOS_CREATOR', + 'clientVersion': '22.33.101', + 'deviceModel': 'iPhone14,3', + } + }, + 'header': { + 'User-Agent': 'com.google.ios.ytcreator/' + }, + 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + }, + + 'MWEB': { + 'context': { + 'client': { + 'clientName': 'MWEB', + 'clientVersion': '2.20220801.00.00', + } + }, + 'header': { + 'User-Agent': 'Mozilla/5.0' + }, + 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + }, + + 'TV_EMBED': { + 'context': { + 'client': { + 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', + 'clientVersion': '2.0', + } + }, + 'header': { + 'User-Agent': 'Mozilla/5.0' + }, + 'api_key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8' + }, +} +_token_timeout = 
1800 +_cache_dir = pathlib.Path(__file__).parent.resolve() / '__cache__' +_token_file = os.path.join(_cache_dir, 'tokens.json') + +_default_clients["ANDROID"]["context"]["client"]["clientVersion"] = "19.08.35" +_default_clients["IOS"]["context"]["client"]["clientVersion"] = "19.08.35" +_default_clients["ANDROID_EMBED"]["context"]["client"]["clientVersion"] = "19.08.35" +_default_clients["IOS_EMBED"]["context"]["client"]["clientVersion"] = "19.08.35" +_default_clients["IOS_MUSIC"]["context"]["client"]["clientVersion"] = "6.41" +_default_clients["ANDROID_MUSIC"] = _default_clients["ANDROID_CREATOR"] + + +class InnerTube: + """Object for interacting with the innertube API.""" + def __init__(self, client='ANDROID_MUSIC', use_oauth=False, allow_cache=True): + """Initialize an InnerTube object. + + :param str client: + Client to use for the object. + Default to web because it returns the most playback types. + :param bool use_oauth: + Whether or not to authenticate to YouTube. + :param bool allow_cache: + Allows caching of oauth tokens on the machine. 
def refresh_bearer_token(self, force=False):
    """Refreshes the OAuth token if necessary.

    Does nothing when OAuth is disabled, or when the stored token has not
    expired yet and ``force`` is false. Otherwise a new access token is
    requested with the stored refresh token, and the token cache is
    updated through ``cache_tokens``.

    :param bool force:
        Force-refresh the bearer token.
    """
    if not self.use_oauth:
        return
    # ``expires`` starts out as None; treat that as "already expired"
    # rather than comparing None with a timestamp.
    still_valid = self.expires is not None and self.expires > time.time()
    # Skip refresh if it's not necessary and not forced
    if still_valid and not force:
        return

    # Subtracting 30 seconds is arbitrary to avoid potential time discrepancies
    start_time = int(time.time() - 30)
    data = {
        'client_id': _client_id,
        'client_secret': _client_secret,
        'grant_type': 'refresh_token',
        'refresh_token': self.refresh_token
    }
    response = request._execute_request(
        'https://oauth2.googleapis.com/token',
        'POST',
        headers={
            'Content-Type': 'application/json'
        },
        data=data
    )
    response_data = json.loads(response.read())

    self.access_token = response_data['access_token']
    self.expires = start_time + response_data['expires_in']
    self.cache_tokens()
response_data['expires_in'] + self.cache_tokens() + + @property + def base_url(self): + """Return the base url endpoint for the innertube API.""" + return 'https://www.youtube.com/youtubei/v1' + + @property + def base_data(self): + """Return the base json data to transmit to the innertube API.""" + return { + 'context': self.context + } + + @property + def base_params(self): + """Return the base query parameters to transmit to the innertube API.""" + return { + 'key': self.api_key, + 'contentCheckOk': True, + 'racyCheckOk': True + } + + def _call_api(self, endpoint, query, data): + """Make a request to a given endpoint with the provided query parameters and data.""" + # Remove the API key if oauth is being used. + if self.use_oauth: + del query['key'] + + endpoint_url = f'{endpoint}?{parse.urlencode(query)}' + headers = { + 'Content-Type': 'application/json', + } + # Add the bearer token if applicable + if self.use_oauth: + if self.access_token: + self.refresh_bearer_token() + headers['Authorization'] = f'Bearer {self.access_token}' + else: + self.fetch_bearer_token() + headers['Authorization'] = f'Bearer {self.access_token}' + + headers.update(self.header) + + response = request._execute_request( + endpoint_url, + 'POST', + headers=headers, + data=data + ) + return json.loads(response.read()) + + def browse(self): + """Make a request to the browse endpoint. + + TODO: Figure out how we can use this + """ + # endpoint = f'{self.base_url}/browse' # noqa:E800 + ... + # return self._call_api(endpoint, query, self.base_data) # noqa:E800 + + def config(self): + """Make a request to the config endpoint. + + TODO: Figure out how we can use this + """ + # endpoint = f'{self.base_url}/config' # noqa:E800 + ... + # return self._call_api(endpoint, query, self.base_data) # noqa:E800 + + def guide(self): + """Make a request to the guide endpoint. + + TODO: Figure out how we can use this + """ + # endpoint = f'{self.base_url}/guide' # noqa:E800 + ... 
+ # return self._call_api(endpoint, query, self.base_data) # noqa:E800 + + def next(self): + """Make a request to the next endpoint. + + TODO: Figure out how we can use this + """ + # endpoint = f'{self.base_url}/next' # noqa:E800 + ... + # return self._call_api(endpoint, query, self.base_data) # noqa:E800 + + def player(self, video_id): + """Make a request to the player endpoint. + + :param str video_id: + The video id to get player info for. + :rtype: dict + :returns: + Raw player info results. + """ + endpoint = f'{self.base_url}/player' + query = { + 'videoId': video_id, + } + query.update(self.base_params) + return self._call_api(endpoint, query, self.base_data) + + def search(self, search_query, continuation=None): + """Make a request to the search endpoint. + + :param str search_query: + The query to search. + :rtype: dict + :returns: + Raw search query results. + """ + endpoint = f'{self.base_url}/search' + query = { + 'query': search_query + } + query.update(self.base_params) + data = {} + if continuation: + data['continuation'] = continuation + data.update(self.base_data) + return self._call_api(endpoint, query, data) + + def verify_age(self, video_id): + """Make a request to the age_verify endpoint. + + Notable examples of the types of video this verification step is for: + * https://www.youtube.com/watch?v=QLdAhwSBZ3w + * https://www.youtube.com/watch?v=hc0ZDaAZQT0 + + :param str video_id: + The video id to get player info for. + :rtype: dict + :returns: + Returns information that includes a URL for bypassing certain restrictions. + """ + endpoint = f'{self.base_url}/verify_age' + data = { + 'nextEndpoint': { + 'urlEndpoint': { + 'url': f'/watch?v={video_id}' + } + }, + 'setControvercy': True + } + data.update(self.base_data) + result = self._call_api(endpoint, self.base_params, data) + return result + + def get_transcript(self, video_id): + """Make a request to the get_transcript endpoint. 
+ + This is likely related to captioning for videos, but is currently untested. + """ + endpoint = f'{self.base_url}/get_transcript' + query = { + 'videoId': video_id, + } + query.update(self.base_params) + result = self._call_api(endpoint, query, self.base_data) + return result diff --git a/tutubo/pytube/itags.py b/tutubo/pytube/itags.py new file mode 100644 index 0000000..87536b1 --- /dev/null +++ b/tutubo/pytube/itags.py @@ -0,0 +1,153 @@ +"""This module contains a lookup table of YouTube's itag values.""" +from typing import Dict + +PROGRESSIVE_VIDEO = { + 5: ("240p", "64kbps"), + 6: ("270p", "64kbps"), + 13: ("144p", None), + 17: ("144p", "24kbps"), + 18: ("360p", "96kbps"), + 22: ("720p", "192kbps"), + 34: ("360p", "128kbps"), + 35: ("480p", "128kbps"), + 36: ("240p", None), + 37: ("1080p", "192kbps"), + 38: ("3072p", "192kbps"), + 43: ("360p", "128kbps"), + 44: ("480p", "128kbps"), + 45: ("720p", "192kbps"), + 46: ("1080p", "192kbps"), + 59: ("480p", "128kbps"), + 78: ("480p", "128kbps"), + 82: ("360p", "128kbps"), + 83: ("480p", "128kbps"), + 84: ("720p", "192kbps"), + 85: ("1080p", "192kbps"), + 91: ("144p", "48kbps"), + 92: ("240p", "48kbps"), + 93: ("360p", "128kbps"), + 94: ("480p", "128kbps"), + 95: ("720p", "256kbps"), + 96: ("1080p", "256kbps"), + 100: ("360p", "128kbps"), + 101: ("480p", "192kbps"), + 102: ("720p", "192kbps"), + 132: ("240p", "48kbps"), + 151: ("720p", "24kbps"), + 300: ("720p", "128kbps"), + 301: ("1080p", "128kbps"), +} + +DASH_VIDEO = { + # DASH Video + 133: ("240p", None), # MP4 + 134: ("360p", None), # MP4 + 135: ("480p", None), # MP4 + 136: ("720p", None), # MP4 + 137: ("1080p", None), # MP4 + 138: ("2160p", None), # MP4 + 160: ("144p", None), # MP4 + 167: ("360p", None), # WEBM + 168: ("480p", None), # WEBM + 169: ("720p", None), # WEBM + 170: ("1080p", None), # WEBM + 212: ("480p", None), # MP4 + 218: ("480p", None), # WEBM + 219: ("480p", None), # WEBM + 242: ("240p", None), # WEBM + 243: ("360p", None), # WEBM + 244: 
# Lookup tables of YouTube's itag values: itag -> (resolution, audio bitrate).
PROGRESSIVE_VIDEO = {
    5: ("240p", "64kbps"),
    6: ("270p", "64kbps"),
    13: ("144p", None),
    17: ("144p", "24kbps"),
    18: ("360p", "96kbps"),
    22: ("720p", "192kbps"),
    34: ("360p", "128kbps"),
    35: ("480p", "128kbps"),
    36: ("240p", None),
    37: ("1080p", "192kbps"),
    38: ("3072p", "192kbps"),
    43: ("360p", "128kbps"),
    44: ("480p", "128kbps"),
    45: ("720p", "192kbps"),
    46: ("1080p", "192kbps"),
    59: ("480p", "128kbps"),
    78: ("480p", "128kbps"),
    82: ("360p", "128kbps"),
    83: ("480p", "128kbps"),
    84: ("720p", "192kbps"),
    85: ("1080p", "192kbps"),
    91: ("144p", "48kbps"),
    92: ("240p", "48kbps"),
    93: ("360p", "128kbps"),
    94: ("480p", "128kbps"),
    95: ("720p", "256kbps"),
    96: ("1080p", "256kbps"),
    100: ("360p", "128kbps"),
    101: ("480p", "192kbps"),
    102: ("720p", "192kbps"),
    132: ("240p", "48kbps"),
    151: ("720p", "24kbps"),
    300: ("720p", "128kbps"),
    301: ("1080p", "128kbps"),
}

DASH_VIDEO = {
    # DASH Video
    133: ("240p", None),  # MP4
    134: ("360p", None),  # MP4
    135: ("480p", None),  # MP4
    136: ("720p", None),  # MP4
    137: ("1080p", None),  # MP4
    138: ("2160p", None),  # MP4
    160: ("144p", None),  # MP4
    167: ("360p", None),  # WEBM
    168: ("480p", None),  # WEBM
    169: ("720p", None),  # WEBM
    170: ("1080p", None),  # WEBM
    212: ("480p", None),  # MP4
    218: ("480p", None),  # WEBM
    219: ("480p", None),  # WEBM
    242: ("240p", None),  # WEBM
    243: ("360p", None),  # WEBM
    244: ("480p", None),  # WEBM
    245: ("480p", None),  # WEBM
    246: ("480p", None),  # WEBM
    247: ("720p", None),  # WEBM
    248: ("1080p", None),  # WEBM
    264: ("1440p", None),  # MP4
    266: ("2160p", None),  # MP4
    271: ("1440p", None),  # WEBM
    272: ("4320p", None),  # WEBM
    278: ("144p", None),  # WEBM
    298: ("720p", None),  # MP4
    299: ("1080p", None),  # MP4
    302: ("720p", None),  # WEBM
    303: ("1080p", None),  # WEBM
    308: ("1440p", None),  # WEBM
    313: ("2160p", None),  # WEBM
    315: ("2160p", None),  # WEBM
    330: ("144p", None),  # WEBM
    331: ("240p", None),  # WEBM
    332: ("360p", None),  # WEBM
    333: ("480p", None),  # WEBM
    334: ("720p", None),  # WEBM
    335: ("1080p", None),  # WEBM
    336: ("1440p", None),  # WEBM
    337: ("2160p", None),  # WEBM
    394: ("144p", None),  # MP4
    395: ("240p", None),  # MP4
    396: ("360p", None),  # MP4
    397: ("480p", None),  # MP4
    398: ("720p", None),  # MP4
    399: ("1080p", None),  # MP4
    400: ("1440p", None),  # MP4
    401: ("2160p", None),  # MP4
    402: ("4320p", None),  # MP4
    571: ("4320p", None),  # MP4
    694: ("144p", None),  # MP4
    695: ("240p", None),  # MP4
    696: ("360p", None),  # MP4
    697: ("480p", None),  # MP4
    698: ("720p", None),  # MP4
    699: ("1080p", None),  # MP4
    700: ("1440p", None),  # MP4
    701: ("2160p", None),  # MP4
    702: ("4320p", None),  # MP4
}

DASH_AUDIO = {
    # DASH Audio
    139: (None, "48kbps"),  # MP4
    140: (None, "128kbps"),  # MP4
    141: (None, "256kbps"),  # MP4
    171: (None, "128kbps"),  # WEBM
    172: (None, "256kbps"),  # WEBM
    249: (None, "50kbps"),  # WEBM
    250: (None, "70kbps"),  # WEBM
    251: (None, "160kbps"),  # WEBM
    256: (None, "192kbps"),  # MP4
    258: (None, "384kbps"),  # MP4
    325: (None, None),  # MP4
    328: (None, None),  # MP4
}

ITAGS = {
    **PROGRESSIVE_VIDEO,
    **DASH_VIDEO,
    **DASH_AUDIO,
}

HDR = [330, 331, 332, 333, 334, 335, 336, 337]
_3D = [82, 83, 84, 85, 100, 101, 102]
LIVE = [91, 92, 93, 94, 95, 96, 132, 151]


def get_format_profile(itag: int) -> Dict:
    """Get additional format information for a given itag.

    :param itag:
        YouTube format identifier code; anything ``int()`` accepts
        (e.g. ``18`` or ``"18"``).
    :rtype: dict
    :returns:
        A dict with the keys ``resolution``, ``abr``, ``is_live``,
        ``is_3d``, ``is_hdr`` and ``is_dash``. ``resolution`` and ``abr``
        are ``None`` for itags missing from ``ITAGS``.
    """
    itag = int(itag)
    res, bitrate = ITAGS.get(itag, (None, None))
    return {
        "resolution": res,
        "abr": bitrate,
        "is_live": itag in LIVE,
        "is_3d": itag in _3D,
        "is_hdr": itag in HDR,
        "is_dash": itag in DASH_AUDIO or itag in DASH_VIDEO,
    }


class YouTubeMetadata:
    """Group raw metadata rows into a list of ``{title: text}`` dicts.

    Rows are collected into the current group; a row flagged with
    ``hasDividerLine`` closes the group and starts a new one.
    """

    def __init__(self, metadata: List):
        """Parse ``metadata`` into groups.

        :param list metadata:
            Raw metadata row dicts. Only rows whose ``title`` carries a
            ``simpleText`` are considered.
        """
        self._raw_metadata: List = metadata
        self._metadata = [{}]

        for el in metadata:
            # We only add metadata to the dict if it has a simpleText title.
            if 'title' not in el or 'simpleText' not in el['title']:
                continue
            metadata_title = el['title']['simpleText']

            # A row with a missing or empty ``contents`` list contributes no
            # value instead of raising KeyError/IndexError.
            contents = (el.get('contents') or [{}])[0]
            if 'simpleText' in contents:
                self._metadata[-1][metadata_title] = contents['simpleText']
            elif 'runs' in contents:
                self._metadata[-1][metadata_title] = contents['runs'][0]['text']

            # Upon reaching a dividing line, create a new grouping
            if el.get('hasDividerLine', False):
                self._metadata.append({})

        # If we happen to create an empty dict at the end, drop it
        if self._metadata[-1] == {}:
            self._metadata = self._metadata[:-1]

    def __getitem__(self, key):
        """Return the metadata group at index ``key``."""
        return self._metadata[key]

    def __iter__(self):
        """Iterate over the metadata groups."""
        yield from self._metadata

    def __str__(self):
        """Return the grouped metadata serialized as JSON."""
        return json.dumps(self._metadata)

    @property
    def raw_metadata(self) -> List:
        """The unmodified list passed to the constructor."""
        return self._raw_metadata

    @property
    def metadata(self):
        """The list of grouped ``{title: text}`` dicts."""
        return self._metadata
class Monostate:
    """Plain holder for the download callbacks plus optional title and duration."""

    def __init__(
        self,
        on_progress: Optional[Callable[[Any, bytes, int], None]],
        on_complete: Optional[Callable[[Any, Optional[str]], None]],
        title: Optional[str] = None,
        duration: Optional[int] = None,
    ):
        """Store the given callbacks and values as attributes."""
        self.on_progress = on_progress
        self.on_complete = on_complete
        self.title = title
        self.duration = duration


def parse_for_all_objects(html, preceding_regex):
    """Parses input html to find all matches for the input starting point.

    :param str html:
        HTML to be parsed for an object.
    :param str preceding_regex:
        Regex to find the string preceding the object.
    :rtype list:
    :returns:
        A list of dicts created from parsing the objects.
    :raises HTMLParseError:
        If no match could be parsed into an object.
    """
    result = []
    for match in re.finditer(preceding_regex, html):
        try:
            obj = parse_for_object_from_startpoint(html, match.end())
        except HTMLParseError:
            # Some of the instances might fail because set is technically
            # a method of the ytcfg object. We'll skip these since they
            # don't seem relevant at the moment.
            continue
        result.append(obj)

    if not result:
        raise HTMLParseError(f'No matches for regex {preceding_regex}')

    return result


def parse_for_object(html, preceding_regex):
    """Parses input html to find and decode a JavaScript object.

    :param str html:
        HTML to be parsed for an object.
    :param str preceding_regex:
        Regex to find the string preceding the object.
    :rtype dict:
    :returns:
        A dict created from parsing the object.
    :raises HTMLParseError:
        If the regex does not match or the object cannot be parsed.
    """
    result = re.search(preceding_regex, html)
    if not result:
        raise HTMLParseError(f'No matches for regex {preceding_regex}')

    return parse_for_object_from_startpoint(html, result.end())


def find_object_from_startpoint(html, start_point):
    """Parses input html to find the end of a JavaScript object.

    :param str html:
        HTML to be parsed for an object.
    :param int start_point:
        Index of where the object starts.
    :rtype str:
    :returns:
        The text of the object, from its opening ``{`` or ``[`` through
        the matching closer (or to the end of the input if it never
        closes).
    :raises HTMLParseError:
        If nothing is left at ``start_point`` or the text there does not
        start with ``{`` or ``[``.
    """
    html = html[start_point:]
    # An empty slice used to escape as IndexError; report it the same way
    # as any other invalid start point.
    if not html or html[0] not in ('{', '['):
        raise HTMLParseError(f'Invalid start point. Start of HTML:\n{html[:20]}')

    # Characters after which a `/` opens a regex literal rather than being
    # a division operator.
    regex_preceders = ('(', ',', '=', ':', '[', '!', '&', '|', '?', '{', '}', ';')
    context_closers = {
        '{': '}',
        '[': ']',
        '"': '"',
        '/': '/'  # javascript regex
    }

    # The first character is the opener: it seeds the stack and is also the
    # "previous character" for index 1, so a regex literal placed directly
    # after the opener is recognised.
    stack = [html[0]]
    curr_char = html[0]
    last_char = html[0]
    i = 1

    while i < len(html) and stack:
        # Track the last non-whitespace character seen.
        if curr_char not in (' ', '\n'):
            last_char = curr_char
        curr_char = html[i]
        curr_context = stack[-1]

        # If we've reached a context closer, we can remove an element off the stack
        if curr_char == context_closers[curr_context]:
            stack.pop()
            i += 1
            continue

        if curr_context in ('"', '/'):
            # Strings and regex expressions can contain context openers
            # *and* closers; only a backslash matters, and it escapes the
            # following character.
            if curr_char == '\\':
                i += 2
                continue
        elif curr_char in context_closers:
            # Slash starts a regular expression depending on context
            if curr_char != '/' or last_char in regex_preceders:
                stack.append(curr_char)

        i += 1

    return html[:i]


def parse_for_object_from_startpoint(html, start_point):
    """JSONifies an object parsed from HTML.

    The object text is decoded as JSON first and, failing that, as a
    Python literal.

    :param str html:
        HTML to be parsed for an object.
    :param int start_point:
        Index of where the object starts.
    :rtype dict:
    :returns:
        A dict created from parsing the object.
    :raises HTMLParseError:
        If the object text can be decoded neither way.
    """
    full_obj = find_object_from_startpoint(html, start_point)
    try:
        return json.loads(full_obj)
    except json.decoder.JSONDecodeError:
        pass
    try:
        return ast.literal_eval(full_obj)
    except (ValueError, SyntaxError) as err:
        raise HTMLParseError('Could not parse object.') from err


def throttling_array_split(js_array):
    """Parses the throttling array into a python list of strings.

    Expects input to begin with `[` and close with `]`.

    :param str js_array:
        The javascript array, as a string.
    :rtype: list:
    :returns:
        A list of strings representing splits on `,` in the throttling array.
    """
    results = []
    remaining = js_array[1:]
    func_regex = re.compile(r"function\([^)]*\)")

    while remaining:
        if remaining.startswith('function'):
            # Handle functions separately. These can contain commas
            header_end = func_regex.search(remaining).end()
            function_body = find_object_from_startpoint(remaining, header_end)
            full_function_def = remaining[:header_end + len(function_body)]
            results.append(full_function_def)
            # Skip the separator that follows the function definition.
            remaining = remaining[len(full_function_def) + 1:]
            continue

        comma_at = remaining.find(',')
        if comma_at == -1:
            # Last element: drop the final character (the closing bracket).
            comma_at = len(remaining) - 1
        results.append(remaining[:comma_at])
        remaining = remaining[comma_at + 1:]

    return results
class StreamQuery(Sequence):
    """Interface for querying the available media streams."""

    def __init__(self, fmt_streams):
        """Construct a :class:`StreamQuery`.

        :param list fmt_streams:
            list of :class:`Stream` instances.
        """
        self.fmt_streams = fmt_streams
        self.itag_index = {int(s.itag): s for s in fmt_streams}

    def filter(
        self,
        fps=None,
        res=None,
        resolution=None,
        mime_type=None,
        type=None,
        subtype=None,
        file_extension=None,
        abr=None,
        bitrate=None,
        video_codec=None,
        audio_codec=None,
        only_audio=None,
        only_video=None,
        progressive=None,
        adaptive=None,
        is_dash=None,
        custom_filter_functions=None,
    ):
        """Apply the given filtering criterion.

        :param fps:
            (optional) The frames per second.
        :type fps:
            int or None

        :param res:
            (optional) The video resolution, either a single value or a
            collection (list, tuple or set) of accepted values.
        :type res:
            str, list or None

        :param resolution:
            (optional) Alias to ``res``; only used when ``res`` is falsy.
        :type resolution:
            str, list or None

        :param mime_type:
            (optional) Two-part identifier for file formats and format contents
            composed of a "type", a "subtype".
        :type mime_type:
            str or None

        :param type:
            (optional) Type part of the ``mime_type`` (e.g.: audio, video).
        :type type:
            str or None

        :param subtype:
            (optional) Sub-type part of the ``mime_type`` (e.g.: mp4, mov).
        :type subtype:
            str or None

        :param file_extension:
            (optional) Alias to ``subtype``.
        :type file_extension:
            str or None

        :param abr:
            (optional) Average bitrate (ABR) refers to the average amount of
            data transferred per unit of time (e.g.: 64kbps, 192kbps).
        :type abr:
            str or None

        :param bitrate:
            (optional) Alias to ``abr``.
        :type bitrate:
            str or None

        :param video_codec:
            (optional) Video compression format.
        :type video_codec:
            str or None

        :param audio_codec:
            (optional) Audio compression format.
        :type audio_codec:
            str or None

        :param bool progressive:
            Excludes adaptive streams (one file contains both audio and video
            tracks).

        :param bool adaptive:
            Excludes progressive streams (audio and video are on separate
            tracks).

        :param bool is_dash:
            Include/exclude dash streams.

        :param bool only_audio:
            Excludes streams with video tracks.

        :param bool only_video:
            Excludes streams with audio tracks.

        :param custom_filter_functions:
            (optional) Interface for defining complex filters without
            subclassing.
        :type custom_filter_functions:
            list or None

        :rtype: :class:`StreamQuery`
        :returns:
            A new query holding the streams that satisfy every criterion.
        """
        filters = []

        # Decide on the effective value first, then branch on *its* type, so
        # mixing e.g. ``res=[...]`` with ``resolution="..."`` cannot select
        # the wrong comparison.
        wanted_res = res or resolution
        if wanted_res:
            if isinstance(wanted_res, str):
                filters.append(lambda s: s.resolution == wanted_res)
            elif isinstance(wanted_res, (list, tuple, set, frozenset)):
                filters.append(lambda s: s.resolution in wanted_res)

        if fps:
            filters.append(lambda s: s.fps == fps)

        if mime_type:
            filters.append(lambda s: s.mime_type == mime_type)

        if type:
            filters.append(lambda s: s.type == type)

        wanted_subtype = subtype or file_extension
        if wanted_subtype:
            filters.append(lambda s: s.subtype == wanted_subtype)

        wanted_abr = abr or bitrate
        if wanted_abr:
            filters.append(lambda s: s.abr == wanted_abr)

        if video_codec:
            filters.append(lambda s: s.video_codec == video_codec)

        if audio_codec:
            filters.append(lambda s: s.audio_codec == audio_codec)

        if only_audio:
            filters.append(
                lambda s: s.includes_audio_track and not s.includes_video_track
            )

        if only_video:
            filters.append(
                lambda s: s.includes_video_track and not s.includes_audio_track
            )

        if progressive:
            filters.append(lambda s: s.is_progressive)

        if adaptive:
            filters.append(lambda s: s.is_adaptive)

        if custom_filter_functions:
            filters.extend(custom_filter_functions)

        # ``is_dash`` is tri-state: None means "do not filter on it".
        if is_dash is not None:
            filters.append(lambda s: s.is_dash == is_dash)

        return self._filter(filters)

    def _filter(self, filters: List[Callable]) -> "StreamQuery":
        """Return a new query with the streams accepted by every filter."""
        kept = [
            s for s in self.fmt_streams
            if all(accept(s) for accept in filters)
        ]
        return StreamQuery(kept)

    def order_by(self, attribute_name: str) -> "StreamQuery":
        """Apply a sort order. Filters out streams that do not have the attribute.

        String values are ordered by the integer formed from their digits
        (e.g. ``"720p"`` sorts as 720); when that is not possible, and for
        non-string values, the plain values are compared.

        :param str attribute_name:
            The name of the attribute to sort by.
        """
        # The ``None`` default drops streams that lack the attribute
        # entirely, as well as those where it is set to None.
        has_attribute = [
            s
            for s in self.fmt_streams
            if getattr(s, attribute_name, None) is not None
        ]

        def numeric_key(s):
            digits = "".join(
                ch for ch in getattr(s, attribute_name) if ch.isdigit()
            )
            return int(digits)

        # Check that the attributes have string values.
        if has_attribute and isinstance(
            getattr(has_attribute[0], attribute_name), str
        ):
            # Try to return a StreamQuery sorted by the integer representations
            # of the values.
            try:
                return StreamQuery(sorted(has_attribute, key=numeric_key))
            except ValueError:
                # A value without digits; fall back to comparing raw values.
                pass

        return StreamQuery(
            sorted(has_attribute, key=lambda s: getattr(s, attribute_name))
        )

    def desc(self) -> "StreamQuery":
        """Return a new query with the streams in reversed order.

        :rtype: :class:`StreamQuery`
        """
        return StreamQuery(self.fmt_streams[::-1])

    def asc(self) -> "StreamQuery":
        """Return this query unchanged (streams keep their current order).

        :rtype: :class:`StreamQuery`
        """
        return self

    def get_by_itag(self, itag: int) -> Optional[Stream]:
        """Get the corresponding :class:`Stream` for a given itag.

        :param int itag:
            YouTube format identifier code.
        :rtype: :class:`Stream` or None
        :returns:
            The :class:`Stream` matching the given itag or None if
            not found.
        """
        return self.itag_index.get(int(itag))

    def get_by_resolution(self, resolution: str) -> Optional[Stream]:
        """Get the corresponding :class:`Stream` for a given resolution.

        Stream must be a progressive mp4.

        :param str resolution:
            Video resolution i.e. "720p", "480p", "360p", "240p", "144p"
        :rtype: :class:`Stream` or None
        :returns:
            The first matching :class:`Stream` or None if not found.
        """
        return self.filter(
            progressive=True, subtype="mp4", resolution=resolution
        ).first()

    def get_lowest_resolution(self) -> Optional[Stream]:
        """Get lowest resolution stream that is a progressive mp4.

        :rtype: :class:`Stream` or None
        :returns:
            The matching :class:`Stream` or None if not found.
        """
        return (
            self.filter(progressive=True, subtype="mp4")
            .order_by("resolution")
            .first()
        )

    def get_highest_resolution(self) -> Optional[Stream]:
        """Get highest resolution stream that is a progressive video.

        :rtype: :class:`Stream` or None
        :returns:
            The matching :class:`Stream` or None if not found.
        """
        return self.filter(progressive=True).order_by("resolution").last()

    def get_audio_only(self, subtype: str = "mp4") -> Optional[Stream]:
        """Get highest bitrate audio stream for given codec (defaults to mp4)

        :param str subtype:
            Audio subtype, defaults to mp4
        :rtype: :class:`Stream` or None
        :returns:
            The matching :class:`Stream` or None if not found.
        """
        return (
            self.filter(only_audio=True, subtype=subtype)
            .order_by("abr")
            .last()
        )

    def otf(self, is_otf: bool = False) -> "StreamQuery":
        """Filter stream by OTF, useful if some streams have 404 URLs

        :param bool is_otf: Set to False to retrieve only non-OTF streams
        :rtype: :class:`StreamQuery`
        :returns: A StreamQuery object with otf filtered streams
        """
        return self._filter([lambda s: s.is_otf == is_otf])

    def first(self) -> Optional[Stream]:
        """Get the first :class:`Stream` in the results.

        :rtype: :class:`Stream` or None
        :returns:
            the first result of this query or None if the result doesn't
            contain any streams.
        """
        try:
            return self.fmt_streams[0]
        except IndexError:
            return None

    def last(self):
        """Get the last :class:`Stream` in the results.

        :rtype: :class:`Stream` or None
        :returns:
            Return the last result of this query or None if the result
            doesn't contain any streams.
        """
        try:
            return self.fmt_streams[-1]
        except IndexError:
            return None

    @deprecated("Get the size of this list directly using len()")
    def count(self, value: Optional[str] = None) -> int:  # pragma: no cover
        """Get the count of items in the list.

        :param value:
            (optional) When truthy, count occurrences of this value
            instead of returning the number of streams.
        :rtype: int
        """
        if value:
            return self.fmt_streams.count(value)

        return len(self)

    @deprecated("This object can be treated as a list, all() is useless")
    def all(self) -> List[Stream]:  # pragma: no cover
        """Get all the results represented by this query as a list.

        :rtype: list
        """
        return self.fmt_streams

    def __getitem__(self, i: Union[slice, int]):
        return self.fmt_streams[i]

    def __len__(self) -> int:
        return len(self.fmt_streams)

    def __repr__(self) -> str:
        return f"{self.fmt_streams}"


class CaptionQuery(Mapping):
    """Interface for querying the available captions."""

    def __init__(self, captions: List[Caption]):
        """Construct a :class:`CaptionQuery`.

        :param list captions:
            list of :class:`Caption` instances, indexed here by their
            ``code`` attribute.
        """
        self.lang_code_index = {c.code: c for c in captions}

    @deprecated(
        "This object can be treated as a dictionary, i.e. captions['en']"
    )
    def get_by_language_code(
        self, lang_code: str
    ) -> Optional[Caption]:  # pragma: no cover
        """Get the :class:`Caption` for a given ``lang_code``.

        :param str lang_code:
            The code that identifies the caption language.
        :rtype: :class:`Caption` or None
        :returns:
            The :class:`Caption` matching the given ``lang_code`` or
            None if it does not exist.
        """
        return self.lang_code_index.get(lang_code)

    @deprecated("This object can be treated as a dictionary")
    def all(self) -> List[Caption]:  # pragma: no cover
        """Get all the results represented by this query as a list.

        :rtype: list
        """
        return list(self.lang_code_index.values())

    def __getitem__(self, i: str):
        return self.lang_code_index[i]

    def __len__(self) -> int:
        return len(self.lang_code_index)

    def __iter__(self):
        # NOTE(review): this yields the Caption objects, not the language
        # codes, so the Mapping mixins (keys(), items(), ``in``) do not
        # behave like a dict's. Left unchanged because callers may iterate
        # the query to get captions - confirm before altering.
        return iter(self.lang_code_index.values())

    def __repr__(self) -> str:
        return f"{self.lang_code_index}"
+ + :rtype: list + + """ + return list(self.lang_code_index.values()) + + def __getitem__(self, i: str): + return self.lang_code_index[i] + + def __len__(self) -> int: + return len(self.lang_code_index) + + def __iter__(self): + return iter(self.lang_code_index.values()) + + def __repr__(self) -> str: + return f"{self.lang_code_index}" diff --git a/tutubo/pytube/request.py b/tutubo/pytube/request.py new file mode 100644 index 0000000..2d747ce --- /dev/null +++ b/tutubo/pytube/request.py @@ -0,0 +1,269 @@ +"""Implements a simple wrapper around urlopen.""" +import http.client +import json +import logging +import re +import socket +from functools import lru_cache +from urllib import parse +from urllib.error import URLError +from urllib.request import Request, urlopen + +from tutubo.pytube.exceptions import RegexMatchError, MaxRetriesExceeded +from tutubo.pytube.helpers import regex_search + +logger = logging.getLogger(__name__) +default_range_size = 9437184 # 9MB + + +def _execute_request( + url, + method=None, + headers=None, + data=None, + timeout=socket._GLOBAL_DEFAULT_TIMEOUT +): + base_headers = {"User-Agent": "Mozilla/5.0", "accept-language": "en-US,en"} + if headers: + base_headers.update(headers) + if data: + # encode data for request + if not isinstance(data, bytes): + data = bytes(json.dumps(data), encoding="utf-8") + if url.lower().startswith("http"): + request = Request(url, headers=base_headers, method=method, data=data) + else: + raise ValueError("Invalid URL") + return urlopen(request, timeout=timeout) # nosec + + +def get(url, extra_headers=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): + """Send an http GET request. + + :param str url: + The URL to perform the GET request for. 
+ :param dict extra_headers: + Extra headers to add to the request + :rtype: str + :returns: + UTF-8 encoded string of response + """ + if extra_headers is None: + extra_headers = {} + response = _execute_request(url, headers=extra_headers, timeout=timeout) + return response.read().decode("utf-8") + + +def post(url, extra_headers=None, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): + """Send an http POST request. + + :param str url: + The URL to perform the POST request for. + :param dict extra_headers: + Extra headers to add to the request + :param dict data: + The data to send on the POST request + :rtype: str + :returns: + UTF-8 encoded string of response + """ + # could technically be implemented in get, + # but to avoid confusion implemented like this + if extra_headers is None: + extra_headers = {} + if data is None: + data = {} + # required because the youtube servers are strict on content type + # raises HTTPError [400]: Bad Request otherwise + extra_headers.update({"Content-Type": "application/json"}) + response = _execute_request( + url, + headers=extra_headers, + data=data, + timeout=timeout + ) + return response.read().decode("utf-8") + + +def seq_stream( + url, + timeout=socket._GLOBAL_DEFAULT_TIMEOUT, + max_retries=0 +): + """Read the response in sequence. + :param str url: The URL to perform the GET request for. + :rtype: Iterable[bytes] + """ + # YouTube expects a request sequence number as part of the parameters. + split_url = parse.urlsplit(url) + base_url = '%s://%s/%s?' % (split_url.scheme, split_url.netloc, split_url.path) + + querys = dict(parse.parse_qsl(split_url.query)) + + # The 0th sequential request provides the file headers, which tell us + # information about how the file is segmented. 
def seq_stream(
    url,
    timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
    max_retries=0
):
    """Read the response in sequence.

    Yields the bytes of request ``sq=0`` first, then those of each
    numbered segment announced by the ``Segment-Count`` line found in
    that first response.

    :param str url: The URL to perform the GET request for.
    :param timeout: Timeout forwarded to every request.
    :param int max_retries: Retry budget forwarded to :func:`stream`.
    :rtype: Iterable[bytes]
    """
    # YouTube expects a request sequence number as part of the parameters.
    split_url = parse.urlsplit(url)
    base_url = '%s://%s/%s?' % (split_url.scheme, split_url.netloc, split_url.path)

    querys = dict(parse.parse_qsl(split_url.query))

    # The 0th sequential request provides the file headers, which tell us
    # information about how the file is segmented.
    querys['sq'] = 0
    url = base_url + parse.urlencode(querys)

    header_parts = []
    for chunk in stream(url, timeout=timeout, max_retries=max_retries):
        yield chunk
        header_parts.append(chunk)
    segment_data = b''.join(header_parts)

    # We can then parse the header to find the number of segments.
    # Starting at 0 means a response without a Segment-Count line simply
    # yields no further segments instead of hitting an unbound variable.
    segment_count = 0
    segment_count_pattern = re.compile(b'Segment-Count: (\\d+)')
    for line in segment_data.split(b'\r\n'):
        match = segment_count_pattern.search(line)
        if match:
            segment_count = int(match.group(1).decode('utf-8'))

    # We request these segments sequentially to build the file.
    for seq_num in range(1, segment_count + 1):
        # Create sequential request URL
        querys['sq'] = seq_num
        url = base_url + parse.urlencode(querys)

        yield from stream(url, timeout=timeout, max_retries=max_retries)


def stream(
    url,
    timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
    max_retries=0
):
    """Read the response in chunks.

    The file is requested in ranges of at most ``default_range_size``
    bytes by appending ``&range=start-stop`` to ``url``.

    :param str url: The URL to perform the GET request for.
    :param timeout: Timeout forwarded to every request.
    :param int max_retries:
        Number of additional attempts per range after a socket timeout or
        an incomplete read.
    :rtype: Iterable[bytes]
    :raises MaxRetriesExceeded:
        If a range still fails after ``1 + max_retries`` attempts.
    """
    file_size: int = default_range_size  # fake filesize to start
    downloaded = 0
    while downloaded < file_size:
        stop_pos = min(downloaded + default_range_size, file_size) - 1
        tries = 0

        # Attempt to make the request multiple times as necessary.
        while True:
            # If the max retries is exceeded, raise an exception
            if tries >= 1 + max_retries:
                raise MaxRetriesExceeded()

            # Try to execute the request, ignoring socket timeouts
            try:
                response = _execute_request(
                    url + f"&range={downloaded}-{stop_pos}",
                    method="GET",
                    timeout=timeout
                )
            except URLError as e:
                # We only want to skip over timeout errors, and
                # raise any other URLError exceptions
                if not isinstance(e.reason, socket.timeout):
                    raise
            except http.client.IncompleteRead:
                # Allow retries on IncompleteRead errors for unreliable connections
                pass
            else:
                # On a successful request, break from loop
                break
            tries += 1

        # While the size is still the placeholder, probe for the real one.
        if file_size == default_range_size:
            try:
                # The probe response is only needed for its headers, so
                # release it right away.
                with _execute_request(
                    url + f"&range={0}-{99999999999}",
                    method="GET",
                    timeout=timeout
                ) as resp:
                    content_length = resp.info()["Content-Length"]
                # int(None) raises TypeError when the header is absent.
                file_size = int(content_length)
            except (KeyError, IndexError, ValueError, TypeError) as e:
                logger.error(e)

        with response:
            while True:
                chunk = response.read()
                if not chunk:
                    break
                downloaded += len(chunk)
                yield chunk


@lru_cache()
def filesize(url):
    """Fetch size in bytes of file at given URL

    :param str url: The URL to get the size of
    :returns: int: size in bytes of remote file
    """
    return int(head(url)["content-length"])
+ querys['sq'] = 0 + url = base_url + parse.urlencode(querys) + response = _execute_request( + url, method="GET" + ) + + response_value = response.read() + # The file header must be added to the total filesize + total_filesize += len(response_value) + + # We can then parse the header to find the number of segments + segment_count = 0 + stream_info = response_value.split(b'\r\n') + segment_regex = b'Segment-Count: (\\d+)' + for line in stream_info: + # One of the lines should contain the segment count, but we don't know + # which, so we need to iterate through the lines to find it + try: + segment_count = int(regex_search(segment_regex, line, 1)) + except RegexMatchError: + pass + + if segment_count == 0: + raise RegexMatchError('seq_filesize', segment_regex) + + # We make HEAD requests to the segments sequentially to find the total filesize. + seq_num = 1 + while seq_num <= segment_count: + # Create sequential request URL + querys['sq'] = seq_num + url = base_url + parse.urlencode(querys) + + total_filesize += int(head(url)['content-length']) + seq_num += 1 + return total_filesize + + +def head(url): + """Fetch headers returned http GET request. + + :param str url: + The URL to perform the GET request for. + :rtype: dict + :returns: + dictionary of lowercase headers + """ + response_headers = _execute_request(url, method="HEAD").info() + return {k.lower(): v for k, v in response_headers.items()} diff --git a/tutubo/pytube/streams.py b/tutubo/pytube/streams.py new file mode 100644 index 0000000..357484c --- /dev/null +++ b/tutubo/pytube/streams.py @@ -0,0 +1,436 @@ +""" +This module contains a container for stream manifest data. + +A container object for the media stream (video only / audio only / video+audio +combined). This was referred to as ``Video`` in the legacy pytube version, but +has been renamed to accommodate DASH (which serves the audio and video +separately). 
+""" +import logging +import os +from math import ceil + +from datetime import datetime +from typing import BinaryIO, Dict, Optional, Tuple +from urllib.error import HTTPError +from urllib.parse import parse_qs + +from tutubo.pytube import extract, request +from tutubo.pytube.helpers import safe_filename, target_directory +from tutubo.pytube.itags import get_format_profile +from tutubo.pytube.monostate import Monostate + +logger = logging.getLogger(__name__) + + +class Stream: + """Container for stream manifest data.""" + + def __init__( + self, stream: Dict, monostate: Monostate + ): + """Construct a :class:`Stream `. + + :param dict stream: + The unscrambled data extracted from YouTube. + :param dict monostate: + Dictionary of data shared across all instances of + :class:`Stream `. + """ + # A dictionary shared between all instances of :class:`Stream ` + # (Borg pattern). + self._monostate = monostate + + self.url = stream["url"] # signed download url + self.itag = int( + stream["itag"] + ) # stream format id (youtube nomenclature) + + # set type and codec info + + # 'video/webm; codecs="vp8, vorbis"' -> 'video/webm', ['vp8', 'vorbis'] + self.mime_type, self.codecs = extract.mime_type_codec(stream["mimeType"]) + + # 'video/webm' -> 'video', 'webm' + self.type, self.subtype = self.mime_type.split("/") + + # ['vp8', 'vorbis'] -> video_codec: vp8, audio_codec: vorbis. DASH + # streams return NoneType for audio/video depending. 
+ self.video_codec, self.audio_codec = self.parse_codecs() + + self.is_otf: bool = stream["is_otf"] + self.bitrate: Optional[int] = stream["bitrate"] + + # filesize in bytes + self._filesize: Optional[int] = int(stream.get('contentLength', 0)) + + # filesize in kilobytes + self._filesize_kb: Optional[float] = float(ceil(float(stream.get('contentLength', 0)) / 1024 * 1000) / 1000) + + # filesize in megabytes + self._filesize_mb: Optional[float] = float(ceil(float(stream.get('contentLength', 0)) / 1024 / 1024 * 1000) / 1000) + + # filesize in gigabytes(fingers crossed we don't need terabytes going forward though) + self._filesize_gb: Optional[float] = float(ceil(float(stream.get('contentLength', 0)) / 1024 / 1024 / 1024 * 1000) / 1000) + + # Additional information about the stream format, such as resolution, + # frame rate, and whether the stream is live (HLS) or 3D. + itag_profile = get_format_profile(self.itag) + self.is_dash = itag_profile["is_dash"] + self.abr = itag_profile["abr"] # average bitrate (audio streams only) + if 'fps' in stream: + self.fps = stream['fps'] # Video streams only + self.resolution = itag_profile[ + "resolution" + ] # resolution (e.g.: "480p") + self.is_3d = itag_profile["is_3d"] + self.is_hdr = itag_profile["is_hdr"] + self.is_live = itag_profile["is_live"] + + @property + def is_adaptive(self) -> bool: + """Whether the stream is DASH. + + :rtype: bool + """ + # if codecs has two elements (e.g.: ['vp8', 'vorbis']): 2 % 2 = 0 + # if codecs has one element (e.g.: ['vp8']) 1 % 2 = 1 + return bool(len(self.codecs) % 2) + + @property + def is_progressive(self) -> bool: + """Whether the stream is progressive. + + :rtype: bool + """ + return not self.is_adaptive + + @property + def includes_audio_track(self) -> bool: + """Whether the stream only contains audio. + + :rtype: bool + """ + return self.is_progressive or self.type == "audio" + + @property + def includes_video_track(self) -> bool: + """Whether the stream only contains video. 
+ + :rtype: bool + """ + return self.is_progressive or self.type == "video" + + def parse_codecs(self) -> Tuple[Optional[str], Optional[str]]: + """Get the video/audio codecs from list of codecs. + + Parse a variable length sized list of codecs and returns a + constant two element tuple, with the video codec as the first element + and audio as the second. Returns None if one is not available + (adaptive only). + + :rtype: tuple + :returns: + A two element tuple with audio and video codecs. + + """ + video = None + audio = None + if not self.is_adaptive: + video, audio = self.codecs + elif self.includes_video_track: + video = self.codecs[0] + elif self.includes_audio_track: + audio = self.codecs[0] + return video, audio + + @property + def filesize(self) -> int: + """File size of the media stream in bytes. + + :rtype: int + :returns: + Filesize (in bytes) of the stream. + """ + if self._filesize == 0: + try: + self._filesize = request.filesize(self.url) + except HTTPError as e: + if e.code != 404: + raise + self._filesize = request.seq_filesize(self.url) + return self._filesize + + @property + def filesize_kb(self) -> float: + """File size of the media stream in kilobytes. + + :rtype: float + :returns: + Rounded filesize (in kilobytes) of the stream. + """ + if self._filesize_kb == 0: + try: + self._filesize_kb = float(ceil(request.filesize(self.url)/1024 * 1000) / 1000) + except HTTPError as e: + if e.code != 404: + raise + self._filesize_kb = float(ceil(request.seq_filesize(self.url)/1024 * 1000) / 1000) + return self._filesize_kb + + @property + def filesize_mb(self) -> float: + """File size of the media stream in megabytes. + + :rtype: float + :returns: + Rounded filesize (in megabytes) of the stream. 
+ """ + if self._filesize_mb == 0: + try: + self._filesize_mb = float(ceil(request.filesize(self.url)/1024/1024 * 1000) / 1000) + except HTTPError as e: + if e.code != 404: + raise + self._filesize_mb = float(ceil(request.seq_filesize(self.url)/1024/1024 * 1000) / 1000) + return self._filesize_mb + + @property + def filesize_gb(self) -> float: + """File size of the media stream in gigabytes. + + :rtype: float + :returns: + Rounded filesize (in gigabytes) of the stream. + """ + if self._filesize_gb == 0: + try: + self._filesize_gb = float(ceil(request.filesize(self.url)/1024/1024/1024 * 1000) / 1000) + except HTTPError as e: + if e.code != 404: + raise + self._filesize_gb = float(ceil(request.seq_filesize(self.url)/1024/1024/1024 * 1000) / 1000) + return self._filesize_gb + + @property + def title(self) -> str: + """Get title of video + + :rtype: str + :returns: + Youtube video title + """ + return self._monostate.title or "Unknown YouTube Video Title" + + @property + def filesize_approx(self) -> int: + """Get approximate filesize of the video + + Falls back to HTTP call if there is not sufficient information to approximate + + :rtype: int + :returns: size of video in bytes + """ + if self._monostate.duration and self.bitrate: + bits_in_byte = 8 + return int( + (self._monostate.duration * self.bitrate) / bits_in_byte + ) + + return self.filesize + + @property + def expiration(self) -> datetime: + expire = parse_qs(self.url.split("?")[1])["expire"][0] + return datetime.utcfromtimestamp(int(expire)) + + @property + def default_filename(self) -> str: + """Generate filename based on the video title. + + :rtype: str + :returns: + An os file system compatible filename. 
+ """ + filename = safe_filename(self.title) + return f"{filename}.{self.subtype}" + + def download( + self, + output_path: Optional[str] = None, + filename: Optional[str] = None, + filename_prefix: Optional[str] = None, + skip_existing: bool = True, + timeout: Optional[int] = None, + max_retries: Optional[int] = 0 + ) -> str: + """Write the media stream to disk. + + :param output_path: + (optional) Output path for writing media file. If one is not + specified, defaults to the current working directory. + :type output_path: str or None + :param filename: + (optional) Output filename (stem only) for writing media file. + If one is not specified, the default filename is used. + :type filename: str or None + :param filename_prefix: + (optional) A string that will be prepended to the filename. + For example a number in a playlist or the name of a series. + If one is not specified, nothing will be prepended + This is separate from filename so you can use the default + filename but still add a prefix. + :type filename_prefix: str or None + :param skip_existing: + (optional) Skip existing files, defaults to True + :type skip_existing: bool + :param timeout: + (optional) Request timeout length in seconds. Uses system default. + :type timeout: int + :param max_retries: + (optional) Number of retries to attempt after socket timeout. Defaults to 0. 
+ :type max_retries: int + :returns: + Path to the saved video + :rtype: str + + """ + file_path = self.get_file_path( + filename=filename, + output_path=output_path, + filename_prefix=filename_prefix, + ) + + if skip_existing and self.exists_at_path(file_path): + logger.debug(f'file {file_path} already exists, skipping') + self.on_complete(file_path) + return file_path + + bytes_remaining = self.filesize + logger.debug(f'downloading ({self.filesize} total bytes) file to {file_path}') + + with open(file_path, "wb") as fh: + try: + for chunk in request.stream( + self.url, + timeout=timeout, + max_retries=max_retries + ): + # reduce the (bytes) remainder by the length of the chunk. + bytes_remaining -= len(chunk) + # send to the on_progress callback. + self.on_progress(chunk, fh, bytes_remaining) + except HTTPError as e: + if e.code != 404: + raise + # Some adaptive streams need to be requested with sequence numbers + for chunk in request.seq_stream( + self.url, + timeout=timeout, + max_retries=max_retries + ): + # reduce the (bytes) remainder by the length of the chunk. + bytes_remaining -= len(chunk) + # send to the on_progress callback. 
+ self.on_progress(chunk, fh, bytes_remaining) + self.on_complete(file_path) + return file_path + + def get_file_path( + self, + filename: Optional[str] = None, + output_path: Optional[str] = None, + filename_prefix: Optional[str] = None, + ) -> str: + if not filename: + filename = self.default_filename + if filename_prefix: + filename = f"{filename_prefix}{filename}" + return os.path.join(target_directory(output_path), filename) + + def exists_at_path(self, file_path: str) -> bool: + return ( + os.path.isfile(file_path) + and os.path.getsize(file_path) == self.filesize + ) + + def stream_to_buffer(self, buffer: BinaryIO) -> None: + """Write the media stream to buffer + + :rtype: io.BytesIO buffer + """ + bytes_remaining = self.filesize + logger.info( + "downloading (%s total bytes) file to buffer", self.filesize, + ) + + for chunk in request.stream(self.url): + # reduce the (bytes) remainder by the length of the chunk. + bytes_remaining -= len(chunk) + # send to the on_progress callback. + self.on_progress(chunk, buffer, bytes_remaining) + self.on_complete(None) + + def on_progress( + self, chunk: bytes, file_handler: BinaryIO, bytes_remaining: int + ): + """On progress callback function. + + This function writes the binary data to the file, then checks if an + additional callback is defined in the monostate. This is exposed to + allow things like displaying a progress bar. + + :param bytes chunk: + Segment of media file binary data, not yet written to disk. + :param file_handler: + The file handle where the media is being written to. + :type file_handler: + :py:class:`io.BufferedWriter` + :param int bytes_remaining: + The delta between the total file size in bytes and amount already + downloaded. 
+ + :rtype: None + + """ + file_handler.write(chunk) + logger.debug("download remaining: %s", bytes_remaining) + if self._monostate.on_progress: + self._monostate.on_progress(self, chunk, bytes_remaining) + + def on_complete(self, file_path: Optional[str]): + """On download complete handler function. + + :param file_path: + The file handle where the media is being written to. + :type file_path: str + + :rtype: None + + """ + logger.debug("download finished") + on_complete = self._monostate.on_complete + if on_complete: + logger.debug("calling on_complete callback %s", on_complete) + on_complete(self, file_path) + + def __repr__(self) -> str: + """Printable object representation. + + :rtype: str + :returns: + A string representation of a :class:`Stream ` object. + """ + parts = ['itag="{s.itag}"', 'mime_type="{s.mime_type}"'] + if self.includes_video_track: + parts.extend(['res="{s.resolution}"', 'fps="{s.fps}fps"']) + if not self.is_adaptive: + parts.extend( + ['vcodec="{s.video_codec}"', 'acodec="{s.audio_codec}"',] + ) + else: + parts.extend(['vcodec="{s.video_codec}"']) + else: + parts.extend(['abr="{s.abr}"', 'acodec="{s.audio_codec}"']) + parts.extend(['progressive="{s.is_progressive}"', 'type="{s.type}"']) + return f"" diff --git a/tutubo/pytube/version.py b/tutubo/pytube/version.py new file mode 100644 index 0000000..e2a3cbc --- /dev/null +++ b/tutubo/pytube/version.py @@ -0,0 +1,4 @@ +__version__ = "15.0.0" + +if __name__ == "__main__": + print(__version__) diff --git a/tutubo/search.py b/tutubo/search.py index 8a4ee3d..ac6a073 100755 --- a/tutubo/search.py +++ b/tutubo/search.py @@ -1,6 +1,6 @@ import enum -from pytube.contrib.search import Search as _Search +from tutubo.pytube.contrib.search import Search as _Search from tutubo.models import * from tutubo.ytmus import *