# Reconstructed from a git diff that (1) registers the new extractors in
# yt_dlp/extractor/_extractors.py via
#     from .vanillo import VanilloIE, VanilloPlaylistIE, VanilloUserIE
# and (2) adds this new file, yt_dlp/extractor/vanillo.py.

import itertools
import json

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    float_or_none,
    int_or_none,
    parse_iso8601,
)

# NOTE: Private videos can be downloaded by adding --add-header "authorization: Bearer abcxyz",
# but won't work with --cookies-from-browser and --cookies file.txt

_API_BASE = 'https://api.vanillo.tv/v1'


class VanilloIE(InfoExtractor):
    """Extractor for single vanillo.tv videos (/v/ watch pages and /embed/ pages)."""

    _VALID_URL = r'https?://(?:dev\.|beta\.)?vanillo\.tv/(?:v|embed)/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://vanillo.tv/v/iaCi-oTmmGY',
        'info_dict': {
            'id': 'iaCi-oTmmGY',
            'title': 'Wawa',
            'description': '',
            'thumbnail': 'https://images.vanillo.tv/V6mYuajeHGsSSPRJKCdRAvvWgHFVGZ00g-ne3TZevss/h:300/aHR0cHM6Ly9pbWFnZXMuY2RuLnZhbmlsbG8udHYvdGh1bWJuYWlsL1RhUGE3TEJFTVBlS205elh2ZWdzLmF2aWY',
            'uploader_url': 'M7A',
            'upload_date': '20240309',  # YYYYMMDD format, server API provides 2024-03-09T07:56:35.636Z
            'duration': 5.71,
            'view_count': 205,
            'comment_count': 2,
            'like_count': 4,
            'dislike_count': 0,
            'average_rating': 4.2,
            'categories': ['film_and_animation'],
            'tags': ['Wawa', 'wawa', 'Wa Wa', 'wa wa', 'WaWa', 'wAwA', 'wA Wa'],
        },
    }, {
        'url': 'https://vanillo.tv/v/RhSueuQZiKF',
        'info_dict': {
            'id': 'RhSueuQZiKF',
            'title': "What's New on Vanillo - Fall Update",
            'description': '',
            'thumbnail': 'https://images.vanillo.tv/7Qfelvn1-4waFjX3rIc1FkfpB9jOJqqLlvieD5i3mlA/h:300/aHR0cHM6Ly9pbWFnZXMuY2RuLnZhbmlsbG8udHYvdGh1bWJuYWlsL3JsMmR5ajJFcnozMEphSUd0bTZyLmF2aWY',
            'uploader_url': 'Vanillo',
            'upload_date': '20231020',  # YYYYMMDD format, server API provides 2023-10-20T04:53:13.718Z
            'duration': 99.35,
            'view_count': 368,
            'comment_count': 2,
            'like_count': 20,
            'dislike_count': 0,
            'average_rating': 4.2,
            'categories': ['film_and_animation'],
            'tags': [
                'fall', 'update', 'fall update', 'autumn', 'autumn update', 'vanillo', 'new features', 'new',
                'features', 'exciting', 'language', 'switch', 'english', 'descriptive audio', 'descriptive',
                'audio', 'qualities', 'higher', 'process', 'processing', 'faster', 'fast', '2x', '4x',
                'twice', 'speed', 'speedy', 'quick', 'chapters',
            ],
        },
        'playlist_mincount': 1,
    }]

    # Bearer token picked up from cookies (if any); None => public access only
    _access_token = None

    def _real_initialize(self):
        # Called before extracting each URL: look for an access token in the
        # cookie jar once.  Without a token all API requests stay
        # unauthenticated, so only public videos are reachable.
        if not self._access_token:
            self._access_token = self._get_token_from_cookies()

    def _get_token_from_cookies(self):
        """Return the vanillo.tv access token stored in cookies, or None.

        Uses the public ``_get_cookies`` helper rather than reaching into the
        cookie jar's private ``_cookies`` mapping.
        """
        cookies = self._get_cookies('https://vanillo.tv')
        for name in ('access_token', 'ACCESS_TOKEN'):
            if name in cookies:
                self.to_screen('Found vanillo.tv access_token in cookies')
                return cookies[name].value
        return None

    def _call_api(self, path, video_id, note, **kwargs):
        """Download JSON from the vanillo API, adding the Bearer token if present.

        ``path`` is relative to the API base.  Returns the decoded JSON, or
        ``False`` when ``fatal=False`` is passed and the request fails
        (callers must guard with ``or {}``).
        """
        if self._access_token:
            kwargs['headers'] = {
                **kwargs.get('headers', {}),
                'Authorization': f'Bearer {self._access_token}',
            }
        return self._download_json(f'{_API_BASE}/{path}', video_id, note, **kwargs)

    def _get_replies(self, comment_id, limit=3):
        """Download up to ``limit`` replies for a comment (best-effort)."""
        replies_data = self._call_api(
            f'comments/{comment_id}/replies?limit={limit}&reviewing=false',
            comment_id, f'Downloading replies for comment {comment_id}', fatal=False) or {}
        if replies_data.get('status') != 'success':
            return []
        return [{
            'id': reply.get('id'),
            'author': (reply.get('profile') or {}).get('username'),
            'author_id': (reply.get('profile') or {}).get('id'),
            'text': reply.get('text'),
            'timestamp': parse_iso8601(reply.get('createdAt')),
        } for reply in (replies_data.get('data') or {}).get('comments') or []]

    def _get_comments(self, video_id, limit=10):
        """Yield all top-level comments, following pageKey pagination.

        Consumed lazily by ``extract_comments`` via ``__post_extractor``, so
        comments are only downloaded when --write-comments is given.
        """
        page_key = None
        while True:
            query = f'videos/{video_id}/comments?limit={limit}&reviewing=false&filter=high_to_low_score'
            if page_key:
                query += f'&pageKey={page_key}'
            comments_data = self._call_api(
                query, video_id, 'Downloading comments', fatal=False) or {}
            if comments_data.get('status') != 'success':
                return
            data = comments_data.get('data') or {}
            comments = data.get('comments') or []
            if not comments:
                return
            for comment in comments:
                profile = comment.get('profile') or {}
                yield {
                    'id': comment.get('id'),
                    'author': profile.get('username'),
                    'author_id': profile.get('id'),
                    'text': comment.get('text'),
                    'timestamp': parse_iso8601(comment.get('createdAt')),
                    'replies': self._get_replies(comment.get('id')),
                }
            page_key = data.get('nextPageKey')
            if not page_key:
                return

    def _real_extract(self, url):
        video_id = self._match_id(url)

        # 1) Retrieve video metadata
        try:
            video_info = self._call_api(
                f'videos/{video_id}?groups=uploader,profile.full',
                video_id, 'Downloading video info')
        except ExtractorError as e:
            # yt-dlp's HTTPError exposes .status; fall back to legacy .code,
            # then to matching the message text
            status = getattr(e.cause, 'status', None) or getattr(e.cause, 'code', None)
            if status is None and 'HTTP Error 404' in str(e):
                status = 404
            if status == 404:
                # The API answers 404 both for missing and for private videos
                self.raise_login_required(
                    'Video not found or is private. '
                    'Sign in if you have access to this video. Use --cookies or --cookies-from-browser option',
                    method=None)
            elif status == 403:
                raise ExtractorError(
                    'Your Internet provider is likely blocked. Try again with another ISP or use VPN',
                    expected=True)
            raise

        if video_info.get('status') != 'success':
            raise ExtractorError('Video info API returned an error', expected=True)
        data = video_info.get('data') or {}

        like_count = int_or_none(data.get('likes'))
        dislike_count = int_or_none(data.get('dislikes'))
        average_rating = None
        if like_count is not None and dislike_count is not None and like_count + dislike_count > 0:
            # Map the like/dislike ratio onto a 0-5 star scale
            average_rating = round(like_count / (like_count + dislike_count) * 5, 1)

        categories = data.get('category')
        if categories and not isinstance(categories, list):
            categories = [categories]

        # 2) Get a watch token (required for accessing manifests)
        watch_token_resp = self._call_api(
            'watch', video_id, 'Downloading watch token',
            data=json.dumps({'videoId': video_id}).encode(),
            headers={'Content-Type': 'application/json'})
        watch_token = (watch_token_resp.get('data') or {}).get('watchToken')
        if not watch_token:
            raise ExtractorError('Failed to retrieve watch token', expected=True)

        # 3) Resolve the HLS manifest URL from the watch token.
        # The DASH manifest is deliberately ignored: it advertises absurdly
        # padded file sizes (e.g. a 1.7 MB video becoming 15+ MB per stream),
        # so only HLS is extracted.
        manifests = self._call_api(
            f'watch/manifests?watchToken={watch_token}', video_id, 'Downloading manifests')
        hls_url = ((manifests.get('data') or {}).get('media') or {}).get('hls')

        formats, subtitles = [], {}
        if hls_url:
            formats, subtitles = self._extract_m3u8_formats_and_subtitles(
                hls_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)

        return {
            'id': video_id,
            'title': data.get('title') or video_id,
            'description': data.get('description'),
            'thumbnail': data.get('thumbnail'),
            'formats': formats,
            'subtitles': subtitles,
            'uploader_url': (data.get('uploader') or {}).get('url'),
            # upload_date (YYYYMMDD) is derived from the timestamp by yt-dlp
            'timestamp': parse_iso8601(data.get('publishedAt')),
            'duration': float_or_none(data.get('duration')),
            'view_count': int_or_none(data.get('views')),
            'comment_count': int_or_none(data.get('totalComments')),
            'like_count': like_count,
            'dislike_count': dislike_count,
            'average_rating': average_rating,
            'categories': categories,
            'tags': data.get('tags'),
            # Downloads comments only when --write-comments is given
            '__post_extractor': self.extract_comments(video_id),
        }


class VanilloPlaylistIE(InfoExtractor):
    """Extractor for vanillo.tv playlists."""

    _VALID_URL = r'https?://(?:dev\.|beta\.)?vanillo\.tv/playlist/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://vanillo.tv/playlist/wn9_PM-DTPypZeNy32EE1A',
        'info_dict': {
            'id': 'wn9_PM-DTPypZeNy32EE1A',
            'title': 'Staff Picks',
        },
        'playlist_mincount': 1,
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)

        # 1) Playlist metadata is best-effort: on failure fall back to the ID
        # as title and a default page size of 20
        meta = ((self._download_json(
            f'{_API_BASE}/playlists/{playlist_id}', playlist_id,
            note='Downloading playlist metadata', fatal=False) or {})
            .get('data') or {}).get('playlist') or {}
        video_count = meta.get('videoCount') or 20

        # 2) Fetch all videos in one request, using videoCount as the limit
        playlist_data = self._download_json(
            f'{_API_BASE}/playlists/{playlist_id}/videos?offset=0&limit={video_count}',
            playlist_id, note='Downloading playlist videos')
        entries = [
            self.url_result(f'https://vanillo.tv/v/{video["id"]}', VanilloIE.ie_key())
            for video in (playlist_data.get('data') or {}).get('videos') or []
            if video.get('id')]

        return self.playlist_result(
            entries, playlist_id,
            playlist_title=meta.get('name') or playlist_id,
            playlist_description=meta.get('description'))


class VanilloUserIE(InfoExtractor):
    """Extractor for all videos uploaded by a vanillo.tv user profile."""

    _VALID_URL = r'https?://(?:dev\.|beta\.)?vanillo\.tv/u/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://vanillo.tv/u/f9pKNFrUSG6Qo3pJ4UlGbQ',
        'info_dict': {
            'id': 'f9pKNFrUSG6Qo3pJ4UlGbQ',
            'title': 'User BakhosVillager videos',
        },
        'playlist_mincount': 1,
    }]

    _PAGE_SIZE = 20

    def _real_extract(self, url):
        user_id = self._match_id(url)
        entries = []
        # Paginate through the profile's videos until a short/empty page
        for offset in itertools.count(0, self._PAGE_SIZE):
            page = self._download_json(
                f'{_API_BASE}/profiles/{user_id}/videos'
                f'?offset={offset}&limit={self._PAGE_SIZE}&groups=videos.all',
                user_id, note='Downloading user videos', fatal=False) or {}
            videos = (page.get('data') or {}).get('videos') or []
            if not videos:
                break
            entries.extend(
                self.url_result(f'https://vanillo.tv/v/{video["id"]}', VanilloIE.ie_key())
                for video in videos if video.get('id'))
            if len(videos) < self._PAGE_SIZE:
                break
        return self.playlist_result(entries, user_id, playlist_title=f'User {user_id} videos')