Mirror of https://github.com/yt-dlp/yt-dlp.git (synced 2025-08-15 08:58:28 +00:00)

commit d1e23c0852: Merge 35bba89cdb into 38c2bf4026

@@ -2309,6 +2309,7 @@
     UstudioIE,
 )
 from .utreon import UtreonIE
+from .vanillo import VanilloIE, VanilloPlaylistIE, VanilloUserIE
 from .varzesh3 import Varzesh3IE
 from .vbox7 import Vbox7IE
 from .veo import VeoIE

yt_dlp/extractor/vanillo.py (new file, 386 lines)
@@ -0,0 +1,386 @@
import datetime
import json
import re

from .common import InfoExtractor
from ..utils import ExtractorError, parse_iso8601


# NOTE: Private videos can be downloaded by adding --add-header "authorization: Bearer abcxyz",
# but won't work with --cookies-from-browser and --cookies file.txt
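# A minimal example of the header approach described above (the token value
# shown is hypothetical; a real one can be copied from the browser's devtools):
#   yt-dlp --add-header "authorization: Bearer eyJhbGciOi..." "https://vanillo.tv/v/iaCi-oTmmGY"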


class VanilloIE(InfoExtractor):
    _access_token = None

    def _real_initialize(self):
        """
        Called before extracting each URL. This checks the cookie jar for an access token
        """
        # 1) Check if we already have an access token in our cookies
        if not self._access_token:
            self._access_token = self._get_token_from_cookies()
        # If we still have no token, requests remain unauthorized (public only).

    def _get_token_from_cookies(self):
        """
        Attempt to read an access_token from cookies (if any).
        """
        cookie_names = ('access_token', 'ACCESS_TOKEN')
        # We rename 'path' to '_path' since it's unused in the loop body
        for domain, path_dict in self._downloader.cookiejar._cookies.items():
            if 'vanillo.tv' in domain:
                for _path, cookie_dict in path_dict.items():
                    for name, cookie in cookie_dict.items():
                        if name in cookie_names:
                            # self.to_screen(f'Found vanillo.tv access_token in cookies: {cookie.value}')
                            self.to_screen('Found vanillo.tv access_token in cookies')
                            return cookie.value
        return None

    def _download_json(self, url_or_request, video_id, note='Downloading JSON', *args, **kwargs):
        """
        Overriding calls to _download_json so that our Authorization
        header is always set if we have an access token.
        """
        # We call _download_json_handle, passing 'note' as a positional argument.
        # With fatal=False a failed download yields False instead of a tuple,
        # so only unpack the result when the request actually succeeded.
        res = self._download_json_handle(url_or_request, video_id, note, *args, **kwargs)
        return res[0] if res is not False else None

    def _download_json_handle(self, url_or_request, video_id, note, *args, **kwargs):
        """
        Overridden to insert Bearer token if present, while
        still relying on the default logic for everything else.
        """
        # Because 'headers' can be in kwargs, we need to merge them
        # carefully with our Authorization header if we have a token
        custom_headers = kwargs.setdefault('headers', {})
        if self._access_token:
            custom_headers['Authorization'] = f'Bearer {self._access_token}'
        # Call the parent method with 'note' as a positional argument
        return super()._download_json_handle(url_or_request, video_id, note, *args, **kwargs)
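
    # Note: routing every API request through the two overrides above keeps the
    # Authorization header applied in one place instead of at every call site.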

    _VALID_URL = r'https?://(?:dev\.|beta\.)?vanillo\.tv/(?:v|embed)/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://vanillo.tv/v/iaCi-oTmmGY',
        'info_dict': {
            'id': 'iaCi-oTmmGY',
            'title': 'Wawa',
            'description': '',
            'thumbnail': 'https://images.vanillo.tv/V6mYuajeHGsSSPRJKCdRAvvWgHFVGZ00g-ne3TZevss/h:300/aHR0cHM6Ly9pbWFnZXMuY2RuLnZhbmlsbG8udHYvdGh1bWJuYWlsL1RhUGE3TEJFTVBlS205elh2ZWdzLmF2aWY',
            'uploader_url': 'M7A',
            'upload_date': '20240309',  # YYYYMMDD format, server API provides 2024-03-09T07:56:35.636Z
            'duration': 5.71,
            'view_count': 205,
            'comment_count': 2,
            'like_count': 4,
            'dislike_count': 0,
            'average_rating': 4.2,
            'categories': ['film_and_animation'],
            'tags': ['Wawa', 'wawa', 'Wa Wa', 'wa wa', 'WaWa', 'wAwA', 'wA Wa'],
        },
    }, {
        'url': 'https://vanillo.tv/v/RhSueuQZiKF',
        'info_dict': {
            'id': 'RhSueuQZiKF',
            'title': "What's New on Vanillo - Fall Update",
            'description': '',
            'thumbnail': 'https://images.vanillo.tv/7Qfelvn1-4waFjX3rIc1FkfpB9jOJqqLlvieD5i3mlA/h:300/aHR0cHM6Ly9pbWFnZXMuY2RuLnZhbmlsbG8udHYvdGh1bWJuYWlsL3JsMmR5ajJFcnozMEphSUd0bTZyLmF2aWY',
            'uploader_url': 'Vanillo',
            'upload_date': '20231020',  # YYYYMMDD format, server API provides 2023-10-20T04:53:13.718Z
            'duration': 99.35,
            'view_count': 368,
            'comment_count': 2,
            'like_count': 20,
            'dislike_count': 0,
            'average_rating': 4.2,
            'categories': ['film_and_animation'],
            'tags': [
                'fall', 'update', 'fall update', 'autumn', 'autumn update', 'vanillo', 'new features', 'new',
                'features', 'exciting', 'language', 'switch', 'english', 'descriptive audio', 'descriptive',
                'audio', 'qualities', 'higher', 'process', 'processing', 'faster', 'fast', '2x', '4x',
                'twice', 'speed', 'speedy', 'quick', 'chapters',
            ],
        },
        'playlist_mincount': 1,
    }]
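
    # The test cases above can be exercised with yt-dlp's extractor test runner,
    # e.g. (invocation per yt-dlp's contributing docs; adjust the name as needed):
    #   python test/test_download.py TestDownload.test_Vanillo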

    def _get_replies(self, comment_id, limit=3):
        replies = []
        replies_url = f'https://api.vanillo.tv/v1/comments/{comment_id}/replies?limit={limit}&reviewing=false'
        try:
            replies_data = self._download_json(
                replies_url, comment_id, note=f'Downloading replies for comment {comment_id}', fatal=False,
            )
        except ExtractorError:
            return replies
        # With fatal=False a failed request yields no data instead of raising
        if not replies_data or replies_data.get('status') != 'success':
            return replies
        for reply in replies_data.get('data', {}).get('comments', []):
            transformed = {
                'id': reply.get('id'),
                'author': reply.get('profile', {}).get('username'),
                'author_id': reply.get('profile', {}).get('id'),
                'text': reply.get('text'),
                'timestamp': parse_iso8601(reply.get('createdAt')),
            }
            replies.append(transformed)
        return replies

    def _get_comments(self, video_id, limit=10):
        all_comments = []
        page_key = None
        # Loop to download all comments using pageKey
        while True:
            url = (
                f'https://api.vanillo.tv/v1/videos/{video_id}/comments'
                f'?limit={limit}&reviewing=false&filter=high_to_low_score'
            )
            if page_key:
                url += f'&pageKey={page_key}'
            try:
                comments_data = self._download_json(url, video_id, note='Downloading comments', fatal=False)
            except ExtractorError:
                break
            # With fatal=False a failed request yields no data instead of raising
            if not comments_data or comments_data.get('status') != 'success':
                break
            data = comments_data.get('data', {})
            comments = data.get('comments', [])
            if not comments:
                break
            # For each comment, download replies (if any)
            for comment in comments:
                transformed = {
                    'id': comment.get('id'),
                    'author': comment.get('profile', {}).get('username'),
                    'author_id': comment.get('profile', {}).get('id'),
                    'text': comment.get('text'),
                    'timestamp': parse_iso8601(comment.get('createdAt')),
                    'replies': self._get_replies(comment.get('id')),
                }
                all_comments.append(transformed)
            page_key = data.get('nextPageKey')
            if not page_key:
                break
        return all_comments
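
    # Pagination contract as used above: each successful response may carry a
    # 'nextPageKey', which is echoed back via &pageKey=... to fetch the next
    # page; when the key is absent, the last page has been reached.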

    def _real_extract(self, url):
        video_id = self._match_id(url)

        # 1) Retrieve video info (metadata)
        video_info_url = f'https://api.vanillo.tv/v1/videos/{video_id}?groups=uploader,profile.full'
        try:
            video_info = self._download_json(video_info_url, video_id, note='Downloading video info')
        except ExtractorError as e:
            # Try to get an HTTP code from the error cause or message
            # (yt-dlp's networking HTTPError exposes .status; urllib's uses .code)
            http_code = getattr(e.cause, 'status', None) or getattr(e.cause, 'code', None)
            if http_code is None and 'HTTP Error 404' in str(e):
                http_code = 404
            if http_code == 404:
                self.raise_login_required(
                    'Video not found or is private. '  # maybe use report_login here?
                    'Sign in if you have access to this video. Use the --cookies or --cookies-from-browser option',
                    method=None,
                )
            elif http_code == 403:
                raise ExtractorError(
                    'Access is likely blocked by your Internet provider. Try another ISP or use a VPN',
                    expected=True,
                )
            raise

        if video_info.get('status') != 'success':
            raise ExtractorError('Video info API returned an error', expected=True)
        data = video_info.get('data', {})
        title = data.get('title') or video_id
        description = data.get('description')
        thumbnail = data.get('thumbnail')

        uploader = data.get('uploader', {})
        uploader_url = uploader.get('url')

        # 2) Fix the ISO8601 date to remove leftover data
        upload_date_raw = data.get('publishedAt')
        upload_date = None
        if upload_date_raw:
            # Remove fractional seconds and any extra data after 'Z'
            upload_date_raw = re.sub(r'\.\d+', '', upload_date_raw)
            upload_date_raw = re.sub(r'Z.*$', 'Z', upload_date_raw)
            try:
                parsed_date = datetime.datetime.fromisoformat(upload_date_raw.replace('Z', '+00:00'))
                upload_date = parsed_date.strftime('%Y%m%d')
            except ValueError:
                pass
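        # e.g. the API's '2024-03-09T07:56:35.636Z' becomes upload_date '20240309'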

        duration = data.get('duration')

        # Convert numeric fields safely
        def safe_int(val):
            try:
                return int(val)
            except (TypeError, ValueError):
                return None

        view_count = safe_int(data.get('views'))
        comment_count = safe_int(data.get('totalComments'))
        like_count = safe_int(data.get('likes'))
        dislike_count = safe_int(data.get('dislikes'))

        average_rating = None
        if like_count is not None and dislike_count is not None:
            total = like_count + dislike_count
            if total > 0:
                average_rating = round((like_count / total) * 5, 1)
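                # i.e. the like ratio mapped onto a 0-5 scale,
                # e.g. 20 likes and 5 dislikes -> round(20 / 25 * 5, 1) == 4.0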

        categories = data.get('category')
        if categories and not isinstance(categories, list):
            categories = [categories]
        tags = data.get('tags')

        # 3) Get watch token (required for accessing manifests)
        watch_token_url = 'https://api.vanillo.tv/v1/watch'
        post_data = json.dumps({'videoId': video_id}).encode('utf-8')
        watch_token_resp = self._download_json(
            watch_token_url, video_id,
            note='Downloading watch token',
            data=post_data,
            headers={'Content-Type': 'application/json'},
        )
        watch_token = watch_token_resp.get('data', {}).get('watchToken')
        if not watch_token:
            raise ExtractorError('Failed to retrieve watch token', expected=True)

        # 4) Get the HLS & DASH manifest URLs using the watch token
        manifests_url = f'https://api.vanillo.tv/v1/watch/manifests?watchToken={watch_token}'
        manifests = self._download_json(manifests_url, video_id, note='Downloading manifests')
        hls_url = manifests.get('data', {}).get('media', {}).get('hls')
        # dash_url = manifests.get('data', {}).get('media', {}).get('dash')
        # DASH is disabled because it produces comically gigantic files:
        # e.g. a 1.7 MB file becomes 15.1 MB, so short videos balloon past 100 GB for no reason.
        # The same applies to audio tracks, so RAM usage would be high and the merged file even bigger.
        '''
        if dash_url:
            fmts, subs = self._extract_mpd_formats_and_subtitles(
                dash_url, video_id, mpd_id='dash', fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)
        '''

        # 5) Extract available formats/subtitles from HLS
        subtitles = {}
        formats = []
        if hls_url:
            fmts, subs = self._extract_m3u8_formats_and_subtitles(
                hls_url, video_id, ext='mp4', m3u8_id='hls', fatal=False,
            )
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        # 6) Download all comments using pagination with pageKey, only if --write-comments was given
        if self._downloader.params.get('getcomments'):
            comments = self._get_comments(video_id, limit=10)
        else:
            comments = None

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'formats': formats,
            'subtitles': subtitles,
            'comments': comments,
            'uploader_url': uploader_url,
            'upload_date': upload_date,
            'duration': duration,
            'view_count': view_count,
            'comment_count': comment_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'average_rating': average_rating,
            'categories': categories,
            'tags': tags,
        }


class VanilloPlaylistIE(InfoExtractor):
    _VALID_URL = r'https?://(?:dev\.|beta\.)?vanillo\.tv/playlist/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://vanillo.tv/playlist/wn9_PM-DTPypZeNy32EE1A',
        'info_dict': {
            'id': 'wn9_PM-DTPypZeNy32EE1A',
            'title': 'Staff Picks',
        },
        'playlist_mincount': 1,
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        # 1) Download playlist metadata
        playlist_api_url = f'https://api.vanillo.tv/v1/playlists/{playlist_id}'
        playlist_info = self._download_json(
            playlist_api_url, playlist_id, note='Downloading playlist metadata', fatal=False,
        ) or {}  # the request is non-fatal and may yield no data
        playlist_data = playlist_info.get('data', {}).get('playlist', {})
        playlist_title = playlist_data.get('name') or playlist_id
        playlist_description = playlist_data.get('description')
        video_count = playlist_data.get('videoCount') or 20

        # 2) Download the videos using the videoCount as the limit
        api_url = f'https://api.vanillo.tv/v1/playlists/{playlist_id}/videos?offset=0&limit={video_count}'
        videos_data = self._download_json(
            api_url, playlist_id, note='Downloading playlist videos',
        )
        videos = videos_data.get('data', {}).get('videos', [])
        entries = []
        for video in videos:
            vid = video.get('id')
            if not vid:
                continue
            video_url = f'https://vanillo.tv/v/{vid}'
            entries.append(self.url_result(video_url, VanilloIE.ie_key()))

        info = self.playlist_result(entries, playlist_id, playlist_title=playlist_title)
        if playlist_description:
            info['description'] = playlist_description
        return info


class VanilloUserIE(InfoExtractor):
    _VALID_URL = r'https?://(?:dev\.|beta\.)?vanillo\.tv/u/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://vanillo.tv/u/f9pKNFrUSG6Qo3pJ4UlGbQ',
        'info_dict': {
            'id': 'f9pKNFrUSG6Qo3pJ4UlGbQ',
            'title': 'User BakhosVillager videos',
        },
        'playlist_mincount': 1,
    }]

    def _real_extract(self, url):
        user_id = self._match_id(url)
        entries = []
        offset = 0
        # Loop to paginate through all user videos
        while True:
            api_url = (
                f'https://api.vanillo.tv/v1/profiles/{user_id}/videos'
                f'?offset={offset}&limit=20&groups=videos.all'
            )
            user_data = self._download_json(
                api_url, user_id, note='Downloading user videos', fatal=False,
            ) or {}  # the request is non-fatal and may yield no data
            videos = user_data.get('data', {}).get('videos', [])
            if not videos:
                break
            for video in videos:
                vid = video.get('id')
                if not vid:
                    continue
                video_url = f'https://vanillo.tv/v/{vid}'
                entries.append(self.url_result(video_url, VanilloIE.ie_key()))
            if len(videos) < 20:
                break
            offset += 20
        return self.playlist_result(entries, user_id, playlist_title=f'User {user_id} videos')
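

# Example invocations once the extractor is registered (URLs taken from the
# tests above):
#   yt-dlp "https://vanillo.tv/v/iaCi-oTmmGY"
#   yt-dlp "https://vanillo.tv/playlist/wn9_PM-DTPypZeNy32EE1A"
#   yt-dlp "https://vanillo.tv/u/f9pKNFrUSG6Qo3pJ4UlGbQ"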