Merge branch 'yt-dlp:master' into misc-2025-08

2025-08-15 00:48:28 +00:00 · 2025-08-01 15:29:01 -05:00 · 2025-08-01 15:29:01 -05:00 · 8d6510623b
commit 8d6510623b
parent 39fa0fcaa5 71f30921a2
4 changed files with 147 additions and 99 deletions
--- a/yt_dlp/extractor/archiveorg.py
+++ b/yt_dlp/extractor/archiveorg.py
@@ -33,7 +33,6 @@
    unified_timestamp,
    url_or_none,
    urlhandle_detect_ext,
-    variadic,
 )


@@ -232,6 +231,23 @@ class ArchiveOrgIE(InfoExtractor):
            'release_date': '19950402',
            'timestamp': 1084927901,
        },
+    }, {
+        # metadata['metadata']['description'] is a list of strings instead of str
+        'url': 'https://archive.org/details/pra-KZ1908.02',
+        'info_dict': {
+            'id': 'pra-KZ1908.02',
+            'ext': 'mp3',
+            'display_id': 'KZ1908.02_01.wav',
+            'title': 'Crips and Bloods speak about gang life',
+            'description': 'md5:2b56b35ff021311e3554b47a285e70b3',
+            'uploader': 'jake@archive.org',
+            'duration': 1733.74,
+            'track': 'KZ1908.02 01',
+            'track_number': 1,
+            'timestamp': 1336026026,
+            'upload_date': '20120503',
+            'release_year': 1992,
+        },
    }]

    @staticmethod
@@ -274,34 +290,40 @@ def _real_extract(self, url):
        m = metadata['metadata']
        identifier = m['identifier']

-        info = {
+        info = traverse_obj(m, {
+            'title': ('title', {str}),
+            'description': ('description', ({str}, (..., all, {' '.join})), {clean_html}, filter, any),
+            'uploader': (('uploader', 'adder'), {str}, any),
+            'creators': ('creator', (None, ...), {str}, filter, all, filter),
+            'license': ('licenseurl', {url_or_none}),
+            'release_date': ('date', {unified_strdate}),
+            'timestamp': (('publicdate', 'addeddate'), {unified_timestamp}, any),
+            'location': ('venue', {str}),
+            'release_year': ('year', {int_or_none}),
+        })
+        info.update({
            'id': identifier,
-            'title': m['title'],
-            'description': clean_html(m.get('description')),
-            'uploader': dict_get(m, ['uploader', 'adder']),
-            'creators': traverse_obj(m, ('creator', {variadic}, {lambda x: x[0] and list(x)})),
-            'license': m.get('licenseurl'),
-            'release_date': unified_strdate(m.get('date')),
-            'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
            'webpage_url': f'https://archive.org/details/{identifier}',
-            'location': m.get('venue'),
-            'release_year': int_or_none(m.get('year'))}
+        })

        for f in metadata['files']:
            if f['name'] in entries:
                entries[f['name']] = merge_dicts(entries[f['name']], {
                    'id': identifier + '/' + f['name'],
-                    'title': f.get('title') or f['name'],
-                    'display_id': f['name'],
-                    'description': clean_html(f.get('description')),
-                    'creators': traverse_obj(f, ('creator', {variadic}, {lambda x: x[0] and list(x)})),
-                    'duration': parse_duration(f.get('length')),
-                    'track_number': int_or_none(f.get('track')),
-                    'album': f.get('album'),
-                    'discnumber': int_or_none(f.get('disc')),
-                    'release_year': int_or_none(f.get('year'))})
+                    **traverse_obj(f, {
+                        'title': (('title', 'name'), {str}, any),
+                        'display_id': ('name', {str}),
+                        'description': ('description', ({str}, (..., all, {' '.join})), {clean_html}, filter, any),
+                        'creators': ('creator', (None, ...), {str}, filter, all, filter),
+                        'duration': ('length', {parse_duration}),
+                        'track_number': ('track', {int_or_none}),
+                        'album': ('album', {str}),
+                        'discnumber': ('disc', {int_or_none}),
+                        'release_year': ('year', {int_or_none}),
+                    }),
+                })
                entry = entries[f['name']]
-            elif traverse_obj(f, 'original', expected_type=str) in entries:
+            elif traverse_obj(f, ('original', {str})) in entries:
                entry = entries[f['original']]
            else:
                continue
--- a/yt_dlp/extractor/streaks.py
+++ b/yt_dlp/extractor/streaks.py
@@ -33,16 +33,20 @@ def _extract_from_streaks_api(self, project_id, media_id, headers=None, query=No
                    **(headers or {}),
                })
        except ExtractorError as e:
-            if isinstance(e.cause, HTTPError) and e.cause.status in {403, 404}:
+            if isinstance(e.cause, HTTPError) and e.cause.status in (403, 404):
                error = self._parse_json(e.cause.response.read().decode(), media_id, fatal=False)
                message = traverse_obj(error, ('message', {str}))
                code = traverse_obj(error, ('code', {str}))
+                error_id = traverse_obj(error, ('id', {int}))
                if code == 'REQUEST_FAILED':
-                    self.raise_geo_restricted(message, countries=self._GEO_COUNTRIES)
-                elif code == 'MEDIA_NOT_FOUND':
-                    raise ExtractorError(message, expected=True)
-                elif code or message:
-                    raise ExtractorError(join_nonempty(code, message, delim=': '))
+                    if error_id == 124:
+                        self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+                    elif error_id == 126:
+                        raise ExtractorError('Access is denied (possibly due to invalid/missing API key)')
+                if code == 'MEDIA_NOT_FOUND':
+                    raise ExtractorError(join_nonempty(code, message, delim=': '), expected=True)
+                if code or message:
+                    raise ExtractorError(join_nonempty(code, error_id, message, delim=': '))
            raise

        streaks_id = response['id']
--- a/yt_dlp/extractor/tbsjp.py
+++ b/yt_dlp/extractor/tbsjp.py
@@ -1,104 +1,107 @@
-from .common import InfoExtractor
-from ..networking.exceptions import HTTPError
+from .streaks import StreaksBaseIE
 from ..utils import (
-    ExtractorError,
    clean_html,
    int_or_none,
    str_or_none,
    unified_timestamp,
-    urljoin,
+    url_or_none,
 )
-from ..utils.traversal import find_element, traverse_obj
+from ..utils.traversal import traverse_obj


-class TBSJPEpisodeIE(InfoExtractor):
+class TBSJPBaseIE(StreaksBaseIE):
+    def _search_window_app_json(self, webpage, name, item_id, **kwargs):
+        return self._search_json(r'window\.app\s*=', webpage, f'{name} info', item_id, **kwargs)
+
+
+class TBSJPEpisodeIE(TBSJPBaseIE):
    _VALID_URL = r'https?://cu\.tbs\.co\.jp/episode/(?P<id>[\d_]+)'
-    _GEO_BYPASS = False
    _TESTS = [{
-        'url': 'https://cu.tbs.co.jp/episode/23613_2044134_1000049010',
-        'skip': 'streams geo-restricted, Japan only. Also, will likely expire eventually',
+        'url': 'https://cu.tbs.co.jp/episode/14694_2094162_1000123656',
+        'skip': 'geo-blocked to japan + 7-day expiry',
        'info_dict': {
-            'title': 'VIVANT 第三話 誤送金完結へ!絶体絶命の反撃開始',
-            'id': '23613_2044134_1000049010',
+            'title': 'クロちゃん、寝て起きたら川のほとりにいてその向こう岸に亡くなった父親がいたら死の淵にいるかと思う説 ほか',
+            'id': '14694_2094162_1000123656',
            'ext': 'mp4',
-            'upload_date': '20230728',
-            'duration': 3517,
-            'release_timestamp': 1691118230,
-            'episode': '第三話 誤送金完結へ!絶体絶命の反撃開始',
-            'release_date': '20230804',
-            'categories': 'count:11',
-            'episode_number': 3,
-            'timestamp': 1690522538,
-            'description': 'md5:2b796341af1ef772034133174ba4a895',
-            'series': 'VIVANT',
+            'display_id': 'ref:14694_2094162_1000123656',
+            'description': 'md5:1a82fcdeb5e2e82190544bb72721c46e',
+            'uploader': 'TBS',
+            'uploader_id': 'tbs',
+            'duration': 2752,
+            'thumbnail': 'md5:d8855c8c292683c95a84cafdb42300bc',
+            'categories': ['エンタメ', '水曜日のダウンタウン', 'ダウンタウン', '浜田雅功', '松本人志', '水ダウ', '動画', 'バラエティ'],
+            'cast': ['浜田 雅功', '藤本 敏史', 'ビビる 大木', '千原 ジュニア', '横澤 夏子', 'せいや', 'あの', '服部 潤'],
+            'genres': ['variety'],
+            'series': '水曜日のダウンタウン',
+            'series_id': '14694',
+            'episode': 'クロちゃん、寝て起きたら川のほとりにいてその向こう岸に亡くなった父親がいたら死の淵にいるかと思う説 ほか',
+            'episode_number': 341,
+            'episode_id': '14694_2094162_1000123656',
+            'timestamp': 1753778992,
+            'upload_date': '20250729',
+            'release_timestamp': 1753880402,
+            'release_date': '20250730',
+            'modified_timestamp': 1753880741,
+            'modified_date': '20250730',
+            'live_status': 'not_live',
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
-        meta = self._search_json(r'window\.app\s*=', webpage, 'episode info', video_id, fatal=False)
+        meta = self._search_window_app_json(webpage, 'episode', video_id, fatal=False)
        episode = traverse_obj(meta, ('falcorCache', 'catalog', 'episode', video_id, 'value'))

-        tf_path = self._search_regex(
-            r'<script[^>]+src=["\'](/assets/tf\.[^"\']+\.js)["\']', webpage, 'stream API config')
-        tf_js = self._download_webpage(urljoin(url, tf_path), video_id, note='Downloading stream API config')
-        video_url = self._search_regex(r'videoPlaybackUrl:\s*[\'"]([^\'"]+)[\'"]', tf_js, 'stream API url')
-        api_key = self._search_regex(r'api_key:\s*[\'"]([^\'"]+)[\'"]', tf_js, 'stream API key')
-
-        try:
-            source_meta = self._download_json(f'{video_url}ref:{video_id}', video_id,
-                                              headers={'X-Streaks-Api-Key': api_key},
-                                              note='Downloading stream metadata')
-        except ExtractorError as e:
-            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
-                self.raise_geo_restricted(countries=['JP'])
-            raise
-
-        formats, subtitles = [], {}
-        for src in traverse_obj(source_meta, ('sources', ..., 'src')):
-            fmts, subs = self._extract_m3u8_formats_and_subtitles(src, video_id, fatal=False)
-            formats.extend(fmts)
-            self._merge_subtitles(subs, target=subtitles)
-
        return {
-            'title': traverse_obj(webpage, ({find_element(tag='h3')}, {clean_html})),
-            'id': video_id,
+            **self._extract_from_streaks_api(
+                'tbs', f'ref:{video_id}', headers={'Origin': 'https://cu.tbs.co.jp'}),
            **traverse_obj(episode, {
-                'categories': ('keywords', {list}),
-                'id': ('content_id', {str}),
-                'description': ('description', 0, 'value'),
-                'timestamp': ('created_at', {unified_timestamp}),
-                'release_timestamp': ('pub_date', {unified_timestamp}),
+                'title': ('title', ..., 'value', {str}, any),
+                'cast': (
+                    'credit', ..., 'name', ..., 'value', {clean_html}, any,
+                    {lambda x: x.split(',')}, ..., {str.strip}, filter, all, filter),
+                'categories': ('keywords', ..., {str}, filter, all, filter),
+                'description': ('description', ..., 'value', {clean_html}, any),
                'duration': ('tv_episode_info', 'duration', {int_or_none}),
+                'episode': ('title', lambda _, v: not v.get('is_phonetic'), 'value', {str}, any),
+                'episode_id': ('content_id', {str}),
                'episode_number': ('tv_episode_info', 'episode_number', {int_or_none}),
-                'episode': ('title', lambda _, v: not v.get('is_phonetic'), 'value'),
-                'series': ('custom_data', 'program_name'),
-            }, get_all=False),
-            'formats': formats,
-            'subtitles': subtitles,
+                'genres': ('genre', ..., {str}, filter, all, filter),
+                'release_timestamp': ('pub_date', {unified_timestamp}),
+                'series': ('custom_data', 'program_name', {str}),
+                'tags': ('tags', ..., {str}, filter, all, filter),
+                'thumbnail': ('artwork', ..., 'url', {url_or_none}, any),
+                'timestamp': ('created_at', {unified_timestamp}),
+                'uploader': ('tv_show_info', 'networks', ..., {str}, any),
+            }),
+            **traverse_obj(episode, ('tv_episode_info', {
+                'duration': ('duration', {int_or_none}),
+                'episode_number': ('episode_number', {int_or_none}),
+                'series_id': ('show_content_id', {str}),
+            })),
+            'id': video_id,
        }


-class TBSJPProgramIE(InfoExtractor):
+class TBSJPProgramIE(TBSJPBaseIE):
    _VALID_URL = r'https?://cu\.tbs\.co\.jp/program/(?P<id>\d+)'
    _TESTS = [{
-        'url': 'https://cu.tbs.co.jp/program/23601',
-        'playlist_mincount': 4,
+        'url': 'https://cu.tbs.co.jp/program/14694',
+        'playlist_mincount': 1,
        'info_dict': {
-            'id': '23601',
-            'categories': ['エンタメ', 'ミライカプセル', '会社', '働く', 'バラエティ', '動画'],
-            'description': '幼少期の夢は大人になって、どう成長したのだろうか？\nそしてその夢は今後、どのように広がっていくのか？\nいま話題の会社で働く人の「夢の成長」を描く',
-            'series': 'ミライカプセル　-I have a dream-',
-            'title': 'ミライカプセル　-I have a dream-',
+            'id': '14694',
+            'title': '水曜日のダウンタウン',
+            'description': 'md5:cf1d46c76c2755d7f87512498718b837',
+            'categories': ['エンタメ', '水曜日のダウンタウン', 'ダウンタウン', '浜田雅功', '松本人志', '水ダウ', '動画', 'バラエティ'],
+            'series': '水曜日のダウンタウン',
        },
    }]

    def _real_extract(self, url):
        programme_id = self._match_id(url)
        webpage = self._download_webpage(url, programme_id)
-        meta = self._search_json(r'window\.app\s*=', webpage, 'programme info', programme_id)
-
+        meta = self._search_window_app_json(webpage, 'programme', programme_id)
        programme = traverse_obj(meta, ('falcorCache', 'catalog', 'program', programme_id, 'false', 'value'))

        return {
@@ -116,7 +119,7 @@ def _real_extract(self, url):
        }


-class TBSJPPlaylistIE(InfoExtractor):
+class TBSJPPlaylistIE(TBSJPBaseIE):
    _VALID_URL = r'https?://cu\.tbs\.co\.jp/playlist/(?P<id>[\da-f]+)'
    _TESTS = [{
        'url': 'https://cu.tbs.co.jp/playlist/184f9970e7ba48e4915f1b252c55015e',
@@ -129,8 +132,8 @@ class TBSJPPlaylistIE(InfoExtractor):

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
-        page = self._download_webpage(url, playlist_id)
-        meta = self._search_json(r'window\.app\s*=', page, 'playlist info', playlist_id)
+        webpage = self._download_webpage(url, playlist_id)
+        meta = self._search_window_app_json(webpage, 'playlist', playlist_id)
        playlist = traverse_obj(meta, ('falcorCache', 'playList', playlist_id))

        def entries():
--- a/yt_dlp/extractor/tver.py
+++ b/yt_dlp/extractor/tver.py
@@ -1,12 +1,16 @@
+import datetime as dt
+
 from .streaks import StreaksBaseIE
 from ..utils import (
    ExtractorError,
+    GeoRestrictedError,
    int_or_none,
    join_nonempty,
    make_archive_id,
    smuggle_url,
    str_or_none,
    strip_or_none,
+    time_seconds,
    update_url_query,
 )
 from ..utils.traversal import require, traverse_obj
@@ -96,6 +100,7 @@ class TVerIE(StreaksBaseIE):
        'Referer': 'https://tver.jp/',
    }
    _PLATFORM_QUERY = {}
+    _STREAKS_API_INFO = {}

    def _real_initialize(self):
        session_info = self._download_json(
@@ -105,6 +110,9 @@ def _real_initialize(self):
            'platform_uid': 'platform_uid',
            'platform_token': 'platform_token',
        }))
+        self._STREAKS_API_INFO = self._download_json(
+            'https://player.tver.jp/player/streaks_info_v2.json', None,
+            'Downloading STREAKS API info', 'Unable to download STREAKS API info')

    def _call_platform_api(self, path, video_id, note=None, fatal=True, query=None):
        return self._download_json(
@@ -219,15 +227,26 @@ def _real_extract(self, url):
                '_type': 'url_transparent',
                'url': smuggle_url(
                    self.BRIGHTCOVE_URL_TEMPLATE % (account_id, brightcove_id),
-                    {'geo_countries': ['JP']}),
+                    {'geo_countries': self._GEO_COUNTRIES}),
                'ie_key': 'BrightcoveNew',
            }

-        return {
-            **self._extract_from_streaks_api(video_info['streaks']['projectID'], streaks_id, {
+        project_id = video_info['streaks']['projectID']
+        key_idx = dt.datetime.fromtimestamp(time_seconds(hours=9), dt.timezone.utc).month % 6 or 6
+
+        try:
+            streaks_info = self._extract_from_streaks_api(project_id, streaks_id, {
                'Origin': 'https://tver.jp',
                'Referer': 'https://tver.jp/',
-            }),
+                'X-Streaks-Api-Key': self._STREAKS_API_INFO[project_id]['api_key'][f'key0{key_idx}'],
+            })
+        except GeoRestrictedError as e:
+            # Catch and re-raise with metadata_available to support --ignore-no-formats-error
+            self.raise_geo_restricted(e.orig_msg, countries=self._GEO_COUNTRIES, metadata_available=True)
+            streaks_info = {}
+
+        return {
+            **streaks_info,
            **metadata,
            'id': video_id,
            '_old_archive_ids': [make_archive_id('BrightcoveNew', brightcove_id)] if brightcove_id else None,