From 42ca3d601ee10cef89d698f72e2b5d44fab4f013 Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Wed, 30 Jul 2025 01:11:09 -0500
Subject: [PATCH 1/4] [ie/archive.org] Fix metadata extraction (#13880)

Closes #13881
Authored by: bashonly
---
 yt_dlp/extractor/archiveorg.py | 64 +++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 21 deletions(-)

diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py
index 572bd6bfe..1864ddbfd 100644
--- a/yt_dlp/extractor/archiveorg.py
+++ b/yt_dlp/extractor/archiveorg.py
@@ -33,7 +33,6 @@
     unified_timestamp,
     url_or_none,
     urlhandle_detect_ext,
-    variadic,
 )
 
 
@@ -232,6 +231,23 @@ class ArchiveOrgIE(InfoExtractor):
             'release_date': '19950402',
             'timestamp': 1084927901,
         },
+    }, {
+        # metadata['metadata']['description'] is a list of strings instead of str
+        'url': 'https://archive.org/details/pra-KZ1908.02',
+        'info_dict': {
+            'id': 'pra-KZ1908.02',
+            'ext': 'mp3',
+            'display_id': 'KZ1908.02_01.wav',
+            'title': 'Crips and Bloods speak about gang life',
+            'description': 'md5:2b56b35ff021311e3554b47a285e70b3',
+            'uploader': 'jake@archive.org',
+            'duration': 1733.74,
+            'track': 'KZ1908.02 01',
+            'track_number': 1,
+            'timestamp': 1336026026,
+            'upload_date': '20120503',
+            'release_year': 1992,
+        },
     }]
 
     @staticmethod
@@ -274,34 +290,40 @@ def _real_extract(self, url):
         m = metadata['metadata']
         identifier = m['identifier']
 
-        info = {
+        info = traverse_obj(m, {
+            'title': ('title', {str}),
+            'description': ('description', ({str}, (..., all, {' '.join})), {clean_html}, filter, any),
+            'uploader': (('uploader', 'adder'), {str}, any),
+            'creators': ('creator', (None, ...), {str}, filter, all, filter),
+            'license': ('licenseurl', {url_or_none}),
+            'release_date': ('date', {unified_strdate}),
+            'timestamp': (('publicdate', 'addeddate'), {unified_timestamp}, any),
+            'location': ('venue', {str}),
+            'release_year': ('year', {int_or_none}),
+        })
+        info.update({
             'id': identifier,
-            'title': m['title'],
-            'description': clean_html(m.get('description')),
-            'uploader': dict_get(m, ['uploader', 'adder']),
-            'creators': traverse_obj(m, ('creator', {variadic}, {lambda x: x[0] and list(x)})),
-            'license': m.get('licenseurl'),
-            'release_date': unified_strdate(m.get('date')),
-            'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
             'webpage_url': f'https://archive.org/details/{identifier}',
-            'location': m.get('venue'),
-            'release_year': int_or_none(m.get('year'))}
+        })
 
         for f in metadata['files']:
             if f['name'] in entries:
                 entries[f['name']] = merge_dicts(entries[f['name']], {
                     'id': identifier + '/' + f['name'],
-                    'title': f.get('title') or f['name'],
-                    'display_id': f['name'],
-                    'description': clean_html(f.get('description')),
-                    'creators': traverse_obj(f, ('creator', {variadic}, {lambda x: x[0] and list(x)})),
-                    'duration': parse_duration(f.get('length')),
-                    'track_number': int_or_none(f.get('track')),
-                    'album': f.get('album'),
-                    'discnumber': int_or_none(f.get('disc')),
-                    'release_year': int_or_none(f.get('year'))})
+                    **traverse_obj(f, {
+                        'title': (('title', 'name'), {str}, any),
+                        'display_id': ('name', {str}),
+                        'description': ('description', ({str}, (..., all, {' '.join})), {clean_html}, filter, any),
+                        'creators': ('creator', (None, ...), {str}, filter, all, filter),
+                        'duration': ('length', {parse_duration}),
+                        'track_number': ('track', {int_or_none}),
+                        'album': ('album', {str}),
+                        'discnumber': ('disc', {int_or_none}),
+                        'release_year': ('year', {int_or_none}),
+                    }),
+                })
                 entry = entries[f['name']]
-            elif traverse_obj(f, 'original', expected_type=str) in entries:
+            elif traverse_obj(f, ('original', {str})) in entries:
                 entry = entries[f['original']]
             else:
                 continue
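Note on PATCH 1/4: the rewritten 'description' path is the actual fix for #13881. archive.org sometimes returns metadata['metadata']['description'] as a list of strings rather than a single string, and the branched traverse_obj path accepts both shapes. A minimal sketch of the behaviour (assuming a yt-dlp checkout on PYTHONPATH; not part of the patch):

    from yt_dlp.utils import clean_html
    from yt_dlp.utils.traversal import traverse_obj

    path = ('description', ({str}, (..., all, {' '.join})), {clean_html}, filter, any)

    # A plain string matches the first branch ({str}) and passes through unchanged
    print(traverse_obj({'description': 'a plain string'}, path))   # 'a plain string'

    # A list of strings matches the second branch and is joined with spaces
    print(traverse_obj({'description': ['part one,', 'part two']}, path))  # 'part one, part two'

The same idea replaces the old {variadic}-plus-lambda trick for 'creators': the (None, ...) branch tries both the bare value and its items, so a str and a list of str both normalize to a list of strings.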
From 70d7687487252a08dbf8b2831743e7833472ba05 Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Wed, 30 Jul 2025 18:15:59 -0500
Subject: [PATCH 2/4] [ie/TVer] Extract Streaks API info (#13885)

Closes #13874
Authored by: bashonly
---
 yt_dlp/extractor/streaks.py | 16 ++++++++++------
 yt_dlp/extractor/tver.py    | 13 ++++++++++++-
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/yt_dlp/extractor/streaks.py b/yt_dlp/extractor/streaks.py
index 1b3718473..60123d67b 100644
--- a/yt_dlp/extractor/streaks.py
+++ b/yt_dlp/extractor/streaks.py
@@ -33,16 +33,20 @@ def _extract_from_streaks_api(self, project_id, media_id, headers=None, query=No
                 **(headers or {}),
             })
         except ExtractorError as e:
-            if isinstance(e.cause, HTTPError) and e.cause.status in {403, 404}:
+            if isinstance(e.cause, HTTPError) and e.cause.status in (403, 404):
                 error = self._parse_json(e.cause.response.read().decode(), media_id, fatal=False)
                 message = traverse_obj(error, ('message', {str}))
                 code = traverse_obj(error, ('code', {str}))
+                error_id = traverse_obj(error, ('id', {int}))
                 if code == 'REQUEST_FAILED':
-                    self.raise_geo_restricted(message, countries=self._GEO_COUNTRIES)
-                elif code == 'MEDIA_NOT_FOUND':
-                    raise ExtractorError(message, expected=True)
-                elif code or message:
-                    raise ExtractorError(join_nonempty(code, message, delim=': '))
+                    if error_id == 124:
+                        self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+                    elif error_id == 126:
+                        raise ExtractorError('Access is denied (possibly due to invalid/missing API key)')
+                if code == 'MEDIA_NOT_FOUND':
+                    raise ExtractorError(join_nonempty(code, message, delim=': '), expected=True)
+                if code or message:
+                    raise ExtractorError(join_nonempty(code, error_id, message, delim=': '))
             raise
 
         streaks_id = response['id']
diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py
index 805150db4..3b6a0390a 100644
--- a/yt_dlp/extractor/tver.py
+++ b/yt_dlp/extractor/tver.py
@@ -1,3 +1,5 @@
+import datetime as dt
+
 from .streaks import StreaksBaseIE
 from ..utils import (
     ExtractorError,
@@ -7,6 +9,7 @@
     smuggle_url,
     str_or_none,
     strip_or_none,
+    time_seconds,
     update_url_query,
 )
 from ..utils.traversal import require, traverse_obj
@@ -96,6 +99,7 @@ class TVerIE(StreaksBaseIE):
         'Referer': 'https://tver.jp/',
     }
     _PLATFORM_QUERY = {}
+    _STREAKS_API_INFO = {}
 
     def _real_initialize(self):
         session_info = self._download_json(
@@ -105,6 +109,9 @@ def _real_initialize(self):
             'platform_uid': 'platform_uid',
             'platform_token': 'platform_token',
         }))
+        self._STREAKS_API_INFO = self._download_json(
+            'https://player.tver.jp/player/streaks_info_v2.json', None,
+            'Downloading STREAKS API info', 'Unable to download STREAKS API info')
 
     def _call_platform_api(self, path, video_id, note=None, fatal=True, query=None):
         return self._download_json(
@@ -223,10 +230,14 @@ def _real_extract(self, url):
                 'ie_key': 'BrightcoveNew',
             }
 
+        project_id = video_info['streaks']['projectID']
+        key_idx = dt.datetime.fromtimestamp(time_seconds(hours=9), dt.timezone.utc).month % 6 or 6
+
         return {
-            **self._extract_from_streaks_api(video_info['streaks']['projectID'], streaks_id, {
+            **self._extract_from_streaks_api(project_id, streaks_id, {
                 'Origin': 'https://tver.jp',
                 'Referer': 'https://tver.jp/',
+                'X-Streaks-Api-Key': self._STREAKS_API_INFO[project_id]['api_key'][f'key0{key_idx}'],
             }),
             **metadata,
             'id': video_id,
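Note on PATCH 2/4: the new X-Streaks-Api-Key header is chosen from the downloaded streaks_info_v2.json mapping by the current month in Japan time. A standalone sketch of how I read the key_idx expression (select_key_name is a hypothetical helper name, not part of the patch):

    import datetime as dt
    import time

    def select_key_name(now_epoch=None):
        # yt-dlp's time_seconds(hours=9) returns epoch seconds shifted by +9h,
        # so interpreting the result as UTC gives the current wall-clock time in JST
        jst = dt.datetime.fromtimestamp((now_epoch or time.time()) + 9 * 3600, dt.timezone.utc)
        key_idx = jst.month % 6 or 6  # Jan-May -> 1-5, Jun -> 6, Jul-Nov -> 1-5, Dec -> 6
        return f'key0{key_idx}'

    # July and January both select key01; June and December select key06
    print(select_key_name(dt.datetime(2025, 7, 30, tzinfo=dt.timezone.utc).timestamp()))   # key01
    print(select_key_name(dt.datetime(2025, 12, 15, tzinfo=dt.timezone.utc).timestamp()))  # key06

In other words, the six keys key01..key06 rotate monthly, and the cycle wraps twice a year.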
From 121647705a2fc6b968278723fe61801007e228a4 Mon Sep 17 00:00:00 2001
From: Abdulmohsen <1621552+arabcoders@users.noreply.github.com>
Date: Thu, 31 Jul 2025 02:23:06 +0300
Subject: [PATCH 3/4] [ie/TVer] Support --ignore-no-formats-error when geo-blocked (#13598)

Authored by: arabcoders
---
 yt_dlp/extractor/tver.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py
index 3b6a0390a..a3dbabfd1 100644
--- a/yt_dlp/extractor/tver.py
+++ b/yt_dlp/extractor/tver.py
@@ -3,6 +3,7 @@
 from .streaks import StreaksBaseIE
 from ..utils import (
     ExtractorError,
+    GeoRestrictedError,
     int_or_none,
     join_nonempty,
     make_archive_id,
@@ -226,19 +227,26 @@ def _real_extract(self, url):
             '_type': 'url_transparent',
             'url': smuggle_url(
                 self.BRIGHTCOVE_URL_TEMPLATE % (account_id, brightcove_id),
-                {'geo_countries': ['JP']}),
+                {'geo_countries': self._GEO_COUNTRIES}),
             'ie_key': 'BrightcoveNew',
         }
 
         project_id = video_info['streaks']['projectID']
         key_idx = dt.datetime.fromtimestamp(time_seconds(hours=9), dt.timezone.utc).month % 6 or 6
 
-        return {
-            **self._extract_from_streaks_api(project_id, streaks_id, {
+        try:
+            streaks_info = self._extract_from_streaks_api(project_id, streaks_id, {
                 'Origin': 'https://tver.jp',
                 'Referer': 'https://tver.jp/',
                 'X-Streaks-Api-Key': self._STREAKS_API_INFO[project_id]['api_key'][f'key0{key_idx}'],
-            }),
+            })
+        except GeoRestrictedError as e:
+            # Catch and re-raise with metadata_available to support --ignore-no-formats-error
+            self.raise_geo_restricted(e.orig_msg, countries=self._GEO_COUNTRIES, metadata_available=True)
+            streaks_info = {}
+
+        return {
+            **streaks_info,
             **metadata,
             'id': video_id,
             '_old_archive_ids': [make_archive_id('BrightcoveNew', brightcove_id)] if brightcove_id else None,
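Note on PATCH 3/4: the try/except works because raise_geo_restricted with metadata_available=True only warns instead of raising when the user passed --ignore-no-formats-error, so execution falls through to the metadata-only return with streaks_info = {}. Roughly the control flow inside InfoExtractor (a simplified sketch, not the verbatim yt-dlp source):

    def raise_geo_restricted(self, msg, countries=None, metadata_available=False):
        if metadata_available and self.get_param('ignore_no_formats_error'):
            # Warn only; the calling extractor continues and returns metadata without formats
            self.report_warning(msg)
        else:
            raise GeoRestrictedError(msg, countries=countries)

Without the flag, the GeoRestrictedError propagates exactly as before, so the default behaviour is unchanged.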
From 71f30921a2023dbb25c53fd1bb1399cac803116d Mon Sep 17 00:00:00 2001
From: garret1317
Date: Thu, 31 Jul 2025 21:33:05 +0100
Subject: [PATCH 4/4] [ie/tbsjp] Fix extractor (#13485)

Closes #13484
Authored by: garret1317
---
 yt_dlp/extractor/tbsjp.py | 139 +++++++++++++++++++-------------------
 1 file changed, 71 insertions(+), 68 deletions(-)

diff --git a/yt_dlp/extractor/tbsjp.py b/yt_dlp/extractor/tbsjp.py
index 0d521f106..2f3ef0154 100644
--- a/yt_dlp/extractor/tbsjp.py
+++ b/yt_dlp/extractor/tbsjp.py
@@ -1,104 +1,107 @@
-from .common import InfoExtractor
-from ..networking.exceptions import HTTPError
+from .streaks import StreaksBaseIE
 from ..utils import (
-    ExtractorError,
     clean_html,
     int_or_none,
     str_or_none,
     unified_timestamp,
-    urljoin,
+    url_or_none,
 )
-from ..utils.traversal import find_element, traverse_obj
+from ..utils.traversal import traverse_obj
 
 
-class TBSJPEpisodeIE(InfoExtractor):
+class TBSJPBaseIE(StreaksBaseIE):
+    def _search_window_app_json(self, webpage, name, item_id, **kwargs):
+        return self._search_json(r'window\.app\s*=', webpage, f'{name} info', item_id, **kwargs)
+
+
+class TBSJPEpisodeIE(TBSJPBaseIE):
    _VALID_URL = r'https?://cu\.tbs\.co\.jp/episode/(?P<id>[\d_]+)'
-    _GEO_BYPASS = False
     _TESTS = [{
-        'url': 'https://cu.tbs.co.jp/episode/23613_2044134_1000049010',
-        'skip': 'streams geo-restricted, Japan only. Also, will likely expire eventually',
+        'url': 'https://cu.tbs.co.jp/episode/14694_2094162_1000123656',
+        'skip': 'geo-blocked to japan + 7-day expiry',
         'info_dict': {
+            'title': 'クロちゃん、寝て起きたら川のほとりにいてその向こう岸に亡くなった父親がいたら死の淵にいるかと思う説 ほか',
+            'id': '14694_2094162_1000123656',
+            'ext': 'mp4',
+            'display_id': 'ref:14694_2094162_1000123656',
+            'description': 'md5:1a82fcdeb5e2e82190544bb72721c46e',
+            'uploader': 'TBS',
+            'uploader_id': 'tbs',
+            'duration': 2752,
+            'thumbnail': 'md5:d8855c8c292683c95a84cafdb42300bc',
+            'categories': ['エンタメ', '水曜日のダウンタウン', 'ダウンタウン', '浜田雅功', '松本人志', '水ダウ', '動画', 'バラエティ'],
+            'cast': ['浜田 雅功', '藤本 敏史', 'ビビる 大木', '千原 ジュニア', '横澤 夏子', 'せいや', 'あの', '服部 潤'],
+            'genres': ['variety'],
+            'series': '水曜日のダウンタウン',
+            'series_id': '14694',
+            'episode': 'クロちゃん、寝て起きたら川のほとりにいてその向こう岸に亡くなった父親がいたら死の淵にいるかと思う説 ほか',
+            'episode_number': 341,
+            'episode_id': '14694_2094162_1000123656',
+            'timestamp': 1753778992,
+            'upload_date': '20250729',
+            'release_timestamp': 1753880402,
+            'release_date': '20250730',
+            'modified_timestamp': 1753880741,
+            'modified_date': '20250730',
+            'live_status': 'not_live',
         },
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
-        meta = self._search_json(r'window\.app\s*=', webpage, 'episode info', video_id, fatal=False)
+        meta = self._search_window_app_json(webpage, 'episode', video_id, fatal=False)
         episode = traverse_obj(meta, ('falcorCache', 'catalog', 'episode', video_id, 'value'))
 
-        tf_path = self._search_regex(
-            r'<script[^>]+src=["\'](/assets/tf\.[^"\']+\.js)["\']', webpage, 'stream API config')
-        tf_js = self._download_webpage(urljoin(url, tf_path), video_id, note='Downloading stream API config')
-        video_url = self._search_regex(r'videoPlaybackUrl:\s*[\'"]([^\'"]+)[\'"]', tf_js, 'stream API url')
-        api_key = self._search_regex(r'api_key:\s*[\'"]([^\'"]+)[\'"]', tf_js, 'stream API key')
-
-        try:
-            source_meta = self._download_json(f'{video_url}ref:{video_id}', video_id,
-                                              headers={'X-Streaks-Api-Key': api_key},
-                                              note='Downloading stream metadata')
-        except ExtractorError as e:
-            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
-                self.raise_geo_restricted(countries=['JP'])
-            raise
-
-        formats, subtitles = [], {}
-        for src in traverse_obj(source_meta, ('sources', ..., 'src')):
-            fmts, subs = self._extract_m3u8_formats_and_subtitles(src, video_id, fatal=False)
-            formats.extend(fmts)
-            self._merge_subtitles(subs, target=subtitles)
-
         return {
-            'title': traverse_obj(webpage, ({find_element(tag='h3')}, {clean_html})),
-            'id': video_id,
+            **self._extract_from_streaks_api(
+                'tbs', f'ref:{video_id}', headers={'Origin': 'https://cu.tbs.co.jp'}),
             **traverse_obj(episode, {
-                'categories': ('keywords', {list}),
-                'id': ('content_id', {str}),
-                'description': ('description', 0, 'value'),
-                'timestamp': ('created_at', {unified_timestamp}),
-                'release_timestamp': ('pub_date', {unified_timestamp}),
+                'title': ('title', ..., 'value', {str}, any),
+                'cast': (
+                    'credit', ..., 'name', ..., 'value', {clean_html}, any,
+                    {lambda x: x.split(',')}, ..., {str.strip}, filter, all, filter),
+                'categories': ('keywords', ..., {str}, filter, all, filter),
+                'description': ('description', ..., 'value', {clean_html}, any),
                 'duration': ('tv_episode_info', 'duration', {int_or_none}),
+                'episode': ('title', lambda _, v: not v.get('is_phonetic'), 'value', {str}, any),
+                'episode_id': ('content_id', {str}),
                 'episode_number': ('tv_episode_info', 'episode_number', {int_or_none}),
-                'episode': ('title', lambda _, v: not v.get('is_phonetic'), 'value'),
-                'series': ('custom_data', 'program_name'),
-            }, get_all=False),
-            'formats': formats,
-            'subtitles': subtitles,
+                'genres': ('genre', ..., {str}, filter, all, filter),
+                'release_timestamp': ('pub_date', {unified_timestamp}),
+                'series': ('custom_data', 'program_name', {str}),
+                'tags': ('tags', ..., {str}, filter, all, filter),
+                'thumbnail': ('artwork', ..., 'url', {url_or_none}, any),
+                'timestamp': ('created_at', {unified_timestamp}),
+                'uploader': ('tv_show_info', 'networks', ..., {str}, any),
+            }),
+            **traverse_obj(episode, ('tv_episode_info', {
+                'duration': ('duration', {int_or_none}),
+                'episode_number': ('episode_number', {int_or_none}),
+                'series_id': ('show_content_id', {str}),
+            })),
+            'id': video_id,
         }
 
 
-class TBSJPProgramIE(InfoExtractor):
+class TBSJPProgramIE(TBSJPBaseIE):
     _VALID_URL = r'https?://cu\.tbs\.co\.jp/program/(?P<id>\d+)'
     _TESTS = [{
-        'url': 'https://cu.tbs.co.jp/program/23601',
-        'playlist_mincount': 4,
+        'url': 'https://cu.tbs.co.jp/program/14694',
+        'playlist_mincount': 1,
         'info_dict': {
-            'id': '23601',
-            'categories': ['エンタメ', 'ミライカプセル', '会社', '働く', 'バラエティ', '動画'],
-            'description': '幼少期の夢は大人になって、どう成長したのだろうか?\nそしてその夢は今後、どのように広がっていくのか?\nいま話題の会社で働く人の「夢の成長」を描く',
-            'series': 'ミライカプセル -I have a dream-',
-            'title': 'ミライカプセル -I have a dream-',
+            'id': '14694',
+            'title': '水曜日のダウンタウン',
+            'description': 'md5:cf1d46c76c2755d7f87512498718b837',
+            'categories': ['エンタメ', '水曜日のダウンタウン', 'ダウンタウン', '浜田雅功', '松本人志', '水ダウ', '動画', 'バラエティ'],
+            'series': '水曜日のダウンタウン',
         },
     }]
 
     def _real_extract(self, url):
         programme_id = self._match_id(url)
         webpage = self._download_webpage(url, programme_id)
-        meta = self._search_json(r'window\.app\s*=', webpage, 'programme info', programme_id)
-
+        meta = self._search_window_app_json(webpage, 'programme', programme_id)
         programme = traverse_obj(meta, ('falcorCache', 'catalog', 'program', programme_id, 'false', 'value'))
 
         return {
@@ -116,7 +119,7 @@ def _real_extract(self, url):
     }
 
 
-class TBSJPPlaylistIE(InfoExtractor):
+class TBSJPPlaylistIE(TBSJPBaseIE):
     _VALID_URL = r'https?://cu\.tbs\.co\.jp/playlist/(?P<id>[\da-f]+)'
     _TESTS = [{
         'url': 'https://cu.tbs.co.jp/playlist/184f9970e7ba48e4915f1b252c55015e',
@@ -129,8 +132,8 @@ class TBSJPPlaylistIE(InfoExtractor):
 
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
-        page = self._download_webpage(url, playlist_id)
-        meta = self._search_json(r'window\.app\s*=', page, 'playlist info', playlist_id)
+        webpage = self._download_webpage(url, playlist_id)
+        meta = self._search_window_app_json(webpage, 'playlist', playlist_id)
         playlist = traverse_obj(meta, ('falcorCache', 'playList', playlist_id))
 
         def entries():
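Note on PATCH 4/4: all three TBS extractors scrape the same window.app JSON blob from the page, which is why the parsing moved into the shared TBSJPBaseIE._search_window_app_json helper. A rough standalone equivalent with toy page content (the real _search_json does balanced-brace parsing rather than this naive regex):

    import json
    import re

    webpage = '<script>window.app = {"falcorCache": {"catalog": {}}};</script>'
    app_json = re.search(r'window\.app\s*=\s*({.+?})\s*;', webpage).group(1)
    print(json.loads(app_json)['falcorCache'])  # {'catalog': {}}

The episode extractor then delegates playback to the shared Streaks code path, _extract_from_streaks_api('tbs', f'ref:{video_id}', ...), instead of re-implementing the API request by hand, which is what fixes #13484.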