1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-08-15 00:48:28 +00:00

Merge branch 'yt-dlp:master' into misc-2025-08

This commit is contained in:
bashonly 2025-08-01 15:29:01 -05:00
commit 8d6510623b
No known key found for this signature in database
GPG Key ID: 783F096F253D15B0
4 changed files with 147 additions and 99 deletions

View File

@ -33,7 +33,6 @@
unified_timestamp, unified_timestamp,
url_or_none, url_or_none,
urlhandle_detect_ext, urlhandle_detect_ext,
variadic,
) )
@ -232,6 +231,23 @@ class ArchiveOrgIE(InfoExtractor):
'release_date': '19950402', 'release_date': '19950402',
'timestamp': 1084927901, 'timestamp': 1084927901,
}, },
}, {
# metadata['metadata']['description'] is a list of strings instead of str
'url': 'https://archive.org/details/pra-KZ1908.02',
'info_dict': {
'id': 'pra-KZ1908.02',
'ext': 'mp3',
'display_id': 'KZ1908.02_01.wav',
'title': 'Crips and Bloods speak about gang life',
'description': 'md5:2b56b35ff021311e3554b47a285e70b3',
'uploader': 'jake@archive.org',
'duration': 1733.74,
'track': 'KZ1908.02 01',
'track_number': 1,
'timestamp': 1336026026,
'upload_date': '20120503',
'release_year': 1992,
},
}] }]
@staticmethod @staticmethod
@ -274,34 +290,40 @@ def _real_extract(self, url):
m = metadata['metadata'] m = metadata['metadata']
identifier = m['identifier'] identifier = m['identifier']
info = { info = traverse_obj(m, {
'title': ('title', {str}),
'description': ('description', ({str}, (..., all, {' '.join})), {clean_html}, filter, any),
'uploader': (('uploader', 'adder'), {str}, any),
'creators': ('creator', (None, ...), {str}, filter, all, filter),
'license': ('licenseurl', {url_or_none}),
'release_date': ('date', {unified_strdate}),
'timestamp': (('publicdate', 'addeddate'), {unified_timestamp}, any),
'location': ('venue', {str}),
'release_year': ('year', {int_or_none}),
})
info.update({
'id': identifier, 'id': identifier,
'title': m['title'],
'description': clean_html(m.get('description')),
'uploader': dict_get(m, ['uploader', 'adder']),
'creators': traverse_obj(m, ('creator', {variadic}, {lambda x: x[0] and list(x)})),
'license': m.get('licenseurl'),
'release_date': unified_strdate(m.get('date')),
'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
'webpage_url': f'https://archive.org/details/{identifier}', 'webpage_url': f'https://archive.org/details/{identifier}',
'location': m.get('venue'), })
'release_year': int_or_none(m.get('year'))}
for f in metadata['files']: for f in metadata['files']:
if f['name'] in entries: if f['name'] in entries:
entries[f['name']] = merge_dicts(entries[f['name']], { entries[f['name']] = merge_dicts(entries[f['name']], {
'id': identifier + '/' + f['name'], 'id': identifier + '/' + f['name'],
'title': f.get('title') or f['name'], **traverse_obj(f, {
'display_id': f['name'], 'title': (('title', 'name'), {str}, any),
'description': clean_html(f.get('description')), 'display_id': ('name', {str}),
'creators': traverse_obj(f, ('creator', {variadic}, {lambda x: x[0] and list(x)})), 'description': ('description', ({str}, (..., all, {' '.join})), {clean_html}, filter, any),
'duration': parse_duration(f.get('length')), 'creators': ('creator', (None, ...), {str}, filter, all, filter),
'track_number': int_or_none(f.get('track')), 'duration': ('length', {parse_duration}),
'album': f.get('album'), 'track_number': ('track', {int_or_none}),
'discnumber': int_or_none(f.get('disc')), 'album': ('album', {str}),
'release_year': int_or_none(f.get('year'))}) 'discnumber': ('disc', {int_or_none}),
'release_year': ('year', {int_or_none}),
}),
})
entry = entries[f['name']] entry = entries[f['name']]
elif traverse_obj(f, 'original', expected_type=str) in entries: elif traverse_obj(f, ('original', {str})) in entries:
entry = entries[f['original']] entry = entries[f['original']]
else: else:
continue continue

View File

@ -33,16 +33,20 @@ def _extract_from_streaks_api(self, project_id, media_id, headers=None, query=No
**(headers or {}), **(headers or {}),
}) })
except ExtractorError as e: except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status in {403, 404}: if isinstance(e.cause, HTTPError) and e.cause.status in (403, 404):
error = self._parse_json(e.cause.response.read().decode(), media_id, fatal=False) error = self._parse_json(e.cause.response.read().decode(), media_id, fatal=False)
message = traverse_obj(error, ('message', {str})) message = traverse_obj(error, ('message', {str}))
code = traverse_obj(error, ('code', {str})) code = traverse_obj(error, ('code', {str}))
error_id = traverse_obj(error, ('id', {int}))
if code == 'REQUEST_FAILED': if code == 'REQUEST_FAILED':
self.raise_geo_restricted(message, countries=self._GEO_COUNTRIES) if error_id == 124:
elif code == 'MEDIA_NOT_FOUND': self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
raise ExtractorError(message, expected=True) elif error_id == 126:
elif code or message: raise ExtractorError('Access is denied (possibly due to invalid/missing API key)')
raise ExtractorError(join_nonempty(code, message, delim=': ')) if code == 'MEDIA_NOT_FOUND':
raise ExtractorError(join_nonempty(code, message, delim=': '), expected=True)
if code or message:
raise ExtractorError(join_nonempty(code, error_id, message, delim=': '))
raise raise
streaks_id = response['id'] streaks_id = response['id']

View File

@ -1,104 +1,107 @@
from .common import InfoExtractor from .streaks import StreaksBaseIE
from ..networking.exceptions import HTTPError
from ..utils import ( from ..utils import (
ExtractorError,
clean_html, clean_html,
int_or_none, int_or_none,
str_or_none, str_or_none,
unified_timestamp, unified_timestamp,
urljoin, url_or_none,
) )
from ..utils.traversal import find_element, traverse_obj from ..utils.traversal import traverse_obj
class TBSJPEpisodeIE(InfoExtractor): class TBSJPBaseIE(StreaksBaseIE):
def _search_window_app_json(self, webpage, name, item_id, **kwargs):
return self._search_json(r'window\.app\s*=', webpage, f'{name} info', item_id, **kwargs)
class TBSJPEpisodeIE(TBSJPBaseIE):
_VALID_URL = r'https?://cu\.tbs\.co\.jp/episode/(?P<id>[\d_]+)' _VALID_URL = r'https?://cu\.tbs\.co\.jp/episode/(?P<id>[\d_]+)'
_GEO_BYPASS = False
_TESTS = [{ _TESTS = [{
'url': 'https://cu.tbs.co.jp/episode/23613_2044134_1000049010', 'url': 'https://cu.tbs.co.jp/episode/14694_2094162_1000123656',
'skip': 'streams geo-restricted, Japan only. Also, will likely expire eventually', 'skip': 'geo-blocked to japan + 7-day expiry',
'info_dict': { 'info_dict': {
'title': 'VIVANT 第三話 誤送金完結へ!絶体絶命の反撃開始', 'title': 'クロちゃん、寝て起きたら川のほとりにいてその向こう岸に亡くなった父親がいたら死の淵にいるかと思う説 ほか',
'id': '23613_2044134_1000049010', 'id': '14694_2094162_1000123656',
'ext': 'mp4', 'ext': 'mp4',
'upload_date': '20230728', 'display_id': 'ref:14694_2094162_1000123656',
'duration': 3517, 'description': 'md5:1a82fcdeb5e2e82190544bb72721c46e',
'release_timestamp': 1691118230, 'uploader': 'TBS',
'episode': '第三話 誤送金完結へ!絶体絶命の反撃開始', 'uploader_id': 'tbs',
'release_date': '20230804', 'duration': 2752,
'categories': 'count:11', 'thumbnail': 'md5:d8855c8c292683c95a84cafdb42300bc',
'episode_number': 3, 'categories': ['エンタメ', '水曜日のダウンタウン', 'ダウンタウン', '浜田雅功', '松本人志', '水ダウ', '動画', 'バラエティ'],
'timestamp': 1690522538, 'cast': ['浜田 雅功', '藤本 敏史', 'ビビる 大木', '千原 ジュニア', '横澤 夏子', 'せいや', 'あの', '服部 潤'],
'description': 'md5:2b796341af1ef772034133174ba4a895', 'genres': ['variety'],
'series': 'VIVANT', 'series': '水曜日のダウンタウン',
'series_id': '14694',
'episode': 'クロちゃん、寝て起きたら川のほとりにいてその向こう岸に亡くなった父親がいたら死の淵にいるかと思う説 ほか',
'episode_number': 341,
'episode_id': '14694_2094162_1000123656',
'timestamp': 1753778992,
'upload_date': '20250729',
'release_timestamp': 1753880402,
'release_date': '20250730',
'modified_timestamp': 1753880741,
'modified_date': '20250730',
'live_status': 'not_live',
}, },
}] }]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
meta = self._search_json(r'window\.app\s*=', webpage, 'episode info', video_id, fatal=False) meta = self._search_window_app_json(webpage, 'episode', video_id, fatal=False)
episode = traverse_obj(meta, ('falcorCache', 'catalog', 'episode', video_id, 'value')) episode = traverse_obj(meta, ('falcorCache', 'catalog', 'episode', video_id, 'value'))
tf_path = self._search_regex(
r'<script[^>]+src=["\'](/assets/tf\.[^"\']+\.js)["\']', webpage, 'stream API config')
tf_js = self._download_webpage(urljoin(url, tf_path), video_id, note='Downloading stream API config')
video_url = self._search_regex(r'videoPlaybackUrl:\s*[\'"]([^\'"]+)[\'"]', tf_js, 'stream API url')
api_key = self._search_regex(r'api_key:\s*[\'"]([^\'"]+)[\'"]', tf_js, 'stream API key')
try:
source_meta = self._download_json(f'{video_url}ref:{video_id}', video_id,
headers={'X-Streaks-Api-Key': api_key},
note='Downloading stream metadata')
except ExtractorError as e:
if isinstance(e.cause, HTTPError) and e.cause.status == 403:
self.raise_geo_restricted(countries=['JP'])
raise
formats, subtitles = [], {}
for src in traverse_obj(source_meta, ('sources', ..., 'src')):
fmts, subs = self._extract_m3u8_formats_and_subtitles(src, video_id, fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
return { return {
'title': traverse_obj(webpage, ({find_element(tag='h3')}, {clean_html})), **self._extract_from_streaks_api(
'id': video_id, 'tbs', f'ref:{video_id}', headers={'Origin': 'https://cu.tbs.co.jp'}),
**traverse_obj(episode, { **traverse_obj(episode, {
'categories': ('keywords', {list}), 'title': ('title', ..., 'value', {str}, any),
'id': ('content_id', {str}), 'cast': (
'description': ('description', 0, 'value'), 'credit', ..., 'name', ..., 'value', {clean_html}, any,
'timestamp': ('created_at', {unified_timestamp}), {lambda x: x.split(',')}, ..., {str.strip}, filter, all, filter),
'release_timestamp': ('pub_date', {unified_timestamp}), 'categories': ('keywords', ..., {str}, filter, all, filter),
'description': ('description', ..., 'value', {clean_html}, any),
'duration': ('tv_episode_info', 'duration', {int_or_none}), 'duration': ('tv_episode_info', 'duration', {int_or_none}),
'episode': ('title', lambda _, v: not v.get('is_phonetic'), 'value', {str}, any),
'episode_id': ('content_id', {str}),
'episode_number': ('tv_episode_info', 'episode_number', {int_or_none}), 'episode_number': ('tv_episode_info', 'episode_number', {int_or_none}),
'episode': ('title', lambda _, v: not v.get('is_phonetic'), 'value'), 'genres': ('genre', ..., {str}, filter, all, filter),
'series': ('custom_data', 'program_name'), 'release_timestamp': ('pub_date', {unified_timestamp}),
}, get_all=False), 'series': ('custom_data', 'program_name', {str}),
'formats': formats, 'tags': ('tags', ..., {str}, filter, all, filter),
'subtitles': subtitles, 'thumbnail': ('artwork', ..., 'url', {url_or_none}, any),
'timestamp': ('created_at', {unified_timestamp}),
'uploader': ('tv_show_info', 'networks', ..., {str}, any),
}),
**traverse_obj(episode, ('tv_episode_info', {
'duration': ('duration', {int_or_none}),
'episode_number': ('episode_number', {int_or_none}),
'series_id': ('show_content_id', {str}),
})),
'id': video_id,
} }
class TBSJPProgramIE(InfoExtractor): class TBSJPProgramIE(TBSJPBaseIE):
_VALID_URL = r'https?://cu\.tbs\.co\.jp/program/(?P<id>\d+)' _VALID_URL = r'https?://cu\.tbs\.co\.jp/program/(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://cu.tbs.co.jp/program/23601', 'url': 'https://cu.tbs.co.jp/program/14694',
'playlist_mincount': 4, 'playlist_mincount': 1,
'info_dict': { 'info_dict': {
'id': '23601', 'id': '14694',
'categories': ['エンタメ', 'ミライカプセル', '会社', '働く', 'バラエティ', '動画'], 'title': '水曜日のダウンタウン',
'description': '幼少期の夢は大人になって、どう成長したのだろうか?\nそしてその夢は今後、どのように広がっていくのか?\nいま話題の会社で働く人の「夢の成長」を描く', 'description': 'md5:cf1d46c76c2755d7f87512498718b837',
'series': 'ミライカプセル -I have a dream-', 'categories': ['エンタメ', '水曜日のダウンタウン', 'ダウンタウン', '浜田雅功', '松本人志', '水ダウ', '動画', 'バラエティ'],
'title': 'ミライカプセル -I have a dream-', 'series': '水曜日のダウンタウン',
}, },
}] }]
def _real_extract(self, url): def _real_extract(self, url):
programme_id = self._match_id(url) programme_id = self._match_id(url)
webpage = self._download_webpage(url, programme_id) webpage = self._download_webpage(url, programme_id)
meta = self._search_json(r'window\.app\s*=', webpage, 'programme info', programme_id) meta = self._search_window_app_json(webpage, 'programme', programme_id)
programme = traverse_obj(meta, ('falcorCache', 'catalog', 'program', programme_id, 'false', 'value')) programme = traverse_obj(meta, ('falcorCache', 'catalog', 'program', programme_id, 'false', 'value'))
return { return {
@ -116,7 +119,7 @@ def _real_extract(self, url):
} }
class TBSJPPlaylistIE(InfoExtractor): class TBSJPPlaylistIE(TBSJPBaseIE):
_VALID_URL = r'https?://cu\.tbs\.co\.jp/playlist/(?P<id>[\da-f]+)' _VALID_URL = r'https?://cu\.tbs\.co\.jp/playlist/(?P<id>[\da-f]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://cu.tbs.co.jp/playlist/184f9970e7ba48e4915f1b252c55015e', 'url': 'https://cu.tbs.co.jp/playlist/184f9970e7ba48e4915f1b252c55015e',
@ -129,8 +132,8 @@ class TBSJPPlaylistIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
playlist_id = self._match_id(url) playlist_id = self._match_id(url)
page = self._download_webpage(url, playlist_id) webpage = self._download_webpage(url, playlist_id)
meta = self._search_json(r'window\.app\s*=', page, 'playlist info', playlist_id) meta = self._search_window_app_json(webpage, 'playlist', playlist_id)
playlist = traverse_obj(meta, ('falcorCache', 'playList', playlist_id)) playlist = traverse_obj(meta, ('falcorCache', 'playList', playlist_id))
def entries(): def entries():

View File

@ -1,12 +1,16 @@
import datetime as dt
from .streaks import StreaksBaseIE from .streaks import StreaksBaseIE
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
GeoRestrictedError,
int_or_none, int_or_none,
join_nonempty, join_nonempty,
make_archive_id, make_archive_id,
smuggle_url, smuggle_url,
str_or_none, str_or_none,
strip_or_none, strip_or_none,
time_seconds,
update_url_query, update_url_query,
) )
from ..utils.traversal import require, traverse_obj from ..utils.traversal import require, traverse_obj
@ -96,6 +100,7 @@ class TVerIE(StreaksBaseIE):
'Referer': 'https://tver.jp/', 'Referer': 'https://tver.jp/',
} }
_PLATFORM_QUERY = {} _PLATFORM_QUERY = {}
_STREAKS_API_INFO = {}
def _real_initialize(self): def _real_initialize(self):
session_info = self._download_json( session_info = self._download_json(
@ -105,6 +110,9 @@ def _real_initialize(self):
'platform_uid': 'platform_uid', 'platform_uid': 'platform_uid',
'platform_token': 'platform_token', 'platform_token': 'platform_token',
})) }))
self._STREAKS_API_INFO = self._download_json(
'https://player.tver.jp/player/streaks_info_v2.json', None,
'Downloading STREAKS API info', 'Unable to download STREAKS API info')
def _call_platform_api(self, path, video_id, note=None, fatal=True, query=None): def _call_platform_api(self, path, video_id, note=None, fatal=True, query=None):
return self._download_json( return self._download_json(
@ -219,15 +227,26 @@ def _real_extract(self, url):
'_type': 'url_transparent', '_type': 'url_transparent',
'url': smuggle_url( 'url': smuggle_url(
self.BRIGHTCOVE_URL_TEMPLATE % (account_id, brightcove_id), self.BRIGHTCOVE_URL_TEMPLATE % (account_id, brightcove_id),
{'geo_countries': ['JP']}), {'geo_countries': self._GEO_COUNTRIES}),
'ie_key': 'BrightcoveNew', 'ie_key': 'BrightcoveNew',
} }
return { project_id = video_info['streaks']['projectID']
**self._extract_from_streaks_api(video_info['streaks']['projectID'], streaks_id, { key_idx = dt.datetime.fromtimestamp(time_seconds(hours=9), dt.timezone.utc).month % 6 or 6
try:
streaks_info = self._extract_from_streaks_api(project_id, streaks_id, {
'Origin': 'https://tver.jp', 'Origin': 'https://tver.jp',
'Referer': 'https://tver.jp/', 'Referer': 'https://tver.jp/',
}), 'X-Streaks-Api-Key': self._STREAKS_API_INFO[project_id]['api_key'][f'key0{key_idx}'],
})
except GeoRestrictedError as e:
# Catch and re-raise with metadata_available to support --ignore-no-formats-error
self.raise_geo_restricted(e.orig_msg, countries=self._GEO_COUNTRIES, metadata_available=True)
streaks_info = {}
return {
**streaks_info,
**metadata, **metadata,
'id': video_id, 'id': video_id,
'_old_archive_ids': [make_archive_id('BrightcoveNew', brightcove_id)] if brightcove_id else None, '_old_archive_ids': [make_archive_id('BrightcoveNew', brightcove_id)] if brightcove_id else None,