From 83fabf352489d52843f67e6e9cc752db86d27e6e Mon Sep 17 00:00:00 2001 From: garret1317 Date: Wed, 21 May 2025 19:29:35 +0100 Subject: [PATCH 01/68] [ie/xinpianchang] Fix extractor (#13245) Closes #12737 Authored by: garret1317 --- yt_dlp/extractor/xinpianchang.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/xinpianchang.py b/yt_dlp/extractor/xinpianchang.py index 23ed9270d..a4263579a 100644 --- a/yt_dlp/extractor/xinpianchang.py +++ b/yt_dlp/extractor/xinpianchang.py @@ -45,7 +45,7 @@ class XinpianchangIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id=video_id) + webpage = self._download_webpage(url, video_id=video_id, headers={'Referer': url}) video_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['detail']['video'] data = self._download_json( From 167d7a9f0ffd1b4fe600193441bdb7358db2740b Mon Sep 17 00:00:00 2001 From: sepro Date: Thu, 22 May 2025 00:27:07 +0200 Subject: [PATCH 02/68] [jsinterp] Fix increment/decrement evaluation (#13238) Closes #13241 Authored by: seproDev, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- test/test_jsinterp.py | 8 +++++ test/test_youtube_signature.py | 4 +++ yt_dlp/jsinterp.py | 61 +++++++++++++++++----------------- 3 files changed, 43 insertions(+), 30 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index b14069ccc..2e3cdc2a5 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -478,6 +478,14 @@ def test_extract_function_with_global_stack(self): func = jsi.extract_function('c', {'e': 10}, {'f': 100, 'g': 1000}) self.assertEqual(func([1]), 1111) + def test_increment_decrement(self): + self._test('function f() { var x = 1; return ++x; }', 2) + self._test('function f() { var x = 1; return x++; }', 1) + self._test('function f() { var x = 1; x--; return x }', 0) + self._test('function f() { var y; var x = 1; x++, --x, x--, x--, y="z", "abc", x++; return --x }', -1) + self._test('function f() { var a = "test--"; return a; }', 'test--') + self._test('function f() { var b = 1; var a = "b--"; return a; }', 'b--') + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 0f0885366..3f777aed7 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -316,6 +316,10 @@ 'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', 'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', ), + ( + 'https://www.youtube.com/s/player/59b252b9/player_ias.vflset/en_US/base.js', + 'D3XWVpYgwhLLKNK4AGX', 'aZrQ1qWJ5yv5h', + ), ] diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index b59fb2c61..45aeffa22 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -590,39 +590,12 @@ def dict_item(key, val): return ret, True return ret, False - for m in re.finditer(rf'''(?x) - (?P\+\+|--)(?P{_NAME_RE})| - (?P{_NAME_RE})(?P\+\+|--)''', expr): - var = m.group('var1') or m.group('var2') - start, end = m.span() - sign = m.group('pre_sign') or m.group('post_sign') - ret = local_vars[var] - local_vars[var] += 1 if sign[0] == '+' else -1 - if m.group('pre_sign'): - ret = local_vars[var] - expr = expr[:start] + self._dump(ret, local_vars) + expr[end:] - - if not expr: - return None, should_return - m = re.match(fr'''(?x) - (?P (?P{_NAME_RE})(?:\[(?P{_NESTED_BRACKETS})\])?\s* (?P{"|".join(map(re.escape, set(_OPERATORS) - _COMP_OPERATORS))})? =(?!=)(?P.*)$ - )|(?P - (?!if|return|true|false|null|undefined|NaN)(?P{_NAME_RE})$ - )|(?P - (?P{_NAME_RE})(?: - (?P\?)?\.(?P[^(]+)| - \[(?P{_NESTED_BRACKETS})\] - )\s* - )|(?P - (?P{_NAME_RE})\[(?P.+)\]$ - )|(?P - (?P{_NAME_RE})\((?P.*)\)$ - )''', expr) - if m and m.group('assign'): + ''', expr) + if m: # We are assigning a value to a variable left_val = local_vars.get(m.group('out')) if not m.group('index'): @@ -640,7 +613,35 @@ def dict_item(key, val): m.group('op'), self._index(left_val, idx), m.group('expr'), expr, local_vars, allow_recursion) return left_val[idx], should_return - elif expr.isdigit(): + for m in re.finditer(rf'''(?x) + (?P\+\+|--)(?P{_NAME_RE})| + (?P{_NAME_RE})(?P\+\+|--)''', expr): + var = m.group('var1') or m.group('var2') + start, end = m.span() + sign = m.group('pre_sign') or m.group('post_sign') + ret = local_vars[var] + local_vars[var] += 1 if sign[0] == '+' else -1 + if m.group('pre_sign'): + ret = local_vars[var] + expr = expr[:start] + self._dump(ret, local_vars) + expr[end:] + + if not expr: + return None, should_return + + m = re.match(fr'''(?x) + (?P + (?!if|return|true|false|null|undefined|NaN)(?P{_NAME_RE})$ + )|(?P + (?P{_NAME_RE})(?: + (?P\?)?\.(?P[^(]+)| + \[(?P{_NESTED_BRACKETS})\] + )\s* + )|(?P + (?P{_NAME_RE})\[(?P.+)\]$ + )|(?P + (?P{_NAME_RE})\((?P.*)\)$ + )''', expr) + if expr.isdigit(): return int(expr), should_return elif expr == 'break': From 32ed5f107c6c641958d1cd2752e130de4db55a13 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 22 May 2025 04:13:42 -0500 Subject: [PATCH 03/68] [ie/youtube] Add PO token support for subtitles (#13234) Closes #13075 Authored by: bashonly, coletdjnz Co-authored-by: coletdjnz --- README.md | 2 +- test/test_pot/test_pot_builtin_utils.py | 1 + yt_dlp/extractor/youtube/_base.py | 2 + yt_dlp/extractor/youtube/_video.py | 172 +++++++++++++++++------ yt_dlp/extractor/youtube/pot/provider.py | 1 + yt_dlp/extractor/youtube/pot/utils.py | 2 +- 6 files changed, 138 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 9f542844e..aaa3beb71 100644 --- a/README.md +++ b/README.md @@ -1805,7 +1805,7 @@ #### youtube * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning * `data_sync_id`: Overrides the account Data Sync ID used in Innertube API requests. This may be needed if you are using an account with `youtube:player_skip=webpage,configs` or `youtubetab:skip=webpage` * `visitor_data`: Overrides the Visitor Data used in Innertube API requests. This should be used with `player_skip=webpage,configs` and without cookies. Note: this may have adverse effects if used improperly. If a session from a browser is wanted, you should pass cookies instead (which contain the Visitor ID) -* `po_token`: Proof of Origin (PO) Token(s) to use. Comma seperated list of PO Tokens in the format `CLIENT.CONTEXT+PO_TOKEN`, e.g. `youtube:po_token=web.gvs+XXX,web.player=XXX,web_safari.gvs+YYY`. Context can be either `gvs` (Google Video Server URLs) or `player` (Innertube player request) +* `po_token`: Proof of Origin (PO) Token(s) to use. Comma seperated list of PO Tokens in the format `CLIENT.CONTEXT+PO_TOKEN`, e.g. `youtube:po_token=web.gvs+XXX,web.player=XXX,web_safari.gvs+YYY`. Context can be any of `gvs` (Google Video Server URLs), `player` (Innertube player request) or `subs` (Subtitles) * `pot_trace`: Enable debug logging for PO Token fetching. Either `true` or `false` (default) * `fetch_pot`: Policy to use for fetching a PO Token from providers. One of `always` (always try fetch a PO Token regardless if the client requires one for the given context), `never` (never fetch a PO Token), or `auto` (default; only fetch a PO Token if the client requires one for the given context) diff --git a/test/test_pot/test_pot_builtin_utils.py b/test/test_pot/test_pot_builtin_utils.py index 1682e42a1..a95fc4e15 100644 --- a/test/test_pot/test_pot_builtin_utils.py +++ b/test/test_pot/test_pot_builtin_utils.py @@ -15,6 +15,7 @@ class TestGetWebPoContentBinding: for context, is_authenticated, expected in [ (PoTokenContext.GVS, False, ('example-visitor-data', ContentBindingType.VISITOR_DATA)), (PoTokenContext.PLAYER, False, ('example-video-id', ContentBindingType.VIDEO_ID)), + (PoTokenContext.SUBS, False, ('example-video-id', ContentBindingType.VIDEO_ID)), (PoTokenContext.GVS, True, ('example-data-sync-id', ContentBindingType.DATASYNC_ID)), ]], ('WEB_REMIX', PoTokenContext.GVS, False, ('example-visitor-data', ContentBindingType.VISITOR_DATA)), diff --git a/yt_dlp/extractor/youtube/_base.py b/yt_dlp/extractor/youtube/_base.py index 4194e1c21..9c5bb75fe 100644 --- a/yt_dlp/extractor/youtube/_base.py +++ b/yt_dlp/extractor/youtube/_base.py @@ -35,6 +35,7 @@ class _PoTokenContext(enum.Enum): PLAYER = 'player' GVS = 'gvs' + SUBS = 'subs' # any clients starting with _ cannot be explicitly requested by the user @@ -787,6 +788,7 @@ def _download_webpage_with_retries(self, *args, retry_fatal=False, retry_on_stat def _download_ytcfg(self, client, video_id): url = { + 'mweb': 'https://m.youtube.com', 'web': 'https://www.youtube.com', 'web_music': 'https://music.youtube.com', 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1', diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index b4c6ba453..9f929664f 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -72,6 +72,9 @@ STREAMING_DATA_CLIENT_NAME = '__yt_dlp_client' STREAMING_DATA_INITIAL_PO_TOKEN = '__yt_dlp_po_token' +STREAMING_DATA_FETCH_SUBS_PO_TOKEN = '__yt_dlp_fetch_subs_po_token' +STREAMING_DATA_INNERTUBE_CONTEXT = '__yt_dlp_innertube_context' + PO_TOKEN_GUIDE_URL = 'https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide' @@ -2863,7 +2866,8 @@ def _get_config_po_token(self, client: str, context: _PoTokenContext): continue def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, visitor_data=None, - data_sync_id=None, session_index=None, player_url=None, video_id=None, webpage=None, **kwargs): + data_sync_id=None, session_index=None, player_url=None, video_id=None, webpage=None, + required=False, **kwargs): """ Fetch a PO Token for a given client and context. This function will validate required parameters for a given context and client. @@ -2878,6 +2882,7 @@ def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, @param player_url: player URL. @param video_id: video ID. @param webpage: video webpage. + @param required: Whether the PO Token is required (i.e. try to fetch unless policy is "never"). @param kwargs: Additional arguments to pass down. May be more added in the future. @return: The fetched PO Token. None if it could not be fetched. """ @@ -2926,6 +2931,7 @@ def fetch_po_token(self, client='web', context=_PoTokenContext.GVS, ytcfg=None, player_url=player_url, video_id=video_id, video_webpage=webpage, + required=required, **kwargs, ) @@ -2945,6 +2951,7 @@ def _fetch_po_token(self, client, **kwargs): or ( fetch_pot_policy == 'auto' and _PoTokenContext(context) not in self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS'] + and not kwargs.get('required', False) ) ): return None @@ -3133,6 +3140,8 @@ def append_client(*client_names): player_url = self._download_player_url(video_id) tried_iframe_fallback = True + pr = initial_pr if client == 'web' else None + visitor_data = visitor_data or self._extract_visitor_data(master_ytcfg, initial_pr, player_ytcfg) data_sync_id = data_sync_id or self._extract_data_sync_id(master_ytcfg, initial_pr, player_ytcfg) @@ -3147,12 +3156,19 @@ def append_client(*client_names): 'ytcfg': player_ytcfg or self._get_default_ytcfg(client), } - player_po_token = self.fetch_po_token( + # Don't need a player PO token for WEB if using player response from webpage + player_po_token = None if pr else self.fetch_po_token( context=_PoTokenContext.PLAYER, **fetch_po_token_args) gvs_po_token = self.fetch_po_token( context=_PoTokenContext.GVS, **fetch_po_token_args) + fetch_subs_po_token_func = functools.partial( + self.fetch_po_token, + context=_PoTokenContext.SUBS, + **fetch_po_token_args, + ) + required_pot_contexts = self._get_default_ytcfg(client)['PO_TOKEN_REQUIRED_CONTEXTS'] if ( @@ -3179,7 +3195,6 @@ def append_client(*client_names): only_once=True) deprioritize_pr = True - pr = initial_pr if client == 'web' else None try: pr = pr or self._extract_player_response( client, video_id, @@ -3197,10 +3212,13 @@ def append_client(*client_names): if pr_id := self._invalid_player_response(pr, video_id): skipped_clients[client] = pr_id elif pr: - # Save client name for introspection later - sd = traverse_obj(pr, ('streamingData', {dict})) or {} + # Save client details for introspection later + innertube_context = traverse_obj(player_ytcfg or self._get_default_ytcfg(client), 'INNERTUBE_CONTEXT') + sd = pr.setdefault('streamingData', {}) sd[STREAMING_DATA_CLIENT_NAME] = client sd[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token + sd[STREAMING_DATA_INNERTUBE_CONTEXT] = innertube_context + sd[STREAMING_DATA_FETCH_SUBS_PO_TOKEN] = fetch_subs_po_token_func for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): f[STREAMING_DATA_CLIENT_NAME] = client f[STREAMING_DATA_INITIAL_PO_TOKEN] = gvs_po_token @@ -3262,6 +3280,25 @@ def _report_pot_format_skipped(self, video_id, client_name, proto): else: self.report_warning(msg, only_once=True) + def _report_pot_subtitles_skipped(self, video_id, client_name, msg=None): + msg = msg or ( + f'{video_id}: Some {client_name} client subtitles require a PO Token which was not provided. ' + 'They will be discarded since they are not downloadable as-is. ' + f'You can manually pass a Subtitles PO Token for this client with ' + f'--extractor-args "youtube:po_token={client_name}.subs+XXX" . ' + f'For more information, refer to {PO_TOKEN_GUIDE_URL}') + + subs_wanted = any(( + self.get_param('writesubtitles'), + self.get_param('writeautomaticsub'), + self.get_param('listsubtitles'))) + + # Only raise a warning for non-default clients, to not confuse users. + if not subs_wanted or client_name in (*self._DEFAULT_CLIENTS, *self._DEFAULT_AUTHED_CLIENTS): + self.write_debug(msg, only_once=True) + else: + self.report_warning(msg, only_once=True) + def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): CHUNK_SIZE = 10 << 20 PREFERRED_LANG_VALUE = 10 @@ -3553,6 +3590,9 @@ def process_manifest_format(f, proto, client_name, itag, po_token): hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}' fmts, subs = self._extract_m3u8_formats_and_subtitles( hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') + for sub in traverse_obj(subs, (..., ..., {dict})): + # HLS subs (m3u8) do not need a PO token; save client name for debugging + sub[STREAMING_DATA_CLIENT_NAME] = client_name subtitles = self._merge_subtitles(subs, subtitles) for f in fmts: if process_manifest_format(f, 'hls', client_name, self._search_regex( @@ -3564,6 +3604,9 @@ def process_manifest_format(f, proto, client_name, itag, po_token): if po_token: dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}' formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) + for sub in traverse_obj(subs, (..., ..., {dict})): + # TODO: Investigate if DASH subs ever need a PO token; save client name for debugging + sub[STREAMING_DATA_CLIENT_NAME] = client_name subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH for f in formats: if process_manifest_format(f, 'dash', client_name, f['format_id'], po_token): @@ -3890,47 +3933,81 @@ def is_bad_format(fmt): 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec', 'channels', 'acodec', 'lang', 'proto'), } + def get_lang_code(track): + return (remove_start(track.get('vssId') or '', '.').replace('.', '-') + or track.get('languageCode')) + + def process_language(container, base_url, lang_code, sub_name, client_name, query): + lang_subs = container.setdefault(lang_code, []) + for fmt in self._SUBTITLE_FORMATS: + query = {**query, 'fmt': fmt} + lang_subs.append({ + 'ext': fmt, + 'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)), + 'name': sub_name, + STREAMING_DATA_CLIENT_NAME: client_name, + }) + subtitles = {} - pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict) - if pctr: - def get_lang_code(track): - return (remove_start(track.get('vssId') or '', '.').replace('.', '-') - or track.get('languageCode')) + skipped_subs_clients = set() + prs = traverse_obj(player_responses, ( + # Filter out initial_pr which does not have streamingData (smuggled client context) + lambda _, v: v['streamingData'] and v['captions']['playerCaptionsTracklistRenderer'])) - # Converted into dicts to remove duplicates - captions = { - get_lang_code(sub): sub - for sub in traverse_obj(pctr, (..., 'captionTracks', ...))} - translation_languages = { - lang.get('languageCode'): self._get_text(lang.get('languageName'), max_runs=1) - for lang in traverse_obj(pctr, (..., 'translationLanguages', ...))} + pctrs = traverse_obj(prs, (..., 'captions', 'playerCaptionsTracklistRenderer', {dict})) + translation_languages = { + lang.get('languageCode'): self._get_text(lang.get('languageName'), max_runs=1) + for lang in traverse_obj(pctrs, (..., 'translationLanguages', ..., {dict}))} + # NB: Constructing the full subtitle dictionary is slow + get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and ( + self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')) - def process_language(container, base_url, lang_code, sub_name, query): - lang_subs = container.setdefault(lang_code, []) - for fmt in self._SUBTITLE_FORMATS: - query.update({ - 'fmt': fmt, - }) - lang_subs.append({ - 'ext': fmt, - 'url': urljoin('https://www.youtube.com', update_url_query(base_url, query)), - 'name': sub_name, - }) + all_captions = traverse_obj(pctrs, (..., 'captionTracks', ..., {dict})) + need_subs_langs = {get_lang_code(sub) for sub in all_captions if sub.get('kind') != 'asr'} + need_caps_langs = { + remove_start(get_lang_code(sub), 'a-') + for sub in all_captions if sub.get('kind') == 'asr'} - # NB: Constructing the full subtitle dictionary is slow - get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and ( - self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')) - for lang_code, caption_track in captions.items(): - base_url = caption_track.get('baseUrl') - orig_lang = parse_qs(base_url).get('lang', [None])[-1] - if not base_url: - continue + for pr in prs: + pctr = pr['captions']['playerCaptionsTracklistRenderer'] + client_name = pr['streamingData'][STREAMING_DATA_CLIENT_NAME] + innertube_client_name = pr['streamingData'][STREAMING_DATA_INNERTUBE_CONTEXT]['client']['clientName'] + required_contexts = self._get_default_ytcfg(client_name)['PO_TOKEN_REQUIRED_CONTEXTS'] + fetch_subs_po_token_func = pr['streamingData'][STREAMING_DATA_FETCH_SUBS_PO_TOKEN] + + pot_params = {} + already_fetched_pot = False + + for caption_track in traverse_obj(pctr, ('captionTracks', lambda _, v: v['baseUrl'])): + base_url = caption_track['baseUrl'] + qs = parse_qs(base_url) + lang_code = get_lang_code(caption_track) + requires_pot = ( + # We can detect the experiment for now + any(e in traverse_obj(qs, ('exp', ...)) for e in ('xpe', 'xpv')) + or _PoTokenContext.SUBS in required_contexts) + + if not already_fetched_pot: + already_fetched_pot = True + if subs_po_token := fetch_subs_po_token_func(required=requires_pot): + pot_params.update({ + 'pot': subs_po_token, + 'potc': '1', + 'c': innertube_client_name, + }) + + if not pot_params and requires_pot: + skipped_subs_clients.add(client_name) + self._report_pot_subtitles_skipped(video_id, client_name) + break + + orig_lang = qs.get('lang', [None])[-1] lang_name = self._get_text(caption_track, 'name', max_runs=1) if caption_track.get('kind') != 'asr': if not lang_code: continue process_language( - subtitles, base_url, lang_code, lang_name, {}) + subtitles, base_url, lang_code, lang_name, client_name, pot_params) if not caption_track.get('isTranslatable'): continue for trans_code, trans_name in translation_languages.items(): @@ -3950,10 +4027,25 @@ def process_language(container, base_url, lang_code, sub_name, query): # Add an "-orig" label to the original language so that it can be distinguished. # The subs are returned without "-orig" as well for compatibility process_language( - automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {}) + automatic_captions, base_url, f'{trans_code}-orig', + f'{trans_name} (Original)', client_name, pot_params) # Setting tlang=lang returns damaged subtitles. - process_language(automatic_captions, base_url, trans_code, trans_name, - {} if orig_lang == orig_trans_code else {'tlang': trans_code}) + process_language( + automatic_captions, base_url, trans_code, trans_name, client_name, + pot_params if orig_lang == orig_trans_code else {'tlang': trans_code, **pot_params}) + + # Avoid duplication if we've already got everything we need + need_subs_langs.difference_update(subtitles) + need_caps_langs.difference_update(automatic_captions) + if not (need_subs_langs or need_caps_langs): + break + + if skipped_subs_clients and (need_subs_langs or need_caps_langs): + self._report_pot_subtitles_skipped(video_id, True, msg=join_nonempty( + f'{video_id}: There are missing subtitles languages because a PO token was not provided.', + need_subs_langs and f'Subtitles for these languages are missing: {", ".join(need_subs_langs)}.', + need_caps_langs and f'Automatic captions for {len(need_caps_langs)} languages are missing.', + delim=' ')) info['automatic_captions'] = automatic_captions info['subtitles'] = subtitles diff --git a/yt_dlp/extractor/youtube/pot/provider.py b/yt_dlp/extractor/youtube/pot/provider.py index 53af92d30..13b3b1f9b 100644 --- a/yt_dlp/extractor/youtube/pot/provider.py +++ b/yt_dlp/extractor/youtube/pot/provider.py @@ -39,6 +39,7 @@ class PoTokenContext(enum.Enum): GVS = 'gvs' PLAYER = 'player' + SUBS = 'subs' @dataclasses.dataclass diff --git a/yt_dlp/extractor/youtube/pot/utils.py b/yt_dlp/extractor/youtube/pot/utils.py index 1c0db243b..7a5b7d4ab 100644 --- a/yt_dlp/extractor/youtube/pot/utils.py +++ b/yt_dlp/extractor/youtube/pot/utils.py @@ -51,7 +51,7 @@ def get_webpo_content_binding( return visitor_id, ContentBindingType.VISITOR_ID return request.visitor_data, ContentBindingType.VISITOR_DATA - elif request.context == PoTokenContext.PLAYER or client_name != 'WEB_REMIX': + elif request.context in (PoTokenContext.PLAYER, PoTokenContext.SUBS): return request.video_id, ContentBindingType.VIDEO_ID return None, None From e491fd4d090db3af52a82863fb0553dd5e17fb85 Mon Sep 17 00:00:00 2001 From: Matt Broadway Date: Thu, 22 May 2025 10:22:11 +0100 Subject: [PATCH 04/68] [cookies] Fix Linux desktop environment detection (#13197) Closes #12885 Authored by: mbway --- test/test_cookies.py | 8 +++++++ yt_dlp/cookies.py | 53 ++++++++++++++++++++++---------------------- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/test/test_cookies.py b/test/test_cookies.py index 4b9b9b5a9..f956ab187 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -58,6 +58,14 @@ def test_get_desktop_environment(self): ({'DESKTOP_SESSION': 'kde'}, _LinuxDesktopEnvironment.KDE3), ({'DESKTOP_SESSION': 'xfce'}, _LinuxDesktopEnvironment.XFCE), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'gnome'}, _LinuxDesktopEnvironment.GNOME), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'mate'}, _LinuxDesktopEnvironment.GNOME), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE4), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'kde'}, _LinuxDesktopEnvironment.KDE3), + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'xfce'}, _LinuxDesktopEnvironment.XFCE), + + ({'XDG_CURRENT_DESKTOP': 'my_custom_de', 'DESKTOP_SESSION': 'my_custom_de', 'GNOME_DESKTOP_SESSION_ID': 1}, _LinuxDesktopEnvironment.GNOME), + ({'GNOME_DESKTOP_SESSION_ID': 1}, _LinuxDesktopEnvironment.GNOME), ({'KDE_FULL_SESSION': 1}, _LinuxDesktopEnvironment.KDE3), ({'KDE_FULL_SESSION': 1, 'DESKTOP_SESSION': 'kde4'}, _LinuxDesktopEnvironment.KDE4), diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index fad323c90..5675445ac 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -764,11 +764,11 @@ def _get_linux_desktop_environment(env, logger): GetDesktopEnvironment """ xdg_current_desktop = env.get('XDG_CURRENT_DESKTOP', None) - desktop_session = env.get('DESKTOP_SESSION', None) + desktop_session = env.get('DESKTOP_SESSION', '') if xdg_current_desktop is not None: for part in map(str.strip, xdg_current_desktop.split(':')): if part == 'Unity': - if desktop_session is not None and 'gnome-fallback' in desktop_session: + if 'gnome-fallback' in desktop_session: return _LinuxDesktopEnvironment.GNOME else: return _LinuxDesktopEnvironment.UNITY @@ -797,35 +797,34 @@ def _get_linux_desktop_environment(env, logger): return _LinuxDesktopEnvironment.UKUI elif part == 'LXQt': return _LinuxDesktopEnvironment.LXQT - logger.info(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"') + logger.debug(f'XDG_CURRENT_DESKTOP is set to an unknown value: "{xdg_current_desktop}"') - elif desktop_session is not None: - if desktop_session == 'deepin': - return _LinuxDesktopEnvironment.DEEPIN - elif desktop_session in ('mate', 'gnome'): - return _LinuxDesktopEnvironment.GNOME - elif desktop_session in ('kde4', 'kde-plasma'): + if desktop_session == 'deepin': + return _LinuxDesktopEnvironment.DEEPIN + elif desktop_session in ('mate', 'gnome'): + return _LinuxDesktopEnvironment.GNOME + elif desktop_session in ('kde4', 'kde-plasma'): + return _LinuxDesktopEnvironment.KDE4 + elif desktop_session == 'kde': + if 'KDE_SESSION_VERSION' in env: return _LinuxDesktopEnvironment.KDE4 - elif desktop_session == 'kde': - if 'KDE_SESSION_VERSION' in env: - return _LinuxDesktopEnvironment.KDE4 - else: - return _LinuxDesktopEnvironment.KDE3 - elif 'xfce' in desktop_session or desktop_session == 'xubuntu': - return _LinuxDesktopEnvironment.XFCE - elif desktop_session == 'ukui': - return _LinuxDesktopEnvironment.UKUI else: - logger.info(f'DESKTOP_SESSION is set to an unknown value: "{desktop_session}"') - + return _LinuxDesktopEnvironment.KDE3 + elif 'xfce' in desktop_session or desktop_session == 'xubuntu': + return _LinuxDesktopEnvironment.XFCE + elif desktop_session == 'ukui': + return _LinuxDesktopEnvironment.UKUI else: - if 'GNOME_DESKTOP_SESSION_ID' in env: - return _LinuxDesktopEnvironment.GNOME - elif 'KDE_FULL_SESSION' in env: - if 'KDE_SESSION_VERSION' in env: - return _LinuxDesktopEnvironment.KDE4 - else: - return _LinuxDesktopEnvironment.KDE3 + logger.debug(f'DESKTOP_SESSION is set to an unknown value: "{desktop_session}"') + + if 'GNOME_DESKTOP_SESSION_ID' in env: + return _LinuxDesktopEnvironment.GNOME + elif 'KDE_FULL_SESSION' in env: + if 'KDE_SESSION_VERSION' in env: + return _LinuxDesktopEnvironment.KDE4 + else: + return _LinuxDesktopEnvironment.KDE3 + return _LinuxDesktopEnvironment.OTHER From 7977b329ed97b216e37bd402f4935f28c00eac9e Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 22 May 2025 04:33:11 -0500 Subject: [PATCH 05/68] [cleanup] Misc (#13166) Authored by: bashonly --- README.md | 1 + yt_dlp/extractor/playsuisse.py | 3 +-- yt_dlp/extractor/soundcloud.py | 2 +- yt_dlp/extractor/twitter.py | 2 +- yt_dlp/options.py | 11 +++++++---- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index aaa3beb71..518c63eef 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ * [Post-processing Options](#post-processing-options) * [SponsorBlock Options](#sponsorblock-options) * [Extractor Options](#extractor-options) + * [Preset Aliases](#preset-aliases) * [CONFIGURATION](#configuration) * [Configuration file encoding](#configuration-file-encoding) * [Authentication with netrc](#authentication-with-netrc) diff --git a/yt_dlp/extractor/playsuisse.py b/yt_dlp/extractor/playsuisse.py index 9bf5765fa..46e3a5b8f 100644 --- a/yt_dlp/extractor/playsuisse.py +++ b/yt_dlp/extractor/playsuisse.py @@ -9,11 +9,10 @@ int_or_none, join_nonempty, parse_qs, - traverse_obj, update_url_query, urlencode_postdata, ) -from ..utils.traversal import unpack +from ..utils.traversal import traverse_obj, unpack class PlaySuisseIE(InfoExtractor): diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index c70940a60..3496a08ef 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -697,7 +697,7 @@ def _real_extract(self, url): try: return self._extract_info_dict(info, full_title, token) except ExtractorError as e: - if not isinstance(e.cause, HTTPError) or not e.cause.status == 429: + if not isinstance(e.cause, HTTPError) or e.cause.status != 429: raise self.report_warning( 'You have reached the API rate limit, which is ~600 requests per ' diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index 5eee3e726..ad3e74588 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1342,7 +1342,7 @@ def _extract_status(self, twid): 'tweet_mode': 'extended', }) except ExtractorError as e: - if not isinstance(e.cause, HTTPError) or not e.cause.status == 429: + if not isinstance(e.cause, HTTPError) or e.cause.status != 429: raise self.report_warning('Rate-limit exceeded; falling back to syndication endpoint') status = self._call_syndication_api(twid) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 19f16e725..b4d3d4d66 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -230,6 +230,9 @@ def format_option_help(self, formatter=None): formatter.indent() heading = formatter.format_heading('Preset Aliases') formatter.indent() + description = formatter.format_description( + 'Predefined aliases for convenience and ease of use. Note that future versions of yt-dlp ' + 'may add or adjust presets, but the existing preset names will not be changed or removed') result = [] for name, args in _PRESET_ALIASES.items(): option = optparse.Option('-t', help=shlex.join(args)) @@ -238,7 +241,7 @@ def format_option_help(self, formatter=None): formatter.dedent() formatter.dedent() help_lines = '\n'.join(result) - return f'{formatted_help}\n{heading}{help_lines}' + return f'{formatted_help}\n{heading}{description}\n{help_lines}' def create_parser(): @@ -470,7 +473,7 @@ def _preset_alias_callback(option, opt_str, value, parser): general.add_option( '--live-from-start', action='store_true', dest='live_from_start', - help='Download livestreams from the start. Currently only supported for YouTube (experimental) and Twitch') + help='Download livestreams from the start. Currently experimental and only supported for YouTube and Twitch') general.add_option( '--no-live-from-start', action='store_false', dest='live_from_start', @@ -545,9 +548,9 @@ def _preset_alias_callback(option, opt_str, value, parser): help=( 'Create aliases for an option string. Unless an alias starts with a dash "-", it is prefixed with "--". ' 'Arguments are parsed according to the Python string formatting mini-language. ' - 'E.g. --alias get-audio,-X "-S=aext:{0},abr -x --audio-format {0}" creates options ' + 'E.g. --alias get-audio,-X "-S aext:{0},abr -x --audio-format {0}" creates options ' '"--get-audio" and "-X" that takes an argument (ARG0) and expands to ' - '"-S=aext:ARG0,abr -x --audio-format ARG0". All defined aliases are listed in the --help output. ' + '"-S aext:ARG0,abr -x --audio-format ARG0". All defined aliases are listed in the --help output. ' 'Alias options can trigger more aliases; so be careful to avoid defining recursive options. ' f'As a safety measure, each alias may be triggered a maximum of {_YoutubeDLOptionParser.ALIAS_TRIGGER_LIMIT} times. ' 'This option can be used multiple times')) From 415b4c9f955b1a0391204bd24a7132590e7b3bdb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 22 May 2025 09:49:11 +0000 Subject: [PATCH 06/68] Release 2025.05.22 Created by: bashonly :ci skip all --- CONTRIBUTORS | 5 +++++ Changelog.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ README.md | 20 ++++++++++++-------- supportedsites.md | 10 ++++++---- yt_dlp/version.py | 6 +++--- 5 files changed, 72 insertions(+), 15 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 5710f9a9e..6aa52c595 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -770,3 +770,8 @@ NeonMan pj47x troex WouterGordts +baierjan +GeoffreyFrogeye +Pawka +v3DJG6GL +yozel diff --git a/Changelog.md b/Changelog.md index 513724bf4..80b72da05 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,52 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2025.05.22 + +#### Core changes +- **cookies**: [Fix Linux desktop environment detection](https://github.com/yt-dlp/yt-dlp/commit/e491fd4d090db3af52a82863fb0553dd5e17fb85) ([#13197](https://github.com/yt-dlp/yt-dlp/issues/13197)) by [mbway](https://github.com/mbway) +- **jsinterp**: [Fix increment/decrement evaluation](https://github.com/yt-dlp/yt-dlp/commit/167d7a9f0ffd1b4fe600193441bdb7358db2740b) ([#13238](https://github.com/yt-dlp/yt-dlp/issues/13238)) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + +#### Extractor changes +- **1tv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/41c0a1fb89628696f8bb88e2b9f3a68f355b8c26) ([#13168](https://github.com/yt-dlp/yt-dlp/issues/13168)) by [bashonly](https://github.com/bashonly) +- **amcnetworks**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/464c84fedf78eef822a431361155f108b5df96d7) ([#13147](https://github.com/yt-dlp/yt-dlp/issues/13147)) by [bashonly](https://github.com/bashonly) +- **bitchute**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/1d0f6539c47e5d5c68c3c47cdb7075339e2885ac) ([#13081](https://github.com/yt-dlp/yt-dlp/issues/13081)) by [bashonly](https://github.com/bashonly) +- **cartoonnetwork**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/7dbb47f84f0ee1266a3a01f58c9bc4c76d76794a) ([#13148](https://github.com/yt-dlp/yt-dlp/issues/13148)) by [bashonly](https://github.com/bashonly) +- **iprima**: [Fix login support](https://github.com/yt-dlp/yt-dlp/commit/a7d9a5eb79ceeecb851389f3f2c88597871ca3f2) ([#12937](https://github.com/yt-dlp/yt-dlp/issues/12937)) by [baierjan](https://github.com/baierjan) +- **jiosaavn** + - artist: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/586b557b124f954d3f625360ebe970989022ad97) ([#12803](https://github.com/yt-dlp/yt-dlp/issues/12803)) by [subrat-lima](https://github.com/subrat-lima) + - playlist, show: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/317f4b8006c2c0f0f64f095b1485163ad97c9053) ([#12803](https://github.com/yt-dlp/yt-dlp/issues/12803)) by [subrat-lima](https://github.com/subrat-lima) + - show: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6839276496d8814cf16f58b637e45663467928e6) ([#12803](https://github.com/yt-dlp/yt-dlp/issues/12803)) by [subrat-lima](https://github.com/subrat-lima) +- **lrtradio**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/abf58dcd6a09e14eec4ea82ae12f79a0337cb383) ([#13200](https://github.com/yt-dlp/yt-dlp/issues/13200)) by [Pawka](https://github.com/Pawka) +- **nebula**: [Support `--mark-watched`](https://github.com/yt-dlp/yt-dlp/commit/20f288bdc2173c7cc58d709d25ca193c1f6001e7) ([#13120](https://github.com/yt-dlp/yt-dlp/issues/13120)) by [GeoffreyFrogeye](https://github.com/GeoffreyFrogeye) +- **niconico** + - [Fix error handling](https://github.com/yt-dlp/yt-dlp/commit/f569be4602c2a857087e495d5d7ed6060cd97abe) ([#13236](https://github.com/yt-dlp/yt-dlp/issues/13236)) by [bashonly](https://github.com/bashonly) + - live: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7a7b85c9014d96421e18aa7ea5f4c1bee5ceece0) ([#13045](https://github.com/yt-dlp/yt-dlp/issues/13045)) by [doe1080](https://github.com/doe1080) +- **nytimesarticle**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/b26bc32579c00ef579d75a835807ccc87d20ee0a) ([#13104](https://github.com/yt-dlp/yt-dlp/issues/13104)) by [bashonly](https://github.com/bashonly) +- **once**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/f475e8b529d18efdad603ffda02a56e707fe0e2c) ([#13164](https://github.com/yt-dlp/yt-dlp/issues/13164)) by [bashonly](https://github.com/bashonly) +- **picarto**: vod: [Support `/profile/` video URLs](https://github.com/yt-dlp/yt-dlp/commit/31e090cb787f3504ec25485adff9a2a51d056734) ([#13227](https://github.com/yt-dlp/yt-dlp/issues/13227)) by [subrat-lima](https://github.com/subrat-lima) +- **playsuisse**: [Improve metadata extraction](https://github.com/yt-dlp/yt-dlp/commit/d880e060803ae8ed5a047e578cca01e1f0e630ce) ([#12466](https://github.com/yt-dlp/yt-dlp/issues/12466)) by [v3DJG6GL](https://github.com/v3DJG6GL) +- **sprout**: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/cbcfe6378dde33a650e3852ab17ad4503b8e008d) ([#13149](https://github.com/yt-dlp/yt-dlp/issues/13149)) by [bashonly](https://github.com/bashonly) +- **svtpage**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/ea8498ed534642dd7e925961b97b934987142fd3) ([#12957](https://github.com/yt-dlp/yt-dlp/issues/12957)) by [diman8](https://github.com/diman8) +- **twitch**: [Support `--live-from-start`](https://github.com/yt-dlp/yt-dlp/commit/00b1bec55249cf2ad6271d36492c51b34b6459d1) ([#13202](https://github.com/yt-dlp/yt-dlp/issues/13202)) by [bashonly](https://github.com/bashonly) +- **vimeo**: event: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/545c1a5b6f2fe88722b41aef0e7485bf3be3f3f9) ([#13216](https://github.com/yt-dlp/yt-dlp/issues/13216)) by [bashonly](https://github.com/bashonly) +- **wat.tv**: [Improve error handling](https://github.com/yt-dlp/yt-dlp/commit/f123cc83b3aea45053f5fa1d9141048b01fc2774) ([#13111](https://github.com/yt-dlp/yt-dlp/issues/13111)) by [bashonly](https://github.com/bashonly) +- **weverse**: [Fix live extraction](https://github.com/yt-dlp/yt-dlp/commit/5328eda8820cc5f21dcf917684d23fbdca41831d) ([#13084](https://github.com/yt-dlp/yt-dlp/issues/13084)) by [bashonly](https://github.com/bashonly) +- **xinpianchang**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/83fabf352489d52843f67e6e9cc752db86d27e6e) ([#13245](https://github.com/yt-dlp/yt-dlp/issues/13245)) by [garret1317](https://github.com/garret1317) +- **youtube** + - [Add PO token support for subtitles](https://github.com/yt-dlp/yt-dlp/commit/32ed5f107c6c641958d1cd2752e130de4db55a13) ([#13234](https://github.com/yt-dlp/yt-dlp/issues/13234)) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz) + - [Add `web_embedded` client for age-restricted videos](https://github.com/yt-dlp/yt-dlp/commit/0feec6dc131f488428bf881519e7c69766fbb9ae) ([#13089](https://github.com/yt-dlp/yt-dlp/issues/13089)) by [bashonly](https://github.com/bashonly) + - [Add a PO Token Provider Framework](https://github.com/yt-dlp/yt-dlp/commit/2685654a37141cca63eda3a92da0e2706e23ccfd) ([#12840](https://github.com/yt-dlp/yt-dlp/issues/12840)) by [coletdjnz](https://github.com/coletdjnz) + - [Extract `media_type` for all videos](https://github.com/yt-dlp/yt-dlp/commit/ded11ebc9afba6ba33923375103e9be2d7c804e7) ([#13136](https://github.com/yt-dlp/yt-dlp/issues/13136)) by [bashonly](https://github.com/bashonly) + - [Fix `--live-from-start` support for premieres](https://github.com/yt-dlp/yt-dlp/commit/8f303afb43395be360cafd7ad4ce2b6e2eedfb8a) ([#13079](https://github.com/yt-dlp/yt-dlp/issues/13079)) by [arabcoders](https://github.com/arabcoders) + - [Fix geo-restriction error handling](https://github.com/yt-dlp/yt-dlp/commit/c7e575e31608c19c5b26c10a4229db89db5fc9a8) ([#13217](https://github.com/yt-dlp/yt-dlp/issues/13217)) by [yozel](https://github.com/yozel) + +#### Misc. changes +- **build** + - [Bump PyInstaller to v6.13.0](https://github.com/yt-dlp/yt-dlp/commit/17cf9088d0d535e4a7feffbf02bd49cd9dae5ab9) ([#13082](https://github.com/yt-dlp/yt-dlp/issues/13082)) by [bashonly](https://github.com/bashonly) + - [Bump run-on-arch-action to v3](https://github.com/yt-dlp/yt-dlp/commit/9064d2482d1fe722bbb4a49731fe0711c410d1c8) ([#13088](https://github.com/yt-dlp/yt-dlp/issues/13088)) by [bashonly](https://github.com/bashonly) +- **cleanup**: Miscellaneous: [7977b32](https://github.com/yt-dlp/yt-dlp/commit/7977b329ed97b216e37bd402f4935f28c00eac9e) by [bashonly](https://github.com/bashonly) + ### 2025.04.30 #### Important changes diff --git a/README.md b/README.md index 518c63eef..6e2dc6243 100644 --- a/README.md +++ b/README.md @@ -349,8 +349,8 @@ ## General Options: --no-flat-playlist Fully extract the videos of a playlist (default) --live-from-start Download livestreams from the start. - Currently only supported for YouTube - (Experimental) + Currently experimental and only supported + for YouTube and Twitch --no-live-from-start Download livestreams from the current time (default) --wait-for-video MIN[-MAX] Wait for scheduled streams to become @@ -376,12 +376,12 @@ ## General Options: an alias starts with a dash "-", it is prefixed with "--". Arguments are parsed according to the Python string formatting - mini-language. E.g. --alias get-audio,-X - "-S=aext:{0},abr -x --audio-format {0}" - creates options "--get-audio" and "-X" that - takes an argument (ARG0) and expands to - "-S=aext:ARG0,abr -x --audio-format ARG0". - All defined aliases are listed in the --help + mini-language. E.g. --alias get-audio,-X "-S + aext:{0},abr -x --audio-format {0}" creates + options "--get-audio" and "-X" that takes an + argument (ARG0) and expands to "-S + aext:ARG0,abr -x --audio-format ARG0". All + defined aliases are listed in the --help output. Alias options can trigger more aliases; so be careful to avoid defining recursive options. As a safety measure, each @@ -1106,6 +1106,10 @@ ## Extractor Options: arguments for different extractors ## Preset Aliases: +Predefined aliases for convenience and ease of use. Note that future + versions of yt-dlp may add or adjust presets, but the existing preset + names will not be changed or removed + -t mp3 -f 'ba[acodec^=mp3]/ba/b' -x --audio-format mp3 diff --git a/supportedsites.md b/supportedsites.md index 03bd8a7c3..c2d7b4555 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -246,7 +246,6 @@ # Supported sites - **Canalplus**: mycanal.fr and piwiplus.fr - **Canalsurmas** - **CaracolTvPlay**: [*caracoltv-play*](## "netrc machine") - - **CartoonNetwork** - **cbc.ca** - **cbc.ca:player** - **cbc.ca:​player:playlist** @@ -649,7 +648,10 @@ # Supported sites - **jiocinema**: [*jiocinema*](## "netrc machine") - **jiocinema:series**: [*jiocinema*](## "netrc machine") - **jiosaavn:album** + - **jiosaavn:artist** - **jiosaavn:playlist** + - **jiosaavn:show** + - **jiosaavn:​show:playlist** - **jiosaavn:song** - **Joj** - **JoqrAg**: 超!A&G+ 文化放送 (f.k.a. AGQR) Nippon Cultural Broadcasting, Inc. (JOQR) @@ -1081,8 +1083,8 @@ # Supported sites - **Photobucket** - **PiaLive** - **Piapro**: [*piapro*](## "netrc machine") - - **Picarto** - - **PicartoVod** + - **picarto** + - **picarto:vod** - **Piksel** - **Pinkbike** - **Pinterest** @@ -1390,7 +1392,6 @@ # Supported sites - **Spreaker** - **SpreakerShow** - **SpringboardPlatform** - - **Sprout** - **SproutVideo** - **sr:mediathek**: Saarländischer Rundfunk (**Currently broken**) - **SRGSSR** @@ -1656,6 +1657,7 @@ # Supported sites - **vimeo**: [*vimeo*](## "netrc machine") - **vimeo:album**: [*vimeo*](## "netrc machine") - **vimeo:channel**: [*vimeo*](## "netrc machine") + - **vimeo:event**: [*vimeo*](## "netrc machine") - **vimeo:group**: [*vimeo*](## "netrc machine") - **vimeo:likes**: [*vimeo*](## "netrc machine") Vimeo user likes - **vimeo:ondemand**: [*vimeo*](## "netrc machine") diff --git a/yt_dlp/version.py b/yt_dlp/version.py index e8b2bf170..c375cc6ad 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2025.04.30' +__version__ = '2025.05.22' -RELEASE_GIT_HEAD = '505b400795af557bdcfd9d4fa7e9133b26ef431c' +RELEASE_GIT_HEAD = '7977b329ed97b216e37bd402f4935f28c00eac9e' VARIANT = None @@ -12,4 +12,4 @@ ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2025.04.30' +_pkg_version = '2025.05.22' From 53ea743a9c158f8ca2d75a09ca44ba68606042d8 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 22 May 2025 17:41:31 -0500 Subject: [PATCH 07/68] [ie/youtube] Fix automatic captions for some client combinations (#13268) Fix 32ed5f107c6c641958d1cd2752e130de4db55a13 Authored by: bashonly --- yt_dlp/extractor/youtube/_video.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 9f929664f..840829be6 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -3950,19 +3950,23 @@ def process_language(container, base_url, lang_code, sub_name, client_name, quer subtitles = {} skipped_subs_clients = set() - prs = traverse_obj(player_responses, ( - # Filter out initial_pr which does not have streamingData (smuggled client context) - lambda _, v: v['streamingData'] and v['captions']['playerCaptionsTracklistRenderer'])) - pctrs = traverse_obj(prs, (..., 'captions', 'playerCaptionsTracklistRenderer', {dict})) + # Only web/mweb clients provide translationLanguages, so include initial_pr in the traversal translation_languages = { - lang.get('languageCode'): self._get_text(lang.get('languageName'), max_runs=1) - for lang in traverse_obj(pctrs, (..., 'translationLanguages', ..., {dict}))} + lang['languageCode']: self._get_text(lang['languageName'], max_runs=1) + for lang in traverse_obj(player_responses, ( + ..., 'captions', 'playerCaptionsTracklistRenderer', 'translationLanguages', + lambda _, v: v['languageCode'] and v['languageName'])) + } # NB: Constructing the full subtitle dictionary is slow get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and ( self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')) - all_captions = traverse_obj(pctrs, (..., 'captionTracks', ..., {dict})) + # Filter out initial_pr which does not have streamingData (smuggled client context) + prs = traverse_obj(player_responses, ( + lambda _, v: v['streamingData'] and v['captions']['playerCaptionsTracklistRenderer'])) + all_captions = traverse_obj(prs, ( + ..., 'captions', 'playerCaptionsTracklistRenderer', 'captionTracks', ..., {dict})) need_subs_langs = {get_lang_code(sub) for sub in all_captions if sub.get('kind') != 'asr'} need_caps_langs = { remove_start(get_lang_code(sub), 'a-') From e0d6c0822930f6e63f574d46d946a58b73ecd10c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 22 May 2025 17:42:42 -0500 Subject: [PATCH 08/68] [ie/patreon] Fix m3u8 formats extraction (#13266) Closes #13263 Authored by: bashonly --- yt_dlp/extractor/patreon.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 7794cae6c..dddb09c91 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -341,7 +341,7 @@ def _real_extract(self, url): })) # all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, Vimeo - headers = {'referer': 'https://patreon.com/'} + headers = {'referer': url} # handle Vimeo embeds if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo': @@ -379,11 +379,13 @@ def _real_extract(self, url): 'url': post_file['url'], }) elif name == 'video' or determine_ext(post_file.get('url')) == 'm3u8': - formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + post_file['url'], video_id, headers=headers) entries.append({ 'id': video_id, 'formats': formats, 'subtitles': subtitles, + 'http_headers': headers, }) can_view_post = traverse_obj(attributes, 'current_user_can_view') From 1a8a03ea8d827107319a18076ee3505090667c5a Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 23 May 2025 07:53:36 -0500 Subject: [PATCH 09/68] [ie/patreon] Fix referer header used for embeds (#13276) Fix e0d6c0822930f6e63f574d46d946a58b73ecd10c Closes #13263 Authored by: bashonly --- yt_dlp/extractor/patreon.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index dddb09c91..2c1436cac 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -340,8 +340,9 @@ def _real_extract(self, url): 'channel_follower_count': ('attributes', 'patron_count', {int_or_none}), })) - # all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, Vimeo - headers = {'referer': url} + # Must be all-lowercase 'referer' so we can smuggle it to Generic, SproutVideo, and Vimeo. + # patreon.com URLs redirect to www.patreon.com; this matters when requesting mux.com m3u8s + headers = {'referer': 'https://www.patreon.com/'} # handle Vimeo embeds if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo': @@ -352,7 +353,7 @@ def _real_extract(self, url): v_url, video_id, 'Checking Vimeo embed URL', headers=headers, fatal=False, errnote=False, expected_status=429): # 429 is TLS fingerprint rejection entries.append(self.url_result( - VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'), + VimeoIE._smuggle_referrer(v_url, headers['referer']), VimeoIE, url_transparent=True)) embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none})) From 52f9729c9a92ad4656d746ff0b1acecb87b3e96d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 23 May 2025 07:58:53 -0500 Subject: [PATCH 10/68] [ie/twitcasting] Fix password-protected livestream support (#13097) Closes #13096 Authored by: bashonly --- yt_dlp/extractor/twitcasting.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index 0a7f95c21..ebc2963b0 100644 --- a/yt_dlp/extractor/twitcasting.py +++ b/yt_dlp/extractor/twitcasting.py @@ -1,4 +1,5 @@ import base64 +import hashlib import itertools import re @@ -16,6 +17,7 @@ str_to_int, try_get, unified_timestamp, + update_url_query, url_or_none, urlencode_postdata, urljoin, @@ -171,6 +173,10 @@ def find_dmu(x): 'player': 'pc_web', }) + password_params = { + 'word': hashlib.md5(video_password.encode()).hexdigest(), + } if video_password else None + formats = [] # low: 640x360, medium: 1280x720, high: 1920x1080 qq = qualities(['low', 'medium', 'high']) @@ -178,7 +184,7 @@ def find_dmu(x): 'tc-hls', 'streams', {dict.items}, lambda _, v: url_or_none(v[1]), )): formats.append({ - 'url': m3u8_url, + 'url': update_url_query(m3u8_url, password_params), 'format_id': f'hls-{quality}', 'ext': 'mp4', 'quality': qq(quality), @@ -192,7 +198,7 @@ def find_dmu(x): 'llfmp4', 'streams', {dict.items}, lambda _, v: url_or_none(v[1]), )): formats.append({ - 'url': ws_url, + 'url': update_url_query(ws_url, password_params), 'format_id': f'ws-{mode}', 'ext': 'mp4', 'quality': qq(mode), From f8051e3a61686c5db1de5f5746366ecfbc3ad20c Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Sat, 24 May 2025 02:29:55 +0900 Subject: [PATCH 11/68] [ie/toutiao] Add extractor (#13246) Closes #12125 Authored by: doe1080 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/toutiao.py | 121 ++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 yt_dlp/extractor/toutiao.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 14a006893..c516c79ce 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2147,6 +2147,7 @@ from .toggo import ToggoIE from .tonline import TOnlineIE from .toongoggles import ToonGogglesIE +from .toutiao import ToutiaoIE from .toutv import TouTvIE from .toypics import ( ToypicsIE, diff --git a/yt_dlp/extractor/toutiao.py b/yt_dlp/extractor/toutiao.py new file mode 100644 index 000000000..b2a5aa236 --- /dev/null +++ b/yt_dlp/extractor/toutiao.py @@ -0,0 +1,121 @@ +import json +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + str_or_none, + try_call, + url_or_none, +) +from ..utils.traversal import find_element, traverse_obj + + +class ToutiaoIE(InfoExtractor): + IE_NAME = 'toutiao' + IE_DESC = '今日头条' + + _VALID_URL = r'https?://www\.toutiao\.com/video/(?P\d+)/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.toutiao.com/video/7505382061495176511/', + 'info_dict': { + 'id': '7505382061495176511', + 'ext': 'mp4', + 'title': '新疆多地现不明飞行物,目击者称和月亮一样亮,几秒内突然加速消失,气象部门回应', + 'comment_count': int, + 'duration': 9.753, + 'like_count': int, + 'release_date': '20250517', + 'release_timestamp': 1747483344, + 'thumbnail': r're:https?://p\d+-sign\.toutiaoimg\.com/.+$', + 'uploader': '极目新闻', + 'uploader_id': 'MS4wLjABAAAAeateBb9Su8I3MJOZozmvyzWktmba5LMlliRDz1KffnM', + 'view_count': int, + }, + }, { + 'url': 'https://www.toutiao.com/video/7479446610359878153/', + 'info_dict': { + 'id': '7479446610359878153', + 'ext': 'mp4', + 'title': '小伙竟然利用两块磁铁制作成磁力减震器,简直太有创意了!', + 'comment_count': int, + 'duration': 118.374, + 'like_count': int, + 'release_date': '20250308', + 'release_timestamp': 1741444368, + 'thumbnail': r're:https?://p\d+-sign\.toutiaoimg\.com/.+$', + 'uploader': '小莉创意发明', + 'uploader_id': 'MS4wLjABAAAA4f7d4mwtApALtHIiq-QM20dwXqe32NUz0DeWF7wbHKw', + 'view_count': int, + }, + }] + + def _real_initialize(self): + if self._get_cookies('https://www.toutiao.com').get('ttwid'): + return + + urlh = self._request_webpage( + 'https://ttwid.bytedance.com/ttwid/union/register/', None, + 'Fetching ttwid', 'Unable to fetch ttwid', headers={ + 'Content-Type': 'application/json', + }, data=json.dumps({ + 'aid': 24, + 'needFid': False, + 'region': 'cn', + 'service': 'www.toutiao.com', + 'union': True, + }).encode(), + ) + + if ttwid := try_call(lambda: self._get_cookies(urlh.url)['ttwid'].value): + self._set_cookie('.toutiao.com', 'ttwid', ttwid) + return + + self.raise_login_required() + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = traverse_obj(webpage, ( + {find_element(tag='script', id='RENDER_DATA')}, + {urllib.parse.unquote}, {json.loads}, 'data', 'initialVideo', + )) + + formats = [] + for video in traverse_obj(video_data, ( + 'videoPlayInfo', 'video_list', lambda _, v: v['main_url'], + )): + formats.append({ + 'url': video['main_url'], + **traverse_obj(video, ('video_meta', { + 'acodec': ('audio_profile', {str}), + 'asr': ('audio_sample_rate', {int_or_none}), + 'audio_channels': ('audio_channels', {float_or_none}, {int_or_none}), + 'ext': ('vtype', {str}), + 'filesize': ('size', {int_or_none}), + 'format_id': ('definition', {str}), + 'fps': ('fps', {int_or_none}), + 'height': ('vheight', {int_or_none}), + 'tbr': ('real_bitrate', {float_or_none(scale=1000)}), + 'vcodec': ('codec_type', {str}), + 'width': ('vwidth', {int_or_none}), + })), + }) + + return { + 'id': video_id, + 'formats': formats, + **traverse_obj(video_data, { + 'comment_count': ('commentCount', {int_or_none}), + 'duration': ('videoPlayInfo', 'video_duration', {float_or_none}), + 'like_count': ('repinCount', {int_or_none}), + 'release_timestamp': ('publishTime', {int_or_none}), + 'thumbnail': (('poster', 'coverUrl'), {url_or_none}, any), + 'title': ('title', {str}), + 'uploader': ('userInfo', 'name', {str}), + 'uploader_id': ('userInfo', 'userId', {str_or_none}), + 'view_count': ('playCount', {int_or_none}), + 'webpage_url': ('detailUrl', {url_or_none}), + }), + } From 538eb305673c26bff6a2b12f1c96375fe02ce41a Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 23 May 2025 12:42:24 -0500 Subject: [PATCH 12/68] [ie/podchaser] Fix extractor (#13271) Closes #13269 Authored by: bashonly --- yt_dlp/extractor/podchaser.py | 59 ++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/yt_dlp/extractor/podchaser.py b/yt_dlp/extractor/podchaser.py index 4570f0f17..6c125f9ba 100644 --- a/yt_dlp/extractor/podchaser.py +++ b/yt_dlp/extractor/podchaser.py @@ -5,11 +5,13 @@ from ..utils import ( OnDemandPagedList, float_or_none, + int_or_none, + orderedSet, str_or_none, - str_to_int, - traverse_obj, unified_timestamp, + url_or_none, ) +from ..utils.traversal import require, traverse_obj class PodchaserIE(InfoExtractor): @@ -21,24 +23,25 @@ class PodchaserIE(InfoExtractor): 'id': '104365585', 'title': 'Ep. 285 – freeze me off', 'description': 'cam ahn', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:https?://.+/.+\.jpg', 'ext': 'mp3', - 'categories': ['Comedy'], + 'categories': ['Comedy', 'News', 'Politics', 'Arts'], 'tags': ['comedy', 'dark humor'], - 'series': 'Cum Town', + 'series': 'The Adam Friedland Show Podcast', 'duration': 3708, 'timestamp': 1636531259, 'upload_date': '20211110', 'average_rating': 4.0, + 'series_id': '36924', }, }, { 'url': 'https://www.podchaser.com/podcasts/the-bone-zone-28853', 'info_dict': { 'id': '28853', 'title': 'The Bone Zone', - 'description': 'Podcast by The Bone Zone', + 'description': r're:The official home of the Bone Zone podcast.+', }, - 'playlist_count': 275, + 'playlist_mincount': 275, }, { 'url': 'https://www.podchaser.com/podcasts/sean-carrolls-mindscape-scienc-699349/episodes', 'info_dict': { @@ -51,19 +54,33 @@ class PodchaserIE(InfoExtractor): @staticmethod def _parse_episode(episode, podcast): - return { - 'id': str(episode.get('id')), - 'title': episode.get('title'), - 'description': episode.get('description'), - 'url': episode.get('audio_url'), - 'thumbnail': episode.get('image_url'), - 'duration': str_to_int(episode.get('length')), - 'timestamp': unified_timestamp(episode.get('air_date')), - 'average_rating': float_or_none(episode.get('rating')), - 'categories': list(set(traverse_obj(podcast, (('summary', None), 'categories', ..., 'text')))), - 'tags': traverse_obj(podcast, ('tags', ..., 'text')), - 'series': podcast.get('title'), - } + info = traverse_obj(episode, { + 'id': ('id', {int}, {str_or_none}, {require('episode ID')}), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'url': ('audio_url', {url_or_none}), + 'thumbnail': ('image_url', {url_or_none}), + 'duration': ('length', {int_or_none}), + 'timestamp': ('air_date', {unified_timestamp}), + 'average_rating': ('rating', {float_or_none}), + }) + info.update(traverse_obj(podcast, { + 'series': ('title', {str}), + 'series_id': ('id', {int}, {str_or_none}), + 'categories': (('summary', None), 'categories', ..., 'text', {str}, filter, all, {orderedSet}), + 'tags': ('tags', ..., 'text', {str}), + })) + info['vcodec'] = 'none' + + if info.get('series_id'): + podcast_slug = traverse_obj(podcast, ('slug', {str})) or 'podcast' + episode_slug = traverse_obj(episode, ('slug', {str})) or 'episode' + info['webpage_url'] = '/'.join(( + 'https://www.podchaser.com/podcasts', + '-'.join((podcast_slug[:30].rstrip('-'), info['series_id'])), + '-'.join((episode_slug[:30].rstrip('-'), info['id'])))) + + return info def _call_api(self, path, *args, **kwargs): return self._download_json(f'https://api.podchaser.com/{path}', *args, **kwargs) @@ -93,5 +110,5 @@ def _real_extract(self, url): OnDemandPagedList(functools.partial(self._fetch_page, podcast_id, podcast), self._PAGE_SIZE), str_or_none(podcast.get('id')), podcast.get('title'), podcast.get('description')) - episode = self._call_api(f'episodes/{episode_id}', episode_id) + episode = self._call_api(f'podcasts/{podcast_id}/episodes/{episode_id}/player_ids', episode_id) return self._parse_episode(episode, podcast) From 7794374de8afb20499b023107e2abfd4e6b93ee4 Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Sat, 24 May 2025 04:25:56 +0900 Subject: [PATCH 13/68] [ie/twitter:broadcast] Support events URLs (#13248) Closes #12989 Authored by: doe1080 --- yt_dlp/extractor/twitter.py | 56 +++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index ad3e74588..65182b971 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -20,7 +20,6 @@ remove_end, str_or_none, strip_or_none, - traverse_obj, truncate_string, try_call, try_get, @@ -29,6 +28,7 @@ url_or_none, xpath_text, ) +from ..utils.traversal import require, traverse_obj class TwitterBaseIE(InfoExtractor): @@ -1596,8 +1596,8 @@ def _find_dimension(target): class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): IE_NAME = 'twitter:broadcast' - _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P[0-9a-zA-Z]{13})' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/(?Pbroadcasts|events)/(?P\w+)' _TESTS = [{ # untitled Periscope video 'url': 'https://twitter.com/i/broadcasts/1yNGaQLWpejGj', @@ -1605,6 +1605,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'id': '1yNGaQLWpejGj', 'ext': 'mp4', 'title': 'Andrea May Sahouri - Periscope Broadcast', + 'display_id': '1yNGaQLWpejGj', 'uploader': 'Andrea May Sahouri', 'uploader_id': 'andreamsahouri', 'uploader_url': 'https://twitter.com/andreamsahouri', @@ -1612,6 +1613,8 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'upload_date': '20200601', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'was_live', }, }, { 'url': 'https://twitter.com/i/broadcasts/1ZkKzeyrPbaxv', @@ -1619,6 +1622,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'id': '1ZkKzeyrPbaxv', 'ext': 'mp4', 'title': 'Starship | SN10 | High-Altitude Flight Test', + 'display_id': '1ZkKzeyrPbaxv', 'uploader': 'SpaceX', 'uploader_id': 'SpaceX', 'uploader_url': 'https://twitter.com/SpaceX', @@ -1626,6 +1630,8 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'upload_date': '20210303', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'was_live', }, }, { 'url': 'https://twitter.com/i/broadcasts/1OyKAVQrgzwGb', @@ -1633,6 +1639,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'id': '1OyKAVQrgzwGb', 'ext': 'mp4', 'title': 'Starship Flight Test', + 'display_id': '1OyKAVQrgzwGb', 'uploader': 'SpaceX', 'uploader_id': 'SpaceX', 'uploader_url': 'https://twitter.com/SpaceX', @@ -1640,21 +1647,58 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'upload_date': '20230420', 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', 'view_count': int, + 'concurrent_view_count': int, + 'live_status': 'was_live', + }, + }, { + 'url': 'https://x.com/i/events/1910629646300762112', + 'info_dict': { + 'id': '1LyxBWDRNqyKN', + 'ext': 'mp4', + 'title': '#ガンニバル ウォッチパーティー', + 'concurrent_view_count': int, + 'display_id': '1910629646300762112', + 'live_status': 'was_live', + 'release_date': '20250423', + 'release_timestamp': 1745409000, + 'tags': ['ガンニバル'], + 'thumbnail': r're:https?://[^?#]+\.jpg\?token=', + 'timestamp': 1745403328, + 'upload_date': '20250423', + 'uploader': 'ディズニープラス公式', + 'uploader_id': 'DisneyPlusJP', + 'uploader_url': 'https://twitter.com/DisneyPlusJP', + 'view_count': int, }, }] def _real_extract(self, url): - broadcast_id = self._match_id(url) + broadcast_type, display_id = self._match_valid_url(url).group('type', 'id') + + if broadcast_type == 'events': + timeline = self._call_api( + f'live_event/1/{display_id}/timeline.json', display_id) + broadcast_id = traverse_obj(timeline, ( + 'twitter_objects', 'broadcasts', ..., ('id', 'broadcast_id'), + {str}, any, {require('broadcast ID')})) + else: + broadcast_id = display_id + broadcast = self._call_api( 'broadcasts/show.json', broadcast_id, {'ids': broadcast_id})['broadcasts'][broadcast_id] if not broadcast: raise ExtractorError('Broadcast no longer exists', expected=True) info = self._parse_broadcast_data(broadcast, broadcast_id) - info['title'] = broadcast.get('status') or info.get('title') - info['uploader_id'] = broadcast.get('twitter_username') or info.get('uploader_id') - info['uploader_url'] = format_field(broadcast, 'twitter_username', 'https://twitter.com/%s', default=None) + info.update({ + 'display_id': display_id, + 'title': broadcast.get('status') or info.get('title'), + 'uploader_id': broadcast.get('twitter_username') or info.get('uploader_id'), + 'uploader_url': format_field( + broadcast, 'twitter_username', 'https://twitter.com/%s', default=None), + }) if info['live_status'] == 'is_upcoming': + self.raise_no_formats('This live broadcast has not yet started', expected=True) return info media_key = broadcast['media_key'] From 0ee1102268cf31b07f8a8318a47424c66b2f7378 Mon Sep 17 00:00:00 2001 From: bashonly Date: Mon, 26 May 2025 13:34:20 -0500 Subject: [PATCH 14/68] [ie/adobepass] Always add newer user-agent when required (#13131) Fix dcfeea4dd5e5686821350baa6c7767a011944867 Closes #516 Authored by: bashonly --- yt_dlp/extractor/adobepass.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index f1b877927..f580dfda7 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -45,6 +45,7 @@ 'name': 'Comcast XFINITY', 'username_field': 'user', 'password_field': 'passwd', + 'needs_newer_ua': True, }, 'TWC': { 'name': 'Time Warner Cable | Spectrum', @@ -1355,7 +1356,6 @@ class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' - _MODERN_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; rv:131.0) Gecko/20100101 Firefox/131.0' _MVPD_CACHE = 'ap-mvpd' _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page' @@ -1367,6 +1367,14 @@ def _download_webpage_handle(self, *args, **kwargs): return super()._download_webpage_handle( *args, **kwargs) + @staticmethod + def _get_mso_headers(mso_info): + # yt-dlp's default user-agent is usually too old for some MSO's like Comcast_SSO + # See: https://github.com/yt-dlp/yt-dlp/issues/10848 + return { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:131.0) Gecko/20100101 Firefox/131.0', + } if mso_info.get('needs_newer_ua') else {} + @staticmethod def _get_mvpd_resource(provider_id, title, guid, rating): channel = etree.Element('channel') @@ -1383,6 +1391,12 @@ def _get_mvpd_resource(provider_id, title, guid, rating): return '' + etree.tostring(channel).decode() + '' def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): + mso_id = self.get_param('ap_mso') + if mso_id: + mso_info = MSO_INFO[mso_id] + else: + mso_info = {} + def xml_text(xml_str, tag): return self._search_regex( f'<{tag}>(.+?)', xml_str, tag) @@ -1400,6 +1414,7 @@ def post_form(form_page_res, note, data={}): form_data.update(data) return self._download_webpage_handle( post_url, video_id, note, data=urlencode_postdata(form_data), headers={ + **self._get_mso_headers(mso_info), 'Content-Type': 'application/x-www-form-urlencoded', }) @@ -1439,12 +1454,10 @@ def extract_redirect_url(html, url=None, fatal=False): if authn_token and is_expired(authn_token, 'simpleTokenExpires'): authn_token = None if not authn_token: - mso_id = self.get_param('ap_mso') if mso_id: username, password = self._get_login_info('ap_username', 'ap_password', mso_id) if not username or not password: raise_mvpd_required() - mso_info = MSO_INFO[mso_id] provider_redirect_page_res = self._download_webpage_handle( self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, @@ -1455,11 +1468,7 @@ def extract_redirect_url(html, url=None, fatal=False): 'no_iframe': 'false', 'domain_name': 'adobe.com', 'redirect_url': url, - }, headers={ - # yt-dlp's default user-agent is usually too old for Comcast_SSO - # See: https://github.com/yt-dlp/yt-dlp/issues/10848 - 'User-Agent': self._MODERN_USER_AGENT, - } if mso_id == 'Comcast_SSO' else None) + }, headers=self._get_mso_headers(mso_info)) elif not self._cookies_passed: raise_mvpd_required() @@ -1489,8 +1498,8 @@ def extract_redirect_url(html, url=None, fatal=False): oauth_redirect_url = extract_redirect_url( provider_redirect_page, fatal=True) provider_login_page_res = self._download_webpage_handle( - oauth_redirect_url, video_id, - self._DOWNLOADING_LOGIN_PAGE) + oauth_redirect_url, video_id, self._DOWNLOADING_LOGIN_PAGE, + headers=self._get_mso_headers(mso_info)) else: provider_login_page_res = post_form( provider_redirect_page_res, From 89c1b349ad81318d9d3bea76c01c891696e58d38 Mon Sep 17 00:00:00 2001 From: bashonly Date: Mon, 26 May 2025 13:48:10 -0500 Subject: [PATCH 15/68] [ie/adobepass] Validate login URL before sending credentials (#13131) Authored by: bashonly --- yt_dlp/extractor/adobepass.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index f580dfda7..8a5e7d9b5 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -45,6 +45,7 @@ 'name': 'Comcast XFINITY', 'username_field': 'user', 'password_field': 'passwd', + 'login_hostname': 'login.xfinity.com', 'needs_newer_ua': True, }, 'TWC': { @@ -75,6 +76,7 @@ 'name': 'Verizon FiOS', 'username_field': 'IDToken1', 'password_field': 'IDToken2', + 'login_hostname': 'ssoauth.verizon.com', }, 'Cablevision': { 'name': 'Optimum/Cablevision', @@ -1339,6 +1341,7 @@ 'name': 'Sling TV', 'username_field': 'username', 'password_field': 'password', + 'login_hostname': 'identity.sling.com', }, 'Suddenlink': { 'name': 'Suddenlink', @@ -1405,11 +1408,22 @@ def is_expired(token, date_ele): token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(token, date_ele))) return token_expires and token_expires <= int(time.time()) - def post_form(form_page_res, note, data={}): + def post_form(form_page_res, note, data={}, validate_url=False): form_page, urlh = form_page_res post_url = self._html_search_regex(r']+action=(["\'])(?P.+?)\1', form_page, 'post url', group='url') if not re.match(r'https?://', post_url): post_url = urllib.parse.urljoin(urlh.url, post_url) + if validate_url: + # This request is submitting credentials so we should validate it when possible + url_parsed = urllib.parse.urlparse(post_url) + expected_hostname = mso_info.get('login_hostname') + if expected_hostname and expected_hostname != url_parsed.hostname: + raise ExtractorError( + f'Unexpected login URL hostname; expected "{expected_hostname}" but got ' + f'"{url_parsed.hostname}". Aborting before submitting credentials') + if url_parsed.scheme != 'https': + self.write_debug('Upgrading login URL scheme to https') + post_url = urllib.parse.urlunparse(url_parsed._replace(scheme='https')) form_data = self._hidden_inputs(form_page) form_data.update(data) return self._download_webpage_handle( @@ -1509,7 +1523,7 @@ def extract_redirect_url(html, url=None, fatal=False): provider_login_page_res, 'Logging in', { mso_info['username_field']: username, mso_info['password_field']: password, - }) + }, validate_url=True) mvpd_confirm_page, urlh = mvpd_confirm_page_res if '' in mvpd_confirm_page: post_form(mvpd_confirm_page_res, 'Confirming Login') @@ -1548,7 +1562,7 @@ def extract_redirect_url(html, url=None, fatal=False): provider_redirect_page_res, 'Logging in', { mso_info['username_field']: username, mso_info['password_field']: password, - }) + }, validate_url=True) saml_login_page, urlh = saml_login_page_res if 'Please try again.' in saml_login_page: raise ExtractorError( @@ -1569,7 +1583,7 @@ def extract_redirect_url(html, url=None, fatal=False): [saml_login_page, saml_redirect_url], 'Logging in', { mso_info['username_field']: username, mso_info['password_field']: password, - }) + }, validate_url=True) if 'Please try again.' in saml_login_page: raise ExtractorError( 'Failed to login, incorrect User ID or Password.') @@ -1640,7 +1654,7 @@ def extract_redirect_url(html, url=None, fatal=False): provider_login_page_res, 'Logging in', { mso_info['username_field']: username, mso_info['password_field']: password, - }) + }, validate_url=True) provider_refresh_redirect_url = extract_redirect_url( provider_association_redirect, url=urlh.url) @@ -1691,7 +1705,7 @@ def extract_redirect_url(html, url=None, fatal=False): provider_login_page_res, 'Logging in', { mso_info['username_field']: username, mso_info['password_field']: password, - }) + }, validate_url=True) provider_refresh_redirect_url = extract_redirect_url( provider_association_redirect, url=urlh.url) @@ -1726,7 +1740,8 @@ def extract_redirect_url(html, url=None, fatal=False): } if mso_id in ('Cablevision', 'AlticeOne'): form_data['_eventId_proceed'] = '' - mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', form_data) + mvpd_confirm_page_res = post_form( + provider_login_page_res, 'Logging in', form_data, validate_url=True) if mso_id != 'Rogers': post_form(mvpd_confirm_page_res, 'Confirming Login') From 711c5d5d098fee2992a1a624b1c4b30364b91426 Mon Sep 17 00:00:00 2001 From: Max Date: Mon, 26 May 2025 13:57:20 -0500 Subject: [PATCH 16/68] [ie/adobepass] Rework to require software statement (#13131) * Also removes broken cookie support Closes #11811 Authored by: maxbin123, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/adobepass.py | 87 +++++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 25 deletions(-) diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index 8a5e7d9b5..080fe319e 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -3,6 +3,7 @@ import re import time import urllib.parse +import uuid import xml.etree.ElementTree as etree from .common import InfoExtractor @@ -1393,7 +1394,7 @@ def _get_mvpd_resource(provider_id, title, guid, rating): resource_rating.text = rating return '' + etree.tostring(channel).decode() + '' - def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): + def _extract_mvpd_auth(self, url, video_id, requestor_id, resource, software_statement): mso_id = self.get_param('ap_mso') if mso_id: mso_info = MSO_INFO[mso_id] @@ -1461,34 +1462,72 @@ def extract_redirect_url(html, url=None, fatal=False): } guid = xml_text(resource, 'guid') if '<' in resource else resource - count = 0 - while count < 2: + for _ in range(2): requestor_info = self.cache.load(self._MVPD_CACHE, requestor_id) or {} authn_token = requestor_info.get('authn_token') if authn_token and is_expired(authn_token, 'simpleTokenExpires'): authn_token = None if not authn_token: - if mso_id: - username, password = self._get_login_info('ap_username', 'ap_password', mso_id) - if not username or not password: - raise_mvpd_required() - - provider_redirect_page_res = self._download_webpage_handle( - self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, - 'Downloading Provider Redirect Page', query={ - 'noflash': 'true', - 'mso_id': mso_id, - 'requestor_id': requestor_id, - 'no_iframe': 'false', - 'domain_name': 'adobe.com', - 'redirect_url': url, - }, headers=self._get_mso_headers(mso_info)) - elif not self._cookies_passed: + if not mso_id: + raise_mvpd_required() + username, password = self._get_login_info('ap_username', 'ap_password', mso_id) + if not username or not password: raise_mvpd_required() - if not mso_id: - pass - elif mso_id == 'Comcast_SSO': + device_info, urlh = self._download_json_handle( + 'https://sp.auth.adobe.com/indiv/devices', + video_id, 'Registering device with Adobe', + data=json.dumps({'fingerprint': uuid.uuid4().hex}).encode(), + headers={'Content-Type': 'application/json; charset=UTF-8'}) + + device_id = device_info['deviceId'] + mvpd_headers['pass_sfp'] = urlh.get_header('pass_sfp') + mvpd_headers['Ap_21'] = device_id + + registration = self._download_json( + 'https://sp.auth.adobe.com/o/client/register', + video_id, 'Registering client with Adobe', + data=json.dumps({'software_statement': software_statement}).encode(), + headers={'Content-Type': 'application/json; charset=UTF-8'}) + + access_token = self._download_json( + 'https://sp.auth.adobe.com/o/client/token', video_id, + 'Obtaining access token', data=urlencode_postdata({ + 'grant_type': 'client_credentials', + 'client_id': registration['client_id'], + 'client_secret': registration['client_secret'], + }), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + })['access_token'] + mvpd_headers['Authorization'] = f'Bearer {access_token}' + + reg_code = self._download_json( + f'https://sp.auth.adobe.com/reggie/v1/{requestor_id}/regcode', + video_id, 'Obtaining registration code', + data=urlencode_postdata({ + 'requestor': requestor_id, + 'deviceId': device_id, + 'format': 'json', + }), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Authorization': f'Bearer {access_token}', + })['code'] + + provider_redirect_page_res = self._download_webpage_handle( + self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, + 'Downloading Provider Redirect Page', query={ + 'noflash': 'true', + 'mso_id': mso_id, + 'requestor_id': requestor_id, + 'no_iframe': 'false', + 'domain_name': 'adobe.com', + 'redirect_url': url, + 'reg_code': reg_code, + }, headers=self._get_mso_headers(mso_info)) + + if mso_id == 'Comcast_SSO': # Comcast page flow varies by video site and whether you # are on Comcast's network. provider_redirect_page, urlh = provider_redirect_page_res @@ -1751,6 +1790,7 @@ def extract_redirect_url(html, url=None, fatal=False): 'Retrieving Session', data=urlencode_postdata({ '_method': 'GET', 'requestor_id': requestor_id, + 'reg_code': reg_code, }), headers=mvpd_headers) except ExtractorError as e: if not mso_id and isinstance(e.cause, HTTPError) and e.cause.status == 401: @@ -1758,7 +1798,6 @@ def extract_redirect_url(html, url=None, fatal=False): raise if ' Date: Mon, 26 May 2025 14:03:39 -0500 Subject: [PATCH 17/68] [ie/adobepass] Add Fubo MSO (#13131) Closes #8287 Authored by: maxbin123 --- yt_dlp/extractor/adobepass.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py index 080fe319e..91c40b32e 100644 --- a/yt_dlp/extractor/adobepass.py +++ b/yt_dlp/extractor/adobepass.py @@ -11,6 +11,7 @@ from ..utils import ( NO_DEFAULT, ExtractorError, + parse_qs, unescapeHTML, unified_timestamp, urlencode_postdata, @@ -79,6 +80,11 @@ 'password_field': 'IDToken2', 'login_hostname': 'ssoauth.verizon.com', }, + 'Fubo': { + 'name': 'Fubo', + 'username_field': 'username', + 'password_field': 'password', + }, 'Cablevision': { 'name': 'Optimum/Cablevision', 'username_field': 'j_username', @@ -1761,6 +1767,27 @@ def extract_redirect_url(html, url=None, fatal=False): query=hidden_data) post_form(mvpd_confirm_page_res, 'Confirming Login') + elif mso_id == 'Fubo': + _, urlh = provider_redirect_page_res + + fubo_response = self._download_json( + 'https://api.fubo.tv/partners/tve/connect', video_id, + 'Authenticating with Fubo', 'Unable to authenticate with Fubo', + query=parse_qs(urlh.url), data=json.dumps({ + 'username': username, + 'password': password, + }).encode(), headers={ + 'Accept': 'application/json', + 'Content-Type': 'application/json', + }) + + self._request_webpage( + 'https://sp.auth.adobe.com/adobe-services/oauth2', video_id, + 'Authenticating with Adobe', 'Failed to authenticate with Adobe', + query={ + 'code': fubo_response['code'], + 'state': fubo_response['state'], + }) else: # Some providers (e.g. DIRECTV NOW) have another meta refresh # based redirect that should be followed. From ed108b3ea481c6a4b5215a9302ba92d74baa2425 Mon Sep 17 00:00:00 2001 From: bashonly Date: Mon, 26 May 2025 14:06:39 -0500 Subject: [PATCH 18/68] [ie/theplatform] Improve metadata extraction (#13131) Authored by: bashonly --- yt_dlp/extractor/theplatform.py | 95 +++++++++++++++------------------ 1 file changed, 44 insertions(+), 51 deletions(-) diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py index ebe2ac296..0f8cdfed9 100644 --- a/yt_dlp/extractor/theplatform.py +++ b/yt_dlp/extractor/theplatform.py @@ -12,11 +12,13 @@ float_or_none, int_or_none, mimetype2ext, + parse_age_limit, parse_qs, traverse_obj, unsmuggle_url, update_url, update_url_query, + url_or_none, urlhandle_detect_ext, xpath_with_ns, ) @@ -63,62 +65,53 @@ def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL d return formats, subtitles - def _download_theplatform_metadata(self, path, video_id): - info_url = f'http://link.theplatform.{self._TP_TLD}/s/{path}?format=preview' - return self._download_json(info_url, video_id) + def _download_theplatform_metadata(self, path, video_id, fatal=True): + return self._download_json( + f'https://link.theplatform.{self._TP_TLD}/s/{path}', video_id, + fatal=fatal, query={'format': 'preview'}) or {} - def _parse_theplatform_metadata(self, info): - subtitles = {} - captions = info.get('captions') - if isinstance(captions, list): - for caption in captions: - lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') - subtitles.setdefault(lang, []).append({ - 'ext': mimetype2ext(mime), - 'url': src, - }) + @staticmethod + def _parse_theplatform_metadata(tp_metadata): + def site_specific_filter(*fields): + return lambda k, v: v and k.endswith(tuple(f'${f}' for f in fields)) - duration = info.get('duration') - tp_chapters = info.get('chapters', []) - chapters = [] - if tp_chapters: - def _add_chapter(start_time, end_time): - start_time = float_or_none(start_time, 1000) - end_time = float_or_none(end_time, 1000) - if start_time is None or end_time is None: - return - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - }) + info = traverse_obj(tp_metadata, { + 'title': ('title', {str}), + 'episode': ('title', {str}), + 'description': ('description', {str}), + 'thumbnail': ('defaultThumbnailUrl', {url_or_none}), + 'duration': ('duration', {float_or_none(scale=1000)}), + 'timestamp': ('pubDate', {float_or_none(scale=1000)}), + 'uploader': ('billingCode', {str}), + 'creators': ('author', {str}, filter, all, filter), + 'categories': ( + 'categories', lambda _, v: v.get('label') in ['category', None], + 'name', {str}, filter, all, filter), + 'tags': ('keywords', {str}, filter, {lambda x: re.split(r'[;,]\s?', x)}, filter), + 'age_limit': ('ratings', ..., 'rating', {parse_age_limit}, any), + 'season_number': (site_specific_filter('seasonNumber'), {int_or_none}, any), + 'episode_number': (site_specific_filter('episodeNumber', 'airOrder'), {int_or_none}, any), + 'series': (site_specific_filter('show', 'seriesTitle', 'seriesShortTitle'), (None, ...), {str}, any), + 'location': (site_specific_filter('region'), {str}, any), + 'media_type': (site_specific_filter('programmingType', 'type'), {str}, any), + }) - for chapter in tp_chapters[:-1]: - _add_chapter(chapter.get('startTime'), chapter.get('endTime')) - _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) + chapters = traverse_obj(tp_metadata, ('chapters', ..., { + 'start_time': ('startTime', {float_or_none(scale=1000)}), + 'end_time': ('endTime', {float_or_none(scale=1000)}), + })) + # Ignore pointless single chapters from short videos that span the entire video's duration + if len(chapters) > 1 or traverse_obj(chapters, (0, 'end_time')): + info['chapters'] = chapters - def extract_site_specific_field(field): - # A number of sites have custom-prefixed keys, e.g. 'cbc$seasonNumber' - return traverse_obj(info, lambda k, v: v and k.endswith(f'${field}'), get_all=False) + info['subtitles'] = {} + for caption in traverse_obj(tp_metadata, ('captions', lambda _, v: url_or_none(v['src']))): + info['subtitles'].setdefault(caption.get('lang') or 'en', []).append({ + 'url': caption['src'], + 'ext': mimetype2ext(caption.get('type')), + }) - return { - 'title': info['title'], - 'subtitles': subtitles, - 'description': info['description'], - 'thumbnail': info['defaultThumbnailUrl'], - 'duration': float_or_none(duration, 1000), - 'timestamp': int_or_none(info.get('pubDate'), 1000) or None, - 'uploader': info.get('billingCode'), - 'chapters': chapters, - 'creator': traverse_obj(info, ('author', {str})) or None, - 'categories': traverse_obj(info, ( - 'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None, - 'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})), - 'location': extract_site_specific_field('region'), - 'series': extract_site_specific_field('show') or extract_site_specific_field('seriesTitle'), - 'season_number': int_or_none(extract_site_specific_field('seasonNumber')), - 'episode_number': int_or_none(extract_site_specific_field('episodeNumber')), - 'media_type': extract_site_specific_field('programmingType') or extract_site_specific_field('type'), - } + return info def _extract_theplatform_metadata(self, path, video_id): info = self._download_theplatform_metadata(path, video_id) From 2d7949d5642bc37d1e71bf00c9a55260e5505d58 Mon Sep 17 00:00:00 2001 From: bashonly Date: Mon, 26 May 2025 14:14:16 -0500 Subject: [PATCH 19/68] [ie/nbc] Rework and adapt extractors to new AdobePass flow (#13131) Closes #1032, Closes #10874, Closes #11148, Closes #12432 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 4 +- yt_dlp/extractor/bravotv.py | 188 ------------- yt_dlp/extractor/nbc.py | 478 ++++++++++++++++++++++++-------- yt_dlp/extractor/syfy.py | 58 ---- 4 files changed, 365 insertions(+), 363 deletions(-) delete mode 100644 yt_dlp/extractor/bravotv.py delete mode 100644 yt_dlp/extractor/syfy.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c516c79ce..b0c52e0fc 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -300,7 +300,6 @@ BrainPOPIlIE, BrainPOPJrIE, ) -from .bravotv import BravoTVIE from .breitbart import BreitBartIE from .brightcove import ( BrightcoveLegacyIE, @@ -1262,6 +1261,7 @@ ) from .nbc import ( NBCIE, + BravoTVIE, NBCNewsIE, NBCOlympicsIE, NBCOlympicsStreamIE, @@ -1269,6 +1269,7 @@ NBCSportsStreamIE, NBCSportsVPlayerIE, NBCStationsIE, + SyfyIE, ) from .ndr import ( NDRIE, @@ -2022,7 +2023,6 @@ SVTSeriesIE, ) from .swearnet import SwearnetEpisodeIE -from .syfy import SyfyIE from .syvdk import SYVDKIE from .sztvhu import SztvHuIE from .tagesschau import TagesschauIE diff --git a/yt_dlp/extractor/bravotv.py b/yt_dlp/extractor/bravotv.py deleted file mode 100644 index 0b2c44798..000000000 --- a/yt_dlp/extractor/bravotv.py +++ /dev/null @@ -1,188 +0,0 @@ -from .adobepass import AdobePassIE -from ..networking import HEADRequest -from ..utils import ( - extract_attributes, - float_or_none, - get_element_html_by_class, - int_or_none, - merge_dicts, - parse_age_limit, - remove_end, - str_or_none, - traverse_obj, - unescapeHTML, - unified_timestamp, - update_url_query, - url_or_none, -) - - -class BravoTVIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?Pbravotv|oxygen)\.com/(?:[^/]+/)+(?P[^/?#]+)' - _TESTS = [{ - 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', - 'info_dict': { - 'id': '3923059', - 'ext': 'mp4', - 'title': 'The Top Chef Season 16 Winner Is...', - 'description': 'Find out who takes the title of Top Chef!', - 'upload_date': '20190314', - 'timestamp': 1552591860, - 'season_number': 16, - 'episode_number': 15, - 'series': 'Top Chef', - 'episode': 'The Top Chef Season 16 Winner Is...', - 'duration': 190.357, - 'season': 'Season 16', - 'thumbnail': r're:^https://.+\.jpg', - }, - 'params': {'skip_download': 'm3u8'}, - }, { - 'url': 'https://www.bravotv.com/top-chef/season-20/episode-1/london-calling', - 'info_dict': { - 'id': '9000234570', - 'ext': 'mp4', - 'title': 'London Calling', - 'description': 'md5:5af95a8cbac1856bd10e7562f86bb759', - 'upload_date': '20230310', - 'timestamp': 1678410000, - 'season_number': 20, - 'episode_number': 1, - 'series': 'Top Chef', - 'episode': 'London Calling', - 'duration': 3266.03, - 'season': 'Season 20', - 'chapters': 'count:7', - 'thumbnail': r're:^https://.+\.jpg', - 'age_limit': 14, - }, - 'params': {'skip_download': 'm3u8'}, - 'skip': 'This video requires AdobePass MSO credentials', - }, { - 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-1/closing-night', - 'info_dict': { - 'id': '3692045', - 'ext': 'mp4', - 'title': 'Closing Night', - 'description': 'md5:3170065c5c2f19548d72a4cbc254af63', - 'upload_date': '20180401', - 'timestamp': 1522623600, - 'season_number': 1, - 'episode_number': 1, - 'series': 'In Ice Cold Blood', - 'episode': 'Closing Night', - 'duration': 2629.051, - 'season': 'Season 1', - 'chapters': 'count:6', - 'thumbnail': r're:^https://.+\.jpg', - 'age_limit': 14, - }, - 'params': {'skip_download': 'm3u8'}, - 'skip': 'This video requires AdobePass MSO credentials', - }, { - 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', - 'info_dict': { - 'id': '3974019', - 'ext': 'mp4', - 'title': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)', - 'description': 'md5:f9d638dd6946a1c1c0533a9c6100eae5', - 'upload_date': '20190617', - 'timestamp': 1560790800, - 'season_number': 2, - 'episode_number': 16, - 'series': 'In Ice Cold Blood', - 'episode': '\'Handling The Horwitz House After The Murder (Season 2, Episode 16)', - 'duration': 68.235, - 'season': 'Season 2', - 'thumbnail': r're:^https://.+\.jpg', - 'age_limit': 14, - }, - 'params': {'skip_download': 'm3u8'}, - }, { - 'url': 'https://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', - 'only_matching': True, - }] - - def _real_extract(self, url): - site, display_id = self._match_valid_url(url).group('site', 'id') - webpage = self._download_webpage(url, display_id) - settings = self._search_json( - r']+data-drupal-selector="drupal-settings-json"[^>]*>', webpage, 'settings', display_id) - tve = extract_attributes(get_element_html_by_class('tve-video-deck-app', webpage) or '') - query = { - 'manifest': 'm3u', - 'formats': 'm3u,mpeg4', - } - - if tve: - account_pid = tve.get('data-mpx-media-account-pid') or 'HNK2IC' - account_id = tve['data-mpx-media-account-id'] - metadata = self._parse_json( - tve.get('data-normalized-video', ''), display_id, fatal=False, transform_source=unescapeHTML) - video_id = tve.get('data-guid') or metadata['guid'] - if tve.get('data-entitlement') == 'auth': - auth = traverse_obj(settings, ('tve_adobe_auth', {dict})) or {} - site = remove_end(site, 'tv') - release_pid = tve['data-release-pid'] - resource = self._get_mvpd_resource( - tve.get('data-adobe-pass-resource-id') or auth.get('adobePassResourceId') or site, - tve['data-title'], release_pid, tve.get('data-rating')) - query.update({ - 'switch': 'HLSServiceSecure', - 'auth': self._extract_mvpd_auth( - url, release_pid, auth.get('adobePassRequestorId') or site, resource), - }) - - else: - ls_playlist = traverse_obj(settings, ('ls_playlist', ..., {dict}), get_all=False) or {} - account_pid = ls_playlist.get('mpxMediaAccountPid') or 'PHSl-B' - account_id = ls_playlist['mpxMediaAccountId'] - video_id = ls_playlist['defaultGuid'] - metadata = traverse_obj( - ls_playlist, ('videos', lambda _, v: v['guid'] == video_id, {dict}), get_all=False) - - tp_url = f'https://link.theplatform.com/s/{account_pid}/media/guid/{account_id}/{video_id}' - tp_metadata = self._download_json( - update_url_query(tp_url, {'format': 'preview'}), video_id, fatal=False) - - chapters = traverse_obj(tp_metadata, ('chapters', ..., { - 'start_time': ('startTime', {float_or_none(scale=1000)}), - 'end_time': ('endTime', {float_or_none(scale=1000)}), - })) - # prune pointless single chapters that span the entire duration from short videos - if len(chapters) == 1 and not traverse_obj(chapters, (0, 'end_time')): - chapters = None - - m3u8_url = self._request_webpage(HEADRequest( - update_url_query(f'{tp_url}/stream.m3u8', query)), video_id, 'Checking m3u8 URL').url - if 'mpeg_cenc' in m3u8_url: - self.report_drm(video_id) - formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') - - return { - 'id': video_id, - 'formats': formats, - 'subtitles': subtitles, - 'chapters': chapters, - **merge_dicts(traverse_obj(tp_metadata, { - 'title': 'title', - 'description': 'description', - 'duration': ('duration', {float_or_none(scale=1000)}), - 'timestamp': ('pubDate', {float_or_none(scale=1000)}), - 'season_number': (('pl1$seasonNumber', 'nbcu$seasonNumber'), {int_or_none}), - 'episode_number': (('pl1$episodeNumber', 'nbcu$episodeNumber'), {int_or_none}), - 'series': (('pl1$show', 'nbcu$show'), (None, ...), {str}), - 'episode': (('title', 'pl1$episodeNumber', 'nbcu$episodeNumber'), {str_or_none}), - 'age_limit': ('ratings', ..., 'rating', {parse_age_limit}), - }, get_all=False), traverse_obj(metadata, { - 'title': 'title', - 'description': 'description', - 'duration': ('durationInSeconds', {int_or_none}), - 'timestamp': ('airDate', {unified_timestamp}), - 'thumbnail': ('thumbnailUrl', {url_or_none}), - 'season_number': ('seasonNumber', {int_or_none}), - 'episode_number': ('episodeNumber', {int_or_none}), - 'episode': 'episodeTitle', - 'series': 'show', - })), - } diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index d9aded09e..bd4862bde 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -6,7 +6,7 @@ from .adobepass import AdobePassIE from .common import InfoExtractor -from .theplatform import ThePlatformIE, default_ns +from .theplatform import ThePlatformBaseIE, ThePlatformIE, default_ns from ..networking import HEADRequest from ..utils import ( ExtractorError, @@ -14,26 +14,130 @@ UserNotLive, clean_html, determine_ext, + extract_attributes, float_or_none, + get_element_html_by_class, int_or_none, join_nonempty, + make_archive_id, mimetype2ext, parse_age_limit, parse_duration, + parse_iso8601, remove_end, - smuggle_url, - traverse_obj, try_get, unescapeHTML, unified_timestamp, update_url_query, url_basename, + url_or_none, ) +from ..utils.traversal import require, traverse_obj -class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?(?P://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P(?:NBCE|n)?\d+))' +class NBCUniversalBaseIE(ThePlatformBaseIE): + _GEO_COUNTRIES = ['US'] + _GEO_BYPASS = False + _M3U8_RE = r'https?://[^/?#]+/prod/[\w-]+/(?P[^?#]+/)cmaf/mpeg_(?:cbcs|cenc)\w*/master_cmaf\w*\.m3u8' + def _download_nbcu_smil_and_extract_m3u8_url(self, tp_path, video_id, query): + smil = self._download_xml( + f'https://link.theplatform.com/s/{tp_path}', video_id, + 'Downloading SMIL manifest', 'Failed to download SMIL manifest', query={ + **query, + 'format': 'SMIL', # XXX: Do not confuse "format" with "formats" + 'manifest': 'm3u', + 'switch': 'HLSServiceSecure', # Or else we get broken mp4 http URLs instead of HLS + }, headers=self.geo_verification_headers()) + + ns = f'//{{{default_ns}}}' + if url := traverse_obj(smil, (f'{ns}video/@src', lambda _, v: determine_ext(v) == 'm3u8', any)): + return url + + exc = traverse_obj(smil, (f'{ns}param', lambda _, v: v.get('name') == 'exception', '@value', any)) + if exc == 'GeoLocationBlocked': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + raise ExtractorError(traverse_obj(smil, (f'{ns}ref/@abstract', ..., any)), expected=exc == 'Expired') + + def _extract_nbcu_formats_and_subtitles(self, tp_path, video_id, query): + # formats='mpeg4' will return either a working m3u8 URL or an m3u8 template for non-DRM HLS + # formats='m3u+none,mpeg4' may return DRM HLS but w/the "folders" needed for non-DRM template + query['formats'] = 'm3u+none,mpeg4' + m3u8_url = self._download_nbcu_smil_and_extract_m3u8_url(tp_path, video_id, query) + + if mobj := re.fullmatch(self._M3U8_RE, m3u8_url): + query['formats'] = 'mpeg4' + m3u8_tmpl = self._download_nbcu_smil_and_extract_m3u8_url(tp_path, video_id, query) + # Example: https://vod-lf-oneapp-prd.akamaized.net/prod/video/{folders}master_hls.m3u8 + if '{folders}' in m3u8_tmpl: + self.write_debug('Found m3u8 URL template, formatting URL path') + m3u8_url = m3u8_tmpl.format(folders=mobj.group('folders')) + + if '/mpeg_cenc' in m3u8_url or '/mpeg_cbcs' in m3u8_url: + self.report_drm(video_id) + + return self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') + + def _extract_nbcu_video(self, url, display_id, old_ie_key=None): + webpage = self._download_webpage(url, display_id) + settings = self._search_json( + r']+data-drupal-selector="drupal-settings-json"[^>]*>', + webpage, 'settings', display_id) + + query = {} + tve = extract_attributes(get_element_html_by_class('tve-video-deck-app', webpage) or '') + if tve: + account_pid = tve.get('data-mpx-media-account-pid') or tve['data-mpx-account-pid'] + account_id = tve['data-mpx-media-account-id'] + metadata = self._parse_json( + tve.get('data-normalized-video') or '', display_id, fatal=False, transform_source=unescapeHTML) + video_id = tve.get('data-guid') or metadata['guid'] + if tve.get('data-entitlement') == 'auth': + auth = settings['tve_adobe_auth'] + release_pid = tve['data-release-pid'] + resource = self._get_mvpd_resource( + tve.get('data-adobe-pass-resource-id') or auth['adobePassResourceId'], + tve['data-title'], release_pid, tve.get('data-rating')) + query['auth'] = self._extract_mvpd_auth( + url, release_pid, auth['adobePassRequestorId'], + resource, auth['adobePassSoftwareStatement']) + else: + ls_playlist = traverse_obj(settings, ( + 'ls_playlist', lambda _, v: v['defaultGuid'], any, {require('LS playlist')})) + video_id = ls_playlist['defaultGuid'] + account_pid = ls_playlist.get('mpxMediaAccountPid') or ls_playlist['mpxAccountPid'] + account_id = ls_playlist['mpxMediaAccountId'] + metadata = traverse_obj(ls_playlist, ('videos', lambda _, v: v['guid'] == video_id, any)) or {} + + tp_path = f'{account_pid}/media/guid/{account_id}/{video_id}' + formats, subtitles = self._extract_nbcu_formats_and_subtitles(tp_path, video_id, query) + tp_metadata = self._download_theplatform_metadata(tp_path, video_id, fatal=False) + parsed_info = self._parse_theplatform_metadata(tp_metadata) + self._merge_subtitles(parsed_info['subtitles'], target=subtitles) + + return { + **parsed_info, + **traverse_obj(metadata, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'duration': ('durationInSeconds', {int_or_none}), + 'timestamp': ('airDate', {parse_iso8601}), + 'thumbnail': ('thumbnailUrl', {url_or_none}), + 'season_number': ('seasonNumber', {int_or_none}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'episode': ('episodeTitle', {str}), + 'series': ('show', {str}), + }), + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + '_old_archive_ids': [make_archive_id(old_ie_key, video_id)] if old_ie_key else None, + } + + +class NBCIE(NBCUniversalBaseIE): + _VALID_URL = r'https?(?P://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/?#]+/video/[^/?#]+/(?P\w+))' _TESTS = [ { 'url': 'http://www.nbc.com/the-tonight-show/video/jimmy-fallon-surprises-fans-at-ben-jerrys/2848237', @@ -49,47 +153,20 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'episode_number': 86, 'season': 'Season 2', 'season_number': 2, - 'series': 'Tonight Show: Jimmy Fallon', - 'duration': 237.0, - 'chapters': 'count:1', - 'tags': 'count:4', + 'series': 'Tonight', + 'duration': 236.504, + 'tags': 'count:2', 'thumbnail': r're:https?://.+\.jpg', 'categories': ['Series/The Tonight Show Starring Jimmy Fallon'], 'media_type': 'Full Episode', + 'age_limit': 14, + '_old_archive_ids': ['theplatform 2848237'], }, 'params': { 'skip_download': 'm3u8', }, }, { - 'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821', - 'info_dict': { - 'id': '2832821', - 'ext': 'mp4', - 'title': 'Star Wars Teaser', - 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442', - 'timestamp': 1417852800, - 'upload_date': '20141206', - 'uploader': 'NBCU-COM', - }, - 'skip': 'page not found', - }, - { - # HLS streams requires the 'hdnea3' cookie - 'url': 'http://www.nbc.com/Kings/video/goliath/n1806', - 'info_dict': { - 'id': '101528f5a9e8127b107e98c5e6ce4638', - 'ext': 'mp4', - 'title': 'Goliath', - 'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight and politics of the kingdom.', - 'timestamp': 1237100400, - 'upload_date': '20090315', - 'uploader': 'NBCU-COM', - }, - 'skip': 'page not found', - }, - { - # manifest url does not have extension 'url': 'https://www.nbc.com/the-golden-globe-awards/video/oprah-winfrey-receives-cecil-b-de-mille-award-at-the-2018-golden-globes/3646439', 'info_dict': { 'id': '3646439', @@ -99,48 +176,47 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'episode_number': 1, 'season': 'Season 75', 'season_number': 75, - 'series': 'The Golden Globe Awards', + 'series': 'Golden Globes', 'description': 'Oprah Winfrey receives the Cecil B. de Mille Award at the 75th Annual Golden Globe Awards.', 'uploader': 'NBCU-COM', 'upload_date': '20180107', 'timestamp': 1515312000, - 'duration': 570.0, + 'duration': 569.703, 'tags': 'count:8', 'thumbnail': r're:https?://.+\.jpg', - 'chapters': 'count:1', + 'media_type': 'Highlight', + 'age_limit': 0, + 'categories': ['Series/The Golden Globe Awards'], + '_old_archive_ids': ['theplatform 3646439'], }, 'params': { 'skip_download': 'm3u8', }, }, { - # new video_id format - 'url': 'https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978', + # Needs to be extracted from webpage instead of GraphQL + 'url': 'https://www.nbc.com/paris2024/video/ali-truwit-found-purpose-pool-after-her-life-changed/para24_sww_alitruwittodayshow_240823', 'info_dict': { - 'id': 'NBCE125189978', + 'id': 'para24_sww_alitruwittodayshow_240823', 'ext': 'mp4', - 'title': 'Ben\'s First Leap | NBC\'s Quantum Leap', - 'description': 'md5:a82762449b7ec4bb83291a7b355ebf8e', - 'uploader': 'NBCU-COM', - 'series': 'Quantum Leap', - 'season': 'Season 1', - 'season_number': 1, - 'episode': 'Ben\'s First Leap | NBC\'s Quantum Leap', - 'episode_number': 1, - 'duration': 170.171, - 'chapters': [], - 'timestamp': 1663956155, - 'upload_date': '20220923', - 'tags': 'count:10', - 'age_limit': 0, + 'title': 'Ali Truwit found purpose in the pool after her life changed', + 'description': 'md5:c16d7489e1516593de1cc5d3f39b9bdb', + 'uploader': 'NBCU-SPORTS', + 'duration': 311.077, 'thumbnail': r're:https?://.+\.jpg', - 'categories': ['Series/Quantum Leap 2022'], - 'media_type': 'Highlight', + 'episode': 'Ali Truwit found purpose in the pool after her life changed', + 'timestamp': 1724435902.0, + 'upload_date': '20240823', + '_old_archive_ids': ['theplatform para24_sww_alitruwittodayshow_240823'], }, 'params': { 'skip_download': 'm3u8', }, }, + { + 'url': 'https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978', + 'only_matching': True, + }, { 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', 'only_matching': True, @@ -151,6 +227,7 @@ class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'only_matching': True, }, ] + _SOFTWARE_STATEMENT = 'eyJhbGciOiJSUzI1NiJ9.eyJzdWIiOiI1Yzg2YjdkYy04NDI3LTRjNDUtOGQwZi1iNDkzYmE3MmQwYjQiLCJuYmYiOjE1Nzg3MDM2MzEsImlzcyI6ImF1dGguYWRvYmUuY29tIiwiaWF0IjoxNTc4NzAzNjMxfQ.QQKIsBhAjGQTMdAqRTqhcz2Cddr4Y2hEjnSiOeKKki4nLrkDOsjQMmqeTR0hSRarraxH54wBgLvsxI7LHwKMvr7G8QpynNAxylHlQD3yhN9tFhxt4KR5wW3as02B-W2TznK9bhNWPKIyHND95Uo2Mi6rEQoq8tM9O09WPWaanE5BX_-r6Llr6dPq5F0Lpx2QOn2xYRb1T4nFxdFTNoss8GBds8OvChTiKpXMLHegLTc1OS4H_1a8tO_37jDwSdJuZ8iTyRLV4kZ2cpL6OL5JPMObD4-HQiec_dfcYgMKPiIfP9ZqdXpec2SVaCLsWEk86ZYvD97hLIQrK5rrKd1y-A' def _real_extract(self, url): permalink, video_id = self._match_valid_url(url).groups() @@ -196,62 +273,50 @@ def _real_extract(self, url): 'userId': '0', }), })['data']['bonanzaPage']['metadata'] - query = { - 'mbr': 'true', - 'manifest': 'm3u', - 'switch': 'HLSServiceSecure', - } + + if not video_data: + # Some videos are not available via GraphQL API + webpage = self._download_webpage(url, video_id) + video_data = self._search_json( + r'''', + { + 'thumbnails': [{'url': 'https://www.nobelprize.org/images/12693-landscape-medium-gallery.jpg'}], + }, + {}, + ), ] for html, expected_dict, search_json_ld_kwargs in _TESTS: expect_dict( diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index d5607296d..1174bd4f5 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1675,9 +1675,9 @@ def extract_video_object(e): 'ext': mimetype2ext(e.get('encodingFormat')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnails': [{'url': unescapeHTML(url)} - for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL')) - if url_or_none(url)], + 'thumbnails': traverse_obj(e, (('thumbnailUrl', 'thumbnailURL', 'thumbnail_url'), (None, ...), { + 'url': ({str}, {unescapeHTML}, {self._proto_relative_url}, {url_or_none}), + })), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), # author can be an instance of 'Organization' or 'Person' types. From 148a1eb4c59e127965396c7a6e6acf1979de459e Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 1 Jun 2025 18:18:24 -0500 Subject: [PATCH 36/68] [ie/odnoklassniki] Detect and raise when login is required (#13361) Closes #13360 Authored by: bashonly --- yt_dlp/extractor/odnoklassniki.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py index d27d1c3f0..18eba42e6 100644 --- a/yt_dlp/extractor/odnoklassniki.py +++ b/yt_dlp/extractor/odnoklassniki.py @@ -273,6 +273,8 @@ def _extract_desktop(self, url): return self._extract_desktop(smuggle_url(url, {'referrer': 'https://boosty.to'})) elif error: raise ExtractorError(error, expected=True) + elif '>Access to this video is restricted' in webpage: + self.raise_login_required() player = self._parse_json( unescapeHTML(self._search_regex( @@ -429,7 +431,7 @@ def _extract_mobile(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - f'http://m.ok.ru/video/{video_id}', video_id, + f'https://m.ok.ru/video/{video_id}', video_id, note='Downloading mobile webpage') error = self._search_regex( From c723c4e5e78263df178dbe69844a3d05f3ef9e35 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 1 Jun 2025 18:20:29 -0500 Subject: [PATCH 37/68] [ie/vimeo] Extract subtitles from player subdomain (#13350) Closes #12198 Authored by: bashonly --- yt_dlp/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 09497b699..b268fad56 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -236,7 +236,7 @@ def _parse_config(self, config, video_id): for tt in (request.get('text_tracks') or []): subtitles.setdefault(tt['lang'], []).append({ 'ext': 'vtt', - 'url': urljoin('https://vimeo.com', tt['url']), + 'url': urljoin('https://player.vimeo.com/', tt['url']), }) thumbnails = [] From e1b6062f8c4a3fa33c65269d48d09ec78de765a2 Mon Sep 17 00:00:00 2001 From: barsnick Date: Tue, 3 Jun 2025 04:29:03 +0200 Subject: [PATCH 38/68] [ie/svt:play] Fix extractor (#13329) Closes #13312 Authored by: barsnick, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/svt.py | 134 +++++++++++--------------------- 2 files changed, 44 insertions(+), 91 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b0c52e0fc..34c98b537 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2017,7 +2017,6 @@ SverigesRadioPublicationIE, ) from .svt import ( - SVTIE, SVTPageIE, SVTPlayIE, SVTSeriesIE, diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py index 6a72f8d42..a48d7858d 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -6,10 +6,13 @@ determine_ext, dict_get, int_or_none, - traverse_obj, try_get, unified_timestamp, ) +from ..utils.traversal import ( + require, + traverse_obj, +) class SVTBaseIE(InfoExtractor): @@ -97,40 +100,8 @@ def _extract_video(self, video_info, video_id): } -class SVTIE(SVTBaseIE): - _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P\d+)&.*?\barticleId=(?P\d+)' - _EMBED_REGEX = [rf'(?: