From c840feeba121628708f14722cd3b12e5885106fa Mon Sep 17 00:00:00 2001
From: bashonly
Date: Thu, 5 Jun 2025 02:04:15 -0500
Subject: [PATCH 1/7] [ie] Add `_search_nextjs_v13_data` helper

Authored by: bashonly
---
 test/test_InfoExtractor.py | 28 +++++++++++++++++++
 yt_dlp/extractor/common.py | 55 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+)

diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index bc89b2955..b1f6ef255 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -1947,6 +1947,34 @@ def test_search_nextjs_data(self):
         with self.assertWarns(DeprecationWarning):
             self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {})

+    def test_search_nextjs_v13_data(self):
+        HTML = R'''
+
+
+
+
+
+
+
+        '''
+        EXPECTED = [{
+            'foo': 'bar',
+        }, {
+            'meta': {
+                'dateCreated': 1730489700,
+                'uuid': '40cac41d-8d29-4ef5-aa11-75047b9f0907',
+            },
+        }, {
+            'duplicated_field_name': {'x': 1},
+        }, {
+            'duplicated_field_name': {'y': 2},
+        }, {
+            'decoded': 'success',
+        }]
+        self.assertEqual(self.ie._search_nextjs_v13_data(HTML, None), EXPECTED)
+        self.assertEqual(self.ie._search_nextjs_v13_data('', None, fatal=False), [])
+        self.assertEqual(self.ie._search_nextjs_v13_data(None, None, fatal=False), [])
+

 if __name__ == '__main__':
     unittest.main()
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 1174bd4f5..125568305 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1,4 +1,5 @@
 import base64
+import binascii
 import collections
 import functools
 import getpass
@@ -1778,6 +1779,60 @@ def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAU
             r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data', video_id,
             end_pattern='</script>', fatal=fatal, default=default, **kw)

+    def _search_nextjs_v13_data(self, webpage, video_id, fatal=True):
+        """Parses Next.js app router flight data that was introduced in Next.js v13"""
+        nextjs_data = []
+        if not fatal and not isinstance(webpage, str):
+            return nextjs_data
+        # This regex pattern can afford to be and should be strict
+        # Ref: https://github.com/vercel/next.js/commit/5a4a08fdce91a038f2ed3a70568d3ed040403150
+        # /packages/next/src/server/app-render/use-flight-response.tsx
+        flight_segments = re.findall(r'<script\b[^>]*>self\.__next_f\.push\((\[.+?\])\)</script>', webpage)
+
+        def flatten(flight_data):
+            if not isinstance(flight_data, list) or not flight_data:
+                return
+            if len(flight_data) == 4 and flight_data[0] == '$':
+                _, name, _, data = flight_data
+                if not isinstance(data, dict):
+                    return
+                children = data.pop('children', None)
+                if data and name and name[0] == '$':
+                    # It is useful hydration JSON data
+                    nextjs_data.append(data)
+                flatten(children)
+                return
+            for f in flight_data:
+                flatten(f)
+
+        for flight_segment in flight_segments:
+            segment = self._parse_json(flight_segment, video_id, fatal=fatal, errnote=None if fatal else False)
+            # Some earlier versions of next.js "optimized" away this array structure; this is unsupported
+            # Ref: https://github.com/vercel/next.js/commit/0123a9d5c9a9a77a86f135b7ae30b46ca986d761
+            if not isinstance(segment, list) or len(segment) != 2:
+                self.write_debug(
+                    f'{video_id}: Unsupported next.js flight data structure detected', only_once=True)
+                continue
+            payload_type, chunk = segment
+            if payload_type == 3:
+                try:
+                    chunk = base64.b64decode(chunk).decode()
+                except (ValueError, binascii.Error):
+                    msg = 'Unable to parse next.js data: unable to decode flight data'
+                    if not fatal:
+                        self.report_warning(msg, video_id=video_id, only_once=True)
+                        continue
+                    raise ExtractorError(msg)
+            elif payload_type != 1:
+                # Ignore useless payload types (0: bootstrap, 2: form state)
+                continue
+            # Not all chunks are complete JSON data; this should always be non-fatal
+            flatten(self._search_json(
+                r'^[\da-f]+:', chunk, 'flight data', video_id,
+                default=None, contains_pattern=r'\[.+\]'))
+
+        return nextjs_data
+
     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
         """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
         rectx = re.escape(context_name)

From 7f91e4df8cbf1fc33c434fc6747e94381d7507c0 Mon Sep 17 00:00:00 2001
From: bashonly
Date: Thu, 5 Jun 2025 02:07:27 -0500
Subject: [PATCH 2/7] [ie/9now.com.au] Refactor extractor

Authored by: bashonly
---
 yt_dlp/extractor/ninenow.py | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/yt_dlp/extractor/ninenow.py b/yt_dlp/extractor/ninenow.py
index 7b0cb77a7..2f3a4ed28 100644
--- a/yt_dlp/extractor/ninenow.py
+++ b/yt_dlp/extractor/ninenow.py
@@ -1,6 +1,3 @@
-import json
-import re
-
 from .brightcove import BrightcoveNewIE
 from .common import InfoExtractor
 from ..utils import (
@@ -11,7 +8,12 @@
     str_or_none,
     url_or_none,
 )
-from ..utils.traversal import require, traverse_obj, value
+from ..utils.traversal import (
+    get_first,
+    require,
+    traverse_obj,
+    value,
+)


 class NineNowIE(InfoExtractor):
@@ -101,20 +103,11 @@ class NineNowIE(InfoExtractor):
     }]
     BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId={}'

-    # XXX: For parsing next.js v15+ data; see also yt_dlp.extractor.francetv and yt_dlp.extractor.goplay
-    def _find_json(self, s):
-        return self._search_json(
-            r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None)
-
     def _real_extract(self, url):
         display_id, video_type = self._match_valid_url(url).group('id', 'type')
         webpage = self._download_webpage(url, display_id)
-        common_data = traverse_obj(
-            re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage),
-            (..., {json.loads}, ..., {self._find_json},
-             lambda _, v: v['payload'][video_type]['slug'] == display_id,
-             'payload', any, {require('video data')}))
+        common_data = get_first(self._search_nextjs_v13_data(webpage, display_id), ('payload', {dict}))

         if traverse_obj(common_data, (video_type, 'video', 'drm', {bool})):
             self.report_drm(display_id)

From 0539ea2971ea9418a25ea00d983d0daa99735a66 Mon Sep 17 00:00:00 2001
From: bashonly
Date: Thu, 5 Jun 2025 02:07:40 -0500
Subject: [PATCH 3/7] [ie/goplay] Refactor and fix extractor

Authored by: bashonly
---
 yt_dlp/extractor/goplay.py | 44 +++++++++++++-----------------------------
 1 file changed, 15 insertions(+), 29 deletions(-)

diff --git a/yt_dlp/extractor/goplay.py b/yt_dlp/extractor/goplay.py
index c654c757c..2e959cead 100644
--- a/yt_dlp/extractor/goplay.py
+++ b/yt_dlp/extractor/goplay.py
@@ -5,16 +5,11 @@
 import hmac
 import json
 import os
-import re
 import urllib.parse

 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    int_or_none,
-    remove_end,
-    traverse_obj,
-)
+from ..utils import ExtractorError, int_or_none
+from ..utils.traversal import get_first, traverse_obj


 class GoPlayIE(InfoExtractor):
@@ -27,10 +22,10 @@ class GoPlayIE(InfoExtractor):
         'info_dict': {
             'id': '2baa4560-87a0-421b-bffc-359914e3c387',
             'ext': 'mp4',
-            'title': 'S22 - Aflevering 1',
+            'title': 'De Slimste Mens ter Wereld - S22 - Aflevering 1',
             'description': r're:In aflevering 1 nemen Daan Alferink, Tess Elst en Xander De Rycke .{66}',
             'series': 'De Slimste Mens ter Wereld',
-            'episode': 'Episode 1',
+            'episode': 'Wordt aangekondigd',
             'season_number': 22,
             'episode_number': 1,
             'season': 'Season 22',
@@ -52,7 +47,7 @@ class GoPlayIE(InfoExtractor):
         'info_dict': {
             'id': 'ecb79672-92b9-4cd9-a0d7-e2f0250681ee',
             'ext': 'mp4',
-            'title': 'S11 - Aflevering 1',
+            'title': 'De Mol - S11 - Aflevering 1',
             'description': r're:Tien kandidaten beginnen aan hun verovering van Amerika en ontmoeten .{102}',
             'episode': 'Episode 1',
             'series': 'De Mol',
@@ -75,21 +70,13 @@ def _real_initialize(self):
         if not self._id_token:
             raise self.raise_login_required(method='password')

-    # XXX: For parsing next.js v15+ data; see also yt_dlp.extractor.francetv
-    def _find_json(self, s):
-        return self._search_json(
-            r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None)
-
     def _real_extract(self, url):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
-        nextjs_data = traverse_obj(
-            re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*</script>', webpage),
-            (..., {json.loads}, ..., {self._find_json}, ...))
-        meta = traverse_obj(nextjs_data, (
-            ..., ..., 'children', ..., ..., 'children',
-            lambda _, v: v['video']['path'] == urllib.parse.urlparse(url).path, 'video', any))
+        nextjs_data = self._search_nextjs_v13_data(webpage, display_id)
+        meta = get_first(nextjs_data, (
+            lambda k, v: k in ('video', 'meta') and v['path'] == urllib.parse.urlparse(url).path))

         video_id = meta['uuid']
         info_dict = traverse_obj(meta, {
@@ -98,19 +85,18 @@ def _real_extract(self, url):
         })

         if traverse_obj(meta, ('program', 'subtype')) != 'movie':
-            for season_data in traverse_obj(nextjs_data, (..., 'children', ..., 'playlists', ...)):
-                episode_data = traverse_obj(
-                    season_data, ('videos', lambda _, v: v['videoId'] == video_id, any))
+            for season_data in traverse_obj(nextjs_data, (..., 'playlists', ..., {dict})):
+                episode_data = traverse_obj(season_data, ('videos', lambda _, v: v['videoId'] == video_id, any))
                 if not episode_data:
                     continue
-                episode_title = traverse_obj(
-                    episode_data, 'contextualTitle', 'episodeTitle', expected_type=str)
+                season_number = traverse_obj(season_data, ('season', {int_or_none}))
                 info_dict.update({
-                    'title': episode_title or info_dict.get('title'),
-                    'series': remove_end(info_dict.get('title'), f' - {episode_title}'),
-                    'season_number': traverse_obj(season_data, ('season', {int_or_none})),
+                    'episode': traverse_obj(episode_data, ('episodeTitle', {str})),
                     'episode_number': traverse_obj(episode_data, ('episodeNumber', {int_or_none})),
+                    'season_number': season_number,
+                    'series': self._search_regex(
+                        fr'^(.+)? 
- S{season_number} - ', info_dict.get('title'), 'series', default=None), }) break From 7894a0e98b84fbc85738f7e665371c57567cfbce Mon Sep 17 00:00:00 2001 From: bashonly Date: Thu, 5 Jun 2025 02:07:52 -0500 Subject: [PATCH 4/7] [ie/francetv:site] Refactor and fix extractor Authored by: bashonly --- yt_dlp/extractor/francetv.py | 48 +++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index 5c9f8e36d..edf6708a0 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -1,4 +1,3 @@ -import json import re import urllib.parse @@ -19,7 +18,11 @@ unsmuggle_url, url_or_none, ) -from ..utils.traversal import find_element, traverse_obj +from ..utils.traversal import ( + find_element, + get_first, + traverse_obj, +) class FranceTVBaseInfoExtractor(InfoExtractor): @@ -258,7 +261,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): _TESTS = [{ 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', 'info_dict': { - 'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1', # old: c5bda21d-2c6f-4470-8849-3d8327adb2ba' + 'id': 'b2cf9fd8-e971-4757-8651-848f2772df61', # old: ec217ecc-0733-48cf-ac06-af1347b849d1 'ext': 'mp4', 'title': '13h15, le dimanche... - Les mystères de Jésus', 'timestamp': 1502623500, @@ -269,7 +272,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [FranceTVIE.ie_key()], + 'skip': 'Unfortunately, this video is no longer available', }, { # geo-restricted 'url': 'https://www.france.tv/enfants/six-huit-ans/foot2rue/saison-1/3066387-duel-au-vieux-port.html', @@ -287,7 +290,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1441, }, - 'skip': 'No longer available', + 'skip': 'Unfortunately, this video is no longer available', }, { # geo-restricted livestream (workflow == 'token-akamai') 'url': 'https://www.france.tv/france-4/direct.html', @@ -308,6 +311,19 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): 'live_status': 'is_live', }, 'params': {'skip_download': 'livestream'}, + }, { + # Not geo-restricted + 'url': 'https://www.france.tv/france-2/la-maison-des-maternelles/5574051-nous-sommes-amis-et-nous-avons-fait-un-enfant-ensemble.html', + 'info_dict': { + 'id': 'b448bfe4-9fe7-11ee-97d8-2ba3426fa3df', + 'ext': 'mp4', + 'title': 'Nous sommes amis et nous avons fait un enfant ensemble - Émission du jeudi 21 décembre 2023', + 'duration': 1065, + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1703147921, + 'upload_date': '20231221', + }, + 'params': {'skip_download': 'm3u8'}, }, { # france3 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', @@ -342,30 +358,16 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor): 'only_matching': True, }] - # XXX: For parsing next.js v15+ data; see also yt_dlp.extractor.goplay - def _find_json(self, s): - return self._search_json( - r'\w+\s*:\s*', s, 'next js data', None, contains_pattern=r'\[(?s:.+)\]', default=None) - def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + nextjs_data = self._search_nextjs_v13_data(webpage, display_id) - nextjs_data = traverse_obj( - re.findall(r']*>\s*self\.__next_f\.push\(\s*(\[.+?\])\s*\);?\s*', webpage), - (..., {json.loads}, ..., {self._find_json}, ..., 'children', ..., ..., 'children', ..., ..., 'children')) - - if 
traverse_obj(nextjs_data, (..., ..., 'children', ..., 'isLive', {bool}, any)):
+        if get_first(nextjs_data, ('isLive', {bool})):
             # For livestreams we need the id of the stream instead of the currently airing episode id
-            video_id = traverse_obj(nextjs_data, (
-                ..., ..., 'children', ..., 'children', ..., 'children', ..., 'children', ..., ...,
-                'children', ..., ..., 'children', ..., ..., 'children', (..., (..., ...)),
-                'options', 'id', {str}, any))
+            video_id = get_first(nextjs_data, ('options', 'id', {str}))
         else:
-            video_id = traverse_obj(nextjs_data, (
-                ..., ..., ..., 'children',
-                lambda _, v: v['video']['url'] == urllib.parse.urlparse(url).path,
-                'video', ('playerReplayId', 'siId'), {str}, any))
+            video_id = get_first(nextjs_data, ('video', ('playerReplayId', 'siId'), {str}))

         if not video_id:
             raise ExtractorError('Unable to extract video ID')

From 9986c49ac6952de619a887b116fc94a58306356d Mon Sep 17 00:00:00 2001
From: bashonly
Date: Sun, 22 Jun 2025 16:18:11 -0500
Subject: [PATCH 5/7] cleanup

Authored by: bashonly
---
 yt_dlp/extractor/common.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 125568305..10ca1257f 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1784,10 +1784,6 @@ def _search_nextjs_v13_data(self, webpage, video_id, fatal=True):
         nextjs_data = []
         if not fatal and not isinstance(webpage, str):
             return nextjs_data
-        # This regex pattern can afford to be and should be strict
-        # Ref: https://github.com/vercel/next.js/commit/5a4a08fdce91a038f2ed3a70568d3ed040403150
-        # /packages/next/src/server/app-render/use-flight-response.tsx
-        flight_segments = re.findall(r'<script\b[^>]*>self\.__next_f\.push\((\[.+?\])\)</script>', webpage)

         def flatten(flight_data):
             if not isinstance(flight_data, list) or not flight_data:
@@ -1805,7 +1801,10 @@ def flatten(flight_data):
             for f in flight_data:
                 flatten(f)

-        for flight_segment in flight_segments:
+        # The flight segments regex pattern can afford to be (and should be) strict
+        # Ref: https://github.com/vercel/next.js/commit/5a4a08fdce91a038f2ed3a70568d3ed040403150
+        # /packages/next/src/server/app-render/use-flight-response.tsx
+        for flight_segment in re.findall(r'<script\b[^>]*>self\.__next_f\.push\((\[.+?\])\)</script>', webpage):
             segment = self._parse_json(flight_segment, video_id, fatal=fatal, errnote=None if fatal else False)
             # Some earlier versions of next.js "optimized" away this array structure; this is unsupported
             # Ref: https://github.com/vercel/next.js/commit/0123a9d5c9a9a77a86f135b7ae30b46ca986d761

From 1ded2a3faa50ae0b51aae1ea606d867ef4e78278 Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Sun, 22 Jun 2025 22:25:51 +0000
Subject: [PATCH 6/7] precision

---
 yt_dlp/extractor/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 53023b642..893b7f41f 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1794,7 +1794,7 @@ def flatten(flight_data):
                 if not isinstance(data, dict):
                     return
                 children = data.pop('children', None)
-                if data and name and name[0] == '$':
+                if data and isinstance(name, str) and name[:1] == '$':
                     # It is useful hydration JSON data
                     nextjs_data.append(data)
                 flatten(children)

From 3f9542f4e8dffc8d14ae88ac1851c167cd20f44d Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Sun, 22 Jun 2025 22:26:45 +0000
Subject: [PATCH 7/7] simplify

---
 yt_dlp/extractor/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 893b7f41f..9c17f721d 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1787,7 +1787,7 @@ def _search_nextjs_v13_data(self, webpage, video_id, fatal=True):
             return nextjs_data

         def flatten(flight_data):
-            if not isinstance(flight_data, list) or not flight_data:
+            if not isinstance(flight_data, list):
                 return
             if len(flight_data) == 4 and flight_data[0] == '$':
                 _, name, _, data = flight_data
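
For context, here is a minimal sketch of how an extractor might consume the new helper once this series lands, modeled on the ninenow and goplay changes above. The ExampleIE class, its URL pattern, and the 'video', 'id', 'title' and 'url' keys are hypothetical placeholders for illustration; they are not part of the patches.

    from .common import InfoExtractor
    from ..utils.traversal import get_first, traverse_obj


    class ExampleIE(InfoExtractor):  # hypothetical extractor, for illustration only
        _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[\w-]+)'

        def _real_extract(self, url):
            display_id = self._match_id(url)
            webpage = self._download_webpage(url, display_id)
            # Returns a flat list of hydration dicts parsed from the page's
            # self.__next_f.push(...) flight segments (empty list if fatal=False and nothing matches)
            nextjs_data = self._search_nextjs_v13_data(webpage, display_id)
            # Pick the first dict that carries the assumed 'video' payload
            video_data = get_first(nextjs_data, ('video', {dict}))
            return {
                'id': video_data['id'],
                'title': traverse_obj(video_data, ('title', {str})),
                'url': video_data['url'],
            }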