From b5fea53f2099bed41ba1b17ab0ac87c8dba5a5ec Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Sat, 12 Jul 2025 18:12:05 -0500
Subject: [PATCH] [ie] Rework `_search_nextjs_v13_data` helper (#13711)
Fix 5245231e4a39ecd5595d4337d46d85e150e2430a
Authored by: bashonly
---
test/test_InfoExtractor.py | 31 ++++++++++++++++++-------------
yt_dlp/extractor/common.py | 18 ++++++++++++------
2 files changed, 30 insertions(+), 19 deletions(-)
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index 7c3825f779..40dd05e136 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -1969,21 +1969,26 @@ def test_search_nextjs_v13_data(self):
'''
- EXPECTED = [{
- 'foo': 'bar',
- }, {
- 'meta': {
- 'dateCreated': 1730489700,
- 'uuid': '40cac41d-8d29-4ef5-aa11-75047b9f0907',
+ EXPECTED = {
+ '18': {
+ 'foo': 'bar',
},
- }, {
- 'duplicated_field_name': {'x': 1},
- }, {
- 'duplicated_field_name': {'y': 2},
- }]
+ '16': {
+ 'meta': {
+ 'dateCreated': 1730489700,
+ 'uuid': '40cac41d-8d29-4ef5-aa11-75047b9f0907',
+ },
+ },
+ '19': {
+ 'duplicated_field_name': {'x': 1},
+ },
+ '20': {
+ 'duplicated_field_name': {'y': 2},
+ },
+ }
self.assertEqual(self.ie._search_nextjs_v13_data(HTML, None), EXPECTED)
- self.assertEqual(self.ie._search_nextjs_v13_data('', None, fatal=False), [])
- self.assertEqual(self.ie._search_nextjs_v13_data(None, None, fatal=False), [])
+ self.assertEqual(self.ie._search_nextjs_v13_data('', None, fatal=False), {})
+ self.assertEqual(self.ie._search_nextjs_v13_data(None, None, fatal=False), {})
def test_search_nuxt_json(self):
HTML_TMPL = ''
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index a3ff5a1c0b..d601e17514 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1785,7 +1785,7 @@ def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAU
def _search_nextjs_v13_data(self, webpage, video_id, fatal=True):
"""Parses Next.js app router flight data that was introduced in Next.js v13"""
- nextjs_data = []
+ nextjs_data = {}
if not fatal and not isinstance(webpage, str):
return nextjs_data
@@ -1797,9 +1797,9 @@ def flatten(flight_data):
if not isinstance(data, dict):
return
children = data.pop('children', None)
- if data and isinstance(name, str) and name.startswith('$'):
+ if data and isinstance(name, str) and re.fullmatch(r'\$L[0-9a-f]+', name):
# It is useful hydration JSON data
- nextjs_data.append(data)
+ nextjs_data[name[2:]] = data
flatten(children)
return
for f in flight_data:
@@ -1823,10 +1823,16 @@ def flatten(flight_data):
flight_text += chunk
for f in flight_text.splitlines():
- prefix, _, body = f.partition(':')
- if body.startswith('[') and body.endswith(']') and re.fullmatch(r'[0-9a-f]{1,3}', prefix.lstrip()):
- # The body isn't necessarily valid JSON, so this should always be non-fatal
+ prefix, _, body = f.lstrip().partition(':')
+ if not re.fullmatch(r'[0-9a-f]+', prefix):
+ continue
+ # The body still isn't guaranteed to be valid JSON, so parsing should always be non-fatal
+ if body.startswith('[') and body.endswith(']'):
flatten(self._parse_json(body, video_id, fatal=False, errnote=False))
+ elif body.startswith('{') and body.endswith('}'):
+ data = self._parse_json(body, video_id, fatal=False, errnote=False)
+ if data is not None:
+ nextjs_data[prefix] = data
return nextjs_data