1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-06-27 17:08:32 +00:00

[ie] Add _search_nextjs_v13_data helper

Authored by: bashonly
This commit is contained in:
bashonly 2025-06-05 02:04:15 -05:00
parent 4e7c1ea346
commit c840feeba1
No known key found for this signature in database
GPG Key ID: 783F096F253D15B0
2 changed files with 83 additions and 0 deletions

View File

@ -1947,6 +1947,34 @@ def test_search_nextjs_data(self):
with self.assertWarns(DeprecationWarning):
self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {})
def test_search_nextjs_v13_data(self):
    # Fixture: a series of `self.__next_f.push([type, chunk])` script tags.
    # Payload types 0 and 2 must be skipped, type 1 carries plain flight rows,
    # and type 3 carries a base64-encoded flight row.
    flight_html = R'''
<script>(self.__next_f=self.__next_f||[]).push([0])</script>
<script>self.__next_f.push([2,"0:[\"$\",\"$L0\",null,{\"do_not_add_this\":\"fail\"}]\n"])</script>
<script>self.__next_f.push([1,"1:I[46975,[],\"HTTPAccessFallbackBoundary\"]\n2:I[32630,[\"8183\",\"static/chunks/8183-768193f6a9e33cdd.js\"]]\n"])</script>
<script nonce="abc123">self.__next_f.push([1,"e:[false,[\"$\",\"div\",null,{\"children\":[\"$\",\"$L18\",null,{\"foo\":\"bar\"}]}],false]\n"])</script>
<script>self.__next_f.push([1,"2a:[[\"$\",\"div\",null,{\"className\":\"flex flex-col\",\"children\":[]}],[\"$\",\"$L16\",null,{\"meta\":{\"dateCreated\":1730489700,\"uuid\":\"40cac41d-8d29-4ef5-aa11-75047b9f0907\"}}]]\n"])</script>
<script>self.__next_f.push([1,"df:[\"$undefined\",[\"$\",\"div\",null,{\"children\":[\"$\",\"$L17\",null,{}],\"do_not_include_this_field\":\"fail\"}],[\"$\",\"div\",null,{\"children\":[[\"$\",\"$L19\",null,{\"duplicated_field_name\":{\"x\":1}}],[\"$\",\"$L20\",null,{\"duplicated_field_name\":{\"y\":2}}]]}],\"$undefined\"]\n"])</script>
<script>self.__next_f.push([3,"MzM6WyIkIiwiJEwzMiIsbnVsbCx7ImRlY29kZWQiOiJzdWNjZXNzIn1d"])</script>
'''
    # One entry per `$`-named element with non-empty props, in document order
    expected_props = [
        {'foo': 'bar'},
        {'meta': {
            'dateCreated': 1730489700,
            'uuid': '40cac41d-8d29-4ef5-aa11-75047b9f0907',
        }},
        {'duplicated_field_name': {'x': 1}},
        {'duplicated_field_name': {'y': 2}},
        {'decoded': 'success'},
    ]
    parsed_props = self.ie._search_nextjs_v13_data(flight_html, None)
    self.assertEqual(parsed_props, expected_props)

    # An empty or missing webpage yields an empty list in non-fatal mode
    for blank_page in ('', None):
        self.assertEqual(self.ie._search_nextjs_v13_data(blank_page, None, fatal=False), [])
# Run the unittest test runner when this file is executed directly as a script
if __name__ == '__main__':
    unittest.main()

View File

@ -1,4 +1,5 @@
import base64
import binascii
import collections
import functools
import getpass
@ -1778,6 +1779,60 @@ def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAU
r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
def _search_nextjs_v13_data(self, webpage, video_id, fatal=True):
    """Parses Next.js app router flight data that was introduced in Next.js v13

    Collects the props dicts of `["$", "$...", <key>, {props}]` elements found
    in the `self.__next_f.push([type, chunk])` inline scripts of the webpage.
    The `children` prop is removed from each dict and searched recursively.

    @param webpage      The HTML to search; a non-string value is only
                        tolerated when fatal is false
    @param video_id     Identifier passed on to the parsing/reporting helpers
    @param fatal        Whether an unparsable segment or an undecodable
                        base64 chunk should raise instead of being skipped
    @returns            A list of props dicts in the order they were found
                        (empty if nothing was found)
    """
    nextjs_data = []
    if not fatal and not isinstance(webpage, str):
        return nextjs_data

    # This regex pattern can afford to be and should be strict
    # Ref: https://github.com/vercel/next.js/commit/5a4a08fdce91a038f2ed3a70568d3ed040403150
    #      /packages/next/src/server/app-render/use-flight-response.tsx
    flight_segments = re.findall(r'<script[^>]*>self\.__next_f\.push\((\[.+?\])\)</script>', webpage)

    def flatten(flight_data):
        if not isinstance(flight_data, list) or not flight_data:
            return
        if len(flight_data) == 4 and flight_data[0] == '$':
            _, name, _, data = flight_data
            if not isinstance(data, dict):
                return
            children = data.pop('children', None)
            # The element name comes from untrusted JSON and may be any type;
            # only index into it once we know that it is a string
            if data and isinstance(name, str) and name.startswith('$'):
                # It is useful hydration JSON data
                nextjs_data.append(data)
            flatten(children)
            return
        for f in flight_data:
            flatten(f)

    for flight_segment in flight_segments:
        segment = self._parse_json(flight_segment, video_id, fatal=fatal, errnote=None if fatal else False)
        # Some earlier versions of next.js "optimized" away this array structure; this is unsupported
        # Ref: https://github.com/vercel/next.js/commit/0123a9d5c9a9a77a86f135b7ae30b46ca986d761
        if not isinstance(segment, list) or len(segment) != 2:
            self.write_debug(
                f'{video_id}: Unsupported next.js flight data structure detected', only_once=True)
            continue

        payload_type, chunk = segment
        if payload_type == 3:
            try:
                chunk = base64.b64decode(chunk).decode()
            except (TypeError, ValueError, binascii.Error) as e:
                # TypeError: the chunk was not a string at all (e.g. null or a number)
                msg = 'Unable to parse next.js data: unable to decode flight data'
                if not fatal:
                    self.report_warning(msg, video_id=video_id, only_once=True)
                    continue
                raise ExtractorError(msg) from e
        elif payload_type != 1:
            # Ignore useless payload types (0: bootstrap, 2: form state)
            continue

        # The regex search below expects text; skip anything else instead of crashing
        if not isinstance(chunk, str):
            self.write_debug(
                f'{video_id}: Unsupported next.js flight data structure detected', only_once=True)
            continue

        # Not all chunks are complete JSON data; this should always be non-fatal
        # NOTE(review): the pattern is anchored with `^`, so it looks like only a row at the
        # very start of a chunk is considered - confirm whether multi-row chunks need handling
        flatten(self._search_json(
            r'^[\da-f]+:', chunk, 'flight data', video_id,
            default=None, contains_pattern=r'\[.+\]'))

    return nextjs_data
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
"""Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
rectx = re.escape(context_name)