diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 7c3825f779..40dd05e136 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -1969,21 +1969,26 @@ def test_search_nextjs_v13_data(self): ''' - EXPECTED = [{ - 'foo': 'bar', - }, { - 'meta': { - 'dateCreated': 1730489700, - 'uuid': '40cac41d-8d29-4ef5-aa11-75047b9f0907', + EXPECTED = { + '18': { + 'foo': 'bar', }, - }, { - 'duplicated_field_name': {'x': 1}, - }, { - 'duplicated_field_name': {'y': 2}, - }] + '16': { + 'meta': { + 'dateCreated': 1730489700, + 'uuid': '40cac41d-8d29-4ef5-aa11-75047b9f0907', + }, + }, + '19': { + 'duplicated_field_name': {'x': 1}, + }, + '20': { + 'duplicated_field_name': {'y': 2}, + }, + } self.assertEqual(self.ie._search_nextjs_v13_data(HTML, None), EXPECTED) - self.assertEqual(self.ie._search_nextjs_v13_data('', None, fatal=False), []) - self.assertEqual(self.ie._search_nextjs_v13_data(None, None, fatal=False), []) + self.assertEqual(self.ie._search_nextjs_v13_data('', None, fatal=False), {}) + self.assertEqual(self.ie._search_nextjs_v13_data(None, None, fatal=False), {}) def test_search_nuxt_json(self): HTML_TMPL = '' diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index a3ff5a1c0b..d601e17514 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1785,7 +1785,7 @@ def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAU def _search_nextjs_v13_data(self, webpage, video_id, fatal=True): """Parses Next.js app router flight data that was introduced in Next.js v13""" - nextjs_data = [] + nextjs_data = {} if not fatal and not isinstance(webpage, str): return nextjs_data @@ -1797,9 +1797,9 @@ def flatten(flight_data): if not isinstance(data, dict): return children = data.pop('children', None) - if data and isinstance(name, str) and name.startswith('$'): + if data and isinstance(name, str) and re.fullmatch(r'\$L[0-9a-f]+', name): # It is useful hydration JSON data - nextjs_data.append(data) + nextjs_data[name[2:]] = data flatten(children) return for f in flight_data: @@ -1823,10 +1823,16 @@ def flatten(flight_data): flight_text += chunk for f in flight_text.splitlines(): - prefix, _, body = f.partition(':') - if body.startswith('[') and body.endswith(']') and re.fullmatch(r'[0-9a-f]{1,3}', prefix.lstrip()): - # The body isn't necessarily valid JSON, so this should always be non-fatal + prefix, _, body = f.lstrip().partition(':') + if not re.fullmatch(r'[0-9a-f]+', prefix): + continue + # The body still isn't guaranteed to be valid JSON, so parsing should always be non-fatal + if body.startswith('[') and body.endswith(']'): flatten(self._parse_json(body, video_id, fatal=False, errnote=False)) + elif body.startswith('{') and body.endswith('}'): + data = self._parse_json(body, video_id, fatal=False, errnote=False) + if data is not None: + nextjs_data[prefix] = data return nextjs_data