[ie] Rework _search_nextjs_v13_data helper (#13711)

Fix 5245231e4a

Authored by: bashonly
2025-12-05 07:45:20 +00:00 · 2025-07-12 18:12:05 -05:00
parent 5245231e4a
commit b5fea53f20
2 changed files with 30 additions and 19 deletions
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -1969,21 +1969,26 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
            <script>self.__next_f.push([1,"df:[\"$undefined\",[\"$\",\"div\",null,{\"children\":[\"$\",\"$L17\",null,{}],\"do_not_include_this_field\":\"fail\"}],[\"$\",\"div\",null,{\"children\":[[\"$\",\"$L19\",null,{\"duplicated_field_name\":{\"x\":1}}],[\"$\",\"$L20\",null,{\"duplicated_field_name\":{\"y\":2}}]]}],\"$undefined\"]\n"])</script>
            <script>self.__next_f.push([3,"MzM6WyIkIiwiJEwzMiIsbnVsbCx7ImRlY29kZWQiOiJzdWNjZXNzIn1d"])</script>
            '''
-        EXPECTED = [{
-            'foo': 'bar',
-        }, {
-            'meta': {
-                'dateCreated': 1730489700,
-                'uuid': '40cac41d-8d29-4ef5-aa11-75047b9f0907',
+        EXPECTED = {
+            '18': {
+                'foo': 'bar',
            },
-        }, {
-            'duplicated_field_name': {'x': 1},
-        }, {
-            'duplicated_field_name': {'y': 2},
-        }]
+            '16': {
+                'meta': {
+                    'dateCreated': 1730489700,
+                    'uuid': '40cac41d-8d29-4ef5-aa11-75047b9f0907',
+                },
+            },
+            '19': {
+                'duplicated_field_name': {'x': 1},
+            },
+            '20': {
+                'duplicated_field_name': {'y': 2},
+            },
+        }
        self.assertEqual(self.ie._search_nextjs_v13_data(HTML, None), EXPECTED)
-        self.assertEqual(self.ie._search_nextjs_v13_data('', None, fatal=False), [])
-        self.assertEqual(self.ie._search_nextjs_v13_data(None, None, fatal=False), [])
+        self.assertEqual(self.ie._search_nextjs_v13_data('', None, fatal=False), {})
+        self.assertEqual(self.ie._search_nextjs_v13_data(None, None, fatal=False), {})

    def test_search_nuxt_json(self):
        HTML_TMPL = '<script data-ssr="true" id="__NUXT_DATA__" type="application/json">[{}]</script>'
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1785,7 +1785,7 @@ class InfoExtractor:

    def _search_nextjs_v13_data(self, webpage, video_id, fatal=True):
        """Parses Next.js app router flight data that was introduced in Next.js v13"""
-        nextjs_data = []
+        nextjs_data = {}
        if not fatal and not isinstance(webpage, str):
            return nextjs_data

@@ -1797,9 +1797,9 @@ class InfoExtractor:
                if not isinstance(data, dict):
                    return
                children = data.pop('children', None)
-                if data and isinstance(name, str) and name.startswith('$'):
+                if data and isinstance(name, str) and re.fullmatch(r'\$L[0-9a-f]+', name):
                    # It is useful hydration JSON data
-                    nextjs_data.append(data)
+                    nextjs_data[name[2:]] = data
                flatten(children)
                return
            for f in flight_data:
@@ -1823,10 +1823,16 @@ class InfoExtractor:
                flight_text += chunk

        for f in flight_text.splitlines():
-            prefix, _, body = f.partition(':')
-            if body.startswith('[') and body.endswith(']') and re.fullmatch(r'[0-9a-f]{1,3}', prefix.lstrip()):
-                # The body isn't necessarily valid JSON, so this should always be non-fatal
+            prefix, _, body = f.lstrip().partition(':')
+            if not re.fullmatch(r'[0-9a-f]+', prefix):
+                continue
+            # The body still isn't guaranteed to be valid JSON, so parsing should always be non-fatal
+            if body.startswith('[') and body.endswith(']'):
                flatten(self._parse_json(body, video_id, fatal=False, errnote=False))
+            elif body.startswith('{') and body.endswith('}'):
+                data = self._parse_json(body, video_id, fatal=False, errnote=False)
+                if data is not None:
+                    nextjs_data[prefix] = data

        return nextjs_data