diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index bc89b2955e..85b53acf4a 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -1947,6 +1947,67 @@ def test_search_nextjs_data(self):
         with self.assertWarns(DeprecationWarning):
             self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {})
 
+    def test_search_nuxt_json(self):
+        HTML = '''
+            <script data-ssr="true" type="application/json" id="__NUXT_DATA__">[
+                ["ShallowReactive",1],{"data":2,"state":23,"once":28},["ShallowReactive",3],
+                {"$abcdef123456":4},{"podcast":5,"activeEpisodeData":13},{"podcast":6,"seasons":9},
+                {"title":7,"id":8},"Series Title","podcast-id-01",[10,11,12],1,2,3,
+                {"episode":14,"creators":17,"trick_data":19,"empty_list":22},{"title":15,"id":16},
+                "Episode Title","episode-id-99",[18],"Podcast Creator",[20,21],99,"gotcha",[],
+                ["Reactive",24],{"$ssite-config":25},{"env":26,"name":27},"production",
+                "podcast-website",["Set"]
+            ]</script>
+        '''
+        DATA = {
+            'podcast': {
+                'podcast': {
+                    'title': 'Series Title',
+                    'id': 'podcast-id-01',
+                },
+                'seasons': [1, 2, 3],
+            },
+            'activeEpisodeData': {
+                'episode': {
+                    'title': 'Episode Title',
+                    'id': 'episode-id-99',
+                },
+                'creators': ['Podcast Creator'],
+                'trick_data': [99, 'gotcha'],
+                'empty_list': [],
+            },
+        }
+        FULL = {
+            'data': {
+                '$abcdef123456': DATA,
+            },
+            'state': {
+                '$ssite-config': {
+                    'env': 'production',
+                    'name': 'podcast-website',
+                },
+            },
+            'once': ['Set'],
+        }
+        BAD_HTML = '''
+            <script data-ssr="true" type="application/json" id="__NUXT_DATA__">[{"data":1},{"$key":1337}]</script>
+        '''
+        SPECIAL_HTML = '''
+            <script type="application/json" id="__NUXT_DATA__">[{"data":1},{"$key":2},{"title":3,"missing":-1,"flag":true},"Title"]</script>
+        '''
+
+        self.assertEqual(self.ie._search_nuxt_json(HTML, None), DATA)
+        self.assertEqual(self.ie._search_nuxt_json(HTML, None, traverse=None), FULL)
+        self.assertEqual(self.ie._search_nuxt_json('', None, fatal=False), {})
+        self.assertEqual(self.ie._search_nuxt_json(BAD_HTML, None, fatal=False), {})
+        self.assertEqual(self.ie._search_nuxt_json(HTML, None, fatal=False, allow_recursion=1), {})
+        with self.assertRaisesRegex(ExtractorError, r'recursion limit reached'):
+            self.ie._search_nuxt_json(HTML, None, allow_recursion=1)
+        # Negative sentinels must not wrap around the array; booleans are not indices
+        self.assertEqual(
+            self.ie._search_nuxt_json(SPECIAL_HTML, None),
+            {'title': 'Title', 'missing': None, 'flag': True})
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 1174bd4f5e..aeb2ec293f 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1795,6 +1795,88 @@ def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal
         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
         return traverse_obj(ret, traverse) or {}
 
+    def _search_nuxt_json(self, webpage, video_id, script_id='__NUXT_DATA__', *, fatal=True,
+                          traverse=('data', ..., {dict}, any), allow_recursion=100):
+        """
+        Parses Nuxt.js metadata when it has already been rendered into a JSON array
+
+        The payload is a flat JSON array: containers store integer indices into
+        that same array instead of nested values, and element 0 is the root.
+
+        @param webpage          HTML in which to look for the <script> payload
+        @param video_id         ID used in error and warning messages
+        @param script_id        "id" attribute of the <script> tag holding the payload
+        @param fatal            Raise ExtractorError on failure; otherwise warn and
+                                leave the unresolvable part as None
+        @param traverse         traverse_obj path applied to the resolved structure;
+                                pass None to get the whole structure
+        @param allow_recursion  Maximum depth of references to follow
+        @returns                The traversed data, or {} if there is none
+        """
+
+        ERROR_MSG = 'Unable to extract NUXT JSON data'
+        # Negative references are sentinels for values that JSON cannot express
+        SPECIAL_VALUES = {
+            -1: None,  # undefined
+            -2: None,  # hole in a sparse array
+            -3: float('nan'),
+            -4: float('inf'),
+            -5: float('-inf'),
+            -6: -0.0,
+        }
+
+        array = self._search_json(
+            fr'<script\b[^>]+\bid="{re.escape(script_id)}"[^>]*>', webpage, script_id,
+            video_id, contains_pattern=r'\[(?s:.+)\]', default=NO_DEFAULT if fatal else [{}])
+        if not array:
+            if fatal:
+                raise ExtractorError(f'{ERROR_MSG}: payload is empty')
+            return {}
+
+        def is_reference(value):
+            # bool is a subclass of int, but true/false are never array indices
+            return isinstance(value, int) and not isinstance(value, bool)
+
+        def resolve(index, allow_recursion):
+            if index < 0:
+                # Never let a negative number wrap around to the end of the array
+                if index not in SPECIAL_VALUES:
+                    raise IndexError(f'invalid reference {index}')
+                return SPECIAL_VALUES[index]
+            return extract_element(array[index], allow_recursion)
+
+        def extract_element(element, allow_recursion):
+            if allow_recursion < 0:
+                msg = f'{ERROR_MSG}: recursion limit reached'
+                if fatal:
+                    raise ExtractorError(msg)
+                self.report_warning(msg, video_id=video_id, only_once=True)
+                return None
+            allow_recursion -= 1
+
+            try:
+                if isinstance(element, list) and element:
+                    # ["Reactive", idx] and ["ShallowReactive", idx] only wrap another element
+                    if (len(element) > 1 and element[0] in ('ShallowReactive', 'Reactive')
+                            and is_reference(element[1])):
+                        return resolve(element[1], allow_recursion)
+                    if all(map(is_reference, element)):
+                        return [resolve(idx, allow_recursion) for idx in element]
+                if isinstance(element, dict):
+                    return {
+                        k: resolve(v, allow_recursion) if is_reference(v) else v
+                        for k, v in element.items()}
+            except IndexError as e:
+                msg = f'{ERROR_MSG}: {e}'
+                if fatal:
+                    raise ExtractorError(msg) from e
+                self.report_warning(msg, video_id=video_id, only_once=True)
+                return None
+
+            return element
+
+        return traverse_obj(extract_element(array[0], allow_recursion), traverse) or {}
+
     @staticmethod
     def _hidden_inputs(html):
         html = re.sub(r'', '', html)