diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index a6a17e035..52c1a6c94 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -1979,26 +1979,25 @@ def test_search_nuxt_json(self):
                 ["Set"]
             ]
         '''
-        DATA = {
-            'podcast': {
-                'podcast': {
-                    'title': 'Series Title',
-                    'id': 'podcast-id-01',
-                },
-                'seasons': [1, 2, 3],
-            },
-            'activeEpisodeData': {
-                'episode': {
-                    'title': 'Episode Title',
-                    'id': 'episode-id-99',
-                },
-                'creators': ['Podcast Creator'],
-                'empty_list': [],
-            },
-        }
-        FULL = {
+        PAYLOAD = {
             'data': {
-                '$abcdef123456': DATA,
+                '$abcdef123456': {
+                    'podcast': {
+                        'podcast': {
+                            'title': 'Series Title',
+                            'id': 'podcast-id-01',
+                        },
+                        'seasons': [1, 2, 3],
+                    },
+                    'activeEpisodeData': {
+                        'episode': {
+                            'title': 'Episode Title',
+                            'id': 'episode-id-99',
+                        },
+                        'creators': ['Podcast Creator'],
+                        'empty_list': [],
+                    },
+                },
             },
             'state': {
                 '$ssite-config': {
@@ -2006,7 +2005,7 @@ def test_search_nuxt_json(self):
                     'name': 'podcast-website',
                 },
             },
-            'once': ['Set'],
+            'once': None,
         }
         BAD_HTML = '''
         '''
-        self.assertEqual(self.ie._search_nuxt_json(HTML, None), DATA)
-        self.assertEqual(self.ie._search_nuxt_json(HTML, None, traverse=None), FULL)
+        self.assertEqual(self.ie._search_nuxt_json(HTML, 'id'), PAYLOAD)
         self.assertEqual(self.ie._search_nuxt_json('', None, fatal=False), {})
         self.assertEqual(self.ie._search_nuxt_json(BAD_HTML, None, fatal=False), {})
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 85221e0fc..fc31a4804 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1795,39 +1795,86 @@ def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal
         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
         return traverse_obj(ret, traverse) or {}
 
-    def _search_nuxt_json(self, webpage, video_id, *, fatal=True, traverse=('data', ..., {dict}, any)):
-        """Parses metadata from Nuxt rich JSON payload arrays"""
+    def _search_nuxt_json(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT):
+        """
+        Parses metadata from Nuxt rich JSON payload arrays
+
+        @param webpage    HTML to search for the "__NUXT_DATA__" script element
+        @param video_id   ID used when reporting errors, warnings and debug output
+        @param fatal      Whether to raise ExtractorError on failure.
+                          Has no effect if a default is given
+        @param default    Value to return on failure or when the payload is empty.
+                          If omitted, {} is returned in those cases
+        @returns          The payload with all array references resolved
+        """
         # Ref: https://github.com/nuxt/nuxt/commit/9e503be0f2a24f4df72a3ccab2db4d3e63511f57
         #      https://github.com/nuxt/nuxt/pull/19205
-        array = self._search_json(
-            r'<script\b[^>]+\bid="__NUXT_DATA__"[^>]*>', webpage, 'nuxt data', video_id,
-            contains_pattern=r'\[(?s:.+)\]', default=NO_DEFAULT if fatal else [{}])
+        if default is not NO_DEFAULT:
+            fatal = False
+
+        try:
+            array = self._search_json(
+                r'<script\b[^>]+\bid="__NUXT_DATA__"[^>]*>', webpage,
+                'Nuxt JSON data', video_id, contains_pattern=r'\[(?s:.+)\]')
+        except ExtractorError as e:
+            if fatal:
+                raise
+            if default is NO_DEFAULT:
+                self.report_warning(e.orig_msg)
+                return {}
+            return default
+
+        IGNORED_TYPES = ('Map', 'Set', 'Ref', 'ShallowRef', 'EmptyRef', 'EmptyShallowRef', 'NuxtError')
+        # The payload format (devalue) encodes these values as negative
+        # pseudo-references instead of storing them in the array
+        SPECIAL_VALUES = {
+            -1: None,  # undefined
+            -2: None,  # hole in a sparse array
+            -3: float('nan'),
+            -4: float('inf'),
+            -5: float('-inf'),
+            -6: -0.0,
+        }
+
+        def resolve(index):
+            """Returns the extracted value that an array reference points to"""
+            if isinstance(index, bool) or not isinstance(index, int):
+                raise IndexError(f'invalid reference {index!r}')
+            if index < 0:
+                # Must not wrap around to the end of the array like a Python index
+                return SPECIAL_VALUES.get(index)
+            return extract_element(array[index])
 
         def extract_element(element):
-            try:
-                if isinstance(element, list) and element:
+            if isinstance(element, list):
+                if element and isinstance(element[0], str):
                     if element[0] in ('ShallowReactive', 'Reactive') and isinstance(element[1], int):
-                        return extract_element(array[element[1]])
-                    if all(isinstance(ele, int) for ele in element):
-                        return [extract_element(array[ele]) for ele in element]
-                if isinstance(element, dict):
-                    ret = {}
-                    for k, v in element.items():
-                        if isinstance(v, int):
-                            ret[k] = extract_element(array[v])
-                        else:
-                            ret[k] = v
-                    return ret
-            except IndexError as e:
-                error_msg = f'Unable to extract NUXT JSON data: {e}'
-                if not fatal:
-                    self.report_warning(error_msg, video_id=video_id, only_once=True)
+                        return resolve(element[1])
+                    if element[0] not in IGNORED_TYPES:
+                        self.write_debug(
+                            f'{video_id}: Discarding unsupported type in Nuxt payload: {element[0]}',
+                            only_once=True)
                     return None
-                raise ExtractorError(error_msg)
-
+                return [resolve(ele) for ele in element]
+            if isinstance(element, dict):
+                return {k: resolve(v) for k, v in element.items()}
             return element
 
-        return traverse_obj(extract_element(array[0]), traverse) or {}
+        try:
+            payload = extract_element(array[0])
+        except IndexError as e:
+            error_msg = f'Unable to extract Nuxt JSON data: {e}'
+            if fatal:
+                raise ExtractorError(error_msg)
+            if default is NO_DEFAULT:
+                self.report_warning(error_msg, video_id=video_id)
+                return {}
+            return default
+
+        if default is NO_DEFAULT:
+            default = {}
+
+        return payload or default
 
     @staticmethod
     def _hidden_inputs(html):