mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-07-03 03:48:31 +00:00
[ie] Add _search_nuxt_json
helper
Authored by: bashonly
This commit is contained in:
parent
e1b6062f8c
commit
291301e939
@ -1947,6 +1947,87 @@ def test_search_nextjs_data(self):
|
|||||||
with self.assertWarns(DeprecationWarning):
|
with self.assertWarns(DeprecationWarning):
|
||||||
self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {})
|
self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {})
|
||||||
|
|
||||||
|
def test_search_nuxt_json(self):
    """Check that _search_nuxt_json resolves index references in a __NUXT_DATA__ payload"""

    def search(*args, **kwargs):
        return self.ie._search_nuxt_json(*args, **kwargs)

    # Well-formed payload: integers inside containers refer to positions
    # in the top-level array
    nuxt_html = '''
<script data-ssr="true" id="__NUXT_DATA__" type="application/json">
[
["ShallowReactive",1],
{"data":2,"state":22,"once":26},
["ShallowReactive",3],
{"$abcdef123456":4},
{"podcast":5,"activeEpisodeData":7},
{"podcast":6,"seasons":14},
{"title":10,"id":11},
["Reactive",8],
{"episode":9,"creators":18,"trick_data":19,"empty_list":21},
{"title":12,"id":13},
"Series Title",
"podcast-id-01",
"Episode Title",
"episode-id-99",
[15,16,17],
1,
2,
3,
[20],
[99,"gotcha"],
"Podcast Creator",
[],
{"$ssite-config":23},
{"env":24,"name":25},
"production",
"podcast-website",
["Set"]
]
</script>'''

    # Payload whose list points at indices that do not exist in the array
    broken_html = '''
<script data-ssr="true" id="__NUXT_DATA__" type="application/json">
[
["ShallowReactive",1],
{"data":2},
{"improper_raw_list":3},
[15,16,17]
]
</script>'''

    series = {'title': 'Series Title', 'id': 'podcast-id-01'}
    episode = {'title': 'Episode Title', 'id': 'episode-id-99'}

    # Result expected with the default traversal
    expected_data = {
        'podcast': {'podcast': series, 'seasons': [1, 2, 3]},
        'activeEpisodeData': {
            'episode': episode,
            'creators': ['Podcast Creator'],
            'trick_data': [99, 'gotcha'],
            'empty_list': [],
        },
    }
    # Result expected for the whole payload (traverse=None)
    expected_full = {
        'data': {'$abcdef123456': expected_data},
        'state': {
            '$ssite-config': {'env': 'production', 'name': 'podcast-website'},
        },
        'once': ['Set'],
    }

    self.assertEqual(search(nuxt_html, None), expected_data)
    self.assertEqual(search(nuxt_html, None, traverse=None), expected_full)

    # Non-fatal failures all collapse to an empty dict
    self.assertEqual(search('', None, fatal=False), {})
    self.assertEqual(search(broken_html, None, fatal=False), {})
    self.assertEqual(search(nuxt_html, None, fatal=False, allow_recursion=1), {})

    # Exceeding the recursion budget is an error in fatal mode
    with self.assertRaisesRegex(ExtractorError, r'recursion limit reached'):
        search(nuxt_html, None, allow_recursion=1)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
@ -1795,6 +1795,47 @@ def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal
|
|||||||
ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
|
ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
|
||||||
return traverse_obj(ret, traverse) or {}
|
return traverse_obj(ret, traverse) or {}
|
||||||
|
|
||||||
|
def _search_nuxt_json(self, webpage, video_id, script_id='__NUXT_DATA__', *, fatal=True,
                      traverse=('data', ..., {dict}, any), allow_recursion=100):
    """Parses Nuxt.js metadata when it has already been rendered into a JSON array

    The payload is a flat JSON array whose containers refer to their members
    by integer position in that same array; element 0 is the root. Lists of
    the form ["ShallowReactive", N] or ["Reactive", N] are transparent
    wrappers around element N. Lists that are not made up entirely of
    integer references are returned verbatim.

    @param webpage          HTML that is searched for the <script> payload
    @param video_id         Passed on to the JSON search and to warnings
    @param script_id        Value of the `id` attribute of the <script> tag
    @param fatal            Raise ExtractorError on failure. If False, a
                            reference that cannot be resolved makes its
                            enclosing container None and {} is returned
                            when nothing usable remains
    @param traverse         Path handed to traverse_obj() together with the
                            resolved payload; None returns the whole payload
    @param allow_recursion  Maximum nesting depth that is followed
    @returns                The traversed data, or {} if it is empty
    """

    ERROR_MSG = 'Unable to extract NUXT JSON data'
    array = self._search_json(
        fr'<script\b[^>]+\bid="{re.escape(script_id)}"[^>]*>', webpage, script_id,
        video_id, contains_pattern=r'\[(?s:.+)\]', default=NO_DEFAULT if fatal else [{}])

    # An array without elements has no root to start from; fail the same way
    # as any other extraction error instead of leaking a bare IndexError
    if not array:
        if fatal:
            raise ExtractorError(f'{ERROR_MSG}: empty payload')
        return {}

    def is_index(value):
        # bool is a subclass of int, but True/False are never array positions
        return isinstance(value, int) and not isinstance(value, bool)

    def resolve(index, depth):
        # Python would let a negative index wrap around to the end of the
        # array and silently alias an unrelated element; treat it as "no value".
        # NOTE(review): devalue-style payloads appear to use negative numbers
        # as sentinels (undefined, NaN, ...) - confirm before mapping them
        if index < 0:
            return None
        return extract_element(array[index], depth)

    def extract_element(element, allow_recursion):
        if allow_recursion < 0:
            msg = f'{ERROR_MSG}: recursion limit reached'
            if fatal:
                raise ExtractorError(msg)
            self.report_warning(msg, video_id=video_id, only_once=True)
            return None
        allow_recursion -= 1

        try:
            if isinstance(element, list) and element:
                # The length check keeps a lone ["Reactive"] from raising
                if (len(element) > 1 and element[0] in ('ShallowReactive', 'Reactive')
                        and is_index(element[1])):
                    return resolve(element[1], allow_recursion)
                if all(map(is_index, element)):
                    return [resolve(index, allow_recursion) for index in element]
            elif isinstance(element, dict):
                return {
                    key: resolve(value, allow_recursion) if is_index(value) else value
                    for key, value in element.items()}
        except IndexError as e:
            # A reference pointing past the end of the array
            if not fatal:
                return None
            raise ExtractorError(f'{ERROR_MSG}: {e}') from e

        # Scalars, empty lists and lists that are not pure references
        return element

    return traverse_obj(extract_element(array[0], allow_recursion), traverse) or {}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _hidden_inputs(html):
|
def _hidden_inputs(html):
|
||||||
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
|
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
|
||||||
|
Loading…
Reference in New Issue
Block a user