mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-06-28 01:18:30 +00:00
split into 2 methods, support Set and Map, add test data
Authored by: bashonly
This commit is contained in:
parent
0beb5faf3c
commit
593e3ec151
@@ -1952,7 +1952,7 @@ def test_search_nuxt_json(self):
|
|||||||
<script data-ssr="true" id="__NUXT_DATA__" type="application/json">
|
<script data-ssr="true" id="__NUXT_DATA__" type="application/json">
|
||||||
[
|
[
|
||||||
["ShallowReactive",1],
|
["ShallowReactive",1],
|
||||||
{"data":2,"state":21,"once":25},
|
{"data":2,"state":21,"once":25,"_errors":28},
|
||||||
["ShallowReactive",3],
|
["ShallowReactive",3],
|
||||||
{"$abcdef123456":4},
|
{"$abcdef123456":4},
|
||||||
{"podcast":5,"activeEpisodeData":7},
|
{"podcast":5,"activeEpisodeData":7},
|
||||||
@@ -1973,10 +1973,14 @@ def test_search_nuxt_json(self):
|
|||||||
"Podcast Creator",
|
"Podcast Creator",
|
||||||
[],
|
[],
|
||||||
{"$ssite-config":22},
|
{"$ssite-config":22},
|
||||||
{"env":23,"name":24},
|
{"env":23,"name":24,"map":26},
|
||||||
"production",
|
"production",
|
||||||
"podcast-website",
|
"podcast-website",
|
||||||
["Set"]
|
["Set"],
|
||||||
|
["Reactive",27],
|
||||||
|
["Map"],
|
||||||
|
["ShallowReactive",29],
|
||||||
|
{}
|
||||||
]
|
]
|
||||||
</script>'''
|
</script>'''
|
||||||
PAYLOAD = {
|
PAYLOAD = {
|
||||||
@@ -2003,9 +2007,11 @@ def test_search_nuxt_json(self):
|
|||||||
'$ssite-config': {
|
'$ssite-config': {
|
||||||
'env': 'production',
|
'env': 'production',
|
||||||
'name': 'podcast-website',
|
'name': 'podcast-website',
|
||||||
|
'map': {},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
'once': None,
|
'once': [],
|
||||||
|
'_errors': {},
|
||||||
}
|
}
|
||||||
BAD_HTML = '''
|
BAD_HTML = '''
|
||||||
<script data-ssr="true" id="__NUXT_DATA__" type="application/json">
|
<script data-ssr="true" id="__NUXT_DATA__" type="application/json">
|
||||||
|
@@ -1795,29 +1795,21 @@ def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal
|
|||||||
ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
|
ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
|
||||||
return traverse_obj(ret, traverse) or {}
|
return traverse_obj(ret, traverse) or {}
|
||||||
|
|
||||||
def _search_nuxt_json(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT):
|
def _resolve_nuxt_array(self, array, video_id, *, fatal=True, default=NO_DEFAULT):
|
||||||
"""Parses metadata from Nuxt rich JSON payload arrays"""
|
"""Resolves Nuxt rich JSON payload arrays"""
|
||||||
# Ref: https://github.com/nuxt/nuxt/commit/9e503be0f2a24f4df72a3ccab2db4d3e63511f57
|
# Ref: https://github.com/nuxt/nuxt/commit/9e503be0f2a24f4df72a3ccab2db4d3e63511f57
|
||||||
# https://github.com/nuxt/nuxt/pull/19205
|
# https://github.com/nuxt/nuxt/pull/19205
|
||||||
IGNORED_TYPES = ('Map', 'Set', 'EmptyRef', 'EmptyShallowRef', 'NuxtError')
|
|
||||||
ERROR_MSG = 'Unable to extract Nuxt JSON data'
|
ERROR_MSG = 'Unable to extract Nuxt JSON data'
|
||||||
|
|
||||||
if default is not NO_DEFAULT:
|
|
||||||
fatal = False
|
|
||||||
|
|
||||||
array = self._search_json(
|
|
||||||
r'<script\b[^>]+\bid="__NUXT_DATA__"[^>]*>', webpage, 'Nuxt JSON data', video_id,
|
|
||||||
contains_pattern=r'\[(?s:.+)\]', default=NO_DEFAULT if fatal else [])
|
|
||||||
|
|
||||||
result = [None]
|
result = [None]
|
||||||
stack = [(result, 0, 0)]
|
stack = [(result, 0, 0)]
|
||||||
while stack:
|
while stack:
|
||||||
target, index, source = stack.pop()
|
target, index, source = stack.pop()
|
||||||
if 0 <= source < len(array):
|
if 0 <= source < len(array):
|
||||||
element = array[source]
|
element = array[source]
|
||||||
elif fatal:
|
|
||||||
raise ExtractorError(ERROR_MSG, video_id=video_id)
|
|
||||||
elif default is NO_DEFAULT:
|
elif default is NO_DEFAULT:
|
||||||
|
if fatal:
|
||||||
|
raise ExtractorError(ERROR_MSG, video_id=video_id)
|
||||||
self.report_warning(ERROR_MSG, video_id=video_id)
|
self.report_warning(ERROR_MSG, video_id=video_id)
|
||||||
return {}
|
return {}
|
||||||
else:
|
else:
|
||||||
@@ -1827,11 +1819,16 @@ def _search_nuxt_json(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT
|
|||||||
if element[0] in ('ShallowReactive', 'Reactive', 'ShallowRef', 'Ref'):
|
if element[0] in ('ShallowReactive', 'Reactive', 'ShallowRef', 'Ref'):
|
||||||
stack.append((target, index, element[1]))
|
stack.append((target, index, element[1]))
|
||||||
continue
|
continue
|
||||||
if element[0] not in IGNORED_TYPES:
|
if element[0] == 'Map':
|
||||||
|
target[index] = {}
|
||||||
|
elif element[0] == 'Set':
|
||||||
|
target[index] = []
|
||||||
|
else:
|
||||||
|
target[index] = None
|
||||||
|
if element[0] not in ('EmptyRef', 'EmptyShallowRef', 'NuxtError'):
|
||||||
self.write_debug(
|
self.write_debug(
|
||||||
f'{video_id}: Discarding unsupported type in Nuxt payload: {element[0]}',
|
f'{video_id}: Discarding unsupported type in Nuxt payload: {element[0]}',
|
||||||
only_once=True)
|
only_once=True)
|
||||||
target[index] = None
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if isinstance(element, list):
|
if isinstance(element, list):
|
||||||
@@ -1850,6 +1847,17 @@ def _search_nuxt_json(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT
|
|||||||
|
|
||||||
return result[0]
|
return result[0]
|
||||||
|
|
||||||
|
def _search_nuxt_json(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT):
|
||||||
|
"""Parses metadata from Nuxt rich JSON payloads embedded in HTML"""
|
||||||
|
if default is not NO_DEFAULT:
|
||||||
|
fatal = False
|
||||||
|
|
||||||
|
array = self._search_json(
|
||||||
|
r'<script\b[^>]+\bid="__NUXT_DATA__"[^>]*>', webpage, 'Nuxt JSON data', video_id,
|
||||||
|
contains_pattern=r'\[(?s:.+)\]', default=NO_DEFAULT if fatal else [])
|
||||||
|
|
||||||
|
return self._resolve_nuxt_array(array, video_id, fatal=fatal, default=default)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _hidden_inputs(html):
|
def _hidden_inputs(html):
|
||||||
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
|
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
|
||||||
|
Loading…
Reference in New Issue
Block a user