mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-30 22:25:19 +00:00 
			
		
		
		
	[ie] Add _search_nuxt_json helper (#13386)
				
					
				
			* Adds InfoExtractor._search_nuxt_json for webpage extraction
			* Adds InfoExtractor._resolve_nuxt_array for direct use with payload JSON
			* Adds yt_dlp.utils.jslib module for Python solutions to common JavaScript libraries
			* Adds devalue.parse and devalue.parse_iter to jslib utils

			Ref:
			* 9e503be0f2
			* f3fd2aa93d/src/parse.js

			Authored by: bashonly, Grub4K
			Co-authored-by: Simon Sawicki <contact@grub4k.dev>
This commit is contained in:
		| @@ -101,6 +101,7 @@ from ..utils import ( | ||||
|     xpath_with_ns, | ||||
| ) | ||||
| from ..utils._utils import _request_dump_filename | ||||
| from ..utils.jslib import devalue | ||||
| 
 | ||||
| 
 | ||||
| class InfoExtractor: | ||||
| @@ -1795,6 +1796,63 @@ class InfoExtractor: | ||||
|         ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) | ||||
|         return traverse_obj(ret, traverse) or {} | ||||
| 
 | ||||
|     def _resolve_nuxt_array(self, array, video_id, *, fatal=True, default=NO_DEFAULT): | ||||
|         """Resolves Nuxt rich JSON payload arrays""" | ||||
|         # Ref: https://github.com/nuxt/nuxt/commit/9e503be0f2a24f4df72a3ccab2db4d3e63511f57 | ||||
|         #      https://github.com/nuxt/nuxt/pull/19205 | ||||
|         if default is not NO_DEFAULT: | ||||
|             fatal = False | ||||
| 
 | ||||
|         if not isinstance(array, list) or not array: | ||||
|             error_msg = 'Unable to resolve Nuxt JSON data: invalid input' | ||||
|             if fatal: | ||||
|                 raise ExtractorError(error_msg, video_id=video_id) | ||||
|             elif default is NO_DEFAULT: | ||||
|                 self.report_warning(error_msg, video_id=video_id) | ||||
|             return {} if default is NO_DEFAULT else default | ||||
| 
 | ||||
|         def indirect_reviver(data): | ||||
|             return data | ||||
| 
 | ||||
|         def json_reviver(data): | ||||
|             return json.loads(data) | ||||
| 
 | ||||
|         gen = devalue.parse_iter(array, revivers={ | ||||
|             'NuxtError': indirect_reviver, | ||||
|             'EmptyShallowRef': json_reviver, | ||||
|             'EmptyRef': json_reviver, | ||||
|             'ShallowRef': indirect_reviver, | ||||
|             'ShallowReactive': indirect_reviver, | ||||
|             'Ref': indirect_reviver, | ||||
|             'Reactive': indirect_reviver, | ||||
|         }) | ||||
| 
 | ||||
|         while True: | ||||
|             try: | ||||
|                 error_msg = f'Error resolving Nuxt JSON: {gen.send(None)}' | ||||
|                 if fatal: | ||||
|                     raise ExtractorError(error_msg, video_id=video_id) | ||||
|                 elif default is NO_DEFAULT: | ||||
|                     self.report_warning(error_msg, video_id=video_id, only_once=True) | ||||
|                 else: | ||||
|                     self.write_debug(f'{video_id}: {error_msg}', only_once=True) | ||||
|             except StopIteration as error: | ||||
|                 return error.value or ({} if default is NO_DEFAULT else default) | ||||
| 
 | ||||
|     def _search_nuxt_json(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT): | ||||
|         """Parses metadata from Nuxt rich JSON payloads embedded in HTML""" | ||||
|         passed_default = default is not NO_DEFAULT | ||||
| 
 | ||||
|         array = self._search_json( | ||||
|             r'<script\b[^>]+\bid="__NUXT_DATA__"[^>]*>', webpage, | ||||
|             'Nuxt JSON data', video_id, contains_pattern=r'\[(?s:.+)\]', | ||||
|             fatal=fatal, default=NO_DEFAULT if not passed_default else None) | ||||
| 
 | ||||
|         if not array: | ||||
|             return default if passed_default else {} | ||||
| 
 | ||||
|         return self._resolve_nuxt_array(array, video_id, fatal=fatal, default=default) | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _hidden_inputs(html): | ||||
|         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 bashonly
					bashonly