mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-30 22:25:19 +00:00 
			
		
		
		
	[extractor/amazon] Add AmazonReviews extractor (#5857)
				
					
				
			Closes #5766. Authored by: bashonly
This commit is contained in:
		| @@ -87,7 +87,10 @@ from .alura import ( | |||||||
|     AluraCourseIE |     AluraCourseIE | ||||||
| ) | ) | ||||||
| from .amcnetworks import AMCNetworksIE | from .amcnetworks import AMCNetworksIE | ||||||
| from .amazon import AmazonStoreIE | from .amazon import ( | ||||||
|  |     AmazonStoreIE, | ||||||
|  |     AmazonReviewsIE, | ||||||
|  | ) | ||||||
| from .amazonminitv import ( | from .amazonminitv import ( | ||||||
|     AmazonMiniTVIE, |     AmazonMiniTVIE, | ||||||
|     AmazonMiniTVSeasonIE, |     AmazonMiniTVSeasonIE, | ||||||
|   | |||||||
| @@ -1,5 +1,17 @@ | |||||||
|  | import re | ||||||
|  | 
 | ||||||
| from .common import InfoExtractor | from .common import InfoExtractor | ||||||
| from ..utils import ExtractorError, int_or_none | from ..utils import ( | ||||||
|  |     ExtractorError, | ||||||
|  |     clean_html, | ||||||
|  |     float_or_none, | ||||||
|  |     get_element_by_attribute, | ||||||
|  |     get_element_by_class, | ||||||
|  |     int_or_none, | ||||||
|  |     js_to_json, | ||||||
|  |     traverse_obj, | ||||||
|  |     url_or_none, | ||||||
|  | ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class AmazonStoreIE(InfoExtractor): | class AmazonStoreIE(InfoExtractor): | ||||||
| @@ -9,7 +21,7 @@ class AmazonStoreIE(InfoExtractor): | |||||||
|         'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/', |         'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/', | ||||||
|         'info_dict': { |         'info_dict': { | ||||||
|             'id': 'B098XNCHLD', |             'id': 'B098XNCHLD', | ||||||
|             'title': 'md5:dae240564cbb2642170c02f7f0d7e472', |             'title': str, | ||||||
|         }, |         }, | ||||||
|         'playlist_mincount': 1, |         'playlist_mincount': 1, | ||||||
|         'playlist': [{ |         'playlist': [{ | ||||||
| @@ -20,28 +32,32 @@ class AmazonStoreIE(InfoExtractor): | |||||||
|                 'thumbnail': r're:^https?://.*\.jpg$', |                 'thumbnail': r're:^https?://.*\.jpg$', | ||||||
|                 'duration': 34, |                 'duration': 34, | ||||||
|             }, |             }, | ||||||
|         }] |         }], | ||||||
|  |         'expected_warnings': ['Unable to extract data'], | ||||||
|     }, { |     }, { | ||||||
|         'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3', |         'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3', | ||||||
|         'info_dict': { |         'info_dict': { | ||||||
|             'id': 'B0863TXGM3', |             'id': 'B0863TXGM3', | ||||||
|             'title': 'md5:d1d3352428f8f015706c84b31e132169', |             'title': str, | ||||||
|         }, |         }, | ||||||
|         'playlist_mincount': 4, |         'playlist_mincount': 4, | ||||||
|  |         'expected_warnings': ['Unable to extract data'], | ||||||
|     }, { |     }, { | ||||||
|         'url': 'https://www.amazon.com/dp/B0845NXCXF/', |         'url': 'https://www.amazon.com/dp/B0845NXCXF/', | ||||||
|         'info_dict': { |         'info_dict': { | ||||||
|             'id': 'B0845NXCXF', |             'id': 'B0845NXCXF', | ||||||
|             'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e', |             'title': str, | ||||||
|         }, |         }, | ||||||
|         'playlist-mincount': 1, |         'playlist-mincount': 1, | ||||||
|  |         'expected_warnings': ['Unable to extract data'], | ||||||
|     }, { |     }, { | ||||||
|         'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ', |         'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ', | ||||||
|         'info_dict': { |         'info_dict': { | ||||||
|             'id': 'B08WX337PQ', |             'id': 'B08WX337PQ', | ||||||
|             'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e', |             'title': str, | ||||||
|         }, |         }, | ||||||
|         'playlist_mincount': 1, |         'playlist_mincount': 1, | ||||||
|  |         'expected_warnings': ['Unable to extract data'], | ||||||
|     }] |     }] | ||||||
| 
 | 
 | ||||||
|     def _real_extract(self, url): |     def _real_extract(self, url): | ||||||
| @@ -52,7 +68,7 @@ class AmazonStoreIE(InfoExtractor): | |||||||
|             try: |             try: | ||||||
|                 data_json = self._search_json( |                 data_json = self._search_json( | ||||||
|                     r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', id, |                     r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', id, | ||||||
|                     transform_source=lambda x: x.replace(R'\\u', R'\u')) |                     transform_source=js_to_json) | ||||||
|             except ExtractorError as e: |             except ExtractorError as e: | ||||||
|                 retry.error = e |                 retry.error = e | ||||||
| 
 | 
 | ||||||
| @@ -66,3 +82,89 @@ class AmazonStoreIE(InfoExtractor): | |||||||
|             'width': int_or_none(video.get('videoWidth')), |             'width': int_or_none(video.get('videoWidth')), | ||||||
|         } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')] |         } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')] | ||||||
|         return self.playlist_result(entries, playlist_id=id, playlist_title=data_json.get('title')) |         return self.playlist_result(entries, playlist_id=id, playlist_title=data_json.get('title')) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class AmazonReviewsIE(InfoExtractor):
    """Extractor for videos attached to Amazon customer reviews.

    Matches ``/gp/customer-reviews/<id>`` URLs on Amazon domains and returns
    a single-video info dict built from the review's HTML.
    """

    _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/gp/customer-reviews/(?P<id>[^/&#$?]+)'
    _TESTS = [
        {
            'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl',
            'info_dict': {
                'id': 'R10VE9VUSY19L3',
                'ext': 'mp4',
                'title': 'Get squad #Suspicious',
                'description': 'md5:7012695052f440a1e064e402d87e0afb',
                'uploader': 'Kimberly Cronkright',
                'average_rating': 1.0,
                'thumbnail': r're:^https?://.*\.jpg$',
            },
            'expected_warnings': ['Review body was not found in webpage'],
        },
        {
            'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl?language=es_US',
            'info_dict': {
                'id': 'R10VE9VUSY19L3',
                'ext': 'mp4',
                'title': 'Get squad #Suspicious',
                'description': 'md5:7012695052f440a1e064e402d87e0afb',
                'uploader': 'Kimberly Cronkright',
                'average_rating': 1.0,
                'thumbnail': r're:^https?://.*\.jpg$',
            },
            'expected_warnings': ['Review body was not found in webpage'],
        },
        {
            'url': 'https://www.amazon.in/gp/customer-reviews/RV1CO8JN5VGXV/',
            'info_dict': {
                'id': 'RV1CO8JN5VGXV',
                'ext': 'mp4',
                'title': 'Not sure about its durability',
                'description': 'md5:1a252c106357f0a3109ebf37d2e87494',
                'uploader': 'Shoaib Gulzar',
                'average_rating': 2.0,
                'thumbnail': r're:^https?://.*\.jpg$',
            },
            'expected_warnings': ['Review body was not found in webpage'],
        },
    ]

    def _real_extract(self, url):
        """Download the review page and assemble the info dict for its video."""
        review_id = self._match_id(url)

        # Fetch the page on every retry iteration; when the review body element
        # is absent, record that on the retry object instead of failing inline.
        for attempt in self.RetryManager():
            webpage = self._download_webpage(url, review_id)
            review_body = get_element_by_attribute('data-hook', 'review-body', webpage)
            if review_body:
                continue
            attempt.error = ExtractorError('Review body was not found in webpage', expected=True)

        formats = []
        subtitles = {}

        # Manifest advertised via the ``data-video-url`` attribute.
        hls_url = self._search_regex(
            r'data-video-url="([^"]+)"', review_body, 'm3u8 url', default=None)
        if url_or_none(hls_url):
            hls_formats, subtitles = self._extract_m3u8_formats_and_subtitles(
                hls_url, review_id, 'mp4', fatal=False)
            formats += hls_formats

        # Direct file URL carried by the hidden ``video-url`` input.
        direct_url = self._search_regex(
            r'<input[^>]+\bvalue="([^"]+)"[^>]+\bclass="video-url"', review_body, 'mp4 url', default=None)
        if url_or_none(direct_url):
            formats.append({'url': direct_url, 'ext': 'mp4', 'format_id': 'http-mp4'})

        if not formats:
            self.raise_no_formats('No video found for this customer review', expected=True)

        # Metadata is computed in the same order the result dict lists it.
        title = clean_html(get_element_by_attribute('data-hook', 'review-title', webpage))
        if not title:
            title = self._html_extract_title(webpage)

        # The last matching <span> in the review body is used as the description.
        span_contents = re.findall(
            r'<span(?:\s+class="cr-original-review-content")?>(.+?)</span>', review_body)
        description = clean_html(traverse_obj(span_contents, -1))

        uploader = clean_html(get_element_by_class('a-profile-name', webpage))

        # Only the text before the first space of the rating element is parsed.
        rating_html = get_element_by_attribute('data-hook', 'review-star-rating', webpage) or ''
        rating_token = clean_html(rating_html).partition(' ')[0]
        average_rating = float_or_none(rating_token)

        thumbnail = self._search_regex(
            r'data-thumbnail-url="([^"]+)"', review_body, 'thumbnail', default=None)

        return {
            'id': review_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'average_rating': average_rating,
            'thumbnail': thumbnail,
            'formats': formats,
            'subtitles': subtitles,
        }
|   | |||||||
		Reference in New Issue
	
	Block a user
	 bashonly
					bashonly