mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 14:45:14 +00:00 
			
		
		
		
	[BostonGlobe] New. Nonstandard version of Brightcove.
The Globe's player markup uses a "data-brightcove-video-id" attribute instead of "data-video-id"; otherwise it is pretty much a standard Brightcove embed. Not every Globe video is a Brightcove video, though, so fall back to the Generic extractor when no such embed is found. Also, move playlist_from_matches() from generic.py to common.py, and use it here. History of these changes can be found in 51170427d4b1143572a498dedaee61863a5b2c5b.
This commit is contained in:
		 John Hawkinson
					John Hawkinson
				
			
				
					committed by
					
						 Yen Chi Hsuan
						Yen Chi Hsuan
					
				
			
			
				
	
			
			
			 Yen Chi Hsuan
						Yen Chi Hsuan
					
				
			
						parent
						
							772b5ff57f
						
					
				
				
					commit
					46b18f2349
				
			
							
								
								
									
										72
									
								
								youtube_dl/extractor/bostonglobe.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										72
									
								
								youtube_dl/extractor/bostonglobe.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,72 @@ | ||||
| # coding: utf-8 | ||||
| from __future__ import unicode_literals | ||||
|  | ||||
| import re | ||||
|  | ||||
| from .common import InfoExtractor | ||||
|  | ||||
| from ..utils import ( | ||||
|     extract_attributes, | ||||
| ) | ||||
|  | ||||
|  | ||||
| class BostonGlobeIE(InfoExtractor): | ||||
|     _VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?' | ||||
|     _TESTS = [ | ||||
|         { | ||||
|             'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html', | ||||
|             'md5': '0a62181079c85c2d2b618c9a738aedaf', | ||||
|             'info_dict': { | ||||
|                 'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood', | ||||
|                 'id': '5320421710001', | ||||
|                 'ext': 'mp4', | ||||
|                 'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.', | ||||
|                 'timestamp': 1486877593, | ||||
|                 'upload_date': '20170212', | ||||
|                 'uploader_id': '245991542', | ||||
|             }, | ||||
|         }, | ||||
|         { | ||||
|             # Embedded youtube video; we hand it off to the Generic extractor. | ||||
|             'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html', | ||||
|             'md5': '582b40327089d5c0c949b3c54b13c24b', | ||||
|             'info_dict': { | ||||
|                 'title': "Who Is Matt Damon's Favorite Batman?", | ||||
|                 'id': 'ZW1QCnlA6Qc', | ||||
|                 'ext': 'mp4', | ||||
|                 'upload_date': '20170217', | ||||
|                 'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb', | ||||
|                 'uploader': 'The Late Late Show with James Corden', | ||||
|                 'uploader_id': 'TheLateLateShow', | ||||
|             }, | ||||
|             'expected_warnings': ['404'], | ||||
|         }, | ||||
|     ] | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         page_id = self._match_id(url) | ||||
|         webpage = self._download_webpage(url, page_id) | ||||
|  | ||||
|         page_title = self._og_search_title(webpage, default=None) | ||||
|  | ||||
|         # <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject"> | ||||
|         entries = [] | ||||
|         for video in re.findall(r'(?i)(<video[^>]+>)', webpage): | ||||
|             attrs = extract_attributes(video) | ||||
|  | ||||
|             video_id = attrs.get('data-brightcove-video-id') | ||||
|             account_id = attrs.get('data-account') | ||||
|             player_id = attrs.get('data-player') | ||||
|             embed = attrs.get('data-embed') | ||||
|  | ||||
|             if video_id and account_id and player_id and embed: | ||||
|                 entries.append( | ||||
|                     'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' | ||||
|                     % (account_id, player_id, embed, video_id)) | ||||
|  | ||||
|         if len(entries) == 0: | ||||
|             return self.url_result(url, 'Generic') | ||||
|         elif len(entries) == 1: | ||||
|             return self.url_result(entries[0], 'BrightcoveNew') | ||||
|         else: | ||||
|             return self.playlist_from_matches(entries, page_id, page_title, ie='BrightcoveNew') | ||||
| @@ -36,34 +36,35 @@ from ..utils import ( | ||||
|     clean_html, | ||||
|     compiled_regex_type, | ||||
|     determine_ext, | ||||
|     determine_protocol, | ||||
|     error_to_compat_str, | ||||
|     ExtractorError, | ||||
|     extract_attributes, | ||||
|     fix_xml_ampersands, | ||||
|     float_or_none, | ||||
|     GeoRestrictedError, | ||||
|     GeoUtils, | ||||
|     int_or_none, | ||||
|     js_to_json, | ||||
|     mimetype2ext, | ||||
|     orderedSet, | ||||
|     parse_codecs, | ||||
|     parse_duration, | ||||
|     parse_iso8601, | ||||
|     parse_m3u8_attributes, | ||||
|     RegexNotFoundError, | ||||
|     sanitize_filename, | ||||
|     sanitized_Request, | ||||
|     sanitize_filename, | ||||
|     unescapeHTML, | ||||
|     unified_strdate, | ||||
|     unified_timestamp, | ||||
|     update_Request, | ||||
|     update_url_query, | ||||
|     urljoin, | ||||
|     url_basename, | ||||
|     xpath_element, | ||||
|     xpath_text, | ||||
|     xpath_with_ns, | ||||
|     determine_protocol, | ||||
|     parse_duration, | ||||
|     mimetype2ext, | ||||
|     update_Request, | ||||
|     update_url_query, | ||||
|     parse_m3u8_attributes, | ||||
|     extract_attributes, | ||||
|     parse_codecs, | ||||
|     urljoin, | ||||
| ) | ||||
|  | ||||
|  | ||||
| @@ -714,6 +715,13 @@ class InfoExtractor(object): | ||||
|             video_info['title'] = video_title | ||||
|         return video_info | ||||
|  | ||||
|     def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None): | ||||
|         urlrs = orderedSet( | ||||
|             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) | ||||
|             for m in matches) | ||||
|         return self.playlist_result( | ||||
|             urlrs, playlist_id=video_id, playlist_title=video_title) | ||||
|  | ||||
|     @staticmethod | ||||
|     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): | ||||
|         """Returns a playlist""" | ||||
|   | ||||
| @@ -117,6 +117,7 @@ from .bleacherreport import ( | ||||
| from .blinkx import BlinkxIE | ||||
| from .bloomberg import BloombergIE | ||||
| from .bokecc import BokeCCIE | ||||
| from .bostonglobe import BostonGlobeIE | ||||
| from .bpb import BpbIE | ||||
| from .br import BRIE | ||||
| from .bravotv import BravoTVIE | ||||
|   | ||||
| @@ -1841,14 +1841,6 @@ class GenericIE(InfoExtractor): | ||||
|         video_description = self._og_search_description(webpage, default=None) | ||||
|         video_thumbnail = self._og_search_thumbnail(webpage, default=None) | ||||
|  | ||||
|         # Helper method | ||||
|         def _playlist_from_matches(matches, getter=None, ie=None): | ||||
|             urlrs = orderedSet( | ||||
|                 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) | ||||
|                 for m in matches) | ||||
|             return self.playlist_result( | ||||
|                 urlrs, playlist_id=video_id, playlist_title=video_title) | ||||
|  | ||||
|         # Look for Brightcove Legacy Studio embeds | ||||
|         bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) | ||||
|         if bc_urls: | ||||
| @@ -1869,28 +1861,28 @@ class GenericIE(InfoExtractor): | ||||
|         # Look for Brightcove New Studio embeds | ||||
|         bc_urls = BrightcoveNewIE._extract_urls(webpage) | ||||
|         if bc_urls: | ||||
|             return _playlist_from_matches(bc_urls, ie='BrightcoveNew') | ||||
|             return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') | ||||
|  | ||||
|         # Look for ThePlatform embeds | ||||
|         tp_urls = ThePlatformIE._extract_urls(webpage) | ||||
|         if tp_urls: | ||||
|             return _playlist_from_matches(tp_urls, ie='ThePlatform') | ||||
|             return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') | ||||
|  | ||||
|         # Look for Vessel embeds | ||||
|         vessel_urls = VesselIE._extract_urls(webpage) | ||||
|         if vessel_urls: | ||||
|             return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key()) | ||||
|             return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key()) | ||||
|  | ||||
|         # Look for embedded rtl.nl player | ||||
|         matches = re.findall( | ||||
|             r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', | ||||
|             webpage) | ||||
|         if matches: | ||||
|             return _playlist_from_matches(matches, ie='RtlNl') | ||||
|             return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') | ||||
|  | ||||
|         vimeo_urls = VimeoIE._extract_urls(url, webpage) | ||||
|         if vimeo_urls: | ||||
|             return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key()) | ||||
|             return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) | ||||
|  | ||||
|         vid_me_embed_url = self._search_regex( | ||||
|             r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', | ||||
| @@ -1912,25 +1904,25 @@ class GenericIE(InfoExtractor): | ||||
|                 (?:embed|v|p)/.+?) | ||||
|             \1''', webpage) | ||||
|         if matches: | ||||
|             return _playlist_from_matches( | ||||
|                 matches, lambda m: unescapeHTML(m[1])) | ||||
|             return self.playlist_from_matches( | ||||
|                 matches, video_id, video_title, lambda m: unescapeHTML(m[1])) | ||||
|  | ||||
|         # Look for lazyYT YouTube embed | ||||
|         matches = re.findall( | ||||
|             r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) | ||||
|         if matches: | ||||
|             return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) | ||||
|             return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m)) | ||||
|  | ||||
|         # Look for Wordpress "YouTube Video Importer" plugin | ||||
|         matches = re.findall(r'''(?x)<div[^>]+ | ||||
|             class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ | ||||
|             data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) | ||||
|         if matches: | ||||
|             return _playlist_from_matches(matches, lambda m: m[-1]) | ||||
|             return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1]) | ||||
|  | ||||
|         matches = DailymotionIE._extract_urls(webpage) | ||||
|         if matches: | ||||
|             return _playlist_from_matches(matches) | ||||
|             return self.playlist_from_matches(matches, video_id, video_title) | ||||
|  | ||||
|         # Look for embedded Dailymotion playlist player (#3822) | ||||
|         m = re.search( | ||||
| @@ -1939,8 +1931,8 @@ class GenericIE(InfoExtractor): | ||||
|             playlists = re.findall( | ||||
|                 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) | ||||
|             if playlists: | ||||
|                 return _playlist_from_matches( | ||||
|                     playlists, lambda p: '//dailymotion.com/playlist/%s' % p) | ||||
|                 return self.playlist_from_matches( | ||||
|                     playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) | ||||
|  | ||||
|         # Look for embedded Wistia player | ||||
|         match = re.search( | ||||
| @@ -2047,8 +2039,9 @@ class GenericIE(InfoExtractor): | ||||
|         if mobj is not None: | ||||
|             embeds = self._parse_json(mobj.group(1), video_id, fatal=False) | ||||
|             if embeds: | ||||
|                 return _playlist_from_matches( | ||||
|                     embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') | ||||
|                 return self.playlist_from_matches( | ||||
|                     embeds, video_id, video_title, | ||||
|                     getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') | ||||
|  | ||||
|         # Look for Aparat videos | ||||
|         mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) | ||||
| @@ -2110,13 +2103,13 @@ class GenericIE(InfoExtractor): | ||||
|         # Look for funnyordie embed | ||||
|         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) | ||||
|         if matches: | ||||
|             return _playlist_from_matches( | ||||
|                 matches, getter=unescapeHTML, ie='FunnyOrDie') | ||||
|             return self.playlist_from_matches( | ||||
|                 matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') | ||||
|  | ||||
|         # Look for BBC iPlayer embed | ||||
|         matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) | ||||
|         if matches: | ||||
|             return _playlist_from_matches(matches, ie='BBCCoUk') | ||||
|             return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk') | ||||
|  | ||||
|         # Look for embedded RUTV player | ||||
|         rutv_url = RUTVIE._extract_url(webpage) | ||||
| @@ -2131,32 +2124,32 @@ class GenericIE(InfoExtractor): | ||||
|         # Look for embedded SportBox player | ||||
|         sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) | ||||
|         if sportbox_urls: | ||||
|             return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') | ||||
|             return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed') | ||||
|  | ||||
|         # Look for embedded XHamster player | ||||
|         xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) | ||||
|         if xhamster_urls: | ||||
|             return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed') | ||||
|             return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed') | ||||
|  | ||||
|         # Look for embedded TNAFlixNetwork player | ||||
|         tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) | ||||
|         if tnaflix_urls: | ||||
|             return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key()) | ||||
|             return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key()) | ||||
|  | ||||
|         # Look for embedded PornHub player | ||||
|         pornhub_urls = PornHubIE._extract_urls(webpage) | ||||
|         if pornhub_urls: | ||||
|             return _playlist_from_matches(pornhub_urls, ie=PornHubIE.ie_key()) | ||||
|             return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key()) | ||||
|  | ||||
|         # Look for embedded DrTuber player | ||||
|         drtuber_urls = DrTuberIE._extract_urls(webpage) | ||||
|         if drtuber_urls: | ||||
|             return _playlist_from_matches(drtuber_urls, ie=DrTuberIE.ie_key()) | ||||
|             return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key()) | ||||
|  | ||||
|         # Look for embedded RedTube player | ||||
|         redtube_urls = RedTubeIE._extract_urls(webpage) | ||||
|         if redtube_urls: | ||||
|             return _playlist_from_matches(redtube_urls, ie=RedTubeIE.ie_key()) | ||||
|             return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) | ||||
|  | ||||
|         # Look for embedded Tvigle player | ||||
|         mobj = re.search( | ||||
| @@ -2202,12 +2195,12 @@ class GenericIE(InfoExtractor): | ||||
|         # Look for embedded soundcloud player | ||||
|         soundcloud_urls = SoundcloudIE._extract_urls(webpage) | ||||
|         if soundcloud_urls: | ||||
|             return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) | ||||
|             return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) | ||||
|  | ||||
|         # Look for tunein player | ||||
|         tunein_urls = TuneInBaseIE._extract_urls(webpage) | ||||
|         if tunein_urls: | ||||
|             return _playlist_from_matches(tunein_urls) | ||||
|             return self.playlist_from_matches(tunein_urls, video_id, video_title) | ||||
|  | ||||
|         # Look for embedded mtvservices player | ||||
|         mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) | ||||
| @@ -2490,35 +2483,35 @@ class GenericIE(InfoExtractor): | ||||
|         # Look for DBTV embeds | ||||
|         dbtv_urls = DBTVIE._extract_urls(webpage) | ||||
|         if dbtv_urls: | ||||
|             return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key()) | ||||
|             return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key()) | ||||
|  | ||||
|         # Look for Videa embeds | ||||
|         videa_urls = VideaIE._extract_urls(webpage) | ||||
|         if videa_urls: | ||||
|             return _playlist_from_matches(videa_urls, ie=VideaIE.ie_key()) | ||||
|             return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key()) | ||||
|  | ||||
|         # Look for 20 minuten embeds | ||||
|         twentymin_urls = TwentyMinutenIE._extract_urls(webpage) | ||||
|         if twentymin_urls: | ||||
|             return _playlist_from_matches( | ||||
|                 twentymin_urls, ie=TwentyMinutenIE.ie_key()) | ||||
|             return self.playlist_from_matches( | ||||
|                 twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) | ||||
|  | ||||
|         # Look for Openload embeds | ||||
|         openload_urls = OpenloadIE._extract_urls(webpage) | ||||
|         if openload_urls: | ||||
|             return _playlist_from_matches( | ||||
|                 openload_urls, ie=OpenloadIE.ie_key()) | ||||
|             return self.playlist_from_matches( | ||||
|                 openload_urls, video_id, video_title, ie=OpenloadIE.ie_key()) | ||||
|  | ||||
|         # Look for VideoPress embeds | ||||
|         videopress_urls = VideoPressIE._extract_urls(webpage) | ||||
|         if videopress_urls: | ||||
|             return _playlist_from_matches( | ||||
|                 videopress_urls, ie=VideoPressIE.ie_key()) | ||||
|             return self.playlist_from_matches( | ||||
|                 videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) | ||||
|  | ||||
|         # Look for Rutube embeds | ||||
|         rutube_urls = RutubeIE._extract_urls(webpage) | ||||
|         if rutube_urls: | ||||
|             return _playlist_from_matches( | ||||
|             return self.playlist_from_matches( | ||||
|                 rutube_urls, ie=RutubeIE.ie_key()) | ||||
|  | ||||
|         # Looking for http://schema.org/VideoObject | ||||
|   | ||||
		Reference in New Issue
	
	Block a user