mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 06:35:12 +00:00 
			
		
		
		
	[brightcove] Relax video tag embeds extraction
BrightcoveNewIE._extract_urls(): Handle player data when it is supplied entirely in attributes of the <video> tag, and also the case where the id attribute is data-brightcove-video-id rather than data-video-id. Add a test to the generic extractor; note that a HEAD request gives 404 and no checksum is computed(?).
This commit is contained in:
		 John Hawkinson
					John Hawkinson
				
			
				
					committed by
					
						 Sergey M․
						Sergey M․
					
				
			
			
				
	
			
			
			 Sergey M․
						Sergey M․
					
				
			
						parent
						
							3dfceb286c
						
					
				
				
					commit
					40158f55c9
				
			| @@ -17,6 +17,7 @@ from ..compat import ( | ||||
| from ..utils import ( | ||||
|     determine_ext, | ||||
|     ExtractorError, | ||||
|     extract_attributes, | ||||
|     find_xpath_attr, | ||||
|     fix_xml_ampersands, | ||||
|     float_or_none, | ||||
| @@ -109,6 +110,7 @@ class BrightcoveLegacyIE(InfoExtractor): | ||||
|                 'upload_date': '20140827', | ||||
|                 'uploader_id': '710858724001', | ||||
|             }, | ||||
|             'skip': 'Video gone', | ||||
|         }, | ||||
|         { | ||||
|             # playlist with 'videoList' | ||||
| @@ -490,9 +492,10 @@ class BrightcoveNewIE(InfoExtractor): | ||||
|     def _extract_urls(webpage): | ||||
|         # Reference: | ||||
|         # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe | ||||
|         # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript | ||||
|         # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html | ||||
|         # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player | ||||
|         # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag | ||||
|         # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript | ||||
|         # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html | ||||
|         # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player | ||||
|  | ||||
|         entries = [] | ||||
|  | ||||
| @@ -501,22 +504,39 @@ class BrightcoveNewIE(InfoExtractor): | ||||
|                 r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): | ||||
|             entries.append(url if url.startswith('http') else 'http:' + url) | ||||
|  | ||||
|         # Look for embed_in_page embeds [2] | ||||
|         for video_id, account_id, player_id, embed in re.findall( | ||||
|                 # According to examples from [3] it's unclear whether video id | ||||
|                 # may be optional and what to do when it is | ||||
|                 # According to [4] data-video-id may be prefixed with ref: | ||||
|                 r'''(?sx) | ||||
|                     <video[^>]+ | ||||
|                         data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*? | ||||
|                     </video>.*? | ||||
|                     <script[^>]+ | ||||
|                         src=["\'](?:https?:)?//players\.brightcove\.net/ | ||||
|                         (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js | ||||
|                 ''', webpage): | ||||
|             entries.append( | ||||
|                 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' | ||||
|                 % (account_id, player_id, embed, video_id)) | ||||
|         # Look for <video> tags [2] and embed_in_page embeds [3] | ||||
|         # [2] looks like: | ||||
|         # <video data-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject"> | ||||
|  | ||||
|         for video, script_tag, account_id, player_id, embed in re.findall( | ||||
|             r'''(?isx) | ||||
|                 (<video[^>]+>) | ||||
|                 (?:.*? | ||||
|                   (<script[^>]+ | ||||
|                     src=["\'](?:https?:)?//players\.brightcove\.net/ | ||||
|                     (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js | ||||
|                   ) | ||||
|                 )? | ||||
|             ''', webpage | ||||
|         ): | ||||
|             attrs = extract_attributes(video) | ||||
|  | ||||
|             # According to examples from [4] it's unclear whether video id | ||||
|             # may be optional and what to do when it is | ||||
|             video_id = attrs.get('data-video-id') | ||||
|             # See PR#12099/bostonglobe.py for 'data-brightcove-video-id' variant | ||||
|  | ||||
|             if not account_id: | ||||
|                 account_id = attrs.get('data-account') | ||||
|             if not player_id: | ||||
|                 player_id = attrs.get('data-player') | ||||
|             if not embed: | ||||
|                 embed = attrs.get('data-embed') | ||||
|  | ||||
|             if video_id and account_id and player_id and embed: | ||||
|                 entries.append( | ||||
|                     'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' | ||||
|                     % (account_id, player_id, embed, video_id)) | ||||
|  | ||||
|         return entries | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user