mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-26 04:00:57 +00:00 
			
		
		
		
	[pbs] Fix description extraction and update tests
This commit is contained in:
		
							parent
							
								
									82997dad57
								
							
						
					
					
						commit
						542130a5d9
					
				| @ -10,6 +10,7 @@ | ||||
|     int_or_none, | ||||
|     js_to_json, | ||||
|     strip_jsonp, | ||||
|     strip_or_none, | ||||
|     unified_strdate, | ||||
|     US_RATINGS, | ||||
| ) | ||||
| @ -200,7 +201,7 @@ class PBSIE(InfoExtractor): | ||||
|                 'id': '2365006249', | ||||
|                 'ext': 'mp4', | ||||
|                 'title': 'Constitution USA with Peter Sagal - A More Perfect Union', | ||||
|                 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071', | ||||
|                 'description': 'md5:31b664af3c65fd07fa460d306b837d00', | ||||
|                 'duration': 3190, | ||||
|             }, | ||||
|         }, | ||||
| @ -211,7 +212,7 @@ class PBSIE(InfoExtractor): | ||||
|                 'id': '2365297690', | ||||
|                 'ext': 'mp4', | ||||
|                 'title': 'FRONTLINE - Losing Iraq', | ||||
|                 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9', | ||||
|                 'description': 'md5:5979a4d069b157f622d02bff62fbe654', | ||||
|                 'duration': 5050, | ||||
|             }, | ||||
|         }, | ||||
| @ -222,7 +223,7 @@ class PBSIE(InfoExtractor): | ||||
|                 'id': '2201174722', | ||||
|                 'ext': 'mp4', | ||||
|                 'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist', | ||||
|                 'description': 'md5:95a19f568689d09a166dff9edada3301', | ||||
|                 'description': 'md5:86ab9a3d04458b876147b355788b8781', | ||||
|                 'duration': 801, | ||||
|             }, | ||||
|         }, | ||||
| @ -267,7 +268,7 @@ class PBSIE(InfoExtractor): | ||||
|                 'display_id': 'player', | ||||
|                 'ext': 'mp4', | ||||
|                 'title': 'American Experience - Death and the Civil War, Chapter 1', | ||||
|                 'description': 'md5:1b80a74e0380ed2a4fb335026de1600d', | ||||
|                 'description': 'md5:67fa89a9402e2ee7d08f53b920674c18', | ||||
|                 'duration': 682, | ||||
|                 'thumbnail': 're:^https?://.*\.jpg$', | ||||
|             }, | ||||
| @ -293,13 +294,13 @@ class PBSIE(InfoExtractor): | ||||
|             # "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see | ||||
|             # https://github.com/rg3/youtube-dl/issues/7059) | ||||
|             'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/', | ||||
|             'md5': '84ced42850d78f1d4650297356e95e6f', | ||||
|             'md5': '59b0ef5009f9ac8a319cc5efebcd865e', | ||||
|             'info_dict': { | ||||
|                 'id': '2365546844', | ||||
|                 'display_id': 'a-chefs-life-season-3-episode-5-prickly-business', | ||||
|                 'ext': 'mp4', | ||||
|                 'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business", | ||||
|                 'description': 'md5:54033c6baa1f9623607c6e2ed245888b', | ||||
|                 'description': 'md5:c0ff7475a4b70261c7e58f493c2792a5', | ||||
|                 'duration': 1480, | ||||
|                 'thumbnail': 're:^https?://.*\.jpg$', | ||||
|             }, | ||||
| @ -312,7 +313,7 @@ class PBSIE(InfoExtractor): | ||||
|                 'display_id': 'the-atomic-artists', | ||||
|                 'ext': 'mp4', | ||||
|                 'title': 'FRONTLINE - The Atomic Artists', | ||||
|                 'description': 'md5:1a2481e86b32b2e12ec1905dd473e2c1', | ||||
|                 'description': 'md5:f677e4520cfacb4a5ce1471e31b57800', | ||||
|                 'duration': 723, | ||||
|                 'thumbnail': 're:^https?://.*\.jpg$', | ||||
|             }, | ||||
| @ -323,7 +324,7 @@ class PBSIE(InfoExtractor): | ||||
|         { | ||||
|             # Serves hd only via widget/partnerplayer page | ||||
|             'url': 'http://www.pbs.org/video/2365641075/', | ||||
|             'md5': 'acfd4c400b48149a44861cb16dd305cf', | ||||
|             'md5': 'fdf907851eab57211dd589cf12006666', | ||||
|             'info_dict': { | ||||
|                 'id': '2365641075', | ||||
|                 'ext': 'mp4', | ||||
| @ -333,16 +334,6 @@ class PBSIE(InfoExtractor): | ||||
|                 'formats': 'mincount:8', | ||||
|             }, | ||||
|         }, | ||||
|         { | ||||
|             # has undocumented http formats(4500k and 6500k) | ||||
|             'url': 'http://www.pbs.org/video/2365815229/', | ||||
|             'md5': '94635cd06b7133688e23f4b94e6637a5', | ||||
|             'info_dict': { | ||||
|                 'id': '2365815229', | ||||
|                 'ext': 'mp4', | ||||
|                 'title': 'FRONTLINE - Mosquito Hunter', | ||||
|             }, | ||||
|         }, | ||||
|         { | ||||
|             'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', | ||||
|             'only_matching': True, | ||||
| @ -362,11 +353,16 @@ class PBSIE(InfoExtractor): | ||||
|     def _extract_webpage(self, url): | ||||
|         mobj = re.match(self._VALID_URL, url) | ||||
| 
 | ||||
|         description = None | ||||
| 
 | ||||
|         presumptive_id = mobj.group('presumptive_id') | ||||
|         display_id = presumptive_id | ||||
|         if presumptive_id: | ||||
|             webpage = self._download_webpage(url, display_id) | ||||
| 
 | ||||
|             description = strip_or_none(self._og_search_description( | ||||
|                 webpage, default=None) or self._html_search_meta( | ||||
|                 'description', webpage, default=None)) | ||||
|             upload_date = unified_strdate(self._search_regex( | ||||
|                 r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"', | ||||
|                 webpage, 'upload date', default=None)) | ||||
| @ -379,7 +375,7 @@ def _extract_webpage(self, url): | ||||
|             for p in MULTI_PART_REGEXES: | ||||
|                 tabbed_videos = re.findall(p, webpage) | ||||
|                 if tabbed_videos: | ||||
|                     return tabbed_videos, presumptive_id, upload_date | ||||
|                     return tabbed_videos, presumptive_id, upload_date, description | ||||
| 
 | ||||
|             MEDIA_ID_REGEXES = [ | ||||
|                 r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed | ||||
| @ -391,7 +387,7 @@ def _extract_webpage(self, url): | ||||
|             media_id = self._search_regex( | ||||
|                 MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None) | ||||
|             if media_id: | ||||
|                 return media_id, presumptive_id, upload_date | ||||
|                 return media_id, presumptive_id, upload_date, description | ||||
| 
 | ||||
|             # Frontline video embedded via flp | ||||
|             video_id = self._search_regex( | ||||
| @ -408,7 +404,7 @@ def _extract_webpage(self, url): | ||||
|                     'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id, | ||||
|                     presumptive_id, 'Downloading getdir JSON', | ||||
|                     transform_source=strip_jsonp) | ||||
|                 return getdir['mid'], presumptive_id, upload_date | ||||
|                 return getdir['mid'], presumptive_id, upload_date, description | ||||
| 
 | ||||
|             for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage): | ||||
|                 url = self._search_regex( | ||||
| @ -432,10 +428,10 @@ def _extract_webpage(self, url): | ||||
|             video_id = mobj.group('id') | ||||
|             display_id = video_id | ||||
| 
 | ||||
|         return video_id, display_id, None | ||||
|         return video_id, display_id, None, description | ||||
| 
 | ||||
|     def _real_extract(self, url): | ||||
|         video_id, display_id, upload_date = self._extract_webpage(url) | ||||
|         video_id, display_id, upload_date, description = self._extract_webpage(url) | ||||
| 
 | ||||
|         if isinstance(video_id, list): | ||||
|             entries = [self.url_result( | ||||
| @ -564,11 +560,14 @@ def extract_redirect_urls(info): | ||||
|         if alt_title: | ||||
|             info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title']) | ||||
| 
 | ||||
|         description = info.get('description') or info.get( | ||||
|             'program', {}).get('description') or description | ||||
| 
 | ||||
|         return { | ||||
|             'id': video_id, | ||||
|             'display_id': display_id, | ||||
|             'title': info['title'], | ||||
|             'description': info.get('description') or info.get('program', {}).get('description'), | ||||
|             'description': description, | ||||
|             'thumbnail': info.get('image_url'), | ||||
|             'duration': int_or_none(info.get('duration')), | ||||
|             'age_limit': age_limit, | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Sergey M․
						Sergey M․