mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 06:35:12 +00:00 
			
		
		
		
	[PRX] Add Extractors (#2245)
Closes #2144 and https://github.com/ytdl-org/youtube-dl/issues/15948. Authored by: coletdjnz
This commit is contained in:
		| @@ -1216,6 +1216,13 @@ from .puhutv import ( | |||||||
| from .presstv import PressTVIE | from .presstv import PressTVIE | ||||||
| from .projectveritas import ProjectVeritasIE | from .projectveritas import ProjectVeritasIE | ||||||
| from .prosiebensat1 import ProSiebenSat1IE | from .prosiebensat1 import ProSiebenSat1IE | ||||||
|  | from .prx import ( | ||||||
|  |     PRXStoryIE, | ||||||
|  |     PRXSeriesIE, | ||||||
|  |     PRXAccountIE, | ||||||
|  |     PRXStoriesSearchIE, | ||||||
|  |     PRXSeriesSearchIE | ||||||
|  | ) | ||||||
| from .puls4 import Puls4IE | from .puls4 import Puls4IE | ||||||
| from .pyvideo import PyvideoIE | from .pyvideo import PyvideoIE | ||||||
| from .qqmusic import ( | from .qqmusic import ( | ||||||
|   | |||||||
							
								
								
									
										431
									
								
								yt_dlp/extractor/prx.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										431
									
								
								yt_dlp/extractor/prx.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,431 @@ | |||||||
|  | # coding: utf-8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  |  | ||||||
|  | import itertools | ||||||
|  | from .common import InfoExtractor, SearchInfoExtractor | ||||||
|  | from ..utils import ( | ||||||
|  |     urljoin, | ||||||
|  |     traverse_obj, | ||||||
|  |     int_or_none, | ||||||
|  |     mimetype2ext, | ||||||
|  |     clean_html, | ||||||
|  |     url_or_none, | ||||||
|  |     unified_timestamp, | ||||||
|  |     str_or_none, | ||||||
|  | ) | ||||||
|  |  | ||||||
|  |  | ||||||
class PRXBaseIE(InfoExtractor):
    """
    Shared helpers for the PRX extractors.

    Wraps access to the PRX CMS API (https://cms.prx.org/api/v1/) and turns its
    story, series and account responses into info dicts.
    """

    # Template for the _VALID_URL of subclasses; '%s' receives the path pattern.
    # The dot of the hostname is escaped so that only a literal "prx.org" matches.
    PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx\.org/%s'

    def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
        """
        Download a JSON document from the CMS API
        @param item_id: Identifier handed to _download_json
        @param path: Path joined onto the CMS API base URL
        @param query: Optional dict of query parameters
        @param fatal: Handed through to _download_json
        @param note: Handed through to _download_json
        """
        return self._download_json(
            urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)

    @staticmethod
    def _get_prx_embed_response(response, section):
        """Look up the 'prx:<section>' member of the response's '_embedded' object"""
        return traverse_obj(response, ('_embedded', f'prx:{section}'))

    @staticmethod
    def _extract_file_link(response):
        """Return the '_links' -> 'enclosure' -> 'href' value, passed through url_or_none"""
        return url_or_none(traverse_obj(
            response, ('_links', 'enclosure', 'href'), expected_type=str))

    @classmethod
    def _extract_image(cls, image_response):
        """
        Build a thumbnail dict from an image response
        Returns None if the response is not a dict or carries no usable file link
        """
        if not isinstance(image_response, dict):
            return
        image_url = cls._extract_file_link(image_response)
        if not image_url:
            # Without a URL the thumbnail entry is of no use to the caller
            return
        return {
            'id': str_or_none(image_response.get('id')),
            'filesize': image_response.get('size'),
            'width': image_response.get('width'),
            'height': image_response.get('height'),
            'url': image_url
        }

    @classmethod
    def _extract_base_info(cls, response):
        """
        Extract the fields shared by story, series and account responses
        Returns None if the response is not a dict or has no 'id'
        """
        if not isinstance(response, dict):
            return
        item_id = str_or_none(response.get('id'))
        if not item_id:
            return
        thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
        # Prefer the full description; fall back to the short one
        description = (
            clean_html(response.get('description'))
            or response.get('shortDescription'))
        return {
            'id': item_id,
            # The id doubles as title when the response has none
            'title': response.get('title') or item_id,
            'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
            'description': description,
            'release_timestamp': unified_timestamp(response.get('releasedAt')),
            'timestamp': unified_timestamp(response.get('createdAt')),
            'modified_timestamp': unified_timestamp(response.get('updatedAt')),
            'duration': int_or_none(response.get('duration')),
            'tags': response.get('tags'),
            'episode_number': int_or_none(response.get('episodeIdentifier')),
            'season_number': int_or_none(response.get('seasonIdentifier'))
        }

    @classmethod
    def _extract_series_info(cls, series_response):
        """
        Extract series metadata, including the channel fields of its embedded account
        Returns None if no base info could be extracted
        """
        base_info = cls._extract_base_info(series_response)
        if not base_info:
            return
        account_info = cls._extract_account_info(
            cls._get_prx_embed_response(series_response, 'account')) or {}
        return {
            **base_info,
            'channel_id': account_info.get('channel_id'),
            'channel_url': account_info.get('channel_url'),
            'channel': account_info.get('channel'),
            'series': base_info.get('title'),
            'series_id': base_info.get('id'),
        }

    @classmethod
    def _extract_account_info(cls, account_response):
        """
        Extract account metadata; the account's 'name' is used as title and channel
        Returns None if no base info could be extracted
        """
        base_info = cls._extract_base_info(account_response)
        if not base_info:
            return
        name = account_response.get('name')
        return {
            **base_info,
            'title': name,
            'channel_id': base_info.get('id'),
            'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'),
            'channel': name,
        }

    @classmethod
    def _extract_story_info(cls, story_response):
        """
        Extract story metadata, including the fields of its embedded series and account
        Returns None if no base info could be extracted
        """
        base_info = cls._extract_base_info(story_response)
        if not base_info:
            return
        series = cls._extract_series_info(
            cls._get_prx_embed_response(story_response, 'series')) or {}
        account = cls._extract_account_info(
            cls._get_prx_embed_response(story_response, 'account')) or {}
        return {
            **base_info,
            'series': series.get('series'),
            'series_id': series.get('series_id'),
            'channel_id': account.get('channel_id'),
            'channel_url': account.get('channel_url'),
            'channel': account.get('channel')
        }

    def _entries(self, item_id, endpoint, entry_func, query=None):
        """
        Extract entries from paginated list API
        @param item_id: Identifier used in the per-page download id
        @param endpoint: API path of the list to page through
        @param entry_func: Function to generate entry from response item
        @param query: Optional extra query parameters sent with every page request
        """
        total = 0
        for page in itertools.count(1):
            response = self._call_api(f'{item_id}: page {page}', endpoint, query={
                **(query or {}),
                'page': page,
                'per': 100
            })
            items = self._get_prx_embed_response(response, 'items')
            if not response or not items:
                break

            # Items for which entry_func returns nothing are dropped
            yield from filter(None, map(entry_func, items))

            # Stop once the per-page counts add up to the reported total
            total += response['count']
            if total >= response['total']:
                break

    def _story_playlist_entry(self, response):
        """Build a url-type entry pointing at PRXStoryIE from a story list item"""
        story = self._extract_story_info(response)
        if not story:
            return
        story.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/stories/%s' % story['id'],
            'ie_key': PRXStoryIE.ie_key()
        })
        return story

    def _series_playlist_entry(self, response):
        """Build a url-type entry pointing at PRXSeriesIE from a series list item"""
        series = self._extract_series_info(response)
        if not series:
            return
        series.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/series/%s' % series['id'],
            'ie_key': PRXSeriesIE.ie_key()
        })
        return series
|  |  | ||||||
|  |  | ||||||
class PRXStoryIE(PRXBaseIE):
    """
    Extractor for single PRX stories.

    A story with exactly one audio piece is returned as a single item; any
    other number of pieces is returned as a multi_video with one entry per piece.
    """
    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'

    _TESTS = [
        {
            # Story with season and episode details
            'url': 'https://beta.prx.org/stories/399200',
            'info_dict': {
                'id': '399200',
                'title': 'Fly Me To The Moon',
                'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                'release_timestamp': 1640250000,
                'timestamp': 1640208972,
                'modified_timestamp': 1641318202,
                'duration': 1004,
                'tags': 'count:7',
                'episode_number': 8,
                'season_number': 5,
                'series': 'AirSpace',
                'series_id': '38057',
                'channel_id': '220986',
                'channel_url': 'https://beta.prx.org/accounts/220986',
                'channel': 'Air and Space Museum',
            },
            'playlist': [{
                'info_dict': {
                    'id': '399200_part1',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 530,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104'
                }
            }, {
                'info_dict': {
                    'id': '399200_part2',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 474,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104'
                }
            }

            ]
        }, {
            # Story with only split audio
            'url': 'https://beta.prx.org/stories/326414',
            'info_dict': {
                'id': '326414',
                'title': 'Massachusetts v EPA',
                'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
                'timestamp': 1592509124,
                'modified_timestamp': 1592510457,
                'duration': 3088,
                'tags': 'count:0',
                'series': 'Outside/In',
                'series_id': '36252',
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
            },
            'playlist_count': 4
        }, {
            # Story with single combined audio
            'url': 'https://beta.prx.org/stories/400404',
            'info_dict': {
                'id': '400404',
                'title': 'Cafe Chill (Episode 2022-01)',
                'thumbnails': 'count:1',
                'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
                'timestamp': 1641233952,
                'modified_timestamp': 1641234248,
                'duration': 3540,
                'series': 'Café Chill',
                'series_id': '37762',
                'channel_id': '5767',
                'channel_url': 'https://beta.prx.org/accounts/5767',
                'channel': 'C89.5 - KNHC Seattle',
                'ext': 'mp3',
                'tags': 'count:0',
                'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
                'upload_date': '20220103',
                'modified_date': '20220103'
            }
        }, {
            'url': 'https://listen.prx.org/stories/399200',
            'only_matching': True
        }
    ]

    def _extract_audio_pieces(self, audio_response):
        """
        Build one format dict per item of the audio response, ordered by 'position'
        Returns an empty list if the response has no items
        """
        return [{
            'format_id': str_or_none(piece_response.get('id')),
            'format_note': str_or_none(piece_response.get('label')),
            'filesize': int_or_none(piece_response.get('size')),
            'duration': int_or_none(piece_response.get('duration')),
            'ext': mimetype2ext(piece_response.get('contentType')),
            'asr': int_or_none(piece_response.get('frequency'), scale=1000),
            'abr': int_or_none(piece_response.get('bitRate')),
            'url': self._extract_file_link(piece_response),
            'vcodec': 'none'
        } for piece_response in sorted(
            self._get_prx_embed_response(audio_response, 'items') or [],
            # Fall back to 0 so that a piece without a usable position cannot
            # make sorted() compare None against an int and raise TypeError
            key=lambda p: int_or_none(p.get('position')) or 0)]

    def _extract_story(self, story_response):
        """
        Build the result for a story response
        Returns None if no story info could be extracted
        """
        info = self._extract_story_info(story_response)
        if not info:
            return
        audio_pieces = self._extract_audio_pieces(
            self._get_prx_embed_response(story_response, 'audio'))
        if len(audio_pieces) == 1:
            return {
                'formats': audio_pieces,
                **info
            }

        # Each piece becomes its own entry with the id "<story id>_part<n>"
        entries = [{
            **info,
            'id': '%s_part%d' % (info['id'], (idx + 1)),
            'formats': [fmt],
        } for idx, fmt in enumerate(audio_pieces)]
        return {
            '_type': 'multi_video',
            'entries': entries,
            **info
        }

    def _real_extract(self, url):
        """Fetch the story named in the URL from the CMS API and extract it"""
        story_id = self._match_id(url)
        response = self._call_api(story_id, f'stories/{story_id}')
        return self._extract_story(response)
|  |  | ||||||
|  |  | ||||||
class PRXSeriesIE(PRXBaseIE):
    """Extractor for PRX series; the stories of the series form the playlist."""
    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://beta.prx.org/series/36252',
        'info_dict': {
            'id': '36252',
            'title': 'Outside/In',
            'thumbnails': 'count:1',
            'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
            'timestamp': 1470684964,
            'modified_timestamp': 1582308830,
            'channel_id': '206',
            'channel_url': 'https://beta.prx.org/accounts/206',
            'channel': 'New Hampshire Public Radio',
            'series': 'Outside/In',
            'series_id': '36252'
        },
        'playlist_mincount': 39
    }, {
        # Blank series
        'url': 'https://beta.prx.org/series/25038',
        'info_dict': {
            'id': '25038',
            'title': '25038',
            'timestamp': 1207612800,
            'modified_timestamp': 1207612800,
            'channel_id': '206',
            'channel_url': 'https://beta.prx.org/accounts/206',
            'channel': 'New Hampshire Public Radio',
            'series': '25038',
            'series_id': '25038'
        },
        'playlist_count': 0
    }]

    def _extract_series(self, series_response):
        """Turn a series response into a playlist of the series' stories"""
        series_info = self._extract_series_info(series_response)
        series_id = series_info['id']
        playlist = {
            '_type': 'playlist',
            'entries': self._entries(
                series_id, 'series/%s/stories' % series_id, self._story_playlist_entry),
        }
        # Series metadata goes last, as in a trailing ** expansion
        playlist.update(series_info)
        return playlist

    def _real_extract(self, url):
        """Fetch the series named in the URL from the CMS API and extract it"""
        series_id = self._match_id(url)
        return self._extract_series(
            self._call_api(series_id, f'series/{series_id}'))
|  |  | ||||||
|  |  | ||||||
class PRXAccountIE(PRXBaseIE):
    """Extractor for PRX accounts; lists the account's series followed by its stories."""
    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://beta.prx.org/accounts/206',
        'info_dict': {
            'id': '206',
            'title': 'New Hampshire Public Radio',
            'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
            'channel_id': '206',
            'channel_url': 'https://beta.prx.org/accounts/206',
            'channel': 'New Hampshire Public Radio',
            'thumbnails': 'count:1'
        },
        'playlist_mincount': 380
    }]

    def _extract_account(self, account_response):
        """Turn an account response into a playlist of its series and stories"""
        account_info = self._extract_account_info(account_response)
        account_id = account_info['id']
        series_entries = self._entries(
            account_id, f'accounts/{account_id}/series', self._series_playlist_entry)
        story_entries = self._entries(
            account_id, f'accounts/{account_id}/stories', self._story_playlist_entry)
        playlist = {
            '_type': 'playlist',
            # Series entries are yielded before story entries
            'entries': itertools.chain(series_entries, story_entries),
        }
        # Account metadata goes last, as in a trailing ** expansion
        playlist.update(account_info)
        return playlist

    def _real_extract(self, url):
        """Fetch the account named in the URL from the CMS API and extract it"""
        account_id = self._match_id(url)
        return self._extract_account(
            self._call_api(account_id, f'accounts/{account_id}'))
|  |  | ||||||
|  |  | ||||||
class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor yielding story entries from the 'stories/search' endpoint."""
    IE_DESC = 'PRX Stories Search'
    IE_NAME = 'prxstories:search'
    _SEARCH_KEY = 'prxstories'

    def _search_results(self, query):
        """Yield one story entry per search hit for `query`"""
        search_params = {'q': query}
        yield from self._entries(
            f'query {query}', 'stories/search', self._story_playlist_entry,
            query=search_params)
|  |  | ||||||
|  |  | ||||||
class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor yielding series entries from the 'series/search' endpoint."""
    IE_DESC = 'PRX Series Search'
    IE_NAME = 'prxseries:search'
    _SEARCH_KEY = 'prxseries'

    def _search_results(self, query):
        """Yield one series entry per search hit for `query`"""
        search_params = {'q': query}
        yield from self._entries(
            f'query {query}', 'series/search', self._series_playlist_entry,
            query=search_params)
		Reference in New Issue
	
	Block a user
	 coletdjnz
					coletdjnz