mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-30 22:25:19 +00:00 
			
		
		
		
	[PRX] Add Extractors (#2245)
Closes #2144 and https://github.com/ytdl-org/youtube-dl/issues/15948. Authored by: coletdjnz
This commit is contained in:
		| @@ -1216,6 +1216,13 @@ from .puhutv import ( | ||||
| from .presstv import PressTVIE | ||||
| from .projectveritas import ProjectVeritasIE | ||||
| from .prosiebensat1 import ProSiebenSat1IE | ||||
| from .prx import ( | ||||
|     PRXStoryIE, | ||||
|     PRXSeriesIE, | ||||
|     PRXAccountIE, | ||||
|     PRXStoriesSearchIE, | ||||
|     PRXSeriesSearchIE | ||||
| ) | ||||
| from .puls4 import Puls4IE | ||||
| from .pyvideo import PyvideoIE | ||||
| from .qqmusic import ( | ||||
|   | ||||
							
								
								
									
										431
									
								
								yt_dlp/extractor/prx.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										431
									
								
								yt_dlp/extractor/prx.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,431 @@ | ||||
| # coding: utf-8 | ||||
| from __future__ import unicode_literals | ||||
|  | ||||
import itertools

from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
    ExtractorError,
    clean_html,
    int_or_none,
    mimetype2ext,
    str_or_none,
    traverse_obj,
    unified_timestamp,
    url_or_none,
    urljoin,
)
|  | ||||
|  | ||||
class PRXBaseIE(InfoExtractor):
    """Shared API access and metadata-mapping helpers for the PRX extractors."""

    # Template for subclasses' _VALID_URL; '%s' is replaced by the resource
    # path pattern. The dot in the host name is escaped so that only the
    # literal "prx.org" domain matches.
    PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx\.org/%s'

    def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
        """Download and return the JSON found at *path* below https://cms.prx.org/api/v1/.

        @param item_id: Identifier handed to the downloader alongside the request
        @param path:    Endpoint path, joined onto the API base URL
        @param query:   Optional dict of query parameters
        @param fatal:   Forwarded to _download_json
        @param note:    Forwarded to _download_json
        """
        return self._download_json(
            urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)

    @staticmethod
    def _get_prx_embed_response(response, section):
        """Return the value stored under ``_embedded`` -> ``prx:<section>`` of *response*."""
        return traverse_obj(response, ('_embedded', f'prx:{section}'))

    @staticmethod
    def _extract_file_link(response):
        """Return the ``_links`` -> ``enclosure`` -> ``href`` value of *response*, passed through url_or_none."""
        return url_or_none(traverse_obj(
            response, ('_links', 'enclosure', 'href'), expected_type=str))

    @classmethod
    def _extract_image(cls, image_response):
        """Map an image response to a thumbnail dict; returns None for non-dict input."""
        if not isinstance(image_response, dict):
            return
        return {
            'id': str_or_none(image_response.get('id')),
            'filesize': image_response.get('size'),
            'width': image_response.get('width'),
            'height': image_response.get('height'),
            'url': cls._extract_file_link(image_response)
        }

    @classmethod
    def _extract_base_info(cls, response):
        """Map the fields common to stories, series and accounts to an info dict.

        Returns None when *response* is not a dict or carries no usable ``id``.
        """
        if not isinstance(response, dict):
            return
        item_id = str_or_none(response.get('id'))
        if not item_id:
            return
        thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
        # Prefer the (HTML-cleaned) long description, fall back to the short one
        description = (
            clean_html(response.get('description'))
            or response.get('shortDescription'))
        return {
            'id': item_id,
            # The ID doubles as title when the API provides none
            'title': response.get('title') or item_id,
            'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
            'description': description,
            'release_timestamp': unified_timestamp(response.get('releasedAt')),
            'timestamp': unified_timestamp(response.get('createdAt')),
            'modified_timestamp': unified_timestamp(response.get('updatedAt')),
            'duration': int_or_none(response.get('duration')),
            'tags': response.get('tags'),
            'episode_number': int_or_none(response.get('episodeIdentifier')),
            'season_number': int_or_none(response.get('seasonIdentifier'))
        }

    @classmethod
    def _extract_series_info(cls, series_response):
        """Build the info dict of a series, including the channel fields of its embedded account.

        Returns None when the base info cannot be extracted.
        """
        base_info = cls._extract_base_info(series_response)
        if not base_info:
            return
        account_info = cls._extract_account_info(
            cls._get_prx_embed_response(series_response, 'account')) or {}
        return {
            **base_info,
            'channel_id': account_info.get('channel_id'),
            'channel_url': account_info.get('channel_url'),
            'channel': account_info.get('channel'),
            'series': base_info.get('title'),
            'series_id': base_info.get('id'),
        }

    @classmethod
    def _extract_account_info(cls, account_response):
        """Build the info dict of an account; the account name is used as title and channel.

        Returns None when the base info cannot be extracted.
        """
        base_info = cls._extract_base_info(account_response)
        if not base_info:
            return
        name = account_response.get('name')
        return {
            **base_info,
            'title': name,
            'channel_id': base_info.get('id'),
            'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'),
            'channel': name,
        }

    @classmethod
    def _extract_story_info(cls, story_response):
        """Build the info dict of a story, enriched with its embedded series and account.

        Returns None when the base info cannot be extracted.
        """
        base_info = cls._extract_base_info(story_response)
        if not base_info:
            return
        series = cls._extract_series_info(
            cls._get_prx_embed_response(story_response, 'series')) or {}
        account = cls._extract_account_info(
            cls._get_prx_embed_response(story_response, 'account')) or {}
        return {
            **base_info,
            'series': series.get('series'),
            'series_id': series.get('series_id'),
            'channel_id': account.get('channel_id'),
            'channel_url': account.get('channel_url'),
            'channel': account.get('channel')
        }

    def _entries(self, item_id, endpoint, entry_func, query=None):
        """
        Extract entries from paginated list API

        Pages are requested (100 items each) until a page has no items or the
        number of items seen reaches the total reported by the API.
        @param item_id:    Identifier used to label the page requests
        @param endpoint:   API path of the list endpoint
        @param entry_func: Function to generate entry from response item;
                           falsy results are dropped
        @param query:      Optional extra query parameters
        """
        fetched = 0
        for page in itertools.count(1):
            response = self._call_api(f'{item_id}: page {page}', endpoint, query={
                **(query or {}),
                'page': page,
                'per': 100
            })
            # Yields None for an empty/missing response as well
            items = self._get_prx_embed_response(response, 'items')
            if not items:
                break

            yield from filter(None, map(entry_func, items))

            # Do not crash when the counters are absent: fall back to the
            # number of items on this page, and when the total is unknown
            # keep paging until an empty page is returned.
            page_count = int_or_none(response.get('count'))
            fetched += len(items) if page_count is None else page_count
            expected_total = int_or_none(response.get('total'))
            if expected_total is not None and fetched >= expected_total:
                break

    def _story_playlist_entry(self, response):
        """Turn a story list item into a ``url`` entry handled by PRXStoryIE (None if unusable)."""
        story = self._extract_story_info(response)
        if not story:
            return
        story.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/stories/%s' % story['id'],
            'ie_key': PRXStoryIE.ie_key()
        })
        return story

    def _series_playlist_entry(self, response):
        """Turn a series list item into a ``url`` entry handled by PRXSeriesIE (None if unusable)."""
        series = self._extract_series_info(response)
        if not series:
            return
        series.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/series/%s' % series['id'],
            'ie_key': PRXSeriesIE.ie_key()
        })
        return series
|  | ||||
|  | ||||
class PRXStoryIE(PRXBaseIE):
    """Extractor for a single PRX story (``/stories/<id>``)."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'

    _TESTS = [
        {
            # Story with season and episode details
            'url': 'https://beta.prx.org/stories/399200',
            'info_dict': {
                'id': '399200',
                'title': 'Fly Me To The Moon',
                'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                'release_timestamp': 1640250000,
                'timestamp': 1640208972,
                'modified_timestamp': 1641318202,
                'duration': 1004,
                'tags': 'count:7',
                'episode_number': 8,
                'season_number': 5,
                'series': 'AirSpace',
                'series_id': '38057',
                'channel_id': '220986',
                'channel_url': 'https://beta.prx.org/accounts/220986',
                'channel': 'Air and Space Museum',
            },
            'playlist': [{
                'info_dict': {
                    'id': '399200_part1',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 530,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104'
                }
            }, {
                'info_dict': {
                    'id': '399200_part2',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 474,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104'
                }
            }

            ]
        }, {
            # Story with only split audio
            'url': 'https://beta.prx.org/stories/326414',
            'info_dict': {
                'id': '326414',
                'title': 'Massachusetts v EPA',
                'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
                'timestamp': 1592509124,
                'modified_timestamp': 1592510457,
                'duration': 3088,
                'tags': 'count:0',
                'series': 'Outside/In',
                'series_id': '36252',
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
            },
            'playlist_count': 4
        }, {
            # Story with single combined audio
            'url': 'https://beta.prx.org/stories/400404',
            'info_dict': {
                'id': '400404',
                'title': 'Cafe Chill (Episode 2022-01)',
                'thumbnails': 'count:1',
                'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
                'timestamp': 1641233952,
                'modified_timestamp': 1641234248,
                'duration': 3540,
                'series': 'Café Chill',
                'series_id': '37762',
                'channel_id': '5767',
                'channel_url': 'https://beta.prx.org/accounts/5767',
                'channel': 'C89.5 - KNHC Seattle',
                'ext': 'mp3',
                'tags': 'count:0',
                'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
                'upload_date': '20220103',
                'modified_date': '20220103'
            }
        }, {
            'url': 'https://listen.prx.org/stories/399200',
            'only_matching': True
        }
    ]

    def _extract_audio_pieces(self, audio_response):
        """Return one format dict per audio piece embedded in *audio_response*.

        Pieces are ordered by their ``position``. Pieces without a usable
        position are placed last, keeping the order the API returned them in,
        and items that are not dicts are skipped.
        """
        def position_key(piece):
            # A tuple key keeps the sort well-defined when the position is
            # missing: None cannot be ordered against ints (or other Nones).
            position = int_or_none(piece.get('position'))
            return position is None, position or 0

        pieces = [
            piece for piece in self._get_prx_embed_response(audio_response, 'items') or []
            if isinstance(piece, dict)]

        return [{
            'format_id': str_or_none(piece_response.get('id')),
            'format_note': str_or_none(piece_response.get('label')),
            'filesize': int_or_none(piece_response.get('size')),
            'duration': int_or_none(piece_response.get('duration')),
            'ext': mimetype2ext(piece_response.get('contentType')),
            'asr': int_or_none(piece_response.get('frequency'), scale=1000),
            'abr': int_or_none(piece_response.get('bitRate')),
            'url': self._extract_file_link(piece_response),
            'vcodec': 'none'
        } for piece_response in sorted(pieces, key=position_key)]

    def _extract_story(self, story_response):
        """Build the extraction result for a story response.

        A story with exactly one audio piece becomes a single result carrying
        that piece as its only format. Otherwise a ``multi_video`` result is
        returned with one ``<id>_partN`` entry per piece. Returns None when
        the story metadata cannot be extracted.
        """
        info = self._extract_story_info(story_response)
        if not info:
            return
        audio_pieces = self._extract_audio_pieces(
            self._get_prx_embed_response(story_response, 'audio'))
        if len(audio_pieces) == 1:
            return {
                'formats': audio_pieces,
                **info
            }

        # Every part shares the story metadata but gets its own ID and format
        entries = [{
            **info,
            'id': '%s_part%d' % (info['id'], (idx + 1)),
            'formats': [fmt],
        } for idx, fmt in enumerate(audio_pieces)]
        return {
            '_type': 'multi_video',
            'entries': entries,
            **info
        }

    def _real_extract(self, url):
        """Fetch the story referenced by *url* from the API and extract it."""
        story_id = self._match_id(url)
        response = self._call_api(story_id, f'stories/{story_id}')
        return self._extract_story(response)
|  | ||||
|  | ||||
class PRXSeriesIE(PRXBaseIE):
    """Extractor for a PRX series (``/series/<id>``), returned as a playlist of its stories."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'https://beta.prx.org/series/36252',
            'info_dict': {
                'id': '36252',
                'title': 'Outside/In',
                'thumbnails': 'count:1',
                'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
                'timestamp': 1470684964,
                'modified_timestamp': 1582308830,
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
                'series': 'Outside/In',
                'series_id': '36252'
            },
            'playlist_mincount': 39
        }, {
            # Blank series
            'url': 'https://beta.prx.org/series/25038',
            'info_dict': {
                'id': '25038',
                'title': '25038',
                'timestamp': 1207612800,
                'modified_timestamp': 1207612800,
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
                'series': '25038',
                'series_id': '25038'
            },
            'playlist_count': 0
        }
    ]

    def _extract_series(self, series_response):
        """Build a playlist result for a series response.

        The entries are produced lazily from the ``series/<id>/stories`` list
        endpoint. Raises ExtractorError when no series metadata (in
        particular no ID) can be extracted from *series_response*.
        """
        info = self._extract_series_info(series_response)
        if not info:
            # _extract_series_info returns None for unusable responses;
            # fail with a clear message rather than a TypeError on info['id']
            raise ExtractorError('Unable to extract series metadata from API response')
        series_id = info['id']
        return {
            '_type': 'playlist',
            'entries': self._entries(series_id, f'series/{series_id}/stories', self._story_playlist_entry),
            **info
        }

    def _real_extract(self, url):
        """Fetch the series referenced by *url* from the API and extract it."""
        series_id = self._match_id(url)
        response = self._call_api(series_id, f'series/{series_id}')
        return self._extract_series(response)
|  | ||||
|  | ||||
class PRXAccountIE(PRXBaseIE):
    """Extractor for a PRX account (``/accounts/<id>``), returned as a playlist of its series and stories."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://beta.prx.org/accounts/206',
        'info_dict': {
            'id': '206',
            'title': 'New Hampshire Public Radio',
            'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
            'channel_id': '206',
            'channel_url': 'https://beta.prx.org/accounts/206',
            'channel': 'New Hampshire Public Radio',
            'thumbnails': 'count:1'
        },
        'playlist_mincount': 380
    }]

    def _extract_account(self, account_response):
        """Build a playlist result for an account response.

        The entries chain the account's series followed by its stories, both
        fetched lazily from the paginated list endpoints. Raises
        ExtractorError when no account metadata (in particular no ID) can be
        extracted from *account_response*.
        """
        info = self._extract_account_info(account_response)
        if not info:
            # _extract_account_info returns None for unusable responses;
            # fail with a clear message rather than a TypeError on info['id']
            raise ExtractorError('Unable to extract account metadata from API response')
        account_id = info['id']
        series = self._entries(
            account_id, f'accounts/{account_id}/series', self._series_playlist_entry)
        stories = self._entries(
            account_id, f'accounts/{account_id}/stories', self._story_playlist_entry)
        return {
            '_type': 'playlist',
            'entries': itertools.chain(series, stories),
            **info
        }

    def _real_extract(self, url):
        """Fetch the account referenced by *url* from the API and extract it."""
        account_id = self._match_id(url)
        response = self._call_api(account_id, f'accounts/{account_id}')
        return self._extract_account(response)
|  | ||||
|  | ||||
class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor that lists PRX stories matching a query."""

    _SEARCH_KEY = 'prxstories'
    IE_NAME = 'prxstories:search'
    IE_DESC = 'PRX Stories Search'

    def _search_results(self, query):
        """Yield story entries from the ``stories/search`` endpoint for *query*."""
        request_label = f'query {query}'
        search_params = {'q': query}
        yield from self._entries(
            request_label, 'stories/search', self._story_playlist_entry,
            query=search_params)
|  | ||||
|  | ||||
class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor that lists PRX series matching a query."""

    _SEARCH_KEY = 'prxseries'
    IE_NAME = 'prxseries:search'
    IE_DESC = 'PRX Series Search'

    def _search_results(self, query):
        """Yield series entries from the ``series/search`` endpoint for *query*."""
        request_label = f'query {query}'
        search_params = {'q': query}
        yield from self._entries(
            request_label, 'series/search', self._series_playlist_entry,
            query=search_params)
		Reference in New Issue
	
	Block a user
	 coletdjnz
					coletdjnz