diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index bb595f924..18d2f780c 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2177,6 +2177,7 @@ ) from .tumblr import TumblrIE from .tunein import ( + TuneInEmbedIE, TuneInPodcastEpisodeIE, TuneInPodcastIE, TuneInShortenerIE, diff --git a/yt_dlp/extractor/tunein.py b/yt_dlp/extractor/tunein.py index 90fb04bf3..f6c37ddaf 100644 --- a/yt_dlp/extractor/tunein.py +++ b/yt_dlp/extractor/tunein.py @@ -1,24 +1,32 @@ +import itertools import urllib.parse from .common import InfoExtractor from ..utils import ( - OnDemandPagedList, - determine_ext, + clean_html, + int_or_none, parse_iso8601, - traverse_obj, + update_url_query, + url_or_none, ) +from ..utils.traversal import traverse_obj class TuneInBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?tunein\.com' + def _call_api(self, item_id, endpoint=None, note='Downloading JSON metadata', query=None): + path = f'/{endpoint}' if endpoint else '' - def _extract_metadata(self, webpage, content_id): - return self._search_json(r'window.INITIAL_STATE=', webpage, 'hydration', content_id, fatal=False) + return self._download_json( + f'https://api.tunein.com/profiles/{item_id}{path}', item_id, note=note, query=query) def _extract_formats_and_subtitles(self, content_id): streams = self._download_json( - f'https://opml.radiotime.com/Tune.ashx?render=json&formats=mp3,aac,ogg,flash,hls&id={content_id}', - content_id)['body'] + 'https://opml.radiotime.com/Tune.ashx', content_id, query={ + 'formats': 'mp3,aac,ogg,flash,hls', + 'id': content_id, + 'render': 'json', + }, + )['body'] formats, subtitles = [], {} for stream in streams: @@ -26,219 +34,299 @@ def _extract_formats_and_subtitles(self, content_id): fmts, subs = self._extract_m3u8_formats_and_subtitles(stream['url'], content_id, fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) - elif determine_ext(stream['url']) == 'pls': - playlist_content = self._download_webpage(stream['url'], content_id) - formats.append({ - 'url': self._search_regex(r'File1=(.*)', playlist_content, 'url', fatal=False), - 'abr': stream.get('bitrate'), - 'ext': stream.get('media_type'), - }) else: - formats.append({ - 'url': stream['url'], - 'abr': stream.get('bitrate'), - 'ext': stream.get('media_type'), - }) + formats.append(traverse_obj(stream, { + 'abr': ('bitrate', {int_or_none}), + 'ext': ('media_type', {str}), + 'url': ('url', {self._proto_relative_url}, {url_or_none}), + })) return formats, subtitles class TuneInStationIE(TuneInBaseIE): - _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'(?:/radio/[^?#]+-|/embed/player/)(?Ps\d+)' - _EMBED_REGEX = [r']+src=["\'](?P(?:https?://)?tunein\.com/embed/player/s\d+)'] - + IE_NAME = 'tunein:station' + _VALID_URL = r'https?://tunein\.com/radio/[^/]+(?Ps\d+)' _TESTS = [{ 'url': 'https://tunein.com/radio/Jazz24-885-s34682/', 'info_dict': { 'id': 's34682', - 'title': str, - 'description': 'md5:d6d0b89063fd68d529fa7058ee98619b', - 'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+', - 'location': 'Seattle-Tacoma, US', 'ext': 'mp3', + 'title': str, + 'alt_title': 'World Class Jazz', + 'channel_follower_count': int, + 'description': 'md5:d6d0b89063fd68d529fa7058ee98619b', + 'location': 'Seattle-Tacoma, US', 'live_status': 'is_live', + 'thumbnail': r're:https?://.+', }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://tunein.com/embed/player/s6404/', - 'only_matching': True, + 'params': {'skip_download': 'Livestream'}, }, { 'url': 'https://tunein.com/radio/BBC-Radio-1-988-s24939/', 'info_dict': { 'id': 's24939', - 'title': str, - 'description': 'md5:ee2c56794844610d045f8caf5ff34d0c', - 'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+', - 'location': 'London, UK', 'ext': 'm4a', + 'title': str, + 'alt_title': 'The biggest new pop and all-day vibes', + 'channel_follower_count': int, + 'description': 'md5:ee2c56794844610d045f8caf5ff34d0c', + 'location': 'London, UK', 'live_status': 'is_live', + 'thumbnail': r're:https?://.+', }, - 'params': { - 'skip_download': True, + 'params': {'skip_download': 'Livestream'}, + }] + + def _real_extract(self, url): + station_id = self._match_id(url) + formats, subtitles = self._extract_formats_and_subtitles(station_id) + + return { + 'id': station_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(self._call_api(station_id), ('Item', { + 'title': ('Title', {clean_html}), + 'alt_title': ('Subtitle', {clean_html}, filter), + 'channel_follower_count': ('Actions', 'Follow', 'FollowerCount', {int_or_none}), + 'description': ('Description', {clean_html}, filter), + 'is_live': ('Actions', 'Play', 'IsLive', {bool}), + 'location': ('Properties', 'Location', 'DisplayName', {str}, any), + 'thumbnail': ('Image', {url_or_none}), + })), + } + + +class TuneInPodcastIE(TuneInBaseIE): + IE_NAME = 'tunein:podcast:program' + _PAGE_SIZE = 20 + _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+)?/[^/?#]+(?Pp\d+)/?(?:\?(?![^#]*(?i:\btopicid)=)[^#]*)?(?:#|$)' + _TESTS = [{ + 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/', + 'info_dict': { + 'id': 'p1153019', + 'title': 'Lex Fridman Podcast', }, + 'playlist_mincount': 200, + }, { + 'url': 'https://tunein.com/podcasts/World-News/BBC-News-p14/', + 'info_dict': { + 'id': 'p14', + 'title': 'BBC News', + }, + 'playlist_mincount': 35, + }] + + def _entries(self, podcast_id): + for page in itertools.count(): + contents = self._call_api( + podcast_id, 'contents', + f'Downloading page {page + 1}', query={ + 'filter': 't:free', + 'limit': self._PAGE_SIZE, + 'offset': page * self._PAGE_SIZE, + }) + + yield from traverse_obj(contents, ( + 'Items', ..., 'GuideId', {str}, filter, all, filter)) + + if not traverse_obj(contents, ('Paging', 'Next', {url_or_none})): + break + + def _real_extract(self, url): + podcast_id = self._match_id(url) + + return self.playlist_from_matches( + self._entries(podcast_id), podcast_id, + traverse_obj(self._call_api(podcast_id), ('Item', 'Title', {str})), + getter=lambda x: update_url_query(url, {'topicId': x[1:]})) + + +class TuneInPodcastEpisodeIE(TuneInBaseIE): + IE_NAME = 'tunein:podcast' + _VALID_URL = r'https?://tunein\.com/podcasts(?:/[^/?#]+)?/[^/?#]+(?Pp\d+)/?\?[^#]*?(?<=\?|&)(?i:\btopicid)=(?P\d+)(?:[&#]|$)' + _TESTS = [{ + 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/?topicId=236404354', + 'info_dict': { + 'id': 't236404354', + 'ext': 'mp3', + 'title': '#351 – MrBeast: Future of YouTube, Twitter, TikTok, and Instagram', + 'alt_title': 'Technology Podcasts >', + 'cast': 'count:1', + 'description': 'md5:1029895354ef073ff00f20b82eb6eb71', + 'display_id': '236404354', + 'duration': 8330, + 'thumbnail': r're:https?://.+', + 'timestamp': 1673458571, + 'upload_date': '20230111', + 'series': 'Lex Fridman Podcast', + 'series_id': 'p1153019', + }, + }, { + 'url': 'https://tunein.com/podcasts/The-BOB--TOM-Show-Free-Podcast-p20069/?topicId=174556405', + 'info_dict': { + 'id': 't174556405', + 'ext': 'mp3', + 'title': 'B&T Extra: Ohhh Yeah, It\'s Sexy Time', + 'alt_title': 'Westwood One >', + 'cast': 'count:2', + 'description': 'md5:6828234f410ab88c85655495c5fcfa88', + 'display_id': '174556405', + 'duration': 1203, + 'series': 'The BOB & TOM Show Free Podcast', + 'series_id': 'p20069', + 'thumbnail': r're:https?://.+', + 'timestamp': 1661799600, + 'upload_date': '20220829', + }, + }] + + def _real_extract(self, url): + series_id, display_id = self._match_valid_url(url).group('series_id', 'id') + episode_id = f't{display_id}' + formats, subtitles = self._extract_formats_and_subtitles(episode_id) + + return { + 'id': episode_id, + 'display_id': display_id, + 'formats': formats, + 'series': traverse_obj(self._call_api(series_id), ('Item', 'Title', {clean_html})), + 'series_id': series_id, + 'subtitles': subtitles, + **traverse_obj(self._call_api(episode_id), ('Item', { + 'title': ('Title', {clean_html}), + 'alt_title': ('Subtitle', {clean_html}, filter), + 'cast': ( + 'Properties', 'ParentProgram', 'Hosts', {clean_html}, + {lambda x: x.split(';')}, ..., {str.strip}, filter, all, filter), + 'description': ('Description', {clean_html}, filter), + 'duration': ('Actions', 'Play', 'Duration', {int_or_none}), + 'thumbnail': ('Image', {url_or_none}), + 'timestamp': ('Actions', 'Play', 'PublishTime', {parse_iso8601}), + })), + } + + +class TuneInEmbedIE(TuneInBaseIE): + IE_NAME = 'tunein:embed' + _VALID_URL = r'https?://tunein\.com/embed/player/(?P[^/?#]+)' + _EMBED_REGEX = [r']+src=["\'](?P(?:https?:)?//tunein\.com/embed/player/[^/?#]+)'] + _TESTS = [{ + 'url': 'https://tunein.com/embed/player/s6404/', + 'info_dict': { + 'id': 's6404', + 'ext': 'mp3', + 'title': str, + 'alt_title': 'South Africa\'s News and Information Leader', + 'channel_follower_count': int, + 'live_status': 'is_live', + 'location': 'Johannesburg, South Africa', + 'thumbnail': r're:https?://.+', + }, + 'params': {'skip_download': 'Livestream'}, + }, { + 'url': 'https://tunein.com/embed/player/t236404354/', + 'info_dict': { + 'id': 't236404354', + 'ext': 'mp3', + 'title': '#351 – MrBeast: Future of YouTube, Twitter, TikTok, and Instagram', + 'alt_title': 'Technology Podcasts >', + 'cast': 'count:1', + 'description': 'md5:1029895354ef073ff00f20b82eb6eb71', + 'display_id': '236404354', + 'duration': 8330, + 'series': 'Lex Fridman Podcast', + 'series_id': 'p1153019', + 'thumbnail': r're:https?://.+', + 'timestamp': 1673458571, + 'upload_date': '20230111', + }, + }, { + 'url': 'https://tunein.com/embed/player/p191660/', + 'info_dict': { + 'id': 'p191660', + 'title': 'SBS Tamil', + }, + 'playlist_mincount': 197, }] _WEBPAGE_TESTS = [{ 'url': 'https://www.martiniinthemorning.com/', 'info_dict': { 'id': 's55412', 'ext': 'mp3', - 'title': 'TuneInStation video #s55412', + 'title': str, + 'alt_title': 'Now that\'s music!', + 'channel_follower_count': int, + 'description': 'md5:41588a3e2cf34b3eafc6c33522fa611a', + 'live_status': 'is_live', + 'location': 'US', + 'thumbnail': r're:https?://.+', }, - 'expected_warnings': ['unable to extract hydration', 'Extractor failed to obtain "title"'], + 'params': {'skip_download': 'Livestream'}, }] def _real_extract(self, url): - station_id = self._match_id(url) + embed_id = self._match_id(url) + kind = { + 'p': 'program', + 's': 'station', + 't': 'topic', + }.get(embed_id[:1]) - webpage = self._download_webpage(url, station_id) - metadata = self._extract_metadata(webpage, station_id) - - formats, subtitles = self._extract_formats_and_subtitles(station_id) - return { - 'id': station_id, - 'title': traverse_obj(metadata, ('profiles', station_id, 'title')), - 'description': traverse_obj(metadata, ('profiles', station_id, 'description')), - 'thumbnail': traverse_obj(metadata, ('profiles', station_id, 'image')), - 'timestamp': parse_iso8601( - traverse_obj(metadata, ('profiles', station_id, 'actions', 'play', 'publishTime'))), - 'location': traverse_obj( - metadata, ('profiles', station_id, 'metadata', 'properties', 'location', 'displayName'), - ('profiles', station_id, 'properties', 'location', 'displayName')), - 'formats': formats, - 'subtitles': subtitles, - 'is_live': traverse_obj(metadata, ('profiles', station_id, 'actions', 'play', 'isLive')), - } - - -class TuneInPodcastIE(TuneInBaseIE): - _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'/(?:podcasts/[^?#]+-|embed/player/)(?Pp\d+)/?(?:#|$)' - _EMBED_REGEX = [r']+src=["\'](?P(?:https?://)?tunein\.com/embed/player/p\d+)'] - - _TESTS = [{ - 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019', - 'info_dict': { - 'id': 'p1153019', - 'title': 'Lex Fridman Podcast', - 'description': 'md5:bedc4e5f1c94f7dec6e4317b5654b00d', - }, - 'playlist_mincount': 200, - }, { - 'url': 'https://tunein.com/embed/player/p191660/', - 'only_matching': True, - }, { - 'url': 'https://tunein.com/podcasts/World-News/BBC-News-p14/', - 'info_dict': { - 'id': 'p14', - 'title': 'BBC News', - 'description': 'md5:30b9622bcc4bd101d4acd6f38f284aed', - }, - 'playlist_mincount': 36, - }] - - _PAGE_SIZE = 30 - - def _real_extract(self, url): - podcast_id = self._match_id(url) - - webpage = self._download_webpage(url, podcast_id, fatal=False) - metadata = self._extract_metadata(webpage, podcast_id) - - def page_func(page_num): - api_response = self._download_json( - f'https://api.tunein.com/profiles/{podcast_id}/contents', podcast_id, - note=f'Downloading page {page_num + 1}', query={ - 'filter': 't:free', - 'offset': page_num * self._PAGE_SIZE, - 'limit': self._PAGE_SIZE, - }) - - return [ - self.url_result( - f'https://tunein.com/podcasts/{podcast_id}?topicId={episode["GuideId"][1:]}', - TuneInPodcastEpisodeIE, title=episode.get('Title')) - for episode in api_response['Items']] - - entries = OnDemandPagedList(page_func, self._PAGE_SIZE) - return self.playlist_result( - entries, playlist_id=podcast_id, title=traverse_obj(metadata, ('profiles', podcast_id, 'title')), - description=traverse_obj(metadata, ('profiles', podcast_id, 'description'))) - - -class TuneInPodcastEpisodeIE(TuneInBaseIE): - _VALID_URL = TuneInBaseIE._VALID_URL_BASE + r'/podcasts/(?:[^?&]+-)?(?Pp\d+)/?\?topicId=(?P\w\d+)' - - _TESTS = [{ - 'url': 'https://tunein.com/podcasts/Technology-Podcasts/Artificial-Intelligence-p1153019/?topicId=236404354', - 'info_dict': { - 'id': 't236404354', - 'title': '#351 – MrBeast: Future of YouTube, Twitter, TikTok, and Instagram', - 'description': 'md5:2784533b98f8ac45c0820b1e4a8d8bb2', - 'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+', - 'timestamp': 1673458571, - 'upload_date': '20230111', - 'series_id': 'p1153019', - 'series': 'Lex Fridman Podcast', - 'ext': 'mp3', - }, - }] - - def _real_extract(self, url): - podcast_id, episode_id = self._match_valid_url(url).group('podcast_id', 'id') - episode_id = f't{episode_id}' - - webpage = self._download_webpage(url, episode_id) - metadata = self._extract_metadata(webpage, episode_id) - - formats, subtitles = self._extract_formats_and_subtitles(episode_id) - return { - 'id': episode_id, - 'title': traverse_obj(metadata, ('profiles', episode_id, 'title')), - 'description': traverse_obj(metadata, ('profiles', episode_id, 'description')), - 'thumbnail': traverse_obj(metadata, ('profiles', episode_id, 'image')), - 'timestamp': parse_iso8601( - traverse_obj(metadata, ('profiles', episode_id, 'actions', 'play', 'publishTime'))), - 'series_id': podcast_id, - 'series': traverse_obj(metadata, ('profiles', podcast_id, 'title')), - 'formats': formats, - 'subtitles': subtitles, - } + return self.url_result( + f'https://tunein.com/{kind}/?{kind}id={embed_id[1:]}') class TuneInShortenerIE(InfoExtractor): - _WORKING = False IE_NAME = 'tunein:shortener' IE_DESC = False # Do not list - _VALID_URL = r'https?://tun\.in/(?P[A-Za-z0-9]+)' - + _VALID_URL = r'https?://tun\.in/(?P[^/?#]+)' _TESTS = [{ - # test redirection 'url': 'http://tun.in/ser7s', 'info_dict': { 'id': 's34682', 'title': str, - 'description': 'md5:d6d0b89063fd68d529fa7058ee98619b', - 'thumbnail': r're:https?://cdn-profiles\.tunein\.com/.+', - 'location': 'Seattle-Tacoma, US', 'ext': 'mp3', + 'alt_title': 'World Class Jazz', + 'channel_follower_count': int, + 'description': 'md5:d6d0b89063fd68d529fa7058ee98619b', + 'location': 'Seattle-Tacoma, US', 'live_status': 'is_live', + 'thumbnail': r're:https?://.+', }, - 'params': { - 'skip_download': True, # live stream + 'params': {'skip_download': 'Livestream'}, + }, { + 'url': 'http://tun.in/tqeeFw', + 'info_dict': { + 'id': 't236404354', + 'title': str, + 'ext': 'mp3', + 'alt_title': 'Technology Podcasts >', + 'cast': 'count:1', + 'description': 'md5:1029895354ef073ff00f20b82eb6eb71', + 'display_id': '236404354', + 'duration': 8330, + 'series': 'Lex Fridman Podcast', + 'series_id': 'p1153019', + 'thumbnail': r're:https?://.+', + 'timestamp': 1673458571, + 'upload_date': '20230111', }, + 'params': {'skip_download': 'Livestream'}, + }, { + 'url': 'http://tun.in/pei6i', + 'info_dict': { + 'id': 'p14', + 'title': 'BBC News', + }, + 'playlist_mincount': 35, }] def _real_extract(self, url): redirect_id = self._match_id(url) # The server doesn't support HEAD requests - urlh = self._request_webpage( - url, redirect_id, note='Downloading redirect page') + urlh = self._request_webpage(url, redirect_id, 'Downloading redirect page') + parsed = urllib.parse.urlparse(urlh.url) - url = urlh.url - url_parsed = urllib.parse.urlparse(url) - if url_parsed.port == 443: - url = url_parsed._replace(netloc=url_parsed.hostname).url - - self.to_screen(f'Following redirect: {url}') - return self.url_result(url) + return self.url_result( + urllib.parse.urlunparse(parsed._replace(netloc=parsed.hostname)))