From f123cc83b3aea45053f5fa1d9141048b01fc2774 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 5 May 2025 10:03:07 -0500 Subject: [PATCH 01/12] [ie/wat.tv] Improve error handling (#13111) Closes #8191 Authored by: bashonly --- yt_dlp/extractor/wat.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/wat.py b/yt_dlp/extractor/wat.py index 03bac66ac..c1c3af800 100644 --- a/yt_dlp/extractor/wat.py +++ b/yt_dlp/extractor/wat.py @@ -2,9 +2,11 @@ from ..utils import ( ExtractorError, int_or_none, + join_nonempty, try_get, unified_strdate, ) +from ..utils.traversal import traverse_obj class WatIE(InfoExtractor): @@ -70,8 +72,14 @@ def _real_extract(self, url): error_desc = video_info.get('error_desc') if error_desc: - if video_info.get('error_code') == 'GEOBLOCKED': + error_code = video_info.get('error_code') + if error_code == 'GEOBLOCKED': self.raise_geo_restricted(error_desc, video_info.get('geoList')) + elif error_code == 'DELIVERY_ERROR': + if traverse_obj(video_data, ('delivery', 'code')) == 500: + self.report_drm(video_id) + error_desc = join_nonempty( + error_desc, traverse_obj(video_data, ('delivery', 'error', {str})), delim=': ') raise ExtractorError(error_desc, expected=True) title = video_info['title'] From b26bc32579c00ef579d75a835807ccc87d20ee0a Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 6 May 2025 15:32:41 -0500 Subject: [PATCH 02/12] [ie/nytimesarticle] Fix extraction (#13104) Closes #13098 Authored by: bashonly --- yt_dlp/extractor/nytimes.py | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/nytimes.py b/yt_dlp/extractor/nytimes.py index a97add71a..3472a2159 100644 --- a/yt_dlp/extractor/nytimes.py +++ b/yt_dlp/extractor/nytimes.py @@ -181,6 +181,7 @@ class NYTimesArticleIE(NYTimesBaseIE): 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'duration': 119.0, }, + 'skip': 'HTTP Error 500: Internal Server Error', }, { # article with audio and no video 'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html', @@ -190,13 +191,14 @@ class NYTimesArticleIE(NYTimesBaseIE): 'ext': 'mp3', 'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?', 'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e', - 'timestamp': 1695960700, + 'timestamp': 1696008129, 'upload_date': '20230929', - 'creator': 'Stephanie Nolen, Natalija Gormalova', + 'creators': ['Stephanie Nolen', 'Natalija Gormalova'], 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'duration': 1322, }, }, { + # lede_media_block already has sourceId 'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html', 'md5': '3eb5ddb1d6f86254fe4f233826778737', 'info_dict': { @@ -207,7 +209,7 @@ class NYTimesArticleIE(NYTimesBaseIE): 'timestamp': 1701290997, 'upload_date': '20231129', 'uploader': 'By The New York Times', - 'creator': 'Katie Rogers', + 'creators': ['Katie Rogers'], 'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg', 'duration': 97.631, }, @@ -222,10 +224,22 @@ class NYTimesArticleIE(NYTimesBaseIE): 'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink', 'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d', 'upload_date': '20231202', - 'creator': 'Emily Steel, Sydney Ember', + 'creators': ['Emily Steel', 'Sydney Ember'], 'timestamp': 1701511264, }, 'playlist_count': 3, + }, { + # lede_media_block does not have sourceId + 'url': 'https://www.nytimes.com/2025/04/30/well/move/hip-mobility-routine.html', + 'info_dict': { + 'id': 'hip-mobility-routine', + 'title': 'Tight Hips? These Moves Can Help.', + 'description': 'Sitting all day is hard on your hips. Try this simple routine for better mobility.', + 'creators': ['Alyssa Ages', 'Theodore Tae'], + 'timestamp': 1746003629, + 'upload_date': '20250430', + }, + 'playlist_count': 7, }, { 'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html', 'only_matching': True, @@ -256,14 +270,18 @@ def _extract_content_from_block(self, block): def _real_extract(self, url): page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) + webpage = self._download_webpage(url, page_id, impersonate=True) art_json = self._search_json( r'window\.__preloadedData\s*=', webpage, 'media details', page_id, transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article'] + content = art_json['sprinkledBody']['content'] - blocks = traverse_obj(art_json, ( - 'sprinkledBody', 'content', ..., ('ledeMedia', None), - lambda _, v: v['__typename'] in ('Video', 'Audio'))) + blocks = [] + block_filter = lambda k, v: k == 'media' and v['__typename'] in ('Video', 'Audio') + if lede_media_block := traverse_obj(content, (..., 'ledeMedia', block_filter, any)): + lede_media_block.setdefault('sourceId', art_json.get('sourceId')) + blocks.append(lede_media_block) + blocks.extend(traverse_obj(content, (..., block_filter))) if not blocks: raise ExtractorError('Unable to extract any media blocks from webpage') @@ -273,8 +291,7 @@ def _real_extract(self, url): 'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}), get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage), 'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})), - 'creator': ', '.join( - traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))), # TODO: change to 'creators' (list) + 'creators': traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName', {str})), 'thumbnails': self._extract_thumbnails(traverse_obj( art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))), } From ea8498ed534642dd7e925961b97b934987142fd3 Mon Sep 17 00:00:00 2001 From: diman8 Date: Sat, 10 May 2025 10:53:59 +0200 Subject: [PATCH 03/12] [ie/SVTPage] Fix extractor (#12957) Closes #13142 Authored by: diman8 --- yt_dlp/extractor/svt.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py index b5df2e1a1..6a72f8d42 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -471,8 +471,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) title = self._og_search_title(webpage) - urql_state = self._search_json( - r'window\.svt\.(?:nyh\.)?urqlState\s*=', webpage, 'json data', display_id) + urql_state = self._search_json(r'urqlState\s*[=:]', webpage, 'json data', display_id) data = traverse_obj(urql_state, (..., 'data', {str}, {json.loads}), get_all=False) or {} From ded11ebc9afba6ba33923375103e9be2d7c804e7 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 10 May 2025 17:33:57 -0500 Subject: [PATCH 04/12] [ie/youtube] Extract `media_type` for all videos (#13136) Authored by: bashonly --- yt_dlp/extractor/youtube/_clip.py | 2 ++ yt_dlp/extractor/youtube/_redirect.py | 1 + yt_dlp/extractor/youtube/_video.py | 38 +++++++++++++++++++++++---- 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/youtube/_clip.py b/yt_dlp/extractor/youtube/_clip.py index 7d063700e..1a708cd01 100644 --- a/yt_dlp/extractor/youtube/_clip.py +++ b/yt_dlp/extractor/youtube/_clip.py @@ -37,6 +37,7 @@ class YoutubeClipIE(YoutubeTabBaseInfoExtractor): 'chapters': 'count:20', 'comment_count': int, 'heatmap': 'count:100', + 'media_type': 'clip', }, }] @@ -59,6 +60,7 @@ def _real_extract(self, url): 'url': f'https://www.youtube.com/watch?v={video_id}', 'ie_key': YoutubeIE.ie_key(), 'id': clip_id, + 'media_type': 'clip', 'section_start': int(clip_data['startTimeMs']) / 1000, 'section_end': int(clip_data['endTimeMs']) / 1000, '_format_sort_fields': ( # https protocol is prioritized for ffmpeg compatibility diff --git a/yt_dlp/extractor/youtube/_redirect.py b/yt_dlp/extractor/youtube/_redirect.py index 1908df124..14e565b42 100644 --- a/yt_dlp/extractor/youtube/_redirect.py +++ b/yt_dlp/extractor/youtube/_redirect.py @@ -35,6 +35,7 @@ class YoutubeYtBeIE(YoutubeBaseInfoExtractor): 'duration': 59, 'comment_count': int, 'channel_follower_count': int, + 'media_type': 'short', }, 'params': { 'noplaylist': True, diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 872b09b21..548e3aa93 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -376,6 +376,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Afrojack', 'uploader_url': 'https://www.youtube.com/@Afrojack', 'uploader_id': '@Afrojack', + 'media_type': 'video', }, 'params': { 'youtube_include_dash_manifest': True, @@ -413,10 +414,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1401991663, + 'media_type': 'video', }, }, { - 'note': 'Age-gate video with embed allowed in public site', + 'note': 'Formerly an age-gate video with embed allowed in public site', 'url': 'https://youtube.com/watch?v=HsUATh_Nc2U', 'info_dict': { 'id': 'HsUATh_Nc2U', @@ -424,8 +426,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Godzilla 2 (Official Video)', 'description': 'md5:bf77e03fcae5529475e500129b05668a', 'upload_date': '20200408', - 'age_limit': 18, - 'availability': 'needs_auth', + 'age_limit': 0, + 'availability': 'public', 'channel_id': 'UCYQT13AtrJC0gsM1far_zJg', 'channel': 'FlyingKitty', 'channel_url': 'https://www.youtube.com/channel/UCYQT13AtrJC0gsM1far_zJg', @@ -443,8 +445,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@FlyingKitty900', 'comment_count': int, 'channel_is_verified': True, + 'media_type': 'video', }, - 'skip': 'Age-restricted; requires authentication', }, { 'note': 'Age-gate video embedable only with clientScreen=EMBED', @@ -507,6 +509,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Herr Lurik', 'uploader_url': 'https://www.youtube.com/@HerrLurik', 'uploader_id': '@HerrLurik', + 'media_type': 'video', }, }, { @@ -546,6 +549,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'deadmau5', 'uploader_url': 'https://www.youtube.com/@deadmau5', 'uploader_id': '@deadmau5', + 'media_type': 'video', }, 'expected_warnings': [ 'DASH manifest missing', @@ -581,6 +585,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@Olympics', 'channel_is_verified': True, 'timestamp': 1440707674, + 'media_type': 'livestream', }, 'params': { 'skip_download': 'requires avconv', @@ -615,6 +620,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@AllenMeow', 'uploader_id': '@AllenMeow', 'timestamp': 1299776999, + 'media_type': 'video', }, }, # url_encoded_fmt_stream_map is empty string @@ -809,6 +815,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'like_count': int, 'age_limit': 0, 'channel_follower_count': int, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -868,6 +875,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@BKCHarvard', 'uploader_url': 'https://www.youtube.com/@BKCHarvard', 'timestamp': 1422422076, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -904,6 +912,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1447987198, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -968,6 +977,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'comment_count': int, 'channel_is_verified': True, 'timestamp': 1484761047, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1070,6 +1080,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'tags': 'count:11', 'live_status': 'not_live', 'channel_follower_count': int, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1124,6 +1135,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@ElevageOrVert', 'uploader_id': '@ElevageOrVert', 'timestamp': 1497343210, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1163,6 +1175,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1377976349, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1207,6 +1220,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_follower_count': int, 'uploader': 'The Cinematic Orchestra', 'comment_count': int, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1275,6 +1289,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/@walkaroundjapan7124', 'uploader_id': '@walkaroundjapan7124', 'timestamp': 1605884416, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -1371,6 +1386,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1395685455, + 'media_type': 'video', }, 'params': {'format': 'mhtml', 'skip_download': True}, }, { # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939) @@ -1401,6 +1417,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@LeonNguyen', 'heatmap': 'count:100', 'timestamp': 1641170939, + 'media_type': 'video', }, }, { # date text is premiered video, ensure upload date in UTC (published 1641172509) @@ -1434,6 +1451,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_is_verified': True, 'heatmap': 'count:100', 'timestamp': 1641172509, + 'media_type': 'video', }, }, { # continuous livestream. @@ -1495,6 +1513,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Lesmiscore', 'uploader_url': 'https://www.youtube.com/@lesmiscore', 'timestamp': 1648005313, + 'media_type': 'short', }, }, { # Prefer primary title+description language metadata by default @@ -1523,6 +1542,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@coletdjnz', 'uploader': 'cole-dlp-test-acc', 'timestamp': 1662677394, + 'media_type': 'video', }, 'params': {'skip_download': True}, }, { @@ -1551,6 +1571,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'cole-dlp-test-acc', 'timestamp': 1659073275, 'like_count': int, + 'media_type': 'video', }, 'params': {'skip_download': True, 'extractor_args': {'youtube': {'lang': ['fr']}}}, 'expected_warnings': [r'Preferring "fr" translated fields'], @@ -1587,6 +1608,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'comment_count': int, 'channel_is_verified': True, 'heatmap': 'count:100', + 'media_type': 'video', }, 'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'}, }, { @@ -1687,6 +1709,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'comment_count': int, 'channel_is_verified': True, 'heatmap': 'count:100', + 'media_type': 'video', }, 'params': { 'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}}, @@ -1719,6 +1742,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'channel_follower_count': int, 'categories': ['People & Blogs'], 'tags': [], + 'media_type': 'short', }, }, ] @@ -1754,6 +1778,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': '@ChristopherSykesDocumentaries', 'heatmap': 'count:100', 'timestamp': 1211825920, + 'media_type': 'video', }, 'params': { 'skip_download': True, @@ -3787,7 +3812,10 @@ def is_bad_format(fmt): 'tags': keywords, 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'), 'live_status': live_status, - 'media_type': 'livestream' if get_first(video_details, 'isLiveContent') else None, + 'media_type': ( + 'livestream' if get_first(video_details, 'isLiveContent') + else 'short' if get_first(microformats, 'isShortsEligible') + else 'video'), 'release_timestamp': live_start_time, '_format_sort_fields': ( # source_preference is lower for potentially damaged formats 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec', 'channels', 'acodec', 'lang', 'proto'), From d880e060803ae8ed5a047e578cca01e1f0e630ce Mon Sep 17 00:00:00 2001 From: v3DJG6GL <72495210+v3DJG6GL@users.noreply.github.com> Date: Sun, 11 May 2025 00:37:04 +0200 Subject: [PATCH 05/12] [ie/playsuisse] Improve metadata extraction (#12466) Authored by: v3DJG6GL --- yt_dlp/extractor/playsuisse.py | 66 ++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/yt_dlp/extractor/playsuisse.py b/yt_dlp/extractor/playsuisse.py index 59231d840..9bf5765fa 100644 --- a/yt_dlp/extractor/playsuisse.py +++ b/yt_dlp/extractor/playsuisse.py @@ -7,11 +7,13 @@ from ..utils import ( ExtractorError, int_or_none, + join_nonempty, parse_qs, traverse_obj, update_url_query, urlencode_postdata, ) +from ..utils.traversal import unpack class PlaySuisseIE(InfoExtractor): @@ -26,12 +28,12 @@ class PlaySuisseIE(InfoExtractor): { # episode in a series 'url': 'https://www.playsuisse.ch/watch/763182?episodeId=763211', - 'md5': '82df2a470b2dfa60c2d33772a8a60cf8', + 'md5': 'e20d1ede6872a03b41905ca1060a1ef2', 'info_dict': { 'id': '763211', 'ext': 'mp4', 'title': 'Knochen', - 'description': 'md5:8ea7a8076ba000cd9e8bc132fd0afdd8', + 'description': 'md5:3bdd80e2ce20227c47aab1df2a79a519', 'duration': 3344, 'series': 'Wilder', 'season': 'Season 1', @@ -42,24 +44,33 @@ class PlaySuisseIE(InfoExtractor): }, }, { # film - 'url': 'https://www.playsuisse.ch/watch/808675', - 'md5': '818b94c1d2d7c4beef953f12cb8f3e75', + 'url': 'https://www.playsuisse.ch/detail/2573198', + 'md5': '1f115bb0a5191477b1a5771643a4283d', 'info_dict': { - 'id': '808675', + 'id': '2573198', 'ext': 'mp4', - 'title': 'Der Läufer', - 'description': 'md5:9f61265c7e6dcc3e046137a792b275fd', - 'duration': 5280, + 'title': 'Azor', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'genres': ['Fiction'], + 'creators': ['Andreas Fontana'], + 'cast': ['Fabrizio Rongione', 'Stéphanie Cléau', 'Gilles Privat', 'Alexandre Trocki'], + 'location': 'France; Argentine', + 'release_year': 2021, + 'duration': 5981, 'thumbnail': 're:https://playsuisse-img.akamaized.net/', }, }, { # series (treated as a playlist) 'url': 'https://www.playsuisse.ch/detail/1115687', 'info_dict': { - 'description': 'md5:e4a2ae29a8895823045b5c3145a02aa3', 'id': '1115687', 'series': 'They all came out to Montreux', 'title': 'They all came out to Montreux', + 'description': 'md5:0fefd8c5b4468a0bb35e916887681520', + 'genres': ['Documentary'], + 'creators': ['Oliver Murray'], + 'location': 'Switzerland', + 'release_year': 2021, }, 'playlist': [{ 'info_dict': { @@ -120,6 +131,12 @@ class PlaySuisseIE(InfoExtractor): id name description + descriptionLong + year + contentTypes + directors + mainCast + productionCountries duration episodeNumber seasonNumber @@ -215,9 +232,7 @@ def _perform_login(self, username, password): if not self._ID_TOKEN: raise ExtractorError('Login failed') - def _get_media_data(self, media_id): - # NOTE In the web app, the "locale" header is used to switch between languages, - # However this doesn't seem to take effect when passing the header here. + def _get_media_data(self, media_id, locale=None): response = self._download_json( 'https://www.playsuisse.ch/api/graphql', media_id, data=json.dumps({ @@ -225,7 +240,7 @@ def _get_media_data(self, media_id): 'query': self._GRAPHQL_QUERY, 'variables': {'assetId': media_id}, }).encode(), - headers={'Content-Type': 'application/json', 'locale': 'de'}) + headers={'Content-Type': 'application/json', 'locale': locale or 'de'}) return response['data']['assetV2'] @@ -234,7 +249,7 @@ def _real_extract(self, url): self.raise_login_required(method='password') media_id = self._match_id(url) - media_data = self._get_media_data(media_id) + media_data = self._get_media_data(media_id, traverse_obj(parse_qs(url), ('locale', 0))) info = self._extract_single(media_data) if media_data.get('episodes'): info.update({ @@ -257,15 +272,22 @@ def _extract_single(self, media_data): self._merge_subtitles(subs, target=subtitles) return { - 'id': media_data['id'], - 'title': media_data.get('name'), - 'description': media_data.get('description'), 'thumbnails': thumbnails, - 'duration': int_or_none(media_data.get('duration')), 'formats': formats, 'subtitles': subtitles, - 'series': media_data.get('seriesName'), - 'season_number': int_or_none(media_data.get('seasonNumber')), - 'episode': media_data.get('name') if media_data.get('episodeNumber') else None, - 'episode_number': int_or_none(media_data.get('episodeNumber')), + **traverse_obj(media_data, { + 'id': ('id', {str}), + 'title': ('name', {str}), + 'description': (('descriptionLong', 'description'), {str}, any), + 'genres': ('contentTypes', ..., {str}), + 'creators': ('directors', ..., {str}), + 'cast': ('mainCast', ..., {str}), + 'location': ('productionCountries', ..., {str}, all, {unpack(join_nonempty, delim='; ')}, filter), + 'release_year': ('year', {str}, {lambda x: x[:4]}, {int_or_none}), + 'duration': ('duration', {int_or_none}), + 'series': ('seriesName', {str}), + 'season_number': ('seasonNumber', {int_or_none}), + 'episode': ('name', {str}, {lambda x: x if media_data['episodeNumber'] is not None else None}), + 'episode_number': ('episodeNumber', {int_or_none}), + }), } From 7a7b85c9014d96421e18aa7ea5f4c1bee5ceece0 Mon Sep 17 00:00:00 2001 From: doe1080 <98906116+doe1080@users.noreply.github.com> Date: Sun, 11 May 2025 07:46:28 +0900 Subject: [PATCH 06/12] [ie/niconico:live] Fix extractor (#13045) Authored by: doe1080 --- yt_dlp/extractor/niconico.py | 44 ++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 52ba6c417..fc050c383 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -16,6 +16,7 @@ determine_ext, float_or_none, int_or_none, + parse_bitrate, parse_duration, parse_iso8601, parse_qs, @@ -23,7 +24,6 @@ qualities, remove_start, str_or_none, - try_get, unescapeHTML, unified_timestamp, update_url_query, @@ -785,8 +785,6 @@ class NiconicoLiveIE(NiconicoBaseIE): 'only_matching': True, }] - _KNOWN_LATENCY = ('high', 'low') - def _real_extract(self, url): video_id = self._match_id(url) webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id) @@ -802,22 +800,19 @@ def _real_extract(self, url): }) hostname = remove_start(urllib.parse.urlparse(urlh.url).hostname, 'sp.') - latency = try_get(self._configuration_arg('latency'), lambda x: x[0]) - if latency not in self._KNOWN_LATENCY: - latency = 'high' ws = self._request_webpage( Request(ws_url, headers={'Origin': f'https://{hostname}'}), video_id=video_id, note='Connecting to WebSocket server') - self.write_debug('[debug] Sending HLS server request') + self.write_debug('Sending HLS server request') ws.send(json.dumps({ 'type': 'startWatching', 'data': { 'stream': { 'quality': 'abr', - 'protocol': 'hls+fmp4', - 'latency': latency, + 'protocol': 'hls', + 'latency': 'high', 'accessRightMethod': 'single_cookie', 'chasePlay': False, }, @@ -881,18 +876,29 @@ def _real_extract(self, url): for cookie in cookies: self._set_cookie( cookie['domain'], cookie['name'], cookie['value'], - expire_time=unified_timestamp(cookie['expires']), path=cookie['path'], secure=cookie['secure']) + expire_time=unified_timestamp(cookie.get('expires')), path=cookie['path'], secure=cookie['secure']) + + fmt_common = { + 'live_latency': 'high', + 'origin': hostname, + 'protocol': 'niconico_live', + 'video_id': video_id, + 'ws': ws, + } + q_iter = (q for q in qualities[1:] if not q.startswith('audio_')) # ignore initial 'abr' + a_map = {96: 'audio_low', 192: 'audio_high'} formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True) - for fmt, q in zip(formats, reversed(qualities[1:])): - fmt.update({ - 'format_id': q, - 'protocol': 'niconico_live', - 'ws': ws, - 'video_id': video_id, - 'live_latency': latency, - 'origin': hostname, - }) + for fmt in formats: + if fmt.get('acodec') == 'none': + fmt['format_id'] = next(q_iter, fmt['format_id']) + elif fmt.get('vcodec') == 'none': + abr = parse_bitrate(fmt['url'].lower()) + fmt.update({ + 'abr': abr, + 'format_id': a_map.get(abr, fmt['format_id']), + }) + fmt.update(fmt_common) return { 'id': video_id, From 464c84fedf78eef822a431361155f108b5df96d7 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 10 May 2025 18:15:12 -0500 Subject: [PATCH 07/12] [ie/amcnetworks] Fix extractor (#13147) Authored by: bashonly --- yt_dlp/extractor/amcnetworks.py | 140 ++++++-------------------------- 1 file changed, 27 insertions(+), 113 deletions(-) diff --git a/yt_dlp/extractor/amcnetworks.py b/yt_dlp/extractor/amcnetworks.py index 15a86e245..3817f3516 100644 --- a/yt_dlp/extractor/amcnetworks.py +++ b/yt_dlp/extractor/amcnetworks.py @@ -1,32 +1,24 @@ -import re - -from .theplatform import ThePlatformIE -from ..utils import ( - int_or_none, - parse_age_limit, - try_get, - update_url_query, -) +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor +from ..utils.traversal import traverse_obj -class AMCNetworksIE(ThePlatformIE): # XXX: Do not subclass from concrete IE - _VALID_URL = r'https?://(?:www\.)?(?Pamc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)' +class AMCNetworksIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P(?:movies|shows(?:/[^/?#]+)+)/[^/?#&]+)' _TESTS = [{ - 'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631', + 'url': 'https://www.amc.com/shows/dark-winds/videos/dark-winds-a-look-at-season-3--1072027', 'info_dict': { - 'id': '4Lq1dzOnZGt0', + 'id': '6369261343112', 'ext': 'mp4', - 'title': "The Graham Norton Show - Season 28 - Tina Fey's Adorable Airline-Themed Family Dinner", - 'description': "It turns out child stewardesses are very generous with the wine! All-new episodes of 'The Graham Norton Show' premiere Fridays at 11/10c on BBC America.", - 'upload_date': '20201120', - 'timestamp': 1605904350, - 'uploader': 'AMCN', + 'title': 'Dark Winds: A Look at Season 3', + 'uploader_id': '6240731308001', + 'duration': 176.427, + 'thumbnail': r're:https://[^/]+\.boltdns\.net/.+/image\.jpg', + 'tags': [], + 'timestamp': 1740414792, + 'upload_date': '20250224', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': '404 Not Found', + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', 'only_matching': True, @@ -52,96 +44,18 @@ class AMCNetworksIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', 'only_matching': True, }] - _REQUESTOR_ID_MAP = { - 'amc': 'AMC', - 'bbcamerica': 'BBCA', - 'ifc': 'IFC', - 'sundancetv': 'SUNDANCE', - 'wetv': 'WETV', - } def _real_extract(self, url): - site, display_id = self._match_valid_url(url).groups() - requestor_id = self._REQUESTOR_ID_MAP[site] - page_data = self._download_json( - f'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/{requestor_id.lower()}/url/{display_id}', - display_id)['data'] - properties = page_data.get('properties') or {} - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + initial_data = self._search_json( + r'window\.initialData\s*=\s*JSON\.parse\(String\.raw`', webpage, 'initial data', display_id) + video_id = traverse_obj(initial_data, ('initialData', 'properties', 'videoId', {str})) + if not video_id: # All locked videos are now DRM-protected + self.report_drm(display_id) + account_id = initial_data['config']['brightcove']['accountId'] + player_id = initial_data['config']['brightcove']['playerId'] - video_player_count = 0 - try: - for v in page_data['children']: - if v.get('type') == 'video-player': - release_pid = v['properties']['currentVideo']['meta']['releasePid'] - tp_path = 'M_UwQC/' + release_pid - media_url = 'https://link.theplatform.com/s/' + tp_path - video_player_count += 1 - except KeyError: - pass - if video_player_count > 1: - self.report_warning( - f'The JSON data has {video_player_count} video players. Only one will be extracted') - - # Fall back to videoPid if releasePid not found. - # TODO: Fall back to videoPid if releasePid manifest uses DRM. - if not video_player_count: - tp_path = 'M_UwQC/media/' + properties['videoPid'] - media_url = 'https://link.theplatform.com/s/' + tp_path - - theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id) - info = self._parse_theplatform_metadata(theplatform_metadata) - video_id = theplatform_metadata['pid'] - title = theplatform_metadata['title'] - rating = try_get( - theplatform_metadata, lambda x: x['ratings'][0]['rating']) - video_category = properties.get('videoCategory') - if video_category and video_category.endswith('-Auth'): - resource = self._get_mvpd_resource( - requestor_id, title, video_id, rating) - query['auth'] = self._extract_mvpd_auth( - url, video_id, requestor_id, resource) - media_url = update_url_query(media_url, query) - formats, subtitles = self._extract_theplatform_smil( - media_url, video_id) - - thumbnails = [] - thumbnail_urls = [properties.get('imageDesktop')] - if 'thumbnail' in info: - thumbnail_urls.append(info.pop('thumbnail')) - for thumbnail_url in thumbnail_urls: - if not thumbnail_url: - continue - mobj = re.search(r'(\d+)x(\d+)', thumbnail_url) - thumbnails.append({ - 'url': thumbnail_url, - 'width': int(mobj.group(1)) if mobj else None, - 'height': int(mobj.group(2)) if mobj else None, - }) - - info.update({ - 'age_limit': parse_age_limit(rating), - 'formats': formats, - 'id': video_id, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - }) - ns_keys = theplatform_metadata.get('$xmlns', {}).keys() - if ns_keys: - ns = next(iter(ns_keys)) - episode = theplatform_metadata.get(ns + '$episodeTitle') or None - episode_number = int_or_none( - theplatform_metadata.get(ns + '$episode')) - season_number = int_or_none( - theplatform_metadata.get(ns + '$season')) - series = theplatform_metadata.get(ns + '$show') or None - info.update({ - 'episode': episode, - 'episode_number': episode_number, - 'season_number': season_number, - 'series': series, - }) - return info + return self.url_result( + f'https://players.brightcove.net/{account_id}/{player_id}_default/index.html?videoId={video_id}', + BrightcoveNewIE, video_id) From 7dbb47f84f0ee1266a3a01f58c9bc4c76d76794a Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 10 May 2025 18:22:38 -0500 Subject: [PATCH 08/12] [ie/cartoonnetwork] Remove extractor (#13148) Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/cartoonnetwork.py | 59 ------------------------------ 2 files changed, 60 deletions(-) delete mode 100644 yt_dlp/extractor/cartoonnetwork.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index bb1c3db16..08bca319d 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -338,7 +338,6 @@ from .canalplus import CanalplusIE from .canalsurmas import CanalsurmasIE from .caracoltv import CaracolTvPlayIE -from .cartoonnetwork import CartoonNetworkIE from .cbc import ( CBCIE, CBCGemIE, diff --git a/yt_dlp/extractor/cartoonnetwork.py b/yt_dlp/extractor/cartoonnetwork.py deleted file mode 100644 index 1749a008a..000000000 --- a/yt_dlp/extractor/cartoonnetwork.py +++ /dev/null @@ -1,59 +0,0 @@ -from .turner import TurnerBaseIE -from ..utils import int_or_none - - -class CartoonNetworkIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P[^/?#]+)-(?:clip|episode)\.html' - _TEST = { - 'url': 'https://www.cartoonnetwork.com/video/ben-10/how-to-draw-upgrade-episode.html', - 'info_dict': { - 'id': '6e3375097f63874ebccec7ef677c1c3845fa850e', - 'ext': 'mp4', - 'title': 'How to Draw Upgrade', - 'description': 'md5:2061d83776db7e8be4879684eefe8c0f', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False): - metadata_re = '' - if content_re: - metadata_re = r'|video_metadata\.content_' + content_re - return self._search_regex( - rf'(?:_cnglobal\.currentVideo\.{global_re}{metadata_re})\s*=\s*"({value_re})";', - webpage, name, fatal=fatal) - - media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True) - title = find_field('episodeTitle', 'title', '(?:episodeName|name)', fatal=True) - - info = self._extract_ngtv_info( - media_id, {'networkId': 'cartoonnetwork'}, { - 'url': url, - 'site_name': 'CartoonNetwork', - 'auth_required': find_field('authType', 'auth type') != 'unauth', - }) - - series = find_field( - 'propertyName', 'series', 'showName') or self._html_search_meta('partOfSeries', webpage) - info.update({ - 'id': media_id, - 'display_id': display_id, - 'title': title, - 'description': self._html_search_meta('description', webpage), - 'series': series, - 'episode': title, - }) - - for field in ('season', 'episode'): - field_name = field + 'Number' - info[field + '_number'] = int_or_none(find_field( - field_name, field + ' number', value_re=r'\d+') or self._html_search_meta(field_name, webpage)) - - return info From cbcfe6378dde33a650e3852ab17ad4503b8e008d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 10 May 2025 18:22:53 -0500 Subject: [PATCH 09/12] [ie/sprout] Remove extractor (#13149) Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/sprout.py | 61 --------------------------------- 2 files changed, 62 deletions(-) delete mode 100644 yt_dlp/extractor/sprout.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 08bca319d..ade71576c 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1963,7 +1963,6 @@ SpreakerShowIE, ) from .springboardplatform import SpringboardPlatformIE -from .sprout import SproutIE from .sproutvideo import ( SproutVideoIE, VidsIoIE, diff --git a/yt_dlp/extractor/sprout.py b/yt_dlp/extractor/sprout.py deleted file mode 100644 index 444a6c270..000000000 --- a/yt_dlp/extractor/sprout.py +++ /dev/null @@ -1,61 +0,0 @@ -from .adobepass import AdobePassIE -from ..utils import ( - int_or_none, - smuggle_url, - update_url_query, -) - - -class SproutIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?:sproutonline|universalkids)\.com/(?:watch|(?:[^/]+/)*videos)/(?P[^/?#]+)' - _TESTS = [{ - 'url': 'https://www.universalkids.com/shows/remy-and-boo/season/1/videos/robot-bike-race', - 'info_dict': { - 'id': 'bm0foJFaTKqb', - 'ext': 'mp4', - 'title': 'Robot Bike Race', - 'description': 'md5:436b1d97117cc437f54c383f4debc66d', - 'timestamp': 1606148940, - 'upload_date': '20201123', - 'uploader': 'NBCU-MPAT', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.sproutonline.com/watch/cowboy-adventure', - 'only_matching': True, - }, { - 'url': 'https://www.universalkids.com/watch/robot-bike-race', - 'only_matching': True, - }] - _GEO_COUNTRIES = ['US'] - - def _real_extract(self, url): - display_id = self._match_id(url) - mpx_metadata = self._download_json( - # http://nbcuunikidsprod.apps.nbcuni.com/networks/universalkids/content/videos/ - 'https://www.universalkids.com/_api/videos/' + display_id, - display_id)['mpxMetadata'] - media_pid = mpx_metadata['mediaPid'] - theplatform_url = 'https://link.theplatform.com/s/HNK2IC/' + media_pid - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } - if mpx_metadata.get('entitlement') == 'auth': - query['auth'] = self._extract_mvpd_auth(url, media_pid, 'sprout', 'sprout') - theplatform_url = smuggle_url( - update_url_query(theplatform_url, query), { - 'force_smil_url': True, - 'geo_countries': self._GEO_COUNTRIES, - }) - return { - '_type': 'url_transparent', - 'id': media_pid, - 'url': theplatform_url, - 'series': mpx_metadata.get('seriesName'), - 'season_number': int_or_none(mpx_metadata.get('seasonNumber')), - 'episode_number': int_or_none(mpx_metadata.get('episodeNumber')), - 'ie_key': 'ThePlatform', - } From 6839276496d8814cf16f58b637e45663467928e6 Mon Sep 17 00:00:00 2001 From: Subrat Lima Date: Sun, 11 May 2025 00:39:35 -0500 Subject: [PATCH 10/12] [ie/jiosaavn:show] Add extractor (#12803) Closes #12766 Authored by: subrat-lima --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/jiosaavn.py | 192 +++++++++++++++++++++++++------- 2 files changed, 155 insertions(+), 38 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index ade71576c..7b31e49ef 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -929,6 +929,7 @@ from .jiosaavn import ( JioSaavnAlbumIE, JioSaavnPlaylistIE, + JioSaavnShowIE, JioSaavnSongIE, ) from .joj import JojIE diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py index 030fe686b..c9a452bd0 100644 --- a/yt_dlp/extractor/jiosaavn.py +++ b/yt_dlp/extractor/jiosaavn.py @@ -5,19 +5,24 @@ from .common import InfoExtractor from ..utils import ( InAdvancePagedList, + ISO639Utils, clean_html, int_or_none, make_archive_id, smuggle_url, + unified_strdate, + unified_timestamp, unsmuggle_url, url_basename, url_or_none, urlencode_postdata, + urljoin, ) from ..utils.traversal import traverse_obj class JioSaavnBaseIE(InfoExtractor): + _URL_BASE_RE = r'https?://(?:www\.)?(?:jio)?saavn\.com' _API_URL = 'https://www.jiosaavn.com/api.php' _VALID_BITRATES = {'16', '32', '64', '128', '320'} @@ -30,16 +35,20 @@ def requested_bitrates(self): f'Valid bitrates are: {", ".join(sorted(self._VALID_BITRATES, key=int))}') return requested_bitrates - def _extract_formats(self, song_data): + def _extract_formats(self, item_data): + # Show/episode JSON data has a slightly different structure than song JSON data + if media_url := traverse_obj(item_data, ('more_info', 'encrypted_media_url', {str})): + item_data.setdefault('encrypted_media_url', media_url) + for bitrate in self.requested_bitrates: media_data = self._download_json( - self._API_URL, song_data['id'], + self._API_URL, item_data['id'], f'Downloading format info for {bitrate}', fatal=False, data=urlencode_postdata({ '__call': 'song.generateAuthToken', '_format': 'json', 'bitrate': bitrate, - 'url': song_data['encrypted_media_url'], + 'url': item_data['encrypted_media_url'], })) if not traverse_obj(media_data, ('auth_url', {url_or_none})): self.report_warning(f'Unable to extract format info for {bitrate}') @@ -53,24 +62,6 @@ def _extract_formats(self, song_data): 'vcodec': 'none', } - def _extract_song(self, song_data, url=None): - info = traverse_obj(song_data, { - 'id': ('id', {str}), - 'title': ('song', {clean_html}), - 'album': ('album', {clean_html}), - 'thumbnail': ('image', {url_or_none}, {lambda x: re.sub(r'-\d+x\d+\.', '-500x500.', x)}), - 'duration': ('duration', {int_or_none}), - 'view_count': ('play_count', {int_or_none}), - 'release_year': ('year', {int_or_none}), - 'artists': ('primary_artists', {lambda x: x.split(', ') if x else None}), - 'webpage_url': ('perma_url', {url_or_none}), - }) - if webpage_url := info.get('webpage_url') or url: - info['display_id'] = url_basename(webpage_url) - info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])] - - return info - def _call_api(self, type_, token, note='API', params={}): return self._download_json( self._API_URL, token, f'Downloading {note} JSON', f'Unable to download {note} JSON', @@ -84,6 +75,81 @@ def _call_api(self, type_, token, note='API', params={}): **params, }) + @staticmethod + def _extract_common_info(data): + return traverse_obj(data, { + 'id': ('id', {str}), + 'thumbnail': ('image', {url_or_none}, {lambda x: re.sub(r'-\d+x\d+\.', '-500x500.', x)}), + 'view_count': ('play_count', {int_or_none}), + 'release_year': ('year', {int_or_none}), + 'language': ('language', {lambda x: ISO639Utils.short2long(x.casefold()) or 'und'}), + 'webpage_url': ('perma_url', {url_or_none}), + }) + + @staticmethod + def _extract_song(song_data, url=None): + info = JioSaavnBaseIE._extract_common_info(song_data) + info.update(traverse_obj(song_data, { + 'title': ('song', {clean_html}), + 'album': ('album', {clean_html}), + 'duration': ('duration', {int_or_none}), + 'release_date': ('release_date', {unified_strdate}), + 'channel': ('label', {str}), + 'channel_id': ('label_id', {str}), + 'channel_url': ('label_url', {urljoin('https://www.jiosaavn.com/')}), + 'artists': ('primary_artists', {lambda x: x.split(', ') if x else None}), + })) + if webpage_url := info.get('webpage_url') or url: + info['display_id'] = url_basename(webpage_url) + info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])] + + return info + + @staticmethod + def _extract_episode(episode_data, url=None): + info = JioSaavnBaseIE._extract_common_info(episode_data) + info.update(traverse_obj(episode_data, { + 'title': ('title', {clean_html}), + 'description': ('more_info', 'description', {str}), + 'duration': ('more_info', 'duration', {int_or_none}), + 'timestamp': ('more_info', 'release_time', {unified_timestamp}), + 'channel': ('more_info', 'label', {str}), + 'channel_id': ('more_info', 'label_id', {str}), + 'channel_url': ('more_info', 'label_url', {urljoin('https://www.jiosaavn.com/')}), + 'series': ('more_info', 'show_title', {str}), + 'series_id': ('more_info', 'show_id', {str}), + 'season': ('more_info', 'season_title', {str}), + 'season_number': ('more_info', 'season_no', {int_or_none}), + 'season_id': ('more_info', 'season_id', {str}), + 'episode_number': ('more_info', 'episode_number', {int_or_none}), + 'cast': ('starring', {lambda x: x.split(', ') if x else None}), + 'artists': ('more_info', 'artistMap', 'primary_artists', ..., 'name', {str}), + })) + if webpage_url := info.get('webpage_url') or url: + info['display_id'] = url_basename(webpage_url) + + if featured_artists := traverse_obj(episode_data, ('featured_artists', {str})): + info.setdefault('artists', []).extend(featured_artists.split(', ')) + + return info + + def _extract_jiosaavn_result(self, url, endpoint, response_key, parse_func): + url, smuggled_data = unsmuggle_url(url) + data = traverse_obj(smuggled_data, ({ + 'id': ('id', {str}), + 'encrypted_media_url': ('encrypted_media_url', {str}), + })) + + if 'id' in data and 'encrypted_media_url' in data: + result = {'id': data['id']} + else: + # only extract metadata if this is not a url_transparent result + data = self._call_api(endpoint, self._match_id(url))[response_key][0] + result = parse_func(data, url) + + result['formats'] = list(self._extract_formats(data)) + return result + def _yield_songs(self, playlist_data): for song_data in traverse_obj(playlist_data, ('songs', lambda _, v: v['id'] and v['perma_url'])): song_info = self._extract_song(song_data) @@ -96,7 +162,7 @@ def _yield_songs(self, playlist_data): class JioSaavnSongIE(JioSaavnBaseIE): IE_NAME = 'jiosaavn:song' - _VALID_URL = r'https?://(?:www\.)?(?:jiosaavn\.com/song/[^/?#]+/|saavn\.com/s/song/(?:[^/?#]+/){3})(?P[^/?#]+)' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'(?:/song/[^/?#]+/|/s/song/(?:[^/?#]+/){3})(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk', 'md5': '3b84396d15ed9e083c3106f1fa589c04', @@ -106,12 +172,38 @@ class JioSaavnSongIE(JioSaavnBaseIE): 'ext': 'm4a', 'title': 'Leja Re', 'album': 'Leja Re', - 'thumbnail': r're:https?://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg', + 'thumbnail': r're:https?://.+/.+\.jpg', 'duration': 205, 'view_count': int, 'release_year': 2018, 'artists': ['Sandesh Shandilya', 'Dhvani Bhanushali', 'Tanishk Bagchi'], '_old_archive_ids': ['jiosaavnsong OQsEfQFVUXk'], + 'channel': 'T-Series', + 'language': 'hin', + 'channel_id': '34297', + 'channel_url': 'https://www.jiosaavn.com/label/t-series-albums/6DLuXO3VoTo_', + 'release_date': '20181124', + }, + }, { + 'url': 'https://www.jiosaavn.com/song/chuttamalle/P1FfWjZkQ0Q', + 'md5': '96296c58d6ce488a417ef0728fd2d680', + 'info_dict': { + 'id': 'O94kBTtw', + 'display_id': 'P1FfWjZkQ0Q', + 'ext': 'm4a', + 'title': 'Chuttamalle', + 'album': 'Devara Part 1 - Telugu', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'duration': 222, + 'view_count': int, + 'release_year': 2024, + 'artists': 'count:3', + '_old_archive_ids': ['jiosaavnsong P1FfWjZkQ0Q'], + 'channel': 'T-Series', + 'language': 'tel', + 'channel_id': '34297', + 'channel_url': 'https://www.jiosaavn.com/label/t-series-albums/6DLuXO3VoTo_', + 'release_date': '20240926', }, }, { 'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU', @@ -119,26 +211,50 @@ class JioSaavnSongIE(JioSaavnBaseIE): }] def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url) - song_data = traverse_obj(smuggled_data, ({ - 'id': ('id', {str}), - 'encrypted_media_url': ('encrypted_media_url', {str}), - })) + return self._extract_jiosaavn_result(url, 'song', 'songs', self._extract_song) - if 'id' in song_data and 'encrypted_media_url' in song_data: - result = {'id': song_data['id']} - else: - # only extract metadata if this is not a url_transparent result - song_data = self._call_api('song', self._match_id(url))['songs'][0] - result = self._extract_song(song_data, url) - result['formats'] = list(self._extract_formats(song_data)) - return result +class JioSaavnShowIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:show' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/shows/[^/?#]+/(?P[^/?#]{11,})/?(?:$|[?#])' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/shows/non-food-ways-to-boost-your-energy/XFMcKICOCgc_', + 'md5': '0733cd254cfe74ef88bea1eaedcf1f4f', + 'info_dict': { + 'id': 'qqzh3RKZ', + 'display_id': 'XFMcKICOCgc_', + 'ext': 'mp3', + 'title': 'Non-Food Ways To Boost Your Energy', + 'description': 'md5:26e7129644b5c6aada32b8851c3997c8', + 'episode': 'Episode 1', + 'timestamp': 1640563200, + 'series': 'Holistic Lifestyle With Neha Ranglani', + 'series_id': '52397', + 'season': 'Holistic Lifestyle With Neha Ranglani', + 'season_number': 1, + 'season_id': '61273', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'duration': 311, + 'view_count': int, + 'release_year': 2021, + 'language': 'eng', + 'channel': 'Saavn OG', + 'channel_id': '1953876', + 'episode_number': 1, + 'upload_date': '20211227', + }, + }, { + 'url': 'https://www.jiosaavn.com/shows/himesh-reshammiya/Kr8fmfSN4vo_', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self._extract_jiosaavn_result(url, 'episode', 'episodes', self._extract_episode) class JioSaavnAlbumIE(JioSaavnBaseIE): IE_NAME = 'jiosaavn:album' - _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/album/[^/?#]+/(?P[^/?#]+)' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/album/[^/?#]+/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.jiosaavn.com/album/96/buIOjYZDrNA_', 'info_dict': { @@ -158,7 +274,7 @@ def _real_extract(self, url): class JioSaavnPlaylistIE(JioSaavnBaseIE): IE_NAME = 'jiosaavn:playlist' - _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/(?:s/playlist/(?:[^/?#]+/){2}|featured/[^/?#]+/)(?P[^/?#]+)' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/(?:s/playlist/(?:[^/?#]+/){2}|featured/[^/?#]+/)(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-english/LlJ8ZWT1ibN5084vKHRj2Q__', 'info_dict': { From 317f4b8006c2c0f0f64f095b1485163ad97c9053 Mon Sep 17 00:00:00 2001 From: Subrat Lima Date: Sun, 11 May 2025 00:51:34 -0500 Subject: [PATCH 11/12] [ie/jiosaavn:show:playlist] Add extractor (#12803) Closes #12766 Authored by: subrat-lima --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/jiosaavn.py | 53 +++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 7b31e49ef..bb0b949e2 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -930,6 +930,7 @@ JioSaavnAlbumIE, JioSaavnPlaylistIE, JioSaavnShowIE, + JioSaavnShowPlaylistIE, JioSaavnSongIE, ) from .joj import JojIE diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py index c9a452bd0..c0dc96015 100644 --- a/yt_dlp/extractor/jiosaavn.py +++ b/yt_dlp/extractor/jiosaavn.py @@ -6,8 +6,10 @@ from ..utils import ( InAdvancePagedList, ISO639Utils, + OnDemandPagedList, clean_html, int_or_none, + js_to_json, make_archive_id, smuggle_url, unified_strdate, @@ -315,3 +317,54 @@ def _real_extract(self, url): return self.playlist_result(InAdvancePagedList( functools.partial(self._entries, display_id, playlist_data), total_pages, self._PAGE_SIZE), display_id, traverse_obj(playlist_data, ('listname', {str}))) + + +class JioSaavnShowPlaylistIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:show:playlist' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/shows/(?P[^#/?]+)/(?P\d+)/[^/?#]+' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/shows/talking-music/1/PjReFP-Sguk_', + 'info_dict': { + 'id': 'talking-music-1', + 'title': 'Talking Music', + }, + 'playlist_mincount': 11, + }] + _PAGE_SIZE = 10 + + def _fetch_page(self, show_id, season_id, page): + return self._call_api('show', show_id, f'show page {page}', { + 'p': page, + '__call': 'show.getAllEpisodes', + 'show_id': show_id, + 'season_number': season_id, + 'api_version': '4', + 'sort_order': 'desc', + }) + + def _yield_episodes(self, playlist_data): + for episode_data in playlist_data: + episode_info = self._extract_episode(episode_data) + url = smuggle_url(episode_info['webpage_url'], { + 'id': episode_data['id'], + 'encrypted_media_url': episode_data['more_info']['encrypted_media_url'], + }) + yield self.url_result(url, JioSaavnShowIE, url_transparent=True, **episode_info) + + def _entries(self, show_id, season_id, page): + page_data = self._fetch_page(show_id, season_id, page + 1) + yield from self._yield_episodes(page_data) + + def _real_extract(self, url): + show_slug, season_id = self._match_valid_url(url).group('show', 'season') + playlist_id = f'{show_slug}-{season_id}' + webpage = self._download_webpage(url, playlist_id) + + show_info = self._search_json( + r'window\.__INITIAL_DATA__\s*=', webpage, 'initial data', + playlist_id, transform_source=js_to_json)['showView'] + show_id = show_info['current_id'] + + entries = OnDemandPagedList(functools.partial(self._entries, show_id, season_id), self._PAGE_SIZE) + return self.playlist_result( + entries, playlist_id, traverse_obj(show_info, ('show', 'title', 'text', {str}))) From 586b557b124f954d3f625360ebe970989022ad97 Mon Sep 17 00:00:00 2001 From: Subrat Lima Date: Sun, 11 May 2025 02:43:46 -0500 Subject: [PATCH 12/12] [ie/jiosaavn:artist] Add extractor (#12803) Closes #10823 Authored by: subrat-lima --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/jiosaavn.py | 146 ++++++++++++++++++++------------ 2 files changed, 95 insertions(+), 52 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index bb0b949e2..e7dcb9853 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -928,6 +928,7 @@ ) from .jiosaavn import ( JioSaavnAlbumIE, + JioSaavnArtistIE, JioSaavnPlaylistIE, JioSaavnShowIE, JioSaavnShowPlaylistIE, diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py index c0dc96015..3760e9abe 100644 --- a/yt_dlp/extractor/jiosaavn.py +++ b/yt_dlp/extractor/jiosaavn.py @@ -1,4 +1,5 @@ import functools +import itertools import math import re @@ -11,6 +12,7 @@ int_or_none, js_to_json, make_archive_id, + orderedSet, smuggle_url, unified_strdate, unified_timestamp, @@ -19,6 +21,7 @@ url_or_none, urlencode_postdata, urljoin, + variadic, ) from ..utils.traversal import traverse_obj @@ -78,46 +81,42 @@ def _call_api(self, type_, token, note='API', params={}): }) @staticmethod - def _extract_common_info(data): - return traverse_obj(data, { + def _extract_song(song_data, url=None): + info = traverse_obj(song_data, { 'id': ('id', {str}), + 'title': (('song', 'title'), {clean_html}, any), + 'album': ((None, 'more_info'), 'album', {clean_html}, any), + 'duration': ((None, 'more_info'), 'duration', {int_or_none}, any), + 'channel': ((None, 'more_info'), 'label', {str}, any), + 'channel_id': ((None, 'more_info'), 'label_id', {str}, any), + 'channel_url': ((None, 'more_info'), 'label_url', {urljoin('https://www.jiosaavn.com/')}, any), + 'release_date': ((None, 'more_info'), 'release_date', {unified_strdate}, any), + 'release_year': ('year', {int_or_none}), 'thumbnail': ('image', {url_or_none}, {lambda x: re.sub(r'-\d+x\d+\.', '-500x500.', x)}), 'view_count': ('play_count', {int_or_none}), - 'release_year': ('year', {int_or_none}), 'language': ('language', {lambda x: ISO639Utils.short2long(x.casefold()) or 'und'}), 'webpage_url': ('perma_url', {url_or_none}), + 'artists': ('more_info', 'artistMap', 'primary_artists', ..., 'name', {str}, filter, all), }) - - @staticmethod - def _extract_song(song_data, url=None): - info = JioSaavnBaseIE._extract_common_info(song_data) - info.update(traverse_obj(song_data, { - 'title': ('song', {clean_html}), - 'album': ('album', {clean_html}), - 'duration': ('duration', {int_or_none}), - 'release_date': ('release_date', {unified_strdate}), - 'channel': ('label', {str}), - 'channel_id': ('label_id', {str}), - 'channel_url': ('label_url', {urljoin('https://www.jiosaavn.com/')}), - 'artists': ('primary_artists', {lambda x: x.split(', ') if x else None}), - })) if webpage_url := info.get('webpage_url') or url: info['display_id'] = url_basename(webpage_url) info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])] + if primary_artists := traverse_obj(song_data, ('primary_artists', {lambda x: x.split(', ') if x else None})): + info['artists'].extend(primary_artists) + if featured_artists := traverse_obj(song_data, ('featured_artists', {str}, filter)): + info['artists'].extend(featured_artists.split(', ')) + info['artists'] = orderedSet(info['artists']) or None + return info @staticmethod def _extract_episode(episode_data, url=None): - info = JioSaavnBaseIE._extract_common_info(episode_data) + info = JioSaavnBaseIE._extract_song(episode_data, url) + info.pop('_old_archive_ids', None) info.update(traverse_obj(episode_data, { - 'title': ('title', {clean_html}), 'description': ('more_info', 'description', {str}), - 'duration': ('more_info', 'duration', {int_or_none}), 'timestamp': ('more_info', 'release_time', {unified_timestamp}), - 'channel': ('more_info', 'label', {str}), - 'channel_id': ('more_info', 'label_id', {str}), - 'channel_url': ('more_info', 'label_url', {urljoin('https://www.jiosaavn.com/')}), 'series': ('more_info', 'show_title', {str}), 'series_id': ('more_info', 'show_id', {str}), 'season': ('more_info', 'season_title', {str}), @@ -125,14 +124,7 @@ def _extract_episode(episode_data, url=None): 'season_id': ('more_info', 'season_id', {str}), 'episode_number': ('more_info', 'episode_number', {int_or_none}), 'cast': ('starring', {lambda x: x.split(', ') if x else None}), - 'artists': ('more_info', 'artistMap', 'primary_artists', ..., 'name', {str}), })) - if webpage_url := info.get('webpage_url') or url: - info['display_id'] = url_basename(webpage_url) - - if featured_artists := traverse_obj(episode_data, ('featured_artists', {str})): - info.setdefault('artists', []).extend(featured_artists.split(', ')) - return info def _extract_jiosaavn_result(self, url, endpoint, response_key, parse_func): @@ -152,14 +144,20 @@ def _extract_jiosaavn_result(self, url, endpoint, response_key, parse_func): result['formats'] = list(self._extract_formats(data)) return result - def _yield_songs(self, playlist_data): - for song_data in traverse_obj(playlist_data, ('songs', lambda _, v: v['id'] and v['perma_url'])): - song_info = self._extract_song(song_data) - url = smuggle_url(song_info['webpage_url'], { - 'id': song_data['id'], - 'encrypted_media_url': song_data['encrypted_media_url'], - }) - yield self.url_result(url, JioSaavnSongIE, url_transparent=True, **song_info) + def _yield_items(self, playlist_data, keys=None, parse_func=None): + """Subclasses using this method must set _ENTRY_IE""" + if parse_func is None: + parse_func = self._extract_song + + for item_data in traverse_obj(playlist_data, ( + *variadic(keys, (str, bytes, dict, set)), lambda _, v: v['id'] and v['perma_url'], + )): + info = parse_func(item_data) + url = smuggle_url(info['webpage_url'], traverse_obj(item_data, { + 'id': ('id', {str}), + 'encrypted_media_url': ((None, 'more_info'), 'encrypted_media_url', {str}, any), + })) + yield self.url_result(url, self._ENTRY_IE, url_transparent=True, **info) class JioSaavnSongIE(JioSaavnBaseIE): @@ -244,6 +242,7 @@ class JioSaavnShowIE(JioSaavnBaseIE): 'channel_id': '1953876', 'episode_number': 1, 'upload_date': '20211227', + 'release_date': '20211227', }, }, { 'url': 'https://www.jiosaavn.com/shows/himesh-reshammiya/Kr8fmfSN4vo_', @@ -265,13 +264,14 @@ class JioSaavnAlbumIE(JioSaavnBaseIE): }, 'playlist_count': 10, }] + _ENTRY_IE = JioSaavnSongIE def _real_extract(self, url): display_id = self._match_id(url) album_data = self._call_api('album', display_id) return self.playlist_result( - self._yield_songs(album_data), display_id, traverse_obj(album_data, ('title', {str}))) + self._yield_items(album_data, 'songs'), display_id, traverse_obj(album_data, ('title', {str}))) class JioSaavnPlaylistIE(JioSaavnBaseIE): @@ -290,15 +290,16 @@ class JioSaavnPlaylistIE(JioSaavnBaseIE): 'id': 'DVR,pFUOwyXqIp77B1JF,A__', 'title': 'Mood Hindi', }, - 'playlist_mincount': 801, + 'playlist_mincount': 750, }, { 'url': 'https://www.jiosaavn.com/featured/taaza-tunes/Me5RridRfDk_', 'info_dict': { 'id': 'Me5RridRfDk_', 'title': 'Taaza Tunes', }, - 'playlist_mincount': 301, + 'playlist_mincount': 50, }] + _ENTRY_IE = JioSaavnSongIE _PAGE_SIZE = 50 def _fetch_page(self, token, page): @@ -307,7 +308,7 @@ def _fetch_page(self, token, page): def _entries(self, token, first_page_data, page): page_data = first_page_data if not page else self._fetch_page(token, page + 1) - yield from self._yield_songs(page_data) + yield from self._yield_items(page_data, 'songs') def _real_extract(self, url): display_id = self._match_id(url) @@ -330,6 +331,7 @@ class JioSaavnShowPlaylistIE(JioSaavnBaseIE): }, 'playlist_mincount': 11, }] + _ENTRY_IE = JioSaavnShowIE _PAGE_SIZE = 10 def _fetch_page(self, show_id, season_id, page): @@ -342,18 +344,9 @@ def _fetch_page(self, show_id, season_id, page): 'sort_order': 'desc', }) - def _yield_episodes(self, playlist_data): - for episode_data in playlist_data: - episode_info = self._extract_episode(episode_data) - url = smuggle_url(episode_info['webpage_url'], { - 'id': episode_data['id'], - 'encrypted_media_url': episode_data['more_info']['encrypted_media_url'], - }) - yield self.url_result(url, JioSaavnShowIE, url_transparent=True, **episode_info) - def _entries(self, show_id, season_id, page): page_data = self._fetch_page(show_id, season_id, page + 1) - yield from self._yield_episodes(page_data) + yield from self._yield_items(page_data, keys=None, parse_func=self._extract_episode) def _real_extract(self, url): show_slug, season_id = self._match_valid_url(url).group('show', 'season') @@ -368,3 +361,52 @@ def _real_extract(self, url): entries = OnDemandPagedList(functools.partial(self._entries, show_id, season_id), self._PAGE_SIZE) return self.playlist_result( entries, playlist_id, traverse_obj(show_info, ('show', 'title', 'text', {str}))) + + +class JioSaavnArtistIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:artist' + _VALID_URL = JioSaavnBaseIE._URL_BASE_RE + r'/artist/[^/?#]+/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/artist/krsna-songs/rYLBEve2z3U_', + 'info_dict': { + 'id': 'rYLBEve2z3U_', + 'title': 'KR$NA', + }, + 'playlist_mincount': 38, + }, { + 'url': 'https://www.jiosaavn.com/artist/sanam-puri-songs/SkNEv3qRhDE_', + 'info_dict': { + 'id': 'SkNEv3qRhDE_', + 'title': 'Sanam Puri', + }, + 'playlist_mincount': 51, + }] + _ENTRY_IE = JioSaavnSongIE + _PAGE_SIZE = 50 + + def _fetch_page(self, artist_id, page): + return self._call_api('artist', artist_id, f'artist page {page + 1}', { + 'p': page, + 'n_song': self._PAGE_SIZE, + 'n_album': self._PAGE_SIZE, + 'sub_type': '', + 'includeMetaTags': '', + 'api_version': '4', + 'category': 'alphabetical', + 'sort_order': 'asc', + }) + + def _entries(self, artist_id, first_page): + for page in itertools.count(): + playlist_data = first_page if not page else self._fetch_page(artist_id, page) + if not traverse_obj(playlist_data, ('topSongs', ..., {dict})): + break + yield from self._yield_items(playlist_data, 'topSongs') + + def _real_extract(self, url): + artist_id = self._match_id(url) + first_page = self._fetch_page(artist_id, 0) + + return self.playlist_result( + self._entries(artist_id, first_page), artist_id, + traverse_obj(first_page, ('name', {str})))