From af9d6ae75e1203e186ccf7c4345152595ff84344 Mon Sep 17 00:00:00 2001 From: Subrat Lima Date: Mon, 24 Mar 2025 00:34:17 +0530 Subject: [PATCH 1/3] [ie/difm] Add new extractor (#12520) Add complete show and individual episode extractor Closes #12520 Authored by: Subrat Lima --- yt_dlp/extractor/_extractors.py | 4 ++ yt_dlp/extractor/difm.py | 83 +++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 yt_dlp/extractor/difm.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index c56ec9df6a..0380b3a99e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -504,6 +504,10 @@ ) from .dfb import DFBIE from .dhm import DHMIE +from .difm import ( + DIFMShowEpisodeIE, + DIFMShowIE, +) from .digitalconcerthall import DigitalConcertHallIE from .digiteka import DigitekaIE from .digiview import DigiviewIE diff --git a/yt_dlp/extractor/difm.py b/yt_dlp/extractor/difm.py new file mode 100644 index 0000000000..2425058f41 --- /dev/null +++ b/yt_dlp/extractor/difm.py @@ -0,0 +1,83 @@ +import functools + +from .common import InfoExtractor +from ..utils import ( + OnDemandPagedList, + int_or_none, + sanitize_url, + str_or_none, + traverse_obj, + unescapeHTML, + unified_strdate, +) + +clean_url = lambda x: unescapeHTML(sanitize_url(x, scheme='https')) + + +def extract_episode(episode): + return traverse_obj(episode, { + 'id': ('id', {str_or_none}), + 'ext': 'mp4', + 'timestamp': ('start_at', {unified_strdate}, {int_or_none}), + 'duration': ('tracks', 0, 'length', {int_or_none}), + 'artist': ('tracks', 0, 'display_artist', {str_or_none}), + 'title': ('tracks', 0, 'display_title', {str_or_none}), + 'thumbnail': ('tracks', 0, 'asset_url', {clean_url}), + 'url': ('tracks', 0, 'content', 'assets', 0, 'url', {clean_url}), + 'filesize': ('tracks', 0, 'content', 'assets', 0, 'size', {int_or_none}), + }) + + +class DIFMShowEpisodeIE(InfoExtractor): + IE_NAME = 'difm:showepisode' + _VALID_URL = 
r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)/episodes/(?P<episode_id>\d+)' + _TESTS = [ + { + 'url': 'https://www.di.fm/shows/airwaves-progressions-radio/episodes/001', + 'md5': '5725ec4226aed05c58b6460df5e4b4df', + 'info_dict': { + 'id': '130151', + 'ext': 'mp4', + 'title': 'Progressions 001 (04 April 2020)', + 'duration': 7456, + 'thumbnail': r're:https?://.*\.jpg', + }, + }, { + 'url': 'https://www.di.fm/shows/the-global-warm-up/episodes/1095', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + show_name, episode_id = self._match_valid_url(url).group('show_name', 'episode_id') + video_id = f'{show_name}-{episode_id}' + webpage = self._download_webpage(url, video_id, fatal=False, impersonate=True) + json_data = self._search_json('"EpisodeDetail.LayoutEngine",', webpage, 'json_data', video_id)['episode'] + return extract_episode(json_data) + + +class DIFMShowIE(InfoExtractor): + IE_NAME = 'difm:show' + _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.di.fm/shows/the-global-warm-up', + 'info_dict': { + '_type': 'playlist', + 'id': 'the-global-warm-up', + 'title': 'the-global-warm-up', + }, + 'playlist_mincount': 5, + }] + _PAGE_SIZE = 5 + + def _entries(self, show_name, session_key, page): + show_metadata = self._download_json(f'https://api.audioaddict.com/v1/di/shows/{show_name}/episodes?page={page + 1}&per_page={self._PAGE_SIZE}', f'{show_name}-{page + 1}', headers={'X-Session-Key': session_key}) + for episode_metadata in show_metadata: + yield extract_episode(episode_metadata) + + def _real_extract(self, url): + show_name = self._match_valid_url(url).group('show_name') + webpage = self._download_webpage(url, show_name, fatal=False, impersonate=True) + session_key = self._search_json('"user":', webpage, 'json_data', show_name).get('session_key') + entries = OnDemandPagedList(functools.partial(self._entries, show_name, session_key), self._PAGE_SIZE) + return self.playlist_result(entries, show_name, show_name) From 
e4b1fd68f878abf744b99bdcc75ea05ba2cc794b Mon Sep 17 00:00:00 2001 From: Subrat Lima Date: Mon, 24 Mar 2025 13:54:18 +0530 Subject: [PATCH 2/3] [ie/difm] fix regex and update example --- yt_dlp/extractor/difm.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/difm.py b/yt_dlp/extractor/difm.py index 2425058f41..d5648da4fd 100644 --- a/yt_dlp/extractor/difm.py +++ b/yt_dlp/extractor/difm.py @@ -17,7 +17,6 @@ def extract_episode(episode): return traverse_obj(episode, { 'id': ('id', {str_or_none}), - 'ext': 'mp4', 'timestamp': ('start_at', {unified_strdate}, {int_or_none}), 'duration': ('tracks', 0, 'length', {int_or_none}), 'artist': ('tracks', 0, 'display_artist', {str_or_none}), @@ -30,17 +29,19 @@ def extract_episode(episode): class DIFMShowEpisodeIE(InfoExtractor): IE_NAME = 'difm:showepisode' - _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)/episodes/(?P<episode_id>\d+)' + _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)/episodes/(?P<episode_id>[\w-]+)' _TESTS = [ { 'url': 'https://www.di.fm/shows/airwaves-progressions-radio/episodes/001', 'md5': '5725ec4226aed05c58b6460df5e4b4df', 'info_dict': { 'id': '130151', - 'ext': 'mp4', 'title': 'Progressions 001 (04 April 2020)', 'duration': 7456, 'thumbnail': r're:https?://.*\.jpg', + 'timestamp': 20200404, + 'artist': 'Airwave', + 'filesize': 120584191, }, }, { 'url': 'https://www.di.fm/shows/the-global-warm-up/episodes/1095', @@ -51,20 +52,20 @@ class DIFMShowEpisodeIE(InfoExtractor): def _real_extract(self, url): show_name, episode_id = self._match_valid_url(url).group('show_name', 'episode_id') video_id = f'{show_name}-{episode_id}' - webpage = self._download_webpage(url, video_id, fatal=False, impersonate=True) + webpage = self._download_webpage(url, video_id, fatal=True, impersonate=True) json_data = self._search_json('"EpisodeDetail.LayoutEngine",', webpage, 'json_data', video_id)['episode'] return extract_episode(json_data) class DIFMShowIE(InfoExtractor): IE_NAME = 
'difm:show' - _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)' + _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)$' _TESTS = [{ 'url': 'https://www.di.fm/shows/the-global-warm-up', 'info_dict': { '_type': 'playlist', 'id': 'the-global-warm-up', - 'title': 'the-global-warm-up', + 'title': 'The Global Warm Up with Judge Jules', }, 'playlist_mincount': 5, }] @@ -77,7 +78,8 @@ def _entries(self, show_name, session_key, page): def _real_extract(self, url): show_name = self._match_valid_url(url).group('show_name') - webpage = self._download_webpage(url, show_name, fatal=False, impersonate=True) - session_key = self._search_json('"user":', webpage, 'json_data', show_name).get('session_key') + webpage = self._download_webpage(url, show_name, fatal=True, impersonate=True) + show_title = self._html_extract_title(webpage).removesuffix(' - DI.FM') + session_key = self._search_regex(r'"session_key"\s*:\s*"(?P<session_key>\w+)"', webpage, 'session_key') entries = OnDemandPagedList(functools.partial(self._entries, show_name, session_key), self._PAGE_SIZE) - return self.playlist_result(entries, show_name, show_name) + return self.playlist_result(entries, show_name, show_title) From a1694a14c3f1f0dc709c14d08f06d5be448a3685 Mon Sep 17 00:00:00 2001 From: Subrat Lima Date: Sun, 6 Apr 2025 17:04:43 +0530 Subject: [PATCH 3/3] [ie/difm] enhance extractor, fix test and clean up code --- yt_dlp/extractor/difm.py | 72 ++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/yt_dlp/extractor/difm.py b/yt_dlp/extractor/difm.py index d5648da4fd..2f30ea01d1 100644 --- a/yt_dlp/extractor/difm.py +++ b/yt_dlp/extractor/difm.py @@ -8,40 +8,32 @@ str_or_none, traverse_obj, unescapeHTML, - unified_strdate, + unified_timestamp, ) clean_url = lambda x: unescapeHTML(sanitize_url(x, scheme='https')) -def extract_episode(episode): - return traverse_obj(episode, { - 'id': ('id', {str_or_none}), - 'timestamp': ('start_at', {unified_strdate}, {int_or_none}), - 
'duration': ('tracks', 0, 'length', {int_or_none}), - 'artist': ('tracks', 0, 'display_artist', {str_or_none}), - 'title': ('tracks', 0, 'display_title', {str_or_none}), - 'thumbnail': ('tracks', 0, 'asset_url', {clean_url}), - 'url': ('tracks', 0, 'content', 'assets', 0, 'url', {clean_url}), - 'filesize': ('tracks', 0, 'content', 'assets', 0, 'size', {int_or_none}), - }) - - class DIFMShowEpisodeIE(InfoExtractor): IE_NAME = 'difm:showepisode' - _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)/episodes/(?P<episode_id>[\w-]+)' + _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_id>[\w-]+)/episodes/(?P<episode_id>[\w-]+)' _TESTS = [ { 'url': 'https://www.di.fm/shows/airwaves-progressions-radio/episodes/001', 'md5': '5725ec4226aed05c58b6460df5e4b4df', 'info_dict': { 'id': '130151', + 'ext': 'm4a', 'title': 'Progressions 001 (04 April 2020)', + 'description': '', 'duration': 7456, 'thumbnail': r're:https?://.*\.jpg', - 'timestamp': 20200404, - 'artist': 'Airwave', + 'upload_date': '20200404', + 'timestamp': 1586008800, + 'artists': ['Airwave'], 'filesize': 120584191, + 'like_count': int, + 'dislike_count': int, }, }, { 'url': 'https://www.di.fm/shows/the-global-warm-up/episodes/1095', @@ -49,17 +41,35 @@ class DIFMShowEpisodeIE(InfoExtractor): }, ] + def _extract_data(self, episode): + return { + 'ext': 'm4a', + **traverse_obj(episode, { + 'id': ('id', {str_or_none}), + 'timestamp': ('start_at', {unified_timestamp}), + 'description': ('description', {str}), + 'duration': ('tracks', 0, 'length', {int_or_none}), + 'artist': ('tracks', 0, 'display_artist', {str}), + 'title': ('tracks', 0, 'display_title', {str}), + 'like_count': ('tracks', 0, 'votes', 'up', {int_or_none}), + 'dislike_count': ('tracks', 0, 'votes', 'down', {int_or_none}), + 'thumbnail': ('tracks', 0, 'asset_url', {clean_url}), + 'url': ('tracks', 0, 'content', 'assets', 0, 'url', {clean_url}), + 'filesize': ('tracks', 0, 'content', 'assets', 0, 'size', {int_or_none}), + }), + } + def _real_extract(self, url): - show_name, episode_id 
= self._match_valid_url(url).group('show_name', 'episode_id') - video_id = f'{show_name}-{episode_id}' + show_id, episode_id = self._match_valid_url(url).group('show_id', 'episode_id') + video_id = f'{show_id}-{episode_id}' webpage = self._download_webpage(url, video_id, fatal=True, impersonate=True) json_data = self._search_json('"EpisodeDetail.LayoutEngine",', webpage, 'json_data', video_id)['episode'] - return extract_episode(json_data) + return self._extract_data(json_data) -class DIFMShowIE(InfoExtractor): +class DIFMShowIE(DIFMShowEpisodeIE): IE_NAME = 'difm:show' - _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)$' + _VALID_URL = r'https?://www\.di\.fm/shows/(?P<id>[\w-]+)$' _TESTS = [{ 'url': 'https://www.di.fm/shows/the-global-warm-up', 'info_dict': { @@ -71,15 +81,19 @@ class DIFMShowIE(InfoExtractor): }] _PAGE_SIZE = 5 - def _entries(self, show_name, session_key, page): - show_metadata = self._download_json(f'https://api.audioaddict.com/v1/di/shows/{show_name}/episodes?page={page + 1}&per_page={self._PAGE_SIZE}', f'{show_name}-{page + 1}', headers={'X-Session-Key': session_key}) + def _entries(self, show_id, session_key, page): + show_metadata = self._download_json( + f'https://api.audioaddict.com/v1/di/shows/{show_id}/episodes', + f'{show_id}-{page + 1}', headers={'X-Session-Key': session_key}, + query={'page': str(page + 1), 'per_page': str(self._PAGE_SIZE)}, + ) for episode_metadata in show_metadata: - yield extract_episode(episode_metadata) + yield self._extract_data(episode_metadata) def _real_extract(self, url): - show_name = self._match_valid_url(url).group('show_name') - webpage = self._download_webpage(url, show_name, fatal=True, impersonate=True) + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id, fatal=True, impersonate=True) show_title = self._html_extract_title(webpage).removesuffix(' - DI.FM') session_key = self._search_regex(r'"session_key"\s*:\s*"(?P<session_key>\w+)"', webpage, 'session_key') - entries = 
OnDemandPagedList(functools.partial(self._entries, show_name, session_key), self._PAGE_SIZE) - return self.playlist_result(entries, show_name, show_title) + entries = OnDemandPagedList(functools.partial(self._entries, show_id, session_key), self._PAGE_SIZE) + return self.playlist_result(entries, show_id, show_title)