From af9d6ae75e1203e186ccf7c4345152595ff84344 Mon Sep 17 00:00:00 2001
From: Subrat Lima <subrat.k.lima@protonmail.com>
Date: Mon, 24 Mar 2025 00:34:17 +0530
Subject: [PATCH 1/3] [ie/difm] Add new extractor (#12520)

Add extractors for complete DI.FM shows (as playlists) and for individual show episodes

Closes #12520
Authored by: Subrat Lima
---
 yt_dlp/extractor/_extractors.py |  4 ++
 yt_dlp/extractor/difm.py        | 83 +++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 yt_dlp/extractor/difm.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index c56ec9df6a..0380b3a99e 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -504,6 +504,10 @@
 )
 from .dfb import DFBIE
 from .dhm import DHMIE
+from .difm import (
+    DIFMShowEpisodeIE,
+    DIFMShowIE,
+)
 from .digitalconcerthall import DigitalConcertHallIE
 from .digiteka import DigitekaIE
 from .digiview import DigiviewIE
diff --git a/yt_dlp/extractor/difm.py b/yt_dlp/extractor/difm.py
new file mode 100644
index 0000000000..2425058f41
--- /dev/null
+++ b/yt_dlp/extractor/difm.py
@@ -0,0 +1,83 @@
+import functools
+
+from .common import InfoExtractor
+from ..utils import (
+    OnDemandPagedList,
+    int_or_none,
+    sanitize_url,
+    str_or_none,
+    traverse_obj,
+    unescapeHTML,
+    unified_strdate,
+)
+
+clean_url = lambda x: unescapeHTML(sanitize_url(x, scheme='https'))
+
+
+def extract_episode(episode):
+    return traverse_obj(episode, {
+        'id': ('id', {str_or_none}),
+        'ext': 'mp4',
+        'timestamp': ('start_at', {unified_strdate}, {int_or_none}),
+        'duration': ('tracks', 0, 'length', {int_or_none}),
+        'artist': ('tracks', 0, 'display_artist', {str_or_none}),
+        'title': ('tracks', 0, 'display_title', {str_or_none}),
+        'thumbnail': ('tracks', 0, 'asset_url', {clean_url}),
+        'url': ('tracks', 0, 'content', 'assets', 0, 'url', {clean_url}),
+        'filesize': ('tracks', 0, 'content', 'assets', 0, 'size', {int_or_none}),
+    })
+
+
+class DIFMShowEpisodeIE(InfoExtractor):
+    IE_NAME = 'difm:showepisode'
+    _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)/episodes/(?P<episode_id>\d+)'
+    _TESTS = [
+        {
+            'url': 'https://www.di.fm/shows/airwaves-progressions-radio/episodes/001',
+            'md5': '5725ec4226aed05c58b6460df5e4b4df',
+            'info_dict': {
+                'id': '130151',
+                'ext': 'mp4',
+                'title': 'Progressions 001 (04 April 2020)',
+                'duration': 7456,
+                'thumbnail': r're:https?://.*\.jpg',
+            },
+        }, {
+            'url': 'https://www.di.fm/shows/the-global-warm-up/episodes/1095',
+            'only_matching': True,
+        },
+    ]
+
+    def _real_extract(self, url):
+        show_name, episode_id = self._match_valid_url(url).group('show_name', 'episode_id')
+        video_id = f'{show_name}-{episode_id}'
+        webpage = self._download_webpage(url, video_id, fatal=False, impersonate=True)
+        json_data = self._search_json('"EpisodeDetail.LayoutEngine",', webpage, 'json_data', video_id)['episode']
+        return extract_episode(json_data)
+
+
+class DIFMShowIE(InfoExtractor):
+    IE_NAME = 'difm:show'
+    _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://www.di.fm/shows/the-global-warm-up',
+        'info_dict': {
+            '_type': 'playlist',
+            'id': 'the-global-warm-up',
+            'title': 'the-global-warm-up',
+        },
+        'playlist_mincount': 5,
+    }]
+    _PAGE_SIZE = 5
+
+    def _entries(self, show_name, session_key, page):
+        show_metadata = self._download_json(f'https://api.audioaddict.com/v1/di/shows/{show_name}/episodes?page={page + 1}&per_page={self._PAGE_SIZE}', f'{show_name}-{page + 1}', headers={'X-Session-Key': session_key})
+        for episode_metadata in show_metadata:
+            yield extract_episode(episode_metadata)
+
+    def _real_extract(self, url):
+        show_name = self._match_valid_url(url).group('show_name')
+        webpage = self._download_webpage(url, show_name, fatal=False, impersonate=True)
+        session_key = self._search_json('"user":', webpage, 'json_data', show_name).get('session_key')
+        entries = OnDemandPagedList(functools.partial(self._entries, show_name, session_key), self._PAGE_SIZE)
+        return self.playlist_result(entries, show_name, show_name)

From e4b1fd68f878abf744b99bdcc75ea05ba2cc794b Mon Sep 17 00:00:00 2001
From: Subrat Lima <subrat.k.lima@protonmail.com>
Date: Mon, 24 Mar 2025 13:54:18 +0530
Subject: [PATCH 2/3] [ie/difm] Fix URL regexes, make webpage download fatal and update tests

---
 yt_dlp/extractor/difm.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/yt_dlp/extractor/difm.py b/yt_dlp/extractor/difm.py
index 2425058f41..d5648da4fd 100644
--- a/yt_dlp/extractor/difm.py
+++ b/yt_dlp/extractor/difm.py
@@ -17,7 +17,6 @@
 def extract_episode(episode):
     return traverse_obj(episode, {
         'id': ('id', {str_or_none}),
-        'ext': 'mp4',
         'timestamp': ('start_at', {unified_strdate}, {int_or_none}),
         'duration': ('tracks', 0, 'length', {int_or_none}),
         'artist': ('tracks', 0, 'display_artist', {str_or_none}),
@@ -30,17 +29,19 @@ def extract_episode(episode):
 
 class DIFMShowEpisodeIE(InfoExtractor):
     IE_NAME = 'difm:showepisode'
-    _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)/episodes/(?P<episode_id>\d+)'
+    _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)/episodes/(?P<episode_id>[\w-]+)'
     _TESTS = [
         {
             'url': 'https://www.di.fm/shows/airwaves-progressions-radio/episodes/001',
             'md5': '5725ec4226aed05c58b6460df5e4b4df',
             'info_dict': {
                 'id': '130151',
-                'ext': 'mp4',
                 'title': 'Progressions 001 (04 April 2020)',
                 'duration': 7456,
                 'thumbnail': r're:https?://.*\.jpg',
+                'timestamp': 20200404,
+                'artist': 'Airwave',
+                'filesize': 120584191,
             },
         }, {
             'url': 'https://www.di.fm/shows/the-global-warm-up/episodes/1095',
@@ -51,20 +52,20 @@ class DIFMShowEpisodeIE(InfoExtractor):
     def _real_extract(self, url):
         show_name, episode_id = self._match_valid_url(url).group('show_name', 'episode_id')
         video_id = f'{show_name}-{episode_id}'
-        webpage = self._download_webpage(url, video_id, fatal=False, impersonate=True)
+        webpage = self._download_webpage(url, video_id, fatal=True, impersonate=True)
         json_data = self._search_json('"EpisodeDetail.LayoutEngine",', webpage, 'json_data', video_id)['episode']
         return extract_episode(json_data)
 
 
 class DIFMShowIE(InfoExtractor):
     IE_NAME = 'difm:show'
-    _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)'
+    _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)$'
     _TESTS = [{
         'url': 'https://www.di.fm/shows/the-global-warm-up',
         'info_dict': {
             '_type': 'playlist',
             'id': 'the-global-warm-up',
-            'title': 'the-global-warm-up',
+            'title': 'The Global Warm Up with Judge Jules',
         },
         'playlist_mincount': 5,
     }]
@@ -77,7 +78,8 @@ def _entries(self, show_name, session_key, page):
 
     def _real_extract(self, url):
         show_name = self._match_valid_url(url).group('show_name')
-        webpage = self._download_webpage(url, show_name, fatal=False, impersonate=True)
-        session_key = self._search_json('"user":', webpage, 'json_data', show_name).get('session_key')
+        webpage = self._download_webpage(url, show_name, fatal=True, impersonate=True)
+        show_title = self._html_extract_title(webpage).removesuffix(' - DI.FM')
+        session_key = self._search_regex(r'"session_key"\s*:\s*"(?P<session_key>\w+)"', webpage, 'session_key')
         entries = OnDemandPagedList(functools.partial(self._entries, show_name, session_key), self._PAGE_SIZE)
-        return self.playlist_result(entries, show_name, show_name)
+        return self.playlist_result(entries, show_name, show_title)

From a1694a14c3f1f0dc709c14d08f06d5be448a3685 Mon Sep 17 00:00:00 2001
From: Subrat Lima <subrat.k.lima@protonmail.com>
Date: Sun, 6 Apr 2025 17:04:43 +0530
Subject: [PATCH 3/3] [ie/difm] Extract description and vote counts, fix timestamp test and clean up code

---
 yt_dlp/extractor/difm.py | 72 ++++++++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 29 deletions(-)

diff --git a/yt_dlp/extractor/difm.py b/yt_dlp/extractor/difm.py
index d5648da4fd..2f30ea01d1 100644
--- a/yt_dlp/extractor/difm.py
+++ b/yt_dlp/extractor/difm.py
@@ -8,40 +8,32 @@
     str_or_none,
     traverse_obj,
     unescapeHTML,
-    unified_strdate,
+    unified_timestamp,
 )
 
 clean_url = lambda x: unescapeHTML(sanitize_url(x, scheme='https'))
 
 
-def extract_episode(episode):
-    return traverse_obj(episode, {
-        'id': ('id', {str_or_none}),
-        'timestamp': ('start_at', {unified_strdate}, {int_or_none}),
-        'duration': ('tracks', 0, 'length', {int_or_none}),
-        'artist': ('tracks', 0, 'display_artist', {str_or_none}),
-        'title': ('tracks', 0, 'display_title', {str_or_none}),
-        'thumbnail': ('tracks', 0, 'asset_url', {clean_url}),
-        'url': ('tracks', 0, 'content', 'assets', 0, 'url', {clean_url}),
-        'filesize': ('tracks', 0, 'content', 'assets', 0, 'size', {int_or_none}),
-    })
-
-
 class DIFMShowEpisodeIE(InfoExtractor):
     IE_NAME = 'difm:showepisode'
-    _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)/episodes/(?P<episode_id>[\w-]+)'
+    _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_id>[\w-]+)/episodes/(?P<episode_id>[\w-]+)'
     _TESTS = [
         {
             'url': 'https://www.di.fm/shows/airwaves-progressions-radio/episodes/001',
             'md5': '5725ec4226aed05c58b6460df5e4b4df',
             'info_dict': {
                 'id': '130151',
+                'ext': 'm4a',
                 'title': 'Progressions 001 (04 April 2020)',
+                'description': '',
                 'duration': 7456,
                 'thumbnail': r're:https?://.*\.jpg',
-                'timestamp': 20200404,
-                'artist': 'Airwave',
+                'upload_date': '20200404',
+                'timestamp': 1586008800,
+                'artists': ['Airwave'],
                 'filesize': 120584191,
+                'like_count': int,
+                'dislike_count': int,
             },
         }, {
             'url': 'https://www.di.fm/shows/the-global-warm-up/episodes/1095',
@@ -49,17 +41,35 @@ class DIFMShowEpisodeIE(InfoExtractor):
         },
     ]
 
+    def _extract_data(self, episode):
+        return {
+            'ext': 'm4a',
+            **traverse_obj(episode, {
+                'id': ('id', {str_or_none}),
+                'timestamp': ('start_at', {unified_timestamp}),
+                'description': ('description', {str}),
+                'duration': ('tracks', 0, 'length', {int_or_none}),
+                'artist': ('tracks', 0, 'display_artist', {str}),
+                'title': ('tracks', 0, 'display_title', {str}),
+                'like_count': ('tracks', 0, 'votes', 'up', {int_or_none}),
+                'dislike_count': ('tracks', 0, 'votes', 'down', {int_or_none}),
+                'thumbnail': ('tracks', 0, 'asset_url', {clean_url}),
+                'url': ('tracks', 0, 'content', 'assets', 0, 'url', {clean_url}),
+                'filesize': ('tracks', 0, 'content', 'assets', 0, 'size', {int_or_none}),
+            }),
+        }
+
     def _real_extract(self, url):
-        show_name, episode_id = self._match_valid_url(url).group('show_name', 'episode_id')
-        video_id = f'{show_name}-{episode_id}'
+        show_id, episode_id = self._match_valid_url(url).group('show_id', 'episode_id')
+        video_id = f'{show_id}-{episode_id}'
         webpage = self._download_webpage(url, video_id, fatal=True, impersonate=True)
         json_data = self._search_json('"EpisodeDetail.LayoutEngine",', webpage, 'json_data', video_id)['episode']
-        return extract_episode(json_data)
+        return self._extract_data(json_data)
 
 
-class DIFMShowIE(InfoExtractor):
+class DIFMShowIE(DIFMShowEpisodeIE):
     IE_NAME = 'difm:show'
-    _VALID_URL = r'https?://www\.di\.fm/shows/(?P<show_name>[\w-]+)$'
+    _VALID_URL = r'https?://www\.di\.fm/shows/(?P<id>[\w-]+)$'
     _TESTS = [{
         'url': 'https://www.di.fm/shows/the-global-warm-up',
         'info_dict': {
@@ -71,15 +81,19 @@ class DIFMShowIE(InfoExtractor):
     }]
     _PAGE_SIZE = 5
 
-    def _entries(self, show_name, session_key, page):
-        show_metadata = self._download_json(f'https://api.audioaddict.com/v1/di/shows/{show_name}/episodes?page={page + 1}&per_page={self._PAGE_SIZE}', f'{show_name}-{page + 1}', headers={'X-Session-Key': session_key})
+    def _entries(self, show_id, session_key, page):
+        show_metadata = self._download_json(
+            f'https://api.audioaddict.com/v1/di/shows/{show_id}/episodes',
+            f'{show_id}-{page + 1}', headers={'X-Session-Key': session_key},
+            query={'page': str(page + 1), 'per_page': str(self._PAGE_SIZE)},
+        )
         for episode_metadata in show_metadata:
-            yield extract_episode(episode_metadata)
+            yield self._extract_data(episode_metadata)
 
     def _real_extract(self, url):
-        show_name = self._match_valid_url(url).group('show_name')
-        webpage = self._download_webpage(url, show_name, fatal=True, impersonate=True)
+        show_id = self._match_id(url)
+        webpage = self._download_webpage(url, show_id, fatal=True, impersonate=True)
         show_title = self._html_extract_title(webpage).removesuffix(' - DI.FM')
         session_key = self._search_regex(r'"session_key"\s*:\s*"(?P<session_key>\w+)"', webpage, 'session_key')
-        entries = OnDemandPagedList(functools.partial(self._entries, show_name, session_key), self._PAGE_SIZE)
-        return self.playlist_result(entries, show_name, show_title)
+        entries = OnDemandPagedList(functools.partial(self._entries, show_id, session_key), self._PAGE_SIZE)
+        return self.playlist_result(entries, show_id, show_title)