From fdfac32149d5a4ab0365a02159057b94f15044fd Mon Sep 17 00:00:00 2001 From: Randalix Date: Tue, 12 Aug 2025 23:55:21 +0200 Subject: [PATCH 1/7] feat: Update SouthParkDeIE to use new API extraction logic --- yt_dlp/extractor/southpark.py | 50 ++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/southpark.py b/yt_dlp/extractor/southpark.py index 3d661a86ac..61251a010a 100644 --- a/yt_dlp/extractor/southpark.py +++ b/yt_dlp/extractor/southpark.py @@ -1,4 +1,8 @@ from .mtv import MTVServicesInfoExtractor +from ..utils import ( + traverse_obj, + random_uuidv4, +) class SouthParkIE(MTVServicesInfoExtractor): @@ -99,14 +103,46 @@ class SouthParkDeIE(SouthParkIE): # XXX: Do not subclass from concrete IE }, }] - def _get_feed_url(self, uri, url=None): - video_id = self._id_from_uri(uri) - config = self._download_json( - f'http://media.mtvnservices.com/pmt/e1/access/index.html?uri={uri}&configtype=edge&ref={url}', video_id) - return self._remove_template_parameter(config['feedWithQueryParams']) + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) - def _get_feed_query(self, uri): - return + data = self._parse_json(self._search_regex( + r'window\.__DATA__\s*=\s*({.+?});', webpage, 'data'), display_id) + + # Find the videoDetail object by first finding the MainContainer component + video_detail = traverse_obj(data, ( + 'children', lambda _, v: v.get('type') == 'MainContainer', + 'children', 0, 'children', 0, 'props', 'videoDetail' + ), get_all=False) + + # Fallback for a simpler data structure found on some pages + if not video_detail: + video_detail = traverse_obj(data, ('children', 0, 'videoDetail'), get_all=False) + + api_url = video_detail['videoServiceUrl'] + + # Call the Topaz API to get the final stream URL + api_data = self._download_json( + api_url, display_id, 'Fetching video metadata', query={ + 'ssus': random_uuidv4(), + 'clientPlatform': 'mobile', + }) + + hls_url = traverse_obj(api_data, ('stitchedstream', 'source')) + + return { + 'id': video_detail['id'], + 'display_id': display_id, + 'url': hls_url, + 'title': video_detail.get('title'), + 'description': video_detail.get('description'), + 'duration': traverse_obj(video_detail, ('duration', 'milliseconds'), expected_type=int) / 1000, + 'season_number': video_detail.get('seasonNumber'), + 'episode_number': traverse_obj(video_detail, 'episodeAiringOrder'), + 'timestamp': traverse_obj(video_detail, ('publishDate', 'timestamp')), + 'series': traverse_obj(video_detail, ('parentEntity', 'title')), + } class SouthParkLatIE(SouthParkIE): # XXX: Do not subclass from concrete IE From 52876aa5a79ab698b93037a987ca945f14959ac2 Mon Sep 17 00:00:00 2001 From: Randalix Date: Wed, 13 Aug 2025 00:00:48 +0200 Subject: [PATCH 2/7] fix: Update SouthParkDeIE to use modern API extraction --- yt_dlp/extractor/southpark.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/southpark.py b/yt_dlp/extractor/southpark.py index 61251a010a..83700a3675 100644 --- a/yt_dlp/extractor/southpark.py +++ b/yt_dlp/extractor/southpark.py @@ -1,7 +1,7 @@ from .mtv import MTVServicesInfoExtractor from ..utils import ( - traverse_obj, random_uuidv4, + traverse_obj, ) @@ -110,26 +110,20 @@ def _real_extract(self, url): data = self._parse_json(self._search_regex( r'window\.__DATA__\s*=\s*({.+?});', webpage, 'data'), display_id) - # Find the videoDetail object by first finding the MainContainer component video_detail = traverse_obj(data, ( 'children', lambda _, v: v.get('type') == 'MainContainer', 'children', 0, 'children', 0, 'props', 'videoDetail' - ), get_all=False) - - # Fallback for a simpler data structure found on some pages - if not video_detail: - video_detail = traverse_obj(data, ('children', 0, 'videoDetail'), get_all=False) + ), default=traverse_obj(data, ('children', 0, 'videoDetail'))) api_url = video_detail['videoServiceUrl'] - # Call the Topaz API to get the final stream URL api_data = self._download_json( api_url, display_id, 'Fetching video metadata', query={ 'ssus': random_uuidv4(), 'clientPlatform': 'mobile', }) - hls_url = traverse_obj(api_data, ('stitchedstream', 'source')) + hls_url = traverse_obj(api_data, ('stitchedstream', 'source'), expected_type=str) return { 'id': video_detail['id'], From 3228b3de4913a2862830ba945e0e02a9d8ae11bb Mon Sep 17 00:00:00 2001 From: Randalix Date: Wed, 13 Aug 2025 00:05:38 +0200 Subject: [PATCH 3/7] fix: Improve SouthParkDeIE to handle various page layouts --- yt_dlp/extractor/southpark.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/southpark.py b/yt_dlp/extractor/southpark.py index 83700a3675..ca2a3bae74 100644 --- a/yt_dlp/extractor/southpark.py +++ b/yt_dlp/extractor/southpark.py @@ -110,10 +110,17 @@ def _real_extract(self, url): data = self._parse_json(self._search_regex( r'window\.__DATA__\s*=\s*({.+?});', webpage, 'data'), display_id) - video_detail = traverse_obj(data, ( - 'children', lambda _, v: v.get('type') == 'MainContainer', - 'children', 0, 'children', 0, 'props', 'videoDetail' - ), default=traverse_obj(data, ('children', 0, 'videoDetail'))) + # Try multiple paths to find the video data, handling both regular and special episodes + video_detail = traverse_obj(data, [ + # Path for regular episodes (more complex) + ('children', lambda _, v: v.get('type') == 'MainContainer', + 'children', 0, 'children', 0, 'props', 'videoDetail'), + # Fallback path for special episodes (simpler) + ('children', 0, 'videoDetail'), + ]) + + if not video_detail: + raise ExtractorError('Could not find video data in page') api_url = video_detail['videoServiceUrl'] From 1d5852663d674ef39009751c0cfc62da259ccf9b Mon Sep 17 00:00:00 2001 From: Randalix Date: Wed, 13 Aug 2025 00:05:53 +0200 Subject: [PATCH 4/7] fix: Import ExtractorError in SouthParkDeIE --- yt_dlp/extractor/southpark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/southpark.py b/yt_dlp/extractor/southpark.py index ca2a3bae74..cfcfa3e682 100644 --- a/yt_dlp/extractor/southpark.py +++ b/yt_dlp/extractor/southpark.py @@ -2,6 +2,7 @@ from ..utils import ( random_uuidv4, traverse_obj, + ExtractorError, # Added ExtractorError import ) From 35c83c26dc305cc2c8efae12209458ac12cd6236 Mon Sep 17 00:00:00 2001 From: Randalix Date: Wed, 13 Aug 2025 00:08:12 +0200 Subject: [PATCH 5/7] fix: Correct traverse_obj call in SouthParkDeIE --- yt_dlp/extractor/southpark.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/southpark.py b/yt_dlp/extractor/southpark.py index cfcfa3e682..db9f5d8f6f 100644 --- a/yt_dlp/extractor/southpark.py +++ b/yt_dlp/extractor/southpark.py @@ -111,14 +111,13 @@ def _real_extract(self, url): data = self._parse_json(self._search_regex( r'window\.__DATA__\s*=\s*({.+?});', webpage, 'data'), display_id) - # Try multiple paths to find the video data, handling both regular and special episodes - video_detail = traverse_obj(data, [ - # Path for regular episodes (more complex) + # CORRECTED: Provide paths as separate arguments, not a list + video_detail = traverse_obj(data, + # Path for regular episodes ('children', lambda _, v: v.get('type') == 'MainContainer', 'children', 0, 'children', 0, 'props', 'videoDetail'), - # Fallback path for special episodes (simpler) - ('children', 0, 'videoDetail'), - ]) + # Fallback path for special episodes + ('children', 0, 'videoDetail')) if not video_detail: raise ExtractorError('Could not find video data in page') From 8207f3b82451a67a53fc742247282fe8ae60a00a Mon Sep 17 00:00:00 2001 From: Randalix Date: Wed, 13 Aug 2025 00:10:17 +0200 Subject: [PATCH 6/7] fix: Correct SouthParkDeIE _real_extract to use traverse_obj correctly --- yt_dlp/extractor/southpark.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/southpark.py b/yt_dlp/extractor/southpark.py index db9f5d8f6f..9b79040a22 100644 --- a/yt_dlp/extractor/southpark.py +++ b/yt_dlp/extractor/southpark.py @@ -111,13 +111,14 @@ def _real_extract(self, url): data = self._parse_json(self._search_regex( r'window\.__DATA__\s*=\s*({.+?});', webpage, 'data'), display_id) - # CORRECTED: Provide paths as separate arguments, not a list + # Try multiple paths and, crucially, get only the FIRST match, not a list video_detail = traverse_obj(data, # Path for regular episodes ('children', lambda _, v: v.get('type') == 'MainContainer', 'children', 0, 'children', 0, 'props', 'videoDetail'), # Fallback path for special episodes - ('children', 0, 'videoDetail')) + ('children', 0, 'videoDetail'), + get_all=False) if not video_detail: raise ExtractorError('Could not find video data in page') @@ -130,7 +131,7 @@ def _real_extract(self, url): 'clientPlatform': 'mobile', }) - hls_url = traverse_obj(api_data, ('stitchedstream', 'source'), expected_type=str) + hls_url = traverse_obj(api_data, ('stitchedstream', 'source'), expected_type=str, get_all=False) return { 'id': video_detail['id'], From d3de29389dab6c40bcd48e2bdd32b9c71d3f0a3b Mon Sep 17 00:00:00 2001 From: Randalix Date: Wed, 13 Aug 2025 00:14:45 +0200 Subject: [PATCH 7/7] fix: Correct SouthParkDeIE _real_extract method --- yt_dlp/extractor/southpark.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/southpark.py b/yt_dlp/extractor/southpark.py index 9b79040a22..5d70c9d830 100644 --- a/yt_dlp/extractor/southpark.py +++ b/yt_dlp/extractor/southpark.py @@ -111,14 +111,10 @@ def _real_extract(self, url): data = self._parse_json(self._search_regex( r'window\.__DATA__\s*=\s*({.+?});', webpage, 'data'), display_id) - # Try multiple paths and, crucially, get only the FIRST match, not a list - video_detail = traverse_obj(data, - # Path for regular episodes - ('children', lambda _, v: v.get('type') == 'MainContainer', - 'children', 0, 'children', 0, 'props', 'videoDetail'), - # Fallback path for special episodes - ('children', 0, 'videoDetail'), - get_all=False) + video_detail = traverse_obj(data, ( + 'children', lambda _, v: v.get('type') == 'MainContainer', + 'children', 0, 'children', 0, 'props', 'videoDetail' + ), ('children', 0, 'videoDetail'), get_all=False) if not video_detail: raise ExtractorError('Could not find video data in page') @@ -131,12 +127,11 @@ def _real_extract(self, url): 'clientPlatform': 'mobile', }) - hls_url = traverse_obj(api_data, ('stitchedstream', 'source'), expected_type=str, get_all=False) + hls_url = traverse_obj(api_data, ('stitchedstream', 'source'), expected_type=str) - return { + info = { 'id': video_detail['id'], 'display_id': display_id, - 'url': hls_url, 'title': video_detail.get('title'), 'description': video_detail.get('description'), 'duration': traverse_obj(video_detail, ('duration', 'milliseconds'), expected_type=int) / 1000, @@ -145,6 +140,9 @@ def _real_extract(self, url): 'timestamp': traverse_obj(video_detail, ('publishDate', 'timestamp')), 'series': traverse_obj(video_detail, ('parentEntity', 'title')), } + info['formats'] = self._extract_m3u8_formats( + hls_url, display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + return info class SouthParkLatIE(SouthParkIE): # XXX: Do not subclass from concrete IE