From 637ae202aca7a990b3b61bc33d692870dc16c3ad Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 7 Feb 2026 17:12:45 -0600 Subject: [PATCH] [ie/gem.cbc.ca] Support standalone, series & Olympics URLs (#15878) Closes #8382, Closes #8790, Closes #15850 Authored by: bashonly, makew0rld, 0xvd Co-authored-by: makeworld Co-authored-by: 0xvd <0xvd12@gmail.com> --- yt_dlp/extractor/_extractors.py | 2 + yt_dlp/extractor/cbc.py | 194 +++++++++++++++++++++++++------- 2 files changed, 158 insertions(+), 38 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b2fea5d5c2..348cb0e984 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -311,8 +311,10 @@ from .canalsurmas import CanalsurmasIE from .caracoltv import CaracolTvPlayIE from .cbc import ( CBCIE, + CBCGemContentIE, CBCGemIE, CBCGemLiveIE, + CBCGemOlympicsIE, CBCGemPlaylistIE, CBCListenIE, CBCPlayerIE, diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index de625f8886..5ccd143256 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -10,6 +10,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + join_nonempty, js_to_json, jwt_decode_hs256, mimetype2ext, @@ -25,6 +26,7 @@ from ..utils import ( url_basename, url_or_none, urlencode_postdata, + urljoin, ) from ..utils.traversal import require, traverse_obj, trim_str @@ -540,6 +542,32 @@ class CBCGemBaseIE(InfoExtractor): f'https://services.radio-canada.ca/ott/catalog/v2/gem/show/{item_id}', display_id or item_id, query={'device': 'web'}) + def _call_media_api(self, media_id, app_code='gem', display_id=None, headers=None): + media_data = self._download_json( + 'https://services.radio-canada.ca/media/validation/v2/', + display_id or media_id, headers=headers, query={ + 'appCode': app_code, + 'connectionType': 'hd', + 'deviceType': 'ipad', + 'multibitrate': 'true', + 'output': 'json', + 'tech': 'hls', + 'manifestVersion': '2', + 'manifestType': 'desktop', + 'idMedia': media_id, + }) + + error_code = traverse_obj(media_data, ('errorCode', {int})) + if error_code == 1: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + if error_code == 35: + self.raise_login_required(method='password') + if error_code != 0: + error_message = join_nonempty(error_code, media_data.get('message'), delim=' - ') + raise ExtractorError(f'{self.IE_NAME} said: {error_message}') + + return media_data + def _extract_item_info(self, item_info): episode_number = None title = traverse_obj(item_info, ('title', {str})) @@ -567,7 +595,7 @@ class CBCGemBaseIE(InfoExtractor): class CBCGemIE(CBCGemBaseIE): IE_NAME = 'gem.cbc.ca' - _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P[0-9a-z-]+/s(?P[0-9]+)[a-z][0-9]+)' + _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P[0-9a-z-]+/s(?P[0-9]+)[a-z][0-9]{2,4})/?(?:[?#]|$)' _TESTS = [{ # This is a normal, public, TV show video 'url': 'https://gem.cbc.ca/media/schitts-creek/s06e01', @@ -709,29 +737,10 @@ class CBCGemIE(CBCGemBaseIE): if claims_token := self._fetch_claims_token(): headers['x-claims-token'] = claims_token - m3u8_info = self._download_json( - 'https://services.radio-canada.ca/media/validation/v2/', - video_id, headers=headers, query={ - 'appCode': 'gem', - 'connectionType': 'hd', - 'deviceType': 'ipad', - 'multibitrate': 'true', - 'output': 'json', - 'tech': 'hls', - 'manifestVersion': '2', - 'manifestType': 'desktop', - 'idMedia': item_info['idMedia'], - }) - - if m3u8_info.get('errorCode') == 1: - self.raise_geo_restricted(countries=['CA']) - elif m3u8_info.get('errorCode') == 35: - self.raise_login_required(method='password') - elif m3u8_info.get('errorCode') != 0: - raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}') - + m3u8_url = self._call_media_api( + item_info['idMedia'], display_id=video_id, headers=headers)['url'] formats = self._extract_m3u8_formats( - m3u8_info['url'], video_id, 'mp4', m3u8_id='hls', query={'manifestType': ''}) + m3u8_url, video_id, 'mp4', m3u8_id='hls', query={'manifestType': ''}) self._remove_duplicate_formats(formats) for fmt in formats: @@ -801,7 +810,128 @@ class CBCGemPlaylistIE(CBCGemBaseIE): }), series=traverse_obj(show_info, ('title', {str}))) -class CBCGemLiveIE(InfoExtractor): +class CBCGemContentIE(CBCGemBaseIE): + IE_NAME = 'gem.cbc.ca:content' + IE_DESC = False # Do not list + _VALID_URL = r'https?://gem\.cbc\.ca/(?P[0-9a-z-]+)/?(?:[?#]|$)' + _TESTS = [{ + # Series URL; content_type == 'Season' + 'url': 'https://gem.cbc.ca/the-tunnel', + 'playlist_count': 3, + 'info_dict': { + 'id': 'the-tunnel', + }, + }, { + # Miniseries URL; content_type == 'Parts' + 'url': 'https://gem.cbc.ca/summit-72', + 'playlist_count': 1, + 'info_dict': { + 'id': 'summit-72', + }, + }, { + # Olympics URL; content_type == 'Standalone' + 'url': 'https://gem.cbc.ca/ski-jumping-nh-individual-womens-final-30086', + 'info_dict': { + 'id': 'ski-jumping-nh-individual-womens-final-30086', + 'ext': 'mp4', + 'title': 'Ski Jumping: NH Individual (Women\'s) - Final', + 'description': 'md5:411c07c8a9a4a36344530b0c726bf8ab', + 'duration': 12793, + 'thumbnail': r're:https://[^.]+\.cbc\.ca/.+\.jpg', + 'release_timestamp': 1770482100, + 'release_date': '20260207', + 'live_status': 'was_live', + }, + }, { + # Movie URL; content_type == 'Standalone'; requires authentication + 'url': 'https://gem.cbc.ca/copa-71', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['data'] + content_type = data['contentType'] + self.write_debug(f'Routing for content type "{content_type}"') + + if content_type == 'Standalone': + new_url = traverse_obj(data, ( + 'header', 'cta', 'media', 'url', {urljoin('https://gem.cbc.ca/')})) + if CBCGemOlympicsIE.suitable(new_url): + return self.url_result(new_url, CBCGemOlympicsIE) + + # Manually construct non-Olympics standalone URLs to avoid returning trailer URLs + return self.url_result(f'https://gem.cbc.ca/{display_id}/s01e01', CBCGemIE) + + # Handle series URLs (content_type == 'Season') and miniseries URLs (content_type == 'Parts') + def entries(): + for playlist_url in traverse_obj(data, ( + 'content', ..., 'lineups', ..., 'url', {urljoin('https://gem.cbc.ca/')}, + {lambda x: x if CBCGemPlaylistIE.suitable(x) else None}, + )): + yield self.url_result(playlist_url, CBCGemPlaylistIE) + + return self.playlist_result(entries(), display_id) + + +class CBCGemOlympicsIE(CBCGemBaseIE): + IE_NAME = 'gem.cbc.ca:olympics' + _VALID_URL = r'https?://gem\.cbc\.ca/(?P(?:[0-9a-z]+-)+[0-9]{5,})/s01e(?P[0-9]{5,})' + _TESTS = [{ + 'url': 'https://gem.cbc.ca/ski-jumping-nh-individual-womens-final-30086/s01e30086', + 'info_dict': { + 'id': 'ski-jumping-nh-individual-womens-final-30086', + 'ext': 'mp4', + 'title': 'Ski Jumping: NH Individual (Women\'s) - Final', + 'description': 'md5:411c07c8a9a4a36344530b0c726bf8ab', + 'duration': 12793, + 'thumbnail': r're:https://[^.]+\.cbc\.ca/.+\.jpg', + 'release_timestamp': 1770482100, + 'release_date': '20260207', + 'live_status': 'was_live', + }, + }] + + def _real_extract(self, url): + video_id, media_id = self._match_valid_url(url).group('id', 'media_id') + + video_info = self._call_show_api(video_id) + item_info = traverse_obj(video_info, ( + 'content', ..., 'lineups', ..., 'items', + lambda _, v: v['formattedIdMedia'] == media_id, any, {require('item info')})) + + live_status = { + 'LiveEvent': 'is_live', + 'Replay': 'was_live', + }.get(item_info.get('type')) + + release_timestamp = traverse_obj(item_info, ( + 'metadata', (('live', 'startDate'), ('replay', 'airDate')), {parse_iso8601}, any)) + + if live_status == 'is_live' and release_timestamp and release_timestamp > time.time(): + formats = [] + live_status = 'is_upcoming' + self.raise_no_formats('This livestream has not yet started', expected=True) + else: + m3u8_url = self._call_media_api(media_id, 'medianetlive', video_id)['url'] + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', live=live_status == 'is_live') + + return { + 'id': video_id, + 'formats': formats, + 'live_status': live_status, + 'release_timestamp': release_timestamp, + **traverse_obj(item_info, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'thumbnail': ('images', 'card', 'url', {url_or_none}), + 'duration': ('metadata', 'replay', 'duration', {int_or_none}), + }), + } + + +class CBCGemLiveIE(CBCGemBaseIE): IE_NAME = 'gem.cbc.ca:live' _VALID_URL = r'https?://gem\.cbc\.ca/live(?:-event)?/(?P\d+)' _TESTS = [ @@ -871,7 +1001,6 @@ class CBCGemLiveIE(InfoExtractor): 'only_matching': True, }, ] - _GEO_COUNTRIES = ['CA'] def _real_extract(self, url): video_id = self._match_id(url) @@ -900,19 +1029,8 @@ class CBCGemLiveIE(InfoExtractor): live_status = 'is_upcoming' self.raise_no_formats('This livestream has not yet started', expected=True) else: - stream_data = self._download_json( - 'https://services.radio-canada.ca/media/validation/v2/', video_id, query={ - 'appCode': 'medianetlive', - 'connectionType': 'hd', - 'deviceType': 'ipad', - 'idMedia': video_stream_id, - 'multibitrate': 'true', - 'output': 'json', - 'tech': 'hls', - 'manifestType': 'desktop', - }) - formats = self._extract_m3u8_formats( - stream_data['url'], video_id, 'mp4', live=live_status == 'is_live') + m3u8_url = self._call_media_api(video_stream_id, 'medianetlive', video_id)['url'] + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', live=live_status == 'is_live') return { 'id': video_id,