mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 06:35:12 +00:00 
			
		
		
		
	[ie/cbc.ca:player] Fix extractor (#10302)
Closes #10170. Authored by: trainman261, bashonly. Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
This commit is contained in:
		| @@ -1,4 +1,5 @@ | ||||
| import base64 | ||||
| import functools | ||||
| import json | ||||
| import re | ||||
| import time | ||||
| @@ -6,17 +7,24 @@ import urllib.parse | ||||
| import xml.etree.ElementTree | ||||
| 
 | ||||
| from .common import InfoExtractor | ||||
| from ..networking import HEADRequest | ||||
| from ..utils import ( | ||||
|     ExtractorError, | ||||
|     float_or_none, | ||||
|     int_or_none, | ||||
|     join_nonempty, | ||||
|     js_to_json, | ||||
|     mimetype2ext, | ||||
|     orderedSet, | ||||
|     parse_iso8601, | ||||
|     replace_extension, | ||||
|     smuggle_url, | ||||
|     strip_or_none, | ||||
|     traverse_obj, | ||||
|     try_get, | ||||
|     update_url, | ||||
|     url_basename, | ||||
|     url_or_none, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @@ -149,6 +157,7 @@ class CBCIE(InfoExtractor): | ||||
| class CBCPlayerIE(InfoExtractor): | ||||
|     IE_NAME = 'cbc.ca:player' | ||||
|     _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/(?:video/)?|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)' | ||||
|     _GEO_COUNTRIES = ['CA'] | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://www.cbc.ca/player/play/2683190193', | ||||
|         'md5': '64d25f841ddf4ddb28a235338af32e2c', | ||||
| @@ -172,21 +181,20 @@ class CBCPlayerIE(InfoExtractor): | ||||
|             'description': 'md5:dd3b692f0a139b0369943150bd1c46a9', | ||||
|             'timestamp': 1425704400, | ||||
|             'upload_date': '20150307', | ||||
|             'uploader': 'CBCC-NEW', | ||||
|             'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg', | ||||
|             'thumbnail': 'https://i.cbc.ca/ais/1.2985700,1717262248558/full/max/0/default.jpg', | ||||
|             'chapters': [], | ||||
|             'duration': 494.811, | ||||
|             'categories': ['AudioMobile/All in a Weekend Montreal'], | ||||
|             'tags': 'count:8', | ||||
|             'categories': ['All in a Weekend Montreal'], | ||||
|             'tags': 'count:11', | ||||
|             'location': 'Quebec', | ||||
|             'series': 'All in a Weekend Montreal', | ||||
|             'season': 'Season 2015', | ||||
|             'season_number': 2015, | ||||
|             'media_type': 'Excerpt', | ||||
|             'genres': ['Other'], | ||||
|         }, | ||||
|     }, { | ||||
|         'url': 'http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2164402062', | ||||
|         'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', | ||||
|         'info_dict': { | ||||
|             'id': '2164402062', | ||||
|             'ext': 'mp4', | ||||
| @@ -194,107 +202,168 @@ class CBCPlayerIE(InfoExtractor): | ||||
|             'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', | ||||
|             'timestamp': 1320410746, | ||||
|             'upload_date': '20111104', | ||||
|             'uploader': 'CBCC-NEW', | ||||
|             'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg', | ||||
|             'thumbnail': 'https://i.cbc.ca/ais/1.1711287,1717139372111/full/max/0/default.jpg', | ||||
|             'chapters': [], | ||||
|             'duration': 186.867, | ||||
|             'series': 'CBC News: Windsor at 6:00', | ||||
|             'categories': ['News/Canada/Windsor'], | ||||
|             'categories': ['Windsor'], | ||||
|             'location': 'Windsor', | ||||
|             'tags': ['cancer'], | ||||
|             'creators': ['Allison Johnson'], | ||||
|             'tags': ['Cancer', 'News/Canada/Windsor', 'Windsor'], | ||||
|             'media_type': 'Excerpt', | ||||
|             'genres': ['News'], | ||||
|         }, | ||||
|         'params': {'skip_download': 'm3u8'}, | ||||
|     }, { | ||||
|         # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ | ||||
|         'url': 'https://www.cbc.ca/player/play/1.2985700', | ||||
|         'md5': 'e5e708c34ae6fca156aafe17c43e8b75', | ||||
|         'info_dict': { | ||||
|             'id': '2657631896', | ||||
|             'id': '1.2985700', | ||||
|             'ext': 'mp3', | ||||
|             'title': 'CBC Montreal is organizing its first ever community hackathon!', | ||||
|             'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.', | ||||
|             'timestamp': 1425704400, | ||||
|             'upload_date': '20150307', | ||||
|             'uploader': 'CBCC-NEW', | ||||
|             'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg', | ||||
|             'thumbnail': 'https://i.cbc.ca/ais/1.2985700,1717262248558/full/max/0/default.jpg', | ||||
|             'chapters': [], | ||||
|             'duration': 494.811, | ||||
|             'categories': ['AudioMobile/All in a Weekend Montreal'], | ||||
|             'tags': 'count:8', | ||||
|             'categories': ['All in a Weekend Montreal'], | ||||
|             'tags': 'count:11', | ||||
|             'location': 'Quebec', | ||||
|             'series': 'All in a Weekend Montreal', | ||||
|             'season': 'Season 2015', | ||||
|             'season_number': 2015, | ||||
|             'media_type': 'Excerpt', | ||||
|             'genres': ['Other'], | ||||
|         }, | ||||
|     }, { | ||||
|         'url': 'https://www.cbc.ca/player/play/1.1711287', | ||||
|         'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', | ||||
|         'info_dict': { | ||||
|             'id': '2164402062', | ||||
|             'id': '1.1711287', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Cancer survivor four times over', | ||||
|             'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', | ||||
|             'timestamp': 1320410746, | ||||
|             'upload_date': '20111104', | ||||
|             'uploader': 'CBCC-NEW', | ||||
|             'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg', | ||||
|             'thumbnail': 'https://i.cbc.ca/ais/1.1711287,1717139372111/full/max/0/default.jpg', | ||||
|             'chapters': [], | ||||
|             'duration': 186.867, | ||||
|             'series': 'CBC News: Windsor at 6:00', | ||||
|             'categories': ['News/Canada/Windsor'], | ||||
|             'categories': ['Windsor'], | ||||
|             'location': 'Windsor', | ||||
|             'tags': ['cancer'], | ||||
|             'creators': ['Allison Johnson'], | ||||
|             'tags': ['Cancer', 'News/Canada/Windsor', 'Windsor'], | ||||
|             'media_type': 'Excerpt', | ||||
|             'genres': ['News'], | ||||
|         }, | ||||
|         'params': {'skip_download': 'm3u8'}, | ||||
|     }, { | ||||
|         # Has subtitles | ||||
|         # These broadcasts expire after ~1 month, can find new test URL here: | ||||
|         # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast | ||||
|         'url': 'https://www.cbc.ca/player/play/1.7159484', | ||||
|         'md5': '6ed6cd0fc2ef568d2297ba68a763d455', | ||||
|         'url': 'https://www.cbc.ca/player/play/video/9.6424403', | ||||
|         'md5': '8025909eaffcf0adf59922904def9a5e', | ||||
|         'info_dict': { | ||||
|             'id': '2324213316001', | ||||
|             'id': '9.6424403', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'The National | School boards sue social media giants', | ||||
|             'description': 'md5:4b4db69322fa32186c3ce426da07402c', | ||||
|             'timestamp': 1711681200, | ||||
|             'duration': 2743.400, | ||||
|             'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]}, | ||||
|             'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/607/559/thumbnail.jpeg', | ||||
|             'uploader': 'CBCC-NEW', | ||||
|             'title': 'The National | N.W.T. wildfire emergency', | ||||
|             'description': 'md5:ada33d36d1df69347ed575905bfd496c', | ||||
|             'timestamp': 1718589600, | ||||
|             'duration': 2692.833, | ||||
|             'subtitles': { | ||||
|                 'en-US': [{ | ||||
|                     'name': 'English Captions', | ||||
|                     'url': 'https://cbchls.akamaized.net/delivery/news-shows/2024/06/17/NAT_JUN16-00-55-00/NAT_JUN16_cc.vtt', | ||||
|                 }], | ||||
|             }, | ||||
|             'thumbnail': 'https://i.cbc.ca/ais/6272b5c6-5e78-4c05-915d-0e36672e33d1,1714756287822/full/max/0/default.jpg', | ||||
|             'chapters': 'count:5', | ||||
|             'upload_date': '20240329', | ||||
|             'categories': 'count:4', | ||||
|             'upload_date': '20240617', | ||||
|             'categories': ['News', 'The National', 'The National Latest Broadcasts'], | ||||
|             'series': 'The National - Full Show', | ||||
|             'tags': 'count:1', | ||||
|             'creators': ['News'], | ||||
|             'tags': ['The National'], | ||||
|             'location': 'Canada', | ||||
|             'media_type': 'Full Program', | ||||
|             'genres': ['News'], | ||||
|         }, | ||||
|     }, { | ||||
|         'url': 'https://www.cbc.ca/player/play/video/1.7194274', | ||||
|         'md5': '188b96cf6bdcb2540e178a6caa957128', | ||||
|         'info_dict': { | ||||
|             'id': '2334524995812', | ||||
|             'id': '1.7194274', | ||||
|             'ext': 'mp4', | ||||
|             'title': '#TheMoment a rare white spirit moose was spotted in Alberta', | ||||
|             'description': 'md5:18ae269a2d0265c5b0bbe4b2e1ac61a3', | ||||
|             'timestamp': 1714788791, | ||||
|             'duration': 77.678, | ||||
|             'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]}, | ||||
|             'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/201/543/THE_MOMENT.jpg', | ||||
|             'uploader': 'CBCC-NEW', | ||||
|             'chapters': 'count:0', | ||||
|             'upload_date': '20240504', | ||||
|             'thumbnail': 'https://i.cbc.ca/ais/1.7194274,1717224990425/full/max/0/default.jpg', | ||||
|             'chapters': [], | ||||
|             'categories': 'count:3', | ||||
|             'series': 'The National', | ||||
|             'tags': 'count:15', | ||||
|             'creators': ['encoder'], | ||||
|             'tags': 'count:17', | ||||
|             'location': 'Canada', | ||||
|             'media_type': 'Excerpt', | ||||
|             'upload_date': '20240504', | ||||
|             'genres': ['News'], | ||||
|         }, | ||||
|     }, { | ||||
|         'url': 'https://www.cbc.ca/player/play/video/9.6427282', | ||||
|         'info_dict': { | ||||
|             'id': '9.6427282', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Men\'s Soccer - Argentina vs Morocco', | ||||
|             'description': 'Argentina faces Morocco on the football pitch at Saint Etienne Stadium.', | ||||
|             'series': 'CBC Sports', | ||||
|             'media_type': 'Event Coverage', | ||||
|             'thumbnail': 'https://i.cbc.ca/ais/a4c5c0c2-99fa-4bd3-8061-5a63879c1b33,1718828053500/full/max/0/default.jpg', | ||||
|             'timestamp': 1721825400.0, | ||||
|             'upload_date': '20240724', | ||||
|             'duration': 10568.0, | ||||
|             'chapters': [], | ||||
|             'genres': [], | ||||
|             'tags': ['2024 Paris Olympic Games'], | ||||
|             'categories': ['Olympics Summer Soccer', 'Summer Olympics Replays', 'Summer Olympics Soccer Replays'], | ||||
|             'location': 'Canada', | ||||
|         }, | ||||
|         'params': {'skip_download': 'm3u8'}, | ||||
|     }, { | ||||
|         'url': 'https://www.cbc.ca/player/play/video/9.6459530', | ||||
|         'md5': '6c1bb76693ab321a2e99c347a1d5ecbc', | ||||
|         'info_dict': { | ||||
|             'id': '9.6459530', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Parts of Jasper incinerated as wildfire rages', | ||||
|             'description': 'md5:6f1caa8d128ad3f629257ef5fecf0962', | ||||
|             'series': 'The National', | ||||
|             'media_type': 'Excerpt', | ||||
|             'thumbnail': 'https://i.cbc.ca/ais/507c0086-31a2-494d-96e4-bffb1048d045,1721953984375/full/max/0/default.jpg', | ||||
|             'timestamp': 1721964091.012, | ||||
|             'upload_date': '20240726', | ||||
|             'duration': 952.285, | ||||
|             'chapters': [], | ||||
|             'genres': [], | ||||
|             'tags': 'count:23', | ||||
|             'categories': ['News (FAST)', 'News', 'The National', 'TV News Shows', 'The National '], | ||||
|         }, | ||||
|     }, { | ||||
|         'url': 'https://www.cbc.ca/player/play/video/9.6420651', | ||||
|         'md5': '71a850c2c6ee5e912de169f5311bb533', | ||||
|         'info_dict': { | ||||
|             'id': '9.6420651', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Is it a breath of fresh air? Measuring air quality in Edmonton', | ||||
|             'description': 'md5:3922b92cc8b69212d739bd9dd095b1c3', | ||||
|             'series': 'CBC News Edmonton', | ||||
|             'media_type': 'Excerpt', | ||||
|             'thumbnail': 'https://i.cbc.ca/ais/73c4ab9c-7ad4-46ee-bb9b-020fdc01c745,1718214547576/full/max/0/default.jpg', | ||||
|             'timestamp': 1718220065.768, | ||||
|             'upload_date': '20240612', | ||||
|             'duration': 286.086, | ||||
|             'chapters': [], | ||||
|             'genres': ['News'], | ||||
|             'categories': ['News', 'Edmonton'], | ||||
|             'tags': 'count:7', | ||||
|             'location': 'Edmonton', | ||||
|         }, | ||||
|     }, { | ||||
|         'url': 'cbcplayer:1.7159484', | ||||
| @@ -307,23 +376,113 @@ class CBCPlayerIE(InfoExtractor): | ||||
|         'only_matching': True, | ||||
|     }] | ||||
| 
 | ||||
|     def _parse_param(self, asset_data, name): | ||||
          # Look up a named parameter in an asset's JSON: walks
          # `asset_data['params']`, keeps the entries whose 'name' equals `name`,
          # and takes their 'value'.
          # NOTE(review): the trailing `{str}` / `any` steps presumably restrict
          # the result to a string and collapse it to the first match (None when
          # nothing matches) - confirm against traverse_obj's documentation.
|         return traverse_obj(asset_data, ('params', lambda _, v: v['name'] == name, 'value', {str}, any)) | ||||
| 
 | ||||
|     def _real_extract(self, url): | ||||
          # Extract a single cbc.ca player clip: download the player page, read
          # the `video.currentClip` object out of `window.__INITIAL_STATE__`, and
          # build the result dict (formats, subtitles, chapters, metadata) from it.
          #
          # NOTE(review): this span comes from a rendered diff that lost its +/-
          # markers, so removed and added lines appear side by side. The
          # `if '.' in video_id:` lookup just below and the `_type` / `ie_key` /
          # `url` keys directly under the final `return {` look like the
          # pre-change side - confirm against the upstream file before relying
          # on them.
|         video_id = self._match_id(url) | ||||
|         if '.' in video_id: | ||||
|             webpage = self._download_webpage(f'https://www.cbc.ca/player/play/{video_id}', video_id) | ||||
|             video_id = self._search_json( | ||||
|                 r'window\.__INITIAL_STATE__\s*=', webpage, | ||||
|                 'initial state', video_id)['video']['currentClip']['mediaId'] | ||||
|         webpage = self._download_webpage(f'https://www.cbc.ca/player/play/{video_id}', video_id) | ||||
|         data = self._search_json( | ||||
|             r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)['video']['currentClip'] | ||||
          # Keep only assets that have a truthy `type` and whose `key` passes
          # url_or_none; each `key` is fetched as a JSON document further down.
|         assets = traverse_obj( | ||||
|             data, ('media', 'assets', lambda _, v: url_or_none(v['key']) and v['type'])) | ||||
| 
 | ||||
          # Fallback: no usable assets, but the clip carries a string `mediaId`.
          # Hand the ID off to the ThePlatform extractor via a url_transparent result.
|         if not assets and (media_id := traverse_obj(data, ('mediaId', {str}))): | ||||
|             # XXX: Deprecated; CBC is migrating off of ThePlatform | ||||
|             return { | ||||
|                 '_type': 'url_transparent', | ||||
|                 'ie_key': 'ThePlatform', | ||||
|                 'url': smuggle_url( | ||||
|                     f'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/{media_id}?mbr=true&formats=MPEG4,FLV,MP3', { | ||||
|                         'force_smil_url': True, | ||||
|                     }), | ||||
|                 'id': media_id, | ||||
|                 '_format_sort_fields': ('res', 'proto'),  # Prioritize direct http formats over HLS | ||||
|             } | ||||
| 
 | ||||
          # `is_live` is passed to the m3u8 extraction, skips the direct-mp4
          # probe, and blanks `duration` in the returned dict.
|         is_live = traverse_obj(data, ('media', 'streamType', {str})) == 'Live' | ||||
|         formats, subtitles = [], {} | ||||
| 
 | ||||
          # Subtitle files listed in media.textTracks, grouped by their
          # `language` ('und' when missing or empty).
|         for sub in traverse_obj(data, ('media', 'textTracks', lambda _, v: url_or_none(v['src']))): | ||||
|             subtitles.setdefault(sub.get('language') or 'und', []).append({ | ||||
|                 'url': sub['src'], | ||||
|                 'name': sub.get('label'), | ||||
|             }) | ||||
| 
 | ||||
|         for asset in assets: | ||||
|             asset_key = asset['key'] | ||||
|             asset_type = asset['type'] | ||||
              # Only 'medianet' assets are handled; anything else is reported and skipped.
|             if asset_type != 'medianet': | ||||
|                 self.report_warning(f'Skipping unsupported asset type "{asset_type}": {asset_key}') | ||||
|                 continue | ||||
|             asset_data = self._download_json(asset_key, video_id, f'Downloading {asset_type} JSON') | ||||
              # The asset's 'contentType' param selects the branch: an m3u8
              # manifest is expanded into HLS formats, anything else becomes a
              # single direct format.
|             ext = mimetype2ext(self._parse_param(asset_data, 'contentType')) | ||||
|             if ext == 'm3u8': | ||||
|                 fmts, subs = self._extract_m3u8_formats_and_subtitles( | ||||
|                     asset_data['url'], video_id, 'mp4', m3u8_id='hls', live=is_live) | ||||
|                 formats.extend(fmts) | ||||
|                 # Avoid slow/error-prone webvtt-over-m3u8 if direct https vtt is available | ||||
|                 if not subtitles: | ||||
|                     self._merge_subtitles(subs, target=subtitles) | ||||
|                 if is_live or not fmts: | ||||
|                     continue | ||||
|                 # Check for direct https mp4 format | ||||
                  # Candidate: among formats with video and a truthy `tbr`, the
                  # one with the highest `tbr` ({} when none qualifies).
|                 best_video_fmt = traverse_obj(fmts, ( | ||||
|                     lambda _, v: v.get('vcodec') != 'none' and v['tbr'], all, | ||||
|                     {functools.partial(sorted, key=lambda x: x['tbr'])}, -1, {dict})) or {} | ||||
                  # The direct URL is guessed as the part of the format URL before
                  # `hdntl=` plus the URL's basename with its extension replaced by
                  # mp4; no such prefix, or '/live/' in it, means no probe.
|                 base_url = self._search_regex( | ||||
|                     r'(https?://[^?#]+?/)hdntl=', best_video_fmt.get('url'), 'base url', default=None) | ||||
|                 if not base_url or '/live/' in base_url: | ||||
|                     continue | ||||
|                 mp4_url = base_url + replace_extension(url_basename(best_video_fmt['url']), 'mp4') | ||||
                  # The guessed URL is only added when the non-fatal HEAD request
                  # returns something truthy.
|                 if self._request_webpage( | ||||
|                         HEADRequest(mp4_url), video_id, 'Checking for https format', | ||||
|                         errnote=False, fatal=False): | ||||
                      # Copy the HLS format's fields, then override the ones listed below.
|                     formats.append({ | ||||
|                         **best_video_fmt, | ||||
|                         'url': mp4_url, | ||||
|                         'format_id': 'https-mp4', | ||||
|                         'protocol': 'https', | ||||
|                         'manifest_url': None, | ||||
|                         'acodec': None, | ||||
|                     }) | ||||
|             else: | ||||
|                 formats.append({ | ||||
|                     'url': asset_data['url'], | ||||
|                     'ext': ext, | ||||
|                     'vcodec': 'none' if self._parse_param(asset_data, 'mediaType') == 'audio' else None, | ||||
|                 }) | ||||
| 
 | ||||
          # Chapter start/end values are passed through float_or_none with
          # scale=1000 - presumably milliseconds to seconds; TODO confirm.
|         chapters = traverse_obj(data, ( | ||||
|             'media', 'chapters', lambda _, v: float(v['startTime']) is not None, { | ||||
|                 'start_time': ('startTime', {functools.partial(float_or_none, scale=1000)}), | ||||
|                 'end_time': ('endTime', {functools.partial(float_or_none, scale=1000)}), | ||||
|                 'title': ('name', {str}), | ||||
|             })) | ||||
|         # Filter out pointless single chapters with start_time==0 and no end_time | ||||
|         if len(chapters) == 1 and not (chapters[0].get('start_time') or chapters[0].get('end_time')): | ||||
|             chapters = [] | ||||
| 
 | ||||
          # Result dict: metadata mapped from the clip object, then the ID and
          # the formats, subtitles, chapters and is_live values computed above.
|         return { | ||||
|             '_type': 'url_transparent', | ||||
|             'ie_key': 'ThePlatform', | ||||
|             'url': smuggle_url( | ||||
|                 f'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/{video_id}?mbr=true&formats=MPEG4,FLV,MP3', { | ||||
|                     'force_smil_url': True, | ||||
|                 }), | ||||
|             **traverse_obj(data, { | ||||
|                 'title': ('title', {str}), | ||||
|                 'description': ('description', {str.strip}), | ||||
|                 'thumbnail': ('image', 'url', {url_or_none}, {functools.partial(update_url, query=None)}), | ||||
|                 'timestamp': ('publishedAt', {functools.partial(float_or_none, scale=1000)}), | ||||
|                 'media_type': ('media', 'clipType', {str}), | ||||
|                 'series': ('showName', {str}), | ||||
|                 'season_number': ('media', 'season', {int_or_none}), | ||||
|                 'duration': ('media', 'duration', {float_or_none}, {lambda x: None if is_live else x}), | ||||
|                 'location': ('media', 'region', {str}), | ||||
|                 'tags': ('tags', ..., 'name', {str}), | ||||
|                 'genres': ('media', 'genre', all), | ||||
|                 'categories': ('categories', ..., 'name', {str}), | ||||
|             }), | ||||
|             'id': video_id, | ||||
|             '_format_sort_fields': ('res', 'proto'),  # Prioritize direct http formats over HLS | ||||
|             'formats': formats, | ||||
|             'subtitles': subtitles, | ||||
|             'chapters': chapters, | ||||
|             'is_live': is_live, | ||||
|         } | ||||
| 
 | ||||
| 
 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 trainman261
					trainman261