diff --git a/yt_dlp/extractor/educast.py b/yt_dlp/extractor/educast.py index 8b2ecb2bc..3d185bbf3 100644 --- a/yt_dlp/extractor/educast.py +++ b/yt_dlp/extractor/educast.py @@ -1,146 +1,94 @@ -from .common import InfoExtractor +import re +from urllib import parse + +from .common import ( + ExtractorError, + InfoExtractor, +) from ..networking import HEADRequest from ..utils import ( - float_or_none, - int_or_none, mimetype2ext, - str_or_none, traverse_obj, unified_timestamp, ) -class EducastIE(InfoExtractor): - _VALID_URL = r'https?://(www)?educast\.fccn\.pt/vod/clips/(?P[a-zA-Z0-9]+)' +class EducastBaseIE(InfoExtractor): _API_BASE = 'https://educast.fccn.pt' - _TESTS = [ - { - 'note': 'test for public Educast video downloading the merged format', - 'url': 'https://educast.fccn.pt/vod/clips/2o06o2c6hm/streaming.html', - 'md5': '264b3e2f0c6c5d3c8e1a86e57f21d0bc', - 'info_dict': { - 'id': '2o06o2c6hm', - 'ext': 'mp4', - 'title': 'Fundamentos de Bases de Dados', - 'alt_title': '', - 'description': '', - 'uploader': 'Professor Luís Cavique', - 'channel': 'UAB - Fundamentos de Base de dados', - 'channel_url': 'https://educast.fccn.pt/results?channel=k06h42n0w', - 'thumbnail': 'https://educast.fccn.pt/img/clips/2o06o2c6hm/delivery/cover', - 'categories': ['Tecnologia e Ciências Aplicadas', 'FCCN'], - 'timestamp': 1410946740, - 'upload_date': '20140917', - 'license': 'http://creativecommons.org/licenses/by-nc-nd/2.5/pt/', - 'formats': [ - { - 'format_id': 'presenter-0', - 'ext': 'm4a', - 'vcodec': 'none', - 'acodec': 'mp4a.40.2', - 'protocol': 'http_dash_segments', - }, - { - 'format_id': 'presenter-1', - 'ext': 'mp4', - 'vcodec': 'avc1.77.40', - 'acodec': 'mp4a.40.2', - 'protocol': 'm3u8_native', - }, - { - 'format_id': 'presenter-2', - 'ext': 'mp4', - 'vcodec': 'avc1.4d4028', - 'acodec': 'none', - 'protocol': 'http_dash_segments', - 'fps': 25, - }, - { - 'format_id': 'presentation-0', - 'ext': 'mp4', - 'vcodec': 'avc1.77.40', - 'acodec': 'none', - 'protocol': 'm3u8_native', - }, - { - 'format_id': 'presentation-1', - 'ext': 'mp4', - 'vcodec': 'avc1.4d4028', - 'acodec': 'none', - 'protocol': 'http_dash_segments', - 'fps': 25, - }, - { - 'format_id': 'merged', - 'ext': 'mp4', - 'protocol': 'https', - 'format_note': 'single stream, may be lower res', - }, - ], - }, + @staticmethod + def _paginate_and_collect(get_page_func, parse_func): + videos = [] + page = 1 + while True: + webpage = get_page_func(page) + if not webpage: + break + new_videos = parse_func(webpage) + found = False + for v in new_videos: + if not any(existing['id'] == v['id'] for existing in videos): + videos.append(v) + found = True + if not found: + break + page += 1 + return videos + + +class EducastIE(EducastBaseIE): + _VALID_URL = r'https?://(www)?educast\.fccn\.pt/vod/clips/(?P[a-zA-Z0-9]+)' + _TESTS = [{ + 'note': 'test for public Educast video downloading the merged format', + 'url': 'https://educast.fccn.pt/vod/clips/2o06o2c6hm/streaming.html', + 'md5': '264b3e2f0c6c5d3c8e1a86e57f21d0bc', + 'info_dict': { + 'id': '2o06o2c6hm', + 'ext': 'mp4', + 'title': 'Fundamentos de Bases de Dados', + 'alt_title': '', + 'description': '', + 'uploader': 'Professor Luís Cavique', + 'channel': 'UAB - Fundamentos de Base de dados', + 'channel_url': 'https://educast.fccn.pt/results?channel=k06h42n0w', + 'thumbnail': 'https://educast.fccn.pt/img/clips/2o06o2c6hm/delivery/cover', + 'categories': ['Tecnologia e Ciências Aplicadas', 'FCCN'], + 'timestamp': 1410946740, + 'upload_date': '20140917', + 'license': 'http://creativecommons.org/licenses/by-nc-nd/2.5/pt/', + 'duration': 1041, }, - { - 'note': 'test for private Educast video downloading the merged format', - 'url': 'https://educast.fccn.pt/vod/clips/jhwehqk9/streaming.html', - 'md5': '242a4a8d1a84a4c3aab93771c3da244e', - 'info_dict': { - 'id': 'jhwehqk9', - 'ext': 'mp4', - 'title': ' Exercícios 8B. Equações Diferenciais Parciais', - 'alt_title': '', - 'description': '', - 'uploader': ' Rui Miguel Saramago', - 'channel': 'Cálculo Diferencial e Integral III - Aulas de Recuperação', - 'channel_url': 'https://educast.fccn.pt/results?channel=2fudccnyj7', - 'thumbnail': 'https://educast.fccn.pt/img/clips/jhwehqk9/delivery/cover', - 'categories': ['Ciências Naturais e Matemática', 'Universidade de Lisboa'], - 'license': 'http://creativecommons.org/licenses/by/4.0/', - 'formats': [ - { - 'format_id': 'presenter-0', - 'ext': 'm4a', - 'vcodec': 'none', - 'acodec': 'mp4a.40.2', - 'protocol': 'http_dash_segments', - }, - { - 'format_id': 'presenter-1', - 'ext': 'mp4', - 'vcodec': 'avc1.77.40', - 'acodec': 'mp4a.40.2', - 'protocol': 'm3u8_native', - }, - { - 'format_id': 'presenter-2', - 'ext': 'mp4', - 'vcodec': 'avc1.4d4028', - 'acodec': 'none', - 'protocol': 'http_dash_segments', - 'fps': 25, - }, - { - 'format_id': 'merged', - 'ext': 'mp4', - 'protocol': 'https', - 'format_note': 'single stream, may be lower res', - }, - ], - }, - 'skip': 'This video is private and requires authentication to access', + }, { + 'note': 'test for private Educast video downloading the merged format', + 'url': 'https://educast.fccn.pt/vod/clips/jhwehqk9/streaming.html', + 'md5': '242a4a8d1a84a4c3aab93771c3da244e', + 'info_dict': { + 'id': 'jhwehqk9', + 'ext': 'mp4', + 'title': ' Exercícios 8B. Equações Diferenciais Parciais', + 'alt_title': '', + 'description': '', + 'uploader': ' Rui Miguel Saramago', + 'channel': 'Cálculo Diferencial e Integral III - Aulas de Recuperação', + 'channel_url': 'https://educast.fccn.pt/results?channel=2fudccnyj7', + 'thumbnail': 'https://educast.fccn.pt/img/clips/jhwehqk9/delivery/cover', + 'categories': ['Ciências Naturais e Matemática', 'Universidade de Lisboa'], + 'license': 'http://creativecommons.org/licenses/by/4.0/', + 'duration': 2756, }, - { - 'note': 'test for deprecated streaming url, should rely on fallback', - 'url': 'https://educast.fccn.pt/vod/clips/2by2fw4fkx/streaming.html', - 'md5': '88055700118db7411d1cc0da48ca1747', - 'info_dict': { - 'id': '2by2fw4fkx', - 'ext': 'mp4', - 'title': 'Teoria 3A. Sistemas de Equaces Diferenciais Lineares de Primeira Ordem_', - }, - 'skip': 'This video is private and requires authentication to access', + 'skip': 'This video is private and requires authentication to access', + }, { + 'note': 'test for deprecated streaming url, should rely on fallback', + 'url': 'https://educast.fccn.pt/vod/clips/2by2fw4fkx/streaming.html', + 'md5': '88055700118db7411d1cc0da48ca1747', + 'info_dict': { + 'id': '2by2fw4fkx', + 'ext': 'mp4', + 'title': 'Teoria 3A. Sistemas de Equaces Diferenciais Lineares de Primeira Ordem_', }, - ] + 'expected_warnings': ['Este vídeo não está preparado para HTML5'], + 'skip': 'This video is private and requires authentication to access', + }] def parse_timestamp(self, timestamp_str): if isinstance(timestamp_str, str) and '.' in timestamp_str: @@ -163,47 +111,48 @@ def _extract_video_formats(self, video_json, video_id): formats += self._extract_m3u8_formats(hls_url, video_id, ext='mp4', entry_protocol='m3u8_native', fatal=False) for f in formats: - f['format_id'] = str_or_none(video_json.get('role')) - f['width'] = int_or_none(video_json.get('width')) - f['height'] = int_or_none(video_json.get('height')) - f['duration'] = float_or_none(video_json.get('duration')) - f['filesize_approx'] = int_or_none(float_or_none(f.get('duration')) * float_or_none(f.get('tbr')) * 1000 / 8) + f['format_id'] = video_json.get('role') return formats def _extract_from_json(self, video_id): data_json_url = f'https://educast.fccn.pt/vod/clips/{video_id}/video_player/data.json' - data_json = self._download_json(data_json_url, video_id, fatal=False) - if not data_json: + try: + data_json = self._download_json(data_json_url, video_id) + except ExtractorError as e: + self.report_warning(e) return None if data_json.get('error'): - self.to_screen(data_json.get('error')) + self.report_warning(data_json.get('error')) return None formats = [] info = { 'id': video_id, - 'title': str_or_none(traverse_obj(data_json, ('clip', 'name'))), 'formats': formats, - 'alt_title': str_or_none(data_json.get('subtitle')), - 'description': str_or_none(data_json.get('clipDescription')), - 'uploader': str_or_none(data_json.get('author')), - 'timestamp': self.parse_timestamp(data_json.get('timestamp')), - 'thumbnail': str_or_none(data_json.get('cover')), - 'license': str_or_none(data_json.get('licenceURL')), - 'webpage_url': str_or_none(data_json.get('url')), - 'channel': str_or_none(traverse_obj(data_json, ('channel', 'name'))), - 'channel_url': str_or_none(traverse_obj(data_json, ('channel', 'url'))), + **traverse_obj(data_json, { + 'title': ('clip', 'name', {str}), + 'alt_title': ('subtitle', {str}), + 'description': ('clipDescription', {str}), + 'uploader': ('author', {str}), + 'timestamp': ('timestamp', {self.parse_timestamp}), + 'thumbnail': ('cover', {str}), + 'license': ('licenceURL', {str}), + 'webpage_url': ('url', {str}), + 'channel': ('channel', 'name', {str}), + 'channel_url': ('channel', 'url', {str}), + 'duration': ('videos', 0, 'duration', {int}), + }), 'categories': [cat for cat in ( - str_or_none(traverse_obj(data_json, ('area', 'name'))), - str_or_none(traverse_obj(data_json, ('institution', 'name'))), + traverse_obj(data_json, ('area', 'name'), expected_type=str), + traverse_obj(data_json, ('institution', 'name'), expected_type=str), ) if cat], } for video_json in data_json.get('videos') or []: formats.extend(self._extract_video_formats(video_json, video_id)) - download_url = str_or_none(data_json.get('downloadURL')) + download_url = data_json.get('downloadURL') if download_url: formats.append({ 'format_id': 'merged', @@ -215,8 +164,6 @@ def _extract_from_json(self, video_id): return info def _try_fallback(self, url, video_id): - import re - # Last resort for videos with no working streaming option KNOWN_BASENAMES = ['desktop.mp4', 'ipod.m4v', 'quicktime.mov'] for basename in KNOWN_BASENAMES: @@ -231,9 +178,11 @@ def _try_fallback(self, url, video_id): if ext not in ('mp4', 'm4v', 'mov'): continue title = None - m = re.search(r'filename\s*=\s*"([^"]+)"', response.get_header('content-disposition'), re.IGNORECASE) - if m: - title = m.group(1).strip().removesuffix(f'.{ext}') + ext_header = response.get_header('content-disposition') + if ext_header: + m = re.search(r'filename\s*=\s*"([^"]+)"', ext_header, re.IGNORECASE) + if m: + title = m.group(1).strip().removesuffix(f'.{ext}') return { 'id': video_id, 'title': title, @@ -244,60 +193,33 @@ def _real_extract(self, url): video_id = self._match_id(url) return self._extract_from_json(video_id) or self._try_fallback(url, video_id) - @staticmethod - def _paginate_and_collect(get_page_func, parse_func, max_videos=None): - videos = [] - page = 1 - while True: - if max_videos is not None and len(videos) >= max_videos: - break - webpage = get_page_func(page) - if not webpage: - break - new_videos = parse_func(webpage) - found = False - for v in new_videos: - if not any(existing['id'] == v['id'] for existing in videos): - videos.append(v) - found = True - if max_videos is not None and len(videos) >= max_videos: - break - if not found or (max_videos is not None and len(videos) >= max_videos): - break - page += 1 - return videos - -class EducastChannelIE(InfoExtractor): +class EducastChannelIE(EducastBaseIE): IE_NAME = 'educast:channel' _VALID_URL = r'https?://(?:www\.)?educast\.fccn\.pt/vod/channels/(?P[a-zA-Z0-9]+)/?(?:$|[?#])' - _TESTS = [ + _TESTS = [{ + 'note': 'test for private Educast Channel', + 'url': 'https://educast.fccn.pt/vod/channels/2o0eonmrak', + 'info_dict': { - 'note': 'test for private Educast Channel', - 'url': 'https://educast.fccn.pt/vod/channels/2o0eonmrak', - 'info_dict': - { - 'id': '2o0eonmrak', - 'title': 'Vídeos Institucionais FCT-FCCN', - 'description': str, - }, - 'playlist_mincount': 26, + 'id': '2o0eonmrak', + 'title': 'Vídeos Institucionais FCT-FCCN', + 'description': str, }, - { - 'note': 'test for private Educast Channel', - 'url': 'https://educast.fccn.pt/vod/channels/2fudccnyj7', - 'info_dict': { - 'id': '2fudccnyj7', - 'title': 'Cálculo Diferencial e Integral III - Aulas de Recuperação', - 'description': str, - }, - 'playlist_mincount': 26, - 'skip': 'This channel is private and requires authentication to access', + 'playlist_mincount': 26, + }, { + 'note': 'test for private Educast Channel', + 'url': 'https://educast.fccn.pt/vod/channels/2fudccnyj7', + 'info_dict': { + 'id': '2fudccnyj7', + 'title': 'Cálculo Diferencial e Integral III - Aulas de Recuperação', + 'description': str, }, - ] + 'playlist_mincount': 26, + 'skip': 'This channel is private and requires authentication to access', + }] def _extract_video_links_from_html(self, webpage, ie_key): - import re videos_by_id = {} pattern = r'href="https://educast\.fccn\.pt/vod/clips/(?P[a-zA-Z0-9]+)/(?P