diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index fbbd9571f7..d7c38eaa46 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -576,6 +576,7 @@ ) from .ebaumsworld import EbaumsWorldIE from .ebay import EbayIE +from .educast import EducastChannelIE, EducastIE, EducastResultsIE from .egghead import ( EggheadCourseIE, EggheadLessonIE, diff --git a/yt_dlp/extractor/educast.py b/yt_dlp/extractor/educast.py new file mode 100644 index 0000000000..f97d2c5480 --- /dev/null +++ b/yt_dlp/extractor/educast.py @@ -0,0 +1,347 @@ +import re +from urllib import parse + +from .common import ( + ExtractorError, + InfoExtractor, +) +from ..networking import HEADRequest +from ..utils import ( + mimetype2ext, + traverse_obj, + unified_timestamp, +) + + +class EducastBaseIE(InfoExtractor): + _API_BASE = 'https://educast.fccn.pt' + + @staticmethod + def _paginate_and_collect(get_page_func, parse_func): + videos = [] + page = 1 + while True: + webpage = get_page_func(page) + if not webpage: + break + new_videos = parse_func(webpage) + found = False + for v in new_videos: + if not any(existing['id'] == v['id'] for existing in videos): + videos.append(v) + found = True + if not found: + break + page += 1 + return videos + + +class EducastIE(EducastBaseIE): + _VALID_URL = r'https?://(www)?educast\.fccn\.pt/vod/clips/(?P[a-zA-Z0-9]+)' + _TESTS = [{ + 'note': 'test for public Educast video downloading the merged format', + 'url': 'https://educast.fccn.pt/vod/clips/2o06o2c6hm/streaming.html', + 'md5': '264b3e2f0c6c5d3c8e1a86e57f21d0bc', + 'info_dict': { + 'id': '2o06o2c6hm', + 'ext': 'mp4', + 'title': 'Fundamentos de Bases de Dados', + 'alt_title': '', + 'description': '', + 'uploader': 'Professor Luís Cavique', + 'channel': 'UAB - Fundamentos de Base de dados', + 'channel_url': 'https://educast.fccn.pt/results?channel=k06h42n0w', + 'thumbnail': 'https://educast.fccn.pt/img/clips/2o06o2c6hm/delivery/cover', + 'categories': ['Tecnologia e Ciências Aplicadas', 'FCCN'], + 'timestamp': 1410946740, + 'upload_date': '20140917', + 'license': 'http://creativecommons.org/licenses/by-nc-nd/2.5/pt/', + 'duration': 1041, + }, + }, { + 'note': 'test for private Educast video downloading the merged format', + 'url': 'https://educast.fccn.pt/vod/clips/jhwehqk9/streaming.html', + 'md5': '242a4a8d1a84a4c3aab93771c3da244e', + 'info_dict': { + 'id': 'jhwehqk9', + 'ext': 'mp4', + 'title': ' Exercícios 8B. Equações Diferenciais Parciais', + 'alt_title': '', + 'description': '', + 'uploader': ' Rui Miguel Saramago', + 'channel': 'Cálculo Diferencial e Integral III - Aulas de Recuperação', + 'channel_url': 'https://educast.fccn.pt/results?channel=2fudccnyj7', + 'thumbnail': 'https://educast.fccn.pt/img/clips/jhwehqk9/delivery/cover', + 'categories': ['Ciências Naturais e Matemática', 'Universidade de Lisboa'], + 'license': 'http://creativecommons.org/licenses/by/4.0/', + 'duration': 2756, + }, + 'skip': 'This video is private and requires authentication to access', + }, { + 'note': 'test for deprecated streaming url, should rely on fallback', + 'url': 'https://educast.fccn.pt/vod/clips/2by2fw4fkx/streaming.html', + 'md5': '88055700118db7411d1cc0da48ca1747', + 'info_dict': { + 'id': '2by2fw4fkx', + 'ext': 'mp4', + 'title': 'Teoria 3A. Sistemas de Equaces Diferenciais Lineares de Primeira Ordem_', + }, + 'expected_warnings': ['Este vídeo não está preparado para HTML5'], + 'skip': 'This video is private and requires authentication to access', + }] + + def _extract_video_formats(self, video_json, video_id): + formats = [] + dash_url = traverse_obj(video_json, ('dash', 'url')) + if dash_url: + formats += self._extract_mpd_formats(dash_url, video_id, mpd_id='dash', fatal=False) + + hls_url = traverse_obj(video_json, ('hls', 'url')) + if hls_url: + formats += self._extract_m3u8_formats(hls_url, video_id, ext='mp4', entry_protocol='m3u8_native', fatal=False) + + for f in formats: + f['format_id'] = video_json.get('role') + + return formats + + def _extract_from_json(self, video_id): + data_json_url = f'https://educast.fccn.pt/vod/clips/{video_id}/video_player/data.json' + try: + data_json = self._download_json(data_json_url, video_id) + except ExtractorError as e: + self.report_warning(e) + return None + if data_json.get('error'): + self.report_warning(data_json.get('error')) + return None + + formats = [] + info = { + 'id': video_id, + 'formats': formats, + **traverse_obj(data_json, { + 'title': ('clip', 'name', {str}), + 'alt_title': ('subtitle', {str}), + 'description': ('clipDescription', {str}), + 'uploader': ('author', {str}), + 'timestamp': ('timestamp', {unified_timestamp}, {lambda x: x - 3600}), + 'thumbnail': ('cover', {str}), + 'license': ('licenceURL', {str}), + 'webpage_url': ('url', {str}), + 'channel': ('channel', 'name', {str}), + 'channel_url': ('channel', 'url', {str}), + 'duration': ('videos', 0, 'duration', {int}), + }), + 'categories': [cat for cat in ( + traverse_obj(data_json, ('area', 'name'), expected_type=str), + traverse_obj(data_json, ('institution', 'name'), expected_type=str), + ) if cat], + } + + for video_json in data_json.get('videos') or []: + formats.extend(self._extract_video_formats(video_json, video_id)) + + download_url = data_json.get('downloadURL') + if download_url: + formats.append({ + 'format_id': 'merged', + 'url': download_url, + 'quality': 0, + 'format_note': 'single stream, may be lower res', + }) + + return info + + def _try_fallback(self, url, video_id): + # Last resort for videos with no working streaming option + KNOWN_BASENAMES = ['desktop.mp4', 'ipod.m4v', 'quicktime.mov'] + for basename in KNOWN_BASENAMES: + format_url = url.replace('streaming.html', basename) + response = self._request_webpage( + HEADRequest(format_url), video_id, + note=f'Checking availability of {basename} fallback', + fatal=False, errnote=False) + if not response: + continue + ext = mimetype2ext(response.get_header('content-type')) + if ext not in ('mp4', 'm4v', 'mov'): + continue + title = None + ext_header = response.get_header('content-disposition') + if ext_header: + m = re.search(r'filename\s*=\s*"([^"]+)"', ext_header, re.IGNORECASE) + if m: + title = m.group(1).strip().removesuffix(f'.{ext}') + return { + 'id': video_id, + 'title': title, + 'url': format_url, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_from_json(video_id) or self._try_fallback(url, video_id) + + +class EducastChannelIE(EducastBaseIE): + IE_NAME = 'educast:channel' + _VALID_URL = r'https?://(?:www\.)?educast\.fccn\.pt/vod/channels/(?P[a-zA-Z0-9]+)/?(?:$|[?#])' + _TESTS = [{ + 'note': 'test for private Educast Channel', + 'url': 'https://educast.fccn.pt/vod/channels/2o0eonmrak', + 'info_dict': + { + 'id': '2o0eonmrak', + 'title': 'Vídeos Institucionais FCT-FCCN', + 'description': str, + }, + 'playlist_mincount': 26, + }, { + 'note': 'test for private Educast Channel', + 'url': 'https://educast.fccn.pt/vod/channels/2fudccnyj7', + 'info_dict': { + 'id': '2fudccnyj7', + 'title': 'Cálculo Diferencial e Integral III - Aulas de Recuperação', + 'description': str, + }, + 'playlist_mincount': 26, + 'skip': 'This channel is private and requires authentication to access', + }] + + def _extract_video_links_from_html(self, webpage, ie_key): + videos_by_id = {} + pattern = r'href="https://educast\.fccn\.pt/vod/clips/(?P[a-zA-Z0-9]+)/(?P