From b7d54b33e90297d31637174efa4ee430a500b030 Mon Sep 17 00:00:00 2001 From: Alexandre Ramos Date: Mon, 2 Jun 2025 02:00:20 +0100 Subject: [PATCH 1/3] [ie/educast] Add Extractor Adds a new extractor for the **Educast** platform, allowing download of both presenter and presentation streams in native quality, along with full metadata support. Includes support for downloading from individual video pages, full channel pages, and search results. Private pages are accesible via cookies. Co-authored-by: Filipe Resendes --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/educast.py | 455 ++++++++++++++++++++++++++++++++ 2 files changed, 456 insertions(+) create mode 100644 yt_dlp/extractor/educast.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b0c52e0fc..14c6ca2dd 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -576,6 +576,7 @@ ) from .ebaumsworld import EbaumsWorldIE from .ebay import EbayIE +from .educast import EducastChannelIE, EducastIE, EducastResultsIE from .egghead import ( EggheadCourseIE, EggheadLessonIE, diff --git a/yt_dlp/extractor/educast.py b/yt_dlp/extractor/educast.py new file mode 100644 index 000000000..8b2ecb2bc --- /dev/null +++ b/yt_dlp/extractor/educast.py @@ -0,0 +1,455 @@ +from .common import InfoExtractor +from ..networking import HEADRequest +from ..utils import ( + float_or_none, + int_or_none, + mimetype2ext, + str_or_none, + traverse_obj, + unified_timestamp, +) + + +class EducastIE(InfoExtractor): + _VALID_URL = r'https?://(www)?educast\.fccn\.pt/vod/clips/(?P[a-zA-Z0-9]+)' + _API_BASE = 'https://educast.fccn.pt' + + _TESTS = [ + { + 'note': 'test for public Educast video downloading the merged format', + 'url': 'https://educast.fccn.pt/vod/clips/2o06o2c6hm/streaming.html', + 'md5': '264b3e2f0c6c5d3c8e1a86e57f21d0bc', + 'info_dict': { + 'id': '2o06o2c6hm', + 'ext': 'mp4', + 'title': 'Fundamentos de Bases de Dados', + 'alt_title': '', + 'description': '', + 'uploader': 'Professor Luís Cavique', + 'channel': 'UAB - Fundamentos de Base de dados', + 'channel_url': 'https://educast.fccn.pt/results?channel=k06h42n0w', + 'thumbnail': 'https://educast.fccn.pt/img/clips/2o06o2c6hm/delivery/cover', + 'categories': ['Tecnologia e Ciências Aplicadas', 'FCCN'], + 'timestamp': 1410946740, + 'upload_date': '20140917', + 'license': 'http://creativecommons.org/licenses/by-nc-nd/2.5/pt/', + 'formats': [ + { + 'format_id': 'presenter-0', + 'ext': 'm4a', + 'vcodec': 'none', + 'acodec': 'mp4a.40.2', + 'protocol': 'http_dash_segments', + }, + { + 'format_id': 'presenter-1', + 'ext': 'mp4', + 'vcodec': 'avc1.77.40', + 'acodec': 'mp4a.40.2', + 'protocol': 'm3u8_native', + }, + { + 'format_id': 'presenter-2', + 'ext': 'mp4', + 'vcodec': 'avc1.4d4028', + 'acodec': 'none', + 'protocol': 'http_dash_segments', + 'fps': 25, + }, + { + 'format_id': 'presentation-0', + 'ext': 'mp4', + 'vcodec': 'avc1.77.40', + 'acodec': 'none', + 'protocol': 'm3u8_native', + }, + { + 'format_id': 'presentation-1', + 'ext': 'mp4', + 'vcodec': 'avc1.4d4028', + 'acodec': 'none', + 'protocol': 'http_dash_segments', + 'fps': 25, + }, + { + 'format_id': 'merged', + 'ext': 'mp4', + 'protocol': 'https', + 'format_note': 'single stream, may be lower res', + }, + ], + }, + }, + { + 'note': 'test for private Educast video downloading the merged format', + 'url': 'https://educast.fccn.pt/vod/clips/jhwehqk9/streaming.html', + 'md5': '242a4a8d1a84a4c3aab93771c3da244e', + 'info_dict': { + 'id': 'jhwehqk9', + 'ext': 'mp4', + 'title': ' Exercícios 8B. Equações Diferenciais Parciais', + 'alt_title': '', + 'description': '', + 'uploader': ' Rui Miguel Saramago', + 'channel': 'Cálculo Diferencial e Integral III - Aulas de Recuperação', + 'channel_url': 'https://educast.fccn.pt/results?channel=2fudccnyj7', + 'thumbnail': 'https://educast.fccn.pt/img/clips/jhwehqk9/delivery/cover', + 'categories': ['Ciências Naturais e Matemática', 'Universidade de Lisboa'], + 'license': 'http://creativecommons.org/licenses/by/4.0/', + 'formats': [ + { + 'format_id': 'presenter-0', + 'ext': 'm4a', + 'vcodec': 'none', + 'acodec': 'mp4a.40.2', + 'protocol': 'http_dash_segments', + }, + { + 'format_id': 'presenter-1', + 'ext': 'mp4', + 'vcodec': 'avc1.77.40', + 'acodec': 'mp4a.40.2', + 'protocol': 'm3u8_native', + }, + { + 'format_id': 'presenter-2', + 'ext': 'mp4', + 'vcodec': 'avc1.4d4028', + 'acodec': 'none', + 'protocol': 'http_dash_segments', + 'fps': 25, + }, + { + 'format_id': 'merged', + 'ext': 'mp4', + 'protocol': 'https', + 'format_note': 'single stream, may be lower res', + }, + ], + }, + 'skip': 'This video is private and requires authentication to access', + }, + { + 'note': 'test for deprecated streaming url, should rely on fallback', + 'url': 'https://educast.fccn.pt/vod/clips/2by2fw4fkx/streaming.html', + 'md5': '88055700118db7411d1cc0da48ca1747', + 'info_dict': { + 'id': '2by2fw4fkx', + 'ext': 'mp4', + 'title': 'Teoria 3A. Sistemas de Equaces Diferenciais Lineares de Primeira Ordem_', + }, + 'skip': 'This video is private and requires authentication to access', + }, + ] + + def parse_timestamp(self, timestamp_str): + if isinstance(timestamp_str, str) and '.' in timestamp_str: + day, month, year_time = timestamp_str.split('.', 2) + year, time = year_time.split(' ', 1) + reformatted = f'{year}-{month}-{day} {time}' + timestamp = unified_timestamp(reformatted) + if timestamp is not None: + timestamp -= 3600 # Lisbon time (UTC+1) + return timestamp + + def _extract_video_formats(self, video_json, video_id): + formats = [] + dash_url = traverse_obj(video_json, ('dash', 'url')) + if dash_url: + formats += self._extract_mpd_formats(dash_url, video_id, mpd_id='dash', fatal=False) + + hls_url = traverse_obj(video_json, ('hls', 'url')) + if hls_url: + formats += self._extract_m3u8_formats(hls_url, video_id, ext='mp4', entry_protocol='m3u8_native', fatal=False) + + for f in formats: + f['format_id'] = str_or_none(video_json.get('role')) + f['width'] = int_or_none(video_json.get('width')) + f['height'] = int_or_none(video_json.get('height')) + f['duration'] = float_or_none(video_json.get('duration')) + f['filesize_approx'] = int_or_none(float_or_none(f.get('duration')) * float_or_none(f.get('tbr')) * 1000 / 8) + + return formats + + def _extract_from_json(self, video_id): + data_json_url = f'https://educast.fccn.pt/vod/clips/{video_id}/video_player/data.json' + data_json = self._download_json(data_json_url, video_id, fatal=False) + if not data_json: + return None + if data_json.get('error'): + self.to_screen(data_json.get('error')) + return None + + formats = [] + info = { + 'id': video_id, + 'title': str_or_none(traverse_obj(data_json, ('clip', 'name'))), + 'formats': formats, + 'alt_title': str_or_none(data_json.get('subtitle')), + 'description': str_or_none(data_json.get('clipDescription')), + 'uploader': str_or_none(data_json.get('author')), + 'timestamp': self.parse_timestamp(data_json.get('timestamp')), + 'thumbnail': str_or_none(data_json.get('cover')), + 'license': str_or_none(data_json.get('licenceURL')), + 'webpage_url': str_or_none(data_json.get('url')), + 'channel': str_or_none(traverse_obj(data_json, ('channel', 'name'))), + 'channel_url': str_or_none(traverse_obj(data_json, ('channel', 'url'))), + 'categories': [cat for cat in ( + str_or_none(traverse_obj(data_json, ('area', 'name'))), + str_or_none(traverse_obj(data_json, ('institution', 'name'))), + ) if cat], + } + + for video_json in data_json.get('videos') or []: + formats.extend(self._extract_video_formats(video_json, video_id)) + + download_url = str_or_none(data_json.get('downloadURL')) + if download_url: + formats.append({ + 'format_id': 'merged', + 'url': download_url, + 'quality': 0, + 'format_note': 'single stream, may be lower res', + }) + + return info + + def _try_fallback(self, url, video_id): + import re + + # Last resort for videos with no working streaming option + KNOWN_BASENAMES = ['desktop.mp4', 'ipod.m4v', 'quicktime.mov'] + for basename in KNOWN_BASENAMES: + format_url = url.replace('streaming.html', basename) + response = self._request_webpage( + HEADRequest(format_url), video_id, + note=f'Checking availability of {basename} fallback', + fatal=False, errnote=False) + if not response: + continue + ext = mimetype2ext(response.get_header('content-type')) + if ext not in ('mp4', 'm4v', 'mov'): + continue + title = None + m = re.search(r'filename\s*=\s*"([^"]+)"', response.get_header('content-disposition'), re.IGNORECASE) + if m: + title = m.group(1).strip().removesuffix(f'.{ext}') + return { + 'id': video_id, + 'title': title, + 'url': format_url, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_from_json(video_id) or self._try_fallback(url, video_id) + + @staticmethod + def _paginate_and_collect(get_page_func, parse_func, max_videos=None): + videos = [] + page = 1 + while True: + if max_videos is not None and len(videos) >= max_videos: + break + webpage = get_page_func(page) + if not webpage: + break + new_videos = parse_func(webpage) + found = False + for v in new_videos: + if not any(existing['id'] == v['id'] for existing in videos): + videos.append(v) + found = True + if max_videos is not None and len(videos) >= max_videos: + break + if not found or (max_videos is not None and len(videos) >= max_videos): + break + page += 1 + return videos + + +class EducastChannelIE(InfoExtractor): + IE_NAME = 'educast:channel' + _VALID_URL = r'https?://(?:www\.)?educast\.fccn\.pt/vod/channels/(?P[a-zA-Z0-9]+)/?(?:$|[?#])' + _TESTS = [ + { + 'note': 'test for private Educast Channel', + 'url': 'https://educast.fccn.pt/vod/channels/2o0eonmrak', + 'info_dict': + { + 'id': '2o0eonmrak', + 'title': 'Vídeos Institucionais FCT-FCCN', + 'description': str, + }, + 'playlist_mincount': 26, + }, + { + 'note': 'test for private Educast Channel', + 'url': 'https://educast.fccn.pt/vod/channels/2fudccnyj7', + 'info_dict': { + 'id': '2fudccnyj7', + 'title': 'Cálculo Diferencial e Integral III - Aulas de Recuperação', + 'description': str, + }, + 'playlist_mincount': 26, + 'skip': 'This channel is private and requires authentication to access', + }, + ] + + def _extract_video_links_from_html(self, webpage, ie_key): + import re + videos_by_id = {} + pattern = r'href="https://educast\.fccn\.pt/vod/clips/(?P[a-zA-Z0-9]+)/(?P