[ie/educast] Add Extractor

Adds a new extractor for the **Educast** platform, allowing download of both presenter and presentation streams in native quality, along with full metadata support. Includes support for downloading from individual video pages, full channel pages, and search results. Private pages are accessible via cookies. Co-authored-by: Filipe Resendes <filipe.resendes@tecnico.ulisboa.pt>
2025-12-14 04:05:16 +00:00 · 2025-06-02 02:00:20 +01:00
parent c723c4e5e7
commit b7d54b33e9
2 changed files with 456 additions and 0 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -576,6 +576,7 @@ from .eagleplatform import (
 )
 from .ebaumsworld import EbaumsWorldIE
 from .ebay import EbayIE
+from .educast import EducastChannelIE, EducastIE, EducastResultsIE
 from .egghead import (
    EggheadCourseIE,
    EggheadLessonIE,
--- a/yt_dlp/extractor/educast.py
+++ b/yt_dlp/extractor/educast.py
@@ -0,0 +1,455 @@
+import re
+import urllib.parse
+
+from .common import InfoExtractor
+from ..networking import HEADRequest
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    mimetype2ext,
+    str_or_none,
+    traverse_obj,
+    unified_timestamp,
+)
+
+
+class EducastIE(InfoExtractor):
+    """Extractor for single Educast clips (``/vod/clips/<id>/...``).
+
+    Metadata and streams come from the clip's ``video_player/data.json``;
+    when that yields nothing, a few well-known direct-download file names
+    are probed with HEAD requests as a last resort.
+    """
+
+    # The optional "www." prefix is spelled the same way as in the channel
+    # and results extractors of this module.
+    _VALID_URL = r'https?://(?:www\.)?educast\.fccn\.pt/vod/clips/(?P<id>[a-zA-Z0-9]+)'
+    _API_BASE = 'https://educast.fccn.pt'
+
+    _TESTS = [
+        {
+            'note': 'test for public Educast video downloading the merged format',
+            'url': 'https://educast.fccn.pt/vod/clips/2o06o2c6hm/streaming.html',
+            'md5': '264b3e2f0c6c5d3c8e1a86e57f21d0bc',
+            'info_dict': {
+                'id': '2o06o2c6hm',
+                'ext': 'mp4',
+                'title': 'Fundamentos de Bases de Dados',
+                'alt_title': '',
+                'description': '',
+                'uploader': 'Professor Luís Cavique',
+                'channel': 'UAB - Fundamentos de Base de dados',
+                'channel_url': 'https://educast.fccn.pt/results?channel=k06h42n0w',
+                'thumbnail': 'https://educast.fccn.pt/img/clips/2o06o2c6hm/delivery/cover',
+                'categories': ['Tecnologia e Ciências Aplicadas', 'FCCN'],
+                'timestamp': 1410946740,
+                'upload_date': '20140917',
+                'license': 'http://creativecommons.org/licenses/by-nc-nd/2.5/pt/',
+                'formats': [
+                    {
+                        'format_id': 'presenter-0',
+                        'ext': 'm4a',
+                        'vcodec': 'none',
+                        'acodec': 'mp4a.40.2',
+                        'protocol': 'http_dash_segments',
+                    },
+                    {
+                        'format_id': 'presenter-1',
+                        'ext': 'mp4',
+                        'vcodec': 'avc1.77.40',
+                        'acodec': 'mp4a.40.2',
+                        'protocol': 'm3u8_native',
+                    },
+                    {
+                        'format_id': 'presenter-2',
+                        'ext': 'mp4',
+                        'vcodec': 'avc1.4d4028',
+                        'acodec': 'none',
+                        'protocol': 'http_dash_segments',
+                        'fps': 25,
+                    },
+                    {
+                        'format_id': 'presentation-0',
+                        'ext': 'mp4',
+                        'vcodec': 'avc1.77.40',
+                        'acodec': 'none',
+                        'protocol': 'm3u8_native',
+                    },
+                    {
+                        'format_id': 'presentation-1',
+                        'ext': 'mp4',
+                        'vcodec': 'avc1.4d4028',
+                        'acodec': 'none',
+                        'protocol': 'http_dash_segments',
+                        'fps': 25,
+                    },
+                    {
+                        'format_id': 'merged',
+                        'ext': 'mp4',
+                        'protocol': 'https',
+                        'format_note': 'single stream, may be lower res',
+                    },
+                ],
+            },
+        },
+        {
+            'note': 'test for private Educast video downloading the merged format',
+            'url': 'https://educast.fccn.pt/vod/clips/jhwehqk9/streaming.html',
+            'md5': '242a4a8d1a84a4c3aab93771c3da244e',
+            'info_dict': {
+                'id': 'jhwehqk9',
+                'ext': 'mp4',
+                'title': ' Exercícios 8B. Equações Diferenciais Parciais',
+                'alt_title': '',
+                'description': '',
+                'uploader': ' Rui Miguel Saramago',
+                'channel': 'Cálculo Diferencial e Integral III - Aulas de Recuperação',
+                'channel_url': 'https://educast.fccn.pt/results?channel=2fudccnyj7',
+                'thumbnail': 'https://educast.fccn.pt/img/clips/jhwehqk9/delivery/cover',
+                'categories': ['Ciências Naturais e Matemática', 'Universidade de Lisboa'],
+                'license': 'http://creativecommons.org/licenses/by/4.0/',
+                'formats': [
+                    {
+                        'format_id': 'presenter-0',
+                        'ext': 'm4a',
+                        'vcodec': 'none',
+                        'acodec': 'mp4a.40.2',
+                        'protocol': 'http_dash_segments',
+                    },
+                    {
+                        'format_id': 'presenter-1',
+                        'ext': 'mp4',
+                        'vcodec': 'avc1.77.40',
+                        'acodec': 'mp4a.40.2',
+                        'protocol': 'm3u8_native',
+                    },
+                    {
+                        'format_id': 'presenter-2',
+                        'ext': 'mp4',
+                        'vcodec': 'avc1.4d4028',
+                        'acodec': 'none',
+                        'protocol': 'http_dash_segments',
+                        'fps': 25,
+                    },
+                    {
+                        'format_id': 'merged',
+                        'ext': 'mp4',
+                        'protocol': 'https',
+                        'format_note': 'single stream, may be lower res',
+                    },
+                ],
+            },
+            'skip': 'This video is private and requires authentication to access',
+        },
+        {
+            'note': 'test for deprecated streaming url, should rely on fallback',
+            'url': 'https://educast.fccn.pt/vod/clips/2by2fw4fkx/streaming.html',
+            'md5': '88055700118db7411d1cc0da48ca1747',
+            'info_dict': {
+                'id': '2by2fw4fkx',
+                'ext': 'mp4',
+                'title': 'Teoria 3A. Sistemas de Equaces Diferenciais Lineares de Primeira Ordem_',
+            },
+            'skip': 'This video is private and requires authentication to access',
+        },
+    ]
+
+    def parse_timestamp(self, timestamp_str):
+        """Convert a ``<day>.<month>.<year> <time>`` string to a UNIX timestamp.
+
+        Returns None when the value is not a string, contains no ``.``, does
+        not split into the expected parts, or cannot be parsed.
+        """
+        if not isinstance(timestamp_str, str) or '.' not in timestamp_str:
+            return None
+        try:
+            day, month, year_time = timestamp_str.split('.', 2)
+            year, time = year_time.split(' ', 1)
+        except ValueError:
+            # Fewer separators than expected; treat as "no timestamp"
+            return None
+        timestamp = unified_timestamp(f'{year}-{month}-{day} {time}')
+        if timestamp is not None:
+            # NOTE(review): a fixed one-hour offset is applied ("Lisbon time
+            # (UTC+1)" per the original author); Lisbon is UTC+0 outside of
+            # summer time, so winter uploads may be off by an hour - confirm
+            # which timezone the API reports in.
+            timestamp -= 3600
+        return timestamp
+
+    def _extract_video_formats(self, video_json, video_id):
+        """Build the DASH and HLS formats for one entry of ``data.json``'s ``videos``.
+
+        Every format is tagged with the entry's ``role`` as format_id and its
+        ``duration``; ``width``/``height`` are applied to formats that carry
+        video. ``filesize_approx`` is derived from duration and bitrate when
+        both are known, otherwise it is set to None.
+        """
+        formats = []
+        dash_url = traverse_obj(video_json, ('dash', 'url'))
+        if dash_url:
+            formats.extend(self._extract_mpd_formats(dash_url, video_id, mpd_id='dash', fatal=False))
+
+        hls_url = traverse_obj(video_json, ('hls', 'url'))
+        if hls_url:
+            formats.extend(self._extract_m3u8_formats(
+                hls_url, video_id, ext='mp4', entry_protocol='m3u8_native', fatal=False))
+
+        # Values shared by all formats of this entry; looked up once
+        role = str_or_none(video_json.get('role'))
+        width = int_or_none(video_json.get('width'))
+        height = int_or_none(video_json.get('height'))
+        duration = float_or_none(video_json.get('duration'))
+
+        for f in formats:
+            f['format_id'] = role
+            if f.get('vcodec') != 'none':
+                # A picture size is meaningless on audio-only formats
+                f['width'] = width
+                f['height'] = height
+            f['duration'] = duration
+            tbr = float_or_none(f.get('tbr'))
+            # Either value may be missing; multiplying None would raise
+            f['filesize_approx'] = (
+                int(duration * tbr * 1000 / 8)
+                if duration is not None and tbr is not None else None)
+
+        return formats
+
+    def _extract_from_json(self, video_id):
+        """Return the info dict built from the clip's ``data.json``.
+
+        Returns None when the document cannot be downloaded, is not a JSON
+        object, or carries an ``error`` field (which is printed to screen).
+        """
+        data_json = self._download_json(
+            f'{self._API_BASE}/vod/clips/{video_id}/video_player/data.json', video_id, fatal=False)
+        if not data_json or not isinstance(data_json, dict):
+            return None
+        error = data_json.get('error')
+        if error:
+            self.to_screen(error)
+            return None
+
+        formats = []
+        for video_json in data_json.get('videos') or []:
+            formats.extend(self._extract_video_formats(video_json, video_id))
+
+        download_url = str_or_none(data_json.get('downloadURL'))
+        if download_url:
+            formats.append({
+                'format_id': 'merged',
+                'url': download_url,
+                'quality': 0,
+                'format_note': 'single stream, may be lower res',
+            })
+
+        return {
+            'id': video_id,
+            'title': str_or_none(traverse_obj(data_json, ('clip', 'name'))),
+            'formats': formats,
+            'alt_title': str_or_none(data_json.get('subtitle')),
+            'description': str_or_none(data_json.get('clipDescription')),
+            'uploader': str_or_none(data_json.get('author')),
+            'timestamp': self.parse_timestamp(data_json.get('timestamp')),
+            'thumbnail': str_or_none(data_json.get('cover')),
+            'license': str_or_none(data_json.get('licenceURL')),
+            'webpage_url': str_or_none(data_json.get('url')),
+            'channel': str_or_none(traverse_obj(data_json, ('channel', 'name'))),
+            'channel_url': str_or_none(traverse_obj(data_json, ('channel', 'url'))),
+            'categories': [cat for cat in (
+                str_or_none(traverse_obj(data_json, ('area', 'name'))),
+                str_or_none(traverse_obj(data_json, ('institution', 'name'))),
+            ) if cat],
+        }
+
+    def _try_fallback(self, url, video_id):
+        """Probe well-known direct-download files for the clip.
+
+        The candidate URLs are built from ``_API_BASE`` and ``video_id`` so the
+        probe also works for clip URLs that do not end in ``streaming.html``;
+        ``url`` is accepted for interface compatibility only.
+
+        Returns a minimal info dict for the first candidate that answers a
+        HEAD request with an mp4/m4v/mov content type, or None if none does.
+        """
+        # Last resort for videos with no working streaming option
+        KNOWN_BASENAMES = ['desktop.mp4', 'ipod.m4v', 'quicktime.mov']
+        for basename in KNOWN_BASENAMES:
+            format_url = f'{self._API_BASE}/vod/clips/{video_id}/{basename}'
+            response = self._request_webpage(
+                HEADRequest(format_url), video_id,
+                note=f'Checking availability of {basename} fallback',
+                fatal=False, errnote=False)
+            if not response:
+                continue
+            ext = mimetype2ext(response.get_header('content-type'))
+            if ext not in ('mp4', 'm4v', 'mov'):
+                continue
+            title = None
+            # The header may be absent; searching None would raise TypeError
+            m = re.search(
+                r'filename\s*=\s*"([^"]+)"',
+                response.get_header('content-disposition') or '', re.IGNORECASE)
+            if m:
+                title = m.group(1).strip().removesuffix(f'.{ext}')
+            return {
+                'id': video_id,
+                'title': title,
+                'url': format_url,
+            }
+        return None
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self._extract_from_json(video_id) or self._try_fallback(url, video_id)
+
+    @staticmethod
+    def _paginate_and_collect(get_page_func, parse_func, max_videos=None):
+        """Collect unique entries from consecutive pages, starting at page 1.
+
+        ``get_page_func(page)`` returns the page content (falsy to stop) and
+        ``parse_func(content)`` returns a list of dicts with an ``id`` key.
+        Collection stops when a page is empty, when a page adds no new ids,
+        or once ``max_videos`` entries have been gathered (if given).
+        """
+        videos = []
+        seen_ids = set()  # O(1) duplicate detection across pages
+        page = 1
+        while max_videos is None or len(videos) < max_videos:
+            webpage = get_page_func(page)
+            if not webpage:
+                break
+            found_new = False
+            for video in parse_func(webpage):
+                if video['id'] in seen_ids:
+                    continue
+                seen_ids.add(video['id'])
+                videos.append(video)
+                found_new = True
+                if max_videos is not None and len(videos) >= max_videos:
+                    break
+            if not found_new:
+                # A page without unseen ids means we ran past the last page
+                break
+            page += 1
+        return videos
+
+
+class EducastChannelIE(InfoExtractor):
+    """Extractor for Educast channel pages (``/vod/channels/<id>``), returned as a playlist."""
+
+    IE_NAME = 'educast:channel'
+    _VALID_URL = r'https?://(?:www\.)?educast\.fccn\.pt/vod/channels/(?P<id>[a-zA-Z0-9]+)/?(?:$|[?#])'
+    _TESTS = [
+        {
+            'note': 'test for public Educast Channel',
+            'url': 'https://educast.fccn.pt/vod/channels/2o0eonmrak',
+            'info_dict':
+            {
+                'id': '2o0eonmrak',
+                'title': 'Vídeos Institucionais FCT-FCCN',
+                'description': str,
+            },
+            'playlist_mincount': 26,
+        },
+        {
+            'note': 'test for private Educast Channel',
+            'url': 'https://educast.fccn.pt/vod/channels/2fudccnyj7',
+            'info_dict': {
+                'id': '2fudccnyj7',
+                'title': 'Cálculo Diferencial e Integral III - Aulas de Recuperação',
+                'description': str,
+            },
+            'playlist_mincount': 26,
+            'skip': 'This channel is private and requires authentication to access',
+        },
+    ]
+
+    def _extract_video_links_from_html(self, webpage, ie_key):
+        """Return one ``url`` entry per clip id linked from ``webpage``.
+
+        A clip may be linked several times with different trailing file names;
+        the ``streaming.html`` link is used when present, otherwise the first
+        one encountered. Entries keep the order in which ids first appear.
+        """
+        options_by_id = {}
+        pattern = r'href="https://educast\.fccn\.pt/vod/clips/(?P<id>[a-zA-Z0-9]+)/(?P<option>[^?"/]+)'
+        for m in re.finditer(pattern, webpage or '', re.IGNORECASE):
+            options_by_id.setdefault(m.group('id'), []).append(m.group('option'))
+
+        videos = []
+        for video_id, options in options_by_id.items():
+            # prefer 'streaming.html'
+            option = 'streaming.html' if 'streaming.html' in options else options[0]
+            videos.append({
+                '_type': 'url',
+                'url': f'{EducastIE._API_BASE}/vod/clips/{video_id}/{option}',
+                'ie_key': ie_key,
+                'id': video_id,
+            })
+        return videos
+
+    def _extract_videos(self, url, channel_id, webpage=None):
+        """Collect the channel's clips by walking ``?page=N`` of ``url``.
+
+        Falls back to the links found in the already downloaded ``webpage``
+        when pagination yields nothing or fails.
+        """
+        # NOTE(review): the collection is capped at --max-downloads, which
+        # counts completed downloads rather than playlist entries - confirm
+        # this is the intended limit.
+        max_downloads = self.get_param('max_downloads')
+
+        def get_page(page):
+            # Keep whatever query the URL already has and set/replace "page"
+            url_parts = list(urllib.parse.urlparse(url))
+            query = urllib.parse.parse_qs(url_parts[4])
+            query['page'] = [str(page)]
+            url_parts[4] = urllib.parse.urlencode(query, doseq=True)
+            page_url = urllib.parse.urlunparse(url_parts)
+
+            return self._download_webpage(page_url, channel_id, note=f'Downloading page {page}', fatal=False)
+
+        def parse_func(page_result):
+            return self._extract_video_links_from_html(page_result, EducastIE.ie_key())
+
+        try:
+            videos = EducastIE._paginate_and_collect(get_page, parse_func, max_videos=max_downloads)
+        except Exception as e:
+            # Best effort: report the problem instead of hiding it, then
+            # fall back to the first page below
+            self.report_warning(f'Unable to paginate channel {channel_id}: {e}')
+            videos = []
+        # Fallback: parse HTML for video links
+        return videos or self._extract_video_links_from_html(webpage, EducastIE.ie_key())
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+        webpage = self._download_webpage(url, channel_id)
+        description = (
+            self._og_search_description(webpage, default=None)
+            or self._html_search_meta('description', webpage, default=None)
+            or self._html_search_regex(
+                r'<div[^>]+class="[^\"]*channel-description[^\"]*">([^<]+)',
+                webpage, 'description', default=None)
+        )
+        return {
+            '_type': 'playlist',
+            'id': channel_id,
+            'title': self._og_search_title(webpage, default='Unknown Channel'),
+            'description': description,
+            'entries': self._extract_videos(url, channel_id, webpage),
+        }
+
+
+class EducastResultsIE(InfoExtractor):
+    """Extractor for Educast result listings (``/results?search=...`` and friends), returned as a playlist."""
+
+    IE_NAME = 'educast:results'
+    _VALID_URL = r'https?://(?:www\.)?educast\.fccn\.pt/results\?(?P<params>(?:search|organization|category|channel)=[^#]+)'
+    _TESTS = [
+        {
+            'url': 'https://educast.fccn.pt/results?search=Sat%C3%A9lite',
+            'info_dict': {
+                'id': 'search=Sat%C3%A9lite',
+                'title': 'Results for search=Satélite',
+            },
+            'playlist_mincount': 1,
+            'params': {'max_downloads': 3},
+        },
+        {
+            'url': 'https://educast.fccn.pt/results?organization=fccn.pt',
+            'info_dict': {
+                'id': 'organization=fccn.pt',
+                'title': 'Results for organization=fccn.pt',
+            },
+            'playlist_mincount': 1,
+            'params': {'max_downloads': 3},
+        },
+        {
+            'url': 'https://educast.fccn.pt/results?category=Technology%20&%20Applied%20sciences',
+            'info_dict': {
+                'id': 'category=Technology%20&%20Applied%20sciences',
+                'title': 'Results for category=Technology%20&%20Applied%20sciences',
+            },
+            'playlist_mincount': 1,
+            'params': {'max_downloads': 3},
+        },
+        {
+            'url': 'https://educast.fccn.pt/results?channel=16mfovn0pt',
+            'info_dict': {
+                'id': 'channel=16mfovn0pt',
+                'title': 'Results for channel=16mfovn0pt',
+            },
+            'playlist_mincount': 1,
+            'params': {'max_downloads': 3},
+        },
+    ]
+
+    def _extract_video_links_from_html(self, webpage, ie_key):
+        """Return one ``url`` entry per clip whose ``streaming.html`` is linked from ``webpage``.
+
+        Duplicate ids are dropped; entries keep first-appearance order.
+        """
+        # dict.fromkeys de-duplicates while preserving insertion order
+        video_ids = dict.fromkeys(
+            m.group(1) for m in re.finditer(
+                r'/vod/clips/([a-zA-Z0-9]+)/streaming\.html', webpage or '', re.IGNORECASE))
+        return [{
+            '_type': 'url',
+            'url': f'{EducastIE._API_BASE}/vod/clips/{video_id}/streaming.html',
+            'ie_key': ie_key,
+            'id': video_id,
+        } for video_id in video_ids]
+
+    def _extract_videos(self, params, webpage=None):
+        """Collect result entries by walking ``page=N`` of the results query.
+
+        ``params`` is the raw query string of the results URL. When the
+        pagination yields nothing, the links found in the already downloaded
+        ``webpage`` (if any) are returned instead.
+        """
+        # NOTE(review): the collection is capped at --max-downloads, which
+        # counts completed downloads rather than playlist entries - confirm
+        # this is the intended limit.
+        max_downloads = self.get_param('max_downloads')
+
+        def get_page(page):
+            base_url = f'{EducastIE._API_BASE}/results?{params}'
+            url_parts = list(urllib.parse.urlparse(base_url))
+            query = urllib.parse.parse_qs(url_parts[4])
+            query['page'] = [str(page)]
+            url_parts[4] = urllib.parse.urlencode(query, doseq=True)
+            page_url = urllib.parse.urlunparse(url_parts)
+            return self._download_webpage(page_url, params, note=f'Downloading results page {page}', fatal=False)
+
+        def parse_func(page_result):
+            return self._extract_video_links_from_html(page_result, EducastIE.ie_key())
+
+        videos = EducastIE._paginate_and_collect(get_page, parse_func, max_videos=max_downloads)
+        # Fallback: parse the initially requested page for video links
+        return videos or self._extract_video_links_from_html(webpage, EducastIE.ie_key())
+
+    def _real_extract(self, url):
+        params = self._match_valid_url(url).group('params')
+        params_decoded = urllib.parse.unquote(params)
+        webpage = self._download_webpage(url, params)
+        return {
+            '_type': 'playlist',
+            'id': params,
+            'title': f'Results for {params_decoded}',
+            'entries': self._extract_videos(params, webpage),
+        }