[ie/educast] Address PR review comments

Removed local imports and used module-level ones Switched to traverse_obj-based metadata unpacking Avoided overwriting resolution in formats; removed duration from format dicts Removed redundant max_downloads logic (handled by core) Replaced to_screen with report_warning for error in data_json Added expected_warnings for the third EducastIE test Handled missing content-disposition header Co-authored-by: Filipe Resendes <filipe.resendes@tecnico.ulisboa.pt>
2026-02-08 06:57:27 +00:00 · 2025-06-03 23:56:57 +01:00
parent b7d54b33e9
commit 4978e987a1
1 changed files with 162 additions and 260 deletions
--- a/yt_dlp/extractor/educast.py
+++ b/yt_dlp/extractor/educast.py
@@ -1,146 +1,94 @@
-from .common import InfoExtractor
+import re
+from urllib import parse
+
+from .common import (
+    ExtractorError,
+    InfoExtractor,
+)
 from ..networking import HEADRequest
 from ..utils import (
-    float_or_none,
-    int_or_none,
    mimetype2ext,
-    str_or_none,
    traverse_obj,
    unified_timestamp,
 )


-class EducastIE(InfoExtractor):
-    _VALID_URL = r'https?://(www)?educast\.fccn\.pt/vod/clips/(?P<id>[a-zA-Z0-9]+)'
+class EducastBaseIE(InfoExtractor):
    _API_BASE = 'https://educast.fccn.pt'

-    _TESTS = [
-        {
-            'note': 'test for public Educast video downloading the merged format',
-            'url': 'https://educast.fccn.pt/vod/clips/2o06o2c6hm/streaming.html',
-            'md5': '264b3e2f0c6c5d3c8e1a86e57f21d0bc',
-            'info_dict': {
-                'id': '2o06o2c6hm',
-                'ext': 'mp4',
-                'title': 'Fundamentos de Bases de Dados',
-                'alt_title': '',
-                'description': '',
-                'uploader': 'Professor Luís Cavique',
-                'channel': 'UAB - Fundamentos de Base de dados',
-                'channel_url': 'https://educast.fccn.pt/results?channel=k06h42n0w',
-                'thumbnail': 'https://educast.fccn.pt/img/clips/2o06o2c6hm/delivery/cover',
-                'categories': ['Tecnologia e Ciências Aplicadas', 'FCCN'],
-                'timestamp': 1410946740,
-                'upload_date': '20140917',
-                'license': 'http://creativecommons.org/licenses/by-nc-nd/2.5/pt/',
-                'formats': [
-                    {
-                        'format_id': 'presenter-0',
-                        'ext': 'm4a',
-                        'vcodec': 'none',
-                        'acodec': 'mp4a.40.2',
-                        'protocol': 'http_dash_segments',
-                    },
-                    {
-                        'format_id': 'presenter-1',
-                        'ext': 'mp4',
-                        'vcodec': 'avc1.77.40',
-                        'acodec': 'mp4a.40.2',
-                        'protocol': 'm3u8_native',
-                    },
-                    {
-                        'format_id': 'presenter-2',
-                        'ext': 'mp4',
-                        'vcodec': 'avc1.4d4028',
-                        'acodec': 'none',
-                        'protocol': 'http_dash_segments',
-                        'fps': 25,
-                    },
-                    {
-                        'format_id': 'presentation-0',
-                        'ext': 'mp4',
-                        'vcodec': 'avc1.77.40',
-                        'acodec': 'none',
-                        'protocol': 'm3u8_native',
-                    },
-                    {
-                        'format_id': 'presentation-1',
-                        'ext': 'mp4',
-                        'vcodec': 'avc1.4d4028',
-                        'acodec': 'none',
-                        'protocol': 'http_dash_segments',
-                        'fps': 25,
-                    },
-                    {
-                        'format_id': 'merged',
-                        'ext': 'mp4',
-                        'protocol': 'https',
-                        'format_note': 'single stream, may be lower res',
-                    },
-                ],
-            },
+    @staticmethod
+    def _paginate_and_collect(get_page_func, parse_func):
+        videos = []
+        page = 1
+        while True:
+            webpage = get_page_func(page)
+            if not webpage:
+                break
+            new_videos = parse_func(webpage)
+            found = False
+            for v in new_videos:
+                if not any(existing['id'] == v['id'] for existing in videos):
+                    videos.append(v)
+                    found = True
+            if not found:
+                break
+            page += 1
+        return videos
+
+
+class EducastIE(EducastBaseIE):
+    _VALID_URL = r'https?://(www)?educast\.fccn\.pt/vod/clips/(?P<id>[a-zA-Z0-9]+)'
+    _TESTS = [{
+        'note': 'test for public Educast video downloading the merged format',
+        'url': 'https://educast.fccn.pt/vod/clips/2o06o2c6hm/streaming.html',
+        'md5': '264b3e2f0c6c5d3c8e1a86e57f21d0bc',
+        'info_dict': {
+            'id': '2o06o2c6hm',
+            'ext': 'mp4',
+            'title': 'Fundamentos de Bases de Dados',
+            'alt_title': '',
+            'description': '',
+            'uploader': 'Professor Luís Cavique',
+            'channel': 'UAB - Fundamentos de Base de dados',
+            'channel_url': 'https://educast.fccn.pt/results?channel=k06h42n0w',
+            'thumbnail': 'https://educast.fccn.pt/img/clips/2o06o2c6hm/delivery/cover',
+            'categories': ['Tecnologia e Ciências Aplicadas', 'FCCN'],
+            'timestamp': 1410946740,
+            'upload_date': '20140917',
+            'license': 'http://creativecommons.org/licenses/by-nc-nd/2.5/pt/',
+            'duration': 1041,
        },
-        {
-            'note': 'test for private Educast video downloading the merged format',
-            'url': 'https://educast.fccn.pt/vod/clips/jhwehqk9/streaming.html',
-            'md5': '242a4a8d1a84a4c3aab93771c3da244e',
-            'info_dict': {
-                'id': 'jhwehqk9',
-                'ext': 'mp4',
-                'title': ' Exercícios 8B. Equações Diferenciais Parciais',
-                'alt_title': '',
-                'description': '',
-                'uploader': ' Rui Miguel Saramago',
-                'channel': 'Cálculo Diferencial e Integral III - Aulas de Recuperação',
-                'channel_url': 'https://educast.fccn.pt/results?channel=2fudccnyj7',
-                'thumbnail': 'https://educast.fccn.pt/img/clips/jhwehqk9/delivery/cover',
-                'categories': ['Ciências Naturais e Matemática', 'Universidade de Lisboa'],
-                'license': 'http://creativecommons.org/licenses/by/4.0/',
-                'formats': [
-                    {
-                        'format_id': 'presenter-0',
-                        'ext': 'm4a',
-                        'vcodec': 'none',
-                        'acodec': 'mp4a.40.2',
-                        'protocol': 'http_dash_segments',
-                    },
-                    {
-                        'format_id': 'presenter-1',
-                        'ext': 'mp4',
-                        'vcodec': 'avc1.77.40',
-                        'acodec': 'mp4a.40.2',
-                        'protocol': 'm3u8_native',
-                    },
-                    {
-                        'format_id': 'presenter-2',
-                        'ext': 'mp4',
-                        'vcodec': 'avc1.4d4028',
-                        'acodec': 'none',
-                        'protocol': 'http_dash_segments',
-                        'fps': 25,
-                    },
-                    {
-                        'format_id': 'merged',
-                        'ext': 'mp4',
-                        'protocol': 'https',
-                        'format_note': 'single stream, may be lower res',
-                    },
-                ],
-            },
-            'skip': 'This video is private and requires authentication to access',
+    }, {
+        'note': 'test for private Educast video downloading the merged format',
+        'url': 'https://educast.fccn.pt/vod/clips/jhwehqk9/streaming.html',
+        'md5': '242a4a8d1a84a4c3aab93771c3da244e',
+        'info_dict': {
+            'id': 'jhwehqk9',
+            'ext': 'mp4',
+            'title': ' Exercícios 8B. Equações Diferenciais Parciais',
+            'alt_title': '',
+            'description': '',
+            'uploader': ' Rui Miguel Saramago',
+            'channel': 'Cálculo Diferencial e Integral III - Aulas de Recuperação',
+            'channel_url': 'https://educast.fccn.pt/results?channel=2fudccnyj7',
+            'thumbnail': 'https://educast.fccn.pt/img/clips/jhwehqk9/delivery/cover',
+            'categories': ['Ciências Naturais e Matemática', 'Universidade de Lisboa'],
+            'license': 'http://creativecommons.org/licenses/by/4.0/',
+            'duration': 2756,
        },
-        {
-            'note': 'test for deprecated streaming url, should rely on fallback',
-            'url': 'https://educast.fccn.pt/vod/clips/2by2fw4fkx/streaming.html',
-            'md5': '88055700118db7411d1cc0da48ca1747',
-            'info_dict': {
-                'id': '2by2fw4fkx',
-                'ext': 'mp4',
-                'title': 'Teoria 3A. Sistemas de Equaces Diferenciais Lineares de Primeira Ordem_',
-            },
-            'skip': 'This video is private and requires authentication to access',
+        'skip': 'This video is private and requires authentication to access',
+    }, {
+        'note': 'test for deprecated streaming url, should rely on fallback',
+        'url': 'https://educast.fccn.pt/vod/clips/2by2fw4fkx/streaming.html',
+        'md5': '88055700118db7411d1cc0da48ca1747',
+        'info_dict': {
+            'id': '2by2fw4fkx',
+            'ext': 'mp4',
+            'title': 'Teoria 3A. Sistemas de Equaces Diferenciais Lineares de Primeira Ordem_',
        },
-    ]
+        'expected_warnings': ['Este vídeo não está preparado para HTML5'],
+        'skip': 'This video is private and requires authentication to access',
+    }]

    def parse_timestamp(self, timestamp_str):
        if isinstance(timestamp_str, str) and '.' in timestamp_str:
@@ -163,47 +111,48 @@ class EducastIE(InfoExtractor):
            formats += self._extract_m3u8_formats(hls_url, video_id, ext='mp4', entry_protocol='m3u8_native', fatal=False)

        for f in formats:
-            f['format_id'] = str_or_none(video_json.get('role'))
-            f['width'] = int_or_none(video_json.get('width'))
-            f['height'] = int_or_none(video_json.get('height'))
-            f['duration'] = float_or_none(video_json.get('duration'))
-            f['filesize_approx'] = int_or_none(float_or_none(f.get('duration')) * float_or_none(f.get('tbr')) * 1000 / 8)
+            f['format_id'] = video_json.get('role')

        return formats

    def _extract_from_json(self, video_id):
        data_json_url = f'https://educast.fccn.pt/vod/clips/{video_id}/video_player/data.json'
-        data_json = self._download_json(data_json_url, video_id, fatal=False)
-        if not data_json:
+        try:
+            data_json = self._download_json(data_json_url, video_id)
+        except ExtractorError as e:
+            self.report_warning(e)
            return None
        if data_json.get('error'):
-            self.to_screen(data_json.get('error'))
+            self.report_warning(data_json.get('error'))
            return None

        formats = []
        info = {
            'id': video_id,
-            'title': str_or_none(traverse_obj(data_json, ('clip', 'name'))),
            'formats': formats,
-            'alt_title': str_or_none(data_json.get('subtitle')),
-            'description': str_or_none(data_json.get('clipDescription')),
-            'uploader': str_or_none(data_json.get('author')),
-            'timestamp': self.parse_timestamp(data_json.get('timestamp')),
-            'thumbnail': str_or_none(data_json.get('cover')),
-            'license': str_or_none(data_json.get('licenceURL')),
-            'webpage_url': str_or_none(data_json.get('url')),
-            'channel': str_or_none(traverse_obj(data_json, ('channel', 'name'))),
-            'channel_url': str_or_none(traverse_obj(data_json, ('channel', 'url'))),
+            **traverse_obj(data_json, {
+                'title': ('clip', 'name', {str}),
+                'alt_title': ('subtitle', {str}),
+                'description': ('clipDescription', {str}),
+                'uploader': ('author', {str}),
+                'timestamp': ('timestamp', {self.parse_timestamp}),
+                'thumbnail': ('cover', {str}),
+                'license': ('licenceURL', {str}),
+                'webpage_url': ('url', {str}),
+                'channel': ('channel', 'name', {str}),
+                'channel_url': ('channel', 'url', {str}),
+                'duration': ('videos', 0, 'duration', {int}),
+            }),
            'categories': [cat for cat in (
-                str_or_none(traverse_obj(data_json, ('area', 'name'))),
-                str_or_none(traverse_obj(data_json, ('institution', 'name'))),
+                traverse_obj(data_json, ('area', 'name'), expected_type=str),
+                traverse_obj(data_json, ('institution', 'name'), expected_type=str),
            ) if cat],
        }

        for video_json in data_json.get('videos') or []:
            formats.extend(self._extract_video_formats(video_json, video_id))

-        download_url = str_or_none(data_json.get('downloadURL'))
+        download_url = data_json.get('downloadURL')
        if download_url:
            formats.append({
                'format_id': 'merged',
@@ -215,8 +164,6 @@ class EducastIE(InfoExtractor):
        return info

    def _try_fallback(self, url, video_id):
-        import re
-
        # Last resort for videos with no working streaming option
        KNOWN_BASENAMES = ['desktop.mp4', 'ipod.m4v', 'quicktime.mov']
        for basename in KNOWN_BASENAMES:
@@ -231,9 +178,11 @@ class EducastIE(InfoExtractor):
            if ext not in ('mp4', 'm4v', 'mov'):
                continue
            title = None
-            m = re.search(r'filename\s*=\s*"([^"]+)"', response.get_header('content-disposition'), re.IGNORECASE)
-            if m:
-                title = m.group(1).strip().removesuffix(f'.{ext}')
+            ext_header = response.get_header('content-disposition')
+            if ext_header:
+                m = re.search(r'filename\s*=\s*"([^"]+)"', ext_header, re.IGNORECASE)
+                if m:
+                    title = m.group(1).strip().removesuffix(f'.{ext}')
            return {
                'id': video_id,
                'title': title,
@@ -244,60 +193,33 @@ class EducastIE(InfoExtractor):
        video_id = self._match_id(url)
        return self._extract_from_json(video_id) or self._try_fallback(url, video_id)

-    @staticmethod
-    def _paginate_and_collect(get_page_func, parse_func, max_videos=None):
-        videos = []
-        page = 1
-        while True:
-            if max_videos is not None and len(videos) >= max_videos:
-                break
-            webpage = get_page_func(page)
-            if not webpage:
-                break
-            new_videos = parse_func(webpage)
-            found = False
-            for v in new_videos:
-                if not any(existing['id'] == v['id'] for existing in videos):
-                    videos.append(v)
-                    found = True
-                    if max_videos is not None and len(videos) >= max_videos:
-                        break
-            if not found or (max_videos is not None and len(videos) >= max_videos):
-                break
-            page += 1
-        return videos

-
-class EducastChannelIE(InfoExtractor):
+class EducastChannelIE(EducastBaseIE):
    IE_NAME = 'educast:channel'
    _VALID_URL = r'https?://(?:www\.)?educast\.fccn\.pt/vod/channels/(?P<id>[a-zA-Z0-9]+)/?(?:$|[?#])'
-    _TESTS = [
+    _TESTS = [{
+        'note': 'test for private Educast Channel',
+        'url': 'https://educast.fccn.pt/vod/channels/2o0eonmrak',
+        'info_dict':
        {
-            'note': 'test for private Educast Channel',
-            'url': 'https://educast.fccn.pt/vod/channels/2o0eonmrak',
-            'info_dict':
-            {
-                'id': '2o0eonmrak',
-                'title': 'Vídeos Institucionais FCT-FCCN',
-                'description': str,
-            },
-            'playlist_mincount': 26,
+            'id': '2o0eonmrak',
+            'title': 'Vídeos Institucionais FCT-FCCN',
+            'description': str,
        },
-        {
-            'note': 'test for private Educast Channel',
-            'url': 'https://educast.fccn.pt/vod/channels/2fudccnyj7',
-            'info_dict': {
-                'id': '2fudccnyj7',
-                'title': 'Cálculo Diferencial e Integral III - Aulas de Recuperação',
-                'description': str,
-            },
-            'playlist_mincount': 26,
-            'skip': 'This channel is private and requires authentication to access',
+        'playlist_mincount': 26,
+    }, {
+        'note': 'test for private Educast Channel',
+        'url': 'https://educast.fccn.pt/vod/channels/2fudccnyj7',
+        'info_dict': {
+            'id': '2fudccnyj7',
+            'title': 'Cálculo Diferencial e Integral III - Aulas de Recuperação',
+            'description': str,
        },
-    ]
+        'playlist_mincount': 26,
+        'skip': 'This channel is private and requires authentication to access',
+    }]

    def _extract_video_links_from_html(self, webpage, ie_key):
-        import re
        videos_by_id = {}
        pattern = r'href="https://educast\.fccn\.pt/vod/clips/(?P<id>[a-zA-Z0-9]+)/(?P<option>[^?"/]+)'
        for m in re.finditer(pattern, webpage or '', re.IGNORECASE):
@@ -321,17 +243,12 @@ class EducastChannelIE(InfoExtractor):
        return videos

    def _extract_videos(self, url, channel_id, webpage=None):
-        max_downloads = None
-        if hasattr(self, '_downloader') and self._downloader:
-            max_downloads = self._downloader.params.get('max_downloads')
-
        def get_page(page):
-            import urllib.parse
-            url_parts = list(urllib.parse.urlparse(url))
-            query = urllib.parse.parse_qs(url_parts[4])
+            url_parts = list(parse.urlparse(url))
+            query = parse.parse_qs(url_parts[4])
            query['page'] = [str(page)]
-            url_parts[4] = urllib.parse.urlencode(query, doseq=True)
-            page_url = urllib.parse.urlunparse(url_parts)
+            url_parts[4] = parse.urlencode(query, doseq=True)
+            page_url = parse.urlunparse(url_parts)

            return self._download_webpage(page_url, channel_id, note=f'Downloading page {page}', fatal=False)

@@ -339,7 +256,7 @@ class EducastChannelIE(InfoExtractor):
            return self._extract_video_links_from_html(page_result, EducastIE.ie_key())

        try:
-            videos = EducastIE._paginate_and_collect(get_page, parse_func, max_videos=max_downloads)
+            videos = EducastIE._paginate_and_collect(get_page, parse_func)
            if videos:
                return videos
        except Exception:
@@ -366,50 +283,41 @@ class EducastChannelIE(InfoExtractor):
        }


-class EducastResultsIE(InfoExtractor):
+class EducastResultsIE(EducastBaseIE):
    IE_NAME = 'educast:results'
    _VALID_URL = r'https?://(?:www\.)?educast\.fccn\.pt/results\?(?P<params>(search|organization|category|channel)=[^#]+)'
-    _TESTS = [
-        {
-            'url': 'https://educast.fccn.pt/results?search=Sat%C3%A9lite',
-            'info_dict': {
-                'id': 'search=Sat%C3%A9lite',
-                'title': 'Results for search=Satélite',
-            },
-            'playlist_mincount': 1,
-            'params': {'max_downloads': 3},
+    _TESTS = [{
+        'url': 'https://educast.fccn.pt/results?search=Sat%C3%A9lite',
+        'info_dict': {
+            'id': 'search=Sat%C3%A9lite',
+            'title': 'Results for search=Satélite',
        },
-        {
-            'url': 'https://educast.fccn.pt/results?organization=fccn.pt',
-            'info_dict': {
-                'id': 'organization=fccn.pt',
-                'title': 'Results for organization=fccn.pt',
-            },
-            'playlist_mincount': 1,
-            'params': {'max_downloads': 3},
+        'playlist_mincount': 1,
+        'params': {'max_downloads': 3},
+    }, {
+        'url': 'https://educast.fccn.pt/results?organization=fccn.pt',
+        'info_dict': {
+            'id': 'organization=fccn.pt',
+            'title': 'Results for organization=fccn.pt',
        },
-        {
-            'url': 'https://educast.fccn.pt/results?category=Technology%20&%20Applied%20sciences',
-            'info_dict': {
-                'id': 'category=Technology%20&%20Applied%20sciences',
-                'title': 'Results for category=Technology%20&%20Applied%20sciences',
-            },
-            'playlist_mincount': 1,
-            'params': {'max_downloads': 3},
+        'playlist_mincount': 1,
+    }, {
+        'url': 'https://educast.fccn.pt/results?category=Technology%20&%20Applied%20sciences',
+        'info_dict': {
+            'id': 'category=Technology%20&%20Applied%20sciences',
+            'title': 'Results for category=Technology%20&%20Applied%20sciences',
        },
-        {
-            'url': 'https://educast.fccn.pt/results?channel=16mfovn0pt',
-            'info_dict': {
-                'id': 'channel=16mfovn0pt',
-                'title': 'Results for channel=16mfovn0pt',
-            },
-            'playlist_mincount': 1,
-            'params': {'max_downloads': 3},
+        'playlist_mincount': 1,
+    }, {
+        'url': 'https://educast.fccn.pt/results?channel=16mfovn0pt',
+        'info_dict': {
+            'id': 'channel=16mfovn0pt',
+            'title': 'Results for channel=16mfovn0pt',
        },
-    ]
+        'playlist_mincount': 1,
+    }]

    def _extract_video_links_from_html(self, webpage, ie_key):
-        import re
        videos = []
        for m in re.finditer(r'/vod/clips/([a-zA-Z0-9]+)/streaming.html', webpage or '', re.IGNORECASE):
            video_id = m.group(1)
@@ -424,28 +332,22 @@ class EducastResultsIE(InfoExtractor):
        return videos

    def _extract_videos(self, params, webpage=None):
-        import urllib.parse
-        max_downloads = None
-        if hasattr(self, '_downloader') and self._downloader:
-            max_downloads = self._downloader.params.get('max_downloads')
-
        def get_page(page):
            base_url = f'{EducastIE._API_BASE}/results?{params}'
-            url_parts = list(urllib.parse.urlparse(base_url))
-            query = urllib.parse.parse_qs(url_parts[4])
+            url_parts = list(parse.urlparse(base_url))
+            query = parse.parse_qs(url_parts[4])
            query['page'] = [str(page)]
-            url_parts[4] = urllib.parse.urlencode(query, doseq=True)
-            page_url = urllib.parse.urlunparse(url_parts)
+            url_parts[4] = parse.urlencode(query, doseq=True)
+            page_url = parse.urlunparse(url_parts)
            return self._download_webpage(page_url, params, note=f'Downloading results page {page}', fatal=False)

        def parse_func(webpage):
            return self._extract_video_links_from_html(webpage, EducastIE.ie_key())
-        return EducastIE._paginate_and_collect(get_page, parse_func, max_videos=max_downloads)
+        return EducastIE._paginate_and_collect(get_page, parse_func)

    def _real_extract(self, url):
-        import urllib.parse
        params = self._match_valid_url(url).group('params')
-        params_decoded = urllib.parse.unquote(params)
+        params_decoded = parse.unquote(params)
        webpage = self._download_webpage(url, params)
        return {
            '_type': 'playlist',