Merge 9316f5c642 into f2919bd28e

2025-08-13 16:08:29 +00:00 · 2025-08-12 20:25:36 -03:00 · 2025-08-12 20:25:36 -03:00 · 8758e88c85
commit 8758e88c85
parent f2919bd28e 9316f5c642
1 changed files with 122 additions and 57 deletions
--- a/yt_dlp/extractor/tviplayer.py
+++ b/yt_dlp/extractor/tviplayer.py
@ -1,78 +1,143 @@
+import json
+import re
+
 from .common import InfoExtractor
-from ..utils import traverse_obj
+from ..utils import ExtractorError, js_to_json, traverse_obj


 class TVIPlayerIE(InfoExtractor):
-    _VALID_URL = r'https?://tviplayer\.iol\.pt(/programa/[\w-]+/[a-f0-9]+)?/\w+/(?P<id>\w+)'
+    _VALID_URL = r'https?://tviplayer\.iol\.pt/(?:programa/[^/]+/[0-9a-f]+/(?:video|episodio)|video|episodio|[^/]+/[^/]+|[^/]+)/(?P<id>[0-9A-Za-z]+)(?:[/?#]|$)'
    _TESTS = [{
-        'url': 'https://tviplayer.iol.pt/programa/jornal-das-8/53c6b3903004dc006243d0cf/video/61c8e8b90cf2c7ea0f0f71a9',
+        'url': 'https://tviplayer.iol.pt/programa/a-protegida/67a63479d34ef72ee441fa79/episodio/t1e120',
        'info_dict': {
-            'id': '61c8e8b90cf2c7ea0f0f71a9',
+            'id': '689683000cf20ac1d5f35341',
            'ext': 'mp4',
-            'duration': 4167,
-            'title': 'Jornal das 8 - 26 de dezembro de 2021',
-            'thumbnail': 'https://www.iol.pt/multimedia/oratvi/multimedia/imagem/id/61c8ee630cf2cc58e7d98d9f/',
-            'season_number': 8,
-            'season': 'Season 8',
-        },
-    }, {
-        'url': 'https://tviplayer.iol.pt/programa/isabel/62b471090cf26256cd2a8594/video/62be445f0cf2ea4f0a5218e5',
-        'info_dict': {
-            'id': '62be445f0cf2ea4f0a5218e5',
-            'ext': 'mp4',
-            'duration': 3255,
-            'season': 'Season 1',
-            'title': 'Isabel - Episódio 1',
-            'thumbnail': 'https://www.iol.pt/multimedia/oratvi/multimedia/imagem/id/62beac200cf2f9a86eab856b/',
-            'season_number': 1,
-        },
-    }, {
-        # no /programa/
-        'url': 'https://tviplayer.iol.pt/video/62c4131c0cf2f9a86eac06bb',
-        'info_dict': {
-            'id': '62c4131c0cf2f9a86eac06bb',
-            'ext': 'mp4',
-            'title': 'David e Mickael Carreira respondem: «Qual é o próximo a ser pai?»',
-            'thumbnail': 'https://www.iol.pt/multimedia/oratvi/multimedia/imagem/id/62c416490cf2ea367d4433fd/',
-            'season': 'Season 2',
-            'duration': 148,
-            'season_number': 2,
-        },
-    }, {
-        # episodio url
-        'url': 'https://tviplayer.iol.pt/programa/para-sempre/61716c360cf2365a5ed894c4/episodio/t1e187',
-        'info_dict': {
-            'id': 't1e187',
-            'ext': 'mp4',
-            'season': 'Season 1',
-            'title': 'Quem denunciou Pedro?',
-            'thumbnail': 'https://www.iol.pt/multimedia/oratvi/multimedia/imagem/id/62eda30b0cf2ea367d48973b/',
-            'duration': 1250,
+            'duration': 1593,
+            'title': 'A Protegida - Clarice descobre o que une Óscar a Gonçalo e Mónica',
+            'thumbnail': 'https://img.iol.pt/image/id/68971037d34ef72ee44941a6/',
            'season_number': 1,
        },
    }]

    def _real_initialize(self):
-        self.wms_auth_sign_token = self._download_webpage(
-            'https://services.iol.pt/matrix?userId=', 'wmsAuthSign',
-            note='Trying to get wmsAuthSign token')
+        """Fetch the optional wmsAuthSign token that is later appended to the stream URL."""
+        # fatal=False: a failed request only emits a warning and returns False
+        # (normalised to None here), so extraction carries on without the token
+        # while errors unrelated to the download are no longer silently dropped.
+        self.wms_auth_sign_token = self._download_webpage(
+            'https://services.iol.pt/matrix?userId=', 'wmsAuthSign',
+            note='Downloading wmsAuthSign token', fatal=False) or None
+
+    def _extract_enclosing_js_object(self, webpage, keyword):
+        """
+        Return the innermost balanced-brace JS object (braces included) that
+        encloses the first occurrence of keyword, or None if there is none.
+        Braces inside string literals are not treated specially.
+        """
+        pos = webpage.find(keyword)
+        depth = 0
+        # Walk back to the unmatched '{' so that closed siblings before the
+        # keyword, e.g. the inner object in {a: {b: 1}, videoUrl: ...}, are skipped
+        for start in range(pos - 1, -1, -1):
+            depth += {'}': 1, '{': -1}.get(webpage[start], 0)
+            if depth < 0:
+                break
+        else:  # keyword absent (empty range) or no enclosing brace
+            return None
+        depth = 0
+        # Walk forward from that brace to the '}' that balances it
+        for end in range(start, len(webpage)):
+            depth += {'{': 1, '}': -1}.get(webpage[end], 0)
+            if depth == 0:
+                return webpage[start:end + 1]
+        return None

    def _real_extract(self, url):
+        """Locate the player data in the page and return the info dict with its HLS formats."""
        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(url, video_id or 'tviplayer')

-        json_data = self._search_json(
-            r'<script>\s*jsonData\s*=', webpage, 'json_data', video_id)
+        def video_dict(data):
+            # the payload is either the video object itself or wraps it as video: [...]
+            wrapped = traverse_obj(data, ('video', 0)) or traverse_obj(data, 'video')
+            return next((c for c in (wrapped, data) if c and isinstance(c, dict)), None)
+
+        def stream_url(info):
+            return info.get('videoUrl') or info.get('url') or info.get('video_url')
+
+        def parse_js(text):
+            # probing only: a block that does not parse is simply not a candidate
+            try:
+                return self._parse_json(text, video_id, transform_source=js_to_json)
+            except ExtractorError:
+                return None
+
+        def candidates():
+            # 1) a literal "const opts = { ... };" block
+            m_opts = re.search(r'const\s+opts\s*=\s*({.*?})\s*;', webpage, flags=re.S)
+            if m_opts:
+                yield parse_js(m_opts.group(1))
+            # 2) any JS object that contains "videoUrl"
+            obj_text = self._extract_enclosing_js_object(webpage, 'videoUrl')
+            if obj_text:
+                parsed = parse_js(obj_text)
+                if parsed is None:
+                    # fallback: try to json.loads after small cleanup
+                    try:
+                        cleaned = re.sub(r',\s*([}\]])', r'\1', obj_text).replace("'", '"')
+                        parsed = json.loads(cleaned)
+                    except ValueError:
+                        parsed = None
+                yield parsed
+            # 3) legacy fallback: jsonData = {...}
+            try:
+                legacy = self._search_json(r'jsonData\s*=', webpage, 'json data', video_id)
+            except ExtractorError:
+                legacy = None
+            yield legacy
+            # 4) last resort: a bare "videoUrl" key anywhere in the page
+            m = re.search(r'["\']videoUrl["\']\s*:\s*["\'](https?://[^"\']+)["\']', webpage, flags=re.S)
+            if m:
+                yield {'id': video_id, 'videoUrl': m.group(1)}
+
+        # Take the first source that carries a stream URL; a source without one
+        # must not stop the remaining fallbacks from being tried.
+        video_info = None
+        for data in candidates():
+            info = video_dict(data)
+            if info and stream_url(info):
+                video_info = info
+                break
+            video_info = video_info or info
+
+        if not video_info:
+            raise ExtractorError('Unable to locate video data in webpage', expected=True)
+        video_url = stream_url(video_info)
+        if not video_url:
+            raise ExtractorError('No video URL found in the page data', expected=True)
+
+        vid = video_info.get('id') or video_id
+        title = video_info.get('title') or self._og_search_title(webpage)
+        thumbnail = video_info.get('cover') or video_info.get('thumbnail') or self._og_search_thumbnail(webpage)
+        # duration may be an int, a float or a numeric string; anything else is dropped
+        try:
+            duration = int(float(video_info.get('duration')))
+        except (TypeError, ValueError, OverflowError):
+            duration = None
+
+        # append token if we have it
+        if self.wms_auth_sign_token:
+            sep = '&' if '?' in video_url else '?'
+            video_url = f'{video_url}{sep}wmsAuthSign={self.wms_auth_sign_token}'
+
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, vid, ext='mp4')

-        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
-            f'{json_data["videoUrl"]}?wmsAuthSign={self.wms_auth_sign_token}',
-            video_id, ext='mp4')
        return {
-            'id': video_id,
-            'title': json_data.get('title') or self._og_search_title(webpage),
-            'thumbnail': json_data.get('cover') or self._og_search_thumbnail(webpage),
-            'duration': json_data.get('duration'),
+            'id': vid,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
-            'season_number': traverse_obj(json_data, ('program', 'seasonNum')),
+            'season_number': traverse_obj(video_info, ('program', 'seasonNum')),
        }