[ie/Sejm] Fix after provider switch (#9669)

2025-06-27 17:08:32 +00:00 · 2025-06-04 15:21:26 +02:00 · 2025-06-04 15:21:26 +02:00 · bf05fb45fb
commit bf05fb45fb
parent 4e7c1ea346
2 changed files with 66 additions and 48 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@ -1849,7 +1849,7 @@
    SCTEIE,
    SCTECourseIE,
 )
-from .sejmpl import SejmIE
+from .sejmpl import SejmBlueonlineIE, SejmIE
 from .sen import SenIE
 from .senalcolombia import SenalColombiaLiveIE
 from .senategov import (
--- a/yt_dlp/extractor/sejmpl.py
+++ b/yt_dlp/extractor/sejmpl.py
@ -1,15 +1,15 @@
 import datetime as dt
+import urllib.parse

 from .common import InfoExtractor
-from .redge import RedCDNLivxIE
 from ..utils import (
    clean_html,
    join_nonempty,
    js_to_json,
    strip_or_none,
+    try_get,
    update_url_query,
 )
-from ..utils.traversal import traverse_obj


 def is_dst(date):
@ -30,7 +30,7 @@ class SejmIE(InfoExtractor):
    _VALID_URL = (
        r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp(?:\?[^#]*)?#(?P<id>[\dA-F]+)',
        r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp\?(?:[^#]+&)?unid=(?P<id>[\dA-F]+)',
-        r'https?://sejm-embed\.redcdn\.pl/[Ss]ejm(?P<term>\d+)\.nsf/VideoFrame\.xsp/(?P<id>[\dA-F]+)',
+        r'https?://(?:(?:www\.)?sejm\.gov\.pl|sejm-embed\.redcdn\.pl)/[Ss]ejm(?P<term>\d+)\.nsf/VideoFrame\.xsp/(?P<id>[\dA-F]+)',
    )
    IE_NAME = 'sejm'

@ -46,7 +46,7 @@ class SejmIE(InfoExtractor):
        },
        'playlist': [{
            'info_dict': {
-                'id': 'ENC01-722340000000-722360145000',
+                'id': 'ENC01-6181EF1AD9CEEBB5C1258A6D006452B5-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - ENC01',
@ -54,7 +54,7 @@ class SejmIE(InfoExtractor):
            },
        }, {
            'info_dict': {
-                'id': 'ENC30-722340000000-722360145000',
+                'id': 'ENC30-6181EF1AD9CEEBB5C1258A6D006452B5-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - ENC30',
@ -62,7 +62,7 @@ class SejmIE(InfoExtractor):
            },
        }, {
            'info_dict': {
-                'id': 'ENC31-722340000000-722360145000',
+                'id': 'ENC31-6181EF1AD9CEEBB5C1258A6D006452B5-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - ENC31',
@ -70,7 +70,7 @@ class SejmIE(InfoExtractor):
            },
        }, {
            'info_dict': {
-                'id': 'ENC32-722340000000-722360145000',
+                'id': 'ENC32-6181EF1AD9CEEBB5C1258A6D006452B5-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - ENC32',
@ -79,7 +79,7 @@ class SejmIE(InfoExtractor):
        }, {
            # sign lang interpreter
            'info_dict': {
-                'id': 'Migacz-ENC01-1-722340000000-722360145000',
+                'id': 'Migacz-ENC01-6181EF1AD9CEEBB5C1258A6D006452B5-722340000000-722360145000',
                'ext': 'mp4',
                'duration': 20145,
                'title': '1. posiedzenie Sejmu X kadencji - Migacz-ENC01',
@ -98,7 +98,7 @@ class SejmIE(InfoExtractor):
        },
        'playlist': [{
            'info_dict': {
-                'id': 'ENC08-1-503831270000-503840040000',
+                'id': 'ENC08-9377A9D65518E9A5C125808E002E9FF2-503831270000-503840040000',
                'ext': 'mp4',
                'duration': 8770,
                'title': 'Debata "Lepsza Polska: obywatelska" - ENC08',
@ -130,11 +130,15 @@ class SejmIE(InfoExtractor):
    }]

    def _real_extract(self, url):
+        # API (publicly documented) provides some metadata, and starting at 10th term, m3u8 URLs. Before then it's broken.
+        # Frame provides timeframe and cameras available (including SLI; except for 7th term, where it provides a URL),
+        # but is missing other necessary metadata (live_status, title).
+        # Transmisje_arch JSON provides useful metadata (only place with live_status!), but not URLs/cameras.
        term, video_id = self._match_valid_url(url).group('term', 'id')
        frame = self._download_webpage(
-            f'https://sejm-embed.redcdn.pl/Sejm{term}.nsf/VideoFrame.xsp/{video_id}',
+            f'https://www.sejm.gov.pl/Sejm{term}.nsf/VideoFrame.xsp/{video_id}',
            video_id)
-        # despite it says "transmisje_arch", it works for live streams too!
+        player_config = self._search_json(r'var\splayerConfig\s*=\s*', frame, 'player config', video_id, transform_source=js_to_json)
        data = self._download_json(
            f'https://www.sejm.gov.pl/Sejm{term}.nsf/transmisje_arch.xsp/json/{video_id}',
            video_id)
@ -162,49 +166,35 @@ def _real_extract(self, url):

        entries = []

-        def add_entry(file, legacy_file=False):
-            if not file:
-                return
-            file = self._proto_relative_url(file)
-            if not legacy_file:
-                file = update_url_query(file, {'startTime': start_time})
-                if stop_time is not None:
-                    file = update_url_query(file, {'stopTime': stop_time})
-                stream_id = self._search_regex(r'/o2/sejm/([^/]+)/[^./]+\.livx', file, 'stream id')
-            common_info = {
-                'url': file,
-                'duration': duration,
-            }
-            if legacy_file:
+        def add_entry(camera):
+            if player_config.get('isMP4'):
+                # Special case in 7th term. Instead of a camera name, this is a URL to a simple MP4 file.
                entries.append({
-                    **common_info,
+                    'url': self._proto_relative_url(camera),
                    'id': video_id,
                    'title': title,
+                    'duration': duration,
+                    'live_status': live_status,
                })
-            else:
-                entries.append({
-                    **common_info,
-                    '_type': 'url_transparent',
-                    'ie_key': RedCDNLivxIE.ie_key(),
-                    'id': stream_id,
-                    'title': join_nonempty(title, stream_id, delim=' - '),
-                })
+                return
+            url = f'https://sejm.c.blueonline.tv/stream/{camera}/{video_id}/manifest.mpd?start={start_time}'
+            if stop_time is not None:
+                url = update_url_query(url, {'stop': stop_time})
+            entries.append({
+                '_type': 'url_transparent',
+                'url': url,
+                'ie_key': SejmBlueonlineIE.ie_key(),
+                'id': camera,
+                'duration': duration,
+                'title': join_nonempty(title, camera, delim=' - '),
+                'live_status': live_status,
+            })

-        cameras = self._search_json(
-            r'var\s+cameras\s*=', frame, 'camera list', video_id,
-            contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json,
-            fatal=False) or []
-        for camera_file in traverse_obj(cameras, (..., 'file', {dict})):
-            if camera_file.get('flv'):
-                add_entry(camera_file['flv'])
-            elif camera_file.get('mp4'):
-                # this is only a thing in 7th term. no streams before, and starting 8th it's redcdn livx
-                add_entry(camera_file['mp4'], legacy_file=True)
-            else:
-                self.report_warning('Unknown camera stream type found')
+        for camera in player_config['cameras']:
+            add_entry(camera)

        if params.get('mig'):
-            add_entry(self._search_regex(r"var sliUrl\s*=\s*'([^']+)'", frame, 'sign language interpreter url', fatal=False))
+            add_entry(player_config['sli'])

        return {
            '_type': 'playlist',
@ -216,3 +206,31 @@ def add_entry(file, legacy_file=False):
            'live_status': live_status,
            'location': strip_or_none(data.get('location')),
        }
+
+
+class SejmBlueonlineIE(InfoExtractor):
+    IE_DESC = False
+    _VALID_URL = r'https?://sejm\.c\.blueonline\.tv//?stream/(?P<camera>[\dA-Za-z-]+)/(?P<id>[\dA-F]+)/(?:playlist.m3u8|manifest.mpd)\?'
+    _TESTS = [{
+        'url': 'https://sejm.c.blueonline.tv/stream/Migacz-ENC01/6181EF1AD9CEEBB5C1258A6D006452B5/manifest.mpd?start=722340000000&stop=722360145000',
+        'info_dict': {
+            'id': 'Migacz-ENC01-6181EF1AD9CEEBB5C1258A6D006452B5-722340000000-722360145000',
+            'ext': 'mp4',
+            'title': '_',
+        },
+    }]
+
+    def _real_extract(self, url):
+        camera, video_id = self._match_valid_url(url).group('camera', 'id')
+        qs = urllib.parse.urlparse(url).query
+        query = urllib.parse.parse_qs(qs)
+        start_time = try_get(query, lambda q: q['start'][0])
+        stop_time = try_get(query, lambda q: q['stop'][0])
+        formats = []
+        formats.extend(self._extract_m3u8_formats(f'https://sejm.c.blueonline.tv/stream/{camera}/{video_id}/playlist.m3u8?{qs}', video_id, live=stop_time is None))
+        formats.extend(self._extract_mpd_formats(f'https://sejm.c.blueonline.tv/stream/{camera}/{video_id}/manifest.mpd?{qs}', video_id))
+        return {
+            'id': join_nonempty(camera, video_id, start_time, stop_time),
+            'title': '_',
+            'formats': formats,
+        }