diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 34c98b537..535d687d9 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1849,7 +1849,7 @@ SCTEIE, SCTECourseIE, ) -from .sejmpl import SejmIE +from .sejmpl import SejmBlueonlineIE, SejmIE from .sen import SenIE from .senalcolombia import SenalColombiaLiveIE from .senategov import ( diff --git a/yt_dlp/extractor/sejmpl.py b/yt_dlp/extractor/sejmpl.py index eb433d2ac..24ed0ce1b 100644 --- a/yt_dlp/extractor/sejmpl.py +++ b/yt_dlp/extractor/sejmpl.py @@ -1,15 +1,15 @@ import datetime as dt +import urllib.parse from .common import InfoExtractor -from .redge import RedCDNLivxIE from ..utils import ( clean_html, join_nonempty, js_to_json, strip_or_none, + try_get, update_url_query, ) -from ..utils.traversal import traverse_obj def is_dst(date): @@ -30,7 +30,7 @@ class SejmIE(InfoExtractor): _VALID_URL = ( r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P\d+)\.nsf/transmisje(?:_arch)?\.xsp(?:\?[^#]*)?#(?P[\dA-F]+)', r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P\d+)\.nsf/transmisje(?:_arch)?\.xsp\?(?:[^#]+&)?unid=(?P[\dA-F]+)', - r'https?://sejm-embed\.redcdn\.pl/[Ss]ejm(?P\d+)\.nsf/VideoFrame\.xsp/(?P[\dA-F]+)', + r'https?://(?:(?:www\.)?sejm\.gov\.pl|sejm-embed\.redcdn\.pl)/[Ss]ejm(?P\d+)\.nsf/VideoFrame\.xsp/(?P[\dA-F]+)', ) IE_NAME = 'sejm' @@ -46,7 +46,7 @@ class SejmIE(InfoExtractor): }, 'playlist': [{ 'info_dict': { - 'id': 'ENC01-722340000000-722360145000', + 'id': 'ENC01-6181EF1AD9CEEBB5C1258A6D006452B5-722340000000-722360145000', 'ext': 'mp4', 'duration': 20145, 'title': '1. posiedzenie Sejmu X kadencji - ENC01', @@ -54,7 +54,7 @@ class SejmIE(InfoExtractor): }, }, { 'info_dict': { - 'id': 'ENC30-722340000000-722360145000', + 'id': 'ENC30-6181EF1AD9CEEBB5C1258A6D006452B5-722340000000-722360145000', 'ext': 'mp4', 'duration': 20145, 'title': '1. posiedzenie Sejmu X kadencji - ENC30', @@ -62,7 +62,7 @@ class SejmIE(InfoExtractor): }, }, { 'info_dict': { - 'id': 'ENC31-722340000000-722360145000', + 'id': 'ENC31-6181EF1AD9CEEBB5C1258A6D006452B5-722340000000-722360145000', 'ext': 'mp4', 'duration': 20145, 'title': '1. posiedzenie Sejmu X kadencji - ENC31', @@ -70,7 +70,7 @@ class SejmIE(InfoExtractor): }, }, { 'info_dict': { - 'id': 'ENC32-722340000000-722360145000', + 'id': 'ENC32-6181EF1AD9CEEBB5C1258A6D006452B5-722340000000-722360145000', 'ext': 'mp4', 'duration': 20145, 'title': '1. posiedzenie Sejmu X kadencji - ENC32', @@ -79,7 +79,7 @@ class SejmIE(InfoExtractor): }, { # sign lang interpreter 'info_dict': { - 'id': 'Migacz-ENC01-1-722340000000-722360145000', + 'id': 'Migacz-ENC01-6181EF1AD9CEEBB5C1258A6D006452B5-722340000000-722360145000', 'ext': 'mp4', 'duration': 20145, 'title': '1. posiedzenie Sejmu X kadencji - Migacz-ENC01', @@ -98,7 +98,7 @@ class SejmIE(InfoExtractor): }, 'playlist': [{ 'info_dict': { - 'id': 'ENC08-1-503831270000-503840040000', + 'id': 'ENC08-9377A9D65518E9A5C125808E002E9FF2-503831270000-503840040000', 'ext': 'mp4', 'duration': 8770, 'title': 'Debata "Lepsza Polska: obywatelska" - ENC08', @@ -130,11 +130,15 @@ class SejmIE(InfoExtractor): }] def _real_extract(self, url): + # API (publicly documented) provides some metadata, and starting at 10th term, m3u8 URLs. Before then it's broken. + # Frame provides timeframe and cameras available (including SLI; except for 7th term, where it provides a URL), + # but is missing other necessary metadata (live_status, title). + # Transmisje_arch JSON provides useful metadata (only place with live_status!), but not URLs/cameras. term, video_id = self._match_valid_url(url).group('term', 'id') frame = self._download_webpage( - f'https://sejm-embed.redcdn.pl/Sejm{term}.nsf/VideoFrame.xsp/{video_id}', + f'https://www.sejm.gov.pl/Sejm{term}.nsf/VideoFrame.xsp/{video_id}', video_id) - # despite it says "transmisje_arch", it works for live streams too! + player_config = self._search_json(r'var\splayerConfig\s*=\s*', frame, 'player config', video_id, transform_source=js_to_json) data = self._download_json( f'https://www.sejm.gov.pl/Sejm{term}.nsf/transmisje_arch.xsp/json/{video_id}', video_id) @@ -162,49 +166,35 @@ def _real_extract(self, url): entries = [] - def add_entry(file, legacy_file=False): - if not file: - return - file = self._proto_relative_url(file) - if not legacy_file: - file = update_url_query(file, {'startTime': start_time}) - if stop_time is not None: - file = update_url_query(file, {'stopTime': stop_time}) - stream_id = self._search_regex(r'/o2/sejm/([^/]+)/[^./]+\.livx', file, 'stream id') - common_info = { - 'url': file, - 'duration': duration, - } - if legacy_file: + def add_entry(camera): + if player_config.get('isMP4'): + # Special case in 7th term. Instead of a camera name, this is a URL to a simple MP4 file. entries.append({ - **common_info, + 'url': self._proto_relative_url(camera), 'id': video_id, 'title': title, + 'duration': duration, + 'live_status': live_status, }) - else: - entries.append({ - **common_info, - '_type': 'url_transparent', - 'ie_key': RedCDNLivxIE.ie_key(), - 'id': stream_id, - 'title': join_nonempty(title, stream_id, delim=' - '), - }) + return + url = f'https://sejm.c.blueonline.tv/stream/{camera}/{video_id}/manifest.mpd?start={start_time}' + if stop_time is not None: + url = update_url_query(url, {'stop': stop_time}) + entries.append({ + '_type': 'url_transparent', + 'url': url, + 'ie_key': SejmBlueonlineIE.ie_key(), + 'id': camera, + 'duration': duration, + 'title': join_nonempty(title, camera, delim=' - '), + 'live_status': live_status, + }) - cameras = self._search_json( - r'var\s+cameras\s*=', frame, 'camera list', video_id, - contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json, - fatal=False) or [] - for camera_file in traverse_obj(cameras, (..., 'file', {dict})): - if camera_file.get('flv'): - add_entry(camera_file['flv']) - elif camera_file.get('mp4'): - # this is only a thing in 7th term. no streams before, and starting 8th it's redcdn livx - add_entry(camera_file['mp4'], legacy_file=True) - else: - self.report_warning('Unknown camera stream type found') + for camera in player_config['cameras']: + add_entry(camera) if params.get('mig'): - add_entry(self._search_regex(r"var sliUrl\s*=\s*'([^']+)'", frame, 'sign language interpreter url', fatal=False)) + add_entry(player_config['sli']) return { '_type': 'playlist', @@ -216,3 +206,31 @@ def add_entry(file, legacy_file=False): 'live_status': live_status, 'location': strip_or_none(data.get('location')), } + + +class SejmBlueonlineIE(InfoExtractor): + IE_DESC = False + _VALID_URL = r'https?://sejm\.c\.blueonline\.tv//?stream/(?P[\dA-Za-z-]+)/(?P[\dA-F]+)/(?:playlist.m3u8|manifest.mpd)\?' + _TESTS = [{ + 'url': 'https://sejm.c.blueonline.tv/stream/Migacz-ENC01/6181EF1AD9CEEBB5C1258A6D006452B5/manifest.mpd?start=722340000000&stop=722360145000', + 'info_dict': { + 'id': 'Migacz-ENC01-6181EF1AD9CEEBB5C1258A6D006452B5-722340000000-722360145000', + 'ext': 'mp4', + 'title': '_', + }, + }] + + def _real_extract(self, url): + camera, video_id = self._match_valid_url(url).group('camera', 'id') + qs = urllib.parse.urlparse(url).query + query = urllib.parse.parse_qs(qs) + start_time = try_get(query, lambda q: q['start'][0]) + stop_time = try_get(query, lambda q: q['stop'][0]) + formats = [] + formats.extend(self._extract_m3u8_formats(f'https://sejm.c.blueonline.tv/stream/{camera}/{video_id}/playlist.m3u8?{qs}', video_id, live=stop_time is None)) + formats.extend(self._extract_mpd_formats(f'https://sejm.c.blueonline.tv/stream/{camera}/{video_id}/manifest.mpd?{qs}', video_id)) + return { + 'id': join_nonempty(camera, video_id, start_time, stop_time), + 'title': '_', + 'formats': formats, + }