diff --git a/yt_dlp/extractor/medialaan.py b/yt_dlp/extractor/medialaan.py
index c80b6dff1..bc12cad80 100644
--- a/yt_dlp/extractor/medialaan.py
+++ b/yt_dlp/extractor/medialaan.py
@@ -1,15 +1,73 @@
-import re
-
 from .common import InfoExtractor
 from ..utils import (
+    clean_html,
+    determine_ext,
     extract_attributes,
     int_or_none,
-    mimetype2ext,
-    parse_iso8601,
+    parse_resolution,
+    str_or_none,
+    url_or_none,
 )
+from ..utils.traversal import find_elements, traverse_obj
 
 
-class MedialaanIE(InfoExtractor):
+class MedialaanBaseIE(InfoExtractor):
+    def _extract_from_mychannels_api(self, mychannels_id):
+        webpage = self._download_webpage(
+            f'https://mychannels.video/embed/{mychannels_id}', mychannels_id)
+        brand_config = self._search_json(
+            r'window\.mychannels\.brand_config\s*=', webpage, 'brand config', mychannels_id)
+        response = self._download_json(
+            f'https://api.mychannels.world/v1/embed/video/{mychannels_id}',
+            mychannels_id, headers={'X-Mychannels-Brand': brand_config['brand']})
+
+        formats = []
+        for stream in traverse_obj(response, (
+            'streams', lambda _, v: url_or_none(v['url']),
+        )):
+            source_url = stream['url']
+            ext = determine_ext(source_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    source_url, mychannels_id, 'mp4', m3u8_id='hls', fatal=False))
+            else:
+                format_id = traverse_obj(stream, ('quality', {str}))
+                formats.append({
+                    'ext': ext,
+                    'format_id': format_id,
+                    'url': source_url,
+                    **parse_resolution(format_id),
+                })
+
+        return {
+            'id': mychannels_id,
+            'formats': formats,
+            **traverse_obj(response, {
+                'title': ('title', {clean_html}),
+                'description': ('description', {clean_html}, filter),
+                'duration': ('durationMs', {int_or_none(scale=1000)}, {lambda x: x if x >= 0 else None}),
+                'genres': ('genre', 'title', {str}, filter, all, filter),
+                'is_live': ('live', {bool}),
+                'release_timestamp': ('publicationTimestampMs', {int_or_none(scale=1000)}),
+                'tags': ('tags', ..., 'title', {str}, filter, all, filter),
+                'thumbnail': ('image', 'baseUrl', {url_or_none}),
+            }),
+            **traverse_obj(response, ('channel', {
+                'channel': ('title', {clean_html}),
+                'channel_id': ('id', {str_or_none}),
+            })),
+            **traverse_obj(response, ('organisation', {
+                'uploader': ('title', {clean_html}),
+                'uploader_id': ('id', {str_or_none}),
+            })),
+            **traverse_obj(response, ('show', {
+                'series': ('title', {clean_html}),
+                'series_id': ('id', {str_or_none}),
+            })),
+        }
+
+
+class MedialaanIE(MedialaanBaseIE):
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
@@ -32,7 +90,7 @@ class MedialaanIE(InfoExtractor):
                                     tubantia|
                                     volkskrant
                                 )\.nl
-                            )/video/(?:[^/]+/)*[^/?&#]+~p
+                            )/videos?/(?:[^/]+/)*[^/?&#]+(?:-|~p)
                         )
                         (?P<id>\d+)
                     '''
@@ -42,19 +100,83 @@ class MedialaanIE(InfoExtractor):
             'id': '193993',
             'ext': 'mp4',
             'title': 'De terugkeer van Ally de Aap en wie vertrekt er nog bij NAC?',
-            'thumbnail': r're:https?://images\.mychannels\.video/imgix/.+',
-            'timestamp': 1611663540,
-            'upload_date': '20210126',
+            'description': 'In een nieuwe Gegenpressing video bespreken Yadran Blanco en Dennis Kas het nieuws omrent NAC.',
             'duration': 238,
-        },
-        'params': {
-            'skip_download': True,
+            'channel': 'BN DeStem',
+            'channel_id': '418',
+            'genres': ['Sports'],
+            'release_date': '20210126',
+            'release_timestamp': 1611663540,
+            'series': 'Korte Reportage',
+            'series_id': '972',
+            'tags': 'count:2',
+            'thumbnail': r're:https?://images\.mychannels\.video/imgix/.+\.(?:jpe?g|png)',
+            'uploader': 'BN De Stem',
+            'uploader_id': '26',
         },
     }, {
         'url': 'https://www.gelderlander.nl/video/kanalen/degelderlander~c320/series/snel-nieuws~s984/noodbevel-in-doetinchem-politie-stuurt-mensen-centrum-uit~p194093',
-        'only_matching': True,
+        'info_dict': {
+            'id': '194093',
+            'ext': 'mp4',
+            'title': 'Noodbevel in Doetinchem: politie stuurt mensen centrum uit',
+            'description': 'md5:77e85b2cb26cfff9dc1fe2b1db524001',
+            'duration': 44,
+            'channel': 'De Gelderlander',
+            'channel_id': '320',
+            'genres': ['News'],
+            'release_date': '20210126',
+            'release_timestamp': 1611690600,
+            'series': 'Snel Nieuws',
+            'series_id': '984',
+            'tags': 'count:1',
+            'thumbnail': r're:https?://images\.mychannels\.video/imgix/.+\.(?:jpe?g|png)',
+            'uploader': 'De Gelderlander',
+            'uploader_id': '25',
+        },
     }, {
-        'url': 'https://embed.mychannels.video/sdk/production/193993?options=TFTFF_default',
+        'url': 'https://www.7sur7.be/videos/production/lla-tendance-tiktok-qui-enflamme-lespagne-707650',
+        'info_dict': {
+            'id': '707650',
+            'ext': 'mp4',
+            'title': 'La tendance TikTok qui enflamme l’Espagne',
+            'description': 'md5:c7ec4cb733190f227fc8935899f533b5',
+            'duration': 70,
+            'channel': 'Lifestyle',
+            'channel_id': '770',
+            'genres': ['Beauty & Lifestyle'],
+            'release_date': '20240906',
+            'release_timestamp': 1725617330,
+            'series': 'Lifestyle',
+            'series_id': '1848',
+            'tags': 'count:1',
+            'thumbnail': r're:https?://images\.mychannels\.video/imgix/.+\.(?:jpe?g|png)',
+            'uploader': '7sur7',
+            'uploader_id': '67',
+        },
+    }, {
+        'url': 'https://mychannels.video/embed/313117',
+        'info_dict': {
+            'id': '313117',
+            'ext': 'mp4',
+            'title': str,
+            'description': 'md5:255e2e52f6fe8a57103d06def438f016',
+            'channel': 'AD',
+            'channel_id': '238',
+            'genres': ['News'],
+            'live_status': 'is_live',
+            'release_date': '20241225',
+            'release_timestamp': 1735169425,
+            'series': 'Nieuws Update',
+            'series_id': '3337',
+            'tags': 'count:1',
+            'thumbnail': r're:https?://images\.mychannels\.video/imgix/.+\.(?:jpe?g|png)',
+            'uploader': 'AD',
+            'uploader_id': '1',
+        },
+        'params': {'skip_download': 'Livestream'},
+    }, {
+        'url': 'https://embed.mychannels.video/sdk/production/193993',
         'only_matching': True,
     }, {
         'url': 'https://embed.mychannels.video/script/production/193993',
@@ -62,9 +184,6 @@ class MedialaanIE(InfoExtractor):
     }, {
         'url': 'https://embed.mychannels.video/production/193993',
         'only_matching': True,
-    }, {
-        'url': 'https://mychannels.video/embed/193993',
-        'only_matching': True,
     }, {
         'url': 'https://embed.mychannels.video/embed/193993',
         'only_matching': True,
@@ -75,51 +194,32 @@ class MedialaanIE(InfoExtractor):
             'id': '1576607',
             'ext': 'mp4',
             'title': 'Tom Waes blaastest',
+            'channel': 'De Morgen',
+            'channel_id': '352',
+            'description': 'Tom Waes werkt mee aan een alcoholcampagne op Werchter',
             'duration': 62,
+            'genres': ['News'],
+            'release_date': '20250705',
+            'release_timestamp': 1751730795,
+            'series': 'Nieuwsvideo\'s',
+            'series_id': '1683',
+            'tags': 'count:1',
             'thumbnail': r're:https?://video-images\.persgroep\.be/aws_generated.+\.jpg',
-            'timestamp': 1751730795,
-            'upload_date': '20250705',
+            'uploader': 'De Morgen',
+            'uploader_id': '17',
         },
         'params': {'extractor_args': {'generic': {'impersonate': ['chrome']}}},
     }]
 
     @classmethod
     def _extract_embed_urls(cls, url, webpage):
-        entries = []
-        for element in re.findall(r'(<div[^>]+data-mychannels-type="video"[^>]*>)', webpage):
-            mychannels_id = extract_attributes(element).get('data-mychannels-id')
-            if mychannels_id:
-                entries.append('https://mychannels.video/embed/' + mychannels_id)
-        return entries
+        return traverse_obj(webpage, (
+            {find_elements(tag='div', attr='data-mychannels-type', value='video', html=True)},
+            ..., {extract_attributes}, 'data-mychannels-id', {str},
+            {lambda x: f'https://mychannels.video/embed/{x}'}, {url_or_none}, filter, all, filter,
+        ))
 
     def _real_extract(self, url):
-        production_id = self._match_id(url)
-        production = self._download_json(
-            'https://embed.mychannels.video/sdk/production/' + production_id,
-            production_id, query={'options': 'UUUU_default'})['productions'][0]
-        title = production['title']
+        mychannels_id = self._match_id(url)
 
-        formats = []
-        for source in (production.get('sources') or []):
-            src = source.get('src')
-            if not src:
-                continue
-            ext = mimetype2ext(source.get('type'))
-            if ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    src, production_id, 'mp4', 'm3u8_native',
-                    m3u8_id='hls', fatal=False))
-            else:
-                formats.append({
-                    'ext': ext,
-                    'url': src,
-                })
-
-        return {
-            'id': production_id,
-            'title': title,
-            'formats': formats,
-            'thumbnail': production.get('posterUrl'),
-            'timestamp': parse_iso8601(production.get('publicationDate'), ' '),
-            'duration': int_or_none(production.get('duration')) or None,
-        }
+        return self._extract_from_mychannels_api(mychannels_id)
diff --git a/yt_dlp/extractor/vtm.py b/yt_dlp/extractor/vtm.py
index 41b41ec17..c48117c47 100644
--- a/yt_dlp/extractor/vtm.py
+++ b/yt_dlp/extractor/vtm.py
@@ -1,60 +1,42 @@
-from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-    parse_iso8601,
-    try_get,
-)
+from .medialaan import MedialaanBaseIE
+from ..utils import str_or_none
+from ..utils.traversal import require, traverse_obj
 
 
-class VTMIE(InfoExtractor):
-    _WORKING = False
-    _VALID_URL = r'https?://(?:www\.)?vtm\.be/([^/?&#]+)~v(?P<id>[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12})'
-    _TEST = {
+class VTMIE(MedialaanBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?vtm\.be/[^/?#]+~v(?P<id>[\da-f-]+)'
+    _TESTS = [{
         'url': 'https://vtm.be/gast-vernielt-genkse-hotelkamer~ve7534523-279f-4b4d-a5c9-a33ffdbe23e1',
-        'md5': '37dca85fbc3a33f2de28ceb834b071f8',
         'info_dict': {
             'id': '192445',
             'ext': 'mp4',
             'title': 'Gast vernielt Genkse hotelkamer',
-            'timestamp': 1611060180,
-            'upload_date': '20210119',
+            'channel': 'VTM',
+            'channel_id': '867',
+            'description': 'md5:75fce957d219646ff1b65ba449ab97b5',
             'duration': 74,
-            # TODO: fix url _type result processing
-            # 'series': 'Op Interventie',
+            'genres': ['Documentaries'],
+            'release_date': '20210119',
+            'release_timestamp': 1611060180,
+            'series': 'Op Interventie',
+            'series_id': '2658',
+            'tags': 'count:2',
+            'thumbnail': r're:https?://images\.mychannels\.video/imgix/.+\.(?:jpe?g|png)',
+            'uploader': 'VTM',
+            'uploader_id': '74',
         },
-    }
+    }]
+
+    def _real_initialize(self):
+        if not self._get_cookies('https://vtm.be/').get('authId'):
+            self.raise_login_required()
 
     def _real_extract(self, url):
         uuid = self._match_id(url)
-        video = self._download_json(
-            'https://omc4vm23offuhaxx6hekxtzspi.appsync-api.eu-west-1.amazonaws.com/graphql',
-            uuid, query={
-                'query': '''{
-  getComponent(type: Video, uuid: "%s") {
-    ... on Video {
-      description
-      duration
-      myChannelsVideo
-      program {
-        title
-      }
-      publishedAt
-      title
-    }
-  }
-}''' % uuid,  # noqa: UP031
-            }, headers={
-                'x-api-key': 'da2-lz2cab4tfnah3mve6wiye4n77e',
-            })['data']['getComponent']
+        webpage = self._download_webpage(url, uuid)
+        apollo_state = self._search_json(
+            r'window\.__APOLLO_STATE__\s*=', webpage, 'apollo state', uuid)
+        mychannels_id = traverse_obj(apollo_state, (
+            f'Video:{{"uuid":"{uuid}"}}', 'myChannelsVideo', {str_or_none}, {require('mychannels ID')}))
 
-        return {
-            '_type': 'url',
-            'id': uuid,
-            'title': video.get('title'),
-            'url': 'http://mychannels.video/embed/%d' % video['myChannelsVideo'],
-            'description': video.get('description'),
-            'timestamp': parse_iso8601(video.get('publishedAt')),
-            'duration': int_or_none(video.get('duration')),
-            'series': try_get(video, lambda x: x['program']['title']),
-            'ie_key': 'Medialaan',
-        }
+        return self._extract_from_mychannels_api(mychannels_id)