From 613852e4a8517070319e39c83d308e0d0d95111b Mon Sep 17 00:00:00 2001 From: InvalidUsernameException Date: Fri, 14 Mar 2025 15:59:41 +0100 Subject: [PATCH] [ie/Tagesschau] Update extractor for current website --- yt_dlp/extractor/tagesschau.py | 64 ++++++++++++++++------------------ 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/yt_dlp/extractor/tagesschau.py b/yt_dlp/extractor/tagesschau.py index 4c537dfd14..abe7c7cc87 100644 --- a/yt_dlp/extractor/tagesschau.py +++ b/yt_dlp/extractor/tagesschau.py @@ -1,19 +1,19 @@ -import re - from .common import InfoExtractor from ..utils import ( UnsupportedError, extract_attributes, + get_elements_html_by_attribute, int_or_none, - js_to_json, parse_iso8601, try_get, ) class TagesschauIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P[^/]+/(?:[^/]+/)*?(?P[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html' + _VALID_URL = [ + r'https?://(?:www\.)?tagesschau\.de(?:/[^/#?]+)*/(?P[^/#?\.]+)', + r'https?://(?:www\.)?(?Ptagesschau\.de)/?', + ] _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', @@ -106,55 +106,51 @@ class TagesschauIE(InfoExtractor): }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') or mobj.group('path') - display_id = video_id.lstrip('-') - - webpage = self._download_webpage(url, display_id) + webpage_id = self._match_id(url) + webpage = self._download_webpage(url, webpage_id) title = self._html_search_regex( r']*class="headline"[^>]*>(.+?)', webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False) entries = [] - videos = re.findall(r']+>', webpage) - num = 0 - for video in videos: - video = extract_attributes(video).get('data-config') - if not video: - continue - video = self._parse_json(video, video_id, transform_source=js_to_json, fatal=False) - video_formats = try_get(video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray']) + media_players = get_elements_html_by_attribute( + 'data-v-type', 'MediaPlayer(?:InlinePlay)?', webpage, escape_value=False) + + for player in media_players: + data = self._parse_json(extract_attributes(player)['data-v'], webpage_id) + media_id = data['mc']['pluginData']['trackingSAND@all']['av_content_id'] + video_formats = try_get(data, lambda x: x['mc']['streams'][0]['media']) if not video_formats: continue - num += 1 + formats = [] for video_format in video_formats: - media_url = video_format.get('_stream') or '' - formats = [] + media_url = video_format.get('url') or '' if media_url.endswith('master.m3u8'): - formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls') + formats += self._extract_m3u8_formats(media_url, media_id, 'mp4', m3u8_id='hls') elif media_url.endswith('.mp3'): - formats = [{ + formats.append({ 'url': media_url, 'vcodec': 'none', - }] - if not formats: - continue - entries.append({ - 'id': f'{display_id}-{num}', - 'title': try_get(video, lambda x: x['mc']['_title']), - 'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])), - 'formats': formats, - }) + 'format_note': video_format.get('forcedLabel'), + }) + if not formats: + continue + entries.append({ + 'id': media_id, + 'title': try_get(data, lambda x: x['mc']['meta']['title']), + 'duration': int_or_none(try_get(data, lambda x: x['mc']['meta']['durationSeconds'])), + 'formats': formats, + }) if not entries: raise UnsupportedError(url) if len(entries) > 1: - return self.playlist_result(entries, display_id, title) + return self.playlist_result(entries, webpage_id, title) return { - 'id': display_id, + 'id': entries[0]['id'], 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), 'formats': entries[0]['formats'],