From 613852e4a8517070319e39c83d308e0d0d95111b Mon Sep 17 00:00:00 2001 From: InvalidUsernameException Date: Fri, 14 Mar 2025 15:59:41 +0100 Subject: [PATCH 1/5] [ie/Tagesschau] Update extractor for current website --- yt_dlp/extractor/tagesschau.py | 64 ++++++++++++++++------------------ 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/yt_dlp/extractor/tagesschau.py b/yt_dlp/extractor/tagesschau.py index 4c537dfd1..abe7c7cc8 100644 --- a/yt_dlp/extractor/tagesschau.py +++ b/yt_dlp/extractor/tagesschau.py @@ -1,19 +1,19 @@ -import re - from .common import InfoExtractor from ..utils import ( UnsupportedError, extract_attributes, + get_elements_html_by_attribute, int_or_none, - js_to_json, parse_iso8601, try_get, ) class TagesschauIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html' + _VALID_URL = [ + r'https?://(?:www\.)?tagesschau\.de(?:/[^/#?]+)*/(?P<id>[^/#?\.]+)', + r'https?://(?:www\.)?(?P<id>tagesschau\.de)/?', + ] _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', @@ -106,55 +106,51 @@ class TagesschauIE(InfoExtractor): }] def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') or mobj.group('path') - display_id = video_id.lstrip('-') - - webpage = self._download_webpage(url, display_id) + webpage_id = self._match_id(url) + webpage = self._download_webpage(url, webpage_id) title = self._html_search_regex( r'<span[^>]*class="headline"[^>]*>(.+?)</span>', webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False) entries = [] - videos = re.findall(r'<div[^>]+>', webpage) - num = 0 - for video in videos: - video = extract_attributes(video).get('data-config') - if not video: - continue - video = self._parse_json(video, video_id, transform_source=js_to_json, fatal=False) - video_formats = try_get(video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray']) + media_players = 
get_elements_html_by_attribute( + 'data-v-type', 'MediaPlayer(?:InlinePlay)?', webpage, escape_value=False) + + for player in media_players: + data = self._parse_json(extract_attributes(player)['data-v'], webpage_id) + media_id = data['mc']['pluginData']['trackingSAND@all']['av_content_id'] + video_formats = try_get(data, lambda x: x['mc']['streams'][0]['media']) if not video_formats: continue - num += 1 + formats = [] for video_format in video_formats: - media_url = video_format.get('_stream') or '' - formats = [] + media_url = video_format.get('url') or '' if media_url.endswith('master.m3u8'): - formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls') + formats += self._extract_m3u8_formats(media_url, media_id, 'mp4', m3u8_id='hls') elif media_url.endswith('.mp3'): - formats = [{ + formats.append({ 'url': media_url, 'vcodec': 'none', - }] - if not formats: - continue - entries.append({ - 'id': f'{display_id}-{num}', - 'title': try_get(video, lambda x: x['mc']['_title']), - 'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])), - 'formats': formats, - }) + 'format_note': video_format.get('forcedLabel'), + }) + if not formats: + continue + entries.append({ + 'id': media_id, + 'title': try_get(data, lambda x: x['mc']['meta']['title']), + 'duration': int_or_none(try_get(data, lambda x: x['mc']['meta']['durationSeconds'])), + 'formats': formats, + }) if not entries: raise UnsupportedError(url) if len(entries) > 1: - return self.playlist_result(entries, display_id, title) + return self.playlist_result(entries, webpage_id, title) return { - 'id': display_id, + 'id': entries[0]['id'], 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), 'formats': entries[0]['formats'], From 6e65718b5859798c1dc1b2509dbcaa37046f62ec Mon Sep 17 00:00:00 2001 From: InvalidUsernameException Date: Fri, 14 Mar 2025 16:18:43 +0100 Subject: [PATCH 2/5] [ie/Tagesschau] Use _yes_playlist --- yt_dlp/extractor/tagesschau.py | 3 ++- 1 file changed, 
2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/tagesschau.py b/yt_dlp/extractor/tagesschau.py index abe7c7cc8..9bde4998c 100644 --- a/yt_dlp/extractor/tagesschau.py +++ b/yt_dlp/extractor/tagesschau.py @@ -146,7 +146,8 @@ def _real_extract(self, url): if not entries: raise UnsupportedError(url) - if len(entries) > 1: + if len(entries) > 1 and self._yes_playlist( + webpage_id, entries[0]['id'], playlist_label='all media on', video_label='file'): return self.playlist_result(entries, webpage_id, title) return { From ec641289d19e7a78643d6b374ad819c677fc9c3d Mon Sep 17 00:00:00 2001 From: InvalidUsernameException Date: Fri, 14 Mar 2025 22:43:07 +0100 Subject: [PATCH 3/5] [ie/Tagesschau] Use traverse_obj() --- yt_dlp/extractor/tagesschau.py | 85 +++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 37 deletions(-) diff --git a/yt_dlp/extractor/tagesschau.py b/yt_dlp/extractor/tagesschau.py index 9bde4998c..99c3411bb 100644 --- a/yt_dlp/extractor/tagesschau.py +++ b/yt_dlp/extractor/tagesschau.py @@ -5,8 +5,9 @@ get_elements_html_by_attribute, int_or_none, parse_iso8601, - try_get, + url_or_none, ) +from ..utils.traversal import traverse_obj class TagesschauIE(InfoExtractor): @@ -108,54 +109,64 @@ class TagesschauIE(InfoExtractor): def _real_extract(self, url): webpage_id = self._match_id(url) webpage = self._download_webpage(url, webpage_id) - - title = self._html_search_regex( - r']*class="headline"[^>]*>(.+?)', - webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False) - - entries = [] media_players = get_elements_html_by_attribute( 'data-v-type', 'MediaPlayer(?:InlinePlay)?', webpage, escape_value=False) + entries = [] for player in media_players: data = self._parse_json(extract_attributes(player)['data-v'], webpage_id) - media_id = data['mc']['pluginData']['trackingSAND@all']['av_content_id'] - video_formats = try_get(data, lambda x: x['mc']['streams'][0]['media']) - if not video_formats: + media_id = 
traverse_obj(data, ('mc', 'pluginData', ( + ('trackingSAND@all', 'av_content_id'), + ('trackingPiano@all', 'avContent', 'av_content_id'), + ('trackingAgf@all', 'playerID')), any)) + if not media_id: + self.report_warning('Skipping unrecognized media file') continue - formats = [] - for video_format in video_formats: - media_url = video_format.get('url') or '' - if media_url.endswith('master.m3u8'): - formats += self._extract_m3u8_formats(media_url, media_id, 'mp4', m3u8_id='hls') - elif media_url.endswith('.mp3'): - formats.append({ - 'url': media_url, - 'vcodec': 'none', - 'format_note': video_format.get('forcedLabel'), - }) - if not formats: - continue - entries.append({ + + entry = { 'id': media_id, - 'title': try_get(data, lambda x: x['mc']['meta']['title']), - 'duration': int_or_none(try_get(data, lambda x: x['mc']['meta']['durationSeconds'])), - 'formats': formats, - }) + **traverse_obj(data, { + 'title': ('mc', ( + ('pluginData', 'trackingPiano@all', 'avContent', 'av_content'), + ('meta', 'title')), any), + 'duration': ('mc', 'meta', 'durationSeconds', {int_or_none}), + 'thumbnail': ( + 'pc', 'generic', 'imageTemplateConfig', 'size', -1, + 'value', {lambda v: (v + '.webp') if v else None}), + 'timestamp': ( + 'mc', 'pluginData', 'trackingPiano@all', 'avContent', + 'd:av_publication_date', {parse_iso8601}), + }), + } + input_formats = traverse_obj(data, ( + 'mc', 'streams', 0, 'media', lambda _, v: url_or_none(v.get('url'))), default=[]) + + formats = [] + for input_format in input_formats: + file_url = input_format['url'] + if file_url.endswith('master.m3u8'): + formats = self._extract_m3u8_formats(file_url, media_id, 'mp4', m3u8_id='hls') + break + if file_url.endswith('.mp3'): + formats.append(traverse_obj(input_format, { + 'url': 'url', + 'vcodec': {lambda _: 'none'}, + 'format_note': ('forcedLabel', {str}), + })) + if not formats: + self.report_warning(f'Skipping file {media_id} because it has no formats') + continue + entry['formats'] = formats + 
entries.append(entry) if not entries: raise UnsupportedError(url) if len(entries) > 1 and self._yes_playlist( webpage_id, entries[0]['id'], playlist_label='all media on', video_label='file'): + title = self._html_search_regex( + r'<span[^>]*class="headline"[^>]*>(.+?)</span>', + webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False) return self.playlist_result(entries, webpage_id, title) - return { - 'id': entries[0]['id'], - 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage), - 'formats': entries[0]['formats'], - 'timestamp': parse_iso8601(self._html_search_meta('date', webpage)), - 'description': self._og_search_description(webpage), - 'duration': entries[0]['duration'], - } + return entries[0] From 3c31737931d5f3027179d3167b5fb2dd98445f50 Mon Sep 17 00:00:00 2001 From: InvalidUsernameException Date: Sat, 15 Mar 2025 19:50:35 +0100 Subject: [PATCH 4/5] [ie/Tagesschau] Update and expand tests --- yt_dlp/extractor/tagesschau.py | 143 +++++++++++++++++++++++++-------- 1 file changed, 111 insertions(+), 32 deletions(-) diff --git a/yt_dlp/extractor/tagesschau.py b/yt_dlp/extractor/tagesschau.py index 99c3411bb..3db81e2cc 100644 --- a/yt_dlp/extractor/tagesschau.py +++ b/yt_dlp/extractor/tagesschau.py @@ -17,65 +17,139 @@ class TagesschauIE(InfoExtractor): ] _TESTS = [{ + # Single video without recommendations 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', 'md5': 'ccb9359bf8c4795836e43759f3408a93', 'info_dict': { - 'id': 'video-102143-1', + 'id': 'video-102143', 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', 'duration': 138, + 'thumbnail': 'https://images.tagesschau.de/image/eb0b0d74-03ac-45ec-9300-0851fd6823d3/AAABj-POS-g/AAABkZLhkrw/16x9-1280/sendungslogo-tagesschau-100.webp', + 'timestamp': 1437250200, + 'upload_date': '20150718', }, }, { + # Single video embedded + 'url': 'https://www.tagesschau.de/multimedia/sendung/tagesschau_20_uhr/video-102143~player.html', + 'md5': 
'ccb9359bf8c4795836e43759f3408a93', + 'info_dict': { + 'id': 'video-102143', + 'ext': 'mp4', + 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', + 'duration': 138, + 'thumbnail': 'https://images.tagesschau.de/image/eb0b0d74-03ac-45ec-9300-0851fd6823d3/AAABj-POS-g/AAABkZLhkrw/16x9-1280/sendungslogo-tagesschau-100.webp', + 'timestamp': 1437250200, + 'upload_date': '20150718', + }, + }, { + # Single video with recommendations, `--no-playlist` 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', 'md5': '5c15e8f3da049e48829ec9786d835536', 'info_dict': { - 'id': 'ts-5727-1', + 'id': 'video-45741', 'ext': 'mp4', - 'title': 'Ganze Sendung', + 'title': 'tagesschau 20:00 Uhr', 'duration': 932, + 'thumbnail': 'https://images.tagesschau.de/image/eb0b0d74-03ac-45ec-9300-0851fd6823d3/AAABj-POS-g/AAABkZLhkrw/16x9-1280/sendungslogo-tagesschau-100.webp', + 'timestamp': 1417723200, + 'upload_date': '20141204', }, + 'params': {'noplaylist': True}, }, { - # exclusive audio - 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', + # Single audio embedded + 'url': 'https://www.tagesschau.de/multimedia/audio/audio-157831~player.html', 'md5': '4bff8f23504df56a0d86ed312d654182', 'info_dict': { - 'id': 'audio-29417-1', + 'id': 'audio-157831', 'ext': 'mp3', 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet', + 'duration': 200, + 'thumbnail': 'https://images.tagesschau.de/image/197a5977-3f5f-4c21-8c08-fad6ecb4b493/AAABj864C3w/AAABkZLhkrw/16x9-1280/default-audioplayer-100.webp', + 'timestamp': 1679687280, + 'upload_date': '20230324', }, }, { - 'url': 'http://www.tagesschau.de/inland/bnd-303.html', - 'md5': 'f049fa1698d7564e9ca4c3325108f034', + # Single audio with recommendations, `--no-playlist` + 'url': 'https://www.tagesschau.de/multimedia/audio/audio-157831.html', + 'md5': '4bff8f23504df56a0d86ed312d654182', 'info_dict': { - 'id': 'bnd-303-1', - 'ext': 'mp3', - 'title': 'Das Siegel des 
Bundesnachrichtendienstes | dpa', - }, - }, { - 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', - 'info_dict': { - 'id': 'afd-parteitag-135', - 'title': 'AfD', - }, - 'playlist_mincount': 15, - }, { - 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', - 'info_dict': { - 'id': 'audio-29417-1', + 'id': 'audio-157831', 'ext': 'mp3', 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet', + 'duration': 200, + 'thumbnail': 'https://images.tagesschau.de/image/197a5977-3f5f-4c21-8c08-fad6ecb4b493/AAABj864C3w/AAABkZLhkrw/16x9-1280/default-audioplayer-100.webp', + 'timestamp': 1679687280, + 'upload_date': '20230324', }, + 'params': {'noplaylist': True}, }, { - 'url': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-327.html', + # Article with multimedia content, `--no-playlist` + 'url': 'https://www.tagesschau.de/inland/bundestagswahl/bundestagswahl-ergebnisse-104.html', + 'md5': 'f72b42f213f632dbbe76551fabebcaef', 'info_dict': { - 'id': 'podcast-11km-327', - 'ext': 'mp3', - 'title': 'Gewalt in der Kita – Wenn Erzieher:innen schweigen', - 'upload_date': '20230322', - 'timestamp': 1679482808, - 'thumbnail': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-329~_v-original.jpg', - 'description': 'md5:dad059931fe4b3693e3656e93a249848', + 'id': 'video-1437570', + 'ext': 'mp4', + 'title': 'Union mit Kanzlerkandidat Merz gewinnt Bundestagswahl: Parteienlandschaft im Umbruch', + 'duration': 181, + 'thumbnail': 'https://images.tagesschau.de/image/ab8aa6ce-4fd5-4c1e-921d-807b07848a80/AAABlTZ3CAQ/AAABkZLpihI/20x9-1280/union-wahl-siegesfeier-100.webp', + 'timestamp': 1740401379, + 'upload_date': '20250224', }, + 'params': {'noplaylist': True}, + }, { + # Topic page with multimedia content, `--no-playlist` + 'url': 'https://www.tagesschau.de/thema/em_2024', + 'md5': '07be4d381753e8411b527c8f0a36229f', + 'info_dict': { + 'id': 'audio-195242', + 'ext': 'mp3', + 'title': 'Optimistische 
Verbraucherstimmung kommt an der Börse nicht an ', + 'thumbnail': 'https://images.tagesschau.de/image/490fbbe9-1718-4fe0-8f51-538a3182d28e/AAABkOOc5Z0/AAABkZLpihI/20x9-1280/em-2024-fans-100.webp', + 'timestamp': 1721825201, + 'upload_date': '20240724', + }, + 'params': {'noplaylist': True}, + }, { + # Playlist, single video with recommendations + 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', + 'info_dict': { + 'id': 'ts-5727', + 'title': 'tagesschau', + }, + 'playlist_mincount': 8, + }, { + # Playlist, single audio with recommendations + 'url': 'https://www.tagesschau.de/multimedia/audio/audio-157831.html', + 'info_dict': { + 'id': 'audio-157831', + 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet', + }, + 'playlist_mincount': 5, + }, { + # Playlist, article with multimedia content + 'url': 'https://www.tagesschau.de/inland/bundestagswahl/bundestagswahl-ergebnisse-104.html', + 'info_dict': { + 'id': 'bundestagswahl-ergebnisse-104', + 'title': 'Vorläufiges Ergebnis der Bundestagswahl: Union stärkste Kraft, FDP und BSW draußen', + }, + 'playlist_mincount': 20, + }, { + # Playlist, topic page with multimedia content + 'url': 'https://www.tagesschau.de/thema/em_2024', + 'info_dict': { + 'id': 'em_2024', + 'title': 'EM 2024', + }, + 'playlist_mincount': 10, + }, { + # Podcast feed + 'url': 'https://www.tagesschau.de/multimedia/podcast/11km/11km-feed-100.html', + 'info_dict': { + 'id': '11km-feed-100', + 'title': '11KM: der tagesschau-Podcast', + }, + 'playlist_mincount': 250, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', 'only_matching': True, @@ -101,9 +175,14 @@ class TagesschauIE(InfoExtractor): 'url': 'http://www.tagesschau.de/100sekunden/index.html', 'only_matching': True, }, { - # playlist article with collapsing sections 'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html', 'only_matching': True, + }, { + 'url': 'https://www.tagesschau.de', + 
'only_matching': True, + }, { + 'url': 'https://www.tagesschau.de/', + 'only_matching': True, }] def _real_extract(self, url): From 5d0fa223595ba31d4104be99e9fb95cf358396c9 Mon Sep 17 00:00:00 2001 From: InvalidUsernameException Date: Sat, 15 Mar 2025 21:54:52 +0100 Subject: [PATCH 5/5] [ie/Tagesschau] Improve title extraction for playlists --- yt_dlp/extractor/tagesschau.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/tagesschau.py b/yt_dlp/extractor/tagesschau.py index 3db81e2cc..e60fc5e9c 100644 --- a/yt_dlp/extractor/tagesschau.py +++ b/yt_dlp/extractor/tagesschau.py @@ -243,9 +243,8 @@ def _real_extract(self, url): if len(entries) > 1 and self._yes_playlist( webpage_id, entries[0]['id'], playlist_label='all media on', video_label='file'): - title = self._html_search_regex( - r'<span[^>]*class="headline"[^>]*>(.+?)</span>', - webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False) + title = self._html_search_meta( + ['og:title', 'title', 'twitter:title'], webpage, 'title', fatal=False) return self.playlist_result(entries, webpage_id, title) return entries[0]