[ie/Tagesschau] Update extractor for current website

2025-08-24 13:28:29 +00:00 · 2025-03-14 15:59:41 +01:00 · 2025-03-14 15:59:41 +01:00 · 613852e4a8
commit 613852e4a8
parent e67d786c7c
1 changed file with 30 additions and 34 deletions
--- a/yt_dlp/extractor/tagesschau.py
+++ b/yt_dlp/extractor/tagesschau.py
@@ -1,19 +1,19 @@
-import re
-
 from .common import InfoExtractor
 from ..utils import (
    UnsupportedError,
    extract_attributes,
+    get_elements_html_by_attribute,
    int_or_none,
-    js_to_json,
    parse_iso8601,
    try_get,
 )


 class TagesschauIE(InfoExtractor):
-    _WORKING = False
-    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
+    _VALID_URL = [
+        r'https?://(?:www\.)?tagesschau\.de(?:/[^/#?]+)*/(?P<id>[^/#?\.]+)',
+        r'https?://(?:www\.)?(?P<id>tagesschau\.de)/?',
+    ]

    _TESTS = [{
        'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
@@ -106,44 +106,40 @@ class TagesschauIE(InfoExtractor):
    }]

    def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        video_id = mobj.group('id') or mobj.group('path')
-        display_id = video_id.lstrip('-')
-
-        webpage = self._download_webpage(url, display_id)
+        webpage_id = self._match_id(url)
+        webpage = self._download_webpage(url, webpage_id)

        title = self._html_search_regex(
            r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
            webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False)

        entries = []
-        videos = re.findall(r'<div[^>]+>', webpage)
-        num = 0
-        for video in videos:
-            video = extract_attributes(video).get('data-config')
-            if not video:
-                continue
-            video = self._parse_json(video, video_id, transform_source=js_to_json, fatal=False)
-            video_formats = try_get(video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'])
+        media_players = get_elements_html_by_attribute(
+            'data-v-type', 'MediaPlayer(?:InlinePlay)?', webpage, escape_value=False)
+
+        for player in media_players:
+            data = self._parse_json(extract_attributes(player)['data-v'], webpage_id)
+            media_id = data['mc']['pluginData']['trackingSAND@all']['av_content_id']
+            video_formats = try_get(data, lambda x: x['mc']['streams'][0]['media'])
            if not video_formats:
                continue
-            num += 1
-            for video_format in video_formats:
-                media_url = video_format.get('_stream') or ''
            formats = []
+            for video_format in video_formats:
+                media_url = video_format.get('url') or ''
                if media_url.endswith('master.m3u8'):
-                    formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls')
+                    formats += self._extract_m3u8_formats(media_url, media_id, 'mp4', m3u8_id='hls')
                elif media_url.endswith('.mp3'):
-                    formats = [{
+                    formats.append({
                        'url': media_url,
                        'vcodec': 'none',
-                    }]
+                        'format_note': video_format.get('forcedLabel'),
+                    })
            if not formats:
                continue
            entries.append({
-                    'id': f'{display_id}-{num}',
-                    'title': try_get(video, lambda x: x['mc']['_title']),
-                    'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])),
+                'id': media_id,
+                'title': try_get(data, lambda x: x['mc']['meta']['title']),
+                'duration': int_or_none(try_get(data, lambda x: x['mc']['meta']['durationSeconds'])),
                'formats': formats,
            })

@@ -151,10 +147,10 @@ def _real_extract(self, url):
            raise UnsupportedError(url)

        if len(entries) > 1:
-            return self.playlist_result(entries, display_id, title)
+            return self.playlist_result(entries, webpage_id, title)

        return {
-            'id': display_id,
+            'id': entries[0]['id'],
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage),
            'formats': entries[0]['formats'],