From 613852e4a8517070319e39c83d308e0d0d95111b Mon Sep 17 00:00:00 2001
From: InvalidUsernameException
 <InvalidUsernameException@users.noreply.github.com>
Date: Fri, 14 Mar 2025 15:59:41 +0100
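Subject: [PATCH] [ie/Tagesschau] Update extractor for current website

Adapt the extractor to the current page markup and player JSON:

* Drop `_WORKING = False`.
* Accept URLs without a `.html` suffix as well as the bare domain.
* Find players via elements whose `data-v-type` attribute matches
  `MediaPlayer(?:InlinePlay)?` and parse their `data-v` attribute
  (previously the `data-config` attribute of any `<div>`).
* Read streams from `mc.streams[0].media[].url` and title/duration
  from `mc.meta`.
* Build one entry per player holding all of its formats, identified by
  the `av_content_id` found under `mc.pluginData`, instead of one entry
  per stream with a `<display_id>-<n>` id.
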
---
 yt_dlp/extractor/tagesschau.py | 64 ++++++++++++++++------------------
 1 file changed, 30 insertions(+), 34 deletions(-)
diff --git a/yt_dlp/extractor/tagesschau.py b/yt_dlp/extractor/tagesschau.py
index 4c537dfd14..abe7c7cc87 100644
--- a/yt_dlp/extractor/tagesschau.py
+++ b/yt_dlp/extractor/tagesschau.py
@@ -1,19 +1,19 @@
-import re
-
 from .common import InfoExtractor
 from ..utils import (
     UnsupportedError,
     extract_attributes,
+    get_elements_html_by_attribute,
     int_or_none,
-    js_to_json,
     parse_iso8601,
     try_get,
 )
 
 
 class TagesschauIE(InfoExtractor):
-    _WORKING = False
-    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
+    _VALID_URL = [
+        r'https?://(?:www\.)?tagesschau\.de(?:/[^/#?]+)*/(?P<id>[^/#?\.]+)',
+        r'https?://(?:www\.)?(?P<id>tagesschau\.de)/?',
+    ]
 
     _TESTS = [{
         'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
@@ -106,55 +106,51 @@ class TagesschauIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        mobj = self._match_valid_url(url)
-        video_id = mobj.group('id') or mobj.group('path')
-        display_id = video_id.lstrip('-')
-
-        webpage = self._download_webpage(url, display_id)
+        webpage_id = self._match_id(url)
+        webpage = self._download_webpage(url, webpage_id)
 
         title = self._html_search_regex(
             r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
             webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False)
 
         entries = []
-        videos = re.findall(r'<div[^>]+>', webpage)
-        num = 0
-        for video in videos:
-            video = extract_attributes(video).get('data-config')
-            if not video:
-                continue
-            video = self._parse_json(video, video_id, transform_source=js_to_json, fatal=False)
-            video_formats = try_get(video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'])
+        media_players = get_elements_html_by_attribute(
+            'data-v-type', 'MediaPlayer(?:InlinePlay)?', webpage, escape_value=False)
+
+        for player in media_players:
+            data = self._parse_json(extract_attributes(player)['data-v'], webpage_id)
+            media_id = data['mc']['pluginData']['trackingSAND@all']['av_content_id']
+            video_formats = try_get(data, lambda x: x['mc']['streams'][0]['media'])
             if not video_formats:
                 continue
-            num += 1
+            formats = []
             for video_format in video_formats:
-                media_url = video_format.get('_stream') or ''
-                formats = []
+                media_url = video_format.get('url') or ''
                 if media_url.endswith('master.m3u8'):
-                    formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls')
+                    formats += self._extract_m3u8_formats(media_url, media_id, 'mp4', m3u8_id='hls')
                 elif media_url.endswith('.mp3'):
-                    formats = [{
+                    formats.append({
                         'url': media_url,
                         'vcodec': 'none',
-                    }]
-                if not formats:
-                    continue
-                entries.append({
-                    'id': f'{display_id}-{num}',
-                    'title': try_get(video, lambda x: x['mc']['_title']),
-                    'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])),
-                    'formats': formats,
-                })
+                        'format_note': video_format.get('forcedLabel'),
+                    })
+            if not formats:
+                continue
+            entries.append({
+                'id': media_id,
+                'title': try_get(data, lambda x: x['mc']['meta']['title']),
+                'duration': int_or_none(try_get(data, lambda x: x['mc']['meta']['durationSeconds'])),
+                'formats': formats,
+            })
 
         if not entries:
             raise UnsupportedError(url)
 
         if len(entries) > 1:
-            return self.playlist_result(entries, display_id, title)
+            return self.playlist_result(entries, webpage_id, title)
 
         return {
-            'id': display_id,
+            'id': entries[0]['id'],
             'title': title,
             'thumbnail': self._og_search_thumbnail(webpage),
             'formats': entries[0]['formats'],