1
0
Mirror of https://github.com/yt-dlp/yt-dlp.git, synced 2025-07-10 07:18:33 +00:00

[ie/Tagesschau] Update extractor for current website

This commit is contained in:
InvalidUsernameException 2025-03-14 15:59:41 +01:00
parent e67d786c7c
commit 613852e4a8

View File

@@ -1,19 +1,19 @@
import re
from .common import InfoExtractor
from ..utils import (
UnsupportedError,
extract_attributes,
get_elements_html_by_attribute,
int_or_none,
js_to_json,
parse_iso8601,
try_get,
)
class TagesschauIE(InfoExtractor):
    """Extractor for video and audio pages on tagesschau.de.

    NOTE(review): the surrounding SOURCE is a rendered diff whose +/- markers
    were stripped, so old and new lines were interleaved (invalid Python).
    This block reconstructs the post-commit ("new") version; every code line
    below appears verbatim in the diff. The bulk of the _TESTS list is elided
    by the diff's second hunk marker — only the first test URL is visible.
    """
    _WORKING = False
    _VALID_URL = [
        # Any tagesschau.de path; the last component (sans extension) is the id.
        r'https?://(?:www\.)?tagesschau\.de(?:/[^/#?]+)*/(?P<id>[^/#?\.]+)',
        # The bare homepage, with the domain itself acting as the id.
        r'https?://(?:www\.)?(?P<id>tagesschau\.de)/?',
    ]
    _TESTS = [{
        'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
        # NOTE(review): further test fields/entries are hidden by the diff's
        # hunk elision and cannot be reconstructed from this view.
    }]

    def _real_extract(self, url):
        webpage_id = self._match_id(url)
        webpage = self._download_webpage(url, webpage_id)

        title = self._html_search_regex(
            r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
            webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False)

        entries = []
        # Each embedded player element carries its full config as JSON in the
        # `data-v` attribute; `data-v-type` distinguishes player variants.
        media_players = get_elements_html_by_attribute(
            'data-v-type', 'MediaPlayer(?:InlinePlay)?', webpage, escape_value=False)
        for player in media_players:
            data = self._parse_json(extract_attributes(player)['data-v'], webpage_id)
            media_id = data['mc']['pluginData']['trackingSAND@all']['av_content_id']
            video_formats = try_get(data, lambda x: x['mc']['streams'][0]['media'])
            if not video_formats:
                continue
            formats = []
            for video_format in video_formats:
                media_url = video_format.get('url') or ''
                if media_url.endswith('master.m3u8'):
                    formats += self._extract_m3u8_formats(media_url, media_id, 'mp4', m3u8_id='hls')
                elif media_url.endswith('.mp3'):
                    formats.append({
                        'url': media_url,
                        'vcodec': 'none',
                        'format_note': video_format.get('forcedLabel'),
                    })
            if not formats:
                continue
            entries.append({
                'id': media_id,
                'title': try_get(data, lambda x: x['mc']['meta']['title']),
                'duration': int_or_none(try_get(data, lambda x: x['mc']['meta']['durationSeconds'])),
                'formats': formats,
            })

        if not entries:
            raise UnsupportedError(url)
        if len(entries) > 1:
            return self.playlist_result(entries, webpage_id, title)
        # Exactly one media item on the page: return it directly instead of
        # wrapping it in a single-entry playlist.
        return {
            'id': entries[0]['id'],
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage),
            'formats': entries[0]['formats'],
        }