1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-07-10 07:18:33 +00:00

[ie/Tagesschau] Update extractor for current website

This commit is contained in:
InvalidUsernameException 2025-03-14 15:59:41 +01:00
parent e67d786c7c
commit 613852e4a8

View File

@ -1,19 +1,19 @@
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
UnsupportedError, UnsupportedError,
extract_attributes, extract_attributes,
get_elements_html_by_attribute,
int_or_none, int_or_none,
js_to_json,
parse_iso8601, parse_iso8601,
try_get, try_get,
) )
class TagesschauIE(InfoExtractor): class TagesschauIE(InfoExtractor):
_WORKING = False _VALID_URL = [
_VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html' r'https?://(?:www\.)?tagesschau\.de(?:/[^/#?]+)*/(?P<id>[^/#?\.]+)',
r'https?://(?:www\.)?(?P<id>tagesschau\.de)/?',
]
_TESTS = [{ _TESTS = [{
'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
@ -106,44 +106,40 @@ class TagesschauIE(InfoExtractor):
}] }]
def _real_extract(self, url): def _real_extract(self, url):
mobj = self._match_valid_url(url) webpage_id = self._match_id(url)
video_id = mobj.group('id') or mobj.group('path') webpage = self._download_webpage(url, webpage_id)
display_id = video_id.lstrip('-')
webpage = self._download_webpage(url, display_id)
title = self._html_search_regex( title = self._html_search_regex(
r'<span[^>]*class="headline"[^>]*>(.+?)</span>', r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False) webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False)
entries = [] entries = []
videos = re.findall(r'<div[^>]+>', webpage) media_players = get_elements_html_by_attribute(
num = 0 'data-v-type', 'MediaPlayer(?:InlinePlay)?', webpage, escape_value=False)
for video in videos:
video = extract_attributes(video).get('data-config') for player in media_players:
if not video: data = self._parse_json(extract_attributes(player)['data-v'], webpage_id)
continue media_id = data['mc']['pluginData']['trackingSAND@all']['av_content_id']
video = self._parse_json(video, video_id, transform_source=js_to_json, fatal=False) video_formats = try_get(data, lambda x: x['mc']['streams'][0]['media'])
video_formats = try_get(video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'])
if not video_formats: if not video_formats:
continue continue
num += 1
for video_format in video_formats:
media_url = video_format.get('_stream') or ''
formats = [] formats = []
for video_format in video_formats:
media_url = video_format.get('url') or ''
if media_url.endswith('master.m3u8'): if media_url.endswith('master.m3u8'):
formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls') formats += self._extract_m3u8_formats(media_url, media_id, 'mp4', m3u8_id='hls')
elif media_url.endswith('.mp3'): elif media_url.endswith('.mp3'):
formats = [{ formats.append({
'url': media_url, 'url': media_url,
'vcodec': 'none', 'vcodec': 'none',
}] 'format_note': video_format.get('forcedLabel'),
})
if not formats: if not formats:
continue continue
entries.append({ entries.append({
'id': f'{display_id}-{num}', 'id': media_id,
'title': try_get(video, lambda x: x['mc']['_title']), 'title': try_get(data, lambda x: x['mc']['meta']['title']),
'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])), 'duration': int_or_none(try_get(data, lambda x: x['mc']['meta']['durationSeconds'])),
'formats': formats, 'formats': formats,
}) })
@ -151,10 +147,10 @@ def _real_extract(self, url):
raise UnsupportedError(url) raise UnsupportedError(url)
if len(entries) > 1: if len(entries) > 1:
return self.playlist_result(entries, display_id, title) return self.playlist_result(entries, webpage_id, title)
return { return {
'id': display_id, 'id': entries[0]['id'],
'title': title, 'title': title,
'thumbnail': self._og_search_thumbnail(webpage), 'thumbnail': self._og_search_thumbnail(webpage),
'formats': entries[0]['formats'], 'formats': entries[0]['formats'],