[^/]+/[^/?#&]+)'
- _TESTS = [{
- # clip
- 'url': 'http://noovo.ca/videos/rpm-plus/chrysler-imperial',
- 'info_dict': {
- 'id': '5386045029001',
- 'ext': 'mp4',
- 'title': 'Chrysler Imperial',
- 'description': 'md5:de3c898d1eb810f3e6243e08c8b4a056',
- 'timestamp': 1491399228,
- 'upload_date': '20170405',
- 'uploader_id': '618566855001',
- 'series': 'RPM+',
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- # episode
- 'url': 'http://noovo.ca/videos/l-amour-est-dans-le-pre/episode-13-8',
- 'info_dict': {
- 'id': '5395865725001',
- 'title': 'Épisode 13 : Les retrouvailles',
- 'description': 'md5:888c3330f0c1b4476c5bc99a1c040473',
- 'ext': 'mp4',
- 'timestamp': 1492019320,
- 'upload_date': '20170412',
- 'uploader_id': '618566855001',
- 'series': "L'amour est dans le pré",
- 'season_number': 5,
- 'episode': 'Épisode 13',
- 'episode_number': 13,
- },
- 'params': {
- 'skip_download': True,
- },
- }]
- BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/618566855001/default_default/index.html?videoId=%s'
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
-
- brightcove_id = self._search_regex(
- r'data-video-id=["\'](\d+)', webpage, 'brightcove id')
-
- data = self._parse_json(
- self._search_regex(
- r'(?s)dataLayer\.push\(\s*({.+?})\s*\);', webpage, 'data',
- default='{}'),
- video_id, transform_source=js_to_json, fatal=False)
-
- title = try_get(
- data, lambda x: x['video']['nom'],
- str) or self._html_search_meta(
- 'dcterms.Title', webpage, 'title', fatal=True)
-
- description = self._html_search_meta(
- ('dcterms.Description', 'description'), webpage, 'description')
-
- series = try_get(
- data, lambda x: x['emission']['nom']) or self._search_regex(
- r'<div[^>]+class="banner-card__subtitle h4"[^>]*>([^<]+)',
- webpage, 'series', default=None)
-
- season_el = try_get(data, lambda x: x['emission']['saison'], dict) or {}
- season = try_get(season_el, lambda x: x['nom'], str)
- season_number = int_or_none(try_get(season_el, lambda x: x['numero']))
-
- episode_el = try_get(season_el, lambda x: x['episode'], dict) or {}
- episode = try_get(episode_el, lambda x: x['nom'], str)
- episode_number = int_or_none(try_get(episode_el, lambda x: x['numero']))
-
- return {
- '_type': 'url_transparent',
- 'ie_key': BrightcoveNewIE.ie_key(),
- 'url': smuggle_url(
- self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
- {'geo_countries': ['CA']}),
- 'id': brightcove_id,
- 'title': title,
- 'description': description,
- 'series': series,
- 'season': season,
- 'season_number': season_number,
- 'episode': episode,
- 'episode_number': episode_number,
- }
diff --git a/yt_dlp/extractor/ntvcojp.py b/yt_dlp/extractor/ntvcojp.py
index 422ec6eb0..76c5936ba 100644
--- a/yt_dlp/extractor/ntvcojp.py
+++ b/yt_dlp/extractor/ntvcojp.py
@@ -1,55 +1,82 @@
-from .common import InfoExtractor
+from .streaks import StreaksBaseIE
from ..utils import (
- ExtractorError,
- smuggle_url,
- traverse_obj,
+ int_or_none,
+ parse_iso8601,
+ str_or_none,
+ url_or_none,
)
+from ..utils.traversal import require, traverse_obj
-class NTVCoJpCUIE(InfoExtractor):
+class NTVCoJpCUIE(StreaksBaseIE):
IE_NAME = 'cu.ntv.co.jp'
- IE_DESC = 'Nippon Television Network'
- _VALID_URL = r'https?://cu\.ntv\.co\.jp/(?!program)(?P<id>[^/?]+)'
- _TEST = {
- 'url': 'https://cu.ntv.co.jp/televiva-chill-gohan_181031/',
+ IE_DESC = '日テレ無料TADA!'
+ _VALID_URL = r'https?://cu\.ntv\.co\.jp/(?!program-list|search)(?P<id>[\w-]+)/?(?:[?#]|$)'
+ _TESTS = [{
+ 'url': 'https://cu.ntv.co.jp/gaki_20250525/',
'info_dict': {
- 'id': '5978891207001',
+ 'id': 'gaki_20250525',
'ext': 'mp4',
- 'title': '桜エビと炒り卵がポイント! 「中華風 エビチリおにぎり」──『美虎』五十嵐美幸',
- 'upload_date': '20181213',
- 'description': 'md5:1985b51a9abc285df0104d982a325f2a',
- 'uploader_id': '3855502814001',
- 'timestamp': 1544669941,
+ 'title': '放送開始36年!方正ココリコが選ぶ神回&地獄回!',
+ 'cast': 'count:2',
+ 'description': 'md5:1e1db556224d627d4d2f74370c650927',
+ 'display_id': 'ref:gaki_20250525',
+ 'duration': 1450,
+ 'episode': '放送開始36年!方正ココリコが選ぶ神回&地獄回!',
+ 'episode_id': '000000010172808',
+ 'episode_number': 255,
+ 'genres': ['variety'],
+ 'live_status': 'not_live',
+ 'modified_date': '20250525',
+ 'modified_timestamp': 1748145537,
+ 'release_date': '20250525',
+ 'release_timestamp': 1748145539,
+ 'series': 'ダウンタウンのガキの使いやあらへんで!',
+ 'series_id': 'gaki',
+ 'thumbnail': r're:https?://.+\.jpg',
+ 'timestamp': 1748145197,
+ 'upload_date': '20250525',
+ 'uploader': '日本テレビ放送網',
+ 'uploader_id': '0x7FE2',
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- }
-
- BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- player_config = self._search_nuxt_data(webpage, display_id)
- video_id = traverse_obj(player_config, ('movie', 'video_id'))
- if not video_id:
- raise ExtractorError('Failed to extract video ID for Brightcove')
- account_id = traverse_obj(player_config, ('player', 'account')) or '3855502814001'
- title = traverse_obj(player_config, ('movie', 'name'))
- if not title:
- og_title = self._og_search_title(webpage, fatal=False) or traverse_obj(player_config, ('player', 'title'))
- if og_title:
- title = og_title.split('(', 1)[0].strip()
- description = (traverse_obj(player_config, ('movie', 'description'))
- or self._html_search_meta(['description', 'og:description'], webpage))
+
+ info = self._search_json(
+ r'window\.app\s*=', webpage, 'video info',
+ display_id)['falcorCache']['catalog']['episode'][display_id]['value']
+ media_id = traverse_obj(info, (
+ 'streaks_data', 'mediaid', {str_or_none}, {require('Streaks media ID')}))
+ non_phonetic = (lambda _, v: v['is_phonetic'] is False, 'value', {str})
+
return {
- '_type': 'url_transparent',
- 'id': video_id,
- 'display_id': display_id,
- 'title': title,
- 'description': description,
- 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account_id, video_id), {'geo_countries': ['JP']}),
- 'ie_key': 'BrightcoveNew',
+ **self._extract_from_streaks_api('ntv-tada', media_id, headers={
+ 'X-Streaks-Api-Key': 'df497719056b44059a0483b8faad1f4a',
+ }),
+ **traverse_obj(info, {
+ 'id': ('content_id', {str_or_none}),
+ 'title': ('title', *non_phonetic, any),
+ 'age_limit': ('is_adult_only_content', {lambda x: 18 if x else None}),
+ 'cast': ('credit', ..., 'name', *non_phonetic),
+ 'genres': ('genre', ..., {str}),
+ 'release_timestamp': ('pub_date', {parse_iso8601}),
+ 'tags': ('tags', ..., {str}),
+ 'thumbnail': ('artwork', ..., 'url', any, {url_or_none}),
+ }),
+ **traverse_obj(info, ('tv_episode_info', {
+ 'duration': ('duration', {int_or_none}),
+ 'episode_number': ('episode_number', {int}),
+ 'series': ('parent_show_title', *non_phonetic, any),
+ 'series_id': ('show_content_id', {str}),
+ })),
+ **traverse_obj(info, ('custom_data', {
+ 'description': ('program_detail', {str}),
+ 'episode': ('episode_title', {str}),
+ 'episode_id': ('episode_id', {str_or_none}),
+ 'uploader': ('network_name', {str}),
+ 'uploader_id': ('network_id', {str}),
+ })),
}
diff --git a/yt_dlp/extractor/odnoklassniki.py b/yt_dlp/extractor/odnoklassniki.py
index d27d1c3f0..18eba42e6 100644
--- a/yt_dlp/extractor/odnoklassniki.py
+++ b/yt_dlp/extractor/odnoklassniki.py
@@ -273,6 +273,8 @@ def _extract_desktop(self, url):
return self._extract_desktop(smuggle_url(url, {'referrer': 'https://boosty.to'}))
elif error:
raise ExtractorError(error, expected=True)
+ elif '>Access to this video is restricted</div>' in webpage:
+ self.raise_login_required()
player = self._parse_json(
unescapeHTML(self._search_regex(
@@ -429,7 +431,7 @@ def _extract_mobile(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
- f'http://m.ok.ru/video/{video_id}', video_id,
+ f'https://m.ok.ru/video/{video_id}', video_id,
note='Downloading mobile webpage')
error = self._search_regex(
diff --git a/yt_dlp/extractor/qqmusic.py b/yt_dlp/extractor/qqmusic.py
index fb46e0d12..56a8e7300 100644
--- a/yt_dlp/extractor/qqmusic.py
+++ b/yt_dlp/extractor/qqmusic.py
@@ -15,7 +15,6 @@
str_or_none,
strip_jsonp,
traverse_obj,
- unescapeHTML,
url_or_none,
urljoin,
)
@@ -425,7 +424,7 @@ def _real_extract(self, url):
return self.playlist_result(entries, list_id, **traverse_obj(list_json, ('cdlist', 0, {
'title': ('dissname', {str}),
- 'description': ('desc', {unescapeHTML}, {clean_html}),
+ 'description': ('desc', {clean_html}),
})))
diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py
index c489dc731..027f7a7b6 100644
--- a/yt_dlp/extractor/rai.py
+++ b/yt_dlp/extractor/rai.py
@@ -765,7 +765,7 @@ class RaiCulturaIE(RaiNewsIE): # XXX: Do not subclass from concrete IE
class RaiSudtirolIE(RaiBaseIE):
- _VALID_URL = r'https?://raisudtirol\.rai\.it/.+media=(?P<id>\w+)'
+ _VALID_URL = r'https?://rai(?:bz|sudtirol)\.rai\.it/.+media=(?P<id>\w+)'
_TESTS = [{
# mp4 file
'url': 'https://raisudtirol.rai.it/la/index.php?media=Ptv1619729460',
@@ -791,6 +791,9 @@ class RaiSudtirolIE(RaiBaseIE):
'formats': 'count:6',
},
'params': {'skip_download': True},
+ }, {
+ 'url': 'https://raibz.rai.it/de/index.php?media=Ptv1751660400',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/yt_dlp/extractor/sauceplus.py b/yt_dlp/extractor/sauceplus.py
new file mode 100644
index 000000000..75d7022d3
--- /dev/null
+++ b/yt_dlp/extractor/sauceplus.py
@@ -0,0 +1,41 @@
+from .floatplane import FloatplaneBaseIE
+
+
+class SaucePlusIE(FloatplaneBaseIE):
+ IE_DESC = 'Sauce+'
+ _VALID_URL = r'https?://(?:(?:www|beta)\.)?sauceplus\.com/post/(?P<id>\w+)'
+ _BASE_URL = 'https://www.sauceplus.com'
+ _HEADERS = {
+ 'Origin': _BASE_URL,
+ 'Referer': f'{_BASE_URL}/',
+ }
+ _IMPERSONATE_TARGET = True
+ _TESTS = [{
+ 'url': 'https://www.sauceplus.com/post/YbBwIa2A5g',
+ 'info_dict': {
+ 'id': 'eit4Ugu5TL',
+ 'ext': 'mp4',
+ 'display_id': 'YbBwIa2A5g',
+ 'title': 'Scare the Coyote - Episode 3',
+ 'description': '',
+ 'thumbnail': r're:^https?://.*\.jpe?g$',
+ 'duration': 2975,
+ 'comment_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'release_date': '20250627',
+ 'release_timestamp': 1750993500,
+ 'uploader': 'Scare The Coyote',
+ 'uploader_id': '683e0a3269688656a5a49a44',
+ 'uploader_url': 'https://www.sauceplus.com/channel/ScareTheCoyote/home',
+ 'channel': 'Scare The Coyote',
+ 'channel_id': '683e0a326968866ceba49a45',
+ 'channel_url': 'https://www.sauceplus.com/channel/ScareTheCoyote/home/main',
+ 'availability': 'subscriber_only',
+ },
+ 'params': {'skip_download': 'm3u8'},
+ }]
+
+ def _real_initialize(self):
+ if not self._get_cookies(self._BASE_URL).get('__Host-sp-sess'):
+ self.raise_login_required()
diff --git a/yt_dlp/extractor/skyit.py b/yt_dlp/extractor/skyit.py
index 0013d2621..fe45be774 100644
--- a/yt_dlp/extractor/skyit.py
+++ b/yt_dlp/extractor/skyit.py
@@ -213,7 +213,7 @@ class CieloTVItIE(SkyItIE): # XXX: Do not subclass from concrete IE
class TV8ItIE(SkyItVideoIE): # XXX: Do not subclass from concrete IE
IE_NAME = 'tv8.it'
- _VALID_URL = r'https?://(?:www\.)?tv8\.it/(?:show)?video/[0-9a-z-]+-(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?tv8\.it/(?:show)?video/(?:[0-9a-z-]+-)?(?P<id>\d+)'
_TESTS = [{
'url': 'https://www.tv8.it/video/ogni-mattina-ucciso-asino-di-andrea-lo-cicero-630529',
'md5': '9ab906a3f75ea342ed928442f9dabd21',
@@ -227,6 +227,19 @@ class TV8ItIE(SkyItVideoIE): # XXX: Do not subclass from concrete IE
'thumbnail': 'https://videoplatform.sky.it/still/2020/11/18/1605717753954_ogni-mattina-ucciso-asino-di-andrea-lo-cicero_videostill_1.jpg',
},
'params': {'skip_download': 'm3u8'},
+ }, {
+ 'url': 'https://www.tv8.it/video/964361',
+ 'md5': '1e58e807154658a16edc29e45be38107',
+ 'info_dict': {
+ 'id': '964361',
+ 'ext': 'mp4',
+ 'title': 'GialappaShow - S.4 Ep.2',
+ 'description': 'md5:60bb4ff5af18bbeeaedabc1de5f9e1e2',
+ 'duration': 8030,
+ 'thumbnail': 'https://videoplatform.sky.it/captures/494/2024/11/06/964361/964361_1730888412914_thumb_494.jpg',
+ 'timestamp': 1730821499,
+ 'upload_date': '20241105',
+ },
}]
_DOMAIN = 'mtv8'
diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py
index 3496a08ef..404e29897 100644
--- a/yt_dlp/extractor/soundcloud.py
+++ b/yt_dlp/extractor/soundcloud.py
@@ -242,7 +242,7 @@ def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_f
format_urls.add(format_url)
formats.append({
'format_id': 'download',
- 'ext': urlhandle_detect_ext(urlh, default='mp3'),
+ 'ext': urlhandle_detect_ext(urlh),
'filesize': int_or_none(urlh.headers.get('Content-Length')),
'url': format_url,
'quality': 10,
diff --git a/yt_dlp/extractor/sportdeutschland.py b/yt_dlp/extractor/sportdeutschland.py
index 2d6acb876..8349d9604 100644
--- a/yt_dlp/extractor/sportdeutschland.py
+++ b/yt_dlp/extractor/sportdeutschland.py
@@ -25,6 +25,7 @@ class SportDeutschlandIE(InfoExtractor):
'upload_date': '20230114',
'timestamp': 1673733618,
},
+ 'skip': 'not found',
}, {
'url': 'https://sportdeutschland.tv/deutscherbadmintonverband/bwf-tour-1-runde-feld-1-yonex-gainward-german-open-2022-0',
'info_dict': {
@@ -41,6 +42,7 @@ class SportDeutschlandIE(InfoExtractor):
'upload_date': '20220309',
'timestamp': 1646860727.0,
},
+ 'skip': 'not found',
}, {
'url': 'https://sportdeutschland.tv/ggcbremen/formationswochenende-latein-2023',
'info_dict': {
@@ -68,6 +70,7 @@ class SportDeutschlandIE(InfoExtractor):
'live_status': 'was_live',
},
}],
+ 'skip': 'not found',
}, {
'url': 'https://sportdeutschland.tv/dtb/gymnastik-international-tag-1',
'info_dict': {
@@ -82,13 +85,30 @@ class SportDeutschlandIE(InfoExtractor):
'live_status': 'is_live',
},
'skip': 'live',
+ }, {
+ 'url': 'https://sportdeutschland.tv/rostock-griffins/gfl2-rostock-griffins-vs-elmshorn-fighting-pirates',
+ 'md5': '35c11a19395c938cdd076b93bda54cde',
+ 'info_dict': {
+ 'id': '9f27a97d-1544-4d0b-aa03-48d92d17a03a',
+ 'ext': 'mp4',
+ 'title': 'GFL2: Rostock Griffins vs. Elmshorn Fighting Pirates',
+ 'display_id': 'rostock-griffins/gfl2-rostock-griffins-vs-elmshorn-fighting-pirates',
+ 'channel': 'Rostock Griffins',
+ 'channel_url': 'https://sportdeutschland.tv/rostock-griffins',
+ 'live_status': 'was_live',
+ 'description': 'md5:60cb00067e55dafa27b0933a43d72862',
+ 'channel_id': '9635f21c-3f67-4584-9ce4-796e9a47276b',
+ 'timestamp': 1749913117,
+ 'upload_date': '20250614',
+ },
}]
def _process_video(self, asset_id, video):
is_live = video['type'] == 'mux_live'
token = self._download_json(
- f'https://api.sportdeutschland.tv/api/frontend/asset-token/{asset_id}',
- video['id'], query={'type': video['type'], 'playback_id': video['src']})['token']
+ f'https://api.sportdeutschland.tv/api/web/personal/asset-token/{asset_id}',
+ video['id'], query={'type': video['type'], 'playback_id': video['src']},
+ headers={'Referer': 'https://sportdeutschland.tv/'})['token']
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
f'https://stream.mux.com/{video["src"]}.m3u8?token={token}', video['id'], live=is_live)
diff --git a/yt_dlp/extractor/sproutvideo.py b/yt_dlp/extractor/sproutvideo.py
index c0923594e..494042738 100644
--- a/yt_dlp/extractor/sproutvideo.py
+++ b/yt_dlp/extractor/sproutvideo.py
@@ -41,6 +41,7 @@ class SproutVideoIE(InfoExtractor):
'duration': 703,
'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
},
+ 'skip': 'Account Disabled',
}, {
# http formats 'sd' and 'hd' are available
'url': 'https://videos.sproutvideo.com/embed/119cd6bc1a18e6cd98/30751a1761ae5b90',
@@ -98,10 +99,17 @@ def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
video_id = self._match_id(url)
webpage = self._download_webpage(
- url, video_id, headers=traverse_obj(smuggled_data, {'Referer': 'referer'}))
+ url, video_id, headers=traverse_obj(smuggled_data, {'Referer': 'referer'}), impersonate=True)
data = self._search_json(
- r'var\s+dat\s*=\s*["\']', webpage, 'data', video_id, contains_pattern=r'[A-Za-z0-9+/=]+',
- end_pattern=r'["\'];', transform_source=lambda x: base64.b64decode(x).decode())
+ r'(?:var|const|let)\s+(?:dat|playerInfo)\s*=\s*["\']', webpage, 'player info', video_id,
+ contains_pattern=r'[A-Za-z0-9+/=]+', end_pattern=r'["\'];',
+ transform_source=lambda x: base64.b64decode(x).decode())
+
+ # SproutVideo may send player info for 'SMPTE Color Monitor Test' [a791d7b71b12ecc52e]
+ # e.g. if the user-agent we used with the webpage request is too old
+ video_uid = data['videoUid']
+ if video_id != video_uid:
+ raise ExtractorError(f'{self.IE_NAME} sent the wrong video data ({video_uid})')
formats, subtitles = [], {}
headers = {
diff --git a/yt_dlp/extractor/srmediathek.py b/yt_dlp/extractor/srmediathek.py
index fc63d9b1a..d6cab6ae7 100644
--- a/yt_dlp/extractor/srmediathek.py
+++ b/yt_dlp/extractor/srmediathek.py
@@ -1,57 +1,102 @@
from .ard import ARDMediathekBaseIE
from ..utils import (
ExtractorError,
- get_element_by_attribute,
+ clean_html,
+ extract_attributes,
+ parse_duration,
+ parse_qs,
+ unified_strdate,
+)
+from ..utils.traversal import (
+ find_element,
+ require,
+ traverse_obj,
)
class SRMediathekIE(ARDMediathekBaseIE):
- _WORKING = False
IE_NAME = 'sr:mediathek'
IE_DESC = 'Saarländischer Rundfunk'
- _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)'
+ _CLS_COMMON = 'teaser__image__caption__text teaser__image__caption__text--'
+ _VALID_URL = r'https?://(?:www\.)?sr-mediathek\.de/index\.php\?.*?&id=(?P<id>\d+)'
_TESTS = [{
- 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455',
+ 'url': 'https://www.sr-mediathek.de/index.php?seite=7&id=141317',
'info_dict': {
- 'id': '28455',
+ 'id': '141317',
'ext': 'mp4',
- 'title': 'sportarena (26.10.2014)',
- 'description': 'Ringen: KSV Köllerbach gegen Aachen-Walheim; Frauen-Fußball: 1. FC Saarbrücken gegen Sindelfingen; Motorsport: Rallye in Losheim; dazu: Interview mit Timo Bernhard; Turnen: TG Saar; Reitsport: Deutscher Voltigier-Pokal; Badminton: Interview mit Michael Fuchs ',
- 'thumbnail': r're:^https?://.*\.jpg$',
- },
- 'skip': 'no longer available',
- }, {
- 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=37682',
- 'info_dict': {
- 'id': '37682',
- 'ext': 'mp4',
- 'title': 'Love, Cakes and Rock\'n\'Roll',
- 'description': 'md5:18bf9763631c7d326c22603681e1123d',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
+ 'title': 'Kärnten, da will ich hin!',
+ 'channel': 'SR Fernsehen',
+ 'description': 'md5:7732e71e803379a499732864a572a456',
+ 'duration': 1788.0,
+ 'release_date': '20250525',
+ 'series': 'da will ich hin!',
+ 'series_id': 'DWIH',
+ 'thumbnail': r're:https?://.+\.jpg',
},
}, {
- 'url': 'http://sr-mediathek.de/index.php?seite=7&id=7480',
- 'only_matching': True,
+ 'url': 'https://www.sr-mediathek.de/index.php?seite=7&id=153853',
+ 'info_dict': {
+ 'id': '153853',
+ 'ext': 'mp3',
+ 'title': 'Kappes, Klöße, Kokosmilch: Bruschetta mit Nduja',
+ 'channel': 'SR 3',
+ 'description': 'md5:3935798de3562b10c4070b408a15e225',
+ 'duration': 139.0,
+ 'release_date': '20250523',
+ 'series': 'Kappes, Klöße, Kokosmilch',
+ 'series_id': 'SR3_KKK_A',
+ 'thumbnail': r're:https?://.+\.jpg',
+ },
+ }, {
+ 'url': 'https://www.sr-mediathek.de/index.php?seite=7&id=31406&pnr=&tbl=pf',
+ 'info_dict': {
+ 'id': '31406',
+ 'ext': 'mp3',
+ 'title': 'Das Leben schwer nehmen, ist einfach zu anstrengend',
+ 'channel': 'SR 1',
+ 'description': 'md5:3e03fd556af831ad984d0add7175fb0c',
+ 'duration': 1769.0,
+ 'release_date': '20230717',
+ 'series': 'Abendrot',
+ 'series_id': 'SR1_AB_P',
+ 'thumbnail': r're:https?://.+\.jpg',
+ },
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ description = self._og_search_description(webpage)
- if '>Der gewünschte Beitrag ist leider nicht mehr verfügbar.<' in webpage:
+ if description == 'Der gewünschte Beitrag ist leider nicht mehr vorhanden.':
raise ExtractorError(f'Video {video_id} is no longer available', expected=True)
- media_collection_url = self._search_regex(
- r'data-mediacollection-ardplayer="([^"]+)"', webpage, 'media collection url')
- info = self._extract_media_info(media_collection_url, webpage, video_id)
- info.update({
+ player_url = traverse_obj(webpage, (
+ {find_element(tag='div', id=f'player{video_id}', html=True)},
+ {extract_attributes}, 'data-mediacollection-ardplayer',
+ {self._proto_relative_url}, {require('player URL')}))
+ article = traverse_obj(webpage, (
+ {find_element(cls='article__content')},
+ {find_element(tag='p')}, {clean_html}))
+
+ return {
+ **self._extract_media_info(player_url, webpage, video_id),
'id': video_id,
- 'title': get_element_by_attribute('class', 'ardplayer-title', webpage),
- 'description': self._og_search_description(webpage),
+ 'title': traverse_obj(webpage, (
+ {find_element(cls='ardplayer-title')}, {clean_html})),
+ 'channel': traverse_obj(webpage, (
+ {find_element(cls=f'{self._CLS_COMMON}subheadline')},
+ {lambda x: x.split('|')[0]}, {clean_html})),
+ 'description': description,
+ 'duration': parse_duration(self._search_regex(
+ r'(\d{2}:\d{2}:\d{2})', article, 'duration')),
+ 'release_date': unified_strdate(self._search_regex(
+ r'(\d{2}\.\d{2}\.\d{4})', article, 'release_date')),
+ 'series': traverse_obj(webpage, (
+ {find_element(cls=f'{self._CLS_COMMON}headline')}, {clean_html})),
+ 'series_id': traverse_obj(webpage, (
+ {find_element(cls='teaser__link', html=True)},
+ {extract_attributes}, 'href', {parse_qs}, 'sen', ..., {str}, any)),
'thumbnail': self._og_search_thumbnail(webpage),
- })
- return info
+ }
diff --git a/yt_dlp/extractor/stacommu.py b/yt_dlp/extractor/stacommu.py
index 830018518..e6866f151 100644
--- a/yt_dlp/extractor/stacommu.py
+++ b/yt_dlp/extractor/stacommu.py
@@ -4,6 +4,7 @@
from ..utils import (
int_or_none,
traverse_obj,
+ url_basename,
url_or_none,
)
@@ -65,9 +66,19 @@ def _extract_ppv(self, url):
hls_info, decrypt = self._call_encrypted_api(
video_id, ':watchArchive', 'stream information', data={'method': 1})
+ formats = self._get_formats(hls_info, ('hls', 'urls', ..., {url_or_none}), video_id)
+ for f in formats:
+ # bitrates are exaggerated in PPV playlists, so avoid wrong/huge filesize_approx values
+ if f.get('tbr'):
+ f['tbr'] = int(f['tbr'] / 2.5)
+ # prefer variants with the same basename as the master playlist to avoid partial streams
+ f['format_id'] = url_basename(f['url']).partition('.')[0]
+ if not f['format_id'].startswith(url_basename(f['manifest_url']).partition('.')[0]):
+ f['preference'] = -10
+
return {
'id': video_id,
- 'formats': self._get_formats(hls_info, ('hls', 'urls', ..., {url_or_none}), video_id),
+ 'formats': formats,
'hls_aes': self._extract_hls_key(hls_info, 'hls', decrypt),
**traverse_obj(video_info, {
'title': ('displayName', {str}),
diff --git a/yt_dlp/extractor/startrek.py b/yt_dlp/extractor/startrek.py
index c59187173..802702d44 100644
--- a/yt_dlp/extractor/startrek.py
+++ b/yt_dlp/extractor/startrek.py
@@ -1,76 +1,76 @@
from .common import InfoExtractor
-from ..utils import int_or_none, urljoin
+from .youtube import YoutubeIE
+from ..utils import (
+ clean_html,
+ parse_iso8601,
+ update_url,
+ url_or_none,
+)
+from ..utils.traversal import subs_list_to_dict, traverse_obj
class StarTrekIE(InfoExtractor):
- _WORKING = False
- _VALID_URL = r'(?P<base>https?://(?:intl|www)\.startrek\.com)/videos/(?P<id>[^/]+)'
+ IE_NAME = 'startrek'
+ IE_DESC = 'STAR TREK'
+ _VALID_URL = r'https?://(?:www\.)?startrek\.com(?:/en-(?:ca|un))?/videos/(?P<id>[^/?#]+)'
_TESTS = [{
- 'url': 'https://intl.startrek.com/videos/watch-welcoming-jess-bush-to-the-ready-room',
- 'md5': '491df5035c9d4dc7f63c79caaf9c839e',
+ 'url': 'https://www.startrek.com/en-un/videos/official-trailer-star-trek-lower-decks-season-4',
'info_dict': {
- 'id': 'watch-welcoming-jess-bush-to-the-ready-room',
+ 'id': 'official-trailer-star-trek-lower-decks-season-4',
'ext': 'mp4',
- 'title': 'WATCH: Welcoming Jess Bush to The Ready Room',
- 'duration': 1888,
- 'timestamp': 1655388000,
- 'upload_date': '20220616',
- 'description': 'md5:1ffee884e3920afbdd6dd04e926a1221',
- 'thumbnail': r're:https://(?:intl|www)\.startrek\.com/sites/default/files/styles/video_1920x1080/public/images/2022-06/pp_14794_rr_thumb_107_yt_16x9\.jpg(?:\?.+)?',
- 'subtitles': {'en-US': [{
- 'url': r're:https://(?:intl|www)\.startrek\.com/sites/default/files/video/captions/2022-06/TRR_SNW_107_v4\.vtt',
- }, {
- 'url': 'https://media.startrek.com/2022/06/16/2043801155561/1069981_hls/trr_snw_107_v4-c4bfc25d/stream_vtt.m3u8',
- }]},
+ 'title': 'Official Trailer | Star Trek: Lower Decks - Season 4',
+ 'alt_title': 'md5:dd7e3191aaaf9e95db16fc3abd5ef68b',
+ 'categories': ['TRAILERS'],
+ 'description': 'md5:563d7856ddab99bee7a5e50f45531757',
+ 'release_date': '20230722',
+ 'release_timestamp': 1690033200,
+ 'series': 'Star Trek: Lower Decks',
+ 'series_id': 'star-trek-lower-decks',
+ 'thumbnail': r're:https?://.+\.(?:jpg|png)',
},
}, {
- 'url': 'https://www.startrek.com/videos/watch-ethan-peck-and-gia-sandhu-beam-down-to-the-ready-room',
- 'md5': 'f5ad74fbb86e91e0882fc0a333178d1d',
+ 'url': 'https://www.startrek.com/en-ca/videos/my-first-contact-senator-cory-booker',
'info_dict': {
- 'id': 'watch-ethan-peck-and-gia-sandhu-beam-down-to-the-ready-room',
+ 'id': 'my-first-contact-senator-cory-booker',
'ext': 'mp4',
- 'title': 'WATCH: Ethan Peck and Gia Sandhu Beam Down to The Ready Room',
- 'duration': 1986,
- 'timestamp': 1654221600,
- 'upload_date': '20220603',
- 'description': 'md5:b3aa0edacfe119386567362dec8ed51b',
- 'thumbnail': r're:https://www\.startrek\.com/sites/default/files/styles/video_1920x1080/public/images/2022-06/pp_14792_rr_thumb_105_yt_16x9_1.jpg(?:\?.+)?',
- 'subtitles': {'en-US': [{
- 'url': r're:https://(?:intl|www)\.startrek\.com/sites/default/files/video/captions/2022-06/TRR_SNW_105_v5\.vtt',
- }]},
+ 'title': 'My First Contact: Senator Cory Booker',
+ 'alt_title': 'md5:fe74a8bdb0afab421c6e159a7680db4d',
+ 'categories': ['MY FIRST CONTACT'],
+ 'description': 'md5:a3992ab3b3e0395925d71156bbc018ce',
+ 'release_date': '20250401',
+ 'release_timestamp': 1743512400,
+ 'series': 'Star Trek: The Original Series',
+ 'series_id': 'star-trek-the-original-series',
+ 'thumbnail': r're:https?://.+\.(?:jpg|png)',
},
}]
def _real_extract(self, url):
- urlbase, video_id = self._match_valid_url(url).group('base', 'id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- player = self._search_regex(
- r'(<\s*div\s+id\s*=\s*"cvp-player-[^<]+<\s*/div\s*>)', webpage, 'player')
+ page_props = self._search_nextjs_data(webpage, video_id)['props']['pageProps']
+ video_data = page_props['video']['data']
+ if youtube_id := video_data.get('youtube_video_id'):
+ return self.url_result(youtube_id, YoutubeIE)
- hls = self._html_search_regex(r'\bdata-hls\s*=\s*"([^"]+)"', player, 'HLS URL')
- formats, subtitles = self._extract_m3u8_formats_and_subtitles(hls, video_id, 'mp4')
-
- captions = self._html_search_regex(
- r'\bdata-captions-url\s*=\s*"([^"]+)"', player, 'captions URL', fatal=False)
- if captions:
- subtitles.setdefault('en-US', [])[:0] = [{'url': urljoin(urlbase, captions)}]
-
- # NB: Most of the data in the json_ld is undesirable
- json_ld = self._search_json_ld(webpage, video_id, fatal=False)
+ series_id = traverse_obj(video_data, (
+ 'series_and_movies', ..., 'series_or_movie', 'slug', {str}, any))
return {
'id': video_id,
- 'title': self._html_search_regex(
- r'\bdata-title\s*=\s*"([^"]+)"', player, 'title', json_ld.get('title')),
- 'description': self._html_search_regex(
- r'(?s)<\s*div\s+class\s*=\s*"header-body"\s*>(.+?)<\s*/div\s*>',
- webpage, 'description', fatal=False),
- 'duration': int_or_none(self._html_search_regex(
- r'\bdata-duration\s*=\s*"(\d+)"', player, 'duration', fatal=False)),
- 'formats': formats,
- 'subtitles': subtitles,
- 'thumbnail': urljoin(urlbase, self._html_search_regex(
- r'\bdata-poster-url\s*=\s*"([^"]+)"', player, 'thumbnail', fatal=False)),
- 'timestamp': json_ld.get('timestamp'),
+ 'series': traverse_obj(page_props, (
+ 'queried', 'header', 'tab3', 'slices', ..., 'items',
+ lambda _, v: v['link']['slug'] == series_id, 'link_copy', {str}, any)),
+ 'series_id': series_id,
+ **traverse_obj(video_data, {
+ 'title': ('title', ..., 'text', {clean_html}, any),
+ 'alt_title': ('subhead', ..., 'text', {clean_html}, any),
+ 'categories': ('category', 'data', 'category_name', {str.upper}, filter, all),
+ 'description': ('slices', ..., 'primary', 'content', ..., 'text', {clean_html}, any),
+ 'release_timestamp': ('published', {parse_iso8601}),
+ 'subtitles': ({'url': 'legacy_subtitle_file'}, all, {subs_list_to_dict(lang='en')}),
+ 'thumbnail': ('poster_frame', 'url', {url_or_none}, {update_url(query=None)}),
+ 'url': ('legacy_video_url', {url_or_none}),
+ }),
}
diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py
index 6a72f8d42..a48d7858d 100644
--- a/yt_dlp/extractor/svt.py
+++ b/yt_dlp/extractor/svt.py
@@ -6,10 +6,13 @@
determine_ext,
dict_get,
int_or_none,
- traverse_obj,
try_get,
unified_timestamp,
)
+from ..utils.traversal import (
+ require,
+ traverse_obj,
+)
class SVTBaseIE(InfoExtractor):
@@ -97,40 +100,8 @@ def _extract_video(self, video_info, video_id):
}
-class SVTIE(SVTBaseIE):
- _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)'
- _EMBED_REGEX = [rf'(?: