1
0
mirror of https://github.com/yt-dlp/yt-dlp.git synced 2025-10-25 11:40:59 +00:00
yt-dlp/yt_dlp/extractor/msn.py
thedenv 4815dac131
[ie/msn] Rework extractor (#12513)
Closes #3225
Authored by: thedenv, seproDev

Co-authored-by: sepro <sepro@sepr0.com>
2025-03-16 19:54:46 +01:00

216 lines
9.5 KiB
Python

from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
determine_ext,
int_or_none,
parse_iso8601,
url_or_none,
)
from ..utils.traversal import traverse_obj
class MSNIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www|preview)\.)?msn\.com/(?P<locale>[a-z]{2}-[a-z]{2})/(?:[^/?#]+/)+(?P<display_id>[^/?#]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)'
_TESTS = [{
'url': 'https://www.msn.com/en-gb/video/news/president-macron-interrupts-trump-over-ukraine-funding/vi-AA1zMcD7',
'info_dict': {
'id': 'AA1zMcD7',
'ext': 'mp4',
'display_id': 'president-macron-interrupts-trump-over-ukraine-funding',
'title': 'President Macron interrupts Trump over Ukraine funding',
'description': 'md5:5fd3857ac25849e7a56cb25fbe1a2a8b',
'uploader': 'k! News UK',
'uploader_id': 'BB1hz5Rj',
'duration': 59,
'thumbnail': 'https://img-s-msn-com.akamaized.net/tenant/amp/entityid/AA1zMagX.img',
'tags': 'count:14',
'timestamp': 1740510914,
'upload_date': '20250225',
'release_timestamp': 1740513600,
'release_date': '20250225',
'modified_timestamp': 1741413241,
'modified_date': '20250308',
},
}, {
'url': 'https://www.msn.com/en-gb/video/watch/films-success-saved-adam-pearsons-acting-career/vi-AA1znZGE?ocid=hpmsn',
'info_dict': {
'id': 'AA1znZGE',
'ext': 'mp4',
'display_id': 'films-success-saved-adam-pearsons-acting-career',
'title': "Films' success saved Adam Pearson's acting career",
'description': 'md5:98c05f7bd9ab4f9c423400f62f2d3da5',
'uploader': 'Sky News',
'uploader_id': 'AA2eki',
'duration': 52,
'thumbnail': 'https://img-s-msn-com.akamaized.net/tenant/amp/entityid/AA1zo7nU.img',
'timestamp': 1739993965,
'upload_date': '20250219',
'release_timestamp': 1739977753,
'release_date': '20250219',
'modified_timestamp': 1742076259,
'modified_date': '20250315',
},
}, {
'url': 'https://www.msn.com/en-us/entertainment/news/rock-frontman-replacements-you-might-not-know-happened/vi-AA1yLVcD',
'info_dict': {
'id': 'AA1yLVcD',
'ext': 'mp4',
'display_id': 'rock-frontman-replacements-you-might-not-know-happened',
'title': 'Rock Frontman Replacements You Might Not Know Happened',
'description': 'md5:451a125496ff0c9f6816055bb1808da9',
'uploader': 'Grunge (Video)',
'uploader_id': 'BB1oveoV',
'duration': 596,
'thumbnail': 'https://img-s-msn-com.akamaized.net/tenant/amp/entityid/AA1yM4OJ.img',
'timestamp': 1739223456,
'upload_date': '20250210',
'release_timestamp': 1739219731,
'release_date': '20250210',
'modified_timestamp': 1741427272,
'modified_date': '20250308',
},
}, {
# Dailymotion Embed
'url': 'https://www.msn.com/de-de/nachrichten/other/the-first-descendant-gameplay-trailer-zu-serena-der-neuen-gefl%C3%BCgelten-nachfahrin/vi-AA1B1d06',
'info_dict': {
'id': 'x9g6oli',
'ext': 'mp4',
'title': 'The First Descendant: Gameplay-Trailer zu Serena, der neuen geflügelten Nachfahrin',
'description': '',
'uploader': 'MeinMMO',
'uploader_id': 'x2mvqi4',
'view_count': int,
'like_count': int,
'age_limit': 0,
'duration': 60,
'thumbnail': 'https://s1.dmcdn.net/v/Y3fO61drj56vPB9SS/x1080',
'tags': ['MeinMMO', 'The First Descendant'],
'timestamp': 1742124877,
'upload_date': '20250316',
},
}, {
# Youtube Embed
'url': 'https://www.msn.com/en-gb/video/webcontent/web-content/vi-AA1ybFaJ',
'info_dict': {
'id': 'kQSChWu95nE',
'ext': 'mp4',
'title': '7 Daily Habits to Nurture Your Personal Growth',
'description': 'md5:6f233c68341b74dee30c8c121924e827',
'uploader': 'TopThink',
'uploader_id': '@TopThink',
'uploader_url': 'https://www.youtube.com/@TopThink',
'channel': 'TopThink',
'channel_id': 'UCMlGmHokrQRp-RaNO7aq4Uw',
'channel_url': 'https://www.youtube.com/channel/UCMlGmHokrQRp-RaNO7aq4Uw',
'channel_is_verified': True,
'channel_follower_count': int,
'comment_count': int,
'view_count': int,
'like_count': int,
'age_limit': 0,
'duration': 705,
'thumbnail': 'https://i.ytimg.com/vi/kQSChWu95nE/maxresdefault.jpg',
'categories': ['Howto & Style'],
'tags': ['topthink', 'top think', 'personal growth'],
'timestamp': 1722711620,
'upload_date': '20240803',
'playable_in_embed': True,
'availability': 'public',
'live_status': 'not_live',
},
}, {
# Article with social embed
'url': 'https://www.msn.com/en-in/news/techandscience/watch-earth-sets-and-rises-behind-moon-in-breathtaking-blue-ghost-video/ar-AA1zKoAc',
'info_dict': {
'id': 'AA1zKoAc',
'title': 'Watch: Earth sets and rises behind Moon in breathtaking Blue Ghost video',
'description': 'md5:0ad51cfa77e42e7f0c46cf98a619dbbf',
'uploader': 'India Today',
'uploader_id': 'AAyFWG',
'tags': 'count:11',
'timestamp': 1740485034,
'upload_date': '20250225',
'release_timestamp': 1740484875,
'release_date': '20250225',
'modified_timestamp': 1740488561,
'modified_date': '20250225',
},
'playlist_count': 1,
}]
def _real_extract(self, url):
locale, display_id, page_id = self._match_valid_url(url).group('locale', 'display_id', 'id')
json_data = self._download_json(
f'https://assets.msn.com/content/view/v2/Detail/{locale}/{page_id}', page_id)
common_metadata = traverse_obj(json_data, {
'title': ('title', {str}),
'description': (('abstract', ('body', {clean_html})), {str}, filter, any),
'timestamp': ('createdDateTime', {parse_iso8601}),
'release_timestamp': ('publishedDateTime', {parse_iso8601}),
'modified_timestamp': ('updatedDateTime', {parse_iso8601}),
'thumbnail': ('thumbnail', 'image', 'url', {url_or_none}),
'duration': ('videoMetadata', 'playTime', {int_or_none}),
'tags': ('keywords', ..., {str}),
'uploader': ('provider', 'name', {str}),
'uploader_id': ('provider', 'id', {str}),
})
page_type = json_data['type']
source_url = traverse_obj(json_data, ('sourceHref', {url_or_none}))
if page_type == 'video':
if traverse_obj(json_data, ('thirdPartyVideoPlayer', 'enabled')) and source_url:
return self.url_result(source_url)
formats = []
subtitles = {}
for file in traverse_obj(json_data, ('videoMetadata', 'externalVideoFiles', lambda _, v: url_or_none(v['url']))):
file_url = file['url']
ext = determine_ext(file_url)
if ext == 'm3u8':
fmts, subs = self._extract_m3u8_formats_and_subtitles(
file_url, page_id, 'mp4', m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
elif ext == 'mpd':
fmts, subs = self._extract_mpd_formats_and_subtitles(
file_url, page_id, mpd_id='dash', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
else:
formats.append(
traverse_obj(file, {
'url': 'url',
'format_id': ('format', {str}),
'filesize': ('fileSize', {int_or_none}),
'height': ('height', {int_or_none}),
'width': ('width', {int_or_none}),
}))
for caption in traverse_obj(json_data, ('videoMetadata', 'closedCaptions', lambda _, v: url_or_none(v['href']))):
lang = caption.get('locale') or 'en-us'
subtitles.setdefault(lang, []).append({
'url': caption['href'],
'ext': 'ttml',
})
return {
'id': page_id,
'display_id': display_id,
'formats': formats,
'subtitles': subtitles,
**common_metadata,
}
elif page_type == 'webcontent':
if not source_url:
raise ExtractorError('Could not find source URL')
return self.url_result(source_url)
elif page_type == 'article':
entries = []
for embed_url in traverse_obj(json_data, ('socialEmbeds', ..., 'postUrl', {url_or_none})):
entries.append(self.url_result(embed_url))
return self.playlist_result(entries, page_id, **common_metadata)
raise ExtractorError(f'Unsupported page type: {page_type}')