diff --git a/yt_dlp/extractor/telecinco.py b/yt_dlp/extractor/telecinco.py
index bdcae3b774..6846191bcb 100644
--- a/yt_dlp/extractor/telecinco.py
+++ b/yt_dlp/extractor/telecinco.py
@@ -6,20 +6,21 @@ from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
     clean_html,
+    extract_attributes,
     int_or_none,
     join_nonempty,
     str_or_none,
-    traverse_obj,
     update_url,
     url_or_none,
 )
+from ..utils.traversal import traverse_obj
 
 
 class TelecincoBaseIE(InfoExtractor):
     def _parse_content(self, content, url):
-        video_id = content['dataMediaId']
+        video_id = content['dataMediaId'][1]
         config = self._download_json(
-            content['dataConfig'], video_id, 'Downloading config JSON')
+            content['dataConfig'][1], video_id, 'Downloading config JSON')
         services = config['services']
         caronte = self._download_json(services['caronte'], video_id)
         if traverse_obj(caronte, ('dls', 0, 'drm', {bool})):
@@ -57,9 +58,9 @@ class TelecincoBaseIE(InfoExtractor):
             'id': video_id,
             'title': traverse_obj(config, ('info', 'title', {str})),
             'formats': formats,
-            'thumbnail': (traverse_obj(content, ('dataPoster', {url_or_none}))
+            'thumbnail': (traverse_obj(content, ('dataPoster', 1, {url_or_none}))
                           or traverse_obj(config, 'poster', 'imageUrl', expected_type=url_or_none)),
-            'duration': traverse_obj(content, ('dataDuration', {int_or_none})),
+            'duration': traverse_obj(content, ('dataDuration', 1, {int_or_none})),
             'http_headers': headers,
         }
 
@@ -137,30 +138,45 @@ class TelecincoIE(TelecincoBaseIE):
         'url': 'http://www.cuatro.com/chesterinlove/a-carta/chester-chester_in_love-chester_edu_2_2331030022.html',
         'only_matching': True,
     }]
+    _ASTRO_ISLAND_RE = re.compile(r'<astro-island[^>]+>')
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id, impersonate=True)
-        article = self._search_json(
-            r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=',
-            webpage, 'article', display_id)['article']
-        description = traverse_obj(article, ('leadParagraph', {clean_html}, filter))
-        if article.get('editorialType') != 'VID':
+        props_list = traverse_obj(webpage, (
+            {self._ASTRO_ISLAND_RE.findall}, ...,
+            {extract_attributes}, 'props', {json.loads}))
+
+        description = traverse_obj(props_list, (..., 'leadParagraph', 1, {clean_html}, any, filter))
+        main_content = traverse_obj(props_list, (..., ('content', ('articleData', 1, 'opening')), 1, {dict}, any))
+
+        if traverse_obj(props_list, (..., 'editorialType', 1, {str}, any)) != 'VID':  # e.g. 'ART'
             entries = []
-            for p in traverse_obj(article, ((('opening', all), 'body'), lambda _, v: v['content'])):
-                content = p['content']
-                type_ = p.get('type')
-                if type_ == 'paragraph' and isinstance(content, str):
+            for p in traverse_obj(props_list, (..., 'articleData', 1, ('opening', ('body', 1, ...)), 1, {dict})):
+                type_ = traverse_obj(p, ('type', 1, {str}))
+                content = traverse_obj(p, ('content', 1, {str} if type_ == 'paragraph' else {dict}))
+                if not content:
+                    continue
+                if type_ == 'paragraph':
                     description = join_nonempty(description, content, delim='')
-                elif type_ == 'video' and isinstance(content, dict):
+                elif type_ == 'video':
                     entries.append(self._parse_content(content, url))
+                else:
+                    self.report_warning(
+                        f'Skipping unsupported content type "{type_}"', display_id, only_once=True)
 
             return self.playlist_result(
-                entries, str_or_none(article.get('id')),
-                traverse_obj(article, ('title', {str})), clean_html(description))
+                entries,
+                traverse_obj(props_list, (..., 'id', 1, {int}, {str_or_none}, any)) or display_id,
+                traverse_obj(main_content, ('dataTitle', 1, {str})),
+                clean_html(description))
 
-        info = self._parse_content(article['opening']['content'], url)
+        if not main_content:
+            raise ExtractorError('Unable to extract main content from webpage')
+
+        info = self._parse_content(main_content, url)
         info['description'] = description
+
         return info
 
 
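Note: the `props_list` traversal added in `_real_extract` is dense in diff form, so here is a minimal, standalone sketch of the same idea. The `<astro-island>` fragment below is hypothetical (not real Telecinco markup), and the `[0, value]` pair encoding of each prop is an assumption chosen only to mirror the `1` index the patch uses; the `extract_attributes` and `traverse_obj` imports are the same yt-dlp utilities the patch relies on.

```py
import json
import re

from yt_dlp.utils import extract_attributes
from yt_dlp.utils.traversal import traverse_obj

# Mirrors the patch's _ASTRO_ISLAND_RE: match each <astro-island ...> opening tag
ASTRO_ISLAND_RE = re.compile(r'<astro-island[^>]+>')

# Hypothetical, simplified page fragment; real pages embed component props
# as a JSON string in the "props" attribute of each Astro island
webpage = '''
<astro-island uid="x1" props='{"editorialType": [0, "VID"],
    "content": [0, {"dataMediaId": [0, "1234567890"]}]}'></astro-island>
'''

# Same traversal shape as in _real_extract: find every island opening tag,
# parse its HTML attributes, then JSON-decode the "props" attribute
props_list = traverse_obj(webpage, (
    {ASTRO_ISLAND_RE.findall}, ...,
    {extract_attributes}, 'props', {json.loads}))

print(props_list[0]['editorialType'][1])           # VID
print(props_list[0]['content'][1]['dataMediaId'])  # [0, '1234567890']
```

Because the `...` step branches over every matched island, `props_list` ends up as a list of decoded `props` dicts, one per island, which the rest of `_real_extract` then searches with `any` to pick the first island carrying the field it needs.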