mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 06:35:12 +00:00 
			
		
		
		
	[extractor/rai] Fix RaiNews extraction (#4380)
Authored by: nixxo Closes #3911
This commit is contained in:
		| @@ -6,6 +6,7 @@ from ..compat import ( | ||||
|     compat_urlparse, | ||||
| ) | ||||
| from ..utils import ( | ||||
|     clean_html, | ||||
|     determine_ext, | ||||
|     ExtractorError, | ||||
|     filter_dict, | ||||
| @@ -45,7 +46,7 @@ class RaiBaseIE(InfoExtractor): | ||||
|         for platform in ('mon', 'flash', 'native'): | ||||
|             relinker = self._download_xml( | ||||
|                 relinker_url, video_id, | ||||
|                 note='Downloading XML metadata for platform %s' % platform, | ||||
|                 note=f'Downloading XML metadata for platform {platform}', | ||||
|                 transform_source=fix_xml_ampersands, | ||||
|                 query={'output': 45, 'pl': platform}, | ||||
|                 headers=self.geo_verification_headers()) | ||||
| @@ -99,7 +100,7 @@ class RaiBaseIE(InfoExtractor): | ||||
|                 formats.append({ | ||||
|                     'url': media_url, | ||||
|                     'tbr': bitrate if bitrate > 0 else None, | ||||
|                     'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', | ||||
|                     'format_id': f'http-{bitrate if bitrate > 0 else "http"}', | ||||
|                 }) | ||||
| 
 | ||||
|         if not formats and geoprotection is True: | ||||
| @@ -152,7 +153,7 @@ class RaiBaseIE(InfoExtractor): | ||||
|             br = int_or_none(tbr) | ||||
|             if len(fmts) == 1 and not br: | ||||
|                 br = fmts[0].get('tbr') | ||||
|             if br > 300: | ||||
|             if br or 0 > 300: | ||||
|                 tbr = compat_str(math.floor(br / 100) * 100) | ||||
|             else: | ||||
|                 tbr = '250' | ||||
| @@ -171,11 +172,11 @@ class RaiBaseIE(InfoExtractor): | ||||
|                 'vcodec': format_copy.get('vcodec'), | ||||
|                 'acodec': format_copy.get('acodec'), | ||||
|                 'fps': format_copy.get('fps'), | ||||
|                 'format_id': 'https-%s' % tbr, | ||||
|                 'format_id': f'https-{tbr}', | ||||
|             } if format_copy else { | ||||
|                 'width': _QUALITY[tbr][0], | ||||
|                 'height': _QUALITY[tbr][1], | ||||
|                 'format_id': 'https-%s' % tbr, | ||||
|                 'format_id': f'https-{tbr}', | ||||
|                 'tbr': int(tbr), | ||||
|             } | ||||
| 
 | ||||
| @@ -198,8 +199,8 @@ class RaiBaseIE(InfoExtractor): | ||||
|                 'url': _MP4_TMPL % (relinker_url, q), | ||||
|                 'protocol': 'https', | ||||
|                 'ext': 'mp4', | ||||
|                 **get_format_info(q) | ||||
|             } | ||||
|             fmt.update(get_format_info(q)) | ||||
|             formats.append(fmt) | ||||
|         return formats | ||||
| 
 | ||||
| @@ -230,7 +231,7 @@ class RaiBaseIE(InfoExtractor): | ||||
| 
 | ||||
| 
 | ||||
| class RaiPlayIE(RaiBaseIE): | ||||
|     _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE | ||||
|     _VALID_URL = rf'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>{RaiBaseIE._UUID_RE}))\.(?:html|json)' | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', | ||||
|         'md5': '8970abf8caf8aef4696e7b1f2adfc696', | ||||
| @@ -248,6 +249,8 @@ class RaiPlayIE(RaiBaseIE): | ||||
|             'subtitles': { | ||||
|                 'it': 'count:4', | ||||
|             }, | ||||
|             'release_year': 2022, | ||||
|             'episode': 'Espresso nel caffè - 07/04/2014', | ||||
|         }, | ||||
|         'params': { | ||||
|             'skip_download': True, | ||||
| @@ -267,6 +270,10 @@ class RaiPlayIE(RaiBaseIE): | ||||
|             'duration': 6493, | ||||
|             'series': 'Blanca', | ||||
|             'season': 'Season 1', | ||||
|             'episode_number': 1, | ||||
|             'release_year': 2021, | ||||
|             'season_number': 1, | ||||
|             'episode': 'Senza occhi', | ||||
|         }, | ||||
|     }, { | ||||
|         'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', | ||||
| @@ -320,13 +327,13 @@ class RaiPlayIE(RaiBaseIE): | ||||
| 
 | ||||
|         alt_title = join_nonempty(media.get('subtitle'), media.get('toptitle'), delim=' - ') | ||||
| 
 | ||||
|         info = { | ||||
|         return { | ||||
|             'id': remove_start(media.get('id'), 'ContentItem-') or video_id, | ||||
|             'display_id': video_id, | ||||
|             'title': title, | ||||
|             'alt_title': strip_or_none(alt_title), | ||||
|             'alt_title': strip_or_none(alt_title or None), | ||||
|             'description': media.get('description'), | ||||
|             'uploader': strip_or_none(media.get('channel')), | ||||
|             'uploader': strip_or_none(media.get('channel') or None), | ||||
|             'creator': strip_or_none(media.get('editor') or None), | ||||
|             'duration': parse_duration(video.get('duration')), | ||||
|             'timestamp': unified_timestamp(date_published), | ||||
| @@ -337,12 +344,10 @@ class RaiPlayIE(RaiBaseIE): | ||||
|             'episode': media.get('episode_title'), | ||||
|             'episode_number': int_or_none(media.get('episode')), | ||||
|             'subtitles': subtitles, | ||||
|             'release_year': traverse_obj(media, ('track_info', 'edit_year')), | ||||
|             'release_year': int_or_none(traverse_obj(media, ('track_info', 'edit_year'))), | ||||
|             **relinker_info | ||||
|         } | ||||
| 
 | ||||
|         info.update(relinker_info) | ||||
|         return info | ||||
| 
 | ||||
| 
 | ||||
| class RaiPlayLiveIE(RaiPlayIE): | ||||
|     _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))' | ||||
| @@ -357,6 +362,7 @@ class RaiPlayLiveIE(RaiPlayIE): | ||||
|             'uploader': 'Rai News 24', | ||||
|             'creator': 'Rai News 24', | ||||
|             'is_live': True, | ||||
|             'live_status': 'is_live', | ||||
|         }, | ||||
|         'params': { | ||||
|             'skip_download': True, | ||||
| @@ -407,7 +413,7 @@ class RaiPlayPlaylistIE(InfoExtractor): | ||||
|                 if not s_id: | ||||
|                     continue | ||||
|                 medias = self._download_json( | ||||
|                     '%s/%s.json' % (base, s_id), s_id, | ||||
|                     f'{base}/{s_id}.json', s_id, | ||||
|                     'Downloading content set JSON', fatal=False) | ||||
|                 if not medias: | ||||
|                     continue | ||||
| @@ -426,7 +432,7 @@ class RaiPlayPlaylistIE(InfoExtractor): | ||||
| 
 | ||||
| 
 | ||||
| class RaiPlaySoundIE(RaiBaseIE): | ||||
|     _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE | ||||
|     _VALID_URL = rf'(?P<base>https?://(?:www\.)?raiplaysound\.it/.+?-(?P<id>{RaiBaseIE._UUID_RE}))\.(?:html|json)' | ||||
|     _TESTS = [{ | ||||
|         'url': 'https://www.raiplaysound.it/audio/2021/12/IL-RUGGITO-DEL-CONIGLIO-1ebae2a7-7cdb-42bb-842e-fe0d193e9707.html', | ||||
|         'md5': '8970abf8caf8aef4696e7b1f2adfc696', | ||||
| @@ -434,11 +440,14 @@ class RaiPlaySoundIE(RaiBaseIE): | ||||
|             'id': '1ebae2a7-7cdb-42bb-842e-fe0d193e9707', | ||||
|             'ext': 'mp3', | ||||
|             'title': 'Il Ruggito del Coniglio del 10/12/2021', | ||||
|             'alt_title': 'md5:0e6476cd57858bb0f3fcc835d305b455', | ||||
|             'description': 'md5:2a17d2107e59a4a8faa0e18334139ee2', | ||||
|             'thumbnail': r're:^https?://.*\.jpg$', | ||||
|             'uploader': 'rai radio 2', | ||||
|             'duration': 5685, | ||||
|             'series': 'Il Ruggito del Coniglio', | ||||
|             'episode': 'Il Ruggito del Coniglio del 10/12/2021', | ||||
|             'creator': 'rai radio 2', | ||||
|         }, | ||||
|         'params': { | ||||
|             'skip_download': True, | ||||
| @@ -470,7 +479,7 @@ class RaiPlaySoundIE(RaiBaseIE): | ||||
|             'id': uid or audio_id, | ||||
|             'display_id': audio_id, | ||||
|             'title': traverse_obj(media, 'title', 'episode_title'), | ||||
|             'alt_title': traverse_obj(media, ('track_info', 'media_name')), | ||||
|             'alt_title': traverse_obj(media, ('track_info', 'media_name'), expected_type=strip_or_none), | ||||
|             'description': media.get('description'), | ||||
|             'uploader': traverse_obj(media, ('track_info', 'channel'), expected_type=strip_or_none), | ||||
|             'creator': traverse_obj(media, ('track_info', 'editor'), expected_type=strip_or_none), | ||||
| @@ -492,10 +501,13 @@ class RaiPlaySoundLiveIE(RaiPlaySoundIE): | ||||
|             'id': 'b00a50e6-f404-4af6-8f8c-ff3b9af73a44', | ||||
|             'display_id': 'radio2', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Rai Radio 2', | ||||
|             'title': r're:Rai Radio 2 \d+-\d+-\d+ \d+:\d+', | ||||
|             'thumbnail': r're:https://www.raiplaysound.it/dl/img/.+?png', | ||||
|             'uploader': 'rai radio 2', | ||||
|             'series': 'Rai Radio 2', | ||||
|             'creator': 'raiplaysound', | ||||
|             'is_live': True, | ||||
|             'live_status': 'is_live', | ||||
|         }, | ||||
|         'params': { | ||||
|             'skip_download': 'live', | ||||
| @@ -544,11 +556,11 @@ class RaiPlaySoundPlaylistIE(InfoExtractor): | ||||
| 
 | ||||
| 
 | ||||
| class RaiIE(RaiBaseIE): | ||||
|     _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE | ||||
|     _VALID_URL = rf'https?://[^/]+\.(?:rai\.(?:it|tv))/.+?-(?P<id>{RaiBaseIE._UUID_RE})(?:-.+?)?\.html' | ||||
|     _TESTS = [{ | ||||
|         # var uniquename = "ContentItem-..." | ||||
|         # data-id="ContentItem-..." | ||||
|         'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', | ||||
|         'url': 'https://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', | ||||
|         'info_dict': { | ||||
|             'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', | ||||
|             'ext': 'mp4', | ||||
| @@ -558,21 +570,9 @@ class RaiIE(RaiBaseIE): | ||||
|             'upload_date': '20140612', | ||||
|         }, | ||||
|         'skip': 'This content is available only in Italy', | ||||
|     }, { | ||||
|         # with ContentItem in many metas | ||||
|         'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', | ||||
|         'info_dict': { | ||||
|             'id': '1632c009-c843-4836-bb65-80c33084a64b', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', | ||||
|             'description': 'I film in uscita questa settimana.', | ||||
|             'thumbnail': r're:^https?://.*\.png$', | ||||
|             'duration': 833, | ||||
|             'upload_date': '20161103', | ||||
|         } | ||||
|     }, { | ||||
|         # with ContentItem in og:url | ||||
|         'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', | ||||
|         'url': 'https://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', | ||||
|         'md5': '06345bd97c932f19ffb129973d07a020', | ||||
|         'info_dict': { | ||||
|             'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', | ||||
| @@ -581,42 +581,17 @@ class RaiIE(RaiBaseIE): | ||||
|             'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016', | ||||
|             'thumbnail': r're:^https?://.*\.jpg$', | ||||
|             'duration': 2214, | ||||
|             'upload_date': '20161103', | ||||
|             'upload_date': '20161103' | ||||
|         } | ||||
|     }, { | ||||
|         # initEdizione('ContentItem-...' | ||||
|         'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', | ||||
|         'info_dict': { | ||||
|             'id': 'c2187016-8484-4e3a-8ac8-35e475b07303', | ||||
|             'ext': 'mp4', | ||||
|             'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', | ||||
|             'duration': 2274, | ||||
|             'upload_date': '20170401', | ||||
|         }, | ||||
|         'skip': 'Changes daily', | ||||
|     }, { | ||||
|         # HLS live stream with ContentItem in og:url | ||||
|         'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', | ||||
|         'info_dict': { | ||||
|             'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'La diretta di Rainews24', | ||||
|         }, | ||||
|         'params': { | ||||
|             'skip_download': True, | ||||
|         }, | ||||
|     }, { | ||||
|         # Direct MMS URL | ||||
|         'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', | ||||
|         'only_matching': True, | ||||
|     }, { | ||||
|         'url': 'https://www.rainews.it/tgr/marche/notiziari/video/2019/02/ContentItem-6ba945a2-889c-4a80-bdeb-8489c70a8db9.html', | ||||
|         'only_matching': True, | ||||
|     }] | ||||
| 
 | ||||
|     def _extract_from_content_id(self, content_id, url): | ||||
|         media = self._download_json( | ||||
|             'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, | ||||
|             f'https://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-{content_id}.html?json', | ||||
|             content_id, 'Downloading video JSON') | ||||
| 
 | ||||
|         title = media['name'].strip() | ||||
| @@ -647,21 +622,18 @@ class RaiIE(RaiBaseIE): | ||||
| 
 | ||||
|         subtitles = self._extract_subtitles(url, media) | ||||
| 
 | ||||
|         info = { | ||||
|         return { | ||||
|             'id': content_id, | ||||
|             'title': title, | ||||
|             'description': strip_or_none(media.get('desc')), | ||||
|             'description': strip_or_none(media.get('desc') or None), | ||||
|             'thumbnails': thumbnails, | ||||
|             'uploader': media.get('author'), | ||||
|             'uploader': strip_or_none(media.get('author') or None), | ||||
|             'upload_date': unified_strdate(media.get('date')), | ||||
|             'duration': parse_duration(media.get('length')), | ||||
|             'subtitles': subtitles, | ||||
|             **relinker_info | ||||
|         } | ||||
| 
 | ||||
|         info.update(relinker_info) | ||||
| 
 | ||||
|         return info | ||||
| 
 | ||||
|     def _real_extract(self, url): | ||||
|         video_id = self._match_id(url) | ||||
| 
 | ||||
| @@ -674,20 +646,20 @@ class RaiIE(RaiBaseIE): | ||||
|              'twitter:player', 'jsonlink'), webpage, default=None) | ||||
|         if content_item_url: | ||||
|             content_item_id = self._search_regex( | ||||
|                 r'ContentItem-(%s)' % self._UUID_RE, content_item_url, | ||||
|                 rf'ContentItem-({self._UUID_RE})', content_item_url, | ||||
|                 'content item id', default=None) | ||||
| 
 | ||||
|         if not content_item_id: | ||||
|             content_item_id = self._search_regex( | ||||
|                 r'''(?x) | ||||
|                 rf'''(?x) | ||||
|                     (?: | ||||
|                         (?:initEdizione|drawMediaRaiTV)\(| | ||||
|                         <(?:[^>]+\bdata-id|var\s+uniquename)=| | ||||
|                         <iframe[^>]+\bsrc= | ||||
|                     ) | ||||
|                     (["\']) | ||||
|                     (?:(?!\1).)*\bContentItem-(?P<id>%s) | ||||
|                 ''' % self._UUID_RE, | ||||
|                     (?:(?!\1).)*\bContentItem-(?P<id>{self._UUID_RE}) | ||||
|                 ''', | ||||
|                 webpage, 'content item id', default=None, group='id') | ||||
| 
 | ||||
|         content_item_ids = set() | ||||
| @@ -727,11 +699,68 @@ class RaiIE(RaiBaseIE): | ||||
|             webpage, 'title', group='title', | ||||
|             default=None) or self._og_search_title(webpage) | ||||
| 
 | ||||
|         info = { | ||||
|         return { | ||||
|             'id': video_id, | ||||
|             'title': title, | ||||
|             **relinker_info | ||||
|         } | ||||
| 
 | ||||
|         info.update(relinker_info) | ||||
| 
 | ||||
|         return info | ||||
| class RaiNewsIE(RaiIE): | ||||
|     _VALID_URL = rf'https?://(www\.)?rainews\.it/[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html' | ||||
|     _TESTS = [{ | ||||
|         # new rainews player (#3911) | ||||
|         'url': 'https://www.rainews.it/rubriche/24mm/video/2022/05/24mm-del-29052022-12cf645d-1ffd-4220-b27c-07c226dbdecf.html', | ||||
|         'info_dict': { | ||||
|             'id': '12cf645d-1ffd-4220-b27c-07c226dbdecf', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Puntata del 29/05/2022', | ||||
|             'duration': 1589, | ||||
|             'upload_date': '20220529', | ||||
|             'uploader': 'rainews', | ||||
|         } | ||||
|     }, { | ||||
|         # old content with fallback method to extract media urls | ||||
|         'url': 'https://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', | ||||
|         'info_dict': { | ||||
|             'id': '1632c009-c843-4836-bb65-80c33084a64b', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', | ||||
|             'description': 'I film in uscita questa settimana.', | ||||
|             'thumbnail': r're:^https?://.*\.png$', | ||||
|             'duration': 833, | ||||
|             'upload_date': '20161103' | ||||
|         }, | ||||
|         'expected_warnings': ['unable to extract player_data'], | ||||
|     }] | ||||
| 
 | ||||
|     def _real_extract(self, url): | ||||
|         video_id = self._match_id(url) | ||||
| 
 | ||||
|         webpage = self._download_webpage(url, video_id) | ||||
| 
 | ||||
|         player_data = self._search_json( | ||||
|             r'<rainews-player\s*data=\'', webpage, 'player_data', video_id, | ||||
|             transform_source=clean_html, fatal=False) | ||||
|         track_info = player_data.get('track_info') | ||||
|         relinker_url = traverse_obj(player_data, 'mediapolis', 'content_url') | ||||
| 
 | ||||
|         if not relinker_url: | ||||
|             # fallback on old implementation for some old content | ||||
|             try: | ||||
|                 return self._extract_from_content_id(video_id, url) | ||||
|             except GeoRestrictedError: | ||||
|                 raise | ||||
|             except ExtractorError as e: | ||||
|                 raise ExtractorError('Relinker URL not found', cause=e) | ||||
| 
 | ||||
|         relinker_info = self._extract_relinker_info(urljoin(url, relinker_url), video_id) | ||||
|         self._sort_formats(relinker_info['formats']) | ||||
| 
 | ||||
|         return { | ||||
|             'id': video_id, | ||||
|             'title': track_info.get('title') or self._og_search_title(webpage), | ||||
|             'upload_date': unified_strdate(track_info.get('date')), | ||||
|             'uploader': strip_or_none(track_info.get('editor') or None), | ||||
|             **relinker_info | ||||
|         } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 nixxo
					nixxo