mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 14:45:14 +00:00 
			
		
		
		
	[arte] Add support for playlists and rework tests (Closes #9632)
This commit is contained in:
		| @@ -61,10 +61,7 @@ class ArteTvIE(InfoExtractor): | ||||
|         } | ||||
|  | ||||
|  | ||||
| class ArteTVPlus7IE(InfoExtractor): | ||||
|     IE_NAME = 'arte.tv:+7' | ||||
|     _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' | ||||
|  | ||||
| class ArteTVBaseIE(InfoExtractor): | ||||
|     @classmethod | ||||
|     def _extract_url_info(cls, url): | ||||
|         mobj = re.match(cls._VALID_URL, url) | ||||
| @@ -78,60 +75,6 @@ class ArteTVPlus7IE(InfoExtractor): | ||||
|             video_id = mobj.group('id') | ||||
|         return video_id, lang | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         video_id, lang = self._extract_url_info(url) | ||||
|         webpage = self._download_webpage(url, video_id) | ||||
|         return self._extract_from_webpage(webpage, video_id, lang) | ||||
|  | ||||
|     def _extract_from_webpage(self, webpage, video_id, lang): | ||||
|         patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') | ||||
|         ids = (video_id, '') | ||||
|         # some pages contain multiple videos (like | ||||
|         # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), | ||||
|         # so we first try to look for json URLs that contain the video id from | ||||
|         # the 'vid' parameter. | ||||
|         patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] | ||||
|         json_url = self._html_search_regex( | ||||
|             patterns, webpage, 'json vp url', default=None) | ||||
|         if not json_url: | ||||
|             def find_iframe_url(webpage, default=NO_DEFAULT): | ||||
|                 return self._html_search_regex( | ||||
|                     r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', | ||||
|                     webpage, 'iframe url', group='url', default=default) | ||||
|  | ||||
|             iframe_url = find_iframe_url(webpage, None) | ||||
|             if not iframe_url: | ||||
|                 embed_url = self._html_search_regex( | ||||
|                     r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) | ||||
|                 if embed_url: | ||||
|                     player = self._download_json( | ||||
|                         embed_url, video_id, 'Downloading player page') | ||||
|                     iframe_url = find_iframe_url(player['html']) | ||||
|             # en and es URLs produce react-based pages with different layout (e.g. | ||||
|             # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) | ||||
|             if not iframe_url: | ||||
|                 program = self._search_regex( | ||||
|                     r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', | ||||
|                     webpage, 'program', default=None) | ||||
|                 if program: | ||||
|                     embed_html = self._parse_json(program, video_id) | ||||
|                     if embed_html: | ||||
|                         iframe_url = find_iframe_url(embed_html['embed_html']) | ||||
|             if iframe_url: | ||||
|                 json_url = compat_parse_qs( | ||||
|                     compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] | ||||
|         if json_url: | ||||
|             title = self._search_regex( | ||||
|                 r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', | ||||
|                 webpage, 'title', default=None, group='title') | ||||
|             return self._extract_from_json_url(json_url, video_id, lang, title=title) | ||||
|         # Different kind of embed URL (e.g. | ||||
|         # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) | ||||
|         embed_url = self._search_regex( | ||||
|             r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', | ||||
|             webpage, 'embed url', group='url') | ||||
|         return self.url_result(embed_url) | ||||
|  | ||||
|     def _extract_from_json_url(self, json_url, video_id, lang, title=None): | ||||
|         info = self._download_json(json_url, video_id) | ||||
|         player_info = info['videoJsonPlayer'] | ||||
| @@ -235,6 +178,74 @@ class ArteTVPlus7IE(InfoExtractor): | ||||
|         return info_dict | ||||
|  | ||||
|  | ||||
| class ArteTVPlus7IE(ArteTVBaseIE): | ||||
|     IE_NAME = 'arte.tv:+7' | ||||
|     _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' | ||||
|  | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', | ||||
|         'only_matching': True, | ||||
|     }] | ||||
|  | ||||
|     @classmethod | ||||
|     def suitable(cls, url): | ||||
|         return False if ArteTVPlaylistIE.suitable(url) else super(ArteTVPlus7IE, cls).suitable(url) | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         video_id, lang = self._extract_url_info(url) | ||||
|         webpage = self._download_webpage(url, video_id) | ||||
|         return self._extract_from_webpage(webpage, video_id, lang) | ||||
|  | ||||
|     def _extract_from_webpage(self, webpage, video_id, lang): | ||||
|         patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') | ||||
|         ids = (video_id, '') | ||||
|         # some pages contain multiple videos (like | ||||
|         # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), | ||||
|         # so we first try to look for json URLs that contain the video id from | ||||
|         # the 'vid' parameter. | ||||
|         patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] | ||||
|         json_url = self._html_search_regex( | ||||
|             patterns, webpage, 'json vp url', default=None) | ||||
|         if not json_url: | ||||
|             def find_iframe_url(webpage, default=NO_DEFAULT): | ||||
|                 return self._html_search_regex( | ||||
|                     r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', | ||||
|                     webpage, 'iframe url', group='url', default=default) | ||||
|  | ||||
|             iframe_url = find_iframe_url(webpage, None) | ||||
|             if not iframe_url: | ||||
|                 embed_url = self._html_search_regex( | ||||
|                     r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) | ||||
|                 if embed_url: | ||||
|                     player = self._download_json( | ||||
|                         embed_url, video_id, 'Downloading player page') | ||||
|                     iframe_url = find_iframe_url(player['html']) | ||||
|             # en and es URLs produce react-based pages with different layout (e.g. | ||||
|             # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) | ||||
|             if not iframe_url: | ||||
|                 program = self._search_regex( | ||||
|                     r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', | ||||
|                     webpage, 'program', default=None) | ||||
|                 if program: | ||||
|                     embed_html = self._parse_json(program, video_id) | ||||
|                     if embed_html: | ||||
|                         iframe_url = find_iframe_url(embed_html['embed_html']) | ||||
|             if iframe_url: | ||||
|                 json_url = compat_parse_qs( | ||||
|                     compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] | ||||
|         if json_url: | ||||
|             title = self._search_regex( | ||||
|                 r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', | ||||
|                 webpage, 'title', default=None, group='title') | ||||
|             return self._extract_from_json_url(json_url, video_id, lang, title=title) | ||||
|         # Different kind of embed URL (e.g. | ||||
|         # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) | ||||
|         embed_url = self._search_regex( | ||||
|             r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', | ||||
|             webpage, 'embed url', group='url') | ||||
|         return self.url_result(embed_url) | ||||
|  | ||||
|  | ||||
| # It also uses the arte_vp_url url from the webpage to extract the information | ||||
| class ArteTVCreativeIE(ArteTVPlus7IE): | ||||
|     IE_NAME = 'arte.tv:creative' | ||||
| @@ -267,7 +278,7 @@ class ArteTVInfoIE(ArteTVPlus7IE): | ||||
|     IE_NAME = 'arte.tv:info' | ||||
|     _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' | ||||
|  | ||||
|     _TEST = { | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere', | ||||
|         'info_dict': { | ||||
|             'id': '067528-000-A', | ||||
| @@ -275,7 +286,7 @@ class ArteTVInfoIE(ArteTVPlus7IE): | ||||
|             'title': 'Service civique, un cache misère ?', | ||||
|             'upload_date': '20160403', | ||||
|         }, | ||||
|     } | ||||
|     }] | ||||
|  | ||||
|  | ||||
| class ArteTVFutureIE(ArteTVPlus7IE): | ||||
| @@ -300,6 +311,8 @@ class ArteTVDDCIE(ArteTVPlus7IE): | ||||
|     IE_NAME = 'arte.tv:ddc' | ||||
|     _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)' | ||||
|  | ||||
|     _TESTS = [] | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         video_id, lang = self._extract_url_info(url) | ||||
|         if lang == 'folge': | ||||
| @@ -318,7 +331,7 @@ class ArteTVConcertIE(ArteTVPlus7IE): | ||||
|     IE_NAME = 'arte.tv:concert' | ||||
|     _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' | ||||
|  | ||||
|     _TEST = { | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde', | ||||
|         'md5': '9ea035b7bd69696b67aa2ccaaa218161', | ||||
|         'info_dict': { | ||||
| @@ -328,14 +341,14 @@ class ArteTVConcertIE(ArteTVPlus7IE): | ||||
|             'upload_date': '20140128', | ||||
|             'description': 'md5:486eb08f991552ade77439fe6d82c305', | ||||
|         }, | ||||
|     } | ||||
|     }] | ||||
|  | ||||
|  | ||||
| class ArteTVCinemaIE(ArteTVPlus7IE): | ||||
|     IE_NAME = 'arte.tv:cinema' | ||||
|     _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)' | ||||
|  | ||||
|     _TEST = { | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://cinema.arte.tv/de/node/38291', | ||||
|         'md5': '6b275511a5107c60bacbeeda368c3aa1', | ||||
|         'info_dict': { | ||||
| @@ -345,7 +358,7 @@ class ArteTVCinemaIE(ArteTVPlus7IE): | ||||
|             'upload_date': '20160122', | ||||
|             'description': 'md5:7f749bbb77d800ef2be11d54529b96bc', | ||||
|         }, | ||||
|     } | ||||
|     }] | ||||
|  | ||||
|  | ||||
| class ArteTVMagazineIE(ArteTVPlus7IE): | ||||
| @@ -390,9 +403,41 @@ class ArteTVEmbedIE(ArteTVPlus7IE): | ||||
|         ) | ||||
|     ''' | ||||
|  | ||||
|     _TESTS = [] | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         mobj = re.match(self._VALID_URL, url) | ||||
|         video_id = mobj.group('id') | ||||
|         lang = mobj.group('lang') | ||||
|         json_url = mobj.group('json_url') | ||||
|         return self._extract_from_json_url(json_url, video_id, lang) | ||||
|  | ||||
|  | ||||
| class ArteTVPlaylistIE(ArteTVBaseIE): | ||||
|     IE_NAME = 'arte.tv:playlist' | ||||
|     _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)' | ||||
|  | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV', | ||||
|         'info_dict': { | ||||
|             'id': 'PL-013263', | ||||
|             'title': 'Areva & Uramin', | ||||
|         }, | ||||
|         'playlist_mincount': 6, | ||||
|     }, { | ||||
|         'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV', | ||||
|         'only_matching': True, | ||||
|     }] | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         playlist_id, lang = self._extract_url_info(url) | ||||
|         collection = self._download_json( | ||||
|             'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' | ||||
|             % (lang, playlist_id), playlist_id) | ||||
|         title = collection.get('title') | ||||
|         description = collection.get('shortDescription') or collection.get('teaserText') | ||||
|         entries = [ | ||||
|             self._extract_from_json_url( | ||||
|                 video['jsonUrl'], video.get('programId') or playlist_id, lang) | ||||
|             for video in collection['videos'] if video.get('jsonUrl')] | ||||
|         return self.playlist_result(entries, playlist_id, title, description) | ||||
|   | ||||
| @@ -56,6 +56,7 @@ from .arte import ( | ||||
|     ArteTVDDCIE, | ||||
|     ArteTVMagazineIE, | ||||
|     ArteTVEmbedIE, | ||||
|     ArteTVPlaylistIE, | ||||
| ) | ||||
| from .atresplayer import AtresPlayerIE | ||||
| from .atttechchannel import ATTTechChannelIE | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Sergey M․
					Sergey M․