mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 14:45:14 +00:00 
			
		
		
		
	[arte] Add support for playlists and rework tests (Closes #9632)
This commit is contained in:
		| @@ -61,10 +61,7 @@ class ArteTvIE(InfoExtractor): | |||||||
|         } |         } | ||||||
|  |  | ||||||
|  |  | ||||||
| class ArteTVPlus7IE(InfoExtractor): | class ArteTVBaseIE(InfoExtractor): | ||||||
|     IE_NAME = 'arte.tv:+7' |  | ||||||
|     _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' |  | ||||||
|  |  | ||||||
|     @classmethod |     @classmethod | ||||||
|     def _extract_url_info(cls, url): |     def _extract_url_info(cls, url): | ||||||
|         mobj = re.match(cls._VALID_URL, url) |         mobj = re.match(cls._VALID_URL, url) | ||||||
| @@ -78,60 +75,6 @@ class ArteTVPlus7IE(InfoExtractor): | |||||||
|             video_id = mobj.group('id') |             video_id = mobj.group('id') | ||||||
|         return video_id, lang |         return video_id, lang | ||||||
|  |  | ||||||
|     def _real_extract(self, url): |  | ||||||
|         video_id, lang = self._extract_url_info(url) |  | ||||||
|         webpage = self._download_webpage(url, video_id) |  | ||||||
|         return self._extract_from_webpage(webpage, video_id, lang) |  | ||||||
|  |  | ||||||
|     def _extract_from_webpage(self, webpage, video_id, lang): |  | ||||||
|         patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') |  | ||||||
|         ids = (video_id, '') |  | ||||||
|         # some pages contain multiple videos (like |  | ||||||
|         # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), |  | ||||||
|         # so we first try to look for json URLs that contain the video id from |  | ||||||
|         # the 'vid' parameter. |  | ||||||
|         patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] |  | ||||||
|         json_url = self._html_search_regex( |  | ||||||
|             patterns, webpage, 'json vp url', default=None) |  | ||||||
|         if not json_url: |  | ||||||
|             def find_iframe_url(webpage, default=NO_DEFAULT): |  | ||||||
|                 return self._html_search_regex( |  | ||||||
|                     r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', |  | ||||||
|                     webpage, 'iframe url', group='url', default=default) |  | ||||||
|  |  | ||||||
|             iframe_url = find_iframe_url(webpage, None) |  | ||||||
|             if not iframe_url: |  | ||||||
|                 embed_url = self._html_search_regex( |  | ||||||
|                     r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) |  | ||||||
|                 if embed_url: |  | ||||||
|                     player = self._download_json( |  | ||||||
|                         embed_url, video_id, 'Downloading player page') |  | ||||||
|                     iframe_url = find_iframe_url(player['html']) |  | ||||||
|             # en and es URLs produce react-based pages with different layout (e.g. |  | ||||||
|             # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) |  | ||||||
|             if not iframe_url: |  | ||||||
|                 program = self._search_regex( |  | ||||||
|                     r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', |  | ||||||
|                     webpage, 'program', default=None) |  | ||||||
|                 if program: |  | ||||||
|                     embed_html = self._parse_json(program, video_id) |  | ||||||
|                     if embed_html: |  | ||||||
|                         iframe_url = find_iframe_url(embed_html['embed_html']) |  | ||||||
|             if iframe_url: |  | ||||||
|                 json_url = compat_parse_qs( |  | ||||||
|                     compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] |  | ||||||
|         if json_url: |  | ||||||
|             title = self._search_regex( |  | ||||||
|                 r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', |  | ||||||
|                 webpage, 'title', default=None, group='title') |  | ||||||
|             return self._extract_from_json_url(json_url, video_id, lang, title=title) |  | ||||||
|         # Different kind of embed URL (e.g. |  | ||||||
|         # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) |  | ||||||
|         embed_url = self._search_regex( |  | ||||||
|             r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', |  | ||||||
|             webpage, 'embed url', group='url') |  | ||||||
|         return self.url_result(embed_url) |  | ||||||
|  |  | ||||||
|     def _extract_from_json_url(self, json_url, video_id, lang, title=None): |     def _extract_from_json_url(self, json_url, video_id, lang, title=None): | ||||||
|         info = self._download_json(json_url, video_id) |         info = self._download_json(json_url, video_id) | ||||||
|         player_info = info['videoJsonPlayer'] |         player_info = info['videoJsonPlayer'] | ||||||
| @@ -235,6 +178,74 @@ class ArteTVPlus7IE(InfoExtractor): | |||||||
|         return info_dict |         return info_dict | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class ArteTVPlus7IE(ArteTVBaseIE): | ||||||
|  |     IE_NAME = 'arte.tv:+7' | ||||||
|  |     _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' | ||||||
|  |  | ||||||
|  |     _TESTS = [{ | ||||||
|  |         'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', | ||||||
|  |         'only_matching': True, | ||||||
|  |     }] | ||||||
|  |  | ||||||
|  |     @classmethod | ||||||
|  |     def suitable(cls, url): | ||||||
|  |         return False if ArteTVPlaylistIE.suitable(url) else super(ArteTVPlus7IE, cls).suitable(url) | ||||||
|  |  | ||||||
|  |     def _real_extract(self, url): | ||||||
|  |         video_id, lang = self._extract_url_info(url) | ||||||
|  |         webpage = self._download_webpage(url, video_id) | ||||||
|  |         return self._extract_from_webpage(webpage, video_id, lang) | ||||||
|  |  | ||||||
|  |     def _extract_from_webpage(self, webpage, video_id, lang): | ||||||
|  |         patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') | ||||||
|  |         ids = (video_id, '') | ||||||
|  |         # some pages contain multiple videos (like | ||||||
|  |         # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), | ||||||
|  |         # so we first try to look for json URLs that contain the video id from | ||||||
|  |         # the 'vid' parameter. | ||||||
|  |         patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] | ||||||
|  |         json_url = self._html_search_regex( | ||||||
|  |             patterns, webpage, 'json vp url', default=None) | ||||||
|  |         if not json_url: | ||||||
|  |             def find_iframe_url(webpage, default=NO_DEFAULT): | ||||||
|  |                 return self._html_search_regex( | ||||||
|  |                     r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1', | ||||||
|  |                     webpage, 'iframe url', group='url', default=default) | ||||||
|  |  | ||||||
|  |             iframe_url = find_iframe_url(webpage, None) | ||||||
|  |             if not iframe_url: | ||||||
|  |                 embed_url = self._html_search_regex( | ||||||
|  |                     r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) | ||||||
|  |                 if embed_url: | ||||||
|  |                     player = self._download_json( | ||||||
|  |                         embed_url, video_id, 'Downloading player page') | ||||||
|  |                     iframe_url = find_iframe_url(player['html']) | ||||||
|  |             # en and es URLs produce react-based pages with different layout (e.g. | ||||||
|  |             # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) | ||||||
|  |             if not iframe_url: | ||||||
|  |                 program = self._search_regex( | ||||||
|  |                     r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', | ||||||
|  |                     webpage, 'program', default=None) | ||||||
|  |                 if program: | ||||||
|  |                     embed_html = self._parse_json(program, video_id) | ||||||
|  |                     if embed_html: | ||||||
|  |                         iframe_url = find_iframe_url(embed_html['embed_html']) | ||||||
|  |             if iframe_url: | ||||||
|  |                 json_url = compat_parse_qs( | ||||||
|  |                     compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] | ||||||
|  |         if json_url: | ||||||
|  |             title = self._search_regex( | ||||||
|  |                 r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', | ||||||
|  |                 webpage, 'title', default=None, group='title') | ||||||
|  |             return self._extract_from_json_url(json_url, video_id, lang, title=title) | ||||||
|  |         # Different kind of embed URL (e.g. | ||||||
|  |         # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) | ||||||
|  |         embed_url = self._search_regex( | ||||||
|  |             r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', | ||||||
|  |             webpage, 'embed url', group='url') | ||||||
|  |         return self.url_result(embed_url) | ||||||
|  |  | ||||||
|  |  | ||||||
| # It also uses the arte_vp_url url from the webpage to extract the information | # It also uses the arte_vp_url url from the webpage to extract the information | ||||||
| class ArteTVCreativeIE(ArteTVPlus7IE): | class ArteTVCreativeIE(ArteTVPlus7IE): | ||||||
|     IE_NAME = 'arte.tv:creative' |     IE_NAME = 'arte.tv:creative' | ||||||
| @@ -267,7 +278,7 @@ class ArteTVInfoIE(ArteTVPlus7IE): | |||||||
|     IE_NAME = 'arte.tv:info' |     IE_NAME = 'arte.tv:info' | ||||||
|     _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' |     _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' | ||||||
|  |  | ||||||
|     _TEST = { |     _TESTS = [{ | ||||||
|         'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere', |         'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere', | ||||||
|         'info_dict': { |         'info_dict': { | ||||||
|             'id': '067528-000-A', |             'id': '067528-000-A', | ||||||
| @@ -275,7 +286,7 @@ class ArteTVInfoIE(ArteTVPlus7IE): | |||||||
|             'title': 'Service civique, un cache misère ?', |             'title': 'Service civique, un cache misère ?', | ||||||
|             'upload_date': '20160403', |             'upload_date': '20160403', | ||||||
|         }, |         }, | ||||||
|     } |     }] | ||||||
|  |  | ||||||
|  |  | ||||||
| class ArteTVFutureIE(ArteTVPlus7IE): | class ArteTVFutureIE(ArteTVPlus7IE): | ||||||
| @@ -300,6 +311,8 @@ class ArteTVDDCIE(ArteTVPlus7IE): | |||||||
|     IE_NAME = 'arte.tv:ddc' |     IE_NAME = 'arte.tv:ddc' | ||||||
|     _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)' |     _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)' | ||||||
|  |  | ||||||
|  |     _TESTS = [] | ||||||
|  |  | ||||||
|     def _real_extract(self, url): |     def _real_extract(self, url): | ||||||
|         video_id, lang = self._extract_url_info(url) |         video_id, lang = self._extract_url_info(url) | ||||||
|         if lang == 'folge': |         if lang == 'folge': | ||||||
| @@ -318,7 +331,7 @@ class ArteTVConcertIE(ArteTVPlus7IE): | |||||||
|     IE_NAME = 'arte.tv:concert' |     IE_NAME = 'arte.tv:concert' | ||||||
|     _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' |     _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' | ||||||
|  |  | ||||||
|     _TEST = { |     _TESTS = [{ | ||||||
|         'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde', |         'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde', | ||||||
|         'md5': '9ea035b7bd69696b67aa2ccaaa218161', |         'md5': '9ea035b7bd69696b67aa2ccaaa218161', | ||||||
|         'info_dict': { |         'info_dict': { | ||||||
| @@ -328,14 +341,14 @@ class ArteTVConcertIE(ArteTVPlus7IE): | |||||||
|             'upload_date': '20140128', |             'upload_date': '20140128', | ||||||
|             'description': 'md5:486eb08f991552ade77439fe6d82c305', |             'description': 'md5:486eb08f991552ade77439fe6d82c305', | ||||||
|         }, |         }, | ||||||
|     } |     }] | ||||||
|  |  | ||||||
|  |  | ||||||
| class ArteTVCinemaIE(ArteTVPlus7IE): | class ArteTVCinemaIE(ArteTVPlus7IE): | ||||||
|     IE_NAME = 'arte.tv:cinema' |     IE_NAME = 'arte.tv:cinema' | ||||||
|     _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)' |     _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)' | ||||||
|  |  | ||||||
|     _TEST = { |     _TESTS = [{ | ||||||
|         'url': 'http://cinema.arte.tv/de/node/38291', |         'url': 'http://cinema.arte.tv/de/node/38291', | ||||||
|         'md5': '6b275511a5107c60bacbeeda368c3aa1', |         'md5': '6b275511a5107c60bacbeeda368c3aa1', | ||||||
|         'info_dict': { |         'info_dict': { | ||||||
| @@ -345,7 +358,7 @@ class ArteTVCinemaIE(ArteTVPlus7IE): | |||||||
|             'upload_date': '20160122', |             'upload_date': '20160122', | ||||||
|             'description': 'md5:7f749bbb77d800ef2be11d54529b96bc', |             'description': 'md5:7f749bbb77d800ef2be11d54529b96bc', | ||||||
|         }, |         }, | ||||||
|     } |     }] | ||||||
|  |  | ||||||
|  |  | ||||||
| class ArteTVMagazineIE(ArteTVPlus7IE): | class ArteTVMagazineIE(ArteTVPlus7IE): | ||||||
| @@ -390,9 +403,41 @@ class ArteTVEmbedIE(ArteTVPlus7IE): | |||||||
|         ) |         ) | ||||||
|     ''' |     ''' | ||||||
|  |  | ||||||
|  |     _TESTS = [] | ||||||
|  |  | ||||||
|     def _real_extract(self, url): |     def _real_extract(self, url): | ||||||
|         mobj = re.match(self._VALID_URL, url) |         mobj = re.match(self._VALID_URL, url) | ||||||
|         video_id = mobj.group('id') |         video_id = mobj.group('id') | ||||||
|         lang = mobj.group('lang') |         lang = mobj.group('lang') | ||||||
|         json_url = mobj.group('json_url') |         json_url = mobj.group('json_url') | ||||||
|         return self._extract_from_json_url(json_url, video_id, lang) |         return self._extract_from_json_url(json_url, video_id, lang) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class ArteTVPlaylistIE(ArteTVBaseIE): | ||||||
|  |     IE_NAME = 'arte.tv:playlist' | ||||||
|  |     _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)' | ||||||
|  |  | ||||||
|  |     _TESTS = [{ | ||||||
|  |         'url': 'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV', | ||||||
|  |         'info_dict': { | ||||||
|  |             'id': 'PL-013263', | ||||||
|  |             'title': 'Areva & Uramin', | ||||||
|  |         }, | ||||||
|  |         'playlist_mincount': 6, | ||||||
|  |     }, { | ||||||
|  |         'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV', | ||||||
|  |         'only_matching': True, | ||||||
|  |     }] | ||||||
|  |  | ||||||
|  |     def _real_extract(self, url): | ||||||
|  |         playlist_id, lang = self._extract_url_info(url) | ||||||
|  |         collection = self._download_json( | ||||||
|  |             'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' | ||||||
|  |             % (lang, playlist_id), playlist_id) | ||||||
|  |         title = collection.get('title') | ||||||
|  |         description = collection.get('shortDescription') or collection.get('teaserText') | ||||||
|  |         entries = [ | ||||||
|  |             self._extract_from_json_url( | ||||||
|  |                 video['jsonUrl'], video.get('programId') or playlist_id, lang) | ||||||
|  |             for video in collection['videos'] if video.get('jsonUrl')] | ||||||
|  |         return self.playlist_result(entries, playlist_id, title, description) | ||||||
|   | |||||||
| @@ -56,6 +56,7 @@ from .arte import ( | |||||||
|     ArteTVDDCIE, |     ArteTVDDCIE, | ||||||
|     ArteTVMagazineIE, |     ArteTVMagazineIE, | ||||||
|     ArteTVEmbedIE, |     ArteTVEmbedIE, | ||||||
|  |     ArteTVPlaylistIE, | ||||||
| ) | ) | ||||||
| from .atresplayer import AtresPlayerIE | from .atresplayer import AtresPlayerIE | ||||||
| from .atttechchannel import ATTTechChannelIE | from .atttechchannel import ATTTechChannelIE | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Sergey M․
					Sergey M․