@@ -178,6 +178,52 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             return
 
 
+class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
+    # Extract the video ids from the playlist pages
+    def _entries(self, page, playlist_id):
+        more_widget_html = content_html = page
+        for page_num in itertools.count(1):
+            for video_id, video_title in self.extract_videos_from_page(content_html):
+                yield self.url_result(
+                    video_id, 'Youtube', video_id=video_id,
+                    video_title=video_title)
+
+            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+            if not mobj:
+                break
+
+            more = self._download_json(
+                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+                'Downloading page #%s' % page_num,
+                transform_source=uppercase_escape)
+            content_html = more['content_html']
+            if not content_html.strip():
+                # Some webpages show a "Load more" button but they don't
+                # have more videos
+                break
+            more_widget_html = more['load_more_widget_html']
+
+    def extract_videos_from_page(self, page):
+        ids_in_page = []
+        titles_in_page = []
+        for mobj in re.finditer(self._VIDEO_RE, page):
+            # The link with index 0 is not the first video of the playlist (not sure if still actual)
+            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
+                continue
+            video_id = mobj.group('id')
+            video_title = unescapeHTML(mobj.group('title'))
+            if video_title:
+                video_title = video_title.strip()
+            try:
+                idx = ids_in_page.index(video_id)
+                if video_title and not titles_in_page[idx]:
+                    titles_in_page[idx] = video_title
+            except ValueError:
+                ids_in_page.append(video_id)
+                titles_in_page.append(video_title)
+        return zip(ids_in_page, titles_in_page)
+
+
 class YoutubeIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com'
     _VALID_URL = r"""(?x)^
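The id/title bookkeeping in extract_videos_from_page above is the least obvious part of the new base class. The following standalone sketch reproduces just that de-duplication and title back-fill outside the extractor framework; the regex, the sample markup and the use of the stdlib html.unescape in place of youtube-dl's unescapeHTML are illustrative assumptions, not code from this commit.

import re
from html import unescape  # stand-in for youtube-dl's unescapeHTML

# Illustrative pattern in the spirit of the playlist _VIDEO_RE: an id, an
# index and, optionally, the link text as the title.
VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;index=(?P<index>\d+)"[^>]*>(?P<title>[^<]*)<'

# Invented HTML fragment: the first video appears twice, once as a thumbnail
# link without a usable title and once as a title link.
PAGE = '''
<a href="/watch?v=dQw4w9WgXcQ&amp;index=1" class="thumb"></a>
<a href="/watch?v=dQw4w9WgXcQ&amp;index=1" class="title">First video &amp; friends</a>
<a href="/watch?v=oHg5SJYRHA0&amp;index=2" class="title">Second video</a>
'''


def extract_videos_from_page(page):
    ids_in_page = []
    titles_in_page = []
    for mobj in re.finditer(VIDEO_RE, page):
        video_id = mobj.group('id')
        video_title = unescape(mobj.group('title')).strip() or None
        try:
            # Already seen: only back-fill a missing title.
            idx = ids_in_page.index(video_id)
            if video_title and not titles_in_page[idx]:
                titles_in_page[idx] = video_title
        except ValueError:
            # First occurrence: remember the id and its (possibly empty) title.
            ids_in_page.append(video_id)
            titles_in_page.append(video_title)
    return list(zip(ids_in_page, titles_in_page))


if __name__ == '__main__':
    for video_id, title in extract_videos_from_page(PAGE):
        print(video_id, title)
    # Prints:
    # dQw4w9WgXcQ First video & friends
    # oHg5SJYRHA0 Second video

Run directly, it prints each video id once, with the title taken from whichever duplicate match supplied one.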
@@ -1419,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         }
 
 
-class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
+class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor):
     IE_DESC = 'YouTube.com playlists'
     _VALID_URL = r"""(?x)(?:
                         (?:https?://)?
@@ -1440,7 +1486,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                         ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                      )"""
     _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
-    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
+    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
     IE_NAME = 'youtube:playlist'
     _TESTS = [{
         'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
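The only change to the playlist _VIDEO_RE is the trailing optional group, which lets the same match also capture the link text as a title when the anchor has any. A quick check with made-up anchors shows the effect; the sample HTML is invented for illustration.

import re

VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'

SAMPLES = [
    # Thumbnail-style link: no usable text, so the optional title group stays unset.
    '<a href="/watch?v=dQw4w9WgXcQ&amp;index=1" class="thumb"><img></a>',
    # Title-style link: the optional group grabs the anchor text.
    '<a href="/watch?v=dQw4w9WgXcQ&amp;index=1" class="title">Some title</a>',
]

for html in SAMPLES:
    mobj = re.search(VIDEO_RE, html)
    print(mobj.group('id'), mobj.group('index'), mobj.group('title'))
# Prints:
# dQw4w9WgXcQ 1 None
# dQw4w9WgXcQ 1 Some title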
@@ -1557,37 +1603,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
             else:
                 self.report_warning('Youtube gives an alert message: ' + match)
 
-        # Extract the video ids from the playlist pages
-        def _entries():
-            more_widget_html = content_html = page
-            for page_num in itertools.count(1):
-                matches = re.finditer(self._VIDEO_RE, content_html)
-                # We remove the duplicates and the link with index 0
-                # (it's not the first video of the playlist)
-                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
-                for vid_id in new_ids:
-                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
-
-                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-                if not mobj:
-                    break
-
-                more = self._download_json(
-                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
-                    'Downloading page #%s' % page_num,
-                    transform_source=uppercase_escape)
-                content_html = more['content_html']
-                if not content_html.strip():
-                    # Some webpages show a "Load more" button but they don't
-                    # have more videos
-                    break
-                more_widget_html = more['load_more_widget_html']
-
         playlist_title = self._html_search_regex(
             r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
             page, 'title')
 
-        return self.playlist_result(_entries(), playlist_id, playlist_title)
+        return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)
 
     def _real_extract(self, url):
         # Extract playlist id
@@ -1613,10 +1633,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
         return self._extract_playlist(playlist_id)
 
 
-class YoutubeChannelIE(InfoExtractor):
+class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
     IE_DESC = 'YouTube.com channels'
     _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
     _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
+    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
     IE_NAME = 'youtube:channel'
     _TESTS = [{
         'note': 'paginated channel',
@@ -1627,22 +1648,6 @@ class YoutubeChannelIE(InfoExtractor):
         }
     }]
 
-    @staticmethod
-    def extract_videos_from_page(page):
-        ids_in_page = []
-        titles_in_page = []
-        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
-            video_id = mobj.group('id')
-            video_title = unescapeHTML(mobj.group('title'))
-            try:
-                idx = ids_in_page.index(video_id)
-                if video_title and not titles_in_page[idx]:
-                    titles_in_page[idx] = video_title
-            except ValueError:
-                ids_in_page.append(video_id)
-                titles_in_page.append(video_title)
-        return zip(ids_in_page, titles_in_page)
-
     def _real_extract(self, url):
         channel_id = self._match_id(url)
 
@@ -1685,29 +1690,7 @@ class YoutubeChannelIE(InfoExtractor):
                 for video_id, video_title in self.extract_videos_from_page(channel_page)]
             return self.playlist_result(entries, channel_id)
 
-        def _entries():
-            more_widget_html = content_html = channel_page
-            for pagenum in itertools.count(1):
-
-                for video_id, video_title in self.extract_videos_from_page(content_html):
-                    yield self.url_result(
-                        video_id, 'Youtube', video_id=video_id,
-                        video_title=video_title)
-
-                mobj = re.search(
-                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
-                    more_widget_html)
-                if not mobj:
-                    break
-
-                more = self._download_json(
-                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
-                    'Downloading page #%s' % (pagenum + 1),
-                    transform_source=uppercase_escape)
-                content_html = more['content_html']
-                more_widget_html = more['load_more_widget_html']
-
-        return self.playlist_result(_entries(), channel_id)
+        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
 
 
 class YoutubeUserIE(YoutubeChannelIE):
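With the refactor applied, both YoutubePlaylistIE and YoutubeChannelIE drive pagination through the shared _entries generator. The sketch below mimics the shape of that loop outside the extractor framework, with the _download_json call replaced by a canned-response stub; FAKE_PAGES, the fake URLs and the helper names are invented for illustration, and only the control flow follows the base class.

import itertools
import re

# Pattern in the spirit of the channel _VIDEO_RE: id plus the link text as title.
VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)"[^>]*>(?P<title>[^<]*)<'

# First chunk of markup, ending in a "Load more" widget (all markup invented).
FIRST_PAGE = (
    '<a href="/watch?v=first-video">First video</a>'
    '<a href="/watch?v=second-video">Second video</a>'
    '<button data-uix-load-more-href="/browse_ajax?page=2"></button>'
)

# Canned responses standing in for _download_json; the real code fetches
# https://youtube.com/<data-uix-load-more-href> and reads 'content_html'
# and 'load_more_widget_html' from the JSON it gets back.
FAKE_PAGES = {
    'browse_ajax?page=2': {
        'content_html': '<a href="/watch?v=third-video">Third video</a>',
        'load_more_widget_html': '',  # no further button, so the loop stops
    },
}


def download_json(path, page_num):
    print('Downloading page #%s (%s)' % (page_num, path))
    return FAKE_PAGES[path]


def entries(page):
    # Same shape as YoutubePlaylistBaseInfoExtractor._entries: scrape the
    # current chunk, then follow the "Load more" widget until it disappears
    # or an empty chunk comes back.
    more_widget_html = content_html = page
    for page_num in itertools.count(1):
        for mobj in re.finditer(VIDEO_RE, content_html):
            yield mobj.group('id'), mobj.group('title')

        mobj = re.search(
            r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
        if not mobj:
            break

        more = download_json(mobj.group('more'), page_num)
        content_html = more['content_html']
        if not content_html.strip():
            break
        more_widget_html = more['load_more_widget_html']


if __name__ == '__main__':
    for video_id, title in entries(FIRST_PAGE):
        print(video_id, title)

The loop scrapes the current chunk, then follows the data-uix-load-more-href widget until the button disappears or an empty chunk comes back, which is the same stop condition the base class uses.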