mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 22:55:18 +00:00 
			
		
		
		
	[extractor/bitchute] Improve BitChuteChannelIE (#5066)
				
					
				
			Authored by: flashdagger, pukkandan
This commit is contained in:
		| @@ -1,14 +1,18 @@ | ||||
| import itertools | ||||
| import functools | ||||
| import re | ||||
| 
 | ||||
| from .common import InfoExtractor | ||||
| from ..utils import ( | ||||
|     ExtractorError, | ||||
|     HEADRequest, | ||||
|     OnDemandPagedList, | ||||
|     clean_html, | ||||
|     get_element_by_class, | ||||
|     get_elements_html_by_class, | ||||
|     int_or_none, | ||||
|     orderedSet, | ||||
|     parse_count, | ||||
|     parse_duration, | ||||
|     traverse_obj, | ||||
|     unified_strdate, | ||||
|     urlencode_postdata, | ||||
| @@ -109,51 +113,103 @@ class BitChuteIE(InfoExtractor): | ||||
| 
 | ||||
| 
 | ||||
| class BitChuteChannelIE(InfoExtractor): | ||||
|     _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' | ||||
|     _TEST = { | ||||
|         'url': 'https://www.bitchute.com/channel/victoriaxrave/', | ||||
|         'playlist_mincount': 185, | ||||
|     _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?P<type>channel|playlist)/(?P<id>[^/?#&]+)' | ||||
|     _TESTS = [{ | ||||
|         'url': 'https://www.bitchute.com/channel/bitchute/', | ||||
|         'info_dict': { | ||||
|             'id': 'victoriaxrave', | ||||
|             'id': 'bitchute', | ||||
|             'title': 'BitChute', | ||||
|             'description': 'md5:5329fb3866125afa9446835594a9b138', | ||||
|         }, | ||||
|         'playlist': [ | ||||
|             { | ||||
|                 'md5': '7e427d7ed7af5a75b5855705ec750e2b', | ||||
|                 'info_dict': { | ||||
|                     'id': 'UGlrF9o9b-Q', | ||||
|                     'ext': 'mp4', | ||||
|                     'filesize': None, | ||||
|                     'title': 'This is the first video on #BitChute !', | ||||
|                     'description': 'md5:a0337e7b1fe39e32336974af8173a034', | ||||
|                     'thumbnail': r're:^https?://.*\.jpg$', | ||||
|                     'uploader': 'BitChute', | ||||
|                     'upload_date': '20170103', | ||||
|                     'duration': 16, | ||||
|                     'view_count': int, | ||||
|                 }, | ||||
|             } | ||||
|         ], | ||||
|         'params': { | ||||
|             'skip_download': True, | ||||
|             'playlist_items': '-1', | ||||
|         }, | ||||
|     }, { | ||||
|         'url': 'https://www.bitchute.com/playlist/wV9Imujxasw9/', | ||||
|         'playlist_mincount': 20, | ||||
|         'info_dict': { | ||||
|             'id': 'wV9Imujxasw9', | ||||
|             'title': 'Bruce MacDonald and "The Light of Darkness"', | ||||
|             'description': 'md5:04913227d2714af1d36d804aa2ab6b1e', | ||||
|         } | ||||
|     }] | ||||
| 
 | ||||
|     _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' | ||||
|     PAGE_SIZE = 25 | ||||
|     HTML_CLASS_NAMES = { | ||||
|         'channel': { | ||||
|             'container': 'channel-videos-container', | ||||
|             'title': 'channel-videos-title', | ||||
|             'description': 'channel-videos-text', | ||||
|         }, | ||||
|         'playlist': { | ||||
|             'container': 'playlist-video', | ||||
|             'title': 'title', | ||||
|             'description': 'description', | ||||
|         } | ||||
| 
 | ||||
|     def _entries(self, channel_id): | ||||
|         channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id | ||||
|         offset = 0 | ||||
|         for page_num in itertools.count(1): | ||||
|     } | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _make_url(playlist_id, playlist_type): | ||||
|         return f'https://www.bitchute.com/{playlist_type}/{playlist_id}/' | ||||
| 
 | ||||
|     def _fetch_page(self, playlist_id, playlist_type, page_num): | ||||
|         playlist_url = self._make_url(playlist_id, playlist_type) | ||||
|         data = self._download_json( | ||||
|                 '%sextend/' % channel_url, channel_id, | ||||
|                 'Downloading channel page %d' % page_num, | ||||
|             f'{playlist_url}extend/', playlist_id, f'Downloading page {page_num}', | ||||
|             data=urlencode_postdata({ | ||||
|                 'csrfmiddlewaretoken': self._TOKEN, | ||||
|                 'name': '', | ||||
|                     'offset': offset, | ||||
|                 'offset': page_num * self.PAGE_SIZE, | ||||
|             }), headers={ | ||||
|                 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', | ||||
|                     'Referer': channel_url, | ||||
|                 'Referer': playlist_url, | ||||
|                 'X-Requested-With': 'XMLHttpRequest', | ||||
|                     'Cookie': 'csrftoken=%s' % self._TOKEN, | ||||
|                 'Cookie': f'csrftoken={self._TOKEN}', | ||||
|             }) | ||||
|             if data.get('success') is False: | ||||
|                 break | ||||
|             html = data.get('html') | ||||
|             if not html: | ||||
|                 break | ||||
|             video_ids = re.findall( | ||||
|                 r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', | ||||
|                 html) | ||||
|             if not video_ids: | ||||
|                 break | ||||
|             offset += len(video_ids) | ||||
|             for video_id in video_ids: | ||||
|         if not data.get('success'): | ||||
|             return | ||||
|         classes = self.HTML_CLASS_NAMES[playlist_type] | ||||
|         for video_html in get_elements_html_by_class(classes['container'], data.get('html')): | ||||
|             video_id = self._search_regex( | ||||
|                 r'<a\s[^>]*\bhref=["\']/video/([^"\'/]+)', video_html, 'video id', default=None) | ||||
|             if not video_id: | ||||
|                 continue | ||||
|             yield self.url_result( | ||||
|                     'https://www.bitchute.com/video/%s' % video_id, | ||||
|                     ie=BitChuteIE.ie_key(), video_id=video_id) | ||||
|                 f'https://www.bitchute.com/video/{video_id}', BitChuteIE, video_id, url_transparent=True, | ||||
|                 title=clean_html(get_element_by_class(classes['title'], video_html)), | ||||
|                 description=clean_html(get_element_by_class(classes['description'], video_html)), | ||||
|                 duration=parse_duration(get_element_by_class('video-duration', video_html)), | ||||
|                 view_count=parse_count(clean_html(get_element_by_class('video-views', video_html)))) | ||||
| 
 | ||||
|     def _real_extract(self, url): | ||||
|         channel_id = self._match_id(url) | ||||
|         playlist_type, playlist_id = self._match_valid_url(url).group('type', 'id') | ||||
|         webpage = self._download_webpage(self._make_url(playlist_id, playlist_type), playlist_id) | ||||
| 
 | ||||
|         page_func = functools.partial(self._fetch_page, playlist_id, playlist_type) | ||||
|         return self.playlist_result( | ||||
|             self._entries(channel_id), playlist_id=channel_id) | ||||
|             OnDemandPagedList(page_func, self.PAGE_SIZE), playlist_id, | ||||
|             title=self._html_extract_title(webpage, default=None), | ||||
|             description=self._html_search_meta( | ||||
|                 ('description', 'og:description', 'twitter:description'), webpage, default=None), | ||||
|             playlist_count=int_or_none(self._html_search_regex( | ||||
|                 r'<span>(\d+)\s+videos?</span>', webpage, 'playlist count', default=None))) | ||||
|   | ||||
| @@ -418,6 +418,8 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w | ||||
|     Return the text (content) and the html (whole) of the tag with the specified | ||||
|     attribute in the passed HTML document | ||||
|     """ | ||||
|     if not value: | ||||
|         return | ||||
| 
 | ||||
|     quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?' | ||||
| 
 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 MMM
					MMM