mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 22:55:18 +00:00 
			
		
		
		
	[extractor/bitchute] Improve BitChuteChannelIE (#5066)
				
					
				
			Authored by: flashdagger, pukkandan
This commit is contained in:
		| @@ -1,14 +1,18 @@ | |||||||
| import itertools | import functools | ||||||
| import re | import re | ||||||
| 
 | 
 | ||||||
| from .common import InfoExtractor | from .common import InfoExtractor | ||||||
| from ..utils import ( | from ..utils import ( | ||||||
|     ExtractorError, |     ExtractorError, | ||||||
|     HEADRequest, |     HEADRequest, | ||||||
|  |     OnDemandPagedList, | ||||||
|     clean_html, |     clean_html, | ||||||
|     get_element_by_class, |     get_element_by_class, | ||||||
|  |     get_elements_html_by_class, | ||||||
|     int_or_none, |     int_or_none, | ||||||
|     orderedSet, |     orderedSet, | ||||||
|  |     parse_count, | ||||||
|  |     parse_duration, | ||||||
|     traverse_obj, |     traverse_obj, | ||||||
|     unified_strdate, |     unified_strdate, | ||||||
|     urlencode_postdata, |     urlencode_postdata, | ||||||
| @@ -109,51 +113,103 @@ class BitChuteIE(InfoExtractor): | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class BitChuteChannelIE(InfoExtractor): | class BitChuteChannelIE(InfoExtractor): | ||||||
|     _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' |     _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?P<type>channel|playlist)/(?P<id>[^/?#&]+)' | ||||||
|     _TEST = { |     _TESTS = [{ | ||||||
|         'url': 'https://www.bitchute.com/channel/victoriaxrave/', |         'url': 'https://www.bitchute.com/channel/bitchute/', | ||||||
|         'playlist_mincount': 185, |  | ||||||
|         'info_dict': { |         'info_dict': { | ||||||
|             'id': 'victoriaxrave', |             'id': 'bitchute', | ||||||
|  |             'title': 'BitChute', | ||||||
|  |             'description': 'md5:5329fb3866125afa9446835594a9b138', | ||||||
|         }, |         }, | ||||||
|     } |         'playlist': [ | ||||||
|  |             { | ||||||
|  |                 'md5': '7e427d7ed7af5a75b5855705ec750e2b', | ||||||
|  |                 'info_dict': { | ||||||
|  |                     'id': 'UGlrF9o9b-Q', | ||||||
|  |                     'ext': 'mp4', | ||||||
|  |                     'filesize': None, | ||||||
|  |                     'title': 'This is the first video on #BitChute !', | ||||||
|  |                     'description': 'md5:a0337e7b1fe39e32336974af8173a034', | ||||||
|  |                     'thumbnail': r're:^https?://.*\.jpg$', | ||||||
|  |                     'uploader': 'BitChute', | ||||||
|  |                     'upload_date': '20170103', | ||||||
|  |                     'duration': 16, | ||||||
|  |                     'view_count': int, | ||||||
|  |                 }, | ||||||
|  |             } | ||||||
|  |         ], | ||||||
|  |         'params': { | ||||||
|  |             'skip_download': True, | ||||||
|  |             'playlist_items': '-1', | ||||||
|  |         }, | ||||||
|  |     }, { | ||||||
|  |         'url': 'https://www.bitchute.com/playlist/wV9Imujxasw9/', | ||||||
|  |         'playlist_mincount': 20, | ||||||
|  |         'info_dict': { | ||||||
|  |             'id': 'wV9Imujxasw9', | ||||||
|  |             'title': 'Bruce MacDonald and "The Light of Darkness"', | ||||||
|  |             'description': 'md5:04913227d2714af1d36d804aa2ab6b1e', | ||||||
|  |         } | ||||||
|  |     }] | ||||||
| 
 | 
 | ||||||
|     _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' |     _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' | ||||||
|  |     PAGE_SIZE = 25 | ||||||
|  |     HTML_CLASS_NAMES = { | ||||||
|  |         'channel': { | ||||||
|  |             'container': 'channel-videos-container', | ||||||
|  |             'title': 'channel-videos-title', | ||||||
|  |             'description': 'channel-videos-text', | ||||||
|  |         }, | ||||||
|  |         'playlist': { | ||||||
|  |             'container': 'playlist-video', | ||||||
|  |             'title': 'title', | ||||||
|  |             'description': 'description', | ||||||
|  |         } | ||||||
| 
 | 
 | ||||||
|     def _entries(self, channel_id): |     } | ||||||
|         channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id | 
 | ||||||
|         offset = 0 |     @staticmethod | ||||||
|         for page_num in itertools.count(1): |     def _make_url(playlist_id, playlist_type): | ||||||
|             data = self._download_json( |         return f'https://www.bitchute.com/{playlist_type}/{playlist_id}/' | ||||||
|                 '%sextend/' % channel_url, channel_id, | 
 | ||||||
|                 'Downloading channel page %d' % page_num, |     def _fetch_page(self, playlist_id, playlist_type, page_num): | ||||||
|                 data=urlencode_postdata({ |         playlist_url = self._make_url(playlist_id, playlist_type) | ||||||
|                     'csrfmiddlewaretoken': self._TOKEN, |         data = self._download_json( | ||||||
|                     'name': '', |             f'{playlist_url}extend/', playlist_id, f'Downloading page {page_num}', | ||||||
|                     'offset': offset, |             data=urlencode_postdata({ | ||||||
|                 }), headers={ |                 'csrfmiddlewaretoken': self._TOKEN, | ||||||
|                     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', |                 'name': '', | ||||||
|                     'Referer': channel_url, |                 'offset': page_num * self.PAGE_SIZE, | ||||||
|                     'X-Requested-With': 'XMLHttpRequest', |             }), headers={ | ||||||
|                     'Cookie': 'csrftoken=%s' % self._TOKEN, |                 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', | ||||||
|                 }) |                 'Referer': playlist_url, | ||||||
|             if data.get('success') is False: |                 'X-Requested-With': 'XMLHttpRequest', | ||||||
|                 break |                 'Cookie': f'csrftoken={self._TOKEN}', | ||||||
|             html = data.get('html') |             }) | ||||||
|             if not html: |         if not data.get('success'): | ||||||
|                 break |             return | ||||||
|             video_ids = re.findall( |         classes = self.HTML_CLASS_NAMES[playlist_type] | ||||||
|                 r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', |         for video_html in get_elements_html_by_class(classes['container'], data.get('html')): | ||||||
|                 html) |             video_id = self._search_regex( | ||||||
|             if not video_ids: |                 r'<a\s[^>]*\bhref=["\']/video/([^"\'/]+)', video_html, 'video id', default=None) | ||||||
|                 break |             if not video_id: | ||||||
|             offset += len(video_ids) |                 continue | ||||||
|             for video_id in video_ids: |             yield self.url_result( | ||||||
|                 yield self.url_result( |                 f'https://www.bitchute.com/video/{video_id}', BitChuteIE, video_id, url_transparent=True, | ||||||
|                     'https://www.bitchute.com/video/%s' % video_id, |                 title=clean_html(get_element_by_class(classes['title'], video_html)), | ||||||
|                     ie=BitChuteIE.ie_key(), video_id=video_id) |                 description=clean_html(get_element_by_class(classes['description'], video_html)), | ||||||
|  |                 duration=parse_duration(get_element_by_class('video-duration', video_html)), | ||||||
|  |                 view_count=parse_count(clean_html(get_element_by_class('video-views', video_html)))) | ||||||
| 
 | 
 | ||||||
|     def _real_extract(self, url): |     def _real_extract(self, url): | ||||||
|         channel_id = self._match_id(url) |         playlist_type, playlist_id = self._match_valid_url(url).group('type', 'id') | ||||||
|  |         webpage = self._download_webpage(self._make_url(playlist_id, playlist_type), playlist_id) | ||||||
|  | 
 | ||||||
|  |         page_func = functools.partial(self._fetch_page, playlist_id, playlist_type) | ||||||
|         return self.playlist_result( |         return self.playlist_result( | ||||||
|             self._entries(channel_id), playlist_id=channel_id) |             OnDemandPagedList(page_func, self.PAGE_SIZE), playlist_id, | ||||||
|  |             title=self._html_extract_title(webpage, default=None), | ||||||
|  |             description=self._html_search_meta( | ||||||
|  |                 ('description', 'og:description', 'twitter:description'), webpage, default=None), | ||||||
|  |             playlist_count=int_or_none(self._html_search_regex( | ||||||
|  |                 r'<span>(\d+)\s+videos?</span>', webpage, 'playlist count', default=None))) | ||||||
|   | |||||||
| @@ -418,6 +418,8 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w | |||||||
|     Return the text (content) and the html (whole) of the tag with the specified |     Return the text (content) and the html (whole) of the tag with the specified | ||||||
|     attribute in the passed HTML document |     attribute in the passed HTML document | ||||||
|     """ |     """ | ||||||
|  |     if not value: | ||||||
|  |         return | ||||||
| 
 | 
 | ||||||
|     quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?' |     quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?' | ||||||
| 
 | 
 | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 MMM
					MMM