class BiliBiliSearchBaseIE(BilibiliBaseIE):
    """Shared helper for the Bilibili search extractors."""

    def _extract_search_result(self, result_data):
        """Turn one search-result dict into a ``url_result``.

        The dict's ``type`` field selects which key holds the target:
        ``arcurl`` for videos, ``roomid`` for live rooms, ``url`` for
        ``media_ft``/``media_bangumi`` and ``mid`` for users.

        Returns ``None`` for any other result type so callers can skip it.
        Raises ``KeyError`` if a recognised type lacks its expected key.
        """
        result_type = result_data.get('type')
        if result_type == 'video':
            return self.url_result(result_data['arcurl'])
        if result_type == 'live_room':
            return self.url_result(f'https://live.bilibili.com/{result_data["roomid"]}')
        if result_type in ('media_ft', 'media_bangumi'):
            return self.url_result(result_data['url'])
        if result_type == 'bili_user':
            return self.url_result(f'https://space.bilibili.com/{result_data["mid"]}')
        # Unsupported kind of result: nothing to hand over to another extractor
        return None


class BiliBiliSearchAllIE(SearchInfoExtractor, BiliBiliSearchBaseIE):
    """Search extractor (``biliallsearch``) backed by the ``search/all/v2`` API."""

    IE_DESC = 'Bilibili all search'
    _MAX_RESULTS = 100000
    _SEARCH_KEY = 'biliallsearch'
    _TESTS = [{
        'url': 'biliallsearch3:靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
        'playlist_count': 3,
        'info_dict': {
            'id': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
            'title': '靡烟 出道一年,我怎么还在等你单推的女人睡觉后开播啊',
        },
        'playlist': [{
            'info_dict': {
                'id': 'BV1n44y1Q7sc',
                'ext': 'mp4',
                'title': '“出道一年,我怎么还在等你单推的女人睡觉后开播啊?”【一分钟了解靡烟miya】',
                'timestamp': 1669889987,
                'upload_date': '20221201',
                'description': 'md5:43343c0973defff527b5a4b403b4abf9',
                'tags': list,
                'uploader': '靡烟miya',
                'duration': 123.156,
                'uploader_id': '1958703906',
                'comment_count': int,
                'view_count': int,
                'like_count': int,
                'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$',
                '_old_archive_ids': ['bilibili 988222410_part1'],
            },
        }],
    }, {
        'url': 'biliallsearch:LOL',
        'playlist_count': 1,
        'info_dict': {
            'id': 'LOL',
            'title': 'LOL',
        },
    }]

    def _search_results(self, query):
        """Yield ``url_result`` entries for *query*, page by page.

        Raises an expected ``ExtractorError`` when the server answers with
        HTTP 412, when the JSON ``code`` is -400, or when the first page
        carries no results. An empty later page ends the iteration instead.
        """
        headers = self.geo_verification_headers()
        headers['Referer'] = 'https://www.bilibili.com/'
        page_size = 50
        api_url = 'https://api.bilibili.com/x/web-interface/wbi/search/all/v2'
        if not self._get_cookies('https://api.bilibili.com').get('buvid3'):
            self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc')

        for page_num in itertools.count(1):
            query_params = {
                'keyword': query,
                'page': page_num,
                'dynamic_offset': (page_num - 1) * page_size,
                'platform': 'pc',
            }
            try:
                search_all_result = self._download_json(
                    api_url, video_id=query, query=self._sign_wbi(query_params, query),
                    headers=headers)
            except ExtractorError as e:
                if isinstance(e.cause, HTTPError) and e.cause.status == 412:
                    raise ExtractorError('Request is blocked by server (-412).', expected=True)
                raise

            # Read both fields tolerantly so a body lacking them ends in the
            # readable error below rather than a KeyError/TypeError
            status_code = traverse_obj(search_all_result, 'code')
            if status_code == -400:
                raise ExtractorError('Invalid request (-400).', expected=True)
            result_list = traverse_obj(search_all_result, ('data', 'result'))
            if not result_list:
                if page_num > 1:
                    # Ran out of pages: keep what was already yielded
                    return
                self.write_debug(f'Response: {search_all_result}')
                raise ExtractorError(f'Result not found in the response ({status_code}).',
                                     expected=True)

            for result_data in traverse_obj(result_list, (..., 'data', ...)):
                entry = self._extract_search_result(result_data)
                # Unsupported result types come back as None; do not let them
                # count towards the number of requested results
                if entry is not None:
                    yield entry
class BiliBiliSearchPageIE(BiliBiliSearchBaseIE):
    """Extract the results of a ``search.bilibili.com`` page URL as a playlist.

    The first page of an ``all`` search is fetched from the ``search/all/v2``
    API; every other page or search type goes through ``search/type``.
    """

    IE_DESC = 'Bilibili Search Page URL Extractor'
    _VALID_URL = r'https?://search\.bilibili\.com/(?P<type>all|video|bangumi|pgc|live|upuser).*'
    _TESTS = [{
        'url': r'https://search.bilibili.com/all?keyword=yt+-+dlp+%E4%B8%8B%E8%BD%BD%E5%99%A8',
        'playlist_count': 36,
        'info_dict': {
            'id': 'yt - dlp 下载器',
            'title': 'yt - dlp 下载器',
        },
    }, {
        'url': r'https://search.bilibili.com/bangumi/?keyword=%E5%AD%A4%E7%8B%AC%E6%91%87%E6%BB%9A&from_source=webtop_search&spm_id_from=333.1007&search_source=5',
        'playlist_mincount': 1,
        'info_dict': {
            'id': '孤独摇滚',
            'title': '孤独摇滚',
        },
        'skip': 'geo-restricted',
    }, {
        'url': r'https://search.bilibili.com/video?keyword=%E8%AE%A9%E5%AD%90%E5%BC%B9%E9%A3%9E&from_source=webtop_search&spm_id_from=333.1007&search_source=5&order=dm&duration=4&tids=181&page=3&o=72',
        'playlist_mincount': 4,
        'info_dict': {
            'id': '让子弹飞',
            'title': '让子弹飞',
        },
    }]

    _SEARCH_ALL_API = 'https://api.bilibili.com/x/web-interface/wbi/search/all/v2'
    _SEARCH_TYPE_API = 'https://api.bilibili.com/x/web-interface/wbi/search/type'

    # Path segment of the search page -> ``search_type`` sent to the type-search API
    _SEARCH_TYPE_MAPPING = {
        'video': 'video',
        'bangumi': 'media_bangumi',
        'pgc': 'media_ft',
        'live': 'live_room',
        'upuser': 'bili_user',
        'all': 'video',  # 'all' search calls video search after page 1
    }

    # URL parameters that are copied into the API query unchanged
    _FORWARDED_PARAMS = (
        'keyword',
        'page',
        'order',
        'duration',
        'tids',
        'search_type',  # Only when searching for live_room or live_user
        'order_sort',
        'user_type',
    )

    def _download_search_results(self, api_url, query, playlist_id, headers, not_found_hint=''):
        """Download one search API page and return its non-empty ``data.result``.

        Raises an expected ``ExtractorError`` on HTTP 412, on JSON code -400,
        or when the response has no results; *not_found_hint* is appended to
        the message in the last case.
        """
        try:
            response = self._download_json(
                api_url, video_id=playlist_id,
                query=self._sign_wbi(query, playlist_id), headers=headers)
        except ExtractorError as e:
            if isinstance(e.cause, HTTPError) and e.cause.status == 412:
                raise ExtractorError('Request is blocked by server (-412).', expected=True)
            raise

        status_code = traverse_obj(response, 'code')
        if status_code == -400:
            raise ExtractorError('Invalid request (-400).', expected=True)
        result_list = traverse_obj(response, ('data', 'result'))
        if not result_list:
            self.write_debug(f'Response: {response}')
            raise ExtractorError(
                f'Result not found in the response ({status_code}).{not_found_hint}',
                expected=True)
        return result_list

    def _real_extract(self, url):
        """Build a playlist from the search page *url*.

        Raises an expected ``ExtractorError`` when the URL has no ``keyword``,
        when a live-user search is requested, or when the API request fails
        (see ``_download_search_results``).
        """
        headers = self.geo_verification_headers()
        headers['Referer'] = url
        if not self._get_cookies('https://api.bilibili.com').get('buvid3'):
            self._set_cookie('.bilibili.com', 'buvid3', f'{uuid.uuid4()}infoc')

        search_type = self._match_valid_url(url).group('type')
        params = parse_qs(url)
        # parse_qs already percent-decodes the value; decoding it a second time
        # would corrupt keywords containing a literal '+' or '%xx'
        playlist_id = traverse_obj(params, ('keyword', 0))
        if not playlist_id:
            raise ExtractorError('Please specify the keyword to search for!', expected=True)

        query = {
            'platform': 'pc',
            'page_size': 36,
        }
        for param_name in self._FORWARDED_PARAMS:
            param_value = traverse_obj(params, (param_name, 0))
            if param_value is not None:
                query[param_name] = param_value

        # A missing or non-numeric ``page`` is treated as the first page
        page_num = int_or_none(query.get('page'), default=1)
        param_offset = int_or_none(traverse_obj(params, ('o', 0)))
        if page_num == 1:
            query['dynamic_offset'] = 0
        elif param_offset is not None:
            query['dynamic_offset'] = param_offset
        else:
            query['dynamic_offset'] = query['page_size'] * (page_num - 1)

        if search_type == 'live' and traverse_obj(params, ('search_type', 0)) == 'live_user':
            raise ExtractorError('Live users are not downloadable!', expected=True)

        if search_type == 'all' and page_num == 1:
            result_list = self._download_search_results(
                self._SEARCH_ALL_API, query, playlist_id, headers)
            # Each item of the combined response keeps its results in a nested ``data`` list
            results = traverse_obj(result_list, (..., 'data', ...))
        else:
            query = {
                'search_type': self._SEARCH_TYPE_MAPPING[search_type],
                **query,  # search_type in type is overridden when specified in url params
            }
            results = self._download_search_results(
                self._SEARCH_TYPE_API, query, playlist_id, headers,
                not_found_hint=' You might want to try a VPN or a proxy server (with --proxy)')

        # _extract_search_result gives None for result types it cannot map to a URL
        entries = [
            entry for entry in map(self._extract_search_result, results)
            if entry is not None
        ]
        return self.playlist_result(entries, playlist_id=playlist_id, playlist_title=playlist_id)