mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 06:35:12 +00:00 
			
		
		
		
	[ie/weibo] Fix extractor and support user extraction (#7657)
Closes #3964, Closes #4673, Closes #6979
Authored by: c-basalt
This commit is contained in:
		| @@ -2371,7 +2371,8 @@ from .webofstories import ( | |||||||
| ) | ) | ||||||
| from .weibo import ( | from .weibo import ( | ||||||
|     WeiboIE, |     WeiboIE, | ||||||
|     WeiboMobileIE |     WeiboVideoIE, | ||||||
|  |     WeiboUserIE, | ||||||
| ) | ) | ||||||
| from .weiqitv import WeiqiTVIE | from .weiqitv import WeiqiTVIE | ||||||
| from .weverse import ( | from .weverse import ( | ||||||
|   | |||||||
| @@ -1,134 +1,241 @@ | |||||||
| from .common import InfoExtractor |  | ||||||
| 
 |  | ||||||
| import json |  | ||||||
| import random | import random | ||||||
| import re | import itertools | ||||||
|  | import urllib.parse | ||||||
| 
 | 
 | ||||||
| from ..compat import ( | from .common import InfoExtractor | ||||||
|     compat_parse_qs, |  | ||||||
|     compat_str, |  | ||||||
| ) |  | ||||||
| from ..utils import ( | from ..utils import ( | ||||||
|     js_to_json, |     int_or_none, | ||||||
|  |     make_archive_id, | ||||||
|  |     mimetype2ext, | ||||||
|  |     parse_resolution, | ||||||
|  |     str_or_none, | ||||||
|     strip_jsonp, |     strip_jsonp, | ||||||
|  |     traverse_obj, | ||||||
|  |     url_or_none, | ||||||
|     urlencode_postdata, |     urlencode_postdata, | ||||||
|  |     urljoin, | ||||||
| ) | ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class WeiboIE(InfoExtractor): | class WeiboBaseIE(InfoExtractor): | ||||||
|     _VALID_URL = r'https?://(?:www\.)?weibo\.com/[0-9]+/(?P<id>[a-zA-Z0-9]+)' |     def _update_visitor_cookies(self, video_id): | ||||||
|     _TEST = { |         visitor_data = self._download_json( | ||||||
|         'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', |             'https://passport.weibo.com/visitor/genvisitor', video_id, | ||||||
|         'info_dict': { |             note='Generating first-visit guest request', | ||||||
|             'id': 'Fp6RGfbff', |             transform_source=strip_jsonp, | ||||||
|             'ext': 'mp4', |             data=urlencode_postdata({ | ||||||
|             'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博', |                 'cb': 'gen_callback', | ||||||
|         } |                 'fp': '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}', | ||||||
|     } |             })) | ||||||
| 
 | 
 | ||||||
|     def _real_extract(self, url): |         self._download_webpage( | ||||||
|         video_id = self._match_id(url) |             'https://passport.weibo.com/visitor/visitor', video_id, | ||||||
|         # to get Referer url for genvisitor |             note='Running first-visit callback to get guest cookies', | ||||||
|         webpage, urlh = self._download_webpage_handle(url, video_id) |             query={ | ||||||
| 
 |                 'a': 'incarnate', | ||||||
|         visitor_url = urlh.url |                 't': visitor_data['data']['tid'], | ||||||
| 
 |                 'w': 2, | ||||||
|         if 'passport.weibo.com' in visitor_url: |                 'c': '%03d' % visitor_data['data']['confidence'], | ||||||
|             # first visit |                 'cb': 'cross_domain', | ||||||
|             visitor_data = self._download_json( |                 'from': 'weibo', | ||||||
|                 'https://passport.weibo.com/visitor/genvisitor', video_id, |                 '_rand': random.random(), | ||||||
|                 note='Generating first-visit data', |  | ||||||
|                 transform_source=strip_jsonp, |  | ||||||
|                 headers={'Referer': visitor_url}, |  | ||||||
|                 data=urlencode_postdata({ |  | ||||||
|                     'cb': 'gen_callback', |  | ||||||
|                     'fp': json.dumps({ |  | ||||||
|                         'os': '2', |  | ||||||
|                         'browser': 'Gecko57,0,0,0', |  | ||||||
|                         'fonts': 'undefined', |  | ||||||
|                         'screenInfo': '1440*900*24', |  | ||||||
|                         'plugins': '', |  | ||||||
|                     }), |  | ||||||
|                 })) |  | ||||||
| 
 |  | ||||||
|             tid = visitor_data['data']['tid'] |  | ||||||
|             cnfd = '%03d' % visitor_data['data']['confidence'] |  | ||||||
| 
 |  | ||||||
|             self._download_webpage( |  | ||||||
|                 'https://passport.weibo.com/visitor/visitor', video_id, |  | ||||||
|                 note='Running first-visit callback', |  | ||||||
|                 query={ |  | ||||||
|                     'a': 'incarnate', |  | ||||||
|                     't': tid, |  | ||||||
|                     'w': 2, |  | ||||||
|                     'c': cnfd, |  | ||||||
|                     'cb': 'cross_domain', |  | ||||||
|                     'from': 'weibo', |  | ||||||
|                     '_rand': random.random(), |  | ||||||
|                 }) |  | ||||||
| 
 |  | ||||||
|             webpage = self._download_webpage( |  | ||||||
|                 url, video_id, note='Revisiting webpage') |  | ||||||
| 
 |  | ||||||
|         title = self._html_extract_title(webpage) |  | ||||||
| 
 |  | ||||||
|         video_formats = compat_parse_qs(self._search_regex( |  | ||||||
|             r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) |  | ||||||
| 
 |  | ||||||
|         formats = [] |  | ||||||
|         supported_resolutions = (480, 720) |  | ||||||
|         for res in supported_resolutions: |  | ||||||
|             vid_urls = video_formats.get(compat_str(res)) |  | ||||||
|             if not vid_urls or not isinstance(vid_urls, list): |  | ||||||
|                 continue |  | ||||||
| 
 |  | ||||||
|             vid_url = vid_urls[0] |  | ||||||
|             formats.append({ |  | ||||||
|                 'url': vid_url, |  | ||||||
|                 'height': res, |  | ||||||
|             }) |             }) | ||||||
| 
 | 
 | ||||||
|         uploader = self._og_search_property( |     def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs): | ||||||
|             'nick-name', webpage, 'uploader', default=None) |         webpage, urlh = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs) | ||||||
|  |         if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com': | ||||||
|  |             self._update_visitor_cookies(video_id) | ||||||
|  |             webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs) | ||||||
|  |         return self._parse_json(webpage, video_id, fatal=fatal) | ||||||
| 
 | 
 | ||||||
|  |     def _extract_formats(self, video_info): | ||||||
|  |         media_info = traverse_obj(video_info, ('page_info', 'media_info')) | ||||||
|  |         formats = traverse_obj(media_info, ( | ||||||
|  |             'playback_list', lambda _, v: url_or_none(v['play_info']['url']), 'play_info', { | ||||||
|  |                 'url': 'url', | ||||||
|  |                 'format': ('quality_desc', {str}), | ||||||
|  |                 'format_id': ('label', {str}), | ||||||
|  |                 'ext': ('mime', {mimetype2ext}), | ||||||
|  |                 'tbr': ('bitrate', {int_or_none}, {lambda x: x or None}), | ||||||
|  |                 'vcodec': ('video_codecs', {str}), | ||||||
|  |                 'fps': ('fps', {int_or_none}), | ||||||
|  |                 'width': ('width', {int_or_none}), | ||||||
|  |                 'height': ('height', {int_or_none}), | ||||||
|  |                 'filesize': ('size', {int_or_none}), | ||||||
|  |                 'acodec': ('audio_codecs', {str}), | ||||||
|  |                 'asr': ('audio_sample_rate', {int_or_none}), | ||||||
|  |                 'audio_channels': ('audio_channels', {int_or_none}), | ||||||
|  |             })) | ||||||
|  |         if not formats:  # fallback, should be barely used | ||||||
|  |             for url in set(traverse_obj(media_info, (..., {url_or_none}))): | ||||||
|  |                 if 'label=' in url:  # filter out non-video urls | ||||||
|  |                     format_id, resolution = self._search_regex( | ||||||
|  |                         r'label=(\w+)&template=(\d+x\d+)', url, 'format info', | ||||||
|  |                         group=(1, 2), default=(None, None)) | ||||||
|  |                     formats.append({ | ||||||
|  |                         'url': url, | ||||||
|  |                         'format_id': format_id, | ||||||
|  |                         **parse_resolution(resolution), | ||||||
|  |                         **traverse_obj(media_info, ( | ||||||
|  |                             'video_details', lambda _, v: v['label'].startswith(format_id), { | ||||||
|  |                                 'size': ('size', {int_or_none}), | ||||||
|  |                                 'tbr': ('bitrate', {int_or_none}), | ||||||
|  |                             } | ||||||
|  |                         ), get_all=False), | ||||||
|  |                     }) | ||||||
|  |         return formats | ||||||
|  | 
 | ||||||
|  |     def _parse_video_info(self, video_info, video_id=None): | ||||||
|         return { |         return { | ||||||
|             'id': video_id, |             'id': video_id, | ||||||
|             'title': title, |             'extractor_key': WeiboIE.ie_key(), | ||||||
|             'uploader': uploader, |             'extractor': WeiboIE.IE_NAME, | ||||||
|             'formats': formats |             'formats': self._extract_formats(video_info), | ||||||
|  |             'http_headers': {'Referer': 'https://weibo.com/'}, | ||||||
|  |             '_old_archive_ids': [make_archive_id('WeiboMobile', video_id)], | ||||||
|  |             **traverse_obj(video_info, { | ||||||
|  |                 'id': (('id', 'id_str', 'mid'), {str_or_none}), | ||||||
|  |                 'display_id': ('mblogid', {str_or_none}), | ||||||
|  |                 'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, {lambda x: x or None}), | ||||||
|  |                 'description': ('text_raw', {str}), | ||||||
|  |                 'duration': ('page_info', 'media_info', 'duration', {int_or_none}), | ||||||
|  |                 'timestamp': ('page_info', 'media_info', 'video_publish_time', {int_or_none}), | ||||||
|  |                 'thumbnail': ('page_info', 'page_pic', {url_or_none}), | ||||||
|  |                 'uploader': ('user', 'screen_name', {str}), | ||||||
|  |                 'uploader_id': ('user', ('id', 'id_str'), {str_or_none}), | ||||||
|  |                 'uploader_url': ('user', 'profile_url', {lambda x: urljoin('https://weibo.com/', x)}), | ||||||
|  |                 'view_count': ('page_info', 'media_info', 'online_users_number', {int_or_none}), | ||||||
|  |                 'like_count': ('attitudes_count', {int_or_none}), | ||||||
|  |                 'repost_count': ('reposts_count', {int_or_none}), | ||||||
|  |             }, get_all=False), | ||||||
|  |             'tags': traverse_obj(video_info, ('topic_struct', ..., 'topic_title', {str})) or None, | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class WeiboMobileIE(InfoExtractor): | class WeiboIE(WeiboBaseIE): | ||||||
|     _VALID_URL = r'https?://m\.weibo\.cn/status/(?P<id>[0-9]+)(\?.+)?' |     _VALID_URL = r'https?://(?:m\.weibo\.cn/status|(?:www\.)?weibo\.com/\d+)/(?P<id>[a-zA-Z0-9]+)' | ||||||
|     _TEST = { |     _TESTS = [{ | ||||||
|         'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', |         'url': 'https://weibo.com/7827771738/N4xlMvjhI', | ||||||
|  |         'info_dict': { | ||||||
|  |             'id': '4910815147462302', | ||||||
|  |             'ext': 'mp4', | ||||||
|  |             'display_id': 'N4xlMvjhI', | ||||||
|  |             'title': '【睡前消息暑假版第一期:拉泰国一把  对中国有好处】', | ||||||
|  |             'description': 'md5:e2637a7673980d68694ea7c43cf12a5f', | ||||||
|  |             'duration': 918, | ||||||
|  |             'timestamp': 1686312819, | ||||||
|  |             'upload_date': '20230609', | ||||||
|  |             'thumbnail': r're:https://.*\.jpg', | ||||||
|  |             'uploader': '睡前视频基地', | ||||||
|  |             'uploader_id': '7827771738', | ||||||
|  |             'uploader_url': 'https://weibo.com/u/7827771738', | ||||||
|  |             'view_count': int, | ||||||
|  |             'like_count': int, | ||||||
|  |             'repost_count': int, | ||||||
|  |             'tags': ['泰国大选远进党获胜', '睡前消息', '暑期版'], | ||||||
|  |         }, | ||||||
|  |     }, { | ||||||
|  |         'url': 'https://m.weibo.cn/status/4189191225395228', | ||||||
|         'info_dict': { |         'info_dict': { | ||||||
|             'id': '4189191225395228', |             'id': '4189191225395228', | ||||||
|             'ext': 'mp4', |             'ext': 'mp4', | ||||||
|             'title': '午睡当然是要甜甜蜜蜜的啦', |             'display_id': 'FBqgOmDxO', | ||||||
|             'uploader': '柴犬柴犬' |             'title': '柴犬柴犬的秒拍视频', | ||||||
|  |             'description': 'md5:80f461ab5cdae6bbdb70efbf5a1db24f', | ||||||
|  |             'duration': 53, | ||||||
|  |             'timestamp': 1514264429, | ||||||
|  |             'upload_date': '20171226', | ||||||
|  |             'thumbnail': r're:https://.*\.jpg', | ||||||
|  |             'uploader': '柴犬柴犬', | ||||||
|  |             'uploader_id': '5926682210', | ||||||
|  |             'uploader_url': 'https://weibo.com/u/5926682210', | ||||||
|  |             'view_count': int, | ||||||
|  |             'like_count': int, | ||||||
|  |             'repost_count': int, | ||||||
|         } |         } | ||||||
|     } |     }, { | ||||||
|  |         'url': 'https://weibo.com/0/4224132150961381', | ||||||
|  |         'note': 'no playback_list example', | ||||||
|  |         'only_matching': True, | ||||||
|  |     }] | ||||||
| 
 | 
 | ||||||
|     def _real_extract(self, url): |     def _real_extract(self, url): | ||||||
|         video_id = self._match_id(url) |         video_id = self._match_id(url) | ||||||
|         # to get Referer url for genvisitor |  | ||||||
|         webpage = self._download_webpage(url, video_id, note='visit the page') |  | ||||||
| 
 | 
 | ||||||
|         weibo_info = self._parse_json(self._search_regex( |         return self._parse_video_info(self._weibo_download_json( | ||||||
|             r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};', |             f'https://weibo.com/ajax/statuses/show?id={video_id}', video_id)) | ||||||
|             webpage, 'js_code', flags=re.DOTALL), |  | ||||||
|             video_id, transform_source=js_to_json) |  | ||||||
| 
 | 
 | ||||||
|         status_data = weibo_info.get('status', {}) |  | ||||||
|         page_info = status_data.get('page_info') |  | ||||||
|         title = status_data['status_title'] |  | ||||||
|         uploader = status_data.get('user', {}).get('screen_name') |  | ||||||
| 
 | 
 | ||||||
|         return { | class WeiboVideoIE(WeiboBaseIE): | ||||||
|             'id': video_id, |     _VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P<id>\d+:\d+)' | ||||||
|             'title': title, |     _TESTS = [{ | ||||||
|             'uploader': uploader, |         'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow', | ||||||
|             'url': page_info['media_info']['stream_url'] |         'info_dict': { | ||||||
|  |             'id': '4797700463137878', | ||||||
|  |             'ext': 'mp4', | ||||||
|  |             'display_id': 'LEZDodaiW', | ||||||
|  |             'title': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了', | ||||||
|  |             'description': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了 http://t.cn/A6aerGsM ', | ||||||
|  |             'duration': 76, | ||||||
|  |             'timestamp': 1659344278, | ||||||
|  |             'upload_date': '20220801', | ||||||
|  |             'thumbnail': r're:https://.*\.jpg', | ||||||
|  |             'uploader': '君子爱财陈平安', | ||||||
|  |             'uploader_id': '3905382233', | ||||||
|  |             'uploader_url': 'https://weibo.com/u/3905382233', | ||||||
|  |             'view_count': int, | ||||||
|  |             'like_count': int, | ||||||
|  |             'repost_count': int, | ||||||
|         } |         } | ||||||
|  |     }] | ||||||
|  | 
 | ||||||
|  |     def _real_extract(self, url): | ||||||
|  |         video_id = self._match_id(url) | ||||||
|  | 
 | ||||||
|  |         post_data = f'data={{"Component_Play_Playinfo":{{"oid":"{video_id}"}}}}'.encode() | ||||||
|  |         video_info = self._weibo_download_json( | ||||||
|  |             f'https://weibo.com/tv/api/component?page=%2Ftv%2Fshow%2F{video_id.replace(":", "%3A")}', | ||||||
|  |             video_id, headers={'Referer': url}, data=post_data)['data']['Component_Play_Playinfo'] | ||||||
|  |         return self.url_result(f'https://weibo.com/0/{video_info["mid"]}', WeiboIE) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class WeiboUserIE(WeiboBaseIE): | ||||||
|  |     _VALID_URL = r'https?://(?:www\.)?weibo\.com/u/(?P<id>\d+)' | ||||||
|  |     _TESTS = [{ | ||||||
|  |         'url': 'https://weibo.com/u/2066652961?tabtype=video', | ||||||
|  |         'info_dict': { | ||||||
|  |             'id': '2066652961', | ||||||
|  |             'title': '萧影殿下的视频', | ||||||
|  |             'description': '萧影殿下的全部视频', | ||||||
|  |             'uploader': '萧影殿下', | ||||||
|  |         }, | ||||||
|  |         'playlist_mincount': 195, | ||||||
|  |     }] | ||||||
|  | 
 | ||||||
|  |     def _fetch_page(self, uid, cursor=0, page=1): | ||||||
|  |         return self._weibo_download_json( | ||||||
|  |             'https://weibo.com/ajax/profile/getWaterFallContent', | ||||||
|  |             uid, note=f'Downloading videos page {page}', | ||||||
|  |             query={'uid': uid, 'cursor': cursor})['data'] | ||||||
|  | 
 | ||||||
|  |     def _entries(self, uid, first_page): | ||||||
|  |         cursor = 0 | ||||||
|  |         for page in itertools.count(1): | ||||||
|  |             response = first_page if page == 1 else self._fetch_page(uid, cursor, page) | ||||||
|  |             for video_info in traverse_obj(response, ('list', ..., {dict})): | ||||||
|  |                 yield self._parse_video_info(video_info) | ||||||
|  |             cursor = response.get('next_cursor') | ||||||
|  |             if (int_or_none(cursor) or -1) < 0: | ||||||
|  |                 break | ||||||
|  | 
 | ||||||
|  |     def _real_extract(self, url): | ||||||
|  |         uid = self._match_id(url) | ||||||
|  |         first_page = self._fetch_page(uid) | ||||||
|  |         uploader = traverse_obj(first_page, ('list', ..., 'user', 'screen_name', {str}), get_all=False) | ||||||
|  |         metainfo = { | ||||||
|  |             'title': f'{uploader}的视频', | ||||||
|  |             'description': f'{uploader}的全部视频', | ||||||
|  |             'uploader': uploader, | ||||||
|  |         } if uploader else {} | ||||||
|  | 
 | ||||||
|  |         return self.playlist_result(self._entries(uid, first_page), uid, **metainfo) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 c-basalt
					c-basalt