mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 06:35:12 +00:00 
			
		
		
		
	[bilibili] Fix extraction, improve and cleanup
This commit is contained in:
		| @@ -1,34 +1,42 @@ | ||||
| # coding: utf-8 | ||||
| from __future__ import unicode_literals | ||||
|  | ||||
| import calendar | ||||
| import datetime | ||||
| import re | ||||
|  | ||||
| from .common import InfoExtractor | ||||
| from ..compat import compat_str | ||||
| from ..compat import ( | ||||
|     compat_etree_fromstring, | ||||
|     compat_str, | ||||
|     compat_parse_qs, | ||||
|     compat_xml_parse_error, | ||||
| ) | ||||
| from ..utils import ( | ||||
|     int_or_none, | ||||
|     unescapeHTML, | ||||
|     ExtractorError, | ||||
|     int_or_none, | ||||
|     float_or_none, | ||||
|     xpath_text, | ||||
| ) | ||||
|  | ||||
|  | ||||
| class BiliBiliIE(InfoExtractor): | ||||
|     _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?' | ||||
|     _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)' | ||||
|  | ||||
|     _TESTS = [{ | ||||
|         'url': 'http://www.bilibili.tv/video/av1074402/', | ||||
|         'md5': '2c301e4dab317596e837c3e7633e7d86', | ||||
|         'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', | ||||
|         'info_dict': { | ||||
|             'id': '1554319', | ||||
|             'ext': 'flv', | ||||
|             'title': '【金坷垃】金泡沫', | ||||
|             'duration': 308313, | ||||
|             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', | ||||
|             'duration': 308.067, | ||||
|             'timestamp': 1398012660, | ||||
|             'upload_date': '20140420', | ||||
|             'thumbnail': 're:^https?://.+\.jpg', | ||||
|             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', | ||||
|             'timestamp': 1397983878, | ||||
|             'uploader': '菊子桑', | ||||
|             'uploader_id': '156160', | ||||
|         }, | ||||
|     }, { | ||||
|         'url': 'http://www.bilibili.com/video/av1041170/', | ||||
| @@ -36,75 +44,110 @@ class BiliBiliIE(InfoExtractor): | ||||
|             'id': '1041170', | ||||
|             'title': '【BD1080P】刀语【诸神&异域】', | ||||
|             'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~', | ||||
|             'uploader': '枫叶逝去', | ||||
|             'timestamp': 1396501299, | ||||
|         }, | ||||
|         'playlist_count': 9, | ||||
|     }] | ||||
|  | ||||
|     # BiliBili blocks keys from time to time. The current key is extracted from | ||||
|     # the Android client | ||||
|     # TODO: find the sign algorithm used in the flash player | ||||
|     _APP_KEY = '86385cdc024c0f6c' | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         mobj = re.match(self._VALID_URL, url) | ||||
|         video_id = mobj.group('id') | ||||
|         page_num = mobj.group('page_num') or '1' | ||||
|  | ||||
|         view_data = self._download_json( | ||||
|             'http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s&page=%s' % (video_id, page_num), | ||||
|             video_id) | ||||
|         if 'error' in view_data: | ||||
|             raise ExtractorError('%s said: %s' % (self.IE_NAME, view_data['error']), expected=True) | ||||
|         webpage = self._download_webpage(url, video_id) | ||||
|  | ||||
|         cid = view_data['cid'] | ||||
|         title = unescapeHTML(view_data['title']) | ||||
|         params = compat_parse_qs(self._search_regex( | ||||
|             [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', | ||||
|              r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], | ||||
|             webpage, 'player parameters')) | ||||
|         cid = params['cid'][0] | ||||
|  | ||||
|         doc = self._download_xml( | ||||
|             'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid, | ||||
|             cid, | ||||
|             'Downloading page %s/%s' % (page_num, view_data['pages']) | ||||
|         ) | ||||
|         info_xml_str = self._download_webpage( | ||||
|             'http://interface.bilibili.com/v_cdn_play', | ||||
|             cid, query={'appkey': self._APP_KEY, 'cid': cid}, | ||||
|             note='Downloading video info page') | ||||
|  | ||||
|         if xpath_text(doc, './result') == 'error': | ||||
|             raise ExtractorError('%s said: %s' % (self.IE_NAME, xpath_text(doc, './message')), expected=True) | ||||
|         err_msg = None | ||||
|         durls = None | ||||
|         info_xml = None | ||||
|         try: | ||||
|             info_xml = compat_etree_fromstring(info_xml_str.encode('utf-8')) | ||||
|         except compat_xml_parse_error: | ||||
|             info_json = self._parse_json(info_xml_str, video_id, fatal=False) | ||||
|             err_msg = (info_json or {}).get('error_text') | ||||
|         else: | ||||
|             err_msg = xpath_text(info_xml, './message') | ||||
|  | ||||
|         if info_xml is not None: | ||||
|             durls = info_xml.findall('./durl') | ||||
|         if not durls: | ||||
|             if err_msg: | ||||
|                 raise ExtractorError('%s said: %s' % (self.IE_NAME, err_msg), expected=True) | ||||
|             else: | ||||
|                 raise ExtractorError('No videos found!') | ||||
|  | ||||
|         entries = [] | ||||
|  | ||||
|         for durl in doc.findall('./durl'): | ||||
|         for durl in durls: | ||||
|             size = xpath_text(durl, ['./filesize', './size']) | ||||
|             formats = [{ | ||||
|                 'url': durl.find('./url').text, | ||||
|                 'filesize': int_or_none(size), | ||||
|                 'ext': 'flv', | ||||
|             }] | ||||
|             backup_urls = durl.find('./backup_url') | ||||
|             if backup_urls is not None: | ||||
|                 for backup_url in backup_urls.findall('./url'): | ||||
|                     formats.append({'url': backup_url.text}) | ||||
|             formats.reverse() | ||||
|             for backup_url in durl.findall('./backup_url/url'): | ||||
|                 formats.append({ | ||||
|                     'url': backup_url.text, | ||||
|                     # backup URLs have lower priorities | ||||
|                     'preference': -2 if 'hd.mp4' in backup_url.text else -3, | ||||
|                 }) | ||||
|  | ||||
|             self._sort_formats(formats) | ||||
|  | ||||
|             entries.append({ | ||||
|                 'id': '%s_part%s' % (cid, xpath_text(durl, './order')), | ||||
|                 'title': title, | ||||
|                 'duration': int_or_none(xpath_text(durl, './length'), 1000), | ||||
|                 'formats': formats, | ||||
|             }) | ||||
|  | ||||
|         title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title') | ||||
|         description = self._html_search_meta('description', webpage) | ||||
|         datetime_str = self._html_search_regex( | ||||
|             r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False) | ||||
|         if datetime_str: | ||||
|             timestamp = calendar.timegm(datetime.datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M').timetuple()) | ||||
|  | ||||
|         # TODO 'view_count' requires deobfuscating Javascript | ||||
|         info = { | ||||
|             'id': compat_str(cid), | ||||
|             'title': title, | ||||
|             'description': view_data.get('description'), | ||||
|             'thumbnail': view_data.get('pic'), | ||||
|             'uploader': view_data.get('author'), | ||||
|             'timestamp': int_or_none(view_data.get('created')), | ||||
|             'view_count': int_or_none(view_data.get('play')), | ||||
|             'duration': int_or_none(xpath_text(doc, './timelength')), | ||||
|             'description': description, | ||||
|             'timestamp': timestamp, | ||||
|             'thumbnail': self._html_search_meta('thumbnailUrl', webpage), | ||||
|             'duration': float_or_none(xpath_text(info_xml, './timelength'), scale=1000), | ||||
|         } | ||||
|  | ||||
|         uploader_mobj = re.search( | ||||
|             r'<a[^>]+href="https?://space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"', | ||||
|             webpage) | ||||
|         if uploader_mobj: | ||||
|             info.update({ | ||||
|                 'uploader': uploader_mobj.group('name'), | ||||
|                 'uploader_id': uploader_mobj.group('id'), | ||||
|             }) | ||||
|  | ||||
|         for entry in entries: | ||||
|             entry.update(info) | ||||
|  | ||||
|         if len(entries) == 1: | ||||
|             entries[0].update(info) | ||||
|             return entries[0] | ||||
|         else: | ||||
|             info.update({ | ||||
|             return { | ||||
|                 '_type': 'multi_video', | ||||
|                 'id': video_id, | ||||
|                 'title': title, | ||||
|                 'description': description, | ||||
|                 'entries': entries, | ||||
|             }) | ||||
|             return info | ||||
|             } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Yen Chi Hsuan
					Yen Chi Hsuan