mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-11-04 08:35:12 +00:00 
			
		
		
		
	TudouIE: extract all the segments of the video and download the best quality (closes #975)
Also simplify the extraction of the ID from the URL a bit, and write the test video's title directly instead of as Unicode escapes.
This commit is contained in:
		@@ -1,24 +1,34 @@
 | 
				
			|||||||
 | 
					# coding: utf-8
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
 | 
					import json
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .common import InfoExtractor
 | 
					from .common import InfoExtractor
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class TudouIE(InfoExtractor):
 | 
					class TudouIE(InfoExtractor):
 | 
				
			||||||
    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+)\.html)'
 | 
					    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
 | 
				
			||||||
    _TEST = {
 | 
					    _TEST = {
 | 
				
			||||||
        u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
 | 
					        u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
 | 
				
			||||||
        u'file': u'159447792.f4v',
 | 
					        u'file': u'159448201.f4v',
 | 
				
			||||||
        u'md5': u'ad7c358a01541e926a1e413612c6b10a',
 | 
					        u'md5': u'140a49ed444bd22f93330985d8475fcb',
 | 
				
			||||||
        u'info_dict': {
 | 
					        u'info_dict': {
 | 
				
			||||||
            u"title": u"\u5361\u9a6c\u4e54\u56fd\u8db3\u5f00\u5927\u811a\u957f\u4f20\u51b2\u540a\u96c6\u9526"
 | 
					            u"title": u"卡马乔国足开大脚长传冲吊集锦"
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _url_for_id(self, id, quality = None):
 | 
				
			||||||
 | 
					        info_url = "http://v2.tudou.com/f?id="+str(id)
 | 
				
			||||||
 | 
					        if quality:
 | 
				
			||||||
 | 
					            info_url += '&hd' + quality
 | 
				
			||||||
 | 
					        webpage = self._download_webpage(info_url, id, "Opening the info webpage")
 | 
				
			||||||
 | 
					        final_url = self._html_search_regex('>(.+?)</f>',webpage, 'video url')
 | 
				
			||||||
 | 
					        return final_url
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _real_extract(self, url):
 | 
					    def _real_extract(self, url):
 | 
				
			||||||
        mobj = re.match(self._VALID_URL, url)
 | 
					        mobj = re.match(self._VALID_URL, url)
 | 
				
			||||||
        video_id = mobj.group(2).replace('.html','')
 | 
					        video_id = mobj.group(2)
 | 
				
			||||||
        webpage = self._download_webpage(url, video_id)
 | 
					        webpage = self._download_webpage(url, video_id)
 | 
				
			||||||
        video_id = re.search('"k":(.+?),',webpage).group(1)
 | 
					 | 
				
			||||||
        title = re.search(",kw:\"(.+)\"",webpage)
 | 
					        title = re.search(",kw:\"(.+)\"",webpage)
 | 
				
			||||||
        if title is None:
 | 
					        if title is None:
 | 
				
			||||||
            title = re.search(",kw: \'(.+)\'",webpage)
 | 
					            title = re.search(",kw: \'(.+)\'",webpage)
 | 
				
			||||||
@@ -27,14 +37,27 @@ class TudouIE(InfoExtractor):
 | 
				
			|||||||
        if thumbnail_url is None:
 | 
					        if thumbnail_url is None:
 | 
				
			||||||
            thumbnail_url = re.search(",pic:\"(.+?)\"",webpage)
 | 
					            thumbnail_url = re.search(",pic:\"(.+?)\"",webpage)
 | 
				
			||||||
        thumbnail_url = thumbnail_url.group(1)
 | 
					        thumbnail_url = thumbnail_url.group(1)
 | 
				
			||||||
        info_url = "http://v2.tudou.com/f?id="+str(video_id)
 | 
					
 | 
				
			||||||
        webpage = self._download_webpage(info_url, video_id, "Opening the info webpage")
 | 
					        segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
 | 
				
			||||||
        final_url = re.search('\>(.+?)\<\/f\>',webpage).group(1)
 | 
					        segments = json.loads(segs_json)
 | 
				
			||||||
 | 
					        # It looks like the keys are the arguments that have to be passed as
 | 
				
			||||||
 | 
					        # the hd field in the request url, we pick the higher
 | 
				
			||||||
 | 
					        quality = sorted(segments.keys())[-1]
 | 
				
			||||||
 | 
					        parts = segments[quality]
 | 
				
			||||||
 | 
					        result = []
 | 
				
			||||||
 | 
					        len_parts = len(parts)
 | 
				
			||||||
 | 
					        if len_parts > 1:
 | 
				
			||||||
 | 
					            self.to_screen(u'%s: found %s parts' % (video_id, len_parts))
 | 
				
			||||||
 | 
					        for part in parts:
 | 
				
			||||||
 | 
					            part_id = part['k']
 | 
				
			||||||
 | 
					            final_url = self._url_for_id(part_id, quality)
 | 
				
			||||||
            ext = (final_url.split('?')[0]).split('.')[-1]
 | 
					            ext = (final_url.split('?')[0]).split('.')[-1]
 | 
				
			||||||
        return [{
 | 
					            part_info = {'id': part_id,
 | 
				
			||||||
            'id':        video_id,
 | 
					 | 
				
			||||||
                          'url': final_url,
 | 
					                          'url': final_url,
 | 
				
			||||||
                          'ext': ext,
 | 
					                          'ext': ext,
 | 
				
			||||||
                          'title': title,
 | 
					                          'title': title,
 | 
				
			||||||
                          'thumbnail': thumbnail_url,
 | 
					                          'thumbnail': thumbnail_url,
 | 
				
			||||||
        }]
 | 
					                          }
 | 
				
			||||||
 | 
					            result.append(part_info)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return result
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user