mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 14:45:14 +00:00 
			
		
		
		
	merged unescapeHTML branch; removed lxml dependency
This commit is contained in:
		| @@ -24,11 +24,6 @@ try: | ||||
| except ImportError: | ||||
| 	from cgi import parse_qs | ||||
|  | ||||
| try: | ||||
| 	import lxml.etree | ||||
| except ImportError: | ||||
| 	pass # Handled below | ||||
|  | ||||
| try: | ||||
| 	import xml.etree.ElementTree | ||||
| except ImportError: # Python<2.5: Not officially supported, but let it slip | ||||
| @@ -193,8 +188,8 @@ class YoutubeIE(InfoExtractor): | ||||
| 			end = start + float(dur) | ||||
| 			start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) | ||||
| 			end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) | ||||
| 			caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) | ||||
| 			caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional | ||||
| 			caption = unescapeHTML(caption) | ||||
| 			caption = unescapeHTML(caption) # double cycle, intentional | ||||
| 			srt += str(n) + '\n' | ||||
| 			srt += start + ' --> ' + end + '\n' | ||||
| 			srt += caption + '\n\n' | ||||
| @@ -364,18 +359,9 @@ class YoutubeIE(InfoExtractor): | ||||
| 					pass | ||||
|  | ||||
| 		# description | ||||
| 		try: | ||||
| 			lxml.etree | ||||
| 		except NameError: | ||||
| 			video_description = u'No description available.' | ||||
| 			mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage) | ||||
| 			if mobj is not None: | ||||
| 				video_description = mobj.group(1).decode('utf-8') | ||||
| 		else: | ||||
| 			html_parser = lxml.etree.HTMLParser(encoding='utf-8') | ||||
| 			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) | ||||
| 			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) | ||||
| 			# TODO use another parser | ||||
| 		video_description = get_element_by_id("eow-description", video_webpage) | ||||
| 		if video_description: video_description = clean_html(video_description.decode('utf8')) | ||||
| 		else: video_description = '' | ||||
| 			 | ||||
| 		# closed captions | ||||
| 		video_subtitles = None | ||||
| @@ -992,7 +978,7 @@ class YahooIE(InfoExtractor): | ||||
| 			self._downloader.trouble(u'ERROR: Unable to extract media URL') | ||||
| 			return | ||||
| 		video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8') | ||||
| 		video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url) | ||||
| 		video_url = unescapeHTML(video_url) | ||||
|  | ||||
| 		return [{ | ||||
| 			'id':		video_id.decode('utf-8'), | ||||
| @@ -1069,18 +1055,9 @@ class VimeoIE(InfoExtractor): | ||||
| 		video_thumbnail = config["video"]["thumbnail"] | ||||
|  | ||||
| 		# Extract video description | ||||
| 		try: | ||||
| 			lxml.etree | ||||
| 		except NameError: | ||||
| 			video_description = u'No description available.' | ||||
| 			mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE) | ||||
| 			if mobj is not None: | ||||
| 				video_description = mobj.group(1) | ||||
| 		else: | ||||
| 			html_parser = lxml.etree.HTMLParser() | ||||
| 			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser) | ||||
| 			video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip() | ||||
| 			# TODO use another parser | ||||
| 		video_description = get_element_by_id("description", webpage) | ||||
| 		if video_description: video_description = clean_html(video_description.decode('utf8')) | ||||
| 		else: video_description = '' | ||||
|  | ||||
| 		# Extract upload date | ||||
| 		video_upload_date = u'NA' | ||||
| @@ -2248,8 +2225,6 @@ class EscapistIE(InfoExtractor): | ||||
| 		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName) | ||||
|  | ||||
| 	def _real_extract(self, url): | ||||
| 		htmlParser = HTMLParser.HTMLParser() | ||||
|  | ||||
| 		mobj = re.match(self._VALID_URL, url) | ||||
| 		if mobj is None: | ||||
| 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url) | ||||
| @@ -2265,11 +2240,11 @@ class EscapistIE(InfoExtractor): | ||||
| 			return | ||||
|  | ||||
| 		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage) | ||||
| 		description = htmlParser.unescape(descMatch.group(1)) | ||||
| 		description = unescapeHTML(descMatch.group(1)) | ||||
| 		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage) | ||||
| 		imgUrl = htmlParser.unescape(imgMatch.group(1)) | ||||
| 		imgUrl = unescapeHTML(imgMatch.group(1)) | ||||
| 		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage) | ||||
| 		playerUrl = htmlParser.unescape(playerUrlMatch.group(1)) | ||||
| 		playerUrl = unescapeHTML(playerUrlMatch.group(1)) | ||||
| 		configUrlMatch = re.search('config=(.*)$', playerUrl) | ||||
| 		configUrl = urllib2.unquote(configUrlMatch.group(1)) | ||||
|  | ||||
| @@ -2324,8 +2299,6 @@ class CollegeHumorIE(InfoExtractor): | ||||
| 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) | ||||
|  | ||||
| 	def _real_extract(self, url): | ||||
| 		htmlParser = HTMLParser.HTMLParser() | ||||
|  | ||||
| 		mobj = re.match(self._VALID_URL, url) | ||||
| 		if mobj is None: | ||||
| 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url) | ||||
| @@ -2391,8 +2364,6 @@ class XVideosIE(InfoExtractor): | ||||
| 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) | ||||
|  | ||||
| 	def _real_extract(self, url): | ||||
| 		htmlParser = HTMLParser.HTMLParser() | ||||
|  | ||||
| 		mobj = re.match(self._VALID_URL, url) | ||||
| 		if mobj is None: | ||||
| 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url) | ||||
| @@ -2475,8 +2446,6 @@ class SoundcloudIE(InfoExtractor): | ||||
| 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) | ||||
|  | ||||
| 	def _real_extract(self, url): | ||||
| 		htmlParser = HTMLParser.HTMLParser() | ||||
|  | ||||
| 		mobj = re.match(self._VALID_URL, url) | ||||
| 		if mobj is None: | ||||
| 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url) | ||||
| @@ -2561,8 +2530,6 @@ class InfoQIE(InfoExtractor): | ||||
| 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) | ||||
|  | ||||
| 	def _real_extract(self, url): | ||||
| 		htmlParser = HTMLParser.HTMLParser() | ||||
|  | ||||
| 		mobj = re.match(self._VALID_URL, url) | ||||
| 		if mobj is None: | ||||
| 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url) | ||||
| @@ -2782,8 +2749,6 @@ class StanfordOpenClassroomIE(InfoExtractor): | ||||
| 			info['format'] = info['ext'] | ||||
| 			return [info] | ||||
| 		elif mobj.group('course'): # A course page | ||||
| 			unescapeHTML = HTMLParser.HTMLParser().unescape | ||||
|  | ||||
| 			course = mobj.group('course') | ||||
| 			info = { | ||||
| 				'id': simplify_title(course), | ||||
| @@ -2822,8 +2787,6 @@ class StanfordOpenClassroomIE(InfoExtractor): | ||||
| 			return results | ||||
| 			 | ||||
| 		else: # Root page | ||||
| 			unescapeHTML = HTMLParser.HTMLParser().unescape | ||||
|  | ||||
| 			info = { | ||||
| 				'id': 'Stanford OpenClassroom', | ||||
| 				'type': 'playlist', | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Filippo Valsorda
					Filippo Valsorda