mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 14:45:14 +00:00 
			
		
		
		
	Move StanfordOpenClassroomIE into its own file (youtube_dl/extractor/stanfordoc.py)
This commit is contained in:
		| @@ -37,6 +37,7 @@ from .extractor.myvideo import MyVideoIE | |||||||
| from .extractor.statigram import StatigramIE | from .extractor.statigram import StatigramIE | ||||||
| from .extractor.photobucket import PhotobucketIE | from .extractor.photobucket import PhotobucketIE | ||||||
| from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE | from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE | ||||||
|  | from .extractor.stanfordoc import StanfordOpenClassroomIE | ||||||
| from .extractor.vimeo import VimeoIE | from .extractor.vimeo import VimeoIE | ||||||
| from .extractor.xvideos import XVideosIE | from .extractor.xvideos import XVideosIE | ||||||
| from .extractor.yahoo import YahooIE, YahooSearchIE | from .extractor.yahoo import YahooIE, YahooSearchIE | ||||||
| @@ -150,101 +151,6 @@ class MixcloudIE(InfoExtractor): | |||||||
|             'player_url': player_url.decode('utf-8'), |             'player_url': player_url.decode('utf-8'), | ||||||
|         }] |         }] | ||||||
|  |  | ||||||
| class StanfordOpenClassroomIE(InfoExtractor): |  | ||||||
|     """Information extractor for Stanford's Open ClassRoom""" |  | ||||||
|  |  | ||||||
|     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' |  | ||||||
|     IE_NAME = u'stanfordoc' |  | ||||||
|  |  | ||||||
|     def _real_extract(self, url): |  | ||||||
|         mobj = re.match(self._VALID_URL, url) |  | ||||||
|         if mobj is None: |  | ||||||
|             raise ExtractorError(u'Invalid URL: %s' % url) |  | ||||||
|  |  | ||||||
|         if mobj.group('course') and mobj.group('video'): # A specific video |  | ||||||
|             course = mobj.group('course') |  | ||||||
|             video = mobj.group('video') |  | ||||||
|             info = { |  | ||||||
|                 'id': course + '_' + video, |  | ||||||
|                 'uploader': None, |  | ||||||
|                 'upload_date': None, |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             self.report_extraction(info['id']) |  | ||||||
|             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' |  | ||||||
|             xmlUrl = baseUrl + video + '.xml' |  | ||||||
|             try: |  | ||||||
|                 metaXml = compat_urllib_request.urlopen(xmlUrl).read() |  | ||||||
|             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: |  | ||||||
|                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) |  | ||||||
|             mdoc = xml.etree.ElementTree.fromstring(metaXml) |  | ||||||
|             try: |  | ||||||
|                 info['title'] = mdoc.findall('./title')[0].text |  | ||||||
|                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text |  | ||||||
|             except IndexError: |  | ||||||
|                 raise ExtractorError(u'Invalid metadata XML file') |  | ||||||
|             info['ext'] = info['url'].rpartition('.')[2] |  | ||||||
|             return [info] |  | ||||||
|         elif mobj.group('course'): # A course page |  | ||||||
|             course = mobj.group('course') |  | ||||||
|             info = { |  | ||||||
|                 'id': course, |  | ||||||
|                 'type': 'playlist', |  | ||||||
|                 'uploader': None, |  | ||||||
|                 'upload_date': None, |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             coursepage = self._download_webpage(url, info['id'], |  | ||||||
|                                         note='Downloading course info page', |  | ||||||
|                                         errnote='Unable to download course info page') |  | ||||||
|  |  | ||||||
|             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) |  | ||||||
|  |  | ||||||
|             info['description'] = self._html_search_regex('<description>([^<]+)</description>', |  | ||||||
|                 coursepage, u'description', fatal=False) |  | ||||||
|  |  | ||||||
|             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) |  | ||||||
|             info['list'] = [ |  | ||||||
|                 { |  | ||||||
|                     'type': 'reference', |  | ||||||
|                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), |  | ||||||
|                 } |  | ||||||
|                     for vpage in links] |  | ||||||
|             results = [] |  | ||||||
|             for entry in info['list']: |  | ||||||
|                 assert entry['type'] == 'reference' |  | ||||||
|                 results += self.extract(entry['url']) |  | ||||||
|             return results |  | ||||||
|         else: # Root page |  | ||||||
|             info = { |  | ||||||
|                 'id': 'Stanford OpenClassroom', |  | ||||||
|                 'type': 'playlist', |  | ||||||
|                 'uploader': None, |  | ||||||
|                 'upload_date': None, |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             self.report_download_webpage(info['id']) |  | ||||||
|             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' |  | ||||||
|             try: |  | ||||||
|                 rootpage = compat_urllib_request.urlopen(rootURL).read() |  | ||||||
|             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: |  | ||||||
|                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err)) |  | ||||||
|  |  | ||||||
|             info['title'] = info['id'] |  | ||||||
|  |  | ||||||
|             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) |  | ||||||
|             info['list'] = [ |  | ||||||
|                 { |  | ||||||
|                     'type': 'reference', |  | ||||||
|                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage), |  | ||||||
|                 } |  | ||||||
|                     for cpage in links] |  | ||||||
|  |  | ||||||
|             results = [] |  | ||||||
|             for entry in info['list']: |  | ||||||
|                 assert entry['type'] == 'reference' |  | ||||||
|                 results += self.extract(entry['url']) |  | ||||||
|             return results |  | ||||||
|  |  | ||||||
| class MTVIE(InfoExtractor): | class MTVIE(InfoExtractor): | ||||||
|     """Information extractor for MTV.com""" |     """Information extractor for MTV.com""" | ||||||
|   | |||||||
							
								
								
									
										112
									
								
								youtube_dl/extractor/stanfordoc.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										112
									
								
								youtube_dl/extractor/stanfordoc.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,112 @@ | |||||||
|  | import re | ||||||
|  | import socket | ||||||
|  | import xml.etree.ElementTree | ||||||
|  |  | ||||||
|  | from .common import InfoExtractor | ||||||
|  | from ..utils import ( | ||||||
|  |     compat_http_client, | ||||||
|  |     compat_str, | ||||||
|  |     compat_urllib_error, | ||||||
|  |     compat_urllib_request, | ||||||
|  |  | ||||||
|  |     ExtractorError, | ||||||
|  |     orderedSet, | ||||||
|  |     unescapeHTML, | ||||||
|  | ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class StanfordOpenClassroomIE(InfoExtractor): | ||||||
|  |     """Information extractor for Stanford's Open ClassRoom""" | ||||||
|  |  | ||||||
|  |     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' | ||||||
|  |     IE_NAME = u'stanfordoc' | ||||||
|  |  | ||||||
|  |     def _real_extract(self, url): | ||||||
|  |         mobj = re.match(self._VALID_URL, url) | ||||||
|  |         if mobj is None: | ||||||
|  |             raise ExtractorError(u'Invalid URL: %s' % url) | ||||||
|  |  | ||||||
|  |         if mobj.group('course') and mobj.group('video'): # A specific video | ||||||
|  |             course = mobj.group('course') | ||||||
|  |             video = mobj.group('video') | ||||||
|  |             info = { | ||||||
|  |                 'id': course + '_' + video, | ||||||
|  |                 'uploader': None, | ||||||
|  |                 'upload_date': None, | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             self.report_extraction(info['id']) | ||||||
|  |             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' | ||||||
|  |             xmlUrl = baseUrl + video + '.xml' | ||||||
|  |             try: | ||||||
|  |                 metaXml = compat_urllib_request.urlopen(xmlUrl).read() | ||||||
|  |             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | ||||||
|  |                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) | ||||||
|  |             mdoc = xml.etree.ElementTree.fromstring(metaXml) | ||||||
|  |             try: | ||||||
|  |                 info['title'] = mdoc.findall('./title')[0].text | ||||||
|  |                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text | ||||||
|  |             except IndexError: | ||||||
|  |                 raise ExtractorError(u'Invalid metadata XML file') | ||||||
|  |             info['ext'] = info['url'].rpartition('.')[2] | ||||||
|  |             return [info] | ||||||
|  |         elif mobj.group('course'): # A course page | ||||||
|  |             course = mobj.group('course') | ||||||
|  |             info = { | ||||||
|  |                 'id': course, | ||||||
|  |                 'type': 'playlist', | ||||||
|  |                 'uploader': None, | ||||||
|  |                 'upload_date': None, | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             coursepage = self._download_webpage(url, info['id'], | ||||||
|  |                                         note='Downloading course info page', | ||||||
|  |                                         errnote='Unable to download course info page') | ||||||
|  |  | ||||||
|  |             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) | ||||||
|  |  | ||||||
|  |             info['description'] = self._html_search_regex('<description>([^<]+)</description>', | ||||||
|  |                 coursepage, u'description', fatal=False) | ||||||
|  |  | ||||||
|  |             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) | ||||||
|  |             info['list'] = [ | ||||||
|  |                 { | ||||||
|  |                     'type': 'reference', | ||||||
|  |                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), | ||||||
|  |                 } | ||||||
|  |                     for vpage in links] | ||||||
|  |             results = [] | ||||||
|  |             for entry in info['list']: | ||||||
|  |                 assert entry['type'] == 'reference' | ||||||
|  |                 results += self.extract(entry['url']) | ||||||
|  |             return results | ||||||
|  |         else: # Root page | ||||||
|  |             info = { | ||||||
|  |                 'id': 'Stanford OpenClassroom', | ||||||
|  |                 'type': 'playlist', | ||||||
|  |                 'uploader': None, | ||||||
|  |                 'upload_date': None, | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             self.report_download_webpage(info['id']) | ||||||
|  |             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' | ||||||
|  |             try: | ||||||
|  |                 rootpage = compat_urllib_request.urlopen(rootURL).read() | ||||||
|  |             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: | ||||||
|  |                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err)) | ||||||
|  |  | ||||||
|  |             info['title'] = info['id'] | ||||||
|  |  | ||||||
|  |             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) | ||||||
|  |             info['list'] = [ | ||||||
|  |                 { | ||||||
|  |                     'type': 'reference', | ||||||
|  |                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage), | ||||||
|  |                 } | ||||||
|  |                     for cpage in links] | ||||||
|  |  | ||||||
|  |             results = [] | ||||||
|  |             for entry in info['list']: | ||||||
|  |                 assert entry['type'] == 'reference' | ||||||
|  |                 results += self.extract(entry['url']) | ||||||
|  |             return results | ||||||
		Reference in New Issue
	
	Block a user
	 Philipp Hagemeister
					Philipp Hagemeister