mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-30 22:25:19 +00:00 
			
		
		
		
	[mixcloud] Rewrite extractor (fixes #278)
This commit is contained in:
		| @@ -5,34 +5,27 @@ import socket | |||||||
| from .common import InfoExtractor | from .common import InfoExtractor | ||||||
| from ..utils import ( | from ..utils import ( | ||||||
|     compat_http_client, |     compat_http_client, | ||||||
|     compat_str, |  | ||||||
|     compat_urllib_error, |     compat_urllib_error, | ||||||
|     compat_urllib_request, |     compat_urllib_request, | ||||||
|  |     unified_strdate, | ||||||
|     ExtractorError, |  | ||||||
| ) | ) | ||||||
|  |  | ||||||
|  |  | ||||||
| class MixcloudIE(InfoExtractor): | class MixcloudIE(InfoExtractor): | ||||||
|     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/ |  | ||||||
|     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' |     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' | ||||||
|     IE_NAME = u'mixcloud' |     IE_NAME = u'mixcloud' | ||||||
|  |  | ||||||
|     def report_download_json(self, file_id): |     _TEST = { | ||||||
|         """Report JSON download.""" |         u'url': u'http://www.mixcloud.com/dholbach/cryptkeeper/', | ||||||
|         self.to_screen(u'Downloading json') |         u'file': u'dholbach-cryptkeeper.mp3', | ||||||
|  |         u'info_dict': { | ||||||
|     def get_urls(self, jsonData, fmt, bitrate='best'): |             u'title': u'Cryptkeeper', | ||||||
|         """Get urls from 'audio_formats' section in json""" |             u'description': u'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', | ||||||
|         try: |             u'uploader': u'Daniel Holbach', | ||||||
|             bitrate_list = jsonData[fmt] |             u'uploader_id': u'dholbach', | ||||||
|             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: |             u'upload_date': u'20111115', | ||||||
|                 bitrate = max(bitrate_list) # select highest |         }, | ||||||
|  |     } | ||||||
|             url_list = jsonData[fmt][bitrate] |  | ||||||
|         except TypeError: # we have no bitrate info. |  | ||||||
|             url_list = jsonData[fmt] |  | ||||||
|         return url_list |  | ||||||
|  |  | ||||||
|     def check_urls(self, url_list): |     def check_urls(self, url_list): | ||||||
|         """Returns 1st active url from list""" |         """Returns 1st active url from list""" | ||||||
| @@ -45,71 +38,32 @@ class MixcloudIE(InfoExtractor): | |||||||
|  |  | ||||||
|         return None |         return None | ||||||
|  |  | ||||||
|     def _print_formats(self, formats): |  | ||||||
|         print('Available formats:') |  | ||||||
|         for fmt in formats.keys(): |  | ||||||
|             for b in formats[fmt]: |  | ||||||
|                 try: |  | ||||||
|                     ext = formats[fmt][b][0] |  | ||||||
|                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])) |  | ||||||
|                 except TypeError: # we have no bitrate info |  | ||||||
|                     ext = formats[fmt][0] |  | ||||||
|                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])) |  | ||||||
|                     break |  | ||||||
|  |  | ||||||
|     def _real_extract(self, url): |     def _real_extract(self, url): | ||||||
|         mobj = re.match(self._VALID_URL, url) |         mobj = re.match(self._VALID_URL, url) | ||||||
|         if mobj is None: |  | ||||||
|             raise ExtractorError(u'Invalid URL: %s' % url) |  | ||||||
|         # extract uploader & filename from url |  | ||||||
|         uploader = mobj.group(1).decode('utf-8') |  | ||||||
|         file_id = uploader + "-" + mobj.group(2).decode('utf-8') |  | ||||||
|  |  | ||||||
|         # construct API request |         uploader = mobj.group(1) | ||||||
|         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json' |         cloudcast_name = mobj.group(2) | ||||||
|         # retrieve .json file with links to files |         track_id = '-'.join((uploader, cloudcast_name)) | ||||||
|         request = compat_urllib_request.Request(file_url) |         api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name) | ||||||
|         try: |         webpage = self._download_webpage(url, track_id) | ||||||
|             self.report_download_json(file_url) |         json_data = self._download_webpage(api_url, track_id, | ||||||
|             jsonData = compat_urllib_request.urlopen(request).read() |             u'Downloading cloudcast info') | ||||||
|         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: |         info = json.loads(json_data) | ||||||
|             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err)) |  | ||||||
|  |  | ||||||
|         # parse JSON |         preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url') | ||||||
|         json_data = json.loads(jsonData) |         song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') | ||||||
|         player_url = json_data['player_swf_url'] |         template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) | ||||||
|         formats = dict(json_data['audio_formats']) |         final_song_url = self.check_urls(template_url % i for i in range(30)) | ||||||
|  |  | ||||||
|         req_format = self._downloader.params.get('format', None) |         return { | ||||||
|  |             'id': track_id, | ||||||
|         if self._downloader.params.get('listformats', None): |             'title': info['name'], | ||||||
|             self._print_formats(formats) |             'url': final_song_url, | ||||||
|             return |             'ext': 'mp3', | ||||||
|  |             'description': info['description'], | ||||||
|         if req_format is None or req_format == 'best': |             'thumbnail': info['pictures'].get('extra_large'), | ||||||
|             for format_param in formats.keys(): |             'uploader': info['user']['name'], | ||||||
|                 url_list = self.get_urls(formats, format_param) |             'uploader_id': info['user']['username'], | ||||||
|                 # check urls |             'upload_date': unified_strdate(info['created_time']), | ||||||
|                 file_url = self.check_urls(url_list) |             'view_count': info['play_count'], | ||||||
|                 if file_url is not None: |         } | ||||||
|                     break # got it! |  | ||||||
|         else: |  | ||||||
|             if req_format not in formats: |  | ||||||
|                 raise ExtractorError(u'Format is not available') |  | ||||||
|  |  | ||||||
|             url_list = self.get_urls(formats, req_format) |  | ||||||
|             file_url = self.check_urls(url_list) |  | ||||||
|             format_param = req_format |  | ||||||
|  |  | ||||||
|         return [{ |  | ||||||
|             'id': file_id.decode('utf-8'), |  | ||||||
|             'url': file_url.decode('utf-8'), |  | ||||||
|             'uploader': uploader.decode('utf-8'), |  | ||||||
|             'upload_date': None, |  | ||||||
|             'title': json_data['name'], |  | ||||||
|             'ext': file_url.split('.')[-1].decode('utf-8'), |  | ||||||
|             'format': (format_param is None and u'NA' or format_param.decode('utf-8')), |  | ||||||
|             'thumbnail': json_data['thumbnail_url'], |  | ||||||
|             'description': json_data['description'], |  | ||||||
|             'player_url': player_url.decode('utf-8'), |  | ||||||
|         }] |  | ||||||
|   | |||||||
| @@ -700,7 +700,16 @@ def unified_strdate(date_str): | |||||||
|     date_str = date_str.replace(',',' ') |     date_str = date_str.replace(',',' ') | ||||||
|     # %z (UTC offset) is only supported in python>=3.2 |     # %z (UTC offset) is only supported in python>=3.2 | ||||||
|     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) |     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) | ||||||
|     format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M'] |     format_expressions = [ | ||||||
|  |         '%d %B %Y', | ||||||
|  |         '%B %d %Y', | ||||||
|  |         '%b %d %Y', | ||||||
|  |         '%Y-%m-%d', | ||||||
|  |         '%d/%m/%Y', | ||||||
|  |         '%Y/%m/%d %H:%M:%S', | ||||||
|  |         '%d.%m.%Y %H:%M', | ||||||
|  |         '%Y-%m-%dT%H:%M:%SZ', | ||||||
|  |     ] | ||||||
|     for expression in format_expressions: |     for expression in format_expressions: | ||||||
|         try: |         try: | ||||||
|             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') |             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Jaime Marquínez Ferrándiz
					Jaime Marquínez Ferrándiz