Mirror of https://github.com/yt-dlp/yt-dlp.git (synced 2025-10-30 22:25:19 +00:00)
			
		
		
		
	[orf] Use new extraction method (Fixes #2057)
This commit is contained in:
		| @@ -73,6 +73,10 @@ class InfoExtractor(object): | ||||
|                                  by this field. | ||||
|                                  -1 for default (order by other properties), | ||||
|                                  -2 or smaller for less than default. | ||||
|                     * quality    Order number of the video quality of this | ||||
|                                  format, irrespective of the file format. | ||||
|                                  -1 for default (order by other properties), | ||||
|                                  -2 or smaller for less than default. | ||||
|     url:            Final video URL. | ||||
|     ext:            Video filename extension. | ||||
|     format:         The video format, defaults to ext (used for --get-format) | ||||
| @@ -483,6 +487,7 @@ class InfoExtractor(object): | ||||
|  | ||||
|             return ( | ||||
|                 preference, | ||||
|                 f.get('quality') if f.get('quality') is not None else -1, | ||||
|                 f.get('height') if f.get('height') is not None else -1, | ||||
|                 f.get('width') if f.get('width') is not None else -1, | ||||
|                 ext_preference, | ||||
|   | ||||
| @@ -1,54 +1,98 @@ | ||||
| # coding: utf-8 | ||||
| from __future__ import unicode_literals | ||||
|  | ||||
| import re | ||||
| import xml.etree.ElementTree | ||||
| import json | ||||
| import re | ||||
|  | ||||
| from .common import InfoExtractor | ||||
| from ..utils import ( | ||||
|     compat_urlparse, | ||||
|     ExtractorError, | ||||
|     find_xpath_attr, | ||||
|     HEADRequest, | ||||
|     unified_strdate, | ||||
| ) | ||||
|  | ||||
|  | ||||
| class ORFIE(InfoExtractor): | ||||
|     _VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)' | ||||
|     _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)' | ||||
|  | ||||
|     _TEST = { | ||||
|         'url': 'http://tvthek.orf.at/program/matinee-Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7317210/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319746/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319747', | ||||
|         'file': '7319747.mp4', | ||||
|         'md5': 'bd803c5d8c32d3c64a0ea4b4eeddf375', | ||||
|         'info_dict': { | ||||
|             'title': 'Was Sie schon immer über Klassik wissen wollten', | ||||
|             'description': 'md5:0ddf0d5f0060bd53f744edaa5c2e04a4', | ||||
|             'duration': 3508, | ||||
|             'upload_date': '20140105', | ||||
|         }, | ||||
|         'skip': 'Blocked outside of Austria', | ||||
|     } | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         mobj = re.match(self._VALID_URL, url) | ||||
|         playlist_id = mobj.group('id') | ||||
|         webpage = self._download_webpage(url, playlist_id) | ||||
|  | ||||
|         flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml') | ||||
|         flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0] | ||||
|         flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8')) | ||||
|         playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"') | ||||
|         playlist = json.loads(playlist_json) | ||||
|         data_json = self._search_regex( | ||||
|             r'initializeAdworx\((.+?)\);\n', webpage, 'video info') | ||||
|         all_data = json.loads(data_json) | ||||
|         sdata = all_data[0]['values']['segments'] | ||||
|  | ||||
|         videos = [] | ||||
|         ns = '{http://tempuri.org/XMLSchema.xsd}' | ||||
|         xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns} | ||||
|         webpage_description = self._og_search_description(webpage) | ||||
|         for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1): | ||||
|             # Get best quality url | ||||
|             rtmp_url = None | ||||
|             for q in ['Q6A', 'Q4A', 'Q1A']: | ||||
|                 video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q) | ||||
|                 if video_url is not None: | ||||
|                     rtmp_url = video_url.text | ||||
|                     break | ||||
|             if rtmp_url is None: | ||||
|                 raise ExtractorError(u'Couldn\'t get video url: %s' % info['id']) | ||||
|             description = self._html_search_regex( | ||||
|                 r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage, | ||||
|                 u'description', default=webpage_description, flags=re.DOTALL) | ||||
|             videos.append({ | ||||
|         def quality_to_int(s): | ||||
|             m = re.search('([0-9]+)', s) | ||||
|             if m is None: | ||||
|                 return -1 | ||||
|             return int(m.group(1)) | ||||
|  | ||||
|         entries = [] | ||||
|         for sd in sdata: | ||||
|             video_id = sd['id'] | ||||
|             formats = [{ | ||||
|                 'preference': -10 if fd['delivery'] == 'hls' else None, | ||||
|                 'format_id': '%s-%s-%s' % ( | ||||
|                     fd['delivery'], fd['quality'], fd['quality_string']), | ||||
|                 'url': fd['src'], | ||||
|                 'protocol': fd['protocol'], | ||||
|                 'quality': quality_to_int(fd['quality']), | ||||
|             } for fd in sd['playlist_item_array']['sources']] | ||||
|  | ||||
|             # Check for geoblocking. | ||||
|             # There is a property is_geoprotection, but that's always false | ||||
|             geo_str = sd.get('geoprotection_string') | ||||
|             if geo_str: | ||||
|                 try: | ||||
|                     http_url = next( | ||||
|                         f['url'] | ||||
|                         for f in formats | ||||
|                         if re.match(r'^https?://.*\.mp4$', f['url'])) | ||||
|                 except StopIteration: | ||||
|                     pass | ||||
|                 else: | ||||
|                     req = HEADRequest(http_url) | ||||
|                     response = self._request_webpage( | ||||
|                         req, video_id, | ||||
|                         note='Testing for geoblocking', | ||||
|                         errnote=(( | ||||
|                             'This video seems to be blocked outside of %s. ' | ||||
|                             'You may want to try the streaming-* formats.') | ||||
|                             % geo_str), | ||||
|                         fatal=False) | ||||
|  | ||||
|             self._sort_formats(formats) | ||||
|  | ||||
|             upload_date = unified_strdate(sd['created_date']) | ||||
|             entries.append({ | ||||
|                 '_type': 'video', | ||||
|                 'id': info['id'], | ||||
|                 'title': info['title'], | ||||
|                 'url': rtmp_url, | ||||
|                 'ext': 'flv', | ||||
|                 'description': description, | ||||
|                 }) | ||||
|                 'id': video_id, | ||||
|                 'title': sd['header'], | ||||
|                 'formats': formats, | ||||
|                 'description': sd.get('description'), | ||||
|                 'duration': int(sd['duration_in_seconds']), | ||||
|                 'upload_date': upload_date, | ||||
|                 'thumbnail': sd.get('image_full_url'), | ||||
|             }) | ||||
|  | ||||
|         return videos | ||||
|         return { | ||||
|             '_type': 'playlist', | ||||
|             'entries': entries, | ||||
|             'id': playlist_id, | ||||
|         } | ||||
|   | ||||
| @@ -764,6 +764,7 @@ def unified_strdate(date_str): | ||||
|         '%Y-%m-%d', | ||||
|         '%d/%m/%Y', | ||||
|         '%Y/%m/%d %H:%M:%S', | ||||
|         '%Y-%m-%d %H:%M:%S', | ||||
|         '%d.%m.%Y %H:%M', | ||||
|         '%Y-%m-%dT%H:%M:%SZ', | ||||
|         '%Y-%m-%dT%H:%M:%S.%fZ', | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Philipp Hagemeister
					Philipp Hagemeister