Mirror of https://github.com/yt-dlp/yt-dlp.git (synced 2025-10-30 22:25:19 +00:00)
			
		
		
		
	[orf] Use new extraction method (Fixes #2057)
This commit is contained in:
		| @@ -73,6 +73,10 @@ class InfoExtractor(object): | |||||||
|                                  by this field. |                                  by this field. | ||||||
|                                  -1 for default (order by other properties), |                                  -1 for default (order by other properties), | ||||||
|                                  -2 or smaller for less than default. |                                  -2 or smaller for less than default. | ||||||
|  |                     * quality    Order number of the video quality of this | ||||||
|  |                                  format, irrespective of the file format. | ||||||
|  |                                  -1 for default (order by other properties), | ||||||
|  |                                  -2 or smaller for less than default. | ||||||
|     url:            Final video URL. |     url:            Final video URL. | ||||||
|     ext:            Video filename extension. |     ext:            Video filename extension. | ||||||
|     format:         The video format, defaults to ext (used for --get-format) |     format:         The video format, defaults to ext (used for --get-format) | ||||||
| @@ -483,6 +487,7 @@ class InfoExtractor(object): | |||||||
|  |  | ||||||
|             return ( |             return ( | ||||||
|                 preference, |                 preference, | ||||||
|  |                 f.get('quality') if f.get('quality') is not None else -1, | ||||||
|                 f.get('height') if f.get('height') is not None else -1, |                 f.get('height') if f.get('height') is not None else -1, | ||||||
|                 f.get('width') if f.get('width') is not None else -1, |                 f.get('width') if f.get('width') is not None else -1, | ||||||
|                 ext_preference, |                 ext_preference, | ||||||
|   | |||||||
| @@ -1,54 +1,98 @@ | |||||||
| # coding: utf-8 | # coding: utf-8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  |  | ||||||
| import re |  | ||||||
| import xml.etree.ElementTree |  | ||||||
| import json | import json | ||||||
|  | import re | ||||||
|  |  | ||||||
| from .common import InfoExtractor | from .common import InfoExtractor | ||||||
| from ..utils import ( | from ..utils import ( | ||||||
|     compat_urlparse, |     HEADRequest, | ||||||
|     ExtractorError, |     unified_strdate, | ||||||
|     find_xpath_attr, |  | ||||||
| ) | ) | ||||||
|  |  | ||||||
|  |  | ||||||
| class ORFIE(InfoExtractor): | class ORFIE(InfoExtractor): | ||||||
|     _VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)' |     _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)' | ||||||
|  |  | ||||||
|  |     _TEST = { | ||||||
|  |         'url': 'http://tvthek.orf.at/program/matinee-Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7317210/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319746/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319747', | ||||||
|  |         'file': '7319747.mp4', | ||||||
|  |         'md5': 'bd803c5d8c32d3c64a0ea4b4eeddf375', | ||||||
|  |         'info_dict': { | ||||||
|  |             'title': 'Was Sie schon immer über Klassik wissen wollten', | ||||||
|  |             'description': 'md5:0ddf0d5f0060bd53f744edaa5c2e04a4', | ||||||
|  |             'duration': 3508, | ||||||
|  |             'upload_date': '20140105', | ||||||
|  |         }, | ||||||
|  |         'skip': 'Blocked outside of Austria', | ||||||
|  |     } | ||||||
|  |  | ||||||
|     def _real_extract(self, url): |     def _real_extract(self, url): | ||||||
|         mobj = re.match(self._VALID_URL, url) |         mobj = re.match(self._VALID_URL, url) | ||||||
|         playlist_id = mobj.group('id') |         playlist_id = mobj.group('id') | ||||||
|         webpage = self._download_webpage(url, playlist_id) |         webpage = self._download_webpage(url, playlist_id) | ||||||
|  |  | ||||||
|         flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml') |         data_json = self._search_regex( | ||||||
|         flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0] |             r'initializeAdworx\((.+?)\);\n', webpage, 'video info') | ||||||
|         flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8')) |         all_data = json.loads(data_json) | ||||||
|         playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"') |         sdata = all_data[0]['values']['segments'] | ||||||
|         playlist = json.loads(playlist_json) |  | ||||||
|  |  | ||||||
|         videos = [] |         def quality_to_int(s): | ||||||
|         ns = '{http://tempuri.org/XMLSchema.xsd}' |             m = re.search('([0-9]+)', s) | ||||||
|         xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns} |             if m is None: | ||||||
|         webpage_description = self._og_search_description(webpage) |                 return -1 | ||||||
|         for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1): |             return int(m.group(1)) | ||||||
|             # Get best quality url |  | ||||||
|             rtmp_url = None |         entries = [] | ||||||
|             for q in ['Q6A', 'Q4A', 'Q1A']: |         for sd in sdata: | ||||||
|                 video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q) |             video_id = sd['id'] | ||||||
|                 if video_url is not None: |             formats = [{ | ||||||
|                     rtmp_url = video_url.text |                 'preference': -10 if fd['delivery'] == 'hls' else None, | ||||||
|                     break |                 'format_id': '%s-%s-%s' % ( | ||||||
|             if rtmp_url is None: |                     fd['delivery'], fd['quality'], fd['quality_string']), | ||||||
|                 raise ExtractorError(u'Couldn\'t get video url: %s' % info['id']) |                 'url': fd['src'], | ||||||
|             description = self._html_search_regex( |                 'protocol': fd['protocol'], | ||||||
|                 r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage, |                 'quality': quality_to_int(fd['quality']), | ||||||
|                 u'description', default=webpage_description, flags=re.DOTALL) |             } for fd in sd['playlist_item_array']['sources']] | ||||||
|             videos.append({ |  | ||||||
|  |             # Check for geoblocking. | ||||||
|  |             # There is a property is_geoprotection, but that's always false | ||||||
|  |             geo_str = sd.get('geoprotection_string') | ||||||
|  |             if geo_str: | ||||||
|  |                 try: | ||||||
|  |                     http_url = next( | ||||||
|  |                         f['url'] | ||||||
|  |                         for f in formats | ||||||
|  |                         if re.match(r'^https?://.*\.mp4$', f['url'])) | ||||||
|  |                 except StopIteration: | ||||||
|  |                     pass | ||||||
|  |                 else: | ||||||
|  |                     req = HEADRequest(http_url) | ||||||
|  |                     response = self._request_webpage( | ||||||
|  |                         req, video_id, | ||||||
|  |                         note='Testing for geoblocking', | ||||||
|  |                         errnote=(( | ||||||
|  |                             'This video seems to be blocked outside of %s. ' | ||||||
|  |                             'You may want to try the streaming-* formats.') | ||||||
|  |                             % geo_str), | ||||||
|  |                         fatal=False) | ||||||
|  |  | ||||||
|  |             self._sort_formats(formats) | ||||||
|  |  | ||||||
|  |             upload_date = unified_strdate(sd['created_date']) | ||||||
|  |             entries.append({ | ||||||
|                 '_type': 'video', |                 '_type': 'video', | ||||||
|                 'id': info['id'], |                 'id': video_id, | ||||||
|                 'title': info['title'], |                 'title': sd['header'], | ||||||
|                 'url': rtmp_url, |                 'formats': formats, | ||||||
|                 'ext': 'flv', |                 'description': sd.get('description'), | ||||||
|                 'description': description, |                 'duration': int(sd['duration_in_seconds']), | ||||||
|                 }) |                 'upload_date': upload_date, | ||||||
|  |                 'thumbnail': sd.get('image_full_url'), | ||||||
|  |             }) | ||||||
|  |  | ||||||
|         return videos |         return { | ||||||
|  |             '_type': 'playlist', | ||||||
|  |             'entries': entries, | ||||||
|  |             'id': playlist_id, | ||||||
|  |         } | ||||||
|   | |||||||
| @@ -764,6 +764,7 @@ def unified_strdate(date_str): | |||||||
|         '%Y-%m-%d', |         '%Y-%m-%d', | ||||||
|         '%d/%m/%Y', |         '%d/%m/%Y', | ||||||
|         '%Y/%m/%d %H:%M:%S', |         '%Y/%m/%d %H:%M:%S', | ||||||
|  |         '%Y-%m-%d %H:%M:%S', | ||||||
|         '%d.%m.%Y %H:%M', |         '%d.%m.%Y %H:%M', | ||||||
|         '%Y-%m-%dT%H:%M:%SZ', |         '%Y-%m-%dT%H:%M:%SZ', | ||||||
|         '%Y-%m-%dT%H:%M:%S.%fZ', |         '%Y-%m-%dT%H:%M:%S.%fZ', | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Philipp Hagemeister
					Philipp Hagemeister