mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-30 22:25:19 +00:00 
			
		
		
		
	Merge branch 'master' into openload-phantomjs-method
This commit is contained in:
		| @@ -245,6 +245,10 @@ class InfoExtractor(object): | ||||
|                     specified in the URL. | ||||
|     end_time:       Time in seconds where the reproduction should end, as | ||||
|                     specified in the URL. | ||||
|     chapters:       A list of dictionaries, with the following entries: | ||||
|                         * "start_time" - The start time of the chapter in seconds | ||||
|                         * "end_time" - The end time of the chapter in seconds | ||||
|                         * "title" (optional, string) | ||||
|  | ||||
|     The following fields should only be used when the video belongs to some logical | ||||
|     chapter or section: | ||||
| @@ -976,6 +980,23 @@ class InfoExtractor(object): | ||||
|             return info | ||||
|         if isinstance(json_ld, dict): | ||||
|             json_ld = [json_ld] | ||||
|  | ||||
|         def extract_video_object(e): | ||||
|             assert e['@type'] == 'VideoObject' | ||||
|             info.update({ | ||||
|                 'url': e.get('contentUrl'), | ||||
|                 'title': unescapeHTML(e.get('name')), | ||||
|                 'description': unescapeHTML(e.get('description')), | ||||
|                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), | ||||
|                 'duration': parse_duration(e.get('duration')), | ||||
|                 'timestamp': unified_timestamp(e.get('uploadDate')), | ||||
|                 'filesize': float_or_none(e.get('contentSize')), | ||||
|                 'tbr': int_or_none(e.get('bitrate')), | ||||
|                 'width': int_or_none(e.get('width')), | ||||
|                 'height': int_or_none(e.get('height')), | ||||
|                 'view_count': int_or_none(e.get('interactionCount')), | ||||
|             }) | ||||
|  | ||||
|         for e in json_ld: | ||||
|             if e.get('@context') == 'http://schema.org': | ||||
|                 item_type = e.get('@type') | ||||
| @@ -1000,18 +1021,11 @@ class InfoExtractor(object): | ||||
|                         'description': unescapeHTML(e.get('articleBody')), | ||||
|                     }) | ||||
|                 elif item_type == 'VideoObject': | ||||
|                     info.update({ | ||||
|                         'url': e.get('contentUrl'), | ||||
|                         'title': unescapeHTML(e.get('name')), | ||||
|                         'description': unescapeHTML(e.get('description')), | ||||
|                         'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), | ||||
|                         'duration': parse_duration(e.get('duration')), | ||||
|                         'timestamp': unified_timestamp(e.get('uploadDate')), | ||||
|                         'filesize': float_or_none(e.get('contentSize')), | ||||
|                         'tbr': int_or_none(e.get('bitrate')), | ||||
|                         'width': int_or_none(e.get('width')), | ||||
|                         'height': int_or_none(e.get('height')), | ||||
|                     }) | ||||
|                     extract_video_object(e) | ||||
|                 elif item_type == 'WebPage': | ||||
|                     video = e.get('video') | ||||
|                     if isinstance(video, dict) and video.get('@type') == 'VideoObject': | ||||
|                         extract_video_object(video) | ||||
|                 break | ||||
|         return dict((k, v) for k, v in info.items() if v is not None) | ||||
|  | ||||
| @@ -1303,40 +1317,50 @@ class InfoExtractor(object): | ||||
|                               entry_protocol='m3u8', preference=None, | ||||
|                               m3u8_id=None, note=None, errnote=None, | ||||
|                               fatal=True, live=False): | ||||
|  | ||||
|         res = self._download_webpage_handle( | ||||
|             m3u8_url, video_id, | ||||
|             note=note or 'Downloading m3u8 information', | ||||
|             errnote=errnote or 'Failed to download m3u8 information', | ||||
|             fatal=fatal) | ||||
|  | ||||
|         if res is False: | ||||
|             return [] | ||||
|  | ||||
|         m3u8_doc, urlh = res | ||||
|         m3u8_url = urlh.geturl() | ||||
|  | ||||
|         return self._parse_m3u8_formats( | ||||
|             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, | ||||
|             preference=preference, m3u8_id=m3u8_id, live=live) | ||||
|  | ||||
|     def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, | ||||
|                             entry_protocol='m3u8', preference=None, | ||||
|                             m3u8_id=None, live=False): | ||||
|         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access | ||||
|             return [] | ||||
|  | ||||
|         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] | ||||
|         formats = [] | ||||
|  | ||||
|         format_url = lambda u: ( | ||||
|             u | ||||
|             if re.match(r'^https?://', u) | ||||
|             else compat_urlparse.urljoin(m3u8_url, u)) | ||||
|  | ||||
|         # We should try extracting formats only from master playlists [1], i.e. | ||||
|         # playlists that describe available qualities. On the other hand media | ||||
|         # playlists [2] should be returned as is since they contain just the media | ||||
|         # without qualities renditions. | ||||
|         # References: | ||||
|         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21 | ||||
|         # 2. https://github.com/rg3/youtube-dl/issues/12211 | ||||
|  | ||||
|         # We should try extracting formats only from master playlists [1, 4.3.4], | ||||
|         # i.e. playlists that describe available qualities. On the other hand | ||||
|         # media playlists [1, 4.3.3] should be returned as is since they contain | ||||
|         # just the media without qualities renditions. | ||||
|         # Fortunately, master playlist can be easily distinguished from media | ||||
|         # playlist based on particular tags availability. As of [1, 2] master | ||||
|         # playlist tags MUST NOT appear in a media playlist and vice versa. | ||||
|         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist | ||||
|         # and MUST NOT appear in master playlist thus we can clearly detect media | ||||
|         # playlist with this criterion. | ||||
|         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4 | ||||
|         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 | ||||
|         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 | ||||
|         # playlist based on particular tags availability. As per [1, 4.3.3, 4.3.4] | ||||
|         # master playlist tags MUST NOT appear in a media playlist and vice versa. | ||||
|         # As per [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every | ||||
|         # media playlist and MUST NOT appear in master playlist, thus we can | ||||
|         # clearly detect media playlist with this criterion. | ||||
|  | ||||
|         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is | ||||
|             return [{ | ||||
|                 'url': m3u8_url, | ||||
| @@ -1345,52 +1369,72 @@ class InfoExtractor(object): | ||||
|                 'protocol': entry_protocol, | ||||
|                 'preference': preference, | ||||
|             }] | ||||
|         audio_in_video_stream = {} | ||||
|         last_info = {} | ||||
|         last_media = {} | ||||
|  | ||||
|         groups = {} | ||||
|         last_stream_inf = {} | ||||
|  | ||||
|         def extract_media(x_media_line): | ||||
|             media = parse_m3u8_attributes(x_media_line) | ||||
|             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED | ||||
|             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME') | ||||
|             if not (media_type and group_id and name): | ||||
|                 return | ||||
|             groups.setdefault(group_id, []).append(media) | ||||
|             if media_type not in ('VIDEO', 'AUDIO'): | ||||
|                 return | ||||
|             media_url = media.get('URI') | ||||
|             if media_url: | ||||
|                 format_id = [] | ||||
|                 for v in (group_id, name): | ||||
|                     if v: | ||||
|                         format_id.append(v) | ||||
|                 f = { | ||||
|                     'format_id': '-'.join(format_id), | ||||
|                     'url': format_url(media_url), | ||||
|                     'manifest_url': m3u8_url, | ||||
|                     'language': media.get('LANGUAGE'), | ||||
|                     'ext': ext, | ||||
|                     'protocol': entry_protocol, | ||||
|                     'preference': preference, | ||||
|                 } | ||||
|                 if media_type == 'AUDIO': | ||||
|                     f['vcodec'] = 'none' | ||||
|                 formats.append(f) | ||||
|  | ||||
|         def build_stream_name(): | ||||
|             # Although the specification does not mention a NAME attribute | ||||
|             # for the EXT-X-STREAM-INF tag, it is still sometimes present (see | ||||
|             # [1] or vidio test in TestInfoExtractor.test_parse_m3u8_formats) | ||||
|             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015 | ||||
|             stream_name = last_stream_inf.get('NAME') | ||||
|             if stream_name: | ||||
|                 return stream_name | ||||
|             # If there is no NAME in EXT-X-STREAM-INF it will be obtained | ||||
|             # from corresponding rendition group | ||||
|             stream_group_id = last_stream_inf.get('VIDEO') | ||||
|             if not stream_group_id: | ||||
|                 return | ||||
|             stream_group = groups.get(stream_group_id) | ||||
|             if not stream_group: | ||||
|                 return stream_group_id | ||||
|             rendition = stream_group[0] | ||||
|             return rendition.get('NAME') or stream_group_id | ||||
|  | ||||
|         for line in m3u8_doc.splitlines(): | ||||
|             if line.startswith('#EXT-X-STREAM-INF:'): | ||||
|                 last_info = parse_m3u8_attributes(line) | ||||
|                 last_stream_inf = parse_m3u8_attributes(line) | ||||
|             elif line.startswith('#EXT-X-MEDIA:'): | ||||
|                 media = parse_m3u8_attributes(line) | ||||
|                 media_type = media.get('TYPE') | ||||
|                 if media_type in ('VIDEO', 'AUDIO'): | ||||
|                     group_id = media.get('GROUP-ID') | ||||
|                     media_url = media.get('URI') | ||||
|                     if media_url: | ||||
|                         format_id = [] | ||||
|                         for v in (group_id, media.get('NAME')): | ||||
|                             if v: | ||||
|                                 format_id.append(v) | ||||
|                         f = { | ||||
|                             'format_id': '-'.join(format_id), | ||||
|                             'url': format_url(media_url), | ||||
|                             'language': media.get('LANGUAGE'), | ||||
|                             'ext': ext, | ||||
|                             'protocol': entry_protocol, | ||||
|                             'preference': preference, | ||||
|                         } | ||||
|                         if media_type == 'AUDIO': | ||||
|                             f['vcodec'] = 'none' | ||||
|                             if group_id and not audio_in_video_stream.get(group_id): | ||||
|                                 audio_in_video_stream[group_id] = False | ||||
|                         formats.append(f) | ||||
|                     else: | ||||
|                         # When there is no URI in EXT-X-MEDIA let this tag's | ||||
|                         # data be used by regular URI lines below | ||||
|                         last_media = media | ||||
|                         if media_type == 'AUDIO' and group_id: | ||||
|                             audio_in_video_stream[group_id] = True | ||||
|                 extract_media(line) | ||||
|             elif line.startswith('#') or not line.strip(): | ||||
|                 continue | ||||
|             else: | ||||
|                 tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000) | ||||
|                 tbr = float_or_none( | ||||
|                     last_stream_inf.get('AVERAGE-BANDWIDTH') or | ||||
|                     last_stream_inf.get('BANDWIDTH'), scale=1000) | ||||
|                 format_id = [] | ||||
|                 if m3u8_id: | ||||
|                     format_id.append(m3u8_id) | ||||
|                 # Despite specification does not mention NAME attribute for | ||||
|                 # EXT-X-STREAM-INF it still sometimes may be present | ||||
|                 stream_name = last_info.get('NAME') or last_media.get('NAME') | ||||
|                 stream_name = build_stream_name() | ||||
|                 # Bandwidth of live streams may differ over time thus making | ||||
|                 # format_id unpredictable. So it's better to keep provided | ||||
|                 # format_id intact. | ||||
| @@ -1400,14 +1444,14 @@ class InfoExtractor(object): | ||||
|                 f = { | ||||
|                     'format_id': '-'.join(format_id), | ||||
|                     'url': manifest_url, | ||||
|                     'manifest_url': manifest_url, | ||||
|                     'manifest_url': m3u8_url, | ||||
|                     'tbr': tbr, | ||||
|                     'ext': ext, | ||||
|                     'fps': float_or_none(last_info.get('FRAME-RATE')), | ||||
|                     'fps': float_or_none(last_stream_inf.get('FRAME-RATE')), | ||||
|                     'protocol': entry_protocol, | ||||
|                     'preference': preference, | ||||
|                 } | ||||
|                 resolution = last_info.get('RESOLUTION') | ||||
|                 resolution = last_stream_inf.get('RESOLUTION') | ||||
|                 if resolution: | ||||
|                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution) | ||||
|                     if mobj: | ||||
| @@ -1423,13 +1467,26 @@ class InfoExtractor(object): | ||||
|                         'vbr': vbr, | ||||
|                         'abr': abr, | ||||
|                     }) | ||||
|                 f.update(parse_codecs(last_info.get('CODECS'))) | ||||
|                 if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none': | ||||
|                     # TODO: update acodec for audio only formats with the same GROUP-ID | ||||
|                     f['acodec'] = 'none' | ||||
|                 codecs = parse_codecs(last_stream_inf.get('CODECS')) | ||||
|                 f.update(codecs) | ||||
|                 audio_group_id = last_stream_inf.get('AUDIO') | ||||
|                 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which | ||||
|                 # references a rendition group MUST have a CODECS attribute. | ||||
|                 # However, this is not always respected, for example, [2] | ||||
|                 # contains EXT-X-STREAM-INF tag which references AUDIO | ||||
|                 # rendition group but does not have CODECS and despite | ||||
|                 # referencing an audio group, it represents | ||||
|                 # a complete (with audio and video) format. So, for such cases | ||||
|                 # we will ignore references to rendition groups and treat them | ||||
|                 # as complete formats. | ||||
|                 if audio_group_id and codecs and f.get('vcodec') != 'none': | ||||
|                     audio_group = groups.get(audio_group_id) | ||||
|                     if audio_group and audio_group[0].get('URI'): | ||||
|                         # TODO: update acodec for audio only formats with | ||||
|                         # the same GROUP-ID | ||||
|                         f['acodec'] = 'none' | ||||
|                 formats.append(f) | ||||
|                 last_info = {} | ||||
|                 last_media = {} | ||||
|                 last_stream_inf = {} | ||||
|         return formats | ||||
|  | ||||
|     @staticmethod | ||||
| @@ -1803,7 +1860,7 @@ class InfoExtractor(object): | ||||
|                             'ext': mimetype2ext(mime_type), | ||||
|                             'width': int_or_none(representation_attrib.get('width')), | ||||
|                             'height': int_or_none(representation_attrib.get('height')), | ||||
|                             'tbr': int_or_none(bandwidth, 1000), | ||||
|                             'tbr': float_or_none(bandwidth, 1000), | ||||
|                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')), | ||||
|                             'fps': int_or_none(representation_attrib.get('frameRate')), | ||||
|                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, | ||||
| @@ -2182,7 +2239,7 @@ class InfoExtractor(object): | ||||
|  | ||||
|     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): | ||||
|         mobj = re.search( | ||||
|             r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)', | ||||
|             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)', | ||||
|             webpage) | ||||
|         if mobj: | ||||
|             try: | ||||
| @@ -2258,11 +2315,17 @@ class InfoExtractor(object): | ||||
|  | ||||
|     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, | ||||
|                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): | ||||
|         urls = [] | ||||
|         formats = [] | ||||
|         for source in jwplayer_sources_data: | ||||
|             source_url = self._proto_relative_url(source['file']) | ||||
|             source_url = self._proto_relative_url(source.get('file')) | ||||
|             if not source_url: | ||||
|                 continue | ||||
|             if base_url: | ||||
|                 source_url = compat_urlparse.urljoin(base_url, source_url) | ||||
|             if source_url in urls: | ||||
|                 continue | ||||
|             urls.append(source_url) | ||||
|             source_type = source.get('type') or '' | ||||
|             ext = mimetype2ext(source_type) or determine_ext(source_url) | ||||
|             if source_type == 'hls' or ext == 'm3u8': | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Tithen-Firion
					Tithen-Firion