mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-30 22:25:19 +00:00 
			
		
		
		
	[extractor/common] Extract f4m and m3u8 formats, subtitles and info
This commit is contained in:
		| @@ -18,6 +18,7 @@ from ..compat import ( | ||||
|     compat_HTTPError, | ||||
|     compat_http_client, | ||||
|     compat_urllib_error, | ||||
|     compat_urllib_parse, | ||||
|     compat_urllib_parse_urlparse, | ||||
|     compat_urllib_request, | ||||
|     compat_urlparse, | ||||
| @@ -37,6 +38,7 @@ from ..utils import ( | ||||
|     RegexNotFoundError, | ||||
|     sanitize_filename, | ||||
|     unescapeHTML, | ||||
|     url_basename, | ||||
| ) | ||||
|  | ||||
|  | ||||
| @@ -978,69 +980,165 @@ class InfoExtractor(object): | ||||
|         self._sort_formats(formats) | ||||
|         return formats | ||||
|  | ||||
|     # TODO: improve extraction | ||||
|     def _extract_smil_formats(self, smil_url, video_id, fatal=True): | ||||
|         smil = self._download_xml( | ||||
|             smil_url, video_id, 'Downloading SMIL file', | ||||
|             'Unable to download SMIL file', fatal=fatal) | ||||
|     @staticmethod | ||||
|     def _xpath_ns(path, namespace=None): | ||||
|         if not namespace: | ||||
|             return path | ||||
|         out = [] | ||||
|         for c in path.split('/'): | ||||
|             if not c or c == '.': | ||||
|                 out.append(c) | ||||
|             else: | ||||
|                 out.append('{%s}%s' % (namespace, c)) | ||||
|         return '/'.join(out) | ||||
|  | ||||
|     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None): | ||||
|         smil = self._download_smil(smil_url, video_id, fatal=fatal) | ||||
|  | ||||
|         if smil is False: | ||||
|             assert not fatal | ||||
|             return [] | ||||
|  | ||||
|         base = smil.find('./head/meta').get('base') | ||||
|         namespace = self._search_regex( | ||||
|             r'{([^}]+)?}smil', smil.tag, 'namespace', default=None) | ||||
|  | ||||
|         return self._parse_smil_formats( | ||||
|             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) | ||||
|  | ||||
|     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): | ||||
|         smil = self._download_smil(smil_url, video_id, fatal=fatal) | ||||
|         if smil is False: | ||||
|             return {} | ||||
|         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) | ||||
|  | ||||
|     def _download_smil(self, smil_url, video_id, fatal=True): | ||||
|         return self._download_xml( | ||||
|             smil_url, video_id, 'Downloading SMIL file', | ||||
|             'Unable to download SMIL file', fatal=fatal) | ||||
|  | ||||
|     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): | ||||
|         namespace = self._search_regex( | ||||
|             r'{([^}]+)?}smil', smil.tag, 'namespace', default=None) | ||||
|  | ||||
|         formats = self._parse_smil_formats( | ||||
|             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) | ||||
|         subtitles = self._parse_smil_subtitles(smil, namespace=namespace) | ||||
|  | ||||
|         video_id = os.path.splitext(url_basename(smil_url))[0] | ||||
|         title = None | ||||
|         description = None | ||||
|         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): | ||||
|             name = meta.attrib.get('name') | ||||
|             content = meta.attrib.get('content') | ||||
|             if not name or not content: | ||||
|                 continue | ||||
|             if not title and name == 'title': | ||||
|                 title = content | ||||
|             elif not description and name in ('description', 'abstract'): | ||||
|                 description = content | ||||
|  | ||||
|         return { | ||||
|             'id': video_id, | ||||
|             'title': title or video_id, | ||||
|             'description': description, | ||||
|             'formats': formats, | ||||
|             'subtitles': subtitles, | ||||
|         } | ||||
|  | ||||
|     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None): | ||||
|         base = smil_url | ||||
|         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): | ||||
|             b = meta.get('base') or meta.get('httpBase') | ||||
|             if b: | ||||
|                 base = b | ||||
|                 break | ||||
|  | ||||
|         formats = [] | ||||
|         rtmp_count = 0 | ||||
|         if smil.findall('./body/seq/video'): | ||||
|             video = smil.findall('./body/seq/video')[0] | ||||
|             fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count) | ||||
|             formats.extend(fmts) | ||||
|         else: | ||||
|             for video in smil.findall('./body/switch/video'): | ||||
|                 fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count) | ||||
|                 formats.extend(fmts) | ||||
|         http_count = 0 | ||||
|  | ||||
|         videos = smil.findall(self._xpath_ns('.//video', namespace)) | ||||
|         for video in videos: | ||||
|             src = video.get('src') | ||||
|             if not src: | ||||
|                 continue | ||||
|  | ||||
|             bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) | ||||
|             filesize = int_or_none(video.get('size') or video.get('fileSize')) | ||||
|             width = int_or_none(video.get('width')) | ||||
|             height = int_or_none(video.get('height')) | ||||
|             proto = video.get('proto') | ||||
|             ext = video.get('ext') | ||||
|             src_ext = determine_ext(src) | ||||
|             streamer = video.get('streamer') or base | ||||
|  | ||||
|             if proto == 'rtmp' or streamer.startswith('rtmp'): | ||||
|                 rtmp_count += 1 | ||||
|                 formats.append({ | ||||
|                     'url': streamer, | ||||
|                     'play_path': src, | ||||
|                     'ext': 'flv', | ||||
|                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), | ||||
|                     'tbr': bitrate, | ||||
|                     'filesize': filesize, | ||||
|                     'width': width, | ||||
|                     'height': height, | ||||
|                 }) | ||||
|                 continue | ||||
|  | ||||
|             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) | ||||
|  | ||||
|             if proto == 'm3u8' or src_ext == 'm3u8': | ||||
|                 formats.extend(self._extract_m3u8_formats( | ||||
|                     src_url, video_id, ext or 'mp4', m3u8_id='hls')) | ||||
|                 continue | ||||
|  | ||||
|             if src_ext == 'f4m': | ||||
|                 f4m_url = src_url | ||||
|                 if not f4m_params: | ||||
|                     f4m_params = { | ||||
|                         'hdcore': '3.2.0', | ||||
|                         'plugin': 'flowplayer-3.2.0.1', | ||||
|                     } | ||||
|                 f4m_url += '&' if '?' in f4m_url else '?' | ||||
|                 f4m_url += compat_urllib_parse.urlencode(f4m_params).encode('utf-8') | ||||
|                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds')) | ||||
|                 continue | ||||
|  | ||||
|             if src_url.startswith('http'): | ||||
|                 http_count += 1 | ||||
|                 formats.append({ | ||||
|                     'url': src_url, | ||||
|                     'ext': ext or src_ext or 'flv', | ||||
|                     'format_id': 'http-%d' % (bitrate or http_count), | ||||
|                     'tbr': bitrate, | ||||
|                     'filesize': filesize, | ||||
|                     'width': width, | ||||
|                     'height': height, | ||||
|                 }) | ||||
|                 continue | ||||
|  | ||||
|         self._sort_formats(formats) | ||||
|  | ||||
|         return formats | ||||
|  | ||||
|     def _parse_smil_video(self, video, video_id, base, rtmp_count): | ||||
|         src = video.get('src') | ||||
|         if not src: | ||||
|             return [], rtmp_count | ||||
|         bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) | ||||
|         width = int_or_none(video.get('width')) | ||||
|         height = int_or_none(video.get('height')) | ||||
|         proto = video.get('proto') | ||||
|         if not proto: | ||||
|             if base: | ||||
|                 if base.startswith('rtmp'): | ||||
|                     proto = 'rtmp' | ||||
|                 elif base.startswith('http'): | ||||
|                     proto = 'http' | ||||
|         ext = video.get('ext') | ||||
|         if proto == 'm3u8': | ||||
|             return self._extract_m3u8_formats(src, video_id, ext), rtmp_count | ||||
|         elif proto == 'rtmp': | ||||
|             rtmp_count += 1 | ||||
|             streamer = video.get('streamer') or base | ||||
|             return ([{ | ||||
|                 'url': streamer, | ||||
|                 'play_path': src, | ||||
|                 'ext': 'flv', | ||||
|                 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), | ||||
|                 'tbr': bitrate, | ||||
|                 'width': width, | ||||
|                 'height': height, | ||||
|             }], rtmp_count) | ||||
|         elif proto.startswith('http'): | ||||
|             return ([{ | ||||
|                 'url': base + src, | ||||
|                 'ext': ext or 'flv', | ||||
|                 'tbr': bitrate, | ||||
|                 'width': width, | ||||
|                 'height': height, | ||||
|             }], rtmp_count) | ||||
|     def _parse_smil_subtitles(self, smil, namespace=None): | ||||
|         subtitles = {} | ||||
|         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): | ||||
|             src = textstream.get('src') | ||||
|             if not src: | ||||
|                 continue | ||||
|             ext = textstream.get('ext') or determine_ext(src) | ||||
|             if not ext: | ||||
|                 type_ = textstream.get('type') | ||||
|                 if type_ == 'text/srt': | ||||
|                     ext = 'srt' | ||||
|             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') | ||||
|             subtitles.setdefault(lang, []).append({ | ||||
|                 'url': src, | ||||
|                 'ext': ext, | ||||
|             }) | ||||
|         return subtitles | ||||
|  | ||||
|     def _live_title(self, name): | ||||
|         """ Generate the title for a live video """ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Sergey M․
					Sergey M․