mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 06:35:12 +00:00 
			
		
		
		
	[extractor] Detect stpp as subtitles in MPD
				
					
				
			Closes #656. Solution by: fstirlitz
This commit is contained in:
		| @@ -2596,215 +2596,223 @@ class InfoExtractor(object): | |||||||
|                     mime_type = representation_attrib['mimeType'] |                     mime_type = representation_attrib['mimeType'] | ||||||
|                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) |                     content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) | ||||||
|  |  | ||||||
|                     if content_type in ('video', 'audio', 'text') or mime_type == 'image/jpeg': |                     codecs = representation_attrib.get('codecs', '') | ||||||
|                         base_url = '' |                     if content_type not in ('video', 'audio', 'text'): | ||||||
|                         for element in (representation, adaptation_set, period, mpd_doc): |                         if mime_type == 'image/jpeg': | ||||||
|                             base_url_e = element.find(_add_ns('BaseURL')) |                             content_type = 'image/jpeg' | ||||||
|                             if base_url_e is not None: |                         if codecs.split('.')[0] == 'stpp': | ||||||
|                                 base_url = base_url_e.text + base_url |                             content_type = 'text' | ||||||
|                                 if re.match(r'^https?://', base_url): |  | ||||||
|                                     break |  | ||||||
|                         if mpd_base_url and not re.match(r'^https?://', base_url): |  | ||||||
|                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'): |  | ||||||
|                                 mpd_base_url += '/' |  | ||||||
|                             base_url = mpd_base_url + base_url |  | ||||||
|                         representation_id = representation_attrib.get('id') |  | ||||||
|                         lang = representation_attrib.get('lang') |  | ||||||
|                         url_el = representation.find(_add_ns('BaseURL')) |  | ||||||
|                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) |  | ||||||
|                         bandwidth = int_or_none(representation_attrib.get('bandwidth')) |  | ||||||
|                         if representation_id is not None: |  | ||||||
|                             format_id = representation_id |  | ||||||
|                         else: |                         else: | ||||||
|                             format_id = content_type |                             self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) | ||||||
|                         if mpd_id: |                             continue | ||||||
|                             format_id = mpd_id + '-' + format_id |  | ||||||
|                         if content_type in ('video', 'audio'): |  | ||||||
|                             f = { |  | ||||||
|                                 'format_id': format_id, |  | ||||||
|                                 'manifest_url': mpd_url, |  | ||||||
|                                 'ext': mimetype2ext(mime_type), |  | ||||||
|                                 'width': int_or_none(representation_attrib.get('width')), |  | ||||||
|                                 'height': int_or_none(representation_attrib.get('height')), |  | ||||||
|                                 'tbr': float_or_none(bandwidth, 1000), |  | ||||||
|                                 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), |  | ||||||
|                                 'fps': int_or_none(representation_attrib.get('frameRate')), |  | ||||||
|                                 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, |  | ||||||
|                                 'format_note': 'DASH %s' % content_type, |  | ||||||
|                                 'filesize': filesize, |  | ||||||
|                                 'container': mimetype2ext(mime_type) + '_dash', |  | ||||||
|                             } |  | ||||||
|                             f.update(parse_codecs(representation_attrib.get('codecs'))) |  | ||||||
|                         elif content_type == 'text': |  | ||||||
|                             f = { |  | ||||||
|                                 'ext': mimetype2ext(mime_type), |  | ||||||
|                                 'manifest_url': mpd_url, |  | ||||||
|                                 'filesize': filesize, |  | ||||||
|                             } |  | ||||||
|                         elif mime_type == 'image/jpeg': |  | ||||||
|                             # See test case in VikiIE |  | ||||||
|                             # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1 |  | ||||||
|                             f = { |  | ||||||
|                                 'format_id': format_id, |  | ||||||
|                                 'ext': 'mhtml', |  | ||||||
|                                 'manifest_url': mpd_url, |  | ||||||
|                                 'format_note': 'DASH storyboards (jpeg)', |  | ||||||
|                                 'acodec': 'none', |  | ||||||
|                                 'vcodec': 'none', |  | ||||||
|                             } |  | ||||||
|                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) |  | ||||||
|  |  | ||||||
|                         def prepare_template(template_name, identifiers): |                     base_url = '' | ||||||
|                             tmpl = representation_ms_info[template_name] |                     for element in (representation, adaptation_set, period, mpd_doc): | ||||||
|                             # First of, % characters outside $...$ templates |                         base_url_e = element.find(_add_ns('BaseURL')) | ||||||
|                             # must be escaped by doubling for proper processing |                         if base_url_e is not None: | ||||||
|                             # by % operator string formatting used further (see |                             base_url = base_url_e.text + base_url | ||||||
|                             # https://github.com/ytdl-org/youtube-dl/issues/16867). |                             if re.match(r'^https?://', base_url): | ||||||
|                             t = '' |                                 break | ||||||
|                             in_template = False |                     if mpd_base_url and not re.match(r'^https?://', base_url): | ||||||
|                             for c in tmpl: |                         if not mpd_base_url.endswith('/') and not base_url.startswith('/'): | ||||||
|  |                             mpd_base_url += '/' | ||||||
|  |                         base_url = mpd_base_url + base_url | ||||||
|  |                     representation_id = representation_attrib.get('id') | ||||||
|  |                     lang = representation_attrib.get('lang') | ||||||
|  |                     url_el = representation.find(_add_ns('BaseURL')) | ||||||
|  |                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) | ||||||
|  |                     bandwidth = int_or_none(representation_attrib.get('bandwidth')) | ||||||
|  |                     if representation_id is not None: | ||||||
|  |                         format_id = representation_id | ||||||
|  |                     else: | ||||||
|  |                         format_id = content_type | ||||||
|  |                     if mpd_id: | ||||||
|  |                         format_id = mpd_id + '-' + format_id | ||||||
|  |                     if content_type in ('video', 'audio'): | ||||||
|  |                         f = { | ||||||
|  |                             'format_id': format_id, | ||||||
|  |                             'manifest_url': mpd_url, | ||||||
|  |                             'ext': mimetype2ext(mime_type), | ||||||
|  |                             'width': int_or_none(representation_attrib.get('width')), | ||||||
|  |                             'height': int_or_none(representation_attrib.get('height')), | ||||||
|  |                             'tbr': float_or_none(bandwidth, 1000), | ||||||
|  |                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')), | ||||||
|  |                             'fps': int_or_none(representation_attrib.get('frameRate')), | ||||||
|  |                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, | ||||||
|  |                             'format_note': 'DASH %s' % content_type, | ||||||
|  |                             'filesize': filesize, | ||||||
|  |                             'container': mimetype2ext(mime_type) + '_dash', | ||||||
|  |                         } | ||||||
|  |                         f.update(parse_codecs(codecs)) | ||||||
|  |                     elif content_type == 'text': | ||||||
|  |                         f = { | ||||||
|  |                             'ext': mimetype2ext(mime_type), | ||||||
|  |                             'manifest_url': mpd_url, | ||||||
|  |                             'filesize': filesize, | ||||||
|  |                         } | ||||||
|  |                     elif content_type == 'image/jpeg': | ||||||
|  |                         # See test case in VikiIE | ||||||
|  |                         # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1 | ||||||
|  |                         f = { | ||||||
|  |                             'format_id': format_id, | ||||||
|  |                             'ext': 'mhtml', | ||||||
|  |                             'manifest_url': mpd_url, | ||||||
|  |                             'format_note': 'DASH storyboards (jpeg)', | ||||||
|  |                             'acodec': 'none', | ||||||
|  |                             'vcodec': 'none', | ||||||
|  |                         } | ||||||
|  |                     representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) | ||||||
|  |  | ||||||
|  |                     def prepare_template(template_name, identifiers): | ||||||
|  |                         tmpl = representation_ms_info[template_name] | ||||||
|  |                         # First of, % characters outside $...$ templates | ||||||
|  |                         # must be escaped by doubling for proper processing | ||||||
|  |                         # by % operator string formatting used further (see | ||||||
|  |                         # https://github.com/ytdl-org/youtube-dl/issues/16867). | ||||||
|  |                         t = '' | ||||||
|  |                         in_template = False | ||||||
|  |                         for c in tmpl: | ||||||
|  |                             t += c | ||||||
|  |                             if c == '$': | ||||||
|  |                                 in_template = not in_template | ||||||
|  |                             elif c == '%' and not in_template: | ||||||
|                                 t += c |                                 t += c | ||||||
|                                 if c == '$': |                         # Next, $...$ templates are translated to their | ||||||
|                                     in_template = not in_template |                         # %(...) counterparts to be used with % operator | ||||||
|                                 elif c == '%' and not in_template: |                         if representation_id is not None: | ||||||
|                                     t += c |                             t = t.replace('$RepresentationID$', representation_id) | ||||||
|                             # Next, $...$ templates are translated to their |                         t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) | ||||||
|                             # %(...) counterparts to be used with % operator |                         t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) | ||||||
|                             if representation_id is not None: |                         t.replace('$$', '$') | ||||||
|                                 t = t.replace('$RepresentationID$', representation_id) |                         return t | ||||||
|                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) |  | ||||||
|                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) |  | ||||||
|                             t.replace('$$', '$') |  | ||||||
|                             return t |  | ||||||
|  |  | ||||||
|                         # @initialization is a regular template like @media one |                     # @initialization is a regular template like @media one | ||||||
|                         # so it should be handled just the same way (see |                     # so it should be handled just the same way (see | ||||||
|                         # https://github.com/ytdl-org/youtube-dl/issues/11605) |                     # https://github.com/ytdl-org/youtube-dl/issues/11605) | ||||||
|                         if 'initialization' in representation_ms_info: |                     if 'initialization' in representation_ms_info: | ||||||
|                             initialization_template = prepare_template( |                         initialization_template = prepare_template( | ||||||
|                                 'initialization', |                             'initialization', | ||||||
|                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and |                             # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and | ||||||
|                                 # $Time$ shall not be included for @initialization thus |                             # $Time$ shall not be included for @initialization thus | ||||||
|                                 # only $Bandwidth$ remains |                             # only $Bandwidth$ remains | ||||||
|                                 ('Bandwidth', )) |                             ('Bandwidth', )) | ||||||
|                             representation_ms_info['initialization_url'] = initialization_template % { |                         representation_ms_info['initialization_url'] = initialization_template % { | ||||||
|                                 'Bandwidth': bandwidth, |                             'Bandwidth': bandwidth, | ||||||
|                             } |                         } | ||||||
|  |  | ||||||
|                         def location_key(location): |                     def location_key(location): | ||||||
|                             return 'url' if re.match(r'^https?://', location) else 'path' |                         return 'url' if re.match(r'^https?://', location) else 'path' | ||||||
|  |  | ||||||
|                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: |                     if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: | ||||||
|  |  | ||||||
|                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) |                         media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) | ||||||
|                             media_location_key = location_key(media_template) |                         media_location_key = location_key(media_template) | ||||||
|  |  | ||||||
|                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ |                         # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ | ||||||
|                             # can't be used at the same time |                         # can't be used at the same time | ||||||
|                             if '%(Number' in media_template and 's' not in representation_ms_info: |                         if '%(Number' in media_template and 's' not in representation_ms_info: | ||||||
|                                 segment_duration = None |                             segment_duration = None | ||||||
|                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: |                             if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: | ||||||
|                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) |                                 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) | ||||||
|                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) |                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) | ||||||
|                                 representation_ms_info['fragments'] = [{ |                             representation_ms_info['fragments'] = [{ | ||||||
|                                     media_location_key: media_template % { |                                 media_location_key: media_template % { | ||||||
|                                         'Number': segment_number, |                                     'Number': segment_number, | ||||||
|                                         'Bandwidth': bandwidth, |                                     'Bandwidth': bandwidth, | ||||||
|                                     }, |                                 }, | ||||||
|                                     'duration': segment_duration, |                                 'duration': segment_duration, | ||||||
|                                 } for segment_number in range( |                             } for segment_number in range( | ||||||
|                                     representation_ms_info['start_number'], |                                 representation_ms_info['start_number'], | ||||||
|                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])] |                                 representation_ms_info['total_number'] + representation_ms_info['start_number'])] | ||||||
|                             else: |                         else: | ||||||
|                                 # $Number*$ or $Time$ in media template with S list available |                             # $Number*$ or $Time$ in media template with S list available | ||||||
|                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg |                             # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg | ||||||
|                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 |                             # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 | ||||||
|                                 representation_ms_info['fragments'] = [] |                             representation_ms_info['fragments'] = [] | ||||||
|                                 segment_time = 0 |                             segment_time = 0 | ||||||
|                                 segment_d = None |                             segment_d = None | ||||||
|                                 segment_number = representation_ms_info['start_number'] |                             segment_number = representation_ms_info['start_number'] | ||||||
|  |  | ||||||
|                                 def add_segment_url(): |                             def add_segment_url(): | ||||||
|                                     segment_url = media_template % { |                                 segment_url = media_template % { | ||||||
|                                         'Time': segment_time, |                                     'Time': segment_time, | ||||||
|                                         'Bandwidth': bandwidth, |                                     'Bandwidth': bandwidth, | ||||||
|                                         'Number': segment_number, |                                     'Number': segment_number, | ||||||
|                                     } |                                 } | ||||||
|                                     representation_ms_info['fragments'].append({ |                                 representation_ms_info['fragments'].append({ | ||||||
|                                         media_location_key: segment_url, |                                     media_location_key: segment_url, | ||||||
|                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']), |                                     'duration': float_or_none(segment_d, representation_ms_info['timescale']), | ||||||
|                                     }) |                                 }) | ||||||
|  |  | ||||||
|                                 for num, s in enumerate(representation_ms_info['s']): |                             for num, s in enumerate(representation_ms_info['s']): | ||||||
|                                     segment_time = s.get('t') or segment_time |                                 segment_time = s.get('t') or segment_time | ||||||
|                                     segment_d = s['d'] |                                 segment_d = s['d'] | ||||||
|  |                                 add_segment_url() | ||||||
|  |                                 segment_number += 1 | ||||||
|  |                                 for r in range(s.get('r', 0)): | ||||||
|  |                                     segment_time += segment_d | ||||||
|                                     add_segment_url() |                                     add_segment_url() | ||||||
|                                     segment_number += 1 |                                     segment_number += 1 | ||||||
|                                     for r in range(s.get('r', 0)): |                                 segment_time += segment_d | ||||||
|                                         segment_time += segment_d |                     elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: | ||||||
|                                         add_segment_url() |                         # No media template | ||||||
|                                         segment_number += 1 |                         # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI | ||||||
|                                     segment_time += segment_d |                         # or any YouTube dashsegments video | ||||||
|                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: |                         fragments = [] | ||||||
|                             # No media template |                         segment_index = 0 | ||||||
|                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI |                         timescale = representation_ms_info['timescale'] | ||||||
|                             # or any YouTube dashsegments video |                         for s in representation_ms_info['s']: | ||||||
|                             fragments = [] |                             duration = float_or_none(s['d'], timescale) | ||||||
|                             segment_index = 0 |                             for r in range(s.get('r', 0) + 1): | ||||||
|                             timescale = representation_ms_info['timescale'] |                                 segment_uri = representation_ms_info['segment_urls'][segment_index] | ||||||
|                             for s in representation_ms_info['s']: |                                 fragments.append({ | ||||||
|                                 duration = float_or_none(s['d'], timescale) |                                     location_key(segment_uri): segment_uri, | ||||||
|                                 for r in range(s.get('r', 0) + 1): |                                     'duration': duration, | ||||||
|                                     segment_uri = representation_ms_info['segment_urls'][segment_index] |                                 }) | ||||||
|                                     fragments.append({ |                                 segment_index += 1 | ||||||
|                                         location_key(segment_uri): segment_uri, |                         representation_ms_info['fragments'] = fragments | ||||||
|                                         'duration': duration, |                     elif 'segment_urls' in representation_ms_info: | ||||||
|                                     }) |                         # Segment URLs with no SegmentTimeline | ||||||
|                                     segment_index += 1 |                         # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 | ||||||
|                             representation_ms_info['fragments'] = fragments |                         # https://github.com/ytdl-org/youtube-dl/pull/14844 | ||||||
|                         elif 'segment_urls' in representation_ms_info: |                         fragments = [] | ||||||
|                             # Segment URLs with no SegmentTimeline |                         segment_duration = float_or_none( | ||||||
|                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 |                             representation_ms_info['segment_duration'], | ||||||
|                             # https://github.com/ytdl-org/youtube-dl/pull/14844 |                             representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None | ||||||
|                             fragments = [] |                         for segment_url in representation_ms_info['segment_urls']: | ||||||
|                             segment_duration = float_or_none( |                             fragment = { | ||||||
|                                 representation_ms_info['segment_duration'], |                                 location_key(segment_url): segment_url, | ||||||
|                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None |                             } | ||||||
|                             for segment_url in representation_ms_info['segment_urls']: |                             if segment_duration: | ||||||
|                                 fragment = { |                                 fragment['duration'] = segment_duration | ||||||
|                                     location_key(segment_url): segment_url, |                             fragments.append(fragment) | ||||||
|                                 } |                         representation_ms_info['fragments'] = fragments | ||||||
|                                 if segment_duration: |                     # If there is a fragments key available then we correctly recognized fragmented media. | ||||||
|                                     fragment['duration'] = segment_duration |                     # Otherwise we will assume unfragmented media with direct access. Technically, such | ||||||
|                                 fragments.append(fragment) |                     # assumption is not necessarily correct since we may simply have no support for | ||||||
|                             representation_ms_info['fragments'] = fragments |                     # some forms of fragmented media renditions yet, but for now we'll use this fallback. | ||||||
|                         # If there is a fragments key available then we correctly recognized fragmented media. |                     if 'fragments' in representation_ms_info: | ||||||
|                         # Otherwise we will assume unfragmented media with direct access. Technically, such |                         f.update({ | ||||||
|                         # assumption is not necessarily correct since we may simply have no support for |                             # NB: mpd_url may be empty when MPD manifest is parsed from a string | ||||||
|                         # some forms of fragmented media renditions yet, but for now we'll use this fallback. |                             'url': mpd_url or base_url, | ||||||
|                         if 'fragments' in representation_ms_info: |                             'fragment_base_url': base_url, | ||||||
|                             f.update({ |                             'fragments': [], | ||||||
|                                 # NB: mpd_url may be empty when MPD manifest is parsed from a string |                             'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml', | ||||||
|                                 'url': mpd_url or base_url, |                         }) | ||||||
|                                 'fragment_base_url': base_url, |                         if 'initialization_url' in representation_ms_info: | ||||||
|                                 'fragments': [], |                             initialization_url = representation_ms_info['initialization_url'] | ||||||
|                                 'protocol': 'http_dash_segments' if mime_type != 'image/jpeg' else 'mhtml', |                             if not f.get('url'): | ||||||
|                             }) |                                 f['url'] = initialization_url | ||||||
|                             if 'initialization_url' in representation_ms_info: |                             f['fragments'].append({location_key(initialization_url): initialization_url}) | ||||||
|                                 initialization_url = representation_ms_info['initialization_url'] |                         f['fragments'].extend(representation_ms_info['fragments']) | ||||||
|                                 if not f.get('url'): |  | ||||||
|                                     f['url'] = initialization_url |  | ||||||
|                                 f['fragments'].append({location_key(initialization_url): initialization_url}) |  | ||||||
|                             f['fragments'].extend(representation_ms_info['fragments']) |  | ||||||
|                         else: |  | ||||||
|                             # Assuming direct URL to unfragmented media. |  | ||||||
|                             f['url'] = base_url |  | ||||||
|                         if content_type in ('video', 'audio') or mime_type == 'image/jpeg': |  | ||||||
|                             formats.append(f) |  | ||||||
|                         elif content_type == 'text': |  | ||||||
|                             subtitles.setdefault(lang or 'und', []).append(f) |  | ||||||
|                     else: |                     else: | ||||||
|                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) |                         # Assuming direct URL to unfragmented media. | ||||||
|  |                         f['url'] = base_url | ||||||
|  |                     if content_type in ('video', 'audio') or mime_type == 'image/jpeg': | ||||||
|  |                         formats.append(f) | ||||||
|  |                     elif content_type == 'text': | ||||||
|  |                         subtitles.setdefault(lang or 'und', []).append(f) | ||||||
|  |  | ||||||
|         return formats, subtitles |         return formats, subtitles | ||||||
|  |  | ||||||
|     def _extract_ism_formats(self, *args, **kwargs): |     def _extract_ism_formats(self, *args, **kwargs): | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 pukkandan
					pukkandan