mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 06:35:12 +00:00 
			
		
		
		
	[webvtt, extractor/youtube] Extract auto-subs from livestream VODs
Closes #4130
Authored by: pukkandan, fstirlitz
This commit is contained in:
		| @@ -2298,7 +2298,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|             microformats = traverse_obj( |             microformats = traverse_obj( | ||||||
|                 prs, (..., 'microformat', 'playerMicroformatRenderer'), |                 prs, (..., 'microformat', 'playerMicroformatRenderer'), | ||||||
|                 expected_type=dict, default=[]) |                 expected_type=dict, default=[]) | ||||||
|             _, is_live, _, formats = self._list_formats(video_id, microformats, video_details, prs, player_url) |             _, is_live, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url) | ||||||
|             start_time = time.time() |             start_time = time.time() | ||||||
| 
 | 
 | ||||||
|         def mpd_feed(format_id, delay): |         def mpd_feed(format_id, delay): | ||||||
| @@ -3136,7 +3136,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|             self.report_warning(last_error) |             self.report_warning(last_error) | ||||||
|         return prs, player_url |         return prs, player_url | ||||||
| 
 | 
 | ||||||
|     def _extract_formats(self, streaming_data, video_id, player_url, is_live, duration): |     def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, is_live, duration): | ||||||
|         itags, stream_ids = {}, [] |         itags, stream_ids = {}, [] | ||||||
|         itag_qualities, res_qualities = {}, {} |         itag_qualities, res_qualities = {}, {} | ||||||
|         q = qualities([ |         q = qualities([ | ||||||
| @@ -3293,17 +3293,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|                 if val in qdict), -1) |                 if val in qdict), -1) | ||||||
|             return True |             return True | ||||||
| 
 | 
 | ||||||
|  |         subtitles = {} | ||||||
|         for sd in streaming_data: |         for sd in streaming_data: | ||||||
|             hls_manifest_url = get_hls and sd.get('hlsManifestUrl') |             hls_manifest_url = get_hls and sd.get('hlsManifestUrl') | ||||||
|             if hls_manifest_url: |             if hls_manifest_url: | ||||||
|                 for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False): |                 fmts, subs = self._extract_m3u8_formats_and_subtitles(hls_manifest_url, video_id, 'mp4', fatal=False, live=is_live) | ||||||
|  |                 subtitles = self._merge_subtitles(subs, subtitles) | ||||||
|  |                 for f in fmts: | ||||||
|                     if process_manifest_format(f, 'hls', self._search_regex( |                     if process_manifest_format(f, 'hls', self._search_regex( | ||||||
|                             r'/itag/(\d+)', f['url'], 'itag', default=None)): |                             r'/itag/(\d+)', f['url'], 'itag', default=None)): | ||||||
|                         yield f |                         yield f | ||||||
| 
 | 
 | ||||||
|             dash_manifest_url = get_dash and sd.get('dashManifestUrl') |             dash_manifest_url = get_dash and sd.get('dashManifestUrl') | ||||||
|             if dash_manifest_url: |             if dash_manifest_url: | ||||||
|                 for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False): |                 formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) | ||||||
|  |                 subtitles = self._merge_subtitles(subs, subtitles)  # Prioritize HLS subs over DASH | ||||||
|  |                 for f in formats: | ||||||
|                     if process_manifest_format(f, 'dash', f['format_id']): |                     if process_manifest_format(f, 'dash', f['format_id']): | ||||||
|                         f['filesize'] = int_or_none(self._search_regex( |                         f['filesize'] = int_or_none(self._search_regex( | ||||||
|                             r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) |                             r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) | ||||||
| @@ -3311,6 +3316,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|                             f['is_from_start'] = True |                             f['is_from_start'] = True | ||||||
| 
 | 
 | ||||||
|                         yield f |                         yield f | ||||||
|  |         yield subtitles | ||||||
| 
 | 
 | ||||||
|     def _extract_storyboard(self, player_responses, duration): |     def _extract_storyboard(self, player_responses, duration): | ||||||
|         spec = get_first( |         spec = get_first( | ||||||
| @@ -3371,9 +3377,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|             is_live = get_first(live_broadcast_details, 'isLiveNow') |             is_live = get_first(live_broadcast_details, 'isLiveNow') | ||||||
| 
 | 
 | ||||||
|         streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[]) |         streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[]) | ||||||
|         formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live, duration)) |         *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, is_live, duration) | ||||||
| 
 | 
 | ||||||
|         return live_broadcast_details, is_live, streaming_data, formats |         return live_broadcast_details, is_live, streaming_data, formats, subtitles | ||||||
| 
 | 
 | ||||||
|     def _real_extract(self, url): |     def _real_extract(self, url): | ||||||
|         url, smuggled_data = unsmuggle_url(url, {}) |         url, smuggled_data = unsmuggle_url(url, {}) | ||||||
| @@ -3464,8 +3470,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|                     'The livestream has not finished processing. Only 4 hours of the video can be currently downloaded. ' |                     'The livestream has not finished processing. Only 4 hours of the video can be currently downloaded. ' | ||||||
|                     'This is a known issue and patches are welcome') |                     'This is a known issue and patches are welcome') | ||||||
| 
 | 
 | ||||||
|         live_broadcast_details, is_live, streaming_data, formats = self._list_formats( |         live_broadcast_details, is_live, streaming_data, formats, automatic_captions = \ | ||||||
|             video_id, microformats, video_details, player_responses, player_url, duration) |             self._list_formats(video_id, microformats, video_details, player_responses, player_url) | ||||||
| 
 | 
 | ||||||
|         if not formats: |         if not formats: | ||||||
|             if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')): |             if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')): | ||||||
| @@ -3595,6 +3601,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|             'release_timestamp': live_start_time, |             'release_timestamp': live_start_time, | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|  |         subtitles = {} | ||||||
|         pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict) |         pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict) | ||||||
|         if pctr: |         if pctr: | ||||||
|             def get_lang_code(track): |             def get_lang_code(track): | ||||||
| @@ -3624,7 +3631,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|             # NB: Constructing the full subtitle dictionary is slow |             # NB: Constructing the full subtitle dictionary is slow | ||||||
|             get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and ( |             get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and ( | ||||||
|                 self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')) |                 self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')) | ||||||
|             subtitles, automatic_captions = {}, {} |  | ||||||
|             for lang_code, caption_track in captions.items(): |             for lang_code, caption_track in captions.items(): | ||||||
|                 base_url = caption_track.get('baseUrl') |                 base_url = caption_track.get('baseUrl') | ||||||
|                 orig_lang = parse_qs(base_url).get('lang', [None])[-1] |                 orig_lang = parse_qs(base_url).get('lang', [None])[-1] | ||||||
| @@ -3655,8 +3661,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|                     # Setting tlang=lang returns damaged subtitles. |                     # Setting tlang=lang returns damaged subtitles. | ||||||
|                     process_language(automatic_captions, base_url, trans_code, trans_name, |                     process_language(automatic_captions, base_url, trans_code, trans_name, | ||||||
|                                      {} if orig_lang == orig_trans_code else {'tlang': trans_code}) |                                      {} if orig_lang == orig_trans_code else {'tlang': trans_code}) | ||||||
|             info['automatic_captions'] = automatic_captions | 
 | ||||||
|             info['subtitles'] = subtitles |         info['automatic_captions'] = automatic_captions | ||||||
|  |         info['subtitles'] = subtitles | ||||||
| 
 | 
 | ||||||
|         parsed_url = urllib.parse.urlparse(url) |         parsed_url = urllib.parse.urlparse(url) | ||||||
|         for component in [parsed_url.fragment, parsed_url.query]: |         for component in [parsed_url.fragment, parsed_url.query]: | ||||||
|   | |||||||
| @@ -161,6 +161,12 @@ class Magic(HeaderBlock): | |||||||
|     _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)') |     _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)') | ||||||
|     _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*') |     _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*') | ||||||
| 
 | 
 | ||||||
|  |     # This was removed from the spec in the 2017 revision; | ||||||
|  |     # the last spec draft to describe this syntax element is | ||||||
|  |     # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>. | ||||||
|  |     # Nevertheless, YouTube keeps serving those | ||||||
|  |     _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])') | ||||||
|  | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def __parse_tsmap(cls, parser): |     def __parse_tsmap(cls, parser): | ||||||
|         parser = parser.child() |         parser = parser.child() | ||||||
| @@ -200,13 +206,18 @@ class Magic(HeaderBlock): | |||||||
|             raise ParseError(parser) |             raise ParseError(parser) | ||||||
| 
 | 
 | ||||||
|         extra = m.group(1) |         extra = m.group(1) | ||||||
|         local, mpegts = None, None |         local, mpegts, meta = None, None, '' | ||||||
|         if parser.consume(cls._REGEX_TSMAP): |         while not parser.consume(_REGEX_NL): | ||||||
|             local, mpegts = cls.__parse_tsmap(parser) |             if parser.consume(cls._REGEX_TSMAP): | ||||||
|         if not parser.consume(_REGEX_NL): |                 local, mpegts = cls.__parse_tsmap(parser) | ||||||
|  |                 continue | ||||||
|  |             m = parser.consume(cls._REGEX_META) | ||||||
|  |             if m: | ||||||
|  |                 meta += m.group(0) | ||||||
|  |                 continue | ||||||
|             raise ParseError(parser) |             raise ParseError(parser) | ||||||
|         parser.commit() |         parser.commit() | ||||||
|         return cls(extra=extra, mpegts=mpegts, local=local) |         return cls(extra=extra, mpegts=mpegts, local=local, meta=meta) | ||||||
| 
 | 
 | ||||||
|     def write_into(self, stream): |     def write_into(self, stream): | ||||||
|         stream.write('WEBVTT') |         stream.write('WEBVTT') | ||||||
| @@ -219,6 +230,8 @@ class Magic(HeaderBlock): | |||||||
|             stream.write(',MPEGTS:') |             stream.write(',MPEGTS:') | ||||||
|             stream.write(str(self.mpegts if self.mpegts is not None else 0)) |             stream.write(str(self.mpegts if self.mpegts is not None else 0)) | ||||||
|             stream.write('\n') |             stream.write('\n') | ||||||
|  |         if self.meta: | ||||||
|  |             stream.write(self.meta) | ||||||
|         stream.write('\n') |         stream.write('\n') | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 pukkandan
					pukkandan