mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 14:45:14 +00:00 
			
		
		
		
	[youtube] Add alternative automatic captions extraction approach (Closes #8667)
This commit is contained in:
		| @@ -975,40 +975,67 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | ||||
|             return {} | ||||
|         try: | ||||
|             args = player_config['args'] | ||||
|             caption_url = args['ttsurl'] | ||||
|             if not caption_url: | ||||
|                 self._downloader.report_warning(err_msg) | ||||
|                 return {} | ||||
|             timestamp = args['timestamp'] | ||||
|             # We get the available subtitles | ||||
|             list_params = compat_urllib_parse.urlencode({ | ||||
|                 'type': 'list', | ||||
|                 'tlangs': 1, | ||||
|                 'asrs': 1, | ||||
|             }) | ||||
|             list_url = caption_url + '&' + list_params | ||||
|             caption_list = self._download_xml(list_url, video_id) | ||||
|             original_lang_node = caption_list.find('track') | ||||
|             if original_lang_node is None: | ||||
|                 self._downloader.report_warning('Video doesn\'t have automatic captions') | ||||
|                 return {} | ||||
|             original_lang = original_lang_node.attrib['lang_code'] | ||||
|             caption_kind = original_lang_node.attrib.get('kind', '') | ||||
|             caption_url = args.get('ttsurl') | ||||
|             if caption_url: | ||||
|                 timestamp = args['timestamp'] | ||||
|                 # We get the available subtitles | ||||
|                 list_params = compat_urllib_parse.urlencode({ | ||||
|                     'type': 'list', | ||||
|                     'tlangs': 1, | ||||
|                     'asrs': 1, | ||||
|                 }) | ||||
|                 list_url = caption_url + '&' + list_params | ||||
|                 caption_list = self._download_xml(list_url, video_id) | ||||
|                 original_lang_node = caption_list.find('track') | ||||
|                 if original_lang_node is None: | ||||
|                     self._downloader.report_warning('Video doesn\'t have automatic captions') | ||||
|                     return {} | ||||
|                 original_lang = original_lang_node.attrib['lang_code'] | ||||
|                 caption_kind = original_lang_node.attrib.get('kind', '') | ||||
|  | ||||
|                 sub_lang_list = {} | ||||
|                 for lang_node in caption_list.findall('target'): | ||||
|                     sub_lang = lang_node.attrib['lang_code'] | ||||
|                     sub_formats = [] | ||||
|                     for ext in self._SUBTITLE_FORMATS: | ||||
|                         params = compat_urllib_parse.urlencode({ | ||||
|                             'lang': original_lang, | ||||
|                             'tlang': sub_lang, | ||||
|                             'fmt': ext, | ||||
|                             'ts': timestamp, | ||||
|                             'kind': caption_kind, | ||||
|                         }) | ||||
|                         sub_formats.append({ | ||||
|                             'url': caption_url + '&' + params, | ||||
|                             'ext': ext, | ||||
|                         }) | ||||
|                     sub_lang_list[sub_lang] = sub_formats | ||||
|                 return sub_lang_list | ||||
|  | ||||
|             # Some videos don't provide ttsurl but rather caption_tracks and | ||||
|             # caption_translation_languages (e.g. 20LmZk1hakA) | ||||
|             caption_tracks = args['caption_tracks'] | ||||
|             caption_translation_languages = args['caption_translation_languages'] | ||||
|             caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] | ||||
|             parsed_caption_url = compat_urlparse.urlparse(caption_url) | ||||
|             caption_qs = compat_parse_qs(parsed_caption_url.query) | ||||
|  | ||||
|             sub_lang_list = {} | ||||
|             for lang_node in caption_list.findall('target'): | ||||
|                 sub_lang = lang_node.attrib['lang_code'] | ||||
|             for lang in caption_translation_languages.split(','): | ||||
|                 lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) | ||||
|                 sub_lang = lang_qs.get('lc', [None])[0] | ||||
|                 if not sub_lang: | ||||
|                     continue | ||||
|                 sub_formats = [] | ||||
|                 for ext in self._SUBTITLE_FORMATS: | ||||
|                     params = compat_urllib_parse.urlencode({ | ||||
|                         'lang': original_lang, | ||||
|                         'tlang': sub_lang, | ||||
|                         'fmt': ext, | ||||
|                         'ts': timestamp, | ||||
|                         'kind': caption_kind, | ||||
|                     caption_qs.update({ | ||||
|                         'tlang': [sub_lang], | ||||
|                         'fmt': [ext], | ||||
|                     }) | ||||
|                     sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace( | ||||
|                         query=compat_urllib_parse.urlencode(caption_qs, True))) | ||||
|                     sub_formats.append({ | ||||
|                         'url': caption_url + '&' + params, | ||||
|                         'url': sub_url, | ||||
|                         'ext': ext, | ||||
|                     }) | ||||
|                 sub_lang_list[sub_lang] = sub_formats | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Sergey M․
					Sergey M․