mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-11-04 08:35:12 +00:00 
			
		
		
		
	[youtube] Add alternative automatic captions extraction approach (Closes #8667)
This commit is contained in:
		@@ -975,40 +975,67 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 | 
				
			|||||||
            return {}
 | 
					            return {}
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            args = player_config['args']
 | 
					            args = player_config['args']
 | 
				
			||||||
            caption_url = args['ttsurl']
 | 
					            caption_url = args.get('ttsurl')
 | 
				
			||||||
            if not caption_url:
 | 
					            if caption_url:
 | 
				
			||||||
                self._downloader.report_warning(err_msg)
 | 
					                timestamp = args['timestamp']
 | 
				
			||||||
                return {}
 | 
					                # We get the available subtitles
 | 
				
			||||||
            timestamp = args['timestamp']
 | 
					                list_params = compat_urllib_parse.urlencode({
 | 
				
			||||||
            # We get the available subtitles
 | 
					                    'type': 'list',
 | 
				
			||||||
            list_params = compat_urllib_parse.urlencode({
 | 
					                    'tlangs': 1,
 | 
				
			||||||
                'type': 'list',
 | 
					                    'asrs': 1,
 | 
				
			||||||
                'tlangs': 1,
 | 
					                })
 | 
				
			||||||
                'asrs': 1,
 | 
					                list_url = caption_url + '&' + list_params
 | 
				
			||||||
            })
 | 
					                caption_list = self._download_xml(list_url, video_id)
 | 
				
			||||||
            list_url = caption_url + '&' + list_params
 | 
					                original_lang_node = caption_list.find('track')
 | 
				
			||||||
            caption_list = self._download_xml(list_url, video_id)
 | 
					                if original_lang_node is None:
 | 
				
			||||||
            original_lang_node = caption_list.find('track')
 | 
					                    self._downloader.report_warning('Video doesn\'t have automatic captions')
 | 
				
			||||||
            if original_lang_node is None:
 | 
					                    return {}
 | 
				
			||||||
                self._downloader.report_warning('Video doesn\'t have automatic captions')
 | 
					                original_lang = original_lang_node.attrib['lang_code']
 | 
				
			||||||
                return {}
 | 
					                caption_kind = original_lang_node.attrib.get('kind', '')
 | 
				
			||||||
            original_lang = original_lang_node.attrib['lang_code']
 | 
					
 | 
				
			||||||
            caption_kind = original_lang_node.attrib.get('kind', '')
 | 
					                sub_lang_list = {}
 | 
				
			||||||
 | 
					                for lang_node in caption_list.findall('target'):
 | 
				
			||||||
 | 
					                    sub_lang = lang_node.attrib['lang_code']
 | 
				
			||||||
 | 
					                    sub_formats = []
 | 
				
			||||||
 | 
					                    for ext in self._SUBTITLE_FORMATS:
 | 
				
			||||||
 | 
					                        params = compat_urllib_parse.urlencode({
 | 
				
			||||||
 | 
					                            'lang': original_lang,
 | 
				
			||||||
 | 
					                            'tlang': sub_lang,
 | 
				
			||||||
 | 
					                            'fmt': ext,
 | 
				
			||||||
 | 
					                            'ts': timestamp,
 | 
				
			||||||
 | 
					                            'kind': caption_kind,
 | 
				
			||||||
 | 
					                        })
 | 
				
			||||||
 | 
					                        sub_formats.append({
 | 
				
			||||||
 | 
					                            'url': caption_url + '&' + params,
 | 
				
			||||||
 | 
					                            'ext': ext,
 | 
				
			||||||
 | 
					                        })
 | 
				
			||||||
 | 
					                    sub_lang_list[sub_lang] = sub_formats
 | 
				
			||||||
 | 
					                return sub_lang_list
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # Some videos don't provide ttsurl but rather caption_tracks and
 | 
				
			||||||
 | 
					            # caption_translation_languages (e.g. 20LmZk1hakA)
 | 
				
			||||||
 | 
					            caption_tracks = args['caption_tracks']
 | 
				
			||||||
 | 
					            caption_translation_languages = args['caption_translation_languages']
 | 
				
			||||||
 | 
					            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
 | 
				
			||||||
 | 
					            parsed_caption_url = compat_urlparse.urlparse(caption_url)
 | 
				
			||||||
 | 
					            caption_qs = compat_parse_qs(parsed_caption_url.query)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            sub_lang_list = {}
 | 
					            sub_lang_list = {}
 | 
				
			||||||
            for lang_node in caption_list.findall('target'):
 | 
					            for lang in caption_translation_languages.split(','):
 | 
				
			||||||
                sub_lang = lang_node.attrib['lang_code']
 | 
					                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
 | 
				
			||||||
 | 
					                sub_lang = lang_qs.get('lc', [None])[0]
 | 
				
			||||||
 | 
					                if not sub_lang:
 | 
				
			||||||
 | 
					                    continue
 | 
				
			||||||
                sub_formats = []
 | 
					                sub_formats = []
 | 
				
			||||||
                for ext in self._SUBTITLE_FORMATS:
 | 
					                for ext in self._SUBTITLE_FORMATS:
 | 
				
			||||||
                    params = compat_urllib_parse.urlencode({
 | 
					                    caption_qs.update({
 | 
				
			||||||
                        'lang': original_lang,
 | 
					                        'tlang': [sub_lang],
 | 
				
			||||||
                        'tlang': sub_lang,
 | 
					                        'fmt': [ext],
 | 
				
			||||||
                        'fmt': ext,
 | 
					 | 
				
			||||||
                        'ts': timestamp,
 | 
					 | 
				
			||||||
                        'kind': caption_kind,
 | 
					 | 
				
			||||||
                    })
 | 
					                    })
 | 
				
			||||||
 | 
					                    sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace(
 | 
				
			||||||
 | 
					                        query=compat_urllib_parse.urlencode(caption_qs, True)))
 | 
				
			||||||
                    sub_formats.append({
 | 
					                    sub_formats.append({
 | 
				
			||||||
                        'url': caption_url + '&' + params,
 | 
					                        'url': sub_url,
 | 
				
			||||||
                        'ext': ext,
 | 
					                        'ext': ext,
 | 
				
			||||||
                    })
 | 
					                    })
 | 
				
			||||||
                sub_lang_list[sub_lang] = sub_formats
 | 
					                sub_lang_list[sub_lang] = sub_formats
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user