mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-11-04 08:35:12 +00:00 
			
		
		
		
	[youtube] Support automatic captions with original language different from English (fixes #1225) and download in multiple languages.
This commit is contained in:
		@@ -15,15 +15,20 @@ class SubtitlesInfoExtractor(InfoExtractor):
 | 
				
			|||||||
        self.to_screen(u'%s: Available subtitles for video: %s' %
 | 
					        self.to_screen(u'%s: Available subtitles for video: %s' %
 | 
				
			||||||
                       (video_id, sub_lang))
 | 
					                       (video_id, sub_lang))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _extract_subtitles(self, video_id):
 | 
					    def extract_subtitles(self, video_id, video_webpage=None):
 | 
				
			||||||
        """ returns {sub_lang: sub} or {} if subtitles not found """
 | 
					        """ returns {sub_lang: sub} or {} if subtitles not found """
 | 
				
			||||||
 | 
					        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
 | 
				
			||||||
            available_subs_list = self._get_available_subtitles(video_id)
 | 
					            available_subs_list = self._get_available_subtitles(video_id)
 | 
				
			||||||
 | 
					        elif self._downloader.params.get('writeautomaticsub', False):
 | 
				
			||||||
 | 
					            available_subs_list = self._get_available_automatic_caption(video_id, video_webpage)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if not available_subs_list:  # error, it didn't get the available subtitles
 | 
					        if not available_subs_list:  # error, it didn't get the available subtitles
 | 
				
			||||||
            return {}
 | 
					            return {}
 | 
				
			||||||
        if self._downloader.params.get('allsubtitles', False):
 | 
					        if self._downloader.params.get('allsubtitles', False):
 | 
				
			||||||
            sub_lang_list = available_subs_list
 | 
					            sub_lang_list = available_subs_list
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            if self._downloader.params.get('writesubtitles', False):
 | 
					 | 
				
			||||||
            if self._downloader.params.get('subtitleslangs', False):
 | 
					            if self._downloader.params.get('subtitleslangs', False):
 | 
				
			||||||
                requested_langs = self._downloader.params.get('subtitleslangs')
 | 
					                requested_langs = self._downloader.params.get('subtitleslangs')
 | 
				
			||||||
            elif 'en' in available_subs_list:
 | 
					            elif 'en' in available_subs_list:
 | 
				
			||||||
@@ -64,23 +69,11 @@ class SubtitlesInfoExtractor(InfoExtractor):
 | 
				
			|||||||
        """
 | 
					        """
 | 
				
			||||||
        pass
 | 
					        pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _request_automatic_caption(self, video_id, webpage):
 | 
					    def _get_available_automatic_caption(self, video_id, webpage):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        returns {sub_lang: sub} or {} if not available
 | 
					        returns {sub_lang: url} or {} if not available
 | 
				
			||||||
        Must be redefined by the subclasses that support automatic captions,
 | 
					        Must be redefined by the subclasses that support automatic captions,
 | 
				
			||||||
        otherwise it will return {}
 | 
					        otherwise it will return {}
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        self._downloader.report_warning(u'Automatic Captions not supported by this server')
 | 
					        self._downloader.report_warning(u'Automatic Captions not supported by this server')
 | 
				
			||||||
        return {}
 | 
					        return {}
 | 
				
			||||||
 | 
					 | 
				
			||||||
    def extract_subtitles(self, video_id, video_webpage=None):
 | 
					 | 
				
			||||||
        """
 | 
					 | 
				
			||||||
        Extract the subtitles and/or the automatic captions if requested.
 | 
					 | 
				
			||||||
        Returns None or a dictionary in the format {sub_lang: sub}
 | 
					 | 
				
			||||||
        """
 | 
					 | 
				
			||||||
        video_subtitles = None
 | 
					 | 
				
			||||||
        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
 | 
					 | 
				
			||||||
            video_subtitles = self._extract_subtitles(video_id)
 | 
					 | 
				
			||||||
        elif self._downloader.params.get('writeautomaticsub', False):
 | 
					 | 
				
			||||||
            video_subtitles = self._request_automatic_caption(video_id, video_webpage)
 | 
					 | 
				
			||||||
        return video_subtitles
 | 
					 | 
				
			||||||
 
 | 
				
			|||||||
@@ -5,6 +5,7 @@ import netrc
 | 
				
			|||||||
import re
 | 
					import re
 | 
				
			||||||
import socket
 | 
					import socket
 | 
				
			||||||
import itertools
 | 
					import itertools
 | 
				
			||||||
 | 
					import xml.etree.ElementTree
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .common import InfoExtractor, SearchInfoExtractor
 | 
					from .common import InfoExtractor, SearchInfoExtractor
 | 
				
			||||||
from .subtitles import SubtitlesInfoExtractor
 | 
					from .subtitles import SubtitlesInfoExtractor
 | 
				
			||||||
@@ -478,14 +479,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 | 
				
			|||||||
            return {}
 | 
					            return {}
 | 
				
			||||||
        return sub_lang_list
 | 
					        return sub_lang_list
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _request_automatic_caption(self, video_id, webpage):
 | 
					    def _get_available_automatic_caption(self, video_id, webpage):
 | 
				
			||||||
        """We need the webpage for getting the captions url, pass it as an
 | 
					        """We need the webpage for getting the captions url, pass it as an
 | 
				
			||||||
           argument to speed up the process."""
 | 
					           argument to speed up the process."""
 | 
				
			||||||
        sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0]
 | 
					 | 
				
			||||||
        sub_format = self._downloader.params.get('subtitlesformat')
 | 
					        sub_format = self._downloader.params.get('subtitlesformat')
 | 
				
			||||||
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
 | 
					        self.to_screen(u'%s: Looking for automatic captions' % video_id)
 | 
				
			||||||
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 | 
					        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 | 
				
			||||||
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
 | 
					        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
 | 
				
			||||||
        if mobj is None:
 | 
					        if mobj is None:
 | 
				
			||||||
            self._downloader.report_warning(err_msg)
 | 
					            self._downloader.report_warning(err_msg)
 | 
				
			||||||
            return {}
 | 
					            return {}
 | 
				
			||||||
@@ -494,16 +494,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 | 
				
			|||||||
            args = player_config[u'args']
 | 
					            args = player_config[u'args']
 | 
				
			||||||
            caption_url = args[u'ttsurl']
 | 
					            caption_url = args[u'ttsurl']
 | 
				
			||||||
            timestamp = args[u'timestamp']
 | 
					            timestamp = args[u'timestamp']
 | 
				
			||||||
 | 
					            # We get the available subtitles
 | 
				
			||||||
 | 
					            list_params = compat_urllib_parse.urlencode({
 | 
				
			||||||
 | 
					                'type': 'list',
 | 
				
			||||||
 | 
					                'tlangs': 1,
 | 
				
			||||||
 | 
					                'asrs': 1,
 | 
				
			||||||
 | 
					            })
 | 
				
			||||||
 | 
					            list_url = caption_url + '&' + list_params
 | 
				
			||||||
 | 
					            list_page = self._download_webpage(list_url, video_id)
 | 
				
			||||||
 | 
					            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
 | 
				
			||||||
 | 
					            original_lang = caption_list.find('track').attrib['lang_code']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            sub_lang_list = {}
 | 
				
			||||||
 | 
					            for lang_node in caption_list.findall('target'):
 | 
				
			||||||
 | 
					                sub_lang = lang_node.attrib['lang_code']
 | 
				
			||||||
                params = compat_urllib_parse.urlencode({
 | 
					                params = compat_urllib_parse.urlencode({
 | 
				
			||||||
                'lang': 'en',
 | 
					                    'lang': original_lang,
 | 
				
			||||||
                    'tlang': sub_lang,
 | 
					                    'tlang': sub_lang,
 | 
				
			||||||
                    'fmt': sub_format,
 | 
					                    'fmt': sub_format,
 | 
				
			||||||
                    'ts': timestamp,
 | 
					                    'ts': timestamp,
 | 
				
			||||||
                    'kind': 'asr',
 | 
					                    'kind': 'asr',
 | 
				
			||||||
                })
 | 
					                })
 | 
				
			||||||
            subtitles_url = caption_url + '&' + params
 | 
					                sub_lang_list[sub_lang] = caption_url + '&' + params
 | 
				
			||||||
            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
 | 
					            return sub_lang_list
 | 
				
			||||||
            return {sub_lang: sub}
 | 
					 | 
				
			||||||
        # An extractor error can be raise by the download process if there are
 | 
					        # An extractor error can be raise by the download process if there are
 | 
				
			||||||
        # no automatic captions but there are subtitles
 | 
					        # no automatic captions but there are subtitles
 | 
				
			||||||
        except (KeyError, ExtractorError):
 | 
					        except (KeyError, ExtractorError):
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user