mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 06:35:12 +00:00 
			
		
		
		
	[youtube] Convert to new subtitles system
The automatic captions are stored in the 'automatic_captions' field, which is used if no normal subtitles are found for a specific language.
This commit is contained in:
		| @@ -50,11 +50,6 @@ class TestYoutubeSubtitles(BaseTestSubtitles): | |||||||
|     url = 'QRS8MkLhQmM' |     url = 'QRS8MkLhQmM' | ||||||
|     IE = YoutubeIE |     IE = YoutubeIE | ||||||
|  |  | ||||||
|     def test_youtube_no_writesubtitles(self): |  | ||||||
|         self.DL.params['writesubtitles'] = False |  | ||||||
|         subtitles = self.getSubtitles() |  | ||||||
|         self.assertEqual(subtitles, None) |  | ||||||
|  |  | ||||||
|     def test_youtube_subtitles(self): |     def test_youtube_subtitles(self): | ||||||
|         self.DL.params['writesubtitles'] = True |         self.DL.params['writesubtitles'] = True | ||||||
|         subtitles = self.getSubtitles() |         subtitles = self.getSubtitles() | ||||||
|   | |||||||
| @@ -1020,9 +1020,13 @@ class YoutubeDL(object): | |||||||
|             info_dict['upload_date'] = upload_date.strftime('%Y%m%d') |             info_dict['upload_date'] = upload_date.strftime('%Y%m%d') | ||||||
|  |  | ||||||
|         if self.params.get('listsubtitles', False): |         if self.params.get('listsubtitles', False): | ||||||
|             self.list_subtitles(info_dict['id'], info_dict.get('subtitles')) |             if 'automatic_captions' in info_dict: | ||||||
|  |                 self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') | ||||||
|  |             self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles') | ||||||
|             return |             return | ||||||
|         info_dict['requested_subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles')) |         info_dict['requested_subtitles'] = self.process_subtitles( | ||||||
|  |             info_dict['id'], info_dict.get('subtitles'), | ||||||
|  |             info_dict.get('automatic_captions')) | ||||||
|  |  | ||||||
|         # This extractors handle format selection themselves |         # This extractors handle format selection themselves | ||||||
|         if info_dict['extractor'] in ['Youku']: |         if info_dict['extractor'] in ['Youku']: | ||||||
| @@ -1152,8 +1156,14 @@ class YoutubeDL(object): | |||||||
|         info_dict.update(formats_to_download[-1]) |         info_dict.update(formats_to_download[-1]) | ||||||
|         return info_dict |         return info_dict | ||||||
|  |  | ||||||
|     def process_subtitles(self, video_id, available_subs): |     def process_subtitles(self, video_id, available_subs, available_autocaps): | ||||||
|         """Select the requested subtitles and their format""" |         """Select the requested subtitles and their format""" | ||||||
|  |         if available_autocaps and self.params.get('writeautomaticsub'): | ||||||
|  |             available_subs = available_subs.copy() | ||||||
|  |             for lang, cap_info in available_autocaps.items(): | ||||||
|  |                 if lang not in available_subs: | ||||||
|  |                     available_subs[lang] = cap_info | ||||||
|  |  | ||||||
|         if not available_subs: |         if not available_subs: | ||||||
|             return available_subs |             return available_subs | ||||||
|  |  | ||||||
| @@ -1645,17 +1655,17 @@ class YoutubeDL(object): | |||||||
|             ['ID', 'width', 'height', 'URL'], |             ['ID', 'width', 'height', 'URL'], | ||||||
|             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) |             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) | ||||||
|  |  | ||||||
|     def list_subtitles(self, video_id, subtitles): |     def list_subtitles(self, video_id, subtitles, name='subtitles'): | ||||||
|         if not subtitles: |         if not subtitles: | ||||||
|             self.to_screen('%s has no subtitles' % video_id) |             self.to_screen('%s has no %s' % (video_id, name)) | ||||||
|             return |             return | ||||||
|         header_line = 'Language    formats' |         header_line = 'Language    formats' | ||||||
|         sub_lines = [ |         sub_lines = [ | ||||||
|             '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats))) |             '%-12s%s' % (lang, ', '.join(f['ext'] for f in reversed(formats))) | ||||||
|             for lang, formats in subtitles.items()] |             for lang, formats in subtitles.items()] | ||||||
|         self.to_screen( |         self.to_screen( | ||||||
|             'Available subtitles for %s:\n%s\n%s' % |             'Available %s for %s:\n%s\n%s' % | ||||||
|             (video_id, header_line, '\n'.join(sub_lines))) |             (name, video_id, header_line, '\n'.join(sub_lines))) | ||||||
|  |  | ||||||
|     def urlopen(self, req): |     def urlopen(self, req): | ||||||
|         """ Start an HTTP download """ |         """ Start an HTTP download """ | ||||||
|   | |||||||
| @@ -157,6 +157,8 @@ class InfoExtractor(object): | |||||||
|                     with the "ext" entry and one of: |                     with the "ext" entry and one of: | ||||||
|                         * "data": The subtitles file contents |                         * "data": The subtitles file contents | ||||||
|                         * "url": A url pointing to the subtitles file |                         * "url": A url pointing to the subtitles file | ||||||
|  |     automatic_captions: Like 'subtitles', used by the YoutubeIE for | ||||||
|  |                     automatically generated captions | ||||||
|     duration:       Length of the video in seconds, as an integer. |     duration:       Length of the video in seconds, as an integer. | ||||||
|     view_count:     How many users have watched the video on the platform. |     view_count:     How many users have watched the video on the platform. | ||||||
|     like_count:     Number of positive ratings of the video |     like_count:     Number of positive ratings of the video | ||||||
| @@ -1007,6 +1009,16 @@ class InfoExtractor(object): | |||||||
|     def _get_subtitles(self, *args, **kwargs): |     def _get_subtitles(self, *args, **kwargs): | ||||||
|         raise NotImplementedError("This method must be implemented by subclasses") |         raise NotImplementedError("This method must be implemented by subclasses") | ||||||
|  |  | ||||||
|  |     def extract_automatic_captions(self, *args, **kwargs): | ||||||
|  |         automatic_captions = {} | ||||||
|  |         list_subtitles = self._downloader.params.get('listsubtitles') | ||||||
|  |         if self._downloader.params.get('writeautomaticsub', False) or list_subtitles: | ||||||
|  |             automatic_captions.update(self._get_automatic_captions(*args, **kwargs)) | ||||||
|  |         return automatic_captions | ||||||
|  |  | ||||||
|  |     def _get_automatic_captions(self, *args, **kwargs): | ||||||
|  |         raise NotImplementedError("This method must be implemented by subclasses") | ||||||
|  |  | ||||||
|  |  | ||||||
| class SearchInfoExtractor(InfoExtractor): | class SearchInfoExtractor(InfoExtractor): | ||||||
|     """ |     """ | ||||||
|   | |||||||
| @@ -11,7 +11,6 @@ import time | |||||||
| import traceback | import traceback | ||||||
|  |  | ||||||
| from .common import InfoExtractor, SearchInfoExtractor | from .common import InfoExtractor, SearchInfoExtractor | ||||||
| from .subtitles import SubtitlesInfoExtractor |  | ||||||
| from ..jsinterp import JSInterpreter | from ..jsinterp import JSInterpreter | ||||||
| from ..swfinterp import SWFInterpreter | from ..swfinterp import SWFInterpreter | ||||||
| from ..compat import ( | from ..compat import ( | ||||||
| @@ -185,7 +184,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): | |||||||
|             return |             return | ||||||
|  |  | ||||||
|  |  | ||||||
| class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): | class YoutubeIE(YoutubeBaseInfoExtractor): | ||||||
|     IE_DESC = 'YouTube.com' |     IE_DESC = 'YouTube.com' | ||||||
|     _VALID_URL = r"""(?x)^ |     _VALID_URL = r"""(?x)^ | ||||||
|                      ( |                      ( | ||||||
| @@ -644,7 +643,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): | |||||||
|             raise ExtractorError( |             raise ExtractorError( | ||||||
|                 'Signature extraction failed: ' + tb, cause=e) |                 'Signature extraction failed: ' + tb, cause=e) | ||||||
|  |  | ||||||
|     def _get_available_subtitles(self, video_id, webpage): |     def _get_subtitles(self, video_id, webpage): | ||||||
|         try: |         try: | ||||||
|             subs_doc = self._download_xml( |             subs_doc = self._download_xml( | ||||||
|                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, |                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, | ||||||
| @@ -658,23 +657,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): | |||||||
|             lang = track.attrib['lang_code'] |             lang = track.attrib['lang_code'] | ||||||
|             if lang in sub_lang_list: |             if lang in sub_lang_list: | ||||||
|                 continue |                 continue | ||||||
|             params = compat_urllib_parse.urlencode({ |             sub_formats = [] | ||||||
|                 'lang': lang, |             for ext in ['sbv', 'vtt', 'srt']: | ||||||
|                 'v': video_id, |                 params = compat_urllib_parse.urlencode({ | ||||||
|                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'), |                     'lang': lang, | ||||||
|                 'name': track.attrib['name'].encode('utf-8'), |                     'v': video_id, | ||||||
|             }) |                     'fmt': ext, | ||||||
|             url = 'https://www.youtube.com/api/timedtext?' + params |                     'name': track.attrib['name'].encode('utf-8'), | ||||||
|             sub_lang_list[lang] = url |                 }) | ||||||
|  |                 sub_formats.append({ | ||||||
|  |                     'url': 'https://www.youtube.com/api/timedtext?' + params, | ||||||
|  |                     'ext': ext, | ||||||
|  |                 }) | ||||||
|  |             sub_lang_list[lang] = sub_formats | ||||||
|         if not sub_lang_list: |         if not sub_lang_list: | ||||||
|             self._downloader.report_warning('video doesn\'t have subtitles') |             self._downloader.report_warning('video doesn\'t have subtitles') | ||||||
|             return {} |             return {} | ||||||
|         return sub_lang_list |         return sub_lang_list | ||||||
|  |  | ||||||
|     def _get_available_automatic_caption(self, video_id, webpage): |     def _get_automatic_captions(self, video_id, webpage): | ||||||
|         """We need the webpage for getting the captions url, pass it as an |         """We need the webpage for getting the captions url, pass it as an | ||||||
|            argument to speed up the process.""" |            argument to speed up the process.""" | ||||||
|         sub_format = self._downloader.params.get('subtitlesformat', 'srt') |  | ||||||
|         self.to_screen('%s: Looking for automatic captions' % video_id) |         self.to_screen('%s: Looking for automatic captions' % video_id) | ||||||
|         mobj = re.search(r';ytplayer.config = ({.*?});', webpage) |         mobj = re.search(r';ytplayer.config = ({.*?});', webpage) | ||||||
|         err_msg = 'Couldn\'t find automatic captions for %s' % video_id |         err_msg = 'Couldn\'t find automatic captions for %s' % video_id | ||||||
| @@ -704,14 +707,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): | |||||||
|             sub_lang_list = {} |             sub_lang_list = {} | ||||||
|             for lang_node in caption_list.findall('target'): |             for lang_node in caption_list.findall('target'): | ||||||
|                 sub_lang = lang_node.attrib['lang_code'] |                 sub_lang = lang_node.attrib['lang_code'] | ||||||
|                 params = compat_urllib_parse.urlencode({ |                 sub_formats = [] | ||||||
|                     'lang': original_lang, |                 for ext in ['sbv', 'vtt', 'srt']: | ||||||
|                     'tlang': sub_lang, |                     params = compat_urllib_parse.urlencode({ | ||||||
|                     'fmt': sub_format, |                         'lang': original_lang, | ||||||
|                     'ts': timestamp, |                         'tlang': sub_lang, | ||||||
|                     'kind': caption_kind, |                         'fmt': ext, | ||||||
|                 }) |                         'ts': timestamp, | ||||||
|                 sub_lang_list[sub_lang] = caption_url + '&' + params |                         'kind': caption_kind, | ||||||
|  |                     }) | ||||||
|  |                     sub_formats.append({ | ||||||
|  |                         'url': caption_url + '&' + params, | ||||||
|  |                         'ext': ext, | ||||||
|  |                     }) | ||||||
|  |                 sub_lang_list[sub_lang] = sub_formats | ||||||
|             return sub_lang_list |             return sub_lang_list | ||||||
|         # An extractor error can be raise by the download process if there are |         # An extractor error can be raise by the download process if there are | ||||||
|         # no automatic captions but there are subtitles |         # no automatic captions but there are subtitles | ||||||
| @@ -966,10 +975,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): | |||||||
|  |  | ||||||
|         # subtitles |         # subtitles | ||||||
|         video_subtitles = self.extract_subtitles(video_id, video_webpage) |         video_subtitles = self.extract_subtitles(video_id, video_webpage) | ||||||
|  |         automatic_captions = self.extract_automatic_captions(video_id, video_webpage) | ||||||
|         if self._downloader.params.get('listsubtitles', False): |  | ||||||
|             self._list_available_subtitles(video_id, video_webpage) |  | ||||||
|             return |  | ||||||
|  |  | ||||||
|         if 'length_seconds' not in video_info: |         if 'length_seconds' not in video_info: | ||||||
|             self._downloader.report_warning('unable to extract video duration') |             self._downloader.report_warning('unable to extract video duration') | ||||||
| @@ -1118,6 +1124,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): | |||||||
|             'description': video_description, |             'description': video_description, | ||||||
|             'categories': video_categories, |             'categories': video_categories, | ||||||
|             'subtitles': video_subtitles, |             'subtitles': video_subtitles, | ||||||
|  |             'automatic_captions': automatic_captions, | ||||||
|             'duration': video_duration, |             'duration': video_duration, | ||||||
|             'age_limit': 18 if age_gate else 0, |             'age_limit': 18 if age_gate else 0, | ||||||
|             'annotations': video_annotations, |             'annotations': video_annotations, | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Jaime Marquínez Ferrándiz
					Jaime Marquínez Ferrándiz