mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-30 22:25:19 +00:00 
			
		
		
		
	[googledrive] Add support for subtitles (fixes #13619)
This commit is contained in:
		| @@ -7,6 +7,8 @@ from ..utils import ( | ||||
|     ExtractorError, | ||||
|     int_or_none, | ||||
|     lowercase_escape, | ||||
|     error_to_compat_str, | ||||
|     update_url_query, | ||||
| ) | ||||
|  | ||||
|  | ||||
| @@ -24,7 +26,14 @@ class GoogleDriveIE(InfoExtractor): | ||||
|     }, { | ||||
|         # video id is longer than 28 characters | ||||
|         'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', | ||||
|         'only_matching': True, | ||||
|         'md5': 'c230c67252874fddd8170e3fd1a45886', | ||||
|         'info_dict': { | ||||
|             'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', | ||||
|             'duration': 189, | ||||
|         }, | ||||
|         'only_matching': True | ||||
|     }] | ||||
|     _FORMATS_EXT = { | ||||
|         '5': 'flv', | ||||
| @@ -44,6 +53,13 @@ class GoogleDriveIE(InfoExtractor): | ||||
|         '46': 'webm', | ||||
|         '59': 'mp4', | ||||
|     } | ||||
|     _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext' | ||||
|     _CAPTIONS_ENTRY_TAG = { | ||||
|         'subtitles': 'track', | ||||
|         'automatic_captions': 'target', | ||||
|     } | ||||
|     _caption_formats_ext = [] | ||||
|     _captions_by_country_xml = None | ||||
|  | ||||
|     @staticmethod | ||||
|     def _extract_url(webpage): | ||||
| @@ -53,6 +69,81 @@ class GoogleDriveIE(InfoExtractor): | ||||
|         if mobj: | ||||
|             return 'https://drive.google.com/file/d/%s' % mobj.group('id') | ||||
|  | ||||
|     def _set_captions_data(self, video_id, video_subtitles_id, hl): | ||||
|         try: | ||||
|             self._captions_by_country_xml = self._download_xml(self._BASE_URL_CAPTIONS, video_id, query={ | ||||
|                 'id': video_id, | ||||
|                 'vid': video_subtitles_id, | ||||
|                 'hl': hl, | ||||
|                 'v': video_id, | ||||
|                 'type': 'list', | ||||
|                 'tlangs': '1', | ||||
|                 'fmts': '1', | ||||
|                 'vssids': '1', | ||||
|             }) | ||||
|         except ExtractorError as ee: | ||||
|             self.report_warning('unable to download video subtitles: %s' % error_to_compat_str(ee)) | ||||
|         if self._captions_by_country_xml is not None: | ||||
|             caption_available_extensions = self._captions_by_country_xml.findall('format') | ||||
|             for caption_extension in caption_available_extensions: | ||||
|                 if caption_extension.attrib.get('fmt_code') and not caption_extension.attrib.get('default'): | ||||
|                     self._caption_formats_ext.append(caption_extension.attrib['fmt_code']) | ||||
|  | ||||
def _get_captions_by_type(self, video_id, video_subtitles_id, caption_type, caption_original_lang_code=None):
    """Build the caption dictionary for one caption type from the cached listing.

    Walks the elements of ``self._captions_by_country_xml`` whose tag is
    ``_CAPTIONS_ENTRY_TAG[caption_type]`` and, for every entry that has a
    ``lang_code``, produces one ``{'url': ..., 'ext': ...}`` item per format
    in ``self._caption_formats_ext``.

    :param video_id: id sent as the ``v`` query parameter
    :param video_subtitles_id: id sent as the ``vid`` query parameter
    :param caption_type: key into ``_CAPTIONS_ENTRY_TAG``
    :param caption_original_lang_code: when given, it is sent as ``lang``
        and the entry's own language code is sent as ``tlang``
    :returns: dict mapping language code to a list of format dicts, or
        None when ``video_subtitles_id`` or ``caption_type`` is empty
    """
    if not (video_subtitles_id and caption_type):
        return None

    has_source_lang = caption_original_lang_code is not None

    def track_query(lang_code, fmt):
        # Key order is kept as-is since it determines the query string layout.
        params = {
            'vid': video_subtitles_id,
            'v': video_id,
            'fmt': fmt,
            'lang': caption_original_lang_code if has_source_lang else lang_code,
            'type': 'track',
            'name': '',
            'kind': '',
        }
        if has_source_lang:
            params['tlang'] = lang_code
        return params

    entry_tag = self._CAPTIONS_ENTRY_TAG[caption_type]
    captions = {}
    for entry in self._captions_by_country_xml.findall(entry_tag):
        lang_code = entry.attrib.get('lang_code')
        if lang_code:
            captions[lang_code] = [
                {
                    'url': update_url_query(self._BASE_URL_CAPTIONS, track_query(lang_code, fmt)),
                    'ext': fmt,
                }
                for fmt in self._caption_formats_ext
            ]

    if not captions:
        self.report_warning("video doesn't have %s" % caption_type.replace('_', ' '))
    return captions
|  | ||||
|     def _get_subtitles(self, video_id, video_subtitles_id, hl): | ||||
|         if not video_subtitles_id or not hl: | ||||
|             return None | ||||
|         if self._captions_by_country_xml is None: | ||||
|             self._set_captions_data(video_id, video_subtitles_id, hl) | ||||
|             if self._captions_by_country_xml is None: | ||||
|                 return None | ||||
|         return self._get_captions_by_type(video_id, video_subtitles_id, 'subtitles') | ||||
|  | ||||
|     def _get_automatic_captions(self, video_id, video_subtitles_id, hl): | ||||
|         if not video_subtitles_id or not hl: | ||||
|             return None | ||||
|         if self._captions_by_country_xml is None: | ||||
|             self._set_captions_data(video_id, video_subtitles_id, hl) | ||||
|             if self._captions_by_country_xml is None: | ||||
|                 return None | ||||
|         self.to_screen('%s: Looking for automatic captions' % video_id) | ||||
|         subtitle_original_track = self._captions_by_country_xml.find('track') | ||||
|         if subtitle_original_track is None: | ||||
|             return None | ||||
|         subtitle_original_lang_code = subtitle_original_track.attrib.get('lang_code') | ||||
|         if not subtitle_original_lang_code: | ||||
|             return None | ||||
|         return self._get_captions_by_type(video_id, video_subtitles_id, 'automatic_captions', subtitle_original_lang_code) | ||||
|  | ||||
|     def _real_extract(self, url): | ||||
|         video_id = self._match_id(url) | ||||
|         webpage = self._download_webpage( | ||||
| @@ -97,10 +188,21 @@ class GoogleDriveIE(InfoExtractor): | ||||
|             formats.append(f) | ||||
|         self._sort_formats(formats) | ||||
|  | ||||
|         hl = self._search_regex( | ||||
|             r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None) | ||||
|         video_subtitles_id = None | ||||
|         ttsurl = self._search_regex( | ||||
|             r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None) | ||||
|         if ttsurl: | ||||
|             # the video Id for subtitles will be the last value in the ttsurl query string | ||||
|             video_subtitles_id = ttsurl.encode('utf-8').decode('unicode_escape').split('=')[-1] | ||||
|  | ||||
|         return { | ||||
|             'id': video_id, | ||||
|             'title': title, | ||||
|             'thumbnail': self._og_search_thumbnail(webpage, default=None), | ||||
|             'duration': duration, | ||||
|             'formats': formats, | ||||
|             'subtitles': self.extract_subtitles(video_id, video_subtitles_id, hl), | ||||
|             'automatic_captions': self.extract_automatic_captions(video_id, video_subtitles_id, hl), | ||||
|         } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Parmjit Virk
					Parmjit Virk