mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 22:55:18 +00:00 
			
		
		
		
	[extractor/twitter] Fix GraphQL and legacy API (#7516)
Authored by: bashonly
This commit is contained in:
		| @@ -11,6 +11,7 @@ from ..compat import ( | ||||
| from ..utils import ( | ||||
|     ExtractorError, | ||||
|     dict_get, | ||||
|     filter_dict, | ||||
|     float_or_none, | ||||
|     format_field, | ||||
|     int_or_none, | ||||
| @@ -33,8 +34,8 @@ class TwitterBaseIE(InfoExtractor): | ||||
|     _API_BASE = 'https://api.twitter.com/1.1/' | ||||
|     _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' | ||||
|     _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' | ||||
|     _AUTH = {'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'} | ||||
|     _guest_token = None | ||||
|     _AUTH = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' | ||||
|     _LEGACY_AUTH = 'AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE' | ||||
|     _flow_token = None | ||||
| 
 | ||||
|     _LOGIN_INIT_DATA = json.dumps({ | ||||
| @@ -145,20 +146,21 @@ class TwitterBaseIE(InfoExtractor): | ||||
|     def is_logged_in(self): | ||||
|         return bool(self._get_cookies(self._API_BASE).get('auth_token')) | ||||
| 
 | ||||
|     def _fetch_guest_token(self, headers, display_id): | ||||
|         headers.pop('x-guest-token', None) | ||||
|         self._guest_token = traverse_obj(self._download_json( | ||||
|             f'{self._API_BASE}guest/activate.json', display_id, | ||||
|             'Downloading guest token', data=b'', headers=headers), 'guest_token') | ||||
|         if not self._guest_token: | ||||
|     def _fetch_guest_token(self, display_id): | ||||
|         guest_token = traverse_obj(self._download_json( | ||||
|             f'{self._API_BASE}guest/activate.json', display_id, 'Downloading guest token', data=b'', | ||||
|             headers=self._set_base_headers(legacy=display_id and self._configuration_arg('legacy_api'))), | ||||
|             ('guest_token', {str})) | ||||
|         if not guest_token: | ||||
|             raise ExtractorError('Could not retrieve guest token') | ||||
|         return guest_token | ||||
| 
 | ||||
|     def _set_base_headers(self): | ||||
|         headers = self._AUTH.copy() | ||||
|         csrf_token = try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value) | ||||
|         if csrf_token: | ||||
|             headers['x-csrf-token'] = csrf_token | ||||
|         return headers | ||||
|     def _set_base_headers(self, legacy=False): | ||||
|         bearer_token = self._LEGACY_AUTH if legacy and not self.is_logged_in else self._AUTH | ||||
|         return filter_dict({ | ||||
|             'Authorization': f'Bearer {bearer_token}', | ||||
|             'x-csrf-token': try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value), | ||||
|         }) | ||||
| 
 | ||||
|     def _call_login_api(self, note, headers, query={}, data=None): | ||||
|         response = self._download_json( | ||||
| @@ -183,17 +185,18 @@ class TwitterBaseIE(InfoExtractor): | ||||
|         if self.is_logged_in: | ||||
|             return | ||||
| 
 | ||||
|         self._request_webpage('https://twitter.com/', None, 'Requesting cookies') | ||||
|         headers = self._set_base_headers() | ||||
|         self._fetch_guest_token(headers, None) | ||||
|         headers.update({ | ||||
|         webpage = self._download_webpage('https://twitter.com/', None, 'Downloading login page') | ||||
|         guest_token = self._search_regex( | ||||
|             r'\.cookie\s*=\s*["\']gt=(\d+);', webpage, 'gt', default=None) or self._fetch_guest_token(None) | ||||
|         headers = { | ||||
|             **self._set_base_headers(), | ||||
|             'content-type': 'application/json', | ||||
|             'x-guest-token': self._guest_token, | ||||
|             'x-guest-token': guest_token, | ||||
|             'x-twitter-client-language': 'en', | ||||
|             'x-twitter-active-user': 'yes', | ||||
|             'Referer': 'https://twitter.com/', | ||||
|             'Origin': 'https://twitter.com', | ||||
|         }) | ||||
|         } | ||||
| 
 | ||||
|         def build_login_json(*subtask_inputs): | ||||
|             return json.dumps({ | ||||
| @@ -285,37 +288,26 @@ class TwitterBaseIE(InfoExtractor): | ||||
|         self.report_login() | ||||
| 
 | ||||
|     def _call_api(self, path, video_id, query={}, graphql=False): | ||||
|         headers = self._set_base_headers() | ||||
|         if self.is_logged_in: | ||||
|             headers.update({ | ||||
|                 'x-twitter-auth-type': 'OAuth2Session', | ||||
|                 'x-twitter-client-language': 'en', | ||||
|                 'x-twitter-active-user': 'yes', | ||||
|             }) | ||||
|         headers = self._set_base_headers(legacy=not graphql and self._configuration_arg('legacy_api')) | ||||
|         headers.update({ | ||||
|             'x-twitter-auth-type': 'OAuth2Session', | ||||
|             'x-twitter-client-language': 'en', | ||||
|             'x-twitter-active-user': 'yes', | ||||
|         } if self.is_logged_in else { | ||||
|             'x-guest-token': self._fetch_guest_token(video_id) | ||||
|         }) | ||||
|         allowed_status = {400, 401, 403, 404} if graphql else {403} | ||||
|         result = self._download_json( | ||||
|             (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path, | ||||
|             video_id, headers=headers, query=query, expected_status=allowed_status, | ||||
|             note=f'Downloading {"GraphQL" if graphql else "legacy API"} JSON') | ||||
| 
 | ||||
|         for first_attempt in (True, False): | ||||
|             if not self.is_logged_in: | ||||
|                 if not self._guest_token: | ||||
|                     self._fetch_guest_token(headers, video_id) | ||||
|                 headers['x-guest-token'] = self._guest_token | ||||
|         if result.get('errors'): | ||||
|             errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str})))) | ||||
|             raise ExtractorError( | ||||
|                 f'Error(s) while querying API: {errors or "Unknown error"}', expected=True) | ||||
| 
 | ||||
|             allowed_status = {400, 401, 403, 404} if graphql else {403} | ||||
|             result = self._download_json( | ||||
|                 (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path, | ||||
|                 video_id, headers=headers, query=query, expected_status=allowed_status, | ||||
|                 note=f'Downloading {"GraphQL" if graphql else "legacy API"} JSON') | ||||
| 
 | ||||
|             if result.get('errors'): | ||||
|                 errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str})))) | ||||
|                 if not self.is_logged_in and first_attempt and 'bad guest token' in errors.lower(): | ||||
|                     self.to_screen('Guest token has expired. Refreshing guest token') | ||||
|                     self._guest_token = None | ||||
|                     continue | ||||
| 
 | ||||
|                 raise ExtractorError( | ||||
|                     f'Error(s) while querying API: {errors or "Unknown error"}', expected=True) | ||||
| 
 | ||||
|             return result | ||||
|         return result | ||||
| 
 | ||||
|     def _build_graphql_query(self, media_id): | ||||
|         raise NotImplementedError('Method must be implemented to support GraphQL') | ||||
| @@ -765,9 +757,9 @@ class TwitterIE(TwitterBaseIE): | ||||
|         'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464', | ||||
|         'info_dict': { | ||||
|             'id': '1577719286659006464', | ||||
|             'title': 'Ultima | #\u0432\u029f\u043c - Test', | ||||
|             'title': 'Ultima📛 | #вʟм - Test', | ||||
|             'description': 'Test https://t.co/Y3KEZD7Dad', | ||||
|             'uploader': 'Ultima | #\u0432\u029f\u043c', | ||||
|             'uploader': 'Ultima📛 | #вʟм', | ||||
|             'uploader_id': 'UltimaShadowX', | ||||
|             'uploader_url': 'https://twitter.com/UltimaShadowX', | ||||
|             'upload_date': '20221005', | ||||
| @@ -825,6 +817,7 @@ class TwitterIE(TwitterBaseIE): | ||||
|         }, | ||||
|         'skip': 'Requires authentication', | ||||
|     }, { | ||||
|         # Playlist result only with auth | ||||
|         'url': 'https://twitter.com/Srirachachau/status/1395079556562706435', | ||||
|         'playlist_mincount': 2, | ||||
|         'info_dict': { | ||||
| @@ -896,6 +889,7 @@ class TwitterIE(TwitterBaseIE): | ||||
|         }, | ||||
|         'add_ie': ['TwitterSpaces'], | ||||
|         'params': {'skip_download': 'm3u8'}, | ||||
|         'skip': 'Requires authentication', | ||||
|     }, { | ||||
|         # URL specifies video number but --yes-playlist | ||||
|         'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/1', | ||||
| @@ -1009,14 +1003,14 @@ class TwitterIE(TwitterBaseIE): | ||||
|             'timestamp': 1670306984.0, | ||||
|         }, | ||||
|     }, { | ||||
|         # url to retweet id, legacy API | ||||
|         # url to retweet id w/ legacy api | ||||
|         'url': 'https://twitter.com/liberdalau/status/1623739803874349067', | ||||
|         'info_dict': { | ||||
|             'id': '1623274794488659969', | ||||
|             'display_id': '1623739803874349067', | ||||
|             'ext': 'mp4', | ||||
|             'title': 'Johnny Bullets - Me after going viral to over 30million people:    Whoopsie-daisy', | ||||
|             'description': 'md5:e873616a4a8fe0f93e71872678a672f3', | ||||
|             'description': 'md5:b06864cd3dc2554821cc327f5348485a', | ||||
|             'uploader': 'Johnny Bullets', | ||||
|             'uploader_id': 'Johnnybull3ts', | ||||
|             'uploader_url': 'https://twitter.com/Johnnybull3ts', | ||||
| @@ -1028,9 +1022,31 @@ class TwitterIE(TwitterBaseIE): | ||||
|             'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+', | ||||
|             'like_count': int, | ||||
|             'repost_count': int, | ||||
|             'comment_count': int, | ||||
|         }, | ||||
|         'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}}, | ||||
|     }, { | ||||
|         # orig tweet w/ graphql | ||||
|         'url': 'https://twitter.com/liberdalau/status/1623739803874349067', | ||||
|         'info_dict': { | ||||
|             'id': '1623274794488659969', | ||||
|             'display_id': '1623739803874349067', | ||||
|             'ext': 'mp4', | ||||
|             'title': '@selfisekai@hackerspace.pl 🐀 - RT @Johnnybull3ts: Me after going viral to over 30million people:    Whoopsie-daisy', | ||||
|             'description': 'md5:9258bdbb54793bdc124fe1cd47e96c6a', | ||||
|             'uploader': '@selfisekai@hackerspace.pl 🐀', | ||||
|             'uploader_id': 'liberdalau', | ||||
|             'uploader_url': 'https://twitter.com/liberdalau', | ||||
|             'age_limit': 0, | ||||
|             'tags': [], | ||||
|             'duration': 8.033, | ||||
|             'timestamp': 1675964711.0, | ||||
|             'upload_date': '20230209', | ||||
|             'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+', | ||||
|             'like_count': int, | ||||
|             'view_count': int, | ||||
|             'repost_count': int, | ||||
|             'comment_count': int, | ||||
|         }, | ||||
|     }, { | ||||
|         # onion route | ||||
|         'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', | ||||
| @@ -1073,17 +1089,21 @@ class TwitterIE(TwitterBaseIE): | ||||
|         result = traverse_obj(data, ( | ||||
|             'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries', | ||||
|             lambda _, v: v['entryId'] == f'tweet-{twid}', 'content', 'itemContent', | ||||
|             'tweet_results', 'result', ('tweet', None), | ||||
|         ), expected_type=dict, default={}, get_all=False) | ||||
|             'tweet_results', 'result', ('tweet', None), {dict}, | ||||
|         ), default={}, get_all=False) if self.is_logged_in else traverse_obj( | ||||
|             data, ('tweetResult', 'result', {dict}), default={}) | ||||
| 
 | ||||
|         if result.get('__typename') not in ('Tweet', 'TweetTombstone', None): | ||||
|         if result.get('__typename') not in ('Tweet', 'TweetTombstone', 'TweetUnavailable', None): | ||||
|             self.report_warning(f'Unknown typename: {result.get("__typename")}', twid, only_once=True) | ||||
| 
 | ||||
|         if 'tombstone' in result: | ||||
|             cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. Learn more') | ||||
|             if cause and 'adult content' in cause: | ||||
|                 self.raise_login_required(cause) | ||||
|             raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True) | ||||
|         elif result.get('__typename') == 'TweetUnavailable': | ||||
|             reason = result.get('reason') | ||||
|             if reason == 'NsfwLoggedOut': | ||||
|                 self.raise_login_required('NSFW tweet requires authentication') | ||||
|             raise ExtractorError(reason or 'Requested tweet is unavailable', expected=True) | ||||
| 
 | ||||
|         status = result.get('legacy', {}) | ||||
|         status.update(traverse_obj(result, { | ||||
| @@ -1134,11 +1154,42 @@ class TwitterIE(TwitterBaseIE): | ||||
|                 'verified_phone_label_enabled': False, | ||||
|                 'vibe_api_enabled': True, | ||||
|             }, | ||||
|         } if self.is_logged_in else { | ||||
|             'variables': { | ||||
|                 'tweetId': media_id, | ||||
|                 'withCommunity': False, | ||||
|                 'includePromotedContent': False, | ||||
|                 'withVoice': False, | ||||
|             }, | ||||
|             'features': { | ||||
|                 'creator_subscriptions_tweet_preview_api_enabled': True, | ||||
|                 'tweetypie_unmention_optimization_enabled': True, | ||||
|                 'responsive_web_edit_tweet_api_enabled': True, | ||||
|                 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': True, | ||||
|                 'view_counts_everywhere_api_enabled': True, | ||||
|                 'longform_notetweets_consumption_enabled': True, | ||||
|                 'responsive_web_twitter_article_tweet_consumption_enabled': False, | ||||
|                 'tweet_awards_web_tipping_enabled': False, | ||||
|                 'freedom_of_speech_not_reach_fetch_enabled': True, | ||||
|                 'standardized_nudges_misinfo': True, | ||||
|                 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': True, | ||||
|                 'longform_notetweets_rich_text_read_enabled': True, | ||||
|                 'longform_notetweets_inline_media_enabled': True, | ||||
|                 'responsive_web_graphql_exclude_directive_enabled': True, | ||||
|                 'verified_phone_label_enabled': False, | ||||
|                 'responsive_web_media_download_video_enabled': False, | ||||
|                 'responsive_web_graphql_skip_user_profile_image_extensions_enabled': False, | ||||
|                 'responsive_web_graphql_timeline_navigation_enabled': True, | ||||
|                 'responsive_web_enhance_cards_enabled': False | ||||
|             }, | ||||
|             'fieldToggles': { | ||||
|                 'withArticleRichContentState': False | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|     def _real_extract(self, url): | ||||
|         twid, selected_index = self._match_valid_url(url).group('id', 'index') | ||||
|         if self._configuration_arg('legacy_api') and not self.is_logged_in: | ||||
|         if not self.is_logged_in and self._configuration_arg('legacy_api'): | ||||
|             status = traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, { | ||||
|                 'cards_platform': 'Web-12', | ||||
|                 'include_cards': 1, | ||||
| @@ -1146,11 +1197,15 @@ class TwitterIE(TwitterBaseIE): | ||||
|                 'include_user_entities': 0, | ||||
|                 'tweet_mode': 'extended', | ||||
|             }), 'retweeted_status', None) | ||||
|         elif not self.is_logged_in: | ||||
|             status = self._graphql_to_legacy( | ||||
|                 self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid) | ||||
|         else: | ||||
|             result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid) | ||||
|             status = self._graphql_to_legacy(result, twid) | ||||
|             status = self._graphql_to_legacy( | ||||
|                 self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid) | ||||
| 
 | ||||
|         title = description = status['full_text'].replace('\n', ' ') | ||||
|         title = description = traverse_obj( | ||||
|             status, (('full_text', 'text'), {lambda x: x.replace('\n', ' ')}), get_all=False) or '' | ||||
|         # strip  'https -_t.co_BJYgOjSeGA' junk from filenames | ||||
|         title = re.sub(r'\s+(https?://[^ ]+)', '', title) | ||||
|         user = status.get('user') or {} | ||||
| @@ -1177,11 +1232,10 @@ class TwitterIE(TwitterBaseIE): | ||||
|         def extract_from_video_info(media): | ||||
|             media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none) | ||||
|             self.write_debug(f'Extracting from video info: {media_id}') | ||||
|             video_info = media.get('video_info') or {} | ||||
| 
 | ||||
|             formats = [] | ||||
|             subtitles = {} | ||||
|             for variant in video_info.get('variants', []): | ||||
|             for variant in traverse_obj(media, ('video_info', 'variants', ...)): | ||||
|                 fmts, subs = self._extract_variant_formats(variant, twid) | ||||
|                 subtitles = self._merge_subtitles(subtitles, subs) | ||||
|                 formats.extend(fmts) | ||||
| @@ -1206,7 +1260,7 @@ class TwitterIE(TwitterBaseIE): | ||||
|                 'subtitles': subtitles, | ||||
|                 'thumbnails': thumbnails, | ||||
|                 'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), | ||||
|                 'duration': float_or_none(video_info.get('duration_millis'), 1000), | ||||
|                 'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000), | ||||
|                 # The codec of http formats are unknown | ||||
|                 '_format_sort_fields': ('res', 'br', 'size', 'proto'), | ||||
|             } | ||||
| @@ -1291,7 +1345,8 @@ class TwitterIE(TwitterBaseIE): | ||||
|         if self._yes_playlist(twid, selected_index, video_label='URL-specified video number'): | ||||
|             selected_entries = (*map(extract_from_video_info, videos), *extract_from_card_info(status.get('card'))) | ||||
|         else: | ||||
|             desired_obj = traverse_obj(status, ('extended_entities', 'media', int(selected_index) - 1, {dict})) | ||||
|             desired_obj = traverse_obj(status, ( | ||||
|                 (None, 'quoted_status'), 'extended_entities', 'media', int(selected_index) - 1, {dict}), get_all=False) | ||||
|             if not desired_obj: | ||||
|                 raise ExtractorError(f'Video #{selected_index} is unavailable', expected=True) | ||||
|             elif desired_obj.get('type') != 'video': | ||||
| @@ -1481,6 +1536,8 @@ class TwitterSpacesIE(TwitterBaseIE): | ||||
| 
 | ||||
|     def _real_extract(self, url): | ||||
|         space_id = self._match_id(url) | ||||
|         if not self.is_logged_in: | ||||
|             self.raise_login_required('Twitter Spaces require authentication') | ||||
|         space_data = self._call_graphql_api('HPEisOmj1epUNLCWTYhUWw/AudioSpaceById', space_id)['audioSpace'] | ||||
|         if not space_data: | ||||
|             raise ExtractorError('Twitter Space not found', expected=True) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 bashonly
					bashonly