mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-11-04 08:35:12 +00:00 
			
		
		
		
	[youtube] Parse API parameters from initial webpage (#230)
* Obtain innertube_context, api_key and x-goog-visitor-id from webpage
* Generalize the header & Innertube_context extraction across YouTube extractors
Related: 1b0a13f33c
Authored by: colethedj
			
			
This commit is contained in:
		@@ -284,21 +284,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
 | 
				
			|||||||
        if not self._login():
 | 
					        if not self._login():
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    _YT_WEB_CLIENT_VERSION = '2.20210301.08.00'
 | 
					    _YT_WEB_CLIENT_VERSION = '2.20210407.08.00'
 | 
				
			||||||
    _DEFAULT_API_DATA = {
 | 
					    _YT_INNERTUBE_API_KEY = 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
 | 
				
			||||||
        'context': {
 | 
					 | 
				
			||||||
            'client': {
 | 
					 | 
				
			||||||
                'clientName': 'WEB',
 | 
					 | 
				
			||||||
                'clientVersion': _YT_WEB_CLIENT_VERSION,
 | 
					 | 
				
			||||||
            }
 | 
					 | 
				
			||||||
        },
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    _DEFAULT_BASIC_API_HEADERS = {
 | 
					 | 
				
			||||||
        'X-YouTube-Client-Name': '1',
 | 
					 | 
				
			||||||
        'X-YouTube-Client-Version': _YT_WEB_CLIENT_VERSION
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
 | 
					    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
 | 
				
			||||||
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
 | 
					    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
 | 
				
			||||||
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
 | 
					    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
 | 
				
			||||||
@@ -312,19 +299,23 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
 | 
				
			|||||||
        return "SAPISIDHASH %s_%s" % (time_now, sapisidhash)
 | 
					        return "SAPISIDHASH %s_%s" % (time_now, sapisidhash)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _call_api(self, ep, query, video_id, fatal=True, headers=None,
 | 
					    def _call_api(self, ep, query, video_id, fatal=True, headers=None,
 | 
				
			||||||
                  note='Downloading API JSON', errnote='Unable to download API page'):
 | 
					                  note='Downloading API JSON', errnote='Unable to download API page',
 | 
				
			||||||
        data = self._DEFAULT_API_DATA.copy()
 | 
					                  context=None, api_key=None):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        data = {'context': context} if context else {'context': self._extract_context()}
 | 
				
			||||||
        data.update(query)
 | 
					        data.update(query)
 | 
				
			||||||
        headers = headers or {}
 | 
					        real_headers = self._generate_api_headers()
 | 
				
			||||||
        headers.update({'content-type': 'application/json'})
 | 
					        real_headers.update({'content-type': 'application/json'})
 | 
				
			||||||
        auth = self._generate_sapisidhash_header()
 | 
					        if headers:
 | 
				
			||||||
        if auth is not None:
 | 
					            real_headers.update(headers)
 | 
				
			||||||
            headers.update({'Authorization': auth, 'X-Origin': 'https://www.youtube.com'})
 | 
					 | 
				
			||||||
        return self._download_json(
 | 
					        return self._download_json(
 | 
				
			||||||
            'https://www.youtube.com/youtubei/v1/%s' % ep,
 | 
					            'https://www.youtube.com/youtubei/v1/%s' % ep,
 | 
				
			||||||
            video_id=video_id, fatal=fatal, note=note, errnote=errnote,
 | 
					            video_id=video_id, fatal=fatal, note=note, errnote=errnote,
 | 
				
			||||||
            data=json.dumps(data).encode('utf8'), headers=headers,
 | 
					            data=json.dumps(data).encode('utf8'), headers=real_headers,
 | 
				
			||||||
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
 | 
					            query={'key': api_key or self._extract_api_key()})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _extract_api_key(self, ytcfg=None):
 | 
				
			||||||
 | 
					        return try_get(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str) or self._YT_INNERTUBE_API_KEY
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _extract_yt_initial_data(self, video_id, webpage):
 | 
					    def _extract_yt_initial_data(self, video_id, webpage):
 | 
				
			||||||
        return self._parse_json(
 | 
					        return self._parse_json(
 | 
				
			||||||
@@ -358,7 +349,47 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
 | 
				
			|||||||
        return self._parse_json(
 | 
					        return self._parse_json(
 | 
				
			||||||
            self._search_regex(
 | 
					            self._search_regex(
 | 
				
			||||||
                r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
 | 
					                r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
 | 
				
			||||||
                default='{}'), video_id, fatal=False)
 | 
					                default='{}'), video_id, fatal=False) or {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __extract_client_version(self, ytcfg):
 | 
				
			||||||
 | 
					        return try_get(ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str) or self._YT_WEB_CLIENT_VERSION
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _extract_context(self, ytcfg=None):
 | 
				
			||||||
 | 
					        context = try_get(ytcfg, lambda x: x['INNERTUBE_CONTEXT'], dict)
 | 
				
			||||||
 | 
					        if context:
 | 
				
			||||||
 | 
					            return context
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Recreate the client context (required)
 | 
				
			||||||
 | 
					        client_version = self.__extract_client_version(ytcfg)
 | 
				
			||||||
 | 
					        client_name = try_get(ytcfg, lambda x: x['INNERTUBE_CLIENT_NAME'], compat_str) or 'WEB'
 | 
				
			||||||
 | 
					        context = {
 | 
				
			||||||
 | 
					            'client': {
 | 
				
			||||||
 | 
					                'clientName': client_name,
 | 
				
			||||||
 | 
					                'clientVersion': client_version,
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					        visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str)
 | 
				
			||||||
 | 
					        if visitor_data:
 | 
				
			||||||
 | 
					            context['client']['visitorData'] = visitor_data
 | 
				
			||||||
 | 
					        return context
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _generate_api_headers(self, ytcfg=None, identity_token=None, account_syncid=None, visitor_data=None):
 | 
				
			||||||
 | 
					        headers = {
 | 
				
			||||||
 | 
					            'X-YouTube-Client-Name': '1',
 | 
				
			||||||
 | 
					            'X-YouTube-Client-Version': self.__extract_client_version(ytcfg),
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					        if identity_token:
 | 
				
			||||||
 | 
					            headers['x-youtube-identity-token'] = identity_token
 | 
				
			||||||
 | 
					        if account_syncid:
 | 
				
			||||||
 | 
					            headers['X-Goog-PageId'] = account_syncid
 | 
				
			||||||
 | 
					            headers['X-Goog-AuthUser'] = 0
 | 
				
			||||||
 | 
					        if visitor_data:
 | 
				
			||||||
 | 
					            headers['x-goog-visitor-id'] = visitor_data
 | 
				
			||||||
 | 
					        auth = self._generate_sapisidhash_header()
 | 
				
			||||||
 | 
					        if auth is not None:
 | 
				
			||||||
 | 
					            headers['Authorization'] = auth
 | 
				
			||||||
 | 
					            headers['X-Origin'] = 'https://www.youtube.com'
 | 
				
			||||||
 | 
					        return headers
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _extract_video(self, renderer):
 | 
					    def _extract_video(self, renderer):
 | 
				
			||||||
        video_id = renderer.get('videoId')
 | 
					        video_id = renderer.get('videoId')
 | 
				
			||||||
@@ -1576,7 +1607,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 | 
				
			|||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
 | 
					    def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
 | 
				
			||||||
                         session_token_list, parent=None, comment_counts=None):
 | 
					                         ytcfg, session_token_list, parent=None, comment_counts=None):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        def extract_thread(parent_renderer):
 | 
					        def extract_thread(parent_renderer):
 | 
				
			||||||
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
 | 
					            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
 | 
				
			||||||
@@ -1602,7 +1633,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 | 
				
			|||||||
                if comment_replies_renderer:
 | 
					                if comment_replies_renderer:
 | 
				
			||||||
                    comment_counts[2] += 1
 | 
					                    comment_counts[2] += 1
 | 
				
			||||||
                    comment_entries_iter = self._comment_entries(
 | 
					                    comment_entries_iter = self._comment_entries(
 | 
				
			||||||
                        comment_replies_renderer, identity_token, account_syncid,
 | 
					                        comment_replies_renderer, identity_token, account_syncid, ytcfg,
 | 
				
			||||||
                        parent=comment.get('id'), session_token_list=session_token_list,
 | 
					                        parent=comment.get('id'), session_token_list=session_token_list,
 | 
				
			||||||
                        comment_counts=comment_counts)
 | 
					                        comment_counts=comment_counts)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -1612,16 +1643,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 | 
				
			|||||||
        if not comment_counts:
 | 
					        if not comment_counts:
 | 
				
			||||||
            # comment so far, est. total comments, current comment thread #
 | 
					            # comment so far, est. total comments, current comment thread #
 | 
				
			||||||
            comment_counts = [0, 0, 0]
 | 
					            comment_counts = [0, 0, 0]
 | 
				
			||||||
        headers = self._DEFAULT_BASIC_API_HEADERS.copy()
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # TODO: Generalize the download code with TabIE
 | 
					        # TODO: Generalize the download code with TabIE
 | 
				
			||||||
        if identity_token:
 | 
					        context = self._extract_context(ytcfg)
 | 
				
			||||||
            headers['x-youtube-identity-token'] = identity_token
 | 
					        visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)
 | 
				
			||||||
 | 
					 | 
				
			||||||
        if account_syncid:
 | 
					 | 
				
			||||||
            headers['X-Goog-PageId'] = account_syncid
 | 
					 | 
				
			||||||
            headers['X-Goog-AuthUser'] = 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        continuation = YoutubeTabIE._extract_continuation(root_continuation_data)  # TODO
 | 
					        continuation = YoutubeTabIE._extract_continuation(root_continuation_data)  # TODO
 | 
				
			||||||
        first_continuation = False
 | 
					        first_continuation = False
 | 
				
			||||||
        if parent is None:
 | 
					        if parent is None:
 | 
				
			||||||
@@ -1630,6 +1655,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 | 
				
			|||||||
        for page_num in itertools.count(0):
 | 
					        for page_num in itertools.count(0):
 | 
				
			||||||
            if not continuation:
 | 
					            if not continuation:
 | 
				
			||||||
                break
 | 
					                break
 | 
				
			||||||
 | 
					            headers = self._generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
 | 
				
			||||||
            retries = self._downloader.params.get('extractor_retries', 3)
 | 
					            retries = self._downloader.params.get('extractor_retries', 3)
 | 
				
			||||||
            count = -1
 | 
					            count = -1
 | 
				
			||||||
            last_error = None
 | 
					            last_error = None
 | 
				
			||||||
@@ -1711,6 +1737,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
            if not response:
 | 
					            if not response:
 | 
				
			||||||
                break
 | 
					                break
 | 
				
			||||||
 | 
					            visitor_data = try_get(
 | 
				
			||||||
 | 
					                response,
 | 
				
			||||||
 | 
					                lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'],
 | 
				
			||||||
 | 
					                compat_str) or visitor_data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            known_continuation_renderers = {
 | 
					            known_continuation_renderers = {
 | 
				
			||||||
                'itemSectionContinuation': extract_thread,
 | 
					                'itemSectionContinuation': extract_thread,
 | 
				
			||||||
@@ -1777,6 +1807,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 | 
				
			|||||||
                    renderer,
 | 
					                    renderer,
 | 
				
			||||||
                    identity_token=self._extract_identity_token(webpage, item_id=video_id),
 | 
					                    identity_token=self._extract_identity_token(webpage, item_id=video_id),
 | 
				
			||||||
                    account_syncid=self._extract_account_syncid(ytcfg),
 | 
					                    account_syncid=self._extract_account_syncid(ytcfg),
 | 
				
			||||||
 | 
					                    ytcfg=ytcfg,
 | 
				
			||||||
                    session_token_list=[xsrf_token])
 | 
					                    session_token_list=[xsrf_token])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                for comment in comment_iter:
 | 
					                for comment in comment_iter:
 | 
				
			||||||
@@ -1804,9 +1835,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 | 
				
			|||||||
            player_response = self._extract_yt_initial_variable(
 | 
					            player_response = self._extract_yt_initial_variable(
 | 
				
			||||||
                webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
 | 
					                webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
 | 
				
			||||||
                video_id, 'initial player response')
 | 
					                video_id, 'initial player response')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        ytcfg = self._extract_ytcfg(video_id, webpage)
 | 
				
			||||||
        if not player_response:
 | 
					        if not player_response:
 | 
				
			||||||
            player_response = self._call_api(
 | 
					            player_response = self._call_api(
 | 
				
			||||||
                'player', {'videoId': video_id}, video_id)
 | 
					                'player', {'videoId': video_id}, video_id, api_key=self._extract_api_key(ytcfg))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        playability_status = player_response.get('playabilityStatus') or {}
 | 
					        playability_status = player_response.get('playabilityStatus') or {}
 | 
				
			||||||
        if playability_status.get('reason') == 'Sign in to confirm your age':
 | 
					        if playability_status.get('reason') == 'Sign in to confirm your age':
 | 
				
			||||||
@@ -2190,7 +2223,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 | 
				
			|||||||
                'yt initial data')
 | 
					                'yt initial data')
 | 
				
			||||||
        if not initial_data:
 | 
					        if not initial_data:
 | 
				
			||||||
            initial_data = self._call_api(
 | 
					            initial_data = self._call_api(
 | 
				
			||||||
                'next', {'videoId': video_id}, video_id, fatal=False)
 | 
					                'next', {'videoId': video_id}, video_id, fatal=False, api_key=self._extract_api_key(ytcfg))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if not is_live:
 | 
					        if not is_live:
 | 
				
			||||||
            try:
 | 
					            try:
 | 
				
			||||||
@@ -2942,7 +2975,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
 | 
				
			|||||||
            ctp = continuation_ep.get('clickTrackingParams')
 | 
					            ctp = continuation_ep.get('clickTrackingParams')
 | 
				
			||||||
            return YoutubeTabIE._build_continuation_query(continuation, ctp)
 | 
					            return YoutubeTabIE._build_continuation_query(continuation, ctp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _entries(self, tab, item_id, identity_token, account_syncid):
 | 
					    def _entries(self, tab, item_id, identity_token, account_syncid, ytcfg):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        def extract_entries(parent_renderer):  # this needs to called again for continuation to work with feeds
 | 
					        def extract_entries(parent_renderer):  # this needs to called again for continuation to work with feeds
 | 
				
			||||||
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
 | 
					            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
 | 
				
			||||||
@@ -2994,21 +3027,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
 | 
				
			|||||||
        for entry in extract_entries(parent_renderer):
 | 
					        for entry in extract_entries(parent_renderer):
 | 
				
			||||||
            yield entry
 | 
					            yield entry
 | 
				
			||||||
        continuation = continuation_list[0]
 | 
					        continuation = continuation_list[0]
 | 
				
			||||||
 | 
					        context = self._extract_context(ytcfg)
 | 
				
			||||||
        headers = {
 | 
					        visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)
 | 
				
			||||||
            'x-youtube-client-name': '1',
 | 
					 | 
				
			||||||
            'x-youtube-client-version': '2.20201112.04.01',
 | 
					 | 
				
			||||||
        }
 | 
					 | 
				
			||||||
        if identity_token:
 | 
					 | 
				
			||||||
            headers['x-youtube-identity-token'] = identity_token
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if account_syncid:
 | 
					 | 
				
			||||||
            headers['X-Goog-PageId'] = account_syncid
 | 
					 | 
				
			||||||
            headers['X-Goog-AuthUser'] = 0
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for page_num in itertools.count(1):
 | 
					        for page_num in itertools.count(1):
 | 
				
			||||||
            if not continuation:
 | 
					            if not continuation:
 | 
				
			||||||
                break
 | 
					                break
 | 
				
			||||||
 | 
					            headers = self._generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
 | 
				
			||||||
            retries = self._downloader.params.get('extractor_retries', 3)
 | 
					            retries = self._downloader.params.get('extractor_retries', 3)
 | 
				
			||||||
            count = -1
 | 
					            count = -1
 | 
				
			||||||
            last_error = None
 | 
					            last_error = None
 | 
				
			||||||
@@ -3024,6 +3049,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
 | 
				
			|||||||
                            'continuation': continuation['continuation'],
 | 
					                            'continuation': continuation['continuation'],
 | 
				
			||||||
                            'clickTracking': {'clickTrackingParams': continuation['itct']},
 | 
					                            'clickTracking': {'clickTrackingParams': continuation['itct']},
 | 
				
			||||||
                        },
 | 
					                        },
 | 
				
			||||||
 | 
					                        context=context,
 | 
				
			||||||
 | 
					                        api_key=self._extract_api_key(ytcfg),
 | 
				
			||||||
                        note='Downloading API JSON%s' % (' (retry #%d)' % count if count else ''))
 | 
					                        note='Downloading API JSON%s' % (' (retry #%d)' % count if count else ''))
 | 
				
			||||||
                except ExtractorError as e:
 | 
					                except ExtractorError as e:
 | 
				
			||||||
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
 | 
					                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404):
 | 
				
			||||||
@@ -3049,6 +3076,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
            if not response:
 | 
					            if not response:
 | 
				
			||||||
                break
 | 
					                break
 | 
				
			||||||
 | 
					            visitor_data = try_get(
 | 
				
			||||||
 | 
					                response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            known_continuation_renderers = {
 | 
					            known_continuation_renderers = {
 | 
				
			||||||
                'playlistVideoListContinuation': self._playlist_entries,
 | 
					                'playlistVideoListContinuation': self._playlist_entries,
 | 
				
			||||||
@@ -3196,7 +3225,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
 | 
				
			|||||||
            self._entries(
 | 
					            self._entries(
 | 
				
			||||||
                selected_tab, playlist_id,
 | 
					                selected_tab, playlist_id,
 | 
				
			||||||
                self._extract_identity_token(webpage, item_id),
 | 
					                self._extract_identity_token(webpage, item_id),
 | 
				
			||||||
                self._extract_account_syncid(data)),
 | 
					                self._extract_account_syncid(data),
 | 
				
			||||||
 | 
					                self._extract_ytcfg(item_id, webpage)),
 | 
				
			||||||
            **metadata)
 | 
					            **metadata)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _extract_mix_playlist(self, playlist, playlist_id):
 | 
					    def _extract_mix_playlist(self, playlist, playlist_id):
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user