[core] Fix HTTP headers and cookie handling

- Remove `Cookie` header from `http_headers` immediately after loading into cookiejar
- Restore compat for `--load-info-json` cookies
- Add more tests
- Fix improper passing of Cookie header by `MailRu` extractor

Closes #7558
Authored by: bashonly, pukkandan
2026-02-08 15:07:05 +00:00 · 2023-07-15 15:22:10 -05:00
parent 2b029ca0a9
commit 6c5211cebe
5 changed files with 120 additions and 33 deletions
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -680,14 +680,15 @@ class YoutubeDL:

        self.params['compat_opts'] = set(self.params.get('compat_opts', ()))
        self.params['http_headers'] = HTTPHeaderDict(std_headers, self.params.get('http_headers'))
+        self.__header_cookies = []
+        self._load_cookies(self.params['http_headers'].get('Cookie'))  # compat
+        self.params['http_headers'].pop('Cookie', None)
+
        self._request_director = self.build_request_director(
            sorted(_REQUEST_HANDLERS.values(), key=lambda rh: rh.RH_NAME.lower()))
        if auto_init and auto_init != 'no_verbose_header':
            self.print_debug_header()

-        self.__header_cookies = []
-        self._load_cookies(traverse_obj(self.params.get('http_headers'), 'cookie', casesense=False))  # compat
-
        def check_deprecated(param, option, suggestion):
            if self.params.get(param) is not None:
                self.report_warning(f'{option} is deprecated. Use {suggestion} instead')
@@ -1645,18 +1646,19 @@ class YoutubeDL:
                self.to_screen('')
            raise

-    def _load_cookies(self, data, *, from_headers=True):
+    def _load_cookies(self, data, *, autoscope=True):
        """Loads cookies from a `Cookie` header

        This tries to work around the security vulnerability of passing cookies to every domain.
        See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj
-        The unscoped cookies are saved for later to be stored in the jar with a limited scope.

        @param data         The Cookie header as string to load the cookies from
-        @param from_headers If `False`, allows Set-Cookie syntax in the cookie string (at least a domain will be required)
+        @param autoscope    If `False`, scope cookies using Set-Cookie syntax and error for cookie without domains
+                            If `True`, save cookies for later to be stored in the jar with a limited scope
+                            If a URL, save cookies in the jar with the domain of the URL
        """
        for cookie in LenientSimpleCookie(data).values():
-            if from_headers and any(cookie.values()):
+            if autoscope and any(cookie.values()):
                raise ValueError('Invalid syntax in Cookie Header')

            domain = cookie.get('domain') or ''
@@ -1670,17 +1672,23 @@ class YoutubeDL:

            if domain:
                self.cookiejar.set_cookie(prepared_cookie)
-            elif from_headers:
+            elif autoscope is True:
                self.deprecated_feature(
                    'Passing cookies as a header is a potential security risk; '
                    'they will be scoped to the domain of the downloaded urls. '
                    'Please consider loading cookies from a file or browser instead.')
                self.__header_cookies.append(prepared_cookie)
+            elif autoscope:
+                self.report_warning(
+                    'The extractor result contains an unscoped cookie as an HTTP header. '
+                    f'If you are using yt-dlp with an input URL{bug_reports_message(before=",")}',
+                    only_once=True)
+                self._apply_header_cookies(autoscope, [prepared_cookie])
            else:
                self.report_error('Unscoped cookies are not allowed; please specify some sort of scoping',
                                  tb=False, is_error=False)

-    def _apply_header_cookies(self, url):
+    def _apply_header_cookies(self, url, cookies=None):
        """Applies stray header cookies to the provided url

        This loads header cookies and scopes them to the domain provided in `url`.
@@ -1691,7 +1699,7 @@ class YoutubeDL:
        if not parsed.hostname:
            return

-        for cookie in map(copy.copy, self.__header_cookies):
+        for cookie in map(copy.copy, cookies or self.__header_cookies):
            cookie.domain = f'.{parsed.hostname}'
            self.cookiejar.set_cookie(cookie)

@@ -2481,9 +2489,16 @@ class YoutubeDL:
        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
        return _build_selector_function(parsed_selector)

-    def _calc_headers(self, info_dict):
+    def _calc_headers(self, info_dict, load_cookies=False):
        res = HTTPHeaderDict(self.params['http_headers'], info_dict.get('http_headers'))
        clean_headers(res)
+
+        if load_cookies:  # For --load-info-json
+            self._load_cookies(res.get('Cookie'), autoscope=info_dict['url'])  # compat
+            self._load_cookies(info_dict.get('cookies'), autoscope=False)
+        # The `Cookie` header is removed to prevent leaks and unscoped cookies.
+        # See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj
+        res.pop('Cookie', None)
        cookies = self.cookiejar.get_cookies_for_url(info_dict['url'])
        if cookies:
            encoder = LenientSimpleCookie()
@@ -2762,7 +2777,12 @@ class YoutubeDL:
                    and info_dict.get('duration') and format.get('tbr')
                    and not format.get('filesize') and not format.get('filesize_approx')):
                format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8))
-            format['http_headers'] = self._calc_headers(collections.ChainMap(format, info_dict))
+            format['http_headers'] = self._calc_headers(collections.ChainMap(format, info_dict), load_cookies=True)
+
+        # Safeguard against old/insecure infojson when using --load-info-json
+        if info_dict.get('http_headers'):
+            info_dict['http_headers'] = HTTPHeaderDict(info_dict['http_headers'])
+            info_dict['http_headers'].pop('Cookie', None)

        # This is copied to http_headers by the above _calc_headers and can now be removed
        if '__x_forwarded_for_ip' in info_dict:
@@ -3508,8 +3528,6 @@ class YoutubeDL:
            infos = [self.sanitize_info(info, self.params.get('clean_infojson', True))
                     for info in variadic(json.loads('\n'.join(f)))]
        for info in infos:
-            self._load_cookies(info.get('cookies'), from_headers=False)
-            self._load_cookies(traverse_obj(info.get('http_headers'), 'Cookie', casesense=False))  # compat
            try:
                self.__download_wrapper(self.process_ie_result)(info, download=True)
            except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e:
--- a/yt_dlp/downloader/common.py
+++ b/yt_dlp/downloader/common.py
@@ -32,7 +32,6 @@ from ..utils import (
    timetuple_from_msec,
    try_call,
 )
-from ..utils.traversal import traverse_obj


 class FileDownloader:
@@ -453,11 +452,6 @@ class FileDownloader:
            self.to_screen(f'[download] Sleeping {sleep_interval:.2f} seconds ...')
            time.sleep(sleep_interval)

-        # Filter the `Cookie` header from the info_dict to prevent leaks.
-        # See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj
-        info_dict['http_headers'] = dict(traverse_obj(info_dict, (
-            'http_headers', {dict.items}, lambda _, pair: pair[0].lower() != 'cookie'))) or None
-
        ret = self.real_download(filename, info_dict)
        self._finish_multiline_status()
        return ret, True
--- a/yt_dlp/extractor/mailru.py
+++ b/yt_dlp/extractor/mailru.py
@@ -1,6 +1,7 @@
 import itertools
 import json
 import re
+import urllib.parse

 from .common import InfoExtractor
 from ..compat import compat_urllib_parse_unquote
@@ -140,17 +141,15 @@ class MailRuIE(InfoExtractor):
                'http://api.video.mail.ru/videos/%s.json?new=1' % video_id,
                video_id, 'Downloading video JSON')

-        headers = {}
-
        video_key = self._get_cookies('https://my.mail.ru').get('video_key')
-        if video_key:
-            headers['Cookie'] = 'video_key=%s' % video_key.value

        formats = []
        for f in video_data['videos']:
            video_url = f.get('url')
            if not video_url:
                continue
+            if video_key:
+                self._set_cookie(urllib.parse.urlparse(video_url).hostname, 'video_key', video_key.value)
            format_id = f.get('key')
            height = int_or_none(self._search_regex(
                r'^(\d+)[pP]$', format_id, 'height', default=None)) if format_id else None
@@ -158,7 +157,6 @@ class MailRuIE(InfoExtractor):
                'url': video_url,
                'format_id': format_id,
                'height': height,
-                'http_headers': headers,
            })

        meta_data = video_data['meta']