mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 06:35:12 +00:00 
			
		
		
		
	[rh:requests] Add handler for requests HTTP library (#3668)
				
					
				
			Adds support for HTTPS proxies and persistent connections (keep-alive) Closes https://github.com/yt-dlp/yt-dlp/issues/1890 Resolves https://github.com/yt-dlp/yt-dlp/issues/4070 Resolves https://github.com/ytdl-org/youtube-dl/issues/32549 Resolves https://github.com/ytdl-org/youtube-dl/issues/14523 Resolves https://github.com/ytdl-org/youtube-dl/issues/13734 Authored by: coletdjnz, Grub4K, bashonly
This commit is contained in:
		
							
								
								
									
										398
									
								
								yt_dlp/networking/_requests.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										398
									
								
								yt_dlp/networking/_requests.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,398 @@ | ||||
| import contextlib | ||||
| import functools | ||||
| import http.client | ||||
| import logging | ||||
| import re | ||||
| import socket | ||||
| import warnings | ||||
| 
 | ||||
| from ..dependencies import brotli, requests, urllib3 | ||||
| from ..utils import bug_reports_message, int_or_none, variadic | ||||
| 
 | ||||
| if requests is None: | ||||
|     raise ImportError('requests module is not installed') | ||||
| 
 | ||||
| if urllib3 is None: | ||||
|     raise ImportError('urllib3 module is not installed') | ||||
| 
 | ||||
| urllib3_version = tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.')) | ||||
| 
 | ||||
| if urllib3_version < (1, 26, 17): | ||||
|     raise ImportError('Only urllib3 >= 1.26.17 is supported') | ||||
| 
 | ||||
| if requests.__build__ < 0x023100: | ||||
|     raise ImportError('Only requests >= 2.31.0 is supported') | ||||
| 
 | ||||
| import requests.adapters | ||||
| import requests.utils | ||||
| import urllib3.connection | ||||
| import urllib3.exceptions | ||||
| 
 | ||||
| from ._helper import ( | ||||
|     InstanceStoreMixin, | ||||
|     add_accept_encoding_header, | ||||
|     create_connection, | ||||
|     create_socks_proxy_socket, | ||||
|     get_redirect_method, | ||||
|     make_socks_proxy_opts, | ||||
|     select_proxy, | ||||
| ) | ||||
| from .common import ( | ||||
|     Features, | ||||
|     RequestHandler, | ||||
|     Response, | ||||
|     register_preference, | ||||
|     register_rh, | ||||
| ) | ||||
| from .exceptions import ( | ||||
|     CertificateVerifyError, | ||||
|     HTTPError, | ||||
|     IncompleteRead, | ||||
|     ProxyError, | ||||
|     RequestError, | ||||
|     SSLError, | ||||
|     TransportError, | ||||
| ) | ||||
| from ..socks import ProxyError as SocksProxyError | ||||
| 
 | ||||
| SUPPORTED_ENCODINGS = [ | ||||
|     'gzip', 'deflate' | ||||
| ] | ||||
| 
 | ||||
| if brotli is not None: | ||||
|     SUPPORTED_ENCODINGS.append('br') | ||||
| 
 | ||||
| """ | ||||
| Override urllib3's behavior to not convert lower-case percent-encoded characters | ||||
| to upper-case during url normalization process. | ||||
| 
 | ||||
| RFC3986 defines that the lower or upper case percent-encoded hexidecimal characters are equivalent | ||||
| and normalizers should convert them to uppercase for consistency [1]. | ||||
| 
 | ||||
| However, some sites may have an incorrect implementation where they provide | ||||
| a percent-encoded url that is then compared case-sensitively.[2] | ||||
| 
 | ||||
| While this is a very rare case, since urllib does not do this normalization step, it | ||||
| is best to avoid it in requests too for compatability reasons. | ||||
| 
 | ||||
| 1: https://tools.ietf.org/html/rfc3986#section-2.1 | ||||
| 2: https://github.com/streamlink/streamlink/pull/4003 | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| class Urllib3PercentREOverride: | ||||
|     def __init__(self, r: re.Pattern): | ||||
|         self.re = r | ||||
| 
 | ||||
|     # pass through all other attribute calls to the original re | ||||
|     def __getattr__(self, item): | ||||
|         return self.re.__getattribute__(item) | ||||
| 
 | ||||
|     def subn(self, repl, string, *args, **kwargs): | ||||
|         return string, self.re.subn(repl, string, *args, **kwargs)[1] | ||||
| 
 | ||||
| 
 | ||||
| # urllib3 >= 1.25.8 uses subn: | ||||
| # https://github.com/urllib3/urllib3/commit/a2697e7c6b275f05879b60f593c5854a816489f0 | ||||
| import urllib3.util.url  # noqa: E305 | ||||
| 
 | ||||
| if hasattr(urllib3.util.url, 'PERCENT_RE'): | ||||
|     urllib3.util.url.PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url.PERCENT_RE) | ||||
| elif hasattr(urllib3.util.url, '_PERCENT_RE'):  # urllib3 >= 2.0.0 | ||||
|     urllib3.util.url._PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url._PERCENT_RE) | ||||
| else: | ||||
|     warnings.warn('Failed to patch PERCENT_RE in urllib3 (does the attribute exist?)' + bug_reports_message()) | ||||
| 
 | ||||
| """ | ||||
| Workaround for issue in urllib.util.ssl_.py: ssl_wrap_context does not pass | ||||
| server_hostname to SSLContext.wrap_socket if server_hostname is an IP, | ||||
| however this is an issue because we set check_hostname to True in our SSLContext. | ||||
| 
 | ||||
| Monkey-patching IS_SECURETRANSPORT forces ssl_wrap_context to pass server_hostname regardless. | ||||
| 
 | ||||
| This has been fixed in urllib3 2.0+. | ||||
| See: https://github.com/urllib3/urllib3/issues/517 | ||||
| """ | ||||
| 
 | ||||
| if urllib3_version < (2, 0, 0): | ||||
|     with contextlib.suppress(): | ||||
|         urllib3.util.IS_SECURETRANSPORT = urllib3.util.ssl_.IS_SECURETRANSPORT = True | ||||
| 
 | ||||
| 
 | ||||
| # Requests will not automatically handle no_proxy by default | ||||
| # due to buggy no_proxy handling with proxy dict [1]. | ||||
| # 1. https://github.com/psf/requests/issues/5000 | ||||
| requests.adapters.select_proxy = select_proxy | ||||
| 
 | ||||
| 
 | ||||
| class RequestsResponseAdapter(Response): | ||||
|     def __init__(self, res: requests.models.Response): | ||||
|         super().__init__( | ||||
|             fp=res.raw, headers=res.headers, url=res.url, | ||||
|             status=res.status_code, reason=res.reason) | ||||
| 
 | ||||
|         self._requests_response = res | ||||
| 
 | ||||
|     def read(self, amt: int = None): | ||||
|         try: | ||||
|             # Interact with urllib3 response directly. | ||||
|             return self.fp.read(amt, decode_content=True) | ||||
| 
 | ||||
|         # See urllib3.response.HTTPResponse.read() for exceptions raised on read | ||||
|         except urllib3.exceptions.SSLError as e: | ||||
|             raise SSLError(cause=e) from e | ||||
| 
 | ||||
|         except urllib3.exceptions.IncompleteRead as e: | ||||
|             # urllib3 IncompleteRead.partial is always an integer | ||||
|             raise IncompleteRead(partial=e.partial, expected=e.expected) from e | ||||
| 
 | ||||
|         except urllib3.exceptions.ProtocolError as e: | ||||
|             # http.client.IncompleteRead may be contained within ProtocolError | ||||
|             # See urllib3.response.HTTPResponse._error_catcher() | ||||
|             ir_err = next( | ||||
|                 (err for err in (e.__context__, e.__cause__, *variadic(e.args)) | ||||
|                  if isinstance(err, http.client.IncompleteRead)), None) | ||||
|             if ir_err is not None: | ||||
|                 raise IncompleteRead(partial=len(ir_err.partial), expected=ir_err.expected) from e | ||||
|             raise TransportError(cause=e) from e | ||||
| 
 | ||||
|         except urllib3.exceptions.HTTPError as e: | ||||
|             # catch-all for any other urllib3 response exceptions | ||||
|             raise TransportError(cause=e) from e | ||||
| 
 | ||||
| 
 | ||||
| class RequestsHTTPAdapter(requests.adapters.HTTPAdapter): | ||||
|     def __init__(self, ssl_context=None, proxy_ssl_context=None, source_address=None, **kwargs): | ||||
|         self._pm_args = {} | ||||
|         if ssl_context: | ||||
|             self._pm_args['ssl_context'] = ssl_context | ||||
|         if source_address: | ||||
|             self._pm_args['source_address'] = (source_address, 0) | ||||
|         self._proxy_ssl_context = proxy_ssl_context or ssl_context | ||||
|         super().__init__(**kwargs) | ||||
| 
 | ||||
|     def init_poolmanager(self, *args, **kwargs): | ||||
|         return super().init_poolmanager(*args, **kwargs, **self._pm_args) | ||||
| 
 | ||||
|     def proxy_manager_for(self, proxy, **proxy_kwargs): | ||||
|         extra_kwargs = {} | ||||
|         if not proxy.lower().startswith('socks') and self._proxy_ssl_context: | ||||
|             extra_kwargs['proxy_ssl_context'] = self._proxy_ssl_context | ||||
|         return super().proxy_manager_for(proxy, **proxy_kwargs, **self._pm_args, **extra_kwargs) | ||||
| 
 | ||||
|     def cert_verify(*args, **kwargs): | ||||
|         # lean on SSLContext for cert verification | ||||
|         pass | ||||
| 
 | ||||
| 
 | ||||
| class RequestsSession(requests.sessions.Session): | ||||
|     """ | ||||
|     Ensure unified redirect method handling with our urllib redirect handler. | ||||
|     """ | ||||
|     def rebuild_method(self, prepared_request, response): | ||||
|         new_method = get_redirect_method(prepared_request.method, response.status_code) | ||||
| 
 | ||||
|         # HACK: requests removes headers/body on redirect unless code was a 307/308. | ||||
|         if new_method == prepared_request.method: | ||||
|             response._real_status_code = response.status_code | ||||
|             response.status_code = 308 | ||||
| 
 | ||||
|         prepared_request.method = new_method | ||||
| 
 | ||||
|     def rebuild_auth(self, prepared_request, response): | ||||
|         # HACK: undo status code change from rebuild_method, if applicable. | ||||
|         # rebuild_auth runs after requests would remove headers/body based on status code | ||||
|         if hasattr(response, '_real_status_code'): | ||||
|             response.status_code = response._real_status_code | ||||
|             del response._real_status_code | ||||
|         return super().rebuild_auth(prepared_request, response) | ||||
| 
 | ||||
| 
 | ||||
| class Urllib3LoggingFilter(logging.Filter): | ||||
| 
 | ||||
|     def filter(self, record): | ||||
|         # Ignore HTTP request messages since HTTPConnection prints those | ||||
|         if record.msg == '%s://%s:%s "%s %s %s" %s %s': | ||||
|             return False | ||||
|         return True | ||||
| 
 | ||||
| 
 | ||||
| class Urllib3LoggingHandler(logging.Handler): | ||||
|     """Redirect urllib3 logs to our logger""" | ||||
|     def __init__(self, logger, *args, **kwargs): | ||||
|         super().__init__(*args, **kwargs) | ||||
|         self._logger = logger | ||||
| 
 | ||||
|     def emit(self, record): | ||||
|         try: | ||||
|             msg = self.format(record) | ||||
|             if record.levelno >= logging.ERROR: | ||||
|                 self._logger.error(msg) | ||||
|             else: | ||||
|                 self._logger.stdout(msg) | ||||
| 
 | ||||
|         except Exception: | ||||
|             self.handleError(record) | ||||
| 
 | ||||
| 
 | ||||
| @register_rh | ||||
| class RequestsRH(RequestHandler, InstanceStoreMixin): | ||||
| 
 | ||||
|     """Requests RequestHandler | ||||
|     https://github.com/psf/requests | ||||
|     """ | ||||
|     _SUPPORTED_URL_SCHEMES = ('http', 'https') | ||||
|     _SUPPORTED_ENCODINGS = tuple(SUPPORTED_ENCODINGS) | ||||
|     _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h') | ||||
|     _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY) | ||||
|     RH_NAME = 'requests' | ||||
| 
 | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         super().__init__(*args, **kwargs) | ||||
| 
 | ||||
|         # Forward urllib3 debug messages to our logger | ||||
|         logger = logging.getLogger('urllib3') | ||||
|         handler = Urllib3LoggingHandler(logger=self._logger) | ||||
|         handler.setFormatter(logging.Formatter('requests: %(message)s')) | ||||
|         handler.addFilter(Urllib3LoggingFilter()) | ||||
|         logger.addHandler(handler) | ||||
|         logger.setLevel(logging.WARNING) | ||||
| 
 | ||||
|         if self.verbose: | ||||
|             # Setting this globally is not ideal, but is easier than hacking with urllib3. | ||||
|             # It could technically be problematic for scripts embedding yt-dlp. | ||||
|             # However, it is unlikely debug traffic is used in that context in a way this will cause problems. | ||||
|             urllib3.connection.HTTPConnection.debuglevel = 1 | ||||
|             logger.setLevel(logging.DEBUG) | ||||
|         # this is expected if we are using --no-check-certificate | ||||
|         urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) | ||||
| 
 | ||||
|     def close(self): | ||||
|         self._clear_instances() | ||||
| 
 | ||||
|     def _check_extensions(self, extensions): | ||||
|         super()._check_extensions(extensions) | ||||
|         extensions.pop('cookiejar', None) | ||||
|         extensions.pop('timeout', None) | ||||
| 
 | ||||
|     def _create_instance(self, cookiejar): | ||||
|         session = RequestsSession() | ||||
|         http_adapter = RequestsHTTPAdapter( | ||||
|             ssl_context=self._make_sslcontext(), | ||||
|             source_address=self.source_address, | ||||
|             max_retries=urllib3.util.retry.Retry(False), | ||||
|         ) | ||||
|         session.adapters.clear() | ||||
|         session.headers = requests.models.CaseInsensitiveDict({'Connection': 'keep-alive'}) | ||||
|         session.mount('https://', http_adapter) | ||||
|         session.mount('http://', http_adapter) | ||||
|         session.cookies = cookiejar | ||||
|         session.trust_env = False  # no need, we already load proxies from env | ||||
|         return session | ||||
| 
 | ||||
|     def _send(self, request): | ||||
| 
 | ||||
|         headers = self._merge_headers(request.headers) | ||||
|         add_accept_encoding_header(headers, SUPPORTED_ENCODINGS) | ||||
| 
 | ||||
|         max_redirects_exceeded = False | ||||
| 
 | ||||
|         session = self._get_instance( | ||||
|             cookiejar=request.extensions.get('cookiejar') or self.cookiejar) | ||||
| 
 | ||||
|         try: | ||||
|             requests_res = session.request( | ||||
|                 method=request.method, | ||||
|                 url=request.url, | ||||
|                 data=request.data, | ||||
|                 headers=headers, | ||||
|                 timeout=float(request.extensions.get('timeout') or self.timeout), | ||||
|                 proxies=request.proxies or self.proxies, | ||||
|                 allow_redirects=True, | ||||
|                 stream=True | ||||
|             ) | ||||
| 
 | ||||
|         except requests.exceptions.TooManyRedirects as e: | ||||
|             max_redirects_exceeded = True | ||||
|             requests_res = e.response | ||||
| 
 | ||||
|         except requests.exceptions.SSLError as e: | ||||
|             if 'CERTIFICATE_VERIFY_FAILED' in str(e): | ||||
|                 raise CertificateVerifyError(cause=e) from e | ||||
|             raise SSLError(cause=e) from e | ||||
| 
 | ||||
|         except requests.exceptions.ProxyError as e: | ||||
|             raise ProxyError(cause=e) from e | ||||
| 
 | ||||
|         except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e: | ||||
|             raise TransportError(cause=e) from e | ||||
| 
 | ||||
|         except urllib3.exceptions.HTTPError as e: | ||||
|             # Catch any urllib3 exceptions that may leak through | ||||
|             raise TransportError(cause=e) from e | ||||
| 
 | ||||
|         except requests.exceptions.RequestException as e: | ||||
|             # Miscellaneous Requests exceptions. May not necessary be network related e.g. InvalidURL | ||||
|             raise RequestError(cause=e) from e | ||||
| 
 | ||||
|         res = RequestsResponseAdapter(requests_res) | ||||
| 
 | ||||
|         if not 200 <= res.status < 300: | ||||
|             raise HTTPError(res, redirect_loop=max_redirects_exceeded) | ||||
| 
 | ||||
|         return res | ||||
| 
 | ||||
| 
 | ||||
| @register_preference(RequestsRH) | ||||
| def requests_preference(rh, request): | ||||
|     return 100 | ||||
| 
 | ||||
| 
 | ||||
| # Use our socks proxy implementation with requests to avoid an extra dependency. | ||||
| class SocksHTTPConnection(urllib3.connection.HTTPConnection): | ||||
|     def __init__(self, _socks_options, *args, **kwargs):  # must use _socks_options to pass PoolKey checks | ||||
|         self._proxy_args = _socks_options | ||||
|         super().__init__(*args, **kwargs) | ||||
| 
 | ||||
|     def _new_conn(self): | ||||
|         try: | ||||
|             return create_connection( | ||||
|                 address=(self._proxy_args['addr'], self._proxy_args['port']), | ||||
|                 timeout=self.timeout, | ||||
|                 source_address=self.source_address, | ||||
|                 _create_socket_func=functools.partial( | ||||
|                     create_socks_proxy_socket, (self.host, self.port), self._proxy_args)) | ||||
|         except (socket.timeout, TimeoutError) as e: | ||||
|             raise urllib3.exceptions.ConnectTimeoutError( | ||||
|                 self, f'Connection to {self.host} timed out. (connect timeout={self.timeout})') from e | ||||
|         except SocksProxyError as e: | ||||
|             raise urllib3.exceptions.ProxyError(str(e), e) from e | ||||
|         except (OSError, socket.error) as e: | ||||
|             raise urllib3.exceptions.NewConnectionError( | ||||
|                 self, f'Failed to establish a new connection: {e}') from e | ||||
| 
 | ||||
| 
 | ||||
| class SocksHTTPSConnection(SocksHTTPConnection, urllib3.connection.HTTPSConnection): | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| class SocksHTTPConnectionPool(urllib3.HTTPConnectionPool): | ||||
|     ConnectionCls = SocksHTTPConnection | ||||
| 
 | ||||
| 
 | ||||
| class SocksHTTPSConnectionPool(urllib3.HTTPSConnectionPool): | ||||
|     ConnectionCls = SocksHTTPSConnection | ||||
| 
 | ||||
| 
 | ||||
| class SocksProxyManager(urllib3.PoolManager): | ||||
| 
 | ||||
|     def __init__(self, socks_proxy, username=None, password=None, num_pools=10, headers=None, **connection_pool_kw): | ||||
|         connection_pool_kw['_socks_options'] = make_socks_proxy_opts(socks_proxy) | ||||
|         super().__init__(num_pools, headers, **connection_pool_kw) | ||||
|         self.pool_classes_by_scheme = { | ||||
|             'http': SocksHTTPConnectionPool, | ||||
|             'https': SocksHTTPSConnectionPool | ||||
|         } | ||||
| 
 | ||||
| 
 | ||||
| requests.adapters.SOCKSProxyManager = SocksProxyManager | ||||
		Reference in New Issue
	
	Block a user
	 coletdjnz
					coletdjnz