mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-11-12 12:35:15 +00:00
[networking] Remove dot segments during URL normalization (#7662)
This implements RFC3986 5.2.4 remove_dot_segments during the URL normalization process. Closes #3355, #6526 Authored by: coletdjnz
This commit is contained in:
@@ -33,7 +33,6 @@ from .minicurses import MultilinePrinter, QuietMultilinePrinter
|
||||
from .utils import (
|
||||
Popen,
|
||||
error_to_str,
|
||||
escape_url,
|
||||
expand_path,
|
||||
is_path_like,
|
||||
sanitize_url,
|
||||
@@ -42,6 +41,7 @@ from .utils import (
|
||||
write_string,
|
||||
)
|
||||
from .utils._utils import _YDLLogger
|
||||
from .utils.networking import normalize_url
|
||||
|
||||
CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'}
|
||||
SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'}
|
||||
@@ -1308,7 +1308,7 @@ class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
|
||||
|
||||
def get_cookie_header(self, url):
|
||||
"""Generate a Cookie HTTP header for a given url"""
|
||||
cookie_req = urllib.request.Request(escape_url(sanitize_url(url)))
|
||||
cookie_req = urllib.request.Request(normalize_url(sanitize_url(url)))
|
||||
self.add_cookie_header(cookie_req)
|
||||
return cookie_req.get_header('Cookie')
|
||||
|
||||
@@ -1317,7 +1317,7 @@ class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
|
||||
# Policy `_now` attribute must be set before calling `_cookies_for_request`
|
||||
# Ref: https://github.com/python/cpython/blob/3.7/Lib/http/cookiejar.py#L1360
|
||||
self._policy._now = self._now = int(time.time())
|
||||
return self._cookies_for_request(urllib.request.Request(escape_url(sanitize_url(url))))
|
||||
return self._cookies_for_request(urllib.request.Request(normalize_url(sanitize_url(url))))
|
||||
|
||||
def clear(self, *args, **kwargs):
|
||||
with contextlib.suppress(KeyError):
|
||||
|
||||
@@ -41,7 +41,8 @@ from .exceptions import (
|
||||
from ..dependencies import brotli
|
||||
from ..socks import ProxyError as SocksProxyError
|
||||
from ..socks import sockssocket
|
||||
from ..utils import escape_url, update_url_query
|
||||
from ..utils import update_url_query
|
||||
from ..utils.networking import normalize_url
|
||||
|
||||
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
|
||||
CONTENT_DECODE_ERRORS = [zlib.error, OSError]
|
||||
@@ -179,7 +180,7 @@ class HTTPHandler(urllib.request.AbstractHTTPHandler):
|
||||
# Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
|
||||
# the code of this workaround has been moved here from YoutubeDL.urlopen()
|
||||
url = req.get_full_url()
|
||||
url_escaped = escape_url(url)
|
||||
url_escaped = normalize_url(url)
|
||||
|
||||
# Substitute URL if any change after escaping
|
||||
if url != url_escaped:
|
||||
@@ -212,7 +213,7 @@ class HTTPHandler(urllib.request.AbstractHTTPHandler):
|
||||
if location:
|
||||
# As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
|
||||
location = location.encode('iso-8859-1').decode()
|
||||
location_escaped = escape_url(location)
|
||||
location_escaped = normalize_url(location)
|
||||
if location != location_escaped:
|
||||
del resp.headers['Location']
|
||||
resp.headers['Location'] = location_escaped
|
||||
|
||||
@@ -27,10 +27,9 @@ from ..utils import (
|
||||
classproperty,
|
||||
deprecation_warning,
|
||||
error_to_str,
|
||||
escape_url,
|
||||
update_url_query,
|
||||
)
|
||||
from ..utils.networking import HTTPHeaderDict
|
||||
from ..utils.networking import HTTPHeaderDict, normalize_url
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
RequestData = bytes | Iterable[bytes] | typing.IO | None
|
||||
@@ -372,7 +371,7 @@ class Request:
|
||||
raise TypeError('url must be a string')
|
||||
elif url.startswith('//'):
|
||||
url = 'http:' + url
|
||||
self._url = escape_url(url)
|
||||
self._url = normalize_url(url)
|
||||
|
||||
@property
|
||||
def method(self):
|
||||
|
||||
@@ -8,6 +8,8 @@ import urllib.request
|
||||
import zlib
|
||||
|
||||
from ._utils import Popen, decode_base_n, preferredencoding
|
||||
from .networking import escape_rfc3986 # noqa: F401
|
||||
from .networking import normalize_url as escape_url # noqa: F401
|
||||
from .traversal import traverse_obj
|
||||
from ..dependencies import certifi, websockets
|
||||
from ..networking._helper import make_ssl_context
|
||||
@@ -197,7 +199,7 @@ def request_to_url(req):
|
||||
|
||||
|
||||
def sanitized_Request(url, *args, **kwargs):
|
||||
from ..utils import escape_url, extract_basic_auth, sanitize_url
|
||||
from ..utils import extract_basic_auth, sanitize_url
|
||||
url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
|
||||
if auth_header is not None:
|
||||
headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
|
||||
|
||||
@@ -2464,23 +2464,6 @@ def lowercase_escape(s):
|
||||
s)
|
||||
|
||||
|
||||
def escape_rfc3986(s):
|
||||
"""Escape non-ASCII characters as suggested by RFC 3986"""
|
||||
return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
|
||||
|
||||
|
||||
def escape_url(url):
|
||||
"""Escape URL as suggested by RFC 3986"""
|
||||
url_parsed = urllib.parse.urlparse(url)
|
||||
return url_parsed._replace(
|
||||
netloc=url_parsed.netloc.encode('idna').decode('ascii'),
|
||||
path=escape_rfc3986(url_parsed.path),
|
||||
params=escape_rfc3986(url_parsed.params),
|
||||
query=escape_rfc3986(url_parsed.query),
|
||||
fragment=escape_rfc3986(url_parsed.fragment)
|
||||
).geturl()
|
||||
|
||||
|
||||
def parse_qs(url, **kwargs):
|
||||
return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs)
|
||||
|
||||
|
||||
@@ -121,3 +121,41 @@ def clean_headers(headers: HTTPHeaderDict):
|
||||
if 'Youtubedl-No-Compression' in headers: # compat
|
||||
del headers['Youtubedl-No-Compression']
|
||||
headers['Accept-Encoding'] = 'identity'
|
||||
|
||||
|
||||
def remove_dot_segments(path):
|
||||
# Implements RFC3986 5.2.4 remote_dot_segments
|
||||
# Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4
|
||||
# https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263
|
||||
output = []
|
||||
segments = path.split('/')
|
||||
for s in segments:
|
||||
if s == '.':
|
||||
continue
|
||||
elif s == '..':
|
||||
if output:
|
||||
output.pop()
|
||||
else:
|
||||
output.append(s)
|
||||
if not segments[0] and (not output or output[0]):
|
||||
output.insert(0, '')
|
||||
if segments[-1] in ('.', '..'):
|
||||
output.append('')
|
||||
return '/'.join(output)
|
||||
|
||||
|
||||
def escape_rfc3986(s):
|
||||
"""Escape non-ASCII characters as suggested by RFC 3986"""
|
||||
return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
|
||||
|
||||
|
||||
def normalize_url(url):
|
||||
"""Normalize URL as suggested by RFC 3986"""
|
||||
url_parsed = urllib.parse.urlparse(url)
|
||||
return url_parsed._replace(
|
||||
netloc=url_parsed.netloc.encode('idna').decode('ascii'),
|
||||
path=escape_rfc3986(remove_dot_segments(url_parsed.path)),
|
||||
params=escape_rfc3986(url_parsed.params),
|
||||
query=escape_rfc3986(url_parsed.query),
|
||||
fragment=escape_rfc3986(url_parsed.fragment)
|
||||
).geturl()
|
||||
|
||||
Reference in New Issue
Block a user