Mirror of https://github.com/yt-dlp/yt-dlp.git (synced 2025-10-31 14:45:14 +00:00)
			
		
		
		
	[core] Support decoding multiple content encodings (#7142)
Authored by: coletdjnz
This commit is contained in:
		| @@ -17,9 +17,11 @@ import tempfile | |||||||
| import threading | import threading | ||||||
| import urllib.error | import urllib.error | ||||||
| import urllib.request | import urllib.request | ||||||
|  | import zlib | ||||||
| 
 | 
 | ||||||
| from test.helper import http_server_port | from test.helper import http_server_port | ||||||
| from yt_dlp import YoutubeDL | from yt_dlp import YoutubeDL | ||||||
|  | from yt_dlp.dependencies import brotli | ||||||
| from yt_dlp.utils import sanitized_Request, urlencode_postdata | from yt_dlp.utils import sanitized_Request, urlencode_postdata | ||||||
| 
 | 
 | ||||||
| from .helper import FakeYDL | from .helper import FakeYDL | ||||||
| @@ -148,6 +150,31 @@ class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler): | |||||||
|             self.send_header('Location', new_url) |             self.send_header('Location', new_url) | ||||||
|             self.send_header('Content-Length', '0') |             self.send_header('Content-Length', '0') | ||||||
|             self.end_headers() |             self.end_headers() | ||||||
|  |         elif self.path == '/content-encoding': | ||||||
|  |             encodings = self.headers.get('ytdl-encoding', '') | ||||||
|  |             payload = b'<html><video src="/vid.mp4" /></html>' | ||||||
|  |             for encoding in filter(None, (e.strip() for e in encodings.split(','))): | ||||||
|  |                 if encoding == 'br' and brotli: | ||||||
|  |                     payload = brotli.compress(payload) | ||||||
|  |                 elif encoding == 'gzip': | ||||||
|  |                     buf = io.BytesIO() | ||||||
|  |                     with gzip.GzipFile(fileobj=buf, mode='wb') as f: | ||||||
|  |                         f.write(payload) | ||||||
|  |                     payload = buf.getvalue() | ||||||
|  |                 elif encoding == 'deflate': | ||||||
|  |                     payload = zlib.compress(payload) | ||||||
|  |                 elif encoding == 'unsupported': | ||||||
|  |                     payload = b'raw' | ||||||
|  |                     break | ||||||
|  |                 else: | ||||||
|  |                     self._status(415) | ||||||
|  |                     return | ||||||
|  |             self.send_response(200) | ||||||
|  |             self.send_header('Content-Encoding', encodings) | ||||||
|  |             self.send_header('Content-Length', str(len(payload))) | ||||||
|  |             self.end_headers() | ||||||
|  |             self.wfile.write(payload) | ||||||
|  | 
 | ||||||
|         else: |         else: | ||||||
|             self._status(404) |             self._status(404) | ||||||
| 
 | 
 | ||||||
| @@ -302,6 +329,55 @@ class TestHTTP(unittest.TestCase): | |||||||
|             data = ydl.urlopen(sanitized_Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode('utf-8') |             data = ydl.urlopen(sanitized_Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode('utf-8') | ||||||
|             self.assertEqual(data, '<html><video src="/vid.mp4" /></html>') |             self.assertEqual(data, '<html><video src="/vid.mp4" /></html>') | ||||||
| 
 | 
 | ||||||
|  |     @unittest.skipUnless(brotli, 'brotli support is not installed') | ||||||
|  |     def test_brotli(self): | ||||||
|  |         with FakeYDL() as ydl: | ||||||
|  |             res = ydl.urlopen( | ||||||
|  |                 sanitized_Request( | ||||||
|  |                     f'http://127.0.0.1:{self.http_port}/content-encoding', | ||||||
|  |                     headers={'ytdl-encoding': 'br'})) | ||||||
|  |             self.assertEqual(res.headers.get('Content-Encoding'), 'br') | ||||||
|  |             self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>') | ||||||
|  | 
 | ||||||
|  |     def test_deflate(self): | ||||||
|  |         with FakeYDL() as ydl: | ||||||
|  |             res = ydl.urlopen( | ||||||
|  |                 sanitized_Request( | ||||||
|  |                     f'http://127.0.0.1:{self.http_port}/content-encoding', | ||||||
|  |                     headers={'ytdl-encoding': 'deflate'})) | ||||||
|  |             self.assertEqual(res.headers.get('Content-Encoding'), 'deflate') | ||||||
|  |             self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>') | ||||||
|  | 
 | ||||||
|  |     def test_gzip(self): | ||||||
|  |         with FakeYDL() as ydl: | ||||||
|  |             res = ydl.urlopen( | ||||||
|  |                 sanitized_Request( | ||||||
|  |                     f'http://127.0.0.1:{self.http_port}/content-encoding', | ||||||
|  |                     headers={'ytdl-encoding': 'gzip'})) | ||||||
|  |             self.assertEqual(res.headers.get('Content-Encoding'), 'gzip') | ||||||
|  |             self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>') | ||||||
|  | 
 | ||||||
|  |     def test_multiple_encodings(self): | ||||||
|  |         # https://www.rfc-editor.org/rfc/rfc9110.html#section-8.4 | ||||||
|  |         with FakeYDL() as ydl: | ||||||
|  |             for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'): | ||||||
|  |                 res = ydl.urlopen( | ||||||
|  |                     sanitized_Request( | ||||||
|  |                         f'http://127.0.0.1:{self.http_port}/content-encoding', | ||||||
|  |                         headers={'ytdl-encoding': pair})) | ||||||
|  |                 self.assertEqual(res.headers.get('Content-Encoding'), pair) | ||||||
|  |                 self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>') | ||||||
|  | 
 | ||||||
|  |     def test_unsupported_encoding(self): | ||||||
|  |         # it should return the raw content | ||||||
|  |         with FakeYDL() as ydl: | ||||||
|  |             res = ydl.urlopen( | ||||||
|  |                 sanitized_Request( | ||||||
|  |                     f'http://127.0.0.1:{self.http_port}/content-encoding', | ||||||
|  |                     headers={'ytdl-encoding': 'unsupported'})) | ||||||
|  |             self.assertEqual(res.headers.get('Content-Encoding'), 'unsupported') | ||||||
|  |             self.assertEqual(res.read(), b'raw') | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| class TestClientCert(unittest.TestCase): | class TestClientCert(unittest.TestCase): | ||||||
|     def setUp(self): |     def setUp(self): | ||||||
|   | |||||||
| @@ -1361,6 +1361,23 @@ class YoutubeDLHandler(urllib.request.HTTPHandler): | |||||||
|             return data |             return data | ||||||
|         return brotli.decompress(data) |         return brotli.decompress(data) | ||||||
| 
 | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def gz(data): | ||||||
|  |         gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb') | ||||||
|  |         try: | ||||||
|  |             return gz.read() | ||||||
|  |         except OSError as original_oserror: | ||||||
|  |             # There may be junk at the end of the file | ||||||
|  |             # See http://stackoverflow.com/q/4928560/35070 for details | ||||||
|  |             for i in range(1, 1024): | ||||||
|  |                 try: | ||||||
|  |                     gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb') | ||||||
|  |                     return gz.read() | ||||||
|  |                 except OSError: | ||||||
|  |                     continue | ||||||
|  |             else: | ||||||
|  |                 raise original_oserror | ||||||
|  | 
 | ||||||
|     def http_request(self, req): |     def http_request(self, req): | ||||||
|         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not |         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not | ||||||
|         # always respected by websites, some tend to give out URLs with non percent-encoded |         # always respected by websites, some tend to give out URLs with non percent-encoded | ||||||
| @@ -1394,35 +1411,21 @@ class YoutubeDLHandler(urllib.request.HTTPHandler): | |||||||
| 
 | 
 | ||||||
|     def http_response(self, req, resp): |     def http_response(self, req, resp): | ||||||
|         old_resp = resp |         old_resp = resp | ||||||
|         # gzip | 
 | ||||||
|         if resp.headers.get('Content-encoding', '') == 'gzip': |         # Content-Encoding header lists the encodings in the order in which they were applied [1]. | ||||||
|             content = resp.read() |         # To decompress, we simply do the reverse. | ||||||
|             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb') |         # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding | ||||||
|             try: |         decoded_response = None | ||||||
|                 uncompressed = io.BytesIO(gz.read()) |         for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))): | ||||||
|             except OSError as original_ioerror: |             if encoding == 'gzip': | ||||||
|                 # There may be junk at the end of the file |                 decoded_response = self.gz(decoded_response or resp.read()) | ||||||
|                 # See http://stackoverflow.com/q/4928560/35070 for details |             elif encoding == 'deflate': | ||||||
|                 for i in range(1, 1024): |                 decoded_response = self.deflate(decoded_response or resp.read()) | ||||||
|                     try: |             elif encoding == 'br' and brotli: | ||||||
|                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb') |                 decoded_response = self.brotli(decoded_response or resp.read()) | ||||||
|                         uncompressed = io.BytesIO(gz.read()) | 
 | ||||||
|                     except OSError: |         if decoded_response is not None: | ||||||
|                         continue |             resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code) | ||||||
|                     break |  | ||||||
|                 else: |  | ||||||
|                     raise original_ioerror |  | ||||||
|             resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code) |  | ||||||
|             resp.msg = old_resp.msg |  | ||||||
|         # deflate |  | ||||||
|         if resp.headers.get('Content-encoding', '') == 'deflate': |  | ||||||
|             gz = io.BytesIO(self.deflate(resp.read())) |  | ||||||
|             resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) |  | ||||||
|             resp.msg = old_resp.msg |  | ||||||
|         # brotli |  | ||||||
|         if resp.headers.get('Content-encoding', '') == 'br': |  | ||||||
|             resp = urllib.request.addinfourl( |  | ||||||
|                 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code) |  | ||||||
|             resp.msg = old_resp.msg |             resp.msg = old_resp.msg | ||||||
|         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see |         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see | ||||||
|         # https://github.com/ytdl-org/youtube-dl/issues/6457). |         # https://github.com/ytdl-org/youtube-dl/issues/6457). | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 coletdjnz
					coletdjnz