mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 14:45:14 +00:00 
			
		
		
		
	[generic] Add support for BOMs (Fixes #4753)
This commit is contained in:
		| @@ -28,6 +28,7 @@ from youtube_dl.utils import ( | |||||||
|     fix_xml_ampersands, |     fix_xml_ampersands, | ||||||
|     InAdvancePagedList, |     InAdvancePagedList, | ||||||
|     intlist_to_bytes, |     intlist_to_bytes, | ||||||
|  |     is_html, | ||||||
|     js_to_json, |     js_to_json, | ||||||
|     limit_length, |     limit_length, | ||||||
|     OnDemandPagedList, |     OnDemandPagedList, | ||||||
| @@ -417,5 +418,21 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') | |||||||
|         self.assertTrue(age_restricted(18, 14)) |         self.assertTrue(age_restricted(18, 14)) | ||||||
|         self.assertFalse(age_restricted(18, 18)) |         self.assertFalse(age_restricted(18, 18)) | ||||||
|  |  | ||||||
|  |     def test_is_html(self): | ||||||
|  |         self.assertFalse(is_html(b'\x49\x44\x43<html')) | ||||||
|  |         self.assertTrue(is_html(b'<!DOCTYPE foo>\xaaa')) | ||||||
|  |         self.assertTrue(is_html(  # UTF-8 with BOM | ||||||
|  |             b'\xef\xbb\xbf<!DOCTYPE foo>\xaaa')) | ||||||
|  |         self.assertTrue(is_html(  # UTF-16-LE | ||||||
|  |             b'\xff\xfe<\x00h\x00t\x00m\x00l\x00>\x00\xe4\x00' | ||||||
|  |         )) | ||||||
|  |         self.assertTrue(is_html(  # UTF-16-BE | ||||||
|  |             b'\xfe\xff\x00<\x00h\x00t\x00m\x00l\x00>\x00\xe4' | ||||||
|  |         )) | ||||||
|  |         self.assertTrue(is_html(  # UTF-32-BE | ||||||
|  |             b'\x00\x00\xFE\xFF\x00\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4')) | ||||||
|  |         self.assertTrue(is_html(  # UTF-32-LE | ||||||
|  |             b'\xFF\xFE\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4\x00\x00\x00')) | ||||||
|  |  | ||||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||||
|     unittest.main() |     unittest.main() | ||||||
|   | |||||||
| @@ -17,6 +17,7 @@ from ..utils import ( | |||||||
|     ExtractorError, |     ExtractorError, | ||||||
|     float_or_none, |     float_or_none, | ||||||
|     HEADRequest, |     HEADRequest, | ||||||
|  |     is_html, | ||||||
|     orderedSet, |     orderedSet, | ||||||
|     parse_xml, |     parse_xml, | ||||||
|     smuggle_url, |     smuggle_url, | ||||||
| @@ -647,7 +648,7 @@ class GenericIE(InfoExtractor): | |||||||
|         # Maybe it's a direct link to a video? |         # Maybe it's a direct link to a video? | ||||||
|         # Be careful not to download the whole thing! |         # Be careful not to download the whole thing! | ||||||
|         first_bytes = full_response.read(512) |         first_bytes = full_response.read(512) | ||||||
|         if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')): |         if not is_html(first_bytes): | ||||||
|             self._downloader.report_warning( |             self._downloader.report_warning( | ||||||
|                 'URL could be a direct video link, returning it as such.') |                 'URL could be a direct video link, returning it as such.') | ||||||
|             upload_date = unified_strdate( |             upload_date = unified_strdate( | ||||||
|   | |||||||
| @@ -1631,3 +1631,23 @@ def age_restricted(content_limit, age_limit): | |||||||
|     if content_limit is None: |     if content_limit is None: | ||||||
|         return False  # Content available for everyone |         return False  # Content available for everyone | ||||||
|     return age_limit < content_limit |     return age_limit < content_limit | ||||||
|  |  | ||||||
|  |  | ||||||
def is_html(first_bytes):
    """Detect whether a file contains HTML by examining its first bytes.

    If the data starts with a UTF-8, UTF-16 or UTF-32 byte order mark, the
    mark is stripped and the remainder is decoded with the matching codec;
    otherwise the data is decoded as UTF-8. Undecodable sequences are
    replaced instead of raising, so truncated or binary input is accepted.

    first_bytes -- the leading bytes of the file, as a byte string

    Returns True if the decoded text consists of optional whitespace
    followed by "<", and False otherwise.
    """

    # Order matters: the UTF-32-LE mark starts with the same two bytes as the
    # UTF-16-LE mark, so the four-byte marks have to be tried first.
    BOMS = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No byte order mark found: fall back to UTF-8.
        s = first_bytes.decode('utf-8', 'replace')

    # Return a real boolean rather than leaking the match object to callers.
    return re.match(r'^\s*<', s) is not None
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Philipp Hagemeister
					Philipp Hagemeister