mirror of
				https://gitlab.com/ytdl-org/youtube-dl.git
				synced 2025-10-31 16:32:23 -04:00 
			
		
		
		
	[generic] Add support for BOMs (Fixes #4753)
This commit is contained in:
		| @@ -28,6 +28,7 @@ from youtube_dl.utils import ( | ||||
|     fix_xml_ampersands, | ||||
|     InAdvancePagedList, | ||||
|     intlist_to_bytes, | ||||
|     is_html, | ||||
|     js_to_json, | ||||
|     limit_length, | ||||
|     OnDemandPagedList, | ||||
| @@ -417,5 +418,21 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') | ||||
|         self.assertTrue(age_restricted(18, 14)) | ||||
|         self.assertFalse(age_restricted(18, 18)) | ||||
|  | ||||
|     def test_is_html(self): | ||||
|         self.assertFalse(is_html(b'\x49\x44\x43<html')) | ||||
|         self.assertTrue(is_html(b'<!DOCTYPE foo>\xaaa')) | ||||
|         self.assertTrue(is_html(  # UTF-8 with BOM | ||||
|             b'\xef\xbb\xbf<!DOCTYPE foo>\xaaa')) | ||||
|         self.assertTrue(is_html(  # UTF-16-LE | ||||
|             b'\xff\xfe<\x00h\x00t\x00m\x00l\x00>\x00\xe4\x00' | ||||
|         )) | ||||
|         self.assertTrue(is_html(  # UTF-16-BE | ||||
|             b'\xfe\xff\x00<\x00h\x00t\x00m\x00l\x00>\x00\xe4' | ||||
|         )) | ||||
|         self.assertTrue(is_html(  # UTF-32-BE | ||||
|             b'\x00\x00\xFE\xFF\x00\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4')) | ||||
|         self.assertTrue(is_html(  # UTF-32-LE | ||||
|             b'\xFF\xFE\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4\x00\x00\x00')) | ||||
|  | ||||
| if __name__ == '__main__': | ||||
|     unittest.main() | ||||
|   | ||||
| @@ -17,6 +17,7 @@ from ..utils import ( | ||||
|     ExtractorError, | ||||
|     float_or_none, | ||||
|     HEADRequest, | ||||
|     is_html, | ||||
|     orderedSet, | ||||
|     parse_xml, | ||||
|     smuggle_url, | ||||
| @@ -647,7 +648,7 @@ class GenericIE(InfoExtractor): | ||||
|         # Maybe it's a direct link to a video? | ||||
|         # Be careful not to download the whole thing! | ||||
|         first_bytes = full_response.read(512) | ||||
|         if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')): | ||||
|         if not is_html(first_bytes): | ||||
|             self._downloader.report_warning( | ||||
|                 'URL could be a direct video link, returning it as such.') | ||||
|             upload_date = unified_strdate( | ||||
|   | ||||
| @@ -1631,3 +1631,23 @@ def age_restricted(content_limit, age_limit): | ||||
|     if content_limit is None: | ||||
|         return False  # Content available for everyone | ||||
|     return age_limit < content_limit | ||||
|  | ||||
|  | ||||
def is_html(first_bytes):
    """Detect whether a file contains HTML by examining its first bytes.

    If the data starts with a UTF-8, UTF-16 or UTF-32 byte order mark, the
    mark is stripped and the remainder is decoded with the matching codec;
    otherwise the data is decoded as UTF-8. Undecodable sequences are
    replaced instead of raising, so truncated or binary input is safe.

    first_bytes -- the leading bytes of the document (a bytes object)

    Returns True if the first non-whitespace character of the decoded text
    is '<', and False otherwise (including for empty input).
    """

    # Order matters: the UTF-32-LE mark begins with the UTF-16-LE mark, so
    # the 4-byte marks have to be tested before the 2-byte ones.
    BOMS = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No recognised BOM: fall back to UTF-8
        s = first_bytes.decode('utf-8', 'replace')

    # Return a real boolean rather than leaking the match object
    return re.match(r'^\s*<', s) is not None
|   | ||||
		Reference in New Issue
	
	Block a user