mirror of
				https://gitlab.com/ytdl-org/youtube-dl.git
				synced 2025-11-04 04:37:06 -05:00 
			
		
		
		
	Merge pull request #8092 from bpfoley/twitter-thumbnail
[utils] Add extract_attributes for extracting html tag attributes
This commit is contained in:
		@@ -28,6 +28,7 @@ from youtube_dl.utils import (
 | 
			
		||||
    encodeFilename,
 | 
			
		||||
    escape_rfc3986,
 | 
			
		||||
    escape_url,
 | 
			
		||||
    extract_attributes,
 | 
			
		||||
    ExtractorError,
 | 
			
		||||
    find_xpath_attr,
 | 
			
		||||
    fix_xml_ampersands,
 | 
			
		||||
@@ -77,6 +78,7 @@ from youtube_dl.utils import (
 | 
			
		||||
    cli_bool_option,
 | 
			
		||||
)
 | 
			
		||||
from youtube_dl.compat import (
 | 
			
		||||
    compat_chr,
 | 
			
		||||
    compat_etree_fromstring,
 | 
			
		||||
    compat_urlparse,
 | 
			
		||||
    compat_parse_qs,
 | 
			
		||||
@@ -629,6 +631,44 @@ class TestUtil(unittest.TestCase):
 | 
			
		||||
        on = js_to_json('{"abc": "def",}')
 | 
			
		||||
        self.assertEqual(json.loads(on), {'abc': 'def'})
 | 
			
		||||
 | 
			
		||||
    def test_extract_attributes(self):
        # Quoting styles: double-quoted, single-quoted and unquoted values.
        self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
        self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
        self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'})
        self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"})
        self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'})

        # Character references must be decoded: decimal, hexadecimal, named.
        self.assertEqual(extract_attributes('<e x="&#121;">'), {'x': 'y'})
        self.assertEqual(extract_attributes('<e x="&#x79;">'), {'x': 'y'})
        self.assertEqual(extract_attributes('<e x="&amp;">'), {'x': '&'})  # XML
        self.assertEqual(extract_attributes('<e x="&quot;">'), {'x': '"'})
        self.assertEqual(extract_attributes('<e x="&pound;">'), {'x': '£'})  # HTML 3.2
        self.assertEqual(extract_attributes('<e x="&lambda;">'), {'x': 'λ'})  # HTML 4.0
        # An ampersand that does not start a known reference is kept verbatim.
        self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'})

        # A quote character of the other kind inside a quoted value.
        self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"})
        self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'})

        # Valueless attributes, whitespace around '=', repeated names.
        self.assertEqual(extract_attributes('<e x >'), {'x': None})
        self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None})
        self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'})
        self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'})

        # Newlines between tokens are skipped; inside quotes they are preserved.
        self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'})
        self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'})
        self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'})
        self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'})

        # Attribute names: case folding and unusual characters.
        self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'})  # Names lowercased
        self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'})
        self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'})
        self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'})

        # Non-ASCII values, both literal and via character references.
        self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'})
        self.assertEqual(extract_attributes('<e x="d&eacute;compose&#769;">'), {'x': 'décompose\u0301'})

        # "Narrow" Python builds don't support unicode code points outside BMP.
        try:
            compat_chr(0x10000)
            supports_outside_bmp = True
        except ValueError:
            supports_outside_bmp = False
        if supports_outside_bmp:
            self.assertEqual(extract_attributes('<e x="Smile &#x1F600;!">'), {'x': 'Smile \U0001f600!'})
 | 
			
		||||
 | 
			
		||||
    def test_clean_html(self):
 | 
			
		||||
        self.assertEqual(clean_html('a:\nb'), 'a: b')
 | 
			
		||||
        self.assertEqual(clean_html('a:\n   "b"'), 'a:    "b"')
 | 
			
		||||
 
 | 
			
		||||
@@ -77,6 +77,11 @@ try:
 | 
			
		||||
except ImportError:  # Python 2
 | 
			
		||||
    from urllib import urlretrieve as compat_urlretrieve
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
    from html.parser import HTMLParser as compat_HTMLParser
 | 
			
		||||
except ImportError:  # Python 2
 | 
			
		||||
    from HTMLParser import HTMLParser as compat_HTMLParser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
    from subprocess import DEVNULL
 | 
			
		||||
@@ -543,6 +548,7 @@ else:
 | 
			
		||||
    from tokenize import generate_tokens as compat_tokenize_tokenize
 | 
			
		||||
 | 
			
		||||
__all__ = [
 | 
			
		||||
    'compat_HTMLParser',
 | 
			
		||||
    'compat_HTTPError',
 | 
			
		||||
    'compat_basestring',
 | 
			
		||||
    'compat_chr',
 | 
			
		||||
 
 | 
			
		||||
@@ -35,6 +35,7 @@ import xml.etree.ElementTree
 | 
			
		||||
import zlib
 | 
			
		||||
 | 
			
		||||
from .compat import (
 | 
			
		||||
    compat_HTMLParser,
 | 
			
		||||
    compat_basestring,
 | 
			
		||||
    compat_chr,
 | 
			
		||||
    compat_etree_fromstring,
 | 
			
		||||
@@ -272,6 +273,35 @@ def get_element_by_attribute(attribute, value, html):
 | 
			
		||||
 | 
			
		||||
    return unescapeHTML(res)
 | 
			
		||||
 | 
			
		||||
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element.

    After markup has been fed to the parser, ``attrs`` holds the attributes
    of the most recently seen start tag as a dict. It stays an empty dict
    if no start tag was parsed.
    """

    def __init__(self):
        # Start with an empty dict so ``attrs`` is always usable.
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # The parser passes attributes as (name, value) pairs; converting to
        # a dict keeps the last value when a name is repeated.
        self.attrs = dict(attrs)
 | 
			
		||||
 | 
			
		||||
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty="" noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': "'"
    }.
    Attribute names are lowercased, an attribute without a value maps to
    None, and when a name is repeated the last occurrence wins.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    # close() makes the parser process any input it is still buffering.
    parser.close()
    return parser.attrs
 | 
			
		||||
 | 
			
		||||
def clean_html(html):
 | 
			
		||||
    """Clean an HTML snippet into a readable string"""
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user