mirror of
				https://gitlab.com/ytdl-org/youtube-dl.git
				synced 2025-11-04 00:57:07 -05:00 
			
		
		
		
	[compat] compat_etree_fromstring: also decode the text attribute
Deletes parse_xml from utils, because compat_etree_fromstring now also decodes element text.
This commit is contained in:
		@@ -74,10 +74,19 @@ class TestCompat(unittest.TestCase):
 | 
			
		||||
        self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two'])
 | 
			
		||||
 | 
			
		||||
    def test_compat_etree_fromstring(self):
 | 
			
		||||
        xml = '<el foo="bar" spam="中文"></el>'
 | 
			
		||||
        xml = '''
 | 
			
		||||
            <root foo="bar" spam="中文">
 | 
			
		||||
                <normal>foo</normal>
 | 
			
		||||
                <chinese>中文</chinese>
 | 
			
		||||
                <foo><bar>spam</bar></foo>
 | 
			
		||||
            </root>
 | 
			
		||||
        '''
 | 
			
		||||
        doc = compat_etree_fromstring(xml.encode('utf-8'))
 | 
			
		||||
        self.assertTrue(isinstance(doc.attrib['foo'], compat_str))
 | 
			
		||||
        self.assertTrue(isinstance(doc.attrib['spam'], compat_str))
 | 
			
		||||
        self.assertTrue(isinstance(doc.find('normal').text, compat_str))
 | 
			
		||||
        self.assertTrue(isinstance(doc.find('chinese').text, compat_str))
 | 
			
		||||
        self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str))
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
 | 
			
		||||
    unittest.main()
 | 
			
		||||
 
 | 
			
		||||
@@ -216,9 +216,19 @@ except ImportError:  # Python 2.6
 | 
			
		||||
if sys.version_info[0] >= 3:
 | 
			
		||||
    compat_etree_fromstring = xml.etree.ElementTree.fromstring
 | 
			
		||||
else:
 | 
			
		||||
    # on python 2.x the the attributes of a node aren't always unicode objects
 | 
			
		||||
    # on python 2.x the attributes and text of a node aren't always unicode
 | 
			
		||||
    # objects
 | 
			
		||||
    etree = xml.etree.ElementTree
 | 
			
		||||
 | 
			
		||||
    try:
 | 
			
		||||
        _etree_iter = etree.Element.iter
 | 
			
		||||
    except AttributeError:  # Python <=2.6
 | 
			
		||||
        def _etree_iter(root):
 | 
			
		||||
            for el in root.findall('*'):
 | 
			
		||||
                yield el
 | 
			
		||||
                for sub in _etree_iter(el):
 | 
			
		||||
                    yield sub
 | 
			
		||||
 | 
			
		||||
    # on 2.6 XML doesn't have a parser argument, function copied from CPython
 | 
			
		||||
    # 2.7 source
 | 
			
		||||
    def _XML(text, parser=None):
 | 
			
		||||
@@ -235,7 +245,11 @@ else:
 | 
			
		||||
        return el
 | 
			
		||||
 | 
			
		||||
    def compat_etree_fromstring(text):
 | 
			
		||||
        return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
 | 
			
		||||
        doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
 | 
			
		||||
        for el in _etree_iter(doc):
 | 
			
		||||
            if el.text is not None and isinstance(el.text, bytes):
 | 
			
		||||
                el.text = el.text.decode('utf-8')
 | 
			
		||||
        return doc
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
    from urllib.parse import parse_qs as compat_parse_qs
 | 
			
		||||
 
 | 
			
		||||
@@ -14,8 +14,8 @@ from ..utils import (
 | 
			
		||||
    parse_duration,
 | 
			
		||||
    unified_strdate,
 | 
			
		||||
    xpath_text,
 | 
			
		||||
    parse_xml,
 | 
			
		||||
)
 | 
			
		||||
from ..compat import compat_etree_fromstring
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ARDMediathekIE(InfoExtractor):
 | 
			
		||||
@@ -161,7 +161,7 @@ class ARDMediathekIE(InfoExtractor):
 | 
			
		||||
            raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
 | 
			
		||||
 | 
			
		||||
        if re.search(r'[\?&]rss($|[=&])', url):
 | 
			
		||||
            doc = parse_xml(webpage)
 | 
			
		||||
            doc = compat_etree_fromstring(webpage.encode('utf-8'))
 | 
			
		||||
            if doc.tag == 'rss':
 | 
			
		||||
                return GenericIE()._extract_rss(url, video_id, doc)
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -9,6 +9,7 @@ import sys
 | 
			
		||||
from .common import InfoExtractor
 | 
			
		||||
from .youtube import YoutubeIE
 | 
			
		||||
from ..compat import (
 | 
			
		||||
    compat_etree_fromstring,
 | 
			
		||||
    compat_urllib_parse_unquote,
 | 
			
		||||
    compat_urllib_request,
 | 
			
		||||
    compat_urlparse,
 | 
			
		||||
@@ -21,7 +22,6 @@ from ..utils import (
 | 
			
		||||
    HEADRequest,
 | 
			
		||||
    is_html,
 | 
			
		||||
    orderedSet,
 | 
			
		||||
    parse_xml,
 | 
			
		||||
    smuggle_url,
 | 
			
		||||
    unescapeHTML,
 | 
			
		||||
    unified_strdate,
 | 
			
		||||
@@ -1237,7 +1237,7 @@ class GenericIE(InfoExtractor):
 | 
			
		||||
 | 
			
		||||
        # Is it an RSS feed, a SMIL file or a XSPF playlist?
 | 
			
		||||
        try:
 | 
			
		||||
            doc = parse_xml(webpage)
 | 
			
		||||
            doc = compat_etree_fromstring(webpage.encode('utf-8'))
 | 
			
		||||
            if doc.tag == 'rss':
 | 
			
		||||
                return self._extract_rss(url, video_id, doc)
 | 
			
		||||
            elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
 | 
			
		||||
 
 | 
			
		||||
@@ -1652,29 +1652,6 @@ def encode_dict(d, encoding='utf-8'):
 | 
			
		||||
    return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items())
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
    etree_iter = xml.etree.ElementTree.Element.iter
 | 
			
		||||
except AttributeError:  # Python <=2.6
 | 
			
		||||
    etree_iter = lambda n: n.findall('.//*')
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def parse_xml(s):
 | 
			
		||||
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
 | 
			
		||||
        def doctype(self, name, pubid, system):
 | 
			
		||||
            pass  # Ignore doctypes
 | 
			
		||||
 | 
			
		||||
    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
 | 
			
		||||
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
 | 
			
		||||
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
 | 
			
		||||
    # Fix up XML parser in Python 2.x
 | 
			
		||||
    if sys.version_info < (3, 0):
 | 
			
		||||
        for n in etree_iter(tree):
 | 
			
		||||
            if n.text is not None:
 | 
			
		||||
                if not isinstance(n.text, compat_str):
 | 
			
		||||
                    n.text = n.text.decode('utf-8')
 | 
			
		||||
    return tree
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
US_RATINGS = {
 | 
			
		||||
    'G': 0,
 | 
			
		||||
    'PG': 10,
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user