mirror of
				https://gitlab.com/ytdl-org/youtube-dl.git
				synced 2025-11-04 09:17:07 -05:00 
			
		
		
		
	[mixcloud] Fix extraction (closes #14088)
This commit is contained in:
		
				
					committed by
					
						
						Sergey M․
					
				
			
			
				
	
			
			
			
						parent
						
							8c2895305d
						
					
				
				
					commit
					2384f5a64e
				
			@@ -6,6 +6,7 @@ import collections
 | 
				
			|||||||
import email
 | 
					import email
 | 
				
			||||||
import getpass
 | 
					import getpass
 | 
				
			||||||
import io
 | 
					import io
 | 
				
			||||||
 | 
					import itertools
 | 
				
			||||||
import optparse
 | 
					import optparse
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
@@ -15,7 +16,6 @@ import socket
 | 
				
			|||||||
import struct
 | 
					import struct
 | 
				
			||||||
import subprocess
 | 
					import subprocess
 | 
				
			||||||
import sys
 | 
					import sys
 | 
				
			||||||
import itertools
 | 
					 | 
				
			||||||
import xml.etree.ElementTree
 | 
					import xml.etree.ElementTree
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -2898,6 +2898,13 @@ else:
 | 
				
			|||||||
    compat_struct_pack = struct.pack
 | 
					    compat_struct_pack = struct.pack
 | 
				
			||||||
    compat_struct_unpack = struct.unpack
 | 
					    compat_struct_unpack = struct.unpack
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					try:
 | 
				
			||||||
 | 
					    from future_builtins import zip as compat_zip
 | 
				
			||||||
 | 
					except ImportError:  # not 2.6+ or is 3.x
 | 
				
			||||||
 | 
					    try:
 | 
				
			||||||
 | 
					        from itertools import izip as compat_zip  # < 2.5 or 3.x
 | 
				
			||||||
 | 
					    except ImportError:
 | 
				
			||||||
 | 
					        compat_zip = zip
 | 
				
			||||||
 | 
					
 | 
				
			||||||
__all__ = [
 | 
					__all__ = [
 | 
				
			||||||
    'compat_HTMLParseError',
 | 
					    'compat_HTMLParseError',
 | 
				
			||||||
@@ -2948,5 +2955,6 @@ __all__ = [
 | 
				
			|||||||
    'compat_urlretrieve',
 | 
					    'compat_urlretrieve',
 | 
				
			||||||
    'compat_xml_parse_error',
 | 
					    'compat_xml_parse_error',
 | 
				
			||||||
    'compat_xpath',
 | 
					    'compat_xpath',
 | 
				
			||||||
 | 
					    'compat_zip',
 | 
				
			||||||
    'workaround_optparse_bug9161',
 | 
					    'workaround_optparse_bug9161',
 | 
				
			||||||
]
 | 
					]
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -9,16 +9,16 @@ from .common import InfoExtractor
 | 
				
			|||||||
from ..compat import (
 | 
					from ..compat import (
 | 
				
			||||||
    compat_chr,
 | 
					    compat_chr,
 | 
				
			||||||
    compat_ord,
 | 
					    compat_ord,
 | 
				
			||||||
    compat_str,
 | 
					 | 
				
			||||||
    compat_urllib_parse_unquote,
 | 
					    compat_urllib_parse_unquote,
 | 
				
			||||||
    compat_urlparse,
 | 
					    compat_urlparse,
 | 
				
			||||||
 | 
					    compat_zip
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
from ..utils import (
 | 
					from ..utils import (
 | 
				
			||||||
    clean_html,
 | 
					    clean_html,
 | 
				
			||||||
    ExtractorError,
 | 
					    ExtractorError,
 | 
				
			||||||
    OnDemandPagedList,
 | 
					    OnDemandPagedList,
 | 
				
			||||||
    str_to_int,
 | 
					    str_to_int,
 | 
				
			||||||
)
 | 
					    try_get)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class MixcloudIE(InfoExtractor):
 | 
					class MixcloudIE(InfoExtractor):
 | 
				
			||||||
@@ -54,27 +54,19 @@ class MixcloudIE(InfoExtractor):
 | 
				
			|||||||
        'only_matching': True,
 | 
					        'only_matching': True,
 | 
				
			||||||
    }]
 | 
					    }]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    _keys = [
 | 
					    @staticmethod
 | 
				
			||||||
        'return { requestAnimationFrame: function(callback) { callback(); }, innerHeight: 500 };',
 | 
					    def _decrypt_xor_cipher(key, ciphertext):
 | 
				
			||||||
        'pleasedontdownloadourmusictheartistswontgetpaid',
 | 
					        """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR."""
 | 
				
			||||||
        'window.addEventListener = window.addEventListener || function() {};',
 | 
					        return ''.join([
 | 
				
			||||||
        '(function() { return new Date().toLocaleDateString(); })()'
 | 
					            compat_chr(compat_ord(ch) ^ compat_ord(k))
 | 
				
			||||||
    ]
 | 
					            for ch, k in compat_zip(ciphertext, itertools.cycle(key))])
 | 
				
			||||||
    _current_key = None
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js
 | 
					    @staticmethod
 | 
				
			||||||
    def _decrypt_play_info(self, play_info, video_id):
 | 
					    def _decrypt_and_extend(stream_info, url_key, getter, key, formats):
 | 
				
			||||||
        play_info = base64.b64decode(play_info.encode('ascii'))
 | 
					        maybe_url = stream_info.get(url_key)
 | 
				
			||||||
        for num, key in enumerate(self._keys, start=1):
 | 
					        if maybe_url is not None:
 | 
				
			||||||
            try:
 | 
					            decrypted = MixcloudIE._decrypt_xor_cipher(key, base64.b64decode(maybe_url))
 | 
				
			||||||
                return self._parse_json(
 | 
					            formats.extend(getter(decrypted))
 | 
				
			||||||
                    ''.join([
 | 
					 | 
				
			||||||
                        compat_chr(compat_ord(ch) ^ compat_ord(key[idx % len(key)]))
 | 
					 | 
				
			||||||
                        for idx, ch in enumerate(play_info)]),
 | 
					 | 
				
			||||||
                    video_id)
 | 
					 | 
				
			||||||
            except ExtractorError:
 | 
					 | 
				
			||||||
                if num == len(self._keys):
 | 
					 | 
				
			||||||
                    raise
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _real_extract(self, url):
 | 
					    def _real_extract(self, url):
 | 
				
			||||||
        mobj = re.match(self._VALID_URL, url)
 | 
					        mobj = re.match(self._VALID_URL, url)
 | 
				
			||||||
@@ -84,54 +76,105 @@ class MixcloudIE(InfoExtractor):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        webpage = self._download_webpage(url, track_id)
 | 
					        webpage = self._download_webpage(url, track_id)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if not self._current_key:
 | 
					        # Legacy path
 | 
				
			||||||
            js_url = self._search_regex(
 | 
					        encrypted_play_info = self._search_regex(
 | 
				
			||||||
                r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)',
 | 
					            r'm-play-info="([^"]+)"', webpage, 'play info', default=None)
 | 
				
			||||||
                webpage, 'js url', default=None)
 | 
					
 | 
				
			||||||
            if js_url:
 | 
					        if encrypted_play_info is not None:
 | 
				
			||||||
                js = self._download_webpage(js_url, track_id, fatal=False)
 | 
					            # Decode
 | 
				
			||||||
                if js:
 | 
					            encrypted_play_info = base64.b64decode(encrypted_play_info)
 | 
				
			||||||
                    KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P<key>(?:(?!\1).)+)\1'
 | 
					        else:
 | 
				
			||||||
                    for key_name in ('value', 'key_value', 'key_value.*?', '.*?value.*?'):
 | 
					            # New path
 | 
				
			||||||
                        key = self._search_regex(
 | 
					            full_info_json = self._parse_json(self._html_search_regex(
 | 
				
			||||||
                            KEY_RE_TEMPLATE % key_name, js, 'key',
 | 
					                r'<script id="relay-data" type="text/x-mixcloud">([^<]+)</script>', webpage, 'play info'), 'play info')
 | 
				
			||||||
                            default=None, group='key')
 | 
					            for item in full_info_json:
 | 
				
			||||||
                        if key and isinstance(key, compat_str):
 | 
					                item_data = try_get(item, lambda x: x['cloudcast']['data']['cloudcastLookup'])
 | 
				
			||||||
                            self._keys.insert(0, key)
 | 
					                if try_get(item_data, lambda x: x['streamInfo']['url']):
 | 
				
			||||||
                            self._current_key = key
 | 
					                    info_json = item_data
 | 
				
			||||||
 | 
					                    break
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                raise ExtractorError('Failed to extract matching stream info')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        message = self._html_search_regex(
 | 
					        message = self._html_search_regex(
 | 
				
			||||||
            r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)',
 | 
					            r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)',
 | 
				
			||||||
            webpage, 'error message', default=None)
 | 
					            webpage, 'error message', default=None)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        encrypted_play_info = self._search_regex(
 | 
					        js_url = self._search_regex(
 | 
				
			||||||
            r'm-play-info="([^"]+)"', webpage, 'play info')
 | 
					            r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)',
 | 
				
			||||||
 | 
					            webpage, 'js url', default=None)
 | 
				
			||||||
 | 
					        if js_url is None:
 | 
				
			||||||
 | 
					            js_url = self._search_regex(
 | 
				
			||||||
 | 
					                r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js/www\.[^>]+\.js)',
 | 
				
			||||||
 | 
					                webpage, 'js url')
 | 
				
			||||||
 | 
					        js = self._download_webpage(js_url, track_id)
 | 
				
			||||||
 | 
					        # Known plaintext attack
 | 
				
			||||||
 | 
					        if encrypted_play_info:
 | 
				
			||||||
 | 
					            kps = ['{"stream_url":']
 | 
				
			||||||
 | 
					            kpa_target = encrypted_play_info
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            kps = ['https://', 'http://']
 | 
				
			||||||
 | 
					            kpa_target = base64.b64decode(info_json['streamInfo']['url'])
 | 
				
			||||||
 | 
					        for kp in kps:
 | 
				
			||||||
 | 
					            partial_key = self._decrypt_xor_cipher(kpa_target, kp)
 | 
				
			||||||
 | 
					            for quote in ["'", '"']:
 | 
				
			||||||
 | 
					                key = self._search_regex(r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), js,
 | 
				
			||||||
 | 
					                                         "encryption key", default=None)
 | 
				
			||||||
 | 
					                if key is not None:
 | 
				
			||||||
 | 
					                    break
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                continue
 | 
				
			||||||
 | 
					            break
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            raise ExtractorError('Failed to extract encryption key')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        play_info = self._decrypt_play_info(encrypted_play_info, track_id)
 | 
					        if encrypted_play_info is not None:
 | 
				
			||||||
 | 
					            play_info = self._parse_json(self._decrypt_xor_cipher(key, encrypted_play_info), 'play info')
 | 
				
			||||||
 | 
					            if message and 'stream_url' not in play_info:
 | 
				
			||||||
 | 
					                raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
 | 
				
			||||||
 | 
					            song_url = play_info['stream_url']
 | 
				
			||||||
 | 
					            formats = [{
 | 
				
			||||||
 | 
					                'format_id': 'normal',
 | 
				
			||||||
 | 
					                'url': song_url
 | 
				
			||||||
 | 
					            }]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if message and 'stream_url' not in play_info:
 | 
					            title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title')
 | 
				
			||||||
            raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
 | 
					            thumbnail = self._proto_relative_url(self._html_search_regex(
 | 
				
			||||||
 | 
					                r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False))
 | 
				
			||||||
 | 
					            uploader = self._html_search_regex(
 | 
				
			||||||
 | 
					                r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False)
 | 
				
			||||||
 | 
					            uploader_id = self._search_regex(
 | 
				
			||||||
 | 
					                r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
 | 
				
			||||||
 | 
					            description = self._og_search_description(webpage)
 | 
				
			||||||
 | 
					            view_count = str_to_int(self._search_regex(
 | 
				
			||||||
 | 
					                [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
 | 
				
			||||||
 | 
					                 r'/listeners/?">([0-9,.]+)</a>',
 | 
				
			||||||
 | 
					                 r'(?:m|data)-tooltip=["\']([\d,.]+) plays'],
 | 
				
			||||||
 | 
					                webpage, 'play count', default=None))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        song_url = play_info['stream_url']
 | 
					        else:
 | 
				
			||||||
 | 
					            title = info_json['name']
 | 
				
			||||||
 | 
					            thumbnail = try_get(info_json,
 | 
				
			||||||
 | 
					                                lambda x: 'https://thumbnailer.mixcloud.com/unsafe/600x600/' + x['picture']['urlRoot'])
 | 
				
			||||||
 | 
					            uploader = try_get(info_json, lambda x: x['owner']['displayName'])
 | 
				
			||||||
 | 
					            uploader_id = try_get(info_json, lambda x: x['owner']['username'])
 | 
				
			||||||
 | 
					            description = try_get(info_json, lambda x: x['description'])
 | 
				
			||||||
 | 
					            view_count = try_get(info_json, lambda x: x['plays'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title')
 | 
					            stream_info = info_json['streamInfo']
 | 
				
			||||||
        thumbnail = self._proto_relative_url(self._html_search_regex(
 | 
					            formats = []
 | 
				
			||||||
            r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False))
 | 
					            self._decrypt_and_extend(stream_info, 'url', lambda x: [{
 | 
				
			||||||
        uploader = self._html_search_regex(
 | 
					                'format_id': 'normal',
 | 
				
			||||||
            r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False)
 | 
					                'url': x
 | 
				
			||||||
        uploader_id = self._search_regex(
 | 
					            }], key, formats)
 | 
				
			||||||
            r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
 | 
					            self._decrypt_and_extend(stream_info, 'hlsUrl', lambda x: self._extract_m3u8_formats(x, title), key,
 | 
				
			||||||
        description = self._og_search_description(webpage)
 | 
					                                     formats)
 | 
				
			||||||
        view_count = str_to_int(self._search_regex(
 | 
					            self._decrypt_and_extend(stream_info, 'dashUrl', lambda x: self._extract_mpd_formats(x, title), key,
 | 
				
			||||||
            [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
 | 
					                                     formats)
 | 
				
			||||||
             r'/listeners/?">([0-9,.]+)</a>',
 | 
					 | 
				
			||||||
             r'(?:m|data)-tooltip=["\']([\d,.]+) plays'],
 | 
					 | 
				
			||||||
            webpage, 'play count', default=None))
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        return {
 | 
					        return {
 | 
				
			||||||
            'id': track_id,
 | 
					            'id': track_id,
 | 
				
			||||||
            'title': title,
 | 
					            'title': title,
 | 
				
			||||||
            'url': song_url,
 | 
					            'formats': formats,
 | 
				
			||||||
            'description': description,
 | 
					            'description': description,
 | 
				
			||||||
            'thumbnail': thumbnail,
 | 
					            'thumbnail': thumbnail,
 | 
				
			||||||
            'uploader': uploader,
 | 
					            'uploader': uploader,
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user