mirror of
				https://gitlab.com/ytdl-org/youtube-dl.git
				synced 2025-11-04 09:17:07 -05:00 
			
		
		
		
	[mixcloud] Fix extraction (closes #14088)
This commit is contained in:
		
				
					committed by
					
						
						Sergey M․
					
				
			
			
				
	
			
			
			
						parent
						
							8c2895305d
						
					
				
				
					commit
					2384f5a64e
				
			@@ -6,6 +6,7 @@ import collections
 | 
			
		||||
import email
 | 
			
		||||
import getpass
 | 
			
		||||
import io
 | 
			
		||||
import itertools
 | 
			
		||||
import optparse
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
@@ -15,7 +16,6 @@ import socket
 | 
			
		||||
import struct
 | 
			
		||||
import subprocess
 | 
			
		||||
import sys
 | 
			
		||||
import itertools
 | 
			
		||||
import xml.etree.ElementTree
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -2898,6 +2898,13 @@ else:
 | 
			
		||||
    compat_struct_pack = struct.pack
 | 
			
		||||
    compat_struct_unpack = struct.unpack
 | 
			
		||||
 | 
			
		||||
# compat_zip: pick an iterator-returning zip() for the running interpreter.
try:
    # Available on Python 2.6+ (absent on 3.x, where the import fails).
    from future_builtins import zip as compat_zip
except ImportError:
    # Without future_builtins, use itertools.izip when the module has it
    # and the builtin zip otherwise.
    compat_zip = getattr(itertools, 'izip', zip)
 | 
			
		||||
 | 
			
		||||
__all__ = [
 | 
			
		||||
    'compat_HTMLParseError',
 | 
			
		||||
@@ -2948,5 +2955,6 @@ __all__ = [
 | 
			
		||||
    'compat_urlretrieve',
 | 
			
		||||
    'compat_xml_parse_error',
 | 
			
		||||
    'compat_xpath',
 | 
			
		||||
    'compat_zip',
 | 
			
		||||
    'workaround_optparse_bug9161',
 | 
			
		||||
]
 | 
			
		||||
 
 | 
			
		||||
@@ -9,16 +9,16 @@ from .common import InfoExtractor
 | 
			
		||||
from ..compat import (
 | 
			
		||||
    compat_chr,
 | 
			
		||||
    compat_ord,
 | 
			
		||||
    compat_str,
 | 
			
		||||
    compat_urllib_parse_unquote,
 | 
			
		||||
    compat_urlparse,
 | 
			
		||||
    compat_zip
 | 
			
		||||
)
 | 
			
		||||
from ..utils import (
 | 
			
		||||
    clean_html,
 | 
			
		||||
    ExtractorError,
 | 
			
		||||
    OnDemandPagedList,
 | 
			
		||||
    str_to_int,
 | 
			
		||||
)
 | 
			
		||||
    try_get)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class MixcloudIE(InfoExtractor):
 | 
			
		||||
@@ -54,27 +54,19 @@ class MixcloudIE(InfoExtractor):
 | 
			
		||||
        'only_matching': True,
 | 
			
		||||
    }]
 | 
			
		||||
 | 
			
		||||
    # Candidate XOR keys; _decrypt_play_info tries them in list order.
    # _real_extract may insert a freshly scraped key at index 0, so this
    # has to stay a mutable list.
    _keys = [
        'return { requestAnimationFrame: function(callback) { callback(); }, innerHeight: 500 };',
        'pleasedontdownloadourmusictheartistswontgetpaid',
        'window.addEventListener = window.addEventListener || function() {};',
        '(function() { return new Date().toLocaleDateString(); })()',
    ]

    # Key most recently scraped from the player JS by _real_extract;
    # None until one has been found.
    _current_key = None
 | 
			
		||||
    @staticmethod
    def _decrypt_xor_cipher(key, ciphertext):
        """XOR ciphertext against key, repeating the key as needed.

        XOR is its own inverse, so the same call both encrypts and decrypts.
        The result has one character per (ciphertext, key) pair; an empty
        key therefore yields an empty string.
        """
        keystream = itertools.cycle(key)
        plain_chars = []
        for cipher_ch, key_ch in compat_zip(ciphertext, keystream):
            plain_chars.append(
                compat_chr(compat_ord(cipher_ch) ^ compat_ord(key_ch)))
        return ''.join(plain_chars)
 | 
			
		||||
 | 
			
		||||
    # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js
 | 
			
		||||
    def _decrypt_play_info(self, play_info, video_id):
        """Decode play_info and return the JSON parsed with the first key that works.

        play_info is base64-decoded, then XOR-decrypted with each entry of
        self._keys in order; the first result that self._parse_json accepts
        is returned.

        Raises ExtractorError (from the last attempt) when no key produces
        parseable JSON. Returns None if self._keys is empty, because the
        loop body never runs.
        """
        raw_play_info = base64.b64decode(play_info.encode('ascii'))
        last_attempt = len(self._keys)
        for attempt, key in enumerate(self._keys, start=1):
            try:
                # Reuse the shared XOR helper instead of a second hand-rolled
                # loop; this also avoids the modulo-by-zero an empty key
                # would trigger with key[idx % len(key)].
                return self._parse_json(
                    self._decrypt_xor_cipher(key, raw_play_info), video_id)
            except ExtractorError:
                # Only the failure of the final candidate is reported.
                if attempt == last_attempt:
                    raise
 | 
			
		||||
    @staticmethod
    def _decrypt_and_extend(stream_info, url_key, getter, key, formats):
        """Decrypt stream_info[url_key] and append the resulting formats.

        When url_key is missing from stream_info (or maps to None) nothing
        happens. Otherwise the value is base64-decoded, XOR-decrypted with
        key, handed to getter, and whatever iterable getter returns is
        appended to formats in place.
        """
        encrypted_url = stream_info.get(url_key)
        if encrypted_url is None:
            return
        plain_url = MixcloudIE._decrypt_xor_cipher(
            key, base64.b64decode(encrypted_url))
        formats.extend(getter(plain_url))
 | 
			
		||||
 | 
			
		||||
    def _real_extract(self, url):
 | 
			
		||||
        mobj = re.match(self._VALID_URL, url)
 | 
			
		||||
@@ -84,35 +76,66 @@ class MixcloudIE(InfoExtractor):
 | 
			
		||||
 | 
			
		||||
        webpage = self._download_webpage(url, track_id)
 | 
			
		||||
 | 
			
		||||
        if not self._current_key:
 | 
			
		||||
            js_url = self._search_regex(
 | 
			
		||||
                r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)',
 | 
			
		||||
                webpage, 'js url', default=None)
 | 
			
		||||
            if js_url:
 | 
			
		||||
                js = self._download_webpage(js_url, track_id, fatal=False)
 | 
			
		||||
                if js:
 | 
			
		||||
                    KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P<key>(?:(?!\1).)+)\1'
 | 
			
		||||
                    for key_name in ('value', 'key_value', 'key_value.*?', '.*?value.*?'):
 | 
			
		||||
                        key = self._search_regex(
 | 
			
		||||
                            KEY_RE_TEMPLATE % key_name, js, 'key',
 | 
			
		||||
                            default=None, group='key')
 | 
			
		||||
                        if key and isinstance(key, compat_str):
 | 
			
		||||
                            self._keys.insert(0, key)
 | 
			
		||||
                            self._current_key = key
 | 
			
		||||
        # Legacy path
 | 
			
		||||
        encrypted_play_info = self._search_regex(
 | 
			
		||||
            r'm-play-info="([^"]+)"', webpage, 'play info', default=None)
 | 
			
		||||
 | 
			
		||||
        if encrypted_play_info is not None:
 | 
			
		||||
            # Decode
 | 
			
		||||
            encrypted_play_info = base64.b64decode(encrypted_play_info)
 | 
			
		||||
        else:
 | 
			
		||||
            # New path
 | 
			
		||||
            full_info_json = self._parse_json(self._html_search_regex(
 | 
			
		||||
                r'<script id="relay-data" type="text/x-mixcloud">([^<]+)</script>', webpage, 'play info'), 'play info')
 | 
			
		||||
            for item in full_info_json:
 | 
			
		||||
                item_data = try_get(item, lambda x: x['cloudcast']['data']['cloudcastLookup'])
 | 
			
		||||
                if try_get(item_data, lambda x: x['streamInfo']['url']):
 | 
			
		||||
                    info_json = item_data
 | 
			
		||||
                    break
 | 
			
		||||
            else:
 | 
			
		||||
                raise ExtractorError('Failed to extract matching stream info')
 | 
			
		||||
 | 
			
		||||
        message = self._html_search_regex(
 | 
			
		||||
            r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)',
 | 
			
		||||
            webpage, 'error message', default=None)
 | 
			
		||||
 | 
			
		||||
        encrypted_play_info = self._search_regex(
 | 
			
		||||
            r'm-play-info="([^"]+)"', webpage, 'play info')
 | 
			
		||||
 | 
			
		||||
        play_info = self._decrypt_play_info(encrypted_play_info, track_id)
 | 
			
		||||
        js_url = self._search_regex(
 | 
			
		||||
            r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)',
 | 
			
		||||
            webpage, 'js url', default=None)
 | 
			
		||||
        if js_url is None:
 | 
			
		||||
            js_url = self._search_regex(
 | 
			
		||||
                r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js/www\.[^>]+\.js)',
 | 
			
		||||
                webpage, 'js url')
 | 
			
		||||
        js = self._download_webpage(js_url, track_id)
 | 
			
		||||
        # Known plaintext attack
 | 
			
		||||
        if encrypted_play_info:
 | 
			
		||||
            kps = ['{"stream_url":']
 | 
			
		||||
            kpa_target = encrypted_play_info
 | 
			
		||||
        else:
 | 
			
		||||
            kps = ['https://', 'http://']
 | 
			
		||||
            kpa_target = base64.b64decode(info_json['streamInfo']['url'])
 | 
			
		||||
        for kp in kps:
 | 
			
		||||
            partial_key = self._decrypt_xor_cipher(kpa_target, kp)
 | 
			
		||||
            for quote in ["'", '"']:
 | 
			
		||||
                key = self._search_regex(r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), js,
 | 
			
		||||
                                         "encryption key", default=None)
 | 
			
		||||
                if key is not None:
 | 
			
		||||
                    break
 | 
			
		||||
            else:
 | 
			
		||||
                continue
 | 
			
		||||
            break
 | 
			
		||||
        else:
 | 
			
		||||
            raise ExtractorError('Failed to extract encryption key')
 | 
			
		||||
 | 
			
		||||
        if encrypted_play_info is not None:
 | 
			
		||||
            play_info = self._parse_json(self._decrypt_xor_cipher(key, encrypted_play_info), 'play info')
 | 
			
		||||
            if message and 'stream_url' not in play_info:
 | 
			
		||||
                raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
 | 
			
		||||
 | 
			
		||||
            song_url = play_info['stream_url']
 | 
			
		||||
            formats = [{
 | 
			
		||||
                'format_id': 'normal',
 | 
			
		||||
                'url': song_url
 | 
			
		||||
            }]
 | 
			
		||||
 | 
			
		||||
            title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title')
 | 
			
		||||
            thumbnail = self._proto_relative_url(self._html_search_regex(
 | 
			
		||||
@@ -128,10 +151,30 @@ class MixcloudIE(InfoExtractor):
 | 
			
		||||
                 r'(?:m|data)-tooltip=["\']([\d,.]+) plays'],
 | 
			
		||||
                webpage, 'play count', default=None))
 | 
			
		||||
 | 
			
		||||
        else:
 | 
			
		||||
            title = info_json['name']
 | 
			
		||||
            thumbnail = try_get(info_json,
 | 
			
		||||
                                lambda x: 'https://thumbnailer.mixcloud.com/unsafe/600x600/' + x['picture']['urlRoot'])
 | 
			
		||||
            uploader = try_get(info_json, lambda x: x['owner']['displayName'])
 | 
			
		||||
            uploader_id = try_get(info_json, lambda x: x['owner']['username'])
 | 
			
		||||
            description = try_get(info_json, lambda x: x['description'])
 | 
			
		||||
            view_count = try_get(info_json, lambda x: x['plays'])
 | 
			
		||||
 | 
			
		||||
            stream_info = info_json['streamInfo']
 | 
			
		||||
            formats = []
 | 
			
		||||
            self._decrypt_and_extend(stream_info, 'url', lambda x: [{
 | 
			
		||||
                'format_id': 'normal',
 | 
			
		||||
                'url': x
 | 
			
		||||
            }], key, formats)
 | 
			
		||||
            self._decrypt_and_extend(stream_info, 'hlsUrl', lambda x: self._extract_m3u8_formats(x, title), key,
 | 
			
		||||
                                     formats)
 | 
			
		||||
            self._decrypt_and_extend(stream_info, 'dashUrl', lambda x: self._extract_mpd_formats(x, title), key,
 | 
			
		||||
                                     formats)
 | 
			
		||||
 | 
			
		||||
        return {
 | 
			
		||||
            'id': track_id,
 | 
			
		||||
            'title': title,
 | 
			
		||||
            'url': song_url,
 | 
			
		||||
            'formats': formats,
 | 
			
		||||
            'description': description,
 | 
			
		||||
            'thumbnail': thumbnail,
 | 
			
		||||
            'uploader': uploader,
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user