Update to ytdl-2021.02.10

Except: [archiveorg] Fix and improve extraction (5fc53690cbe6abb11941a3f4846b566a7472753e)
2025-10-03 23:43:21 -04:00 · 2021-02-11 02:52:55 +05:30
parent 539d158c50
commit cc2db87805
8 changed files with 345 additions and 237 deletions
--- a/youtube_dlc/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@ -32,7 +32,7 @@ from ..utils import (
    mimetype2ext,
    parse_codecs,
    parse_duration,
-    # qualities,
+    # qualities,  # TODO: Enable this after fixing formatSort
    remove_start,
    smuggle_url,
    str_or_none,
@@ -414,7 +414,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _PLAYER_INFO_RE = (
-        r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.js$',
+        r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
+        r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
        r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
    )
    _formats = {
@@ -621,6 +622,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'uploader': 'AfrojackVEVO',
                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
+                'abr': 129.495,
            },
            'params': {
                'youtube_include_dash_manifest': True,
@@ -1134,10 +1136,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            'only_matching': True,
        },
        {
-            # Age-gated video only available with authentication (unavailable
-            # via embed page workaround)
-            'url': 'XgnwCQzjau8',
-            'only_matching': True,
+            # https://github.com/ytdl-org/youtube-dl/pull/28094
+            'url': 'OtqTfy26tG0',
+            'info_dict': {
+                'id': 'OtqTfy26tG0',
+                'ext': 'mp4',
+                'title': 'Burn Out',
+                'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
+                'upload_date': '20141120',
+                'uploader': 'The Cinematic Orchestra - Topic',
+                'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
+                'artist': 'The Cinematic Orchestra',
+                'track': 'Burn Out',
+                'album': 'Every Day',
+                'release_data': None,
+                'release_year': None,
+            },
+            'params': {
+                'skip_download': True,
+            },
        },
    ]

@@ -1230,6 +1248,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)',
+             r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)',
+             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
@@ -1493,7 +1514,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):

        formats = []
        itags = []
+        itag_qualities = {}
        player_url = None
+        # TODO: Enable this after fixing formatSort
        # q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])
        streaming_data = player_response.get('streamingData') or {}
        streaming_formats = streaming_data.get('formats') or []
@@ -1502,6 +1525,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
                continue

+            itag = str_or_none(fmt.get('itag'))
+            quality = fmt.get('quality')
+            if itag and quality:
+                itag_qualities[itag] = quality
+            # FORMAT_STREAM_TYPE_OTF (otf=1) requires downloading the init fragment
+            # (adding `&sq=0` to the URL) and parsing the emsg box to determine the
+            # number of fragments that would subsequently be requested (with `&sq=N`)
+            if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':
+                continue
+
            fmt_url = fmt.get('url')
            if not fmt_url:
                sc = compat_parse_qs(fmt.get('signatureCipher'))
@@ -1521,10 +1554,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
                fmt_url += '&' + sp + '=' + signature

-            itag = str_or_none(fmt.get('itag'))
            if itag:
                itags.append(itag)
-            quality = fmt.get('quality')
+            tbr = float_or_none(
+                fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
            dct = {
                'asr': int_or_none(fmt.get('audioSampleRate')),
                'filesize': int_or_none(fmt.get('contentLength')),
@@ -1532,9 +1565,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                'format_note': fmt.get('qualityLabel') or quality,
                'fps': int_or_none(fmt.get('fps')),
                'height': int_or_none(fmt.get('height')),
-                # 'quality': q(quality),  # This does not correctly reflect the overall quality of the format
-                'tbr': float_or_none(fmt.get(
-                    'averageBitrate') or fmt.get('bitrate'), 1000),
+                # 'quality': q(quality),    # TODO: Enable this after fixing formatSort
+                'tbr': tbr,
                'url': fmt_url,
                'width': fmt.get('width'),
            }
@@ -1545,7 +1577,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                if mobj:
                    dct['ext'] = mimetype2ext(mobj.group(1))
                    dct.update(parse_codecs(mobj.group(2)))
-            if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
+            no_audio = dct.get('acodec') == 'none'
+            no_video = dct.get('vcodec') == 'none'
+            if no_audio:
+                dct['vbr'] = tbr
+            if no_video:
+                dct['abr'] = tbr
+            if no_audio or no_video:
                dct['downloader_options'] = {
                    # Youtube throttles chunks >~10M
                    'http_chunk_size': 10485760,
@@ -1565,22 +1603,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        if self._downloader.params.get('youtube_include_dash_manifest'):
            dash_manifest_url = streaming_data.get('dashManifestUrl')
            if dash_manifest_url:
-                dash_formats = []
                for f in self._extract_mpd_formats(
                        dash_manifest_url, video_id, fatal=False):
+                    itag = f['format_id']
+                    if itag in itags:
+                        continue
+                    # if itag in itag_qualities:  # TODO: Enable this after fixing formatSort
+                    #     f['quality'] = q(itag_qualities[itag])
                    filesize = int_or_none(self._search_regex(
                        r'/clen/(\d+)', f.get('fragment_base_url')
                        or f['url'], 'file size', default=None))
                    if filesize:
                        f['filesize'] = filesize
-                    dash_formats.append(f)
-                # Until further investigation prefer DASH formats as non-DASH
-                # may not be available (see [1])
-                # 1. https://github.com/ytdl-org/youtube-dl/issues/28070
-                if dash_formats:
-                    dash_formats_keys = [f['format_id'] for f in dash_formats]
-                    formats = [f for f in formats if f['format_id'] not in dash_formats_keys]
-                    formats.extend(dash_formats)
+                    formats.append(f)

        if not formats:
            if streaming_data.get('licenseInfos'):
@@ -1747,7 +1782,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                    'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
                    'track': mobj.group('track').strip(),
                    'release_date': release_date,
-                    'release_year': int(release_year),
+                    'release_year': int_or_none(release_year),
                })

        initial_data = None
@@ -2597,9 +2632,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
        next_continuation = cls._extract_next_continuation_data(renderer)
        if next_continuation:
            return next_continuation
-        contents = renderer.get('contents') or renderer.get('items')
-        if not isinstance(contents, list):
-            return
+        contents = []
+        for key in ('contents', 'items'):
+            contents.extend(try_get(renderer, lambda x: x[key], list) or [])
        for content in contents:
            if not isinstance(content, dict):
                continue