mirror of
				https://gitlab.com/ytdl-org/youtube-dl.git
				synced 2025-11-03 18:47:06 -05:00 
			
		
		
		
	Add infrastructure for paged lists
This commit allows playlist pages to be downloaded as needed instead of all at once.
Before this commit,
    youtube-dl http://www.youtube.com/user/ANNnewsCH/videos --playlist-end 2 --skip-download
took quite some time - now it's almost instantaneous.
As an example, the youtube:user extractor has been converted.
Fixes #2175
			
			
This commit is contained in:
		@@ -18,6 +18,7 @@ from youtube_dl.utils import (
 | 
			
		||||
    find_xpath_attr,
 | 
			
		||||
    get_meta_content,
 | 
			
		||||
    orderedSet,
 | 
			
		||||
    PagedList,
 | 
			
		||||
    parse_duration,
 | 
			
		||||
    sanitize_filename,
 | 
			
		||||
    shell_quote,
 | 
			
		||||
@@ -200,5 +201,26 @@ class TestUtil(unittest.TestCase):
 | 
			
		||||
        self.assertEqual(parse_duration('9:12:43'), 33163)
 | 
			
		||||
        self.assertEqual(parse_duration('x:y'), None)
 | 
			
		||||
 | 
			
		||||
    def test_paged_list(self):
        # Exercise PagedList.getslice against a fake paged source and compare
        # the result with the expected list of ids.
        def check_slice(total, per_page, slice_args, wanted):
            # Fake page source: page ``pagenum`` yields the consecutive ids
            # belonging to that page; the last page is cut off at ``total``.
            def fetch_page(pagenum):
                lower = pagenum * per_page
                upper = min(total, lower + per_page)
                for item in range(lower, upper):
                    yield item

            paged = PagedList(fetch_page, per_page)
            self.assertEqual(paged.getslice(*slice_args), wanted)

        # (getslice arguments, expected result) for 5 entries in pages of 2.
        cases = [
            ((), [0, 1, 2, 3, 4]),
            ((1,), [1, 2, 3, 4]),
            ((2,), [2, 3, 4]),
            ((4,), [4]),
            ((0, 3), [0, 1, 2]),
            ((1, 4), [1, 2, 3]),
            ((2, 99), [2, 3, 4]),
            ((20, 99), []),
        ]
        for slice_args, wanted in cases:
            check_slice(5, 2, slice_args, wanted)
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
 | 
			
		||||
    unittest.main()
 | 
			
		||||
 
 | 
			
		||||
@@ -39,6 +39,7 @@ from .utils import (
 | 
			
		||||
    locked_file,
 | 
			
		||||
    make_HTTPS_handler,
 | 
			
		||||
    MaxDownloadsReached,
 | 
			
		||||
    PagedList,
 | 
			
		||||
    PostProcessingError,
 | 
			
		||||
    platform_name,
 | 
			
		||||
    preferredencoding,
 | 
			
		||||
@@ -575,19 +576,27 @@ class YoutubeDL(object):
 | 
			
		||||
 | 
			
		||||
            playlist_results = []
 | 
			
		||||
 | 
			
		||||
            n_all_entries = len(ie_result['entries'])
 | 
			
		||||
            playliststart = self.params.get('playliststart', 1) - 1
 | 
			
		||||
            playlistend = self.params.get('playlistend', None)
 | 
			
		||||
            # For backwards compatibility, interpret -1 as whole list
 | 
			
		||||
            if playlistend == -1:
 | 
			
		||||
                playlistend = None
 | 
			
		||||
 | 
			
		||||
            entries = ie_result['entries'][playliststart:playlistend]
 | 
			
		||||
            n_entries = len(entries)
 | 
			
		||||
 | 
			
		||||
            self.to_screen(
 | 
			
		||||
                "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
 | 
			
		||||
                (ie_result['extractor'], playlist, n_all_entries, n_entries))
 | 
			
		||||
            if isinstance(ie_result['entries'], list):
 | 
			
		||||
                n_all_entries = len(ie_result['entries'])
 | 
			
		||||
                entries = ie_result['entries'][playliststart:playlistend]
 | 
			
		||||
                n_entries = len(entries)
 | 
			
		||||
                self.to_screen(
 | 
			
		||||
                    "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
 | 
			
		||||
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
 | 
			
		||||
            else:
 | 
			
		||||
                assert isinstance(ie_result['entries'], PagedList)
 | 
			
		||||
                entries = ie_result['entries'].getslice(
 | 
			
		||||
                    playliststart, playlistend)
 | 
			
		||||
                n_entries = len(entries)
 | 
			
		||||
                self.to_screen(
 | 
			
		||||
                    "[%s] playlist %s: Downloading %d videos" %
 | 
			
		||||
                    (ie_result['extractor'], playlist, n_entries))
 | 
			
		||||
 | 
			
		||||
            for i, entry in enumerate(entries, 1):
 | 
			
		||||
                self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
 | 
			
		||||
 
 | 
			
		||||
@@ -27,6 +27,7 @@ from ..utils import (
 | 
			
		||||
    get_element_by_id,
 | 
			
		||||
    get_element_by_attribute,
 | 
			
		||||
    ExtractorError,
 | 
			
		||||
    PagedList,
 | 
			
		||||
    RegexNotFoundError,
 | 
			
		||||
    unescapeHTML,
 | 
			
		||||
    unified_strdate,
 | 
			
		||||
@@ -1580,44 +1581,35 @@ class YoutubeUserIE(InfoExtractor):
 | 
			
		||||
        # page by page until there are no video ids - it means we got
 | 
			
		||||
        # all of them.
 | 
			
		||||
 | 
			
		||||
        url_results = []
 | 
			
		||||
 | 
			
		||||
        for pagenum in itertools.count(0):
 | 
			
		||||
        def download_page(pagenum):
 | 
			
		||||
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
 | 
			
		||||
 | 
			
		||||
            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
 | 
			
		||||
            page = self._download_webpage(gdata_url, username,
 | 
			
		||||
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
 | 
			
		||||
            page = self._download_webpage(
 | 
			
		||||
                gdata_url, username,
 | 
			
		||||
                u'Downloading video ids from %d to %d' % (
 | 
			
		||||
                    start_index, start_index + self._GDATA_PAGE_SIZE))
 | 
			
		||||
 | 
			
		||||
            try:
 | 
			
		||||
                response = json.loads(page)
 | 
			
		||||
            except ValueError as err:
 | 
			
		||||
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
 | 
			
		||||
            if 'entry' not in response['feed']:
 | 
			
		||||
                # Number of videos is a multiple of self._MAX_RESULTS
 | 
			
		||||
                break
 | 
			
		||||
                return
 | 
			
		||||
 | 
			
		||||
            # Extract video identifiers
 | 
			
		||||
            entries = response['feed']['entry']
 | 
			
		||||
            for entry in entries:
 | 
			
		||||
                title = entry['title']['$t']
 | 
			
		||||
                video_id = entry['id']['$t'].split('/')[-1]
 | 
			
		||||
                url_results.append({
 | 
			
		||||
                yield {
 | 
			
		||||
                    '_type': 'url',
 | 
			
		||||
                    'url': video_id,
 | 
			
		||||
                    'ie_key': 'Youtube',
 | 
			
		||||
                    'id': 'video_id',
 | 
			
		||||
                    'title': title,
 | 
			
		||||
                })
 | 
			
		||||
 | 
			
		||||
            # A little optimization - if current page is not
 | 
			
		||||
            # "full", ie. does not contain PAGE_SIZE video ids then
 | 
			
		||||
            # we can assume that this page is the last one - there
 | 
			
		||||
            # are no more ids on further pages - no need to query
 | 
			
		||||
            # again.
 | 
			
		||||
 | 
			
		||||
            if len(entries) < self._GDATA_PAGE_SIZE:
 | 
			
		||||
                break
 | 
			
		||||
                }
 | 
			
		||||
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
 | 
			
		||||
 | 
			
		||||
        return self.playlist_result(url_results, playlist_title=username)
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -6,6 +6,7 @@ import datetime
 | 
			
		||||
import email.utils
 | 
			
		||||
import errno
 | 
			
		||||
import gzip
 | 
			
		||||
import itertools
 | 
			
		||||
import io
 | 
			
		||||
import json
 | 
			
		||||
import locale
 | 
			
		||||
@@ -1161,3 +1162,46 @@ def check_executable(exe, args=[]):
 | 
			
		||||
    except OSError:
 | 
			
		||||
        return False
 | 
			
		||||
    return exe
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class PagedList(object):
    """List-like source whose entries are fetched one page at a time.

    ``pagefunc(pagenum)`` is called with a zero-based page number and must
    return an iterable with the entries of that page.  ``pagesize`` is the
    number of entries on a full page; a page with fewer entries is treated
    as the last one.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from ``start`` (inclusive) to ``end`` (exclusive).

        Only the pages overlapping the requested range are requested from
        ``pagefunc``.  ``end=None`` means "up to the last entry".  An empty
        range (``end <= start``) yields ``[]`` without fetching any page.
        """
        # Empty range: nothing to fetch.  Without this guard the per-page end
        # offset below evaluates to a full page when ``end`` coincides with
        # the first id of the page, and the loop would then keep collecting
        # every following page as well.
        if end is not None and end <= start:
            return []

        res = []
        # Begin with the page that contains ``start``.
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            page_results = list(self._pagefunc(pagenum))

            # Offset of ``start`` inside this page (non-zero only on the
            # page that actually contains ``start``).
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset one past the last wanted entry inside this page, or
            # None if the requested range extends beyond this page.
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user