mirror of
				https://gitlab.com/ytdl-org/youtube-dl.git
				synced 2025-11-03 20:37:08 -05:00 
			
		
		
		
	[YoutubeDL] Add generic video filtering (Fixes #4916)
This functionality is intended to eventually encompass the current format filtering.
This commit is contained in:
		@@ -53,6 +53,7 @@ from youtube_dl.utils import (
 | 
			
		||||
    version_tuple,
 | 
			
		||||
    xpath_with_ns,
 | 
			
		||||
    render_table,
 | 
			
		||||
    match_str,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -459,6 +460,37 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
 | 
			
		||||
            '123  4\n'
 | 
			
		||||
            '9999 51')
 | 
			
		||||
 | 
			
		||||
    def test_match_str(self):
        """ Check match_str for presence tests, comparisons and '&' chaining """
        numeric = {'x': 1200}
        textual = {'y': 'foobar42'}
        combined = 'like_count > 100 & dislike_count <? 50 & description'

        # Ordering operators cannot be used with a string value
        self.assertRaises(ValueError, match_str, 'xy>foobar', {})

        # Presence / absence of a key
        self.assertFalse(match_str('xy', numeric))
        self.assertTrue(match_str('!xy', numeric))
        self.assertTrue(match_str('x', numeric))
        self.assertFalse(match_str('!x', numeric))
        self.assertTrue(match_str('x', {'x': 0}))

        # Numeric comparisons, including missing keys and the '?' marker
        self.assertFalse(match_str('x>0', {'x': 0}))
        self.assertFalse(match_str('x>0', {}))
        self.assertTrue(match_str('x>?0', {}))
        self.assertTrue(match_str('x>1K', numeric))
        self.assertFalse(match_str('x>2K', numeric))
        self.assertTrue(match_str('x>=1200 & x < 1300', numeric))
        self.assertFalse(match_str('x>=1100 & x < 1200', numeric))

        # String (in)equality
        self.assertFalse(match_str('y=a212', textual))
        self.assertTrue(match_str('y=foobar42', textual))
        self.assertFalse(match_str('y!=foobar42', textual))
        self.assertTrue(match_str('y!=foobar2', textual))

        # Several clauses joined with '&'
        self.assertFalse(match_str(
            combined, {'like_count': 90, 'description': 'foo'}))
        self.assertTrue(match_str(
            combined, {'like_count': 190, 'description': 'foo'}))
        self.assertFalse(match_str(
            combined,
            {'like_count': 190, 'dislike_count': 60, 'description': 'foo'}))
        self.assertFalse(match_str(
            combined, {'like_count': 190, 'dislike_count': 10}))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
 | 
			
		||||
    unittest.main()
 | 
			
		||||
 
 | 
			
		||||
@@ -228,6 +228,11 @@ class YoutubeDL(object):
 | 
			
		||||
    external_downloader:  Executable of the external downloader to call.
 | 
			
		||||
    listformats:       Print an overview of available video formats and exit.
 | 
			
		||||
    list_thumbnails:   Print a table of all thumbnails and exit.
 | 
			
		||||
    match_filter:      A function that gets called with the info_dict of
 | 
			
		||||
                       every video.
 | 
			
		||||
                       If it returns a message, the video is ignored.
 | 
			
		||||
                       If it returns None, the video is downloaded.
 | 
			
		||||
                       match_filter_func in utils.py is one example for this.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    The following parameters are not used by YoutubeDL itself, they are used by
 | 
			
		||||
@@ -583,9 +588,16 @@ class YoutubeDL(object):
 | 
			
		||||
            if max_views is not None and view_count > max_views:
 | 
			
		||||
                return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
 | 
			
		||||
        if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
 | 
			
		||||
            return 'Skipping "%s" because it is age restricted' % title
 | 
			
		||||
            return 'Skipping "%s" because it is age restricted' % video_title
 | 
			
		||||
        if self.in_download_archive(info_dict):
 | 
			
		||||
            return '%s has already been recorded in archive' % video_title
 | 
			
		||||
 | 
			
		||||
        match_filter = self.params.get('match_filter')
 | 
			
		||||
        if match_filter is not None:
 | 
			
		||||
            ret = match_filter(info_dict)
 | 
			
		||||
            if ret is not None:
 | 
			
		||||
                return ret
 | 
			
		||||
 | 
			
		||||
        return None
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
 
 | 
			
		||||
@@ -23,9 +23,10 @@ from .compat import (
 | 
			
		||||
)
 | 
			
		||||
from .utils import (
 | 
			
		||||
    DateRange,
 | 
			
		||||
    DEFAULT_OUTTMPL,
 | 
			
		||||
    decodeOption,
 | 
			
		||||
    DEFAULT_OUTTMPL,
 | 
			
		||||
    DownloadError,
 | 
			
		||||
    match_filter_func,
 | 
			
		||||
    MaxDownloadsReached,
 | 
			
		||||
    preferredencoding,
 | 
			
		||||
    read_batch_urls,
 | 
			
		||||
@@ -247,6 +248,9 @@ def _real_main(argv=None):
 | 
			
		||||
            xattr  # Confuse flake8
 | 
			
		||||
        except ImportError:
 | 
			
		||||
            parser.error('setting filesize xattr requested but python-xattr is not available')
 | 
			
		||||
    match_filter = (
 | 
			
		||||
        None if opts.match_filter is None
 | 
			
		||||
        else match_filter_func(opts.match_filter))
 | 
			
		||||
 | 
			
		||||
    ydl_opts = {
 | 
			
		||||
        'usenetrc': opts.usenetrc,
 | 
			
		||||
@@ -344,6 +348,7 @@ def _real_main(argv=None):
 | 
			
		||||
        'list_thumbnails': opts.list_thumbnails,
 | 
			
		||||
        'playlist_items': opts.playlist_items,
 | 
			
		||||
        'xattr_set_filesize': opts.xattr_set_filesize,
 | 
			
		||||
        'match_filter': match_filter,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    with YoutubeDL(ydl_opts) as ydl:
 | 
			
		||||
 
 | 
			
		||||
@@ -244,6 +244,25 @@ def parseOpts(overrideArguments=None):
 | 
			
		||||
        '--max-views',
 | 
			
		||||
        metavar='COUNT', dest='max_views', default=None, type=int,
 | 
			
		||||
        help='Do not download any videos with more than COUNT views')
 | 
			
		||||
    # The help text is built from adjacent string literals, which Python
    # concatenates verbatim: every fragment must carry its own separating
    # space, otherwise sentences run together in the rendered --help output.
    selection.add_option(
        '--match-filter',
        metavar='FILTER', dest='match_filter', default=None,
        help=(
            '(Experimental) Generic video filter. '
            'Specify any key (see help for -o for a list of available keys) to'
            ' match if the key is present, '
            '!key to check if the key is not present, '
            'key > NUMBER (like "comment_count > 12", also works with '
            '>=, <, <=, !=, =) to compare against a number, and '
            '& to require multiple matches. '
            'Values which are not known are excluded unless you'
            ' put a question mark (?) after the operator. '
            'For example, to only match videos that have been liked more than '
            '100 times and disliked less than 50 times (or the dislike '
            'functionality is not available at the given service), but who '
            'also have a description, use  --match-filter '
            '"like_count > 100 & dislike_count <? 50 & description" .'
        ))
 | 
			
		||||
    selection.add_option(
 | 
			
		||||
        '--no-playlist',
 | 
			
		||||
        action='store_true', dest='noplaylist', default=False,
 | 
			
		||||
 
 | 
			
		||||
@@ -17,6 +17,7 @@ import io
 | 
			
		||||
import json
 | 
			
		||||
import locale
 | 
			
		||||
import math
 | 
			
		||||
import operator
 | 
			
		||||
import os
 | 
			
		||||
import pipes
 | 
			
		||||
import platform
 | 
			
		||||
@@ -1678,3 +1679,79 @@ def render_table(header_row, data):
 | 
			
		||||
    max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
 | 
			
		||||
    format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
 | 
			
		||||
    return '\n'.join(format_str % tuple(row) for row in table)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _match_one(filter_part, dct):
 | 
			
		||||
    COMPARISON_OPERATORS = {
 | 
			
		||||
        '<': operator.lt,
 | 
			
		||||
        '<=': operator.le,
 | 
			
		||||
        '>': operator.gt,
 | 
			
		||||
        '>=': operator.ge,
 | 
			
		||||
        '=': operator.eq,
 | 
			
		||||
        '!=': operator.ne,
 | 
			
		||||
    }
 | 
			
		||||
    operator_rex = re.compile(r'''(?x)\s*
 | 
			
		||||
        (?P<key>[a-z_]+)
 | 
			
		||||
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
 | 
			
		||||
        (?:
 | 
			
		||||
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
 | 
			
		||||
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
 | 
			
		||||
        )
 | 
			
		||||
        \s*$
 | 
			
		||||
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
 | 
			
		||||
    m = operator_rex.search(filter_part)
 | 
			
		||||
    if m:
 | 
			
		||||
        op = COMPARISON_OPERATORS[m.group('op')]
 | 
			
		||||
        if m.group('strval') is not None:
 | 
			
		||||
            if m.group('op') not in ('=', '!='):
 | 
			
		||||
                raise ValueError(
 | 
			
		||||
                    'Operator %s does not support string values!' % m.group('op'))
 | 
			
		||||
            comparison_value = m.group('strval')
 | 
			
		||||
        else:
 | 
			
		||||
            try:
 | 
			
		||||
                comparison_value = int(m.group('intval'))
 | 
			
		||||
            except ValueError:
 | 
			
		||||
                comparison_value = parse_filesize(m.group('intval'))
 | 
			
		||||
                if comparison_value is None:
 | 
			
		||||
                    comparison_value = parse_filesize(m.group('intval') + 'B')
 | 
			
		||||
                if comparison_value is None:
 | 
			
		||||
                    raise ValueError(
 | 
			
		||||
                        'Invalid integer value %r in filter part %r' % (
 | 
			
		||||
                            m.group('intval'), filter_part))
 | 
			
		||||
        actual_value = dct.get(m.group('key'))
 | 
			
		||||
        if actual_value is None:
 | 
			
		||||
            return m.group('none_inclusive')
 | 
			
		||||
        return op(actual_value, comparison_value)
 | 
			
		||||
 | 
			
		||||
    UNARY_OPERATORS = {
 | 
			
		||||
        '': lambda v: v is not None,
 | 
			
		||||
        '!': lambda v: v is None,
 | 
			
		||||
    }
 | 
			
		||||
    operator_rex = re.compile(r'''(?x)\s*
 | 
			
		||||
        (?P<op>%s)\s*(?P<key>[a-z_]+)
 | 
			
		||||
        \s*$
 | 
			
		||||
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
 | 
			
		||||
    m = operator_rex.search(filter_part)
 | 
			
		||||
    if m:
 | 
			
		||||
        op = UNARY_OPERATORS[m.group('op')]
 | 
			
		||||
        actual_value = dct.get(m.group('key'))
 | 
			
		||||
        return op(actual_value)
 | 
			
		||||
 | 
			
		||||
    raise ValueError('Invalid filter part %r' % filter_part)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def match_str(filter_str, dct):
    """ Check dct against a filter string made of clauses joined by '&'.

    Returns True if every clause passes, False as soon as one does not
    (later clauses are then not evaluated).
    """
    for clause in filter_str.split('&'):
        if not _match_one(clause, dct):
            return False
    return True
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def match_filter_func(filter_str):
    """ Build a callback that checks an info dict against filter_str.

    The returned function yields None when the info dict passes the filter
    and a human-readable skip message otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            # Name the video by title, else by id, else by a generic label
            label = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (label, filter_str)
        return None

    return _match_func
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user