mirror of
				https://github.com/yt-dlp/yt-dlp.git
				synced 2025-10-31 22:55:18 +00:00 
			
		
		
		
	[YoutubeDL] Add generic video filtering (Fixes #4916)
This functionality is intended to eventually encompass the current format filtering.
This commit is contained in:
		| @@ -53,6 +53,7 @@ from youtube_dl.utils import ( | |||||||
|     version_tuple, |     version_tuple, | ||||||
|     xpath_with_ns, |     xpath_with_ns, | ||||||
|     render_table, |     render_table, | ||||||
|  |     match_str, | ||||||
| ) | ) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -459,6 +460,37 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') | |||||||
|             '123  4\n' |             '123  4\n' | ||||||
|             '9999 51') |             '9999 51') | ||||||
|  |  | ||||||
|  |     def test_match_str(self): | ||||||
|  |         self.assertRaises(ValueError, match_str, 'xy>foobar', {}) | ||||||
|  |         self.assertFalse(match_str('xy', {'x': 1200})) | ||||||
|  |         self.assertTrue(match_str('!xy', {'x': 1200})) | ||||||
|  |         self.assertTrue(match_str('x', {'x': 1200})) | ||||||
|  |         self.assertFalse(match_str('!x', {'x': 1200})) | ||||||
|  |         self.assertTrue(match_str('x', {'x': 0})) | ||||||
|  |         self.assertFalse(match_str('x>0', {'x': 0})) | ||||||
|  |         self.assertFalse(match_str('x>0', {})) | ||||||
|  |         self.assertTrue(match_str('x>?0', {})) | ||||||
|  |         self.assertTrue(match_str('x>1K', {'x': 1200})) | ||||||
|  |         self.assertFalse(match_str('x>2K', {'x': 1200})) | ||||||
|  |         self.assertTrue(match_str('x>=1200 & x < 1300', {'x': 1200})) | ||||||
|  |         self.assertFalse(match_str('x>=1100 & x < 1200', {'x': 1200})) | ||||||
|  |         self.assertFalse(match_str('y=a212', {'y': 'foobar42'})) | ||||||
|  |         self.assertTrue(match_str('y=foobar42', {'y': 'foobar42'})) | ||||||
|  |         self.assertFalse(match_str('y!=foobar42', {'y': 'foobar42'})) | ||||||
|  |         self.assertTrue(match_str('y!=foobar2', {'y': 'foobar42'})) | ||||||
|  |         self.assertFalse(match_str( | ||||||
|  |             'like_count > 100 & dislike_count <? 50 & description', | ||||||
|  |             {'like_count': 90, 'description': 'foo'})) | ||||||
|  |         self.assertTrue(match_str( | ||||||
|  |             'like_count > 100 & dislike_count <? 50 & description', | ||||||
|  |             {'like_count': 190, 'description': 'foo'})) | ||||||
|  |         self.assertFalse(match_str( | ||||||
|  |             'like_count > 100 & dislike_count <? 50 & description', | ||||||
|  |             {'like_count': 190, 'dislike_count': 60, 'description': 'foo'})) | ||||||
|  |         self.assertFalse(match_str( | ||||||
|  |             'like_count > 100 & dislike_count <? 50 & description', | ||||||
|  |             {'like_count': 190, 'dislike_count': 10})) | ||||||
|  |  | ||||||
|  |  | ||||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||||
|     unittest.main() |     unittest.main() | ||||||
|   | |||||||
| @@ -228,6 +228,11 @@ class YoutubeDL(object): | |||||||
|     external_downloader:  Executable of the external downloader to call. |     external_downloader:  Executable of the external downloader to call. | ||||||
|     listformats:       Print an overview of available video formats and exit. |     listformats:       Print an overview of available video formats and exit. | ||||||
|     list_thumbnails:   Print a table of all thumbnails and exit. |     list_thumbnails:   Print a table of all thumbnails and exit. | ||||||
|  |     match_filter:      A function that gets called with the info_dict of | ||||||
|  |                        every video. | ||||||
|  |                        If it returns a message, the video is ignored. | ||||||
|  |                        If it returns None, the video is downloaded. | ||||||
|  |                        match_filter_func in utils.py is one example for this. | ||||||
|  |  | ||||||
|  |  | ||||||
|     The following parameters are not used by YoutubeDL itself, they are used by |     The following parameters are not used by YoutubeDL itself, they are used by | ||||||
| @@ -583,9 +588,16 @@ class YoutubeDL(object): | |||||||
|             if max_views is not None and view_count > max_views: |             if max_views is not None and view_count > max_views: | ||||||
|                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) |                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) | ||||||
|         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')): |         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')): | ||||||
|             return 'Skipping "%s" because it is age restricted' % title |             return 'Skipping "%s" because it is age restricted' % video_title | ||||||
|         if self.in_download_archive(info_dict): |         if self.in_download_archive(info_dict): | ||||||
|             return '%s has already been recorded in archive' % video_title |             return '%s has already been recorded in archive' % video_title | ||||||
|  |  | ||||||
|  |         match_filter = self.params.get('match_filter') | ||||||
|  |         if match_filter is not None: | ||||||
|  |             ret = match_filter(info_dict) | ||||||
|  |             if ret is not None: | ||||||
|  |                 return ret | ||||||
|  |  | ||||||
|         return None |         return None | ||||||
|  |  | ||||||
|     @staticmethod |     @staticmethod | ||||||
|   | |||||||
| @@ -23,9 +23,10 @@ from .compat import ( | |||||||
| ) | ) | ||||||
| from .utils import ( | from .utils import ( | ||||||
|     DateRange, |     DateRange, | ||||||
|     DEFAULT_OUTTMPL, |  | ||||||
|     decodeOption, |     decodeOption, | ||||||
|  |     DEFAULT_OUTTMPL, | ||||||
|     DownloadError, |     DownloadError, | ||||||
|  |     match_filter_func, | ||||||
|     MaxDownloadsReached, |     MaxDownloadsReached, | ||||||
|     preferredencoding, |     preferredencoding, | ||||||
|     read_batch_urls, |     read_batch_urls, | ||||||
| @@ -247,6 +248,9 @@ def _real_main(argv=None): | |||||||
|             xattr  # Confuse flake8 |             xattr  # Confuse flake8 | ||||||
|         except ImportError: |         except ImportError: | ||||||
|             parser.error('setting filesize xattr requested but python-xattr is not available') |             parser.error('setting filesize xattr requested but python-xattr is not available') | ||||||
|  |     match_filter = ( | ||||||
|  |         None if opts.match_filter is None | ||||||
|  |         else match_filter_func(opts.match_filter)) | ||||||
|  |  | ||||||
|     ydl_opts = { |     ydl_opts = { | ||||||
|         'usenetrc': opts.usenetrc, |         'usenetrc': opts.usenetrc, | ||||||
| @@ -344,6 +348,7 @@ def _real_main(argv=None): | |||||||
|         'list_thumbnails': opts.list_thumbnails, |         'list_thumbnails': opts.list_thumbnails, | ||||||
|         'playlist_items': opts.playlist_items, |         'playlist_items': opts.playlist_items, | ||||||
|         'xattr_set_filesize': opts.xattr_set_filesize, |         'xattr_set_filesize': opts.xattr_set_filesize, | ||||||
|  |         'match_filter': match_filter, | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     with YoutubeDL(ydl_opts) as ydl: |     with YoutubeDL(ydl_opts) as ydl: | ||||||
|   | |||||||
| @@ -244,6 +244,25 @@ def parseOpts(overrideArguments=None): | |||||||
|         '--max-views', |         '--max-views', | ||||||
|         metavar='COUNT', dest='max_views', default=None, type=int, |         metavar='COUNT', dest='max_views', default=None, type=int, | ||||||
|         help='Do not download any videos with more than COUNT views') |         help='Do not download any videos with more than COUNT views') | ||||||
|  |     selection.add_option( | ||||||
|  |         '--match-filter', | ||||||
|  |         metavar='FILTER', dest='match_filter', default=None, | ||||||
|  |         help=( | ||||||
|  |             '(Experimental) Generic video filter. ' | ||||||
|  |             'Specify any key (see help for -o for a list of available keys) to' | ||||||
|  |             ' match if the key is present, ' | ||||||
|  |             '!key to check if the key is not present,' | ||||||
|  |             'key > NUMBER (like "comment_count > 12", also works with ' | ||||||
|  |             '>=, <, <=, !=, =) to compare against a number, and ' | ||||||
|  |             '& to require multiple matches. ' | ||||||
|  |             'Values which are not known are excluded unless you' | ||||||
|  |             ' put a question mark (?) after the operator.' | ||||||
|  |             'For example, to only match videos that have been liked more than ' | ||||||
|  |             '100 times and disliked less than 50 times (or the dislike ' | ||||||
|  |             'functionality is not available at the given service), but who ' | ||||||
|  |             'also have a description, use  --match-filter ' | ||||||
|  |             '"like_count > 100 & dislike_count <? 50 & description" .' | ||||||
|  |         )) | ||||||
|     selection.add_option( |     selection.add_option( | ||||||
|         '--no-playlist', |         '--no-playlist', | ||||||
|         action='store_true', dest='noplaylist', default=False, |         action='store_true', dest='noplaylist', default=False, | ||||||
|   | |||||||
| @@ -17,6 +17,7 @@ import io | |||||||
| import json | import json | ||||||
| import locale | import locale | ||||||
| import math | import math | ||||||
|  | import operator | ||||||
| import os | import os | ||||||
| import pipes | import pipes | ||||||
| import platform | import platform | ||||||
| @@ -1678,3 +1679,79 @@ def render_table(header_row, data): | |||||||
|     max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)] |     max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)] | ||||||
|     format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s' |     format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s' | ||||||
|     return '\n'.join(format_str % tuple(row) for row in table) |     return '\n'.join(format_str % tuple(row) for row in table) | ||||||
|  |  | ||||||
|  |  | ||||||
# Binary comparison operators accepted in a filter part, mapped to the
# function that implements them.
_COMPARISON_OPERATORS = {
    '<': operator.lt,
    '<=': operator.le,
    '>': operator.gt,
    '>=': operator.ge,
    '=': operator.eq,
    '!=': operator.ne,
}

# Presence checks: a bare key requires a non-None value, '!key' requires the
# value to be missing or None.
_UNARY_OPERATORS = {
    '': lambda v: v is not None,
    '!': lambda v: v is None,
}


def _operator_alternation(operators):
    """Return a regex alternation of the operator tokens, longest first."""
    return '|'.join(
        re.escape(token)
        for token in sorted(operators, key=len, reverse=True))


# Compiled once at import time instead of on every _match_one() call.
# Both patterns end in '$' and are applied with .match(), so a filter part
# has to be consumed completely (surrounding whitespace aside).
_COMPARISON_REX = re.compile(r'''(?x)\s*
    (?P<key>[a-z_]+)
    \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
    (?:
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
    )
    \s*$
    ''' % _operator_alternation(_COMPARISON_OPERATORS))

_UNARY_REX = re.compile(r'''(?x)\s*
    (?P<op>%s)\s*(?P<key>[a-z_]+)
    \s*$
    ''' % _operator_alternation(_UNARY_OPERATORS))


def _match_one(filter_part, dct):
    """Evaluate a single filter part (no '&') against the dictionary dct.

    Supported forms:
      key OP value   OP is one of <, <=, >, >=, =, != and may be followed by
                     '?' to let the part pass when the key is missing or None.
      key            passes if dct has a non-None value for key
      !key           passes if dct has no value (or None) for key

    Returns True or False.
    Raises ValueError if the part is malformed, if an operator other than
    '=' / '!=' is used with a string value, or if a numeric value cannot be
    parsed.
    """
    # .match() anchors at the start of the part; .search() would silently drop
    # a leading prefix that is not part of the key ('Xy > 5' -> 'y > 5').
    m = _COMPARISON_REX.match(filter_part)
    if m:
        op_name = m.group('op')
        op = _COMPARISON_OPERATORS[op_name]
        if m.group('strval') is not None:
            if op_name not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % op_name)
            comparison_value = m.group('strval')
        else:
            raw_number = m.group('intval')
            try:
                comparison_value = int(raw_number)
            except ValueError:
                # Not a plain integer: let parse_filesize try it, first as
                # written and then with a 'B' appended (for values like '1K').
                comparison_value = parse_filesize(raw_number)
                if comparison_value is None:
                    comparison_value = parse_filesize(raw_number + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            raw_number, filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Unknown values only pass when the operator carries a '?'.
            return m.group('none_inclusive') is not None
        return op(actual_value, comparison_value)

    m = _UNARY_REX.match(filter_part)
    if m:
        op = _UNARY_OPERATORS[m.group('op')]
        return op(dct.get(m.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)


def match_str(filter_str, dct):
    """Filter a dictionary with a simple string syntax.

    filter_str is split on '&' and every part must match dct.
    Returns True (= passes the filter) or False.
    Evaluation stops at the first part that does not match, so later parts
    are not evaluated in that case.
    Raises ValueError if an evaluated part is invalid.
    """
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
|  |  | ||||||
|  |  | ||||||
def match_filter_func(filter_str):
    """Wrap filter_str in a callable that takes an info_dict.

    The returned function gives None when the info_dict passes the filter
    (as decided by match_str) and a human-readable skip message otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            # Label the video by title, falling back to its id and finally
            # to the generic word 'video'.
            fallback_label = info_dict.get('id', 'video')
            video_title = info_dict.get('title', fallback_label)
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Philipp Hagemeister
					Philipp Hagemeister