diff --git a/core/app.py b/core/app.py
index 46cff0a8..ee31a114 100644
--- a/core/app.py
+++ b/core/app.py
@@ -140,7 +140,7 @@ class DupeGuru(Broadcaster):
         self.app_mode = AppMode.Standard
         self.discarded_file_count = 0
         self.exclude_list = ExcludeList()
-        self.directories = directories.Directories()
+        self.directories = directories.Directories(self.exclude_list)
         self.results = results.Results(self)
         self.ignore_list = IgnoreList()
         # In addition to "app-level" options, this dictionary also holds options that will be
diff --git a/core/directories.py b/core/directories.py
index 5f465818..781ced90 100644
--- a/core/directories.py
+++ b/core/directories.py
@@ -13,7 +13,6 @@ from hscommon.path import Path
 from hscommon.util import FileOrPath
 
 from . import fs
-from .exclude import ExcludeList
 
 __all__ = [
     "Directories",
@@ -53,17 +52,15 @@ class Directories:
     Then, when the user starts the scan, :meth:`get_files` is called to retrieve all files (wrapped
     in :mod:`core.fs`) that have to be scanned according to the chosen folders/states.
     """
-    # FIXME: if there is zero item in these sets, the for each loops will yield NOTHING
-    deny_list_str = set()
-    deny_list_re = set()
-    deny_list_re_files = set()
 
     # ---Override
-    def __init__(self, excluded=ExcludeList()):
+    def __init__(self, exclude_list=None):
         self._dirs = []
         # {path: state}
         self.states = {}
-        self._excluded = excluded
+        self._exclude_list = exclude_list
+        if exclude_list is not None:
+            exclude_list._combined_regex = False  # TODO make a setter
 
     def __contains__(self, path):
         for p in self._dirs:
@@ -81,49 +78,58 @@ class Directories:
         return len(self._dirs)
 
     # ---Private
-    def _default_state_for_path(self, path, deny_list_re=deny_list_re):
+    def _default_state_for_path(self, path):
+        # New logic with regex filters
+        if self._exclude_list is not None and len(self._exclude_list) > 0:
+            # We iterate even if we only have one item here
+            for denied_path_re in self._exclude_list.compiled_combined:
+                if denied_path_re.match(str(path)):
+                    return DirectoryState.Excluded
+            return None
+        # Old default logic, still used during initialization of DirectoryTree:
         # Override this in subclasses to specify the state of some special folders.
-        # if path.name.startswith("."):  # hidden
-        #     return DirectoryState.Excluded
-        for denied_path_re in deny_list_re:
-            if denied_path_re.match(str(path)):
-                return DirectoryState.Excluded
+        if path.name.startswith("."):
+            return DirectoryState.Excluded
 
-    def _get_files(self, from_path, fileclasses, j, deny_list_re=deny_list_re_files):
+    def _get_files(self, from_path, fileclasses, j):
         for root, dirs, files in os.walk(str(from_path)):
             j.check_if_cancelled()
-            root = Path(root)
-            state = self.get_state(root)
+            rootPath = Path(root)
+            state = self.get_state(rootPath)  # the Path, not the raw os.walk() string
             if state == DirectoryState.Excluded:
                 # Recursively get files from folders with lots of subfolder is expensive. However, there
                 # might be a subfolder in this path that is not excluded. What we want to do is to skim
                 # through self.states and see if we must continue, or we can stop right here to save time
-                if not any(p[: len(root)] == root for p in self.states):
+                if not any(p[: len(rootPath)] == rootPath for p in self.states):
                     del dirs[:]
             try:
                 if state != DirectoryState.Excluded:
-                    found_files = []
-                    for f in files:
-                        found = False
-                        for expr in deny_list_re:
-                            found = expr.match(f)
-                            if found:
-                                break
-                        if not found:
-                            found_files.append(fs.get_file(root + f, fileclasses=fileclasses))
+                    # Old logic
+                    if self._exclude_list is None or not len(self._exclude_list):
+                        found_files = [fs.get_file(rootPath + f, fileclasses=fileclasses) for f in files]
+                    else:
+                        found_files = []
+                        for f in files:
+                            found = False
+                            for expr in self._exclude_list.compiled_files_combined:
+                                found = expr.match(f)
+                                if found:
+                                    break
+                            if not found:
+                                found_files.append(fs.get_file(rootPath + f, fileclasses=fileclasses))
                     found_files = [f for f in found_files if f is not None]
                     # In some cases, directories can be considered as files by dupeGuru, which is
                     # why we have this line below. In fact, there only one case: Bundle files under
                     # OS X... In other situations, this forloop will do nothing.
                     for d in dirs[:]:
-                        f = fs.get_file(root + d, fileclasses=fileclasses)
+                        f = fs.get_file(rootPath + d, fileclasses=fileclasses)
                         if f is not None:
                             found_files.append(f)
                             dirs.remove(d)
                 logging.debug(
                     "Collected %d files in folder %s",
                     len(found_files),
-                    str(root),
+                    str(rootPath),
                 )
                 for file in found_files:
                     file.is_ref = state == DirectoryState.Reference
@@ -131,7 +137,7 @@ class Directories:
             except (EnvironmentError, fs.InvalidPath):
                 pass
 
-    def _get_folders(self, from_folder, j, deny_list_re=deny_list_re):
+    def _get_folders(self, from_folder, j):
         j.check_if_cancelled()
         try:
             for subfolder in from_folder.subfolders:
@@ -177,7 +183,7 @@ class Directories:
         except EnvironmentError:
             return []
 
-    def get_files(self, fileclasses=None, j=job.nulljob, deny_list_re=deny_list_re_files):
+    def get_files(self, fileclasses=None, j=job.nulljob):
         """Returns a list of all files that are not excluded.
 
         Returned files also have their ``is_ref`` attr set if applicable.
@@ -185,7 +191,7 @@ class Directories:
         if fileclasses is None:
             fileclasses = [fs.File]
         for path in self._dirs:
-            for file in self._get_files(path, fileclasses=fileclasses, j=j, deny_list_re=deny_list_re):
+            for file in self._get_files(path, fileclasses=fileclasses, j=j):
                 yield file
 
     def get_folders(self, folderclass=None, j=job.nulljob):
@@ -200,7 +206,7 @@ class Directories:
             for folder in self._get_folders(from_folder, j):
                 yield folder
 
-    def get_state(self, path, deny_list_re=deny_list_re):
+    def get_state(self, path):
         """Returns the state of ``path``.
 
         :rtype: :class:`DirectoryState`
@@ -208,7 +214,7 @@ class Directories:
         # direct match? easy result.
         if path in self.states:
             return self.states[path]
-        state = self._default_state_for_path(path, deny_list_re) or DirectoryState.Normal
+        state = self._default_state_for_path(path) or DirectoryState.Normal
         prevlen = 0
         # we loop through the states to find the longest matching prefix
         for p, s in self.states.items():
diff --git a/core/exclude.py b/core/exclude.py
index 74c44c2d..29bab540 100644
--- a/core/exclude.py
+++ b/core/exclude.py
@@ -4,6 +4,8 @@
 
 from .markable import Markable
 from xml.etree import ElementTree as ET
+# TODO: perhaps use regex module for better Unicode support? https://pypi.org/project/regex/
+# or perhaps also https://pypi.org/project/re2/
 import re
 from os import sep
 import logging
@@ -50,14 +52,18 @@ class ExcludeList(Markable):
     The downside is we have to compare strings every time we look for an item in the list
     since we use regex strings as keys.
     [regex:str, compilable:bool, error:Exception, compiled:Pattern])
+    If combined_regex is True, the compiled regexes will be combined into one Pattern
+    instead of returned as separate Patterns.
     """
 
     # ---Override
-    def __init__(self):
+    def __init__(self, combined_regex=False):
         Markable.__init__(self)
+        self._combined_regex = combined_regex
         self._excluded = []
         self._count = 0
         self._excluded_compiled = set()
+        self._dirty = True
 
     def __debug_test(self):
         self.test_regexes = [
@@ -81,30 +87,38 @@ class ExcludeList(Markable):
             yield self.is_marked(regex), regex
 
     def __len__(self):
-        return self._count
+        """Returns the number of marked regexes."""
+        return sum(1 for marked, _ in self if marked)
 
     def is_markable(self, regex):
         return self._is_markable(regex)
 
     def _is_markable(self, regex):
         """Return the cached result of "compilable" property"""
-        # FIXME save result of compilation via memoization
-        # return self._excluded.get(regex)[0]
         for item in self._excluded:
             if item[0] == regex:
                 return item[1]
-        return False  # FIXME should not be needed
+        return False  # should not be needed
 
     def _did_mark(self, regex):
+        self._add_compiled(regex)
+
+    def _did_unmark(self, regex):
+        self._remove_compiled(regex)
+
+    def _add_compiled(self, regex):
+        self._dirty = True  # the combined patterns are cached in both modes
+        if self._combined_regex:
+            return
         for item in self._excluded:
             if item[0] == regex:
                 # no need to test if already present since it's a set()
                 self._excluded_compiled.add(item[3])
 
-    def _did_unmark(self, regex):
-        self._remove_compiled(regex)
-
     def _remove_compiled(self, regex):
+        self._dirty = True  # the combined patterns are cached in both modes
+        if self._combined_regex:
+            return
         for item in self._excluded_compiled:
             if regex in item.pattern:
                 self._excluded_compiled.remove(item)
@@ -137,13 +151,44 @@ class ExcludeList(Markable):
     @property
     def compiled(self):
         """Should be used by other classes to retrieve the up-to-date list of patterns."""
-        return self._excluded_compiled
+        if not self._combined_regex:
+            return self._excluded_compiled
+        else:
+            return self.compiled_combined
 
     @property
     def compiled_files(self):
         """Should be used by other classes to retrieve the up-to-date list of patterns
         for files only."""
+        if self._combined_regex:
+            return self.compiled_files_combined
+        # Keep returning a plain list: mixing ``yield`` with ``return value`` would turn
+        # this property into a generator whose combined branch yields nothing.
         return [compiled_pattern for compiled_pattern in self.compiled if sep not in compiled_pattern.pattern]
+
+    def _rebuild_combined(self):
+        """Recompile both cached combined patterns if the marked regexes changed."""
+        if not self._dirty:
+            return
+        marked = [x for is_marked, x in self if is_marked]
+        files_only = [x for x in marked if sep not in x]
+        # An empty alternation compiles to a pattern that matches every string, which
+        # would exclude everything: expose an empty tuple when there is nothing to match.
+        self._cached_compiled_combined = (re.compile("|".join(marked)),) if marked else ()
+        self._cached_compiled_combined_files = (re.compile("|".join(files_only)),) if files_only else ()
+        self._dirty = False
+
+    @property
+    def compiled_combined(self):
+        # returned as a tuple to get a free iterator and to avoid subclassing
+        self._rebuild_combined()
+        return self._cached_compiled_combined
+
+    @property
+    def compiled_files_combined(self):
+        # returned as a tuple to get a free iterator and to avoid subclassing
+        self._rebuild_combined()
+        return self._cached_compiled_combined_files
 
     # ---Public
     def add(self, regex, forced=False):
@@ -164,7 +209,7 @@ class ExcludeList(Markable):
     def _do_add(self, regex, iscompilable, exception, compiled):
         # We need to insert at the top
         self._excluded.insert(0, [regex, iscompilable, exception, compiled])
-        self._count = len(self._excluded)
+        # self._count = len(self._excluded)
 
     def isExcluded(self, regex):
         for item in self._excluded:
@@ -174,7 +219,7 @@ class ExcludeList(Markable):
 
     def clear(self):
         self._excluded = []
-        self._count = 0
+        self._dirty = True
 
     def remove(self, regex):
         for item in self._excluded:
@@ -286,9 +331,6 @@ class ExcludeDict(ExcludeList):
         for regex in ordered_keys(self._excluded):
             yield self.is_marked(regex), regex
 
-    def __len__(self):
-        return self._count
-
     def is_markable(self, regex):
         return self._is_markable(regex)
 
@@ -299,17 +341,16 @@ class ExcludeDict(ExcludeList):
             return exists.get("compilable")
         return False
 
-    def _did_mark(self, regex):
-        # self._excluded[regex][0] = True  # is compilable
+    def _add_compiled(self, regex):
+        self._dirty = True  # the combined patterns are cached in both modes
+        if self._combined_regex:
+            return
         try:
             self._excluded_compiled.add(self._excluded[regex]["compiled"])
         except Exception as e:
             print(f"Exception while adding regex {regex} to compiled set: {e}")
             return
 
-    def _did_unmark(self, regex):
-        self._remove_compiled(regex)
-
     def is_compilable(self, regex):
         """Returns the cached "compilable" value"""
         return self._excluded[regex]["compilable"]
@@ -318,24 +359,13 @@ class ExcludeDict(ExcludeList):
         """Return the compilation error message for regex string"""
         return self._excluded.get(regex).get("error")
 
-    @property
-    def compiled(self):
-        """Should be used by other classes to retrieve the up-to-date list of patterns."""
-        return self._excluded_compiled
-
-    @property
-    def compiled_files(self):
-        """Should be used by other classes to retrieve the up-to-date list of patterns
-        for files only."""
-        return [compiled_pattern for compiled_pattern in self.compiled if sep not in compiled_pattern.pattern]
-
     # ---Public
     def _do_add(self, regex, iscompilable, exception, compiled):
         # We always insert at the top, so index should be 0 and other indices should be pushed by one
         for value in self._excluded.values():
             value["index"] += 1
         self._excluded[regex] = {"index": 0, "compilable": iscompilable, "error": exception, "compiled": compiled}
-        self._count = len(self._excluded)
+        # self._count = len(self._excluded)
 
     def isExcluded(self, regex):
         if regex in self._excluded.keys():
@@ -344,13 +374,15 @@ class ExcludeDict(ExcludeList):
 
     def clear(self):
         self._excluded = {}
-        self._count = 0
+        self._dirty = True
 
     def remove(self, regex):
         old_value = self._excluded.pop(regex)
         # Bring down all indices which where above it
         index = old_value["index"]
         if index == len(self._excluded):
+            # pop() above already shrank the dict, so the last index equals its new length.
+            # Old index was at the end, no need to update other indices
             self._remove_compiled(regex)
             return
 
diff --git a/qt/exclude_list_dialog.py b/qt/exclude_list_dialog.py
index 96389568..f251d1e2 100644
--- a/qt/exclude_list_dialog.py
+++ b/qt/exclude_list_dialog.py
@@ -68,10 +68,13 @@ class ExcludeListDialog(QDialog):
         gridlayout.addItem(QSpacerItem(0, 0, QSizePolicy.Minimum, QSizePolicy.Expanding), 4, 1)
         gridlayout.addWidget(self.buttonClose, 5, 1)
         layout.addLayout(gridlayout)
+        self.linedit.setPlaceholderText("Type a regular expression here...")
+        self.linedit.setFocus()
 
     # --- model --> view
     def show(self):
         super().show()
+        self.linedit.setFocus()
 
     @pyqtSlot()
     def addStringFromLineEdit(self):