mirror of
https://github.com/arsenetar/dupeguru.git
synced 2025-05-08 09:49:51 +00:00
Concatenate regexes prior to compilation
* Concatenating regexes into one Pattern might yield better performance under (un)certain conditions. * Filenames are tested against regexes with no os.sep in them. This may or may not be what we want to do. An alternative would be to test against the whole (absolute) path of each file, which would filter more aggressively.
This commit is contained in:
parent
2eaf7e7893
commit
9f223f3964
@ -140,7 +140,7 @@ class DupeGuru(Broadcaster):
|
|||||||
self.app_mode = AppMode.Standard
|
self.app_mode = AppMode.Standard
|
||||||
self.discarded_file_count = 0
|
self.discarded_file_count = 0
|
||||||
self.exclude_list = ExcludeList()
|
self.exclude_list = ExcludeList()
|
||||||
self.directories = directories.Directories()
|
self.directories = directories.Directories(self.exclude_list)
|
||||||
self.results = results.Results(self)
|
self.results = results.Results(self)
|
||||||
self.ignore_list = IgnoreList()
|
self.ignore_list = IgnoreList()
|
||||||
# In addition to "app-level" options, this dictionary also holds options that will be
|
# In addition to "app-level" options, this dictionary also holds options that will be
|
||||||
|
@ -13,7 +13,6 @@ from hscommon.path import Path
|
|||||||
from hscommon.util import FileOrPath
|
from hscommon.util import FileOrPath
|
||||||
|
|
||||||
from . import fs
|
from . import fs
|
||||||
from .exclude import ExcludeList
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"Directories",
|
"Directories",
|
||||||
@ -53,17 +52,15 @@ class Directories:
|
|||||||
Then, when the user starts the scan, :meth:`get_files` is called to retrieve all files (wrapped
|
Then, when the user starts the scan, :meth:`get_files` is called to retrieve all files (wrapped
|
||||||
in :mod:`core.fs`) that have to be scanned according to the chosen folders/states.
|
in :mod:`core.fs`) that have to be scanned according to the chosen folders/states.
|
||||||
"""
|
"""
|
||||||
# FIXME: if there is zero item in these sets, the for each loops will yield NOTHING
|
|
||||||
deny_list_str = set()
|
|
||||||
deny_list_re = set()
|
|
||||||
deny_list_re_files = set()
|
|
||||||
|
|
||||||
# ---Override
|
# ---Override
|
||||||
def __init__(self, excluded=ExcludeList()):
|
def __init__(self, exclude_list=None):
|
||||||
self._dirs = []
|
self._dirs = []
|
||||||
# {path: state}
|
# {path: state}
|
||||||
self.states = {}
|
self.states = {}
|
||||||
self._excluded = excluded
|
self._exclude_list = exclude_list
|
||||||
|
if exclude_list is not None:
|
||||||
|
exclude_list._combined_regex = False # TODO make a setter
|
||||||
|
|
||||||
def __contains__(self, path):
|
def __contains__(self, path):
|
||||||
for p in self._dirs:
|
for p in self._dirs:
|
||||||
@ -81,49 +78,58 @@ class Directories:
|
|||||||
return len(self._dirs)
|
return len(self._dirs)
|
||||||
|
|
||||||
# ---Private
|
# ---Private
|
||||||
def _default_state_for_path(self, path, deny_list_re=deny_list_re):
|
def _default_state_for_path(self, path):
|
||||||
|
# New logic with regex filters
|
||||||
|
if self._exclude_list is not None and len(self._exclude_list) > 0:
|
||||||
|
# We iterate even if we only have one item here
|
||||||
|
for denied_path_re in self._exclude_list.compiled_combined:
|
||||||
|
if denied_path_re.match(str(path)):
|
||||||
|
return DirectoryState.Excluded
|
||||||
|
return None
|
||||||
|
# Old default logic, still used during initialization of DirectoryTree:
|
||||||
# Override this in subclasses to specify the state of some special folders.
|
# Override this in subclasses to specify the state of some special folders.
|
||||||
# if path.name.startswith("."): # hidden
|
if path.name.startswith("."):
|
||||||
# return DirectoryState.Excluded
|
return DirectoryState.Excluded
|
||||||
for denied_path_re in deny_list_re:
|
|
||||||
if denied_path_re.match(str(path)):
|
|
||||||
return DirectoryState.Excluded
|
|
||||||
|
|
||||||
def _get_files(self, from_path, fileclasses, j, deny_list_re=deny_list_re_files):
|
def _get_files(self, from_path, fileclasses, j):
|
||||||
for root, dirs, files in os.walk(str(from_path)):
|
for root, dirs, files in os.walk(str(from_path)):
|
||||||
j.check_if_cancelled()
|
j.check_if_cancelled()
|
||||||
root = Path(root)
|
rootPath = Path(root)
|
||||||
state = self.get_state(root)
|
state = self.get_state(root)
|
||||||
if state == DirectoryState.Excluded:
|
if state == DirectoryState.Excluded:
|
||||||
# Recursively get files from folders with lots of subfolder is expensive. However, there
|
# Recursively get files from folders with lots of subfolder is expensive. However, there
|
||||||
# might be a subfolder in this path that is not excluded. What we want to do is to skim
|
# might be a subfolder in this path that is not excluded. What we want to do is to skim
|
||||||
# through self.states and see if we must continue, or we can stop right here to save time
|
# through self.states and see if we must continue, or we can stop right here to save time
|
||||||
if not any(p[: len(root)] == root for p in self.states):
|
if not any(p[: len(rootPath)] == rootPath for p in self.states):
|
||||||
del dirs[:]
|
del dirs[:]
|
||||||
try:
|
try:
|
||||||
if state != DirectoryState.Excluded:
|
if state != DirectoryState.Excluded:
|
||||||
found_files = []
|
# Old logic
|
||||||
for f in files:
|
if self._exclude_list is None or not len(self._exclude_list):
|
||||||
found = False
|
found_files = [fs.get_file(rootPath + f, fileclasses=fileclasses) for f in files]
|
||||||
for expr in deny_list_re:
|
else:
|
||||||
found = expr.match(f)
|
found_files = []
|
||||||
if found:
|
for f in files:
|
||||||
break
|
found = False
|
||||||
if not found:
|
for expr in self._exclude_list.compiled_files_combined:
|
||||||
found_files.append(fs.get_file(root + f, fileclasses=fileclasses))
|
found = expr.match(f)
|
||||||
|
if found:
|
||||||
|
break
|
||||||
|
if not found:
|
||||||
|
found_files.append(fs.get_file(rootPath + f, fileclasses=fileclasses))
|
||||||
found_files = [f for f in found_files if f is not None]
|
found_files = [f for f in found_files if f is not None]
|
||||||
# In some cases, directories can be considered as files by dupeGuru, which is
|
# In some cases, directories can be considered as files by dupeGuru, which is
|
||||||
# why we have this line below. In fact, there only one case: Bundle files under
|
# why we have this line below. In fact, there only one case: Bundle files under
|
||||||
# OS X... In other situations, this forloop will do nothing.
|
# OS X... In other situations, this forloop will do nothing.
|
||||||
for d in dirs[:]:
|
for d in dirs[:]:
|
||||||
f = fs.get_file(root + d, fileclasses=fileclasses)
|
f = fs.get_file(rootPath + d, fileclasses=fileclasses)
|
||||||
if f is not None:
|
if f is not None:
|
||||||
found_files.append(f)
|
found_files.append(f)
|
||||||
dirs.remove(d)
|
dirs.remove(d)
|
||||||
logging.debug(
|
logging.debug(
|
||||||
"Collected %d files in folder %s",
|
"Collected %d files in folder %s",
|
||||||
len(found_files),
|
len(found_files),
|
||||||
str(root),
|
str(rootPath),
|
||||||
)
|
)
|
||||||
for file in found_files:
|
for file in found_files:
|
||||||
file.is_ref = state == DirectoryState.Reference
|
file.is_ref = state == DirectoryState.Reference
|
||||||
@ -131,7 +137,7 @@ class Directories:
|
|||||||
except (EnvironmentError, fs.InvalidPath):
|
except (EnvironmentError, fs.InvalidPath):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def _get_folders(self, from_folder, j, deny_list_re=deny_list_re):
|
def _get_folders(self, from_folder, j):
|
||||||
j.check_if_cancelled()
|
j.check_if_cancelled()
|
||||||
try:
|
try:
|
||||||
for subfolder in from_folder.subfolders:
|
for subfolder in from_folder.subfolders:
|
||||||
@ -177,7 +183,7 @@ class Directories:
|
|||||||
except EnvironmentError:
|
except EnvironmentError:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
def get_files(self, fileclasses=None, j=job.nulljob, deny_list_re=deny_list_re_files):
|
def get_files(self, fileclasses=None, j=job.nulljob):
|
||||||
"""Returns a list of all files that are not excluded.
|
"""Returns a list of all files that are not excluded.
|
||||||
|
|
||||||
Returned files also have their ``is_ref`` attr set if applicable.
|
Returned files also have their ``is_ref`` attr set if applicable.
|
||||||
@ -185,7 +191,7 @@ class Directories:
|
|||||||
if fileclasses is None:
|
if fileclasses is None:
|
||||||
fileclasses = [fs.File]
|
fileclasses = [fs.File]
|
||||||
for path in self._dirs:
|
for path in self._dirs:
|
||||||
for file in self._get_files(path, fileclasses=fileclasses, j=j, deny_list_re=deny_list_re):
|
for file in self._get_files(path, fileclasses=fileclasses, j=j):
|
||||||
yield file
|
yield file
|
||||||
|
|
||||||
def get_folders(self, folderclass=None, j=job.nulljob):
|
def get_folders(self, folderclass=None, j=job.nulljob):
|
||||||
@ -200,7 +206,7 @@ class Directories:
|
|||||||
for folder in self._get_folders(from_folder, j):
|
for folder in self._get_folders(from_folder, j):
|
||||||
yield folder
|
yield folder
|
||||||
|
|
||||||
def get_state(self, path, deny_list_re=deny_list_re):
|
def get_state(self, path):
|
||||||
"""Returns the state of ``path``.
|
"""Returns the state of ``path``.
|
||||||
|
|
||||||
:rtype: :class:`DirectoryState`
|
:rtype: :class:`DirectoryState`
|
||||||
@ -208,7 +214,7 @@ class Directories:
|
|||||||
# direct match? easy result.
|
# direct match? easy result.
|
||||||
if path in self.states:
|
if path in self.states:
|
||||||
return self.states[path]
|
return self.states[path]
|
||||||
state = self._default_state_for_path(path, deny_list_re) or DirectoryState.Normal
|
state = self._default_state_for_path(path) or DirectoryState.Normal
|
||||||
prevlen = 0
|
prevlen = 0
|
||||||
# we loop through the states to find the longest matching prefix
|
# we loop through the states to find the longest matching prefix
|
||||||
for p, s in self.states.items():
|
for p, s in self.states.items():
|
||||||
|
@ -4,6 +4,8 @@
|
|||||||
|
|
||||||
from .markable import Markable
|
from .markable import Markable
|
||||||
from xml.etree import ElementTree as ET
|
from xml.etree import ElementTree as ET
|
||||||
|
# TODO: perhaps use regex module for better Unicode support? https://pypi.org/project/regex/
|
||||||
|
# or perhaps also https://pypi.org/project/re2/
|
||||||
import re
|
import re
|
||||||
from os import sep
|
from os import sep
|
||||||
import logging
|
import logging
|
||||||
@ -50,14 +52,18 @@ class ExcludeList(Markable):
|
|||||||
The downside is we have to compare strings every time we look for an item in the list
|
The downside is we have to compare strings every time we look for an item in the list
|
||||||
since we use regex strings as keys.
|
since we use regex strings as keys.
|
||||||
[regex:str, compilable:bool, error:Exception, compiled:Pattern])
|
[regex:str, compilable:bool, error:Exception, compiled:Pattern])
|
||||||
|
If combined_regex is True, the compiled regexes will be combined into one Pattern
|
||||||
|
instead of returned as separate Patterns.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# ---Override
|
# ---Override
|
||||||
def __init__(self):
|
def __init__(self, combined_regex=False):
|
||||||
Markable.__init__(self)
|
Markable.__init__(self)
|
||||||
|
self._combined_regex = combined_regex
|
||||||
self._excluded = []
|
self._excluded = []
|
||||||
self._count = 0
|
self._count = 0
|
||||||
self._excluded_compiled = set()
|
self._excluded_compiled = set()
|
||||||
|
self._dirty = True
|
||||||
|
|
||||||
def __debug_test(self):
|
def __debug_test(self):
|
||||||
self.test_regexes = [
|
self.test_regexes = [
|
||||||
@ -81,30 +87,38 @@ class ExcludeList(Markable):
|
|||||||
yield self.is_marked(regex), regex
|
yield self.is_marked(regex), regex
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return self._count
|
"""Returns the number of marked regexes."""
|
||||||
|
return len([x for marked, x in self if marked])
|
||||||
|
|
||||||
def is_markable(self, regex):
|
def is_markable(self, regex):
|
||||||
return self._is_markable(regex)
|
return self._is_markable(regex)
|
||||||
|
|
||||||
def _is_markable(self, regex):
|
def _is_markable(self, regex):
|
||||||
"""Return the cached result of "compilable" property"""
|
"""Return the cached result of "compilable" property"""
|
||||||
# FIXME save result of compilation via memoization
|
|
||||||
# return self._excluded.get(regex)[0]
|
|
||||||
for item in self._excluded:
|
for item in self._excluded:
|
||||||
if item[0] == regex:
|
if item[0] == regex:
|
||||||
return item[1]
|
return item[1]
|
||||||
return False # FIXME should not be needed
|
return False # should not be needed
|
||||||
|
|
||||||
def _did_mark(self, regex):
|
def _did_mark(self, regex):
|
||||||
|
self._add_compiled(regex)
|
||||||
|
|
||||||
|
def _did_unmark(self, regex):
|
||||||
|
self._remove_compiled(regex)
|
||||||
|
|
||||||
|
def _add_compiled(self, regex):
|
||||||
|
if self._combined_regex:
|
||||||
|
self._dirty = True
|
||||||
|
return
|
||||||
for item in self._excluded:
|
for item in self._excluded:
|
||||||
if item[0] == regex:
|
if item[0] == regex:
|
||||||
# no need to test if already present since it's a set()
|
# no need to test if already present since it's a set()
|
||||||
self._excluded_compiled.add(item[3])
|
self._excluded_compiled.add(item[3])
|
||||||
|
|
||||||
def _did_unmark(self, regex):
|
|
||||||
self._remove_compiled(regex)
|
|
||||||
|
|
||||||
def _remove_compiled(self, regex):
|
def _remove_compiled(self, regex):
|
||||||
|
if self._combined_regex:
|
||||||
|
self._dirty = True
|
||||||
|
return
|
||||||
for item in self._excluded_compiled:
|
for item in self._excluded_compiled:
|
||||||
if regex in item.pattern:
|
if regex in item.pattern:
|
||||||
self._excluded_compiled.remove(item)
|
self._excluded_compiled.remove(item)
|
||||||
@ -137,13 +151,41 @@ class ExcludeList(Markable):
|
|||||||
@property
|
@property
|
||||||
def compiled(self):
|
def compiled(self):
|
||||||
"""Should be used by other classes to retrieve the up-to-date list of patterns."""
|
"""Should be used by other classes to retrieve the up-to-date list of patterns."""
|
||||||
return self._excluded_compiled
|
if not self._combined_regex:
|
||||||
|
return self._excluded_compiled
|
||||||
|
else:
|
||||||
|
return self.compiled_combined
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def compiled_files(self):
|
def compiled_files(self):
|
||||||
"""Should be used by other classes to retrieve the up-to-date list of patterns
|
"""Should be used by other classes to retrieve the up-to-date list of patterns
|
||||||
for files only."""
|
for files only."""
|
||||||
return [compiled_pattern for compiled_pattern in self.compiled if sep not in compiled_pattern.pattern]
|
if not self._combined_regex:
|
||||||
|
# Return each compiled element separately
|
||||||
|
# return [compiled_pattern for compiled_pattern in self.compiled if sep not in compiled_pattern.pattern]
|
||||||
|
for compiled in self.compiled:
|
||||||
|
if sep not in compiled.pattern:
|
||||||
|
yield compiled
|
||||||
|
else:
|
||||||
|
return self.compiled_files_combined
|
||||||
|
|
||||||
|
@property
|
||||||
|
def compiled_combined(self):
|
||||||
|
if self._dirty:
|
||||||
|
self._cached_compiled_combined =\
|
||||||
|
re.compile('|'.join(x for marked, x in self if marked))
|
||||||
|
# Must compute the filtered out version as well
|
||||||
|
self._cached_compiled_combined_files =\
|
||||||
|
re.compile('|'.join(x for marked, x in self
|
||||||
|
if marked and sep not in x))
|
||||||
|
self._dirty = False
|
||||||
|
# returned as a tuple to get a free iterator and to avoid subclassing
|
||||||
|
return (self._cached_compiled_combined,)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def compiled_files_combined(self):
|
||||||
|
# returned as a tuple to get a free iterator and to avoid subclassing
|
||||||
|
return (self._cached_compiled_combined_files,)
|
||||||
|
|
||||||
# ---Public
|
# ---Public
|
||||||
def add(self, regex, forced=False):
|
def add(self, regex, forced=False):
|
||||||
@ -164,7 +206,7 @@ class ExcludeList(Markable):
|
|||||||
def _do_add(self, regex, iscompilable, exception, compiled):
|
def _do_add(self, regex, iscompilable, exception, compiled):
|
||||||
# We need to insert at the top
|
# We need to insert at the top
|
||||||
self._excluded.insert(0, [regex, iscompilable, exception, compiled])
|
self._excluded.insert(0, [regex, iscompilable, exception, compiled])
|
||||||
self._count = len(self._excluded)
|
# self._count = len(self._excluded)
|
||||||
|
|
||||||
def isExcluded(self, regex):
|
def isExcluded(self, regex):
|
||||||
for item in self._excluded:
|
for item in self._excluded:
|
||||||
@ -174,7 +216,6 @@ class ExcludeList(Markable):
|
|||||||
|
|
||||||
def clear(self):
|
def clear(self):
|
||||||
self._excluded = []
|
self._excluded = []
|
||||||
self._count = 0
|
|
||||||
|
|
||||||
def remove(self, regex):
|
def remove(self, regex):
|
||||||
for item in self._excluded:
|
for item in self._excluded:
|
||||||
@ -286,9 +327,6 @@ class ExcludeDict(ExcludeList):
|
|||||||
for regex in ordered_keys(self._excluded):
|
for regex in ordered_keys(self._excluded):
|
||||||
yield self.is_marked(regex), regex
|
yield self.is_marked(regex), regex
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
return self._count
|
|
||||||
|
|
||||||
def is_markable(self, regex):
|
def is_markable(self, regex):
|
||||||
return self._is_markable(regex)
|
return self._is_markable(regex)
|
||||||
|
|
||||||
@ -299,17 +337,16 @@ class ExcludeDict(ExcludeList):
|
|||||||
return exists.get("compilable")
|
return exists.get("compilable")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _did_mark(self, regex):
|
def _add_compiled(self, regex):
|
||||||
# self._excluded[regex][0] = True # is compilable
|
if self._combined_regex:
|
||||||
|
self._dirty = True
|
||||||
|
return
|
||||||
try:
|
try:
|
||||||
self._excluded_compiled.add(self._excluded[regex]["compiled"])
|
self._excluded_compiled.add(self._excluded[regex]["compiled"])
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Exception while adding regex {regex} to compiled set: {e}")
|
print(f"Exception while adding regex {regex} to compiled set: {e}")
|
||||||
return
|
return
|
||||||
|
|
||||||
def _did_unmark(self, regex):
|
|
||||||
self._remove_compiled(regex)
|
|
||||||
|
|
||||||
def is_compilable(self, regex):
|
def is_compilable(self, regex):
|
||||||
"""Returns the cached "compilable" value"""
|
"""Returns the cached "compilable" value"""
|
||||||
return self._excluded[regex]["compilable"]
|
return self._excluded[regex]["compilable"]
|
||||||
@ -318,24 +355,13 @@ class ExcludeDict(ExcludeList):
|
|||||||
"""Return the compilation error message for regex string"""
|
"""Return the compilation error message for regex string"""
|
||||||
return self._excluded.get(regex).get("error")
|
return self._excluded.get(regex).get("error")
|
||||||
|
|
||||||
@property
|
|
||||||
def compiled(self):
|
|
||||||
"""Should be used by other classes to retrieve the up-to-date list of patterns."""
|
|
||||||
return self._excluded_compiled
|
|
||||||
|
|
||||||
@property
|
|
||||||
def compiled_files(self):
|
|
||||||
"""Should be used by other classes to retrieve the up-to-date list of patterns
|
|
||||||
for files only."""
|
|
||||||
return [compiled_pattern for compiled_pattern in self.compiled if sep not in compiled_pattern.pattern]
|
|
||||||
|
|
||||||
# ---Public
|
# ---Public
|
||||||
def _do_add(self, regex, iscompilable, exception, compiled):
|
def _do_add(self, regex, iscompilable, exception, compiled):
|
||||||
# We always insert at the top, so index should be 0 and other indices should be pushed by one
|
# We always insert at the top, so index should be 0 and other indices should be pushed by one
|
||||||
for value in self._excluded.values():
|
for value in self._excluded.values():
|
||||||
value["index"] += 1
|
value["index"] += 1
|
||||||
self._excluded[regex] = {"index": 0, "compilable": iscompilable, "error": exception, "compiled": compiled}
|
self._excluded[regex] = {"index": 0, "compilable": iscompilable, "error": exception, "compiled": compiled}
|
||||||
self._count = len(self._excluded)
|
# self._count = len(self._excluded)
|
||||||
|
|
||||||
def isExcluded(self, regex):
|
def isExcluded(self, regex):
|
||||||
if regex in self._excluded.keys():
|
if regex in self._excluded.keys():
|
||||||
@ -344,13 +370,13 @@ class ExcludeDict(ExcludeList):
|
|||||||
|
|
||||||
def clear(self):
|
def clear(self):
|
||||||
self._excluded = {}
|
self._excluded = {}
|
||||||
self._count = 0
|
|
||||||
|
|
||||||
def remove(self, regex):
|
def remove(self, regex):
|
||||||
old_value = self._excluded.pop(regex)
|
old_value = self._excluded.pop(regex)
|
||||||
# Bring down all indices which where above it
|
# Bring down all indices which where above it
|
||||||
index = old_value["index"]
|
index = old_value["index"]
|
||||||
if index == len(self._excluded):
|
if index == len(self._excluded) - 1: # we start at 0...
|
||||||
|
# Old index was at the end, no need to update other indices
|
||||||
self._remove_compiled(regex)
|
self._remove_compiled(regex)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -68,10 +68,13 @@ class ExcludeListDialog(QDialog):
|
|||||||
gridlayout.addItem(QSpacerItem(0, 0, QSizePolicy.Minimum, QSizePolicy.Expanding), 4, 1)
|
gridlayout.addItem(QSpacerItem(0, 0, QSizePolicy.Minimum, QSizePolicy.Expanding), 4, 1)
|
||||||
gridlayout.addWidget(self.buttonClose, 5, 1)
|
gridlayout.addWidget(self.buttonClose, 5, 1)
|
||||||
layout.addLayout(gridlayout)
|
layout.addLayout(gridlayout)
|
||||||
|
self.linedit.setPlaceholderText("Type a regular expression here...")
|
||||||
|
self.linedit.setFocus()
|
||||||
|
|
||||||
# --- model --> view
|
# --- model --> view
|
||||||
def show(self):
|
def show(self):
|
||||||
super().show()
|
super().show()
|
||||||
|
self.linedit.setFocus()
|
||||||
|
|
||||||
@pyqtSlot()
|
@pyqtSlot()
|
||||||
def addStringFromLineEdit(self):
|
def addStringFromLineEdit(self):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user