From ab8750eedbeb7be2779a49ff2da7cf8c460d9596 Mon Sep 17 00:00:00 2001 From: glubsy Date: Thu, 17 Jun 2021 03:49:59 +0200 Subject: [PATCH 1/2] Fix partial regex match yielding false positive --- core/directories.py | 20 +++++++++++++------- core/gui/exclude_list_dialog.py | 2 +- qt/exclude_list_dialog.py | 3 +-- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/core/directories.py b/core/directories.py index 7cd103fc..78883662 100644 --- a/core/directories.py +++ b/core/directories.py @@ -108,18 +108,24 @@ class Directories: found_files = [] # print(f"len of files: {len(files)} {files}") for f in files: - found = False + matched = False for expr in self._exclude_list.compiled_files: - if expr.match(f): - found = True + if expr.fullmatch(f): + logging.debug(f"{expr} matched {f}.") + matched = True break - if not found: + if not matched: + logging.debug(f"path {root + os.sep + f}") for expr in self._exclude_list.compiled_paths: - if expr.match(root + os.sep + f): - found = True + if expr.fullmatch(root + os.sep + f): + print(f"{expr} matched {root}{os.sep}{f}.") + matched = True break - if not found: + if not matched: + logging.debug(f"Not filtering: {f}.") found_files.append(fs.get_file(rootPath + f, fileclasses=fileclasses)) + else: + logging.debug(f"Filtering: {f}") found_files = [f for f in found_files if f is not None] # In some cases, directories can be considered as files by dupeGuru, which is # why we have this line below. In fact, there only one case: Bundle files under diff --git a/core/gui/exclude_list_dialog.py b/core/gui/exclude_list_dialog.py index b56ea7d2..82e219a2 100644 --- a/core/gui/exclude_list_dialog.py +++ b/core/gui/exclude_list_dialog.py @@ -56,7 +56,7 @@ class ExcludeListDialogCore: matched = False for row in self.exclude_list_table.rows: compiled_regex = self.exclude_list.get_compiled(row.regex) - if compiled_regex and compiled_regex.match(test_string): + if compiled_regex and compiled_regex.fullmatch(test_string): matched = True row.highlight = True else: diff --git a/qt/exclude_list_dialog.py b/qt/exclude_list_dialog.py index 855d6ac4..63be5a4e 100644 --- a/qt/exclude_list_dialog.py +++ b/qt/exclude_list_dialog.py @@ -129,8 +129,7 @@ class ExcludeListDialog(QDialog): except re.error: self.reset_input_style() return - match = compiled.match(input_text) - if match: + if compiled.fullmatch(input_text): self._input_styled = True self.inputLine.setStyleSheet("background-color: rgb(10, 200, 10);") else: From a6f83ad3d7e002371c30d97f09af9c9afb95c95a Mon Sep 17 00:00:00 2001 From: glubsy Date: Sat, 19 Jun 2021 01:52:31 +0200 Subject: [PATCH 2/2] Fix missing regexp after rename * Doing a full match should be safer to avoid partial results which would result in overly aggressive filtering. * Add new tests to test suite to cover this issue. * Fixes #903. --- core/directories.py | 21 +---- core/exclude.py | 65 ++++++++----- core/gui/exclude_list_dialog.py | 33 ++++++- core/gui/exclude_list_table.py | 2 +- core/tests/directories_test.py | 18 ++++ core/tests/exclude_test.py | 156 ++++++++++++++++++++++++++++++++ qt/exclude_list_dialog.py | 10 +- 7 files changed, 254 insertions(+), 51 deletions(-) diff --git a/core/directories.py b/core/directories.py index 78883662..9b4293b0 100644 --- a/core/directories.py +++ b/core/directories.py @@ -108,24 +108,9 @@ class Directories: found_files = [] # print(f"len of files: {len(files)} {files}") for f in files: - matched = False - for expr in self._exclude_list.compiled_files: - if expr.fullmatch(f): - logging.debug(f"{expr} matched {f}.") - matched = True - break - if not matched: - logging.debug(f"path {root + os.sep + f}") - for expr in self._exclude_list.compiled_paths: - if expr.fullmatch(root + os.sep + f): - print(f"{expr} matched {root}{os.sep}{f}.") - matched = True - break - if not matched: - logging.debug(f"Not filtering: {f}.") - found_files.append(fs.get_file(rootPath + f, fileclasses=fileclasses)) - else: - logging.debug(f"Filtering: {f}") + if not self._exclude_list.is_excluded(root, f): + found_files.append(fs.get_file(rootPath + f, + fileclasses=fileclasses)) found_files = [f for f in found_files if f is not None] # In some cases, directories can be considered as files by dupeGuru, which is # why we have this line below. In fact, there only one case: Bundle files under diff --git a/core/exclude.py b/core/exclude.py index 29b00e6b..314a1c6e 100644 --- a/core/exclude.py +++ b/core/exclude.py @@ -81,7 +81,7 @@ class ExcludeList(Markable): yield self.is_marked(regex), regex def __contains__(self, item): - return self.isExcluded(item) + return self.has_entry(item) def __len__(self): """Returns the total number of regexes regardless of mark status.""" @@ -173,7 +173,9 @@ class ExcludeList(Markable): [x for x in self._excluded_compiled if not has_sep(x.pattern)] self._cached_compiled_paths =\ [x for x in self._excluded_compiled if has_sep(x.pattern)] + self._dirty = False return + marked_count = [x for marked, x in self if marked] # If there is no item, the compiled Pattern will be '' and match everything! if not marked_count: @@ -197,14 +199,14 @@ class ExcludeList(Markable): else: self._cached_compiled_union_paths =\ (re.compile('|'.join(paths_marked)),) + self._dirty = False @property def compiled(self): """Should be used by other classes to retrieve the up-to-date list of patterns.""" if self._use_union: if self._dirty: - self.build_compiled_caches(True) - self._dirty = False + self.build_compiled_caches(self._use_union) return self._cached_compiled_union_all return self._excluded_compiled @@ -215,8 +217,7 @@ class ExcludeList(Markable): The interface should be expected to be a generator, even if it returns only one item (one Pattern in the union case).""" if self._dirty: - self.build_compiled_caches(True if self._use_union else False) - self._dirty = False + self.build_compiled_caches(self._use_union) return self._cached_compiled_union_files if self._use_union\ else self._cached_compiled_files @@ -224,8 +225,7 @@ class ExcludeList(Markable): def compiled_paths(self): """Returns patterns with only separators in them, for more precise filtering.""" if self._dirty: - self.build_compiled_caches(True if self._use_union else False) - self._dirty = False + self.build_compiled_caches(self._use_union) return self._cached_compiled_union_paths if self._use_union\ else self._cached_compiled_paths @@ -233,7 +233,7 @@ class ExcludeList(Markable): def add(self, regex, forced=False): """This interface should throw exceptions if there is an error during regex compilation""" - if self.isExcluded(regex): + if self.has_entry(regex): # This exception should never be ignored raise AlreadyThereException() if regex in forbidden_regexes: @@ -256,12 +256,27 @@ class ExcludeList(Markable): """Returns the number of marked regexes only.""" return len([x for marked, x in self if marked]) - def isExcluded(self, regex): + def has_entry(self, regex): for item in self._excluded: if regex == item[0]: return True return False + def is_excluded(self, dirname, filename): + """Return True if the file or the absolute path to file is supposed to be + filtered out, False otherwise.""" + matched = False + for expr in self.compiled_files: + if expr.fullmatch(filename): + matched = True + break + if not matched: + for expr in self.compiled_paths: + if expr.fullmatch(dirname + sep + filename): + matched = True + break + return matched + def remove(self, regex): for item in self._excluded: if item[0] == regex: @@ -286,9 +301,11 @@ class ExcludeList(Markable): break if not found: return - if is_compilable and was_marked: - # Not marked by default when added, add it back - self.mark(newregex) + if is_compilable: + self._add_compiled(newregex) + if was_marked: + # Not marked by default when added, add it back + self.mark(newregex) # def change_index(self, regex, new_index): # """Internal list must be a list, not dict.""" @@ -300,7 +317,7 @@ class ExcludeList(Markable): if regex not in default_regexes: self.unmark(regex) for default_regex in default_regexes: - if not self.isExcluded(default_regex): + if not self.has_entry(default_regex): self.add(default_regex) self.mark(default_regex) @@ -399,9 +416,9 @@ class ExcludeDict(ExcludeList): if self._use_union: return try: - self._excluded_compiled.add(self._excluded[regex]["compiled"]) + self._excluded_compiled.add(self._excluded.get(regex).get("compiled")) except Exception as e: - logging.warning(f"Exception while adding regex {regex} to compiled set: {e}") + logging.error(f"Exception while adding regex {regex} to compiled set: {e}") return def is_compilable(self, regex): @@ -425,7 +442,7 @@ class ExcludeDict(ExcludeList): "compiled": compiled } - def isExcluded(self, regex): + def has_entry(self, regex): if regex in self._excluded.keys(): return True return False @@ -451,14 +468,16 @@ class ExcludeDict(ExcludeList): previous = self._excluded.pop(regex) iscompilable, error, compiled = self.compile_re(newregex) self._excluded[newregex] = { - "index": previous["index"], + "index": previous.get('index'), "compilable": iscompilable, "error": error, "compiled": compiled } self._remove_compiled(regex) - if was_marked and iscompilable: - self.mark(newregex) + if iscompilable: + self._add_compiled(newregex) + if was_marked: + self.mark(newregex) def save_to_xml(self, outfile): """Create a XML file that can be used by load_from_xml. @@ -492,8 +511,8 @@ def ordered_keys(_dict): if ISWINDOWS: - def has_sep(x): - return '\\' + sep in x + def has_sep(regexp): + return '\\' + sep in regexp else: - def has_sep(x): - return sep in x + def has_sep(regexp): + return sep in regexp diff --git a/core/gui/exclude_list_dialog.py b/core/gui/exclude_list_dialog.py index 82e219a2..b07a5d99 100644 --- a/core/gui/exclude_list_dialog.py +++ b/core/gui/exclude_list_dialog.py @@ -7,6 +7,8 @@ # from hscommon.trans import tr from .exclude_list_table import ExcludeListTable +from core.exclude import has_sep +from os import sep import logging @@ -30,9 +32,10 @@ class ExcludeListDialogCore: self.refresh() def rename_selected(self, newregex): - """Renames the selected regex to ``newregex``. - If there's more than one selected row, the first one is used. + """Rename the selected regex to ``newregex``. + If there is more than one selected row, the first one is used. :param str newregex: The regex to rename the row's regex to. + :return bool: true if success, false if error. """ try: r = self.exclude_list_table.selected_rows[0] @@ -52,17 +55,37 @@ class ExcludeListDialogCore: self.exclude_list_table.add(regex) def test_string(self, test_string): - """Sets property on row to highlight if its regex matches test_string supplied.""" + """Set the highlight property on each row when its regex matches the + test_string supplied. Return True if any row matched.""" matched = False for row in self.exclude_list_table.rows: compiled_regex = self.exclude_list.get_compiled(row.regex) - if compiled_regex and compiled_regex.fullmatch(test_string): - matched = True + + if self.is_match(test_string, compiled_regex): row.highlight = True + matched = True else: row.highlight = False return matched + def is_match(self, test_string, compiled_regex): + # This method is like an inverted version of ExcludeList.is_excluded() + if not compiled_regex: + return False + matched = False + + # Test only the filename portion of the path + if not has_sep(compiled_regex.pattern) and sep in test_string: + filename = test_string.rsplit(sep, 1)[1] + if compiled_regex.fullmatch(filename): + matched = True + return matched + + # Test the entire path + filename + if compiled_regex.fullmatch(test_string): + matched = True + return matched + def reset_rows_highlight(self): for row in self.exclude_list_table.rows: row.highlight = False diff --git a/core/gui/exclude_list_table.py b/core/gui/exclude_list_table.py index 8875d330..01f56762 100644 --- a/core/gui/exclude_list_table.py +++ b/core/gui/exclude_list_table.py @@ -36,7 +36,7 @@ class ExcludeListTable(GUITable, DupeGuruGUIObject): return ExcludeListRow(self, self.dialog.exclude_list.is_marked(regex), regex), 0 def _do_delete(self): - self.dalog.exclude_list.remove(self.selected_row.regex) + self.dialog.exclude_list.remove(self.selected_row.regex) # --- Override def add(self, regex): diff --git a/core/tests/directories_test.py b/core/tests/directories_test.py index 8a5ddcdb..503f98ff 100644 --- a/core/tests/directories_test.py +++ b/core/tests/directories_test.py @@ -473,6 +473,24 @@ files: {self.d._exclude_list.compiled_files} all: {self.d._exclude_list.compiled assert "file_ending_with_subdir" not in files assert "file_which_shouldnt_match" in files + # This should match the directory only + regex6 = r".*/subdir.*" + if ISWINDOWS: + regex6 = r".*\\.*subdir.*" + self.d._exclude_list.rename(regex5, regex6) + self.d._exclude_list.remove(regex1) + assert regex1 not in self.d._exclude_list + assert regex5 not in self.d._exclude_list + assert self.d._exclude_list.error(regex6) is None + # This still should not be affected + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal) + files = self.get_files_and_expect_num_result(5) + # These files are under the "/subdir" directory + assert "somesubdirfile.png" not in files + assert "unwanted_subdirfile.gif" not in files + # This file under "subdar" directory should not be filtered out + assert "file_ending_with_subdir" in files + def test_japanese_unicode(self, tmpdir): p1 = Path(str(tmpdir)) p1["$Recycle.Bin"].mkdir() diff --git a/core/tests/exclude_test.py b/core/tests/exclude_test.py index de5e46c5..c2f1f03e 100644 --- a/core/tests/exclude_test.py +++ b/core/tests/exclude_test.py @@ -188,6 +188,28 @@ class TestCaseListEmpty: self.exclude_list.rename(regex_renamed_compilable, regex_compilable) eq_(self.exclude_list.is_marked(regex_compilable), True) + def test_rename_regex_file_to_path(self): + regex = r".*/one.*" + if ISWINDOWS: + regex = r".*\\one.*" + regex2 = r".*one.*" + self.exclude_list.add(regex) + self.exclude_list.mark(regex) + compiled_re = [x.pattern for x in self.exclude_list._excluded_compiled] + files_re = [x.pattern for x in self.exclude_list.compiled_files] + paths_re = [x.pattern for x in self.exclude_list.compiled_paths] + assert regex in compiled_re + assert regex not in files_re + assert regex in paths_re + self.exclude_list.rename(regex, regex2) + compiled_re = [x.pattern for x in self.exclude_list._excluded_compiled] + files_re = [x.pattern for x in self.exclude_list.compiled_files] + paths_re = [x.pattern for x in self.exclude_list.compiled_paths] + assert regex not in compiled_re + assert regex2 in compiled_re + assert regex2 in files_re + assert regex2 not in paths_re + def test_restore_default(self): """Only unmark previously added regexes and mark the pre-defined ones""" regex = r"one" @@ -213,6 +235,73 @@ class TestCaseListEmpty: eq_(len(default_regexes), len(self.exclude_list.compiled)) +class TestCaseListEmptyUnion(TestCaseListEmpty): + """Same but with union regex""" + def setup_method(self, method): + self.app = DupeGuru() + self.app.exclude_list = ExcludeList(union_regex=True) + self.exclude_list = self.app.exclude_list + + def test_add_mark_and_remove_regex(self): + regex1 = r"one" + regex2 = r"two" + self.exclude_list.add(regex1) + assert(regex1 in self.exclude_list) + self.exclude_list.add(regex2) + self.exclude_list.mark(regex1) + self.exclude_list.mark(regex2) + eq_(len(self.exclude_list), 2) + eq_(len(self.exclude_list.compiled), 1) + compiled_files = [x for x in self.exclude_list.compiled_files] + eq_(len(compiled_files), 1) # Two patterns joined together into one + assert "|" in compiled_files[0].pattern + self.exclude_list.remove(regex2) + assert(regex2 not in self.exclude_list) + eq_(len(self.exclude_list), 1) + + def test_rename_regex_file_to_path(self): + regex = r".*/one.*" + if ISWINDOWS: + regex = r".*\\one.*" + regex2 = r".*one.*" + self.exclude_list.add(regex) + self.exclude_list.mark(regex) + eq_(len([x for x in self.exclude_list]), 1) + compiled_re = [x.pattern for x in self.exclude_list.compiled] + files_re = [x.pattern for x in self.exclude_list.compiled_files] + paths_re = [x.pattern for x in self.exclude_list.compiled_paths] + assert regex in compiled_re + assert regex not in files_re + assert regex in paths_re + self.exclude_list.rename(regex, regex2) + eq_(len([x for x in self.exclude_list]), 1) + compiled_re = [x.pattern for x in self.exclude_list.compiled] + files_re = [x.pattern for x in self.exclude_list.compiled_files] + paths_re = [x.pattern for x in self.exclude_list.compiled_paths] + assert regex not in compiled_re + assert regex2 in compiled_re + assert regex2 in files_re + assert regex2 not in paths_re + + def test_restore_default(self): + """Only unmark previously added regexes and mark the pre-defined ones""" + regex = r"one" + self.exclude_list.add(regex) + self.exclude_list.mark(regex) + self.exclude_list.restore_defaults() + eq_(len(default_regexes), self.exclude_list.marked_count) + # added regex shouldn't be marked + eq_(self.exclude_list.is_marked(regex), False) + # added regex shouldn't be in compiled list either + compiled = [x for x in self.exclude_list.compiled] + assert regex not in compiled + # Need to escape both to get the same strings after compilation + compiled_escaped = set([x.encode('unicode-escape').decode() for x in compiled[0].pattern.split("|")]) + default_escaped = set([x.encode('unicode-escape').decode() for x in default_regexes]) + assert compiled_escaped == default_escaped + eq_(len(default_regexes), len(compiled[0].pattern.split("|"))) + + class TestCaseDictEmpty(TestCaseListEmpty): """Same, but with dictionary implementation""" def setup_method(self, method): @@ -221,6 +310,73 @@ class TestCaseDictEmpty(TestCaseListEmpty): self.exclude_list = self.app.exclude_list +class TestCaseDictEmptyUnion(TestCaseDictEmpty): + """Same, but with union regex""" + def setup_method(self, method): + self.app = DupeGuru() + self.app.exclude_list = ExcludeDict(union_regex=True) + self.exclude_list = self.app.exclude_list + + def test_add_mark_and_remove_regex(self): + regex1 = r"one" + regex2 = r"two" + self.exclude_list.add(regex1) + assert(regex1 in self.exclude_list) + self.exclude_list.add(regex2) + self.exclude_list.mark(regex1) + self.exclude_list.mark(regex2) + eq_(len(self.exclude_list), 2) + eq_(len(self.exclude_list.compiled), 1) + compiled_files = [x for x in self.exclude_list.compiled_files] + # two patterns joined into one + eq_(len(compiled_files), 1) + self.exclude_list.remove(regex2) + assert(regex2 not in self.exclude_list) + eq_(len(self.exclude_list), 1) + + def test_rename_regex_file_to_path(self): + regex = r".*/one.*" + if ISWINDOWS: + regex = r".*\\one.*" + regex2 = r".*one.*" + self.exclude_list.add(regex) + self.exclude_list.mark(regex) + marked_re = [x for marked, x in self.exclude_list if marked] + eq_(len(marked_re), 1) + compiled_re = [x.pattern for x in self.exclude_list.compiled] + files_re = [x.pattern for x in self.exclude_list.compiled_files] + paths_re = [x.pattern for x in self.exclude_list.compiled_paths] + assert regex in compiled_re + assert regex not in files_re + assert regex in paths_re + self.exclude_list.rename(regex, regex2) + compiled_re = [x.pattern for x in self.exclude_list.compiled] + files_re = [x.pattern for x in self.exclude_list.compiled_files] + paths_re = [x.pattern for x in self.exclude_list.compiled_paths] + assert regex not in compiled_re + assert regex2 in compiled_re + assert regex2 in files_re + assert regex2 not in paths_re + + def test_restore_default(self): + """Only unmark previously added regexes and mark the pre-defined ones""" + regex = r"one" + self.exclude_list.add(regex) + self.exclude_list.mark(regex) + self.exclude_list.restore_defaults() + eq_(len(default_regexes), self.exclude_list.marked_count) + # added regex shouldn't be marked + eq_(self.exclude_list.is_marked(regex), False) + # added regex shouldn't be in compiled list either + compiled = [x for x in self.exclude_list.compiled] + assert regex not in compiled + # Need to escape both to get the same strings after compilation + compiled_escaped = set([x.encode('unicode-escape').decode() for x in compiled[0].pattern.split("|")]) + default_escaped = set([x.encode('unicode-escape').decode() for x in default_regexes]) + assert compiled_escaped == default_escaped + eq_(len(default_regexes), len(compiled[0].pattern.split("|"))) + + def split_union(pattern_object): """Returns list of strings for each union pattern""" return [x for x in pattern_object.pattern.split("|")] diff --git a/qt/exclude_list_dialog.py b/qt/exclude_list_dialog.py index 63be5a4e..4246087e 100644 --- a/qt/exclude_list_dialog.py +++ b/qt/exclude_list_dialog.py @@ -116,30 +116,32 @@ class ExcludeListDialog(QDialog): if not input_text: self.reset_input_style() return - # if at least one row matched, we know whether table is highlighted or not + # If at least one row matched, we know whether table is highlighted or not self._row_matched = self.model.test_string(input_text) self.table.refresh() + # Test the string currently in the input text box as well input_regex = self.inputLine.text() if not input_regex: self.reset_input_style() return + compiled = None try: compiled = re.compile(input_regex) except re.error: self.reset_input_style() return - if compiled.fullmatch(input_text): - self._input_styled = True + if self.model.is_match(input_text, compiled): self.inputLine.setStyleSheet("background-color: rgb(10, 200, 10);") + self._input_styled = True else: self.reset_input_style() def reset_input_style(self): """Reset regex input line background""" if self._input_styled: - self._input_styled = False self.inputLine.setStyleSheet(self.styleSheet()) + self._input_styled = False def reset_table_style(self): if self._row_matched: