From 4a1641e39d66afd1c25ef5a621278f90353a3081 Mon Sep 17 00:00:00 2001 From: glubsy Date: Sat, 29 Aug 2020 03:57:00 +0200 Subject: [PATCH] Add test suite, fix bugs --- core/app.py | 2 +- core/directories.py | 31 ++-- core/exclude.py | 197 ++++++++++++++--------- core/fs.py | 11 +- core/tests/directories_test.py | 236 ++++++++++++++++++++++------ core/tests/exclude_test.py | 277 +++++++++++++++++++++++++++++++++ tox.ini | 2 +- 7 files changed, 613 insertions(+), 143 deletions(-) create mode 100644 core/tests/exclude_test.py diff --git a/core/app.py b/core/app.py index ee31a114..3f4c3266 100644 --- a/core/app.py +++ b/core/app.py @@ -26,7 +26,7 @@ from .pe.photo import get_delta_dimensions from .util import cmp_value, fix_surrogate_encoding from . import directories, results, export, fs, prioritize from .ignore import IgnoreList -from .exclude import ExcludeList as ExcludeList +from .exclude import ExcludeDict as ExcludeList from .scanner import ScanType from .gui.deletion_options import DeletionOptions from .gui.details_panel import DetailsPanel diff --git a/core/directories.py b/core/directories.py index 781ced90..aa6298d4 100644 --- a/core/directories.py +++ b/core/directories.py @@ -80,13 +80,12 @@ class Directories: # ---Private def _default_state_for_path(self, path): # New logic with regex filters - if self._exclude_list is not None and len(self._exclude_list) > 0: + if self._exclude_list is not None and self._exclude_list.mark_count > 0: # We iterate even if we only have one item here - for denied_path_re in self._exclude_list.compiled_combined: - if denied_path_re.match(str(path)): + for denied_path_re in self._exclude_list.compiled: + if denied_path_re.match(str(path.name)): return DirectoryState.Excluded - return None - # Old default logic, still used during initialization of DirectoryTree: + # return # We still use the old logic to force state on hidden dirs # Override this in subclasses to specify the state of some special folders. 
if path.name.startswith("."): return DirectoryState.Excluded @@ -95,7 +94,7 @@ class Directories: for root, dirs, files in os.walk(str(from_path)): j.check_if_cancelled() rootPath = Path(root) - state = self.get_state(root) + state = self.get_state(rootPath) if state == DirectoryState.Excluded: # Recursively get files from folders with lots of subfolder is expensive. However, there # might be a subfolder in this path that is not excluded. What we want to do is to skim @@ -105,16 +104,22 @@ class Directories: try: if state != DirectoryState.Excluded: # Old logic - if self._exclude_list is None or not len(self._exclude_list): + if self._exclude_list is None or not self._exclude_list.mark_count: found_files = [fs.get_file(rootPath + f, fileclasses=fileclasses) for f in files] else: found_files = [] + # print(f"len of files: {len(files)} {files}") for f in files: found = False - for expr in self._exclude_list.compiled_files_combined: - found = expr.match(f) - if found: + for expr in self._exclude_list.compiled_files: + if expr.match(f): + found = True break + if not found: + for expr in self._exclude_list.compiled_paths: + if expr.match(root + os.sep + f): + found = True + break if not found: found_files.append(fs.get_file(rootPath + f, fileclasses=fileclasses)) found_files = [f for f in found_files if f is not None] @@ -215,8 +220,14 @@ class Directories: if path in self.states: return self.states[path] state = self._default_state_for_path(path) or DirectoryState.Normal + # Save non-default states in cache, necessary for _get_files() + if state != DirectoryState.Normal: + self.states[path] = state + return state + prevlen = 0 # we loop through the states to find the longest matching prefix + # if the parent has a state in cache, return that state for p, s in self.states.items(): if p.is_parent_of(path) and len(p) > prevlen: prevlen = len(p) diff --git a/core/exclude.py b/core/exclude.py index 29bab540..e0e8a901 100644 --- a/core/exclude.py +++ b/core/exclude.py @@ 
-5,7 +5,8 @@ from .markable import Markable from xml.etree import ElementTree as ET # TODO: perhaps use regex module for better Unicode support? https://pypi.org/project/regex/ -# or perhaps also https://pypi.org/project/re2/ +# also https://pypi.org/project/re2/ +# TODO update the Result list with newly added regexes if possible import re from os import sep import logging @@ -13,8 +14,14 @@ import functools from hscommon.util import FileOrPath import time -default_regexes = [r".*thumbs", r"\.DS.Store", r"\.Trash", r".*Trash-Bin"] -forbidden_regexes = [r".*", r"\/.*", r".*\/.*"] +default_regexes = [r"^thumbs\.db$", # Obsolete after Windows XP + r"^\.DS_Store$", # MacOS metadata + r"^\.Trash\-.*", # Linux trash directories + r"^\$Recycle\.Bin$", # Windows + r"^\..*" # Hidden files + ] +# These are too aggressive +forbidden_regexes = [r".*", r"\/.*", r".*\/.*", r".*\..*"] def timer(func): @@ -59,36 +66,37 @@ class ExcludeList(Markable): # ---Override def __init__(self, combined_regex=False): Markable.__init__(self) - self._combined_regex = combined_regex + self._use_combined = combined_regex self._excluded = [] - self._count = 0 self._excluded_compiled = set() self._dirty = True - def __debug_test(self): - self.test_regexes = [ - r".*Recycle\.Bin$", r"denyme.*", r".*denyme", r".*/test/denyme*", - r".*/test/*denyme", r"denyme", r".*\/\..*", r"^\..*"] - for regex in self.test_regexes: - try: - self.add(regex) - except Exception as e: - print(f"Exception loading test regex {regex}: {e}") - continue - try: - self.mark(regex) - except Exception as e: - print(f"Exception marking test regex {regex}: {e}") - def __iter__(self): """Iterate in order.""" for item in self._excluded: regex = item[0] yield self.is_marked(regex), regex + def __contains__(self, item): + return self.isExcluded(item) + def __len__(self): - """Returns the number of marked regexes.""" - return len([x for marked, x in self if marked]) + """Returns the total number of regexes regardless of mark status.""" + 
return len(self._excluded) + + def __getitem__(self, key): + for item in self._excluded: + if item[0] == key: + return item + raise KeyError(f"Key {key} is not in exclusion list.") + + def __setitem__(self, key, value): + # TODO if necessary + pass + + def __delitem__(self, key): + # TODO if necessary + pass def is_markable(self, regex): return self._is_markable(regex) @@ -98,7 +106,7 @@ class ExcludeList(Markable): for item in self._excluded: if item[0] == regex: return item[1] - return False # should not be needed + return False # should not be necessary, regex SHOULD be in there def _did_mark(self, regex): self._add_compiled(regex) @@ -107,17 +115,19 @@ class ExcludeList(Markable): self._remove_compiled(regex) def _add_compiled(self, regex): - if self._combined_regex: - self._dirty = True + self._dirty = True + if self._use_combined: return for item in self._excluded: + # FIXME probably faster to just rebuild the set from the compiled instead of comparing strings if item[0] == regex: # no need to test if already present since it's a set() self._excluded_compiled.add(item[3]) + break def _remove_compiled(self, regex): - if self._combined_regex: - self._dirty = True + self._dirty = True + if self._use_combined: return for item in self._excluded_compiled: if regex in item.pattern: @@ -148,44 +158,65 @@ class ExcludeList(Markable): if item[0] == regex: return item[2] + def build_compiled_caches(self, combined=False): + if not combined: + self._cached_compiled_files =\ + [x for x in self._excluded_compiled if sep not in x.pattern] + self._cached_compiled_paths =\ + [x for x in self._excluded_compiled if sep in x.pattern] + return + # HACK returned as a tuple to get a free iterator to keep interface the same + # regardless of whether the client asked for combined or not + marked_count = [x for marked, x in self if marked] + # If there is no item, the compiled Pattern will be '' and match everything! 
+ if not marked_count: + self._cached_compiled_combined_all = [] + self._cached_compiled_combined_files = [] + self._cached_compiled_combined_paths = [] + else: + self._cached_compiled_combined_all =\ + (re.compile('|'.join(marked_count)),) + files_marked = [x for x in marked_count if sep not in x] + if not files_marked: + self._cached_compiled_combined_files = tuple() + else: + self._cached_compiled_combined_files =\ + (re.compile('|'.join(files_marked)),) + paths_marked = [x for x in marked_count if sep in x] + if not paths_marked: + self._cached_compiled_combined_paths = tuple() + else: + self._cached_compiled_combined_paths =\ + (re.compile('|'.join(paths_marked)),) + @property def compiled(self): """Should be used by other classes to retrieve the up-to-date list of patterns.""" - if not self._combined_regex: - return self._excluded_compiled - else: - return self.compiled_combined + if self._use_combined: + if self._dirty: + self.build_compiled_caches(True) + self._dirty = False + return self._cached_compiled_combined_all + return self._excluded_compiled @property def compiled_files(self): - """Should be used by other classes to retrieve the up-to-date list of patterns - for files only.""" - if not self._combined_regex: - # Return each compiled element separately - # return [compiled_pattern for compiled_pattern in self.compiled if sep not in compiled_pattern.pattern] - for compiled in self.compiled: - if sep not in compiled.pattern: - yield compiled - else: - return self.compiled_files_combined - - @property - def compiled_combined(self): + """When matching against filenames only, we probably won't be seeing any + directory separator, so we filter out regexes with os.sep in them. 
+ The interface should be expected to be a generator, even if it returns only + one item (one Pattern in the combined case).""" if self._dirty: - self._cached_compiled_combined =\ - re.compile('|'.join(x for marked, x in self if marked)) - # Must compute the filtered out version as well - self._cached_compiled_combined_files =\ - re.compile('|'.join(x for marked, x in self - if marked and sep not in x)) + self.build_compiled_caches(True if self._use_combined else False) self._dirty = False - # returned as a tuple to get a free iterator and to avoid subclassing - return (self._cached_compiled_combined,) + return self._cached_compiled_combined_files if self._use_combined else self._cached_compiled_files @property - def compiled_files_combined(self): - # returned as a tuple to get a free iterator and to avoid subclassing - return (self._cached_compiled_combined_files,) + def compiled_paths(self): + """Returns patterns with only separators in them, for more precise filtering.""" + if self._dirty: + self.build_compiled_caches(True if self._use_combined else False) + self._dirty = False + return self._cached_compiled_combined_paths if self._use_combined else self._cached_compiled_paths # ---Public def add(self, regex, forced=False): @@ -206,7 +237,11 @@ class ExcludeList(Markable): def _do_add(self, regex, iscompilable, exception, compiled): # We need to insert at the top self._excluded.insert(0, [regex, iscompilable, exception, compiled]) - # self._count = len(self._excluded) + + @property + def marked_count(self): + """Returns the number of marked regexes only.""" + return len([x for marked, x in self if marked]) def isExcluded(self, regex): for item in self._excluded: @@ -215,6 +250,7 @@ class ExcludeList(Markable): return False def clear(self): + """Not used and needs refactoring""" self._excluded = [] def remove(self, regex): @@ -224,25 +260,24 @@ class ExcludeList(Markable): self._remove_compiled(regex) def rename(self, regex, newregex): - # if regex not in 
self._excluded or regex == newregex: - # return + # if regex not in self._excluded: return if regex == newregex: return found = False - for item in self._excluded: - if regex == item[0]: - found = True - break - if not found: - return - - was_marked = self.is_marked(regex) - is_compilable, exception, compiled = self.compile_re(newregex) + was_marked = False + is_compilable = False for item in self._excluded: if item[0] == regex: + found = True + was_marked = self.is_marked(regex) + is_compilable, exception, compiled = self.compile_re(newregex) # We overwrite the found entry self._excluded[self._excluded.index(item)] =\ [newregex, is_compilable, exception, compiled] + self._remove_compiled(regex) + break + if not found: + return if is_compilable and was_marked: # Not marked by default when added, add it back self.mark(newregex) @@ -271,7 +306,6 @@ class ExcludeList(Markable): except Exception as e: logging.warning(f"Error while loading {infile}: {e}") self.restore_defaults() - self.__debug_test() return e marked = set() @@ -291,7 +325,6 @@ class ExcludeList(Markable): for item in marked: self.mark(item) - self.__debug_test() def save_to_xml(self, outfile): """Create a XML file that can be used by load_from_xml. 
@@ -314,13 +347,14 @@ class ExcludeDict(ExcludeList): to keep the index of each string-key as its sub-element and keep it updated whenever insert/remove is done.""" - def __init__(self): + def __init__(self, combined_regex=False): Markable.__init__(self) + self._use_combined = combined_regex # { "regex": { "index": int, "compilable": bool, "error": str, "compiled": Pattern or None}} # Note: "compilable" key should only be updated on add / rename self._excluded = {} - self._count = 0 self._excluded_compiled = set() + self._dirty = True def __iter__(self): """Iterate in order.""" @@ -338,8 +372,8 @@ class ExcludeDict(ExcludeList): return False def _add_compiled(self, regex): - if self._combined_regex: - self._dirty = True + self._dirty = True + if self._use_combined: return try: self._excluded_compiled.add(self._excluded[regex]["compiled"]) @@ -360,8 +394,12 @@ class ExcludeDict(ExcludeList): # We always insert at the top, so index should be 0 and other indices should be pushed by one for value in self._excluded.values(): value["index"] += 1 - self._excluded[regex] = {"index": 0, "compilable": iscompilable, "error": exception, "compiled": compiled} - # self._count = len(self._excluded) + self._excluded[regex] = { + "index": 0, + "compilable": iscompilable, + "error": exception, + "compiled": compiled + } def isExcluded(self, regex): if regex in self._excluded.keys(): @@ -369,6 +407,7 @@ class ExcludeDict(ExcludeList): return False def clear(self): + """Not used, need refactoring""" self._excluded = {} def remove(self, regex): @@ -391,7 +430,13 @@ class ExcludeDict(ExcludeList): was_marked = self.is_marked(regex) previous = self._excluded.pop(regex) iscompilable, error, compiled = self.compile_re(newregex) - self._excluded[newregex] = {"index": previous["index"], "compilable": iscompilable, "error": error, "compiled": compiled} + self._excluded[newregex] = { + "index": previous["index"], + "compilable": iscompilable, + "error": error, + "compiled": compiled + } + 
self._remove_compiled(regex) if was_marked and iscompilable: self.mark(newregex) diff --git a/core/fs.py b/core/fs.py index 90f400d9..f18186ae 100644 --- a/core/fs.py +++ b/core/fs.py @@ -245,7 +245,7 @@ class Folder(File): return not path.islink() and path.isdir() -def get_file(path, fileclasses=[File], deny_list_re=set()): +def get_file(path, fileclasses=[File]): """Wraps ``path`` around its appropriate :class:`File` class. Whether a class is "appropriate" is decided by :meth:`File.can_handle` @@ -255,15 +255,10 @@ def get_file(path, fileclasses=[File], deny_list_re=set()): """ for fileclass in fileclasses: if fileclass.can_handle(path): - # print(f"returning {path}") - # for expr in deny_list_re: - # if expr.match(str(path.name)): - # print(f"FOUND {repr(expr)} in {str(path.name)}") - # return return fileclass(path) -def get_files(path, fileclasses=[File], deny_list_re=set()): +def get_files(path, fileclasses=[File]): """Returns a list of :class:`File` for each file contained in ``path``. 
:param Path path: path to scan @@ -273,7 +268,7 @@ def get_files(path, fileclasses=[File], deny_list_re=set()): try: result = [] for path in path.listdir(): - file = get_file(path, fileclasses=fileclasses, deny_list_re=deny_list_re) + file = get_file(path, fileclasses=fileclasses) if file is not None: result.append(file) return result diff --git a/core/tests/directories_test.py b/core/tests/directories_test.py index 7273b566..1ce84fb4 100644 --- a/core/tests/directories_test.py +++ b/core/tests/directories_test.py @@ -20,6 +20,7 @@ from ..directories import ( AlreadyThereError, InvalidPathError, ) +from ..exclude import ExcludeList, ExcludeDict def create_fake_fs(rootpath): @@ -323,7 +324,7 @@ def test_get_state_returns_excluded_by_default_for_hidden_directories(tmpdir): def test_default_path_state_override(tmpdir): # It's possible for a subclass to override the default state of a path class MyDirectories(Directories): - def _default_state_for_path(self, path, denylist): + def _default_state_for_path(self, path): if "foobar" in path: return DirectoryState.Excluded @@ -343,52 +344,193 @@ def test_default_path_state_override(tmpdir): eq_(len(list(d.get_files())), 2) -def test_exclude_list_regular_expressions(tmpdir): - d = Directories() - d.deny_list_str.clear() - d.deny_list_re.clear() - d.deny_list_re_files.clear() - # This should only exlude the directory, but not the contained files if - # its status is set to normal after loading it in the directory tree - d.deny_list_str.add(r".*Recycle\.Bin$") - d.deny_list_str.add(r"denyme.*") - # d.deny_list_str.add(r".*denymetoo") - # d.deny_list_str.add(r"denyme") - d.deny_list_str.add(r".*\/\..*") - d.deny_list_str.add(r"^\..*") - d.compile_re() - p1 = Path(str(tmpdir)) - # Should be ignored on Windows only (by default) - p1["Recycle.Bin"].mkdir() - p1["Recycle.Bin/somerecycledfile"].open("w").close() +class TestExcludeList(): + def setup_method(self, method): + self.d = 
Directories(exclude_list=ExcludeList(combined_regex=False)) - p1["denyme_blah.txt"].open("w").close() - p1["blah_denymetoo"].open("w").close() - p1["blah_denyme"].open("w").close() + def get_files_and_expect_num_result(self, num_result): + """Calls get_files(), get the filenames only, print for debugging. + num_result is how many files are expected as a result.""" + print(f"EXCLUDED REGEX: paths {self.d._exclude_list.compiled_paths} \ +files: {self.d._exclude_list.compiled_files} all: {self.d._exclude_list.compiled}") + files = list(self.d.get_files()) + files = [file.name for file in files] + print(f"FINAL FILES {files}") + eq_(len(files), num_result) + return files - p1[".hidden_file"].open("w").close() - p1[".hidden_dir"].mkdir() - p1[".hidden_dir/somenormalfile1"].open("w").close() - p1[".hidden_dir/somenormalfile2_denyme"].open("w").close() + def test_exclude_recycle_bin_by_default(self, tmpdir): + regex = r"^.*Recycle\.Bin$" + self.d._exclude_list.add(regex) + self.d._exclude_list.mark(regex) + p1 = Path(str(tmpdir)) + p1["$Recycle.Bin"].mkdir() + p1["$Recycle.Bin"]["subdir"].mkdir() + self.d.add_path(p1) + eq_(self.d.get_state(p1["$Recycle.Bin"]), DirectoryState.Excluded) + # By default, subdirs should be excluded too, but this can be overriden separately + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Excluded) + self.d.set_state(p1["$Recycle.Bin"]["subdir"], DirectoryState.Normal) + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal) - p1["foobar"].mkdir() - p1["foobar/somefile"].open("w").close() - d.add_path(p1) - eq_(d.get_state(p1["Recycle.Bin"]), DirectoryState.Excluded) - eq_(d.get_state(p1["foobar"]), DirectoryState.Normal) - files = list(d.get_files()) - files = [file.name for file in files] - print(f"first files: {files}") - assert "somerecycledfile" not in files - assert "denyme_blah.txt" not in files - assert ".hidden_file" not in files - assert "somefile1" not in files - assert "somefile2_denyme" not 
in files - # Overriding the default state from the Directory Tree - d.set_state(p1["Recycle.Bin"], DirectoryState.Normal) - d.set_state(p1[".hidden_dir"], DirectoryState.Normal) - files = list(d.get_files()) - files = [file.name for file in files] - print(f"second files: {files}") - assert "somerecycledfile" in files - assert "somenormalfile1" in files + def test_exclude_refined(self, tmpdir): + regex1 = r"^\$Recycle\.Bin$" + self.d._exclude_list.add(regex1) + self.d._exclude_list.mark(regex1) + p1 = Path(str(tmpdir)) + p1["$Recycle.Bin"].mkdir() + p1["$Recycle.Bin"]["somefile.png"].open("w").close() + p1["$Recycle.Bin"]["some_unwanted_file.jpg"].open("w").close() + p1["$Recycle.Bin"]["subdir"].mkdir() + p1["$Recycle.Bin"]["subdir"]["somesubdirfile.png"].open("w").close() + p1["$Recycle.Bin"]["subdir"]["unwanted_subdirfile.gif"].open("w").close() + p1["$Recycle.Bin"]["subdar"].mkdir() + p1["$Recycle.Bin"]["subdar"]["somesubdarfile.jpeg"].open("w").close() + p1["$Recycle.Bin"]["subdar"]["unwanted_subdarfile.png"].open("w").close() + self.d.add_path(p1["$Recycle.Bin"]) + + # Filter should set the default state to Excluded + eq_(self.d.get_state(p1["$Recycle.Bin"]), DirectoryState.Excluded) + # The subdir should inherit its parent state + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Excluded) + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdar"]), DirectoryState.Excluded) + # Override a child path's state + self.d.set_state(p1["$Recycle.Bin"]["subdir"], DirectoryState.Normal) + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal) + # Parent should keep its default state, and the other child too + eq_(self.d.get_state(p1["$Recycle.Bin"]), DirectoryState.Excluded) + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdar"]), DirectoryState.Excluded) + # print(f"get_folders(): {[x for x in self.d.get_folders()]}") + + # only the 2 files directly under the Normal directory + files = self.get_files_and_expect_num_result(2) + assert 
"somefile.png" not in files + assert "some_unwanted_file.jpg" not in files + assert "somesubdarfile.jpeg" not in files + assert "unwanted_subdarfile.png" not in files + assert "somesubdirfile.png" in files + assert "unwanted_subdirfile.gif" in files + # Overriding the parent should enable all children + self.d.set_state(p1["$Recycle.Bin"], DirectoryState.Normal) + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdar"]), DirectoryState.Normal) + # all files there + files = self.get_files_and_expect_num_result(6) + assert "somefile.png" in files + assert "some_unwanted_file.jpg" in files + + # This should still filter out files under directory, despite the Normal state + regex2 = r".*unwanted.*" + self.d._exclude_list.add(regex2) + self.d._exclude_list.mark(regex2) + files = self.get_files_and_expect_num_result(3) + assert "somefile.png" in files + assert "some_unwanted_file.jpg" not in files + assert "unwanted_subdirfile.gif" not in files + assert "unwanted_subdarfile.png" not in files + + regex3 = r".*Recycle\.Bin\/.*unwanted.*subdirfile.*" + self.d._exclude_list.rename(regex2, regex3) + assert self.d._exclude_list.error(regex3) is None + # print(f"get_folders(): {[x for x in self.d.get_folders()]}") + # Directory shouldn't change its state here, unless explicitly done by user + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal) + files = self.get_files_and_expect_num_result(5) + assert "unwanted_subdirfile.gif" not in files + assert "unwanted_subdarfile.png" in files + + # using end of line character should only filter the directory, or file ending with subdir + regex4 = r".*subdir$" + self.d._exclude_list.rename(regex3, regex4) + assert self.d._exclude_list.error(regex4) is None + p1["$Recycle.Bin"]["subdar"]["file_ending_with_subdir"].open("w").close() + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Excluded) + files = self.get_files_and_expect_num_result(4) + assert "file_ending_with_subdir" not in files + assert
"somesubdarfile.jpeg" in files + assert "somesubdirfile.png" not in files + assert "unwanted_subdirfile.gif" not in files + self.d.set_state(p1["$Recycle.Bin"]["subdir"], DirectoryState.Normal) + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal) + # print(f"get_folders(): {[x for x in self.d.get_folders()]}") + files = self.get_files_and_expect_num_result(6) + assert "file_ending_with_subdir" not in files + assert "somesubdirfile.png" in files + assert "unwanted_subdirfile.gif" in files + + regex5 = r".*subdir.*" + self.d._exclude_list.rename(regex4, regex5) + # Files containing substring should be filtered + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal) + # The path should not match, only the filename, the "subdir" in the directory name shouldn't matter + p1["$Recycle.Bin"]["subdir"]["file_which_shouldnt_match"].open("w").close() + files = self.get_files_and_expect_num_result(5) + assert "somesubdirfile.png" not in files + assert "unwanted_subdirfile.gif" not in files + assert "file_ending_with_subdir" not in files + assert "file_which_shouldnt_match" in files + + def test_japanese_unicode(self, tmpdir): + p1 = Path(str(tmpdir)) + p1["$Recycle.Bin"].mkdir() + p1["$Recycle.Bin"]["somerecycledfile.png"].open("w").close() + p1["$Recycle.Bin"]["some_unwanted_file.jpg"].open("w").close() + p1["$Recycle.Bin"]["subdir"].mkdir() + p1["$Recycle.Bin"]["subdir"]["過去白濁物語~]_カラー.jpg"].open("w").close() + p1["$Recycle.Bin"]["思叫物語"].mkdir() + p1["$Recycle.Bin"]["思叫物語"]["なししろ会う前"].open("w").close() + p1["$Recycle.Bin"]["思叫物語"]["堂~ロ"].open("w").close() + self.d.add_path(p1["$Recycle.Bin"]) + regex3 = r".*物語.*" + self.d._exclude_list.add(regex3) + self.d._exclude_list.mark(regex3) + # print(f"get_folders(): {[x for x in self.d.get_folders()]}") + eq_(self.d.get_state(p1["$Recycle.Bin"]["思叫物語"]), DirectoryState.Excluded) + files = self.get_files_and_expect_num_result(2) + assert "過去白濁物語~]_カラー.jpg" not in files + assert "なししろ会う前" 
not in files + assert "堂~ロ" not in files + # using end of line character should only filter that directory, not affecting its files + regex4 = r".*物語$" + self.d._exclude_list.rename(regex3, regex4) + assert self.d._exclude_list.error(regex4) is None + self.d.set_state(p1["$Recycle.Bin"]["思叫物語"], DirectoryState.Normal) + files = self.get_files_and_expect_num_result(5) + assert "過去白濁物語~]_カラー.jpg" in files + assert "なししろ会う前" in files + assert "堂~ロ" in files + + def test_get_state_returns_excluded_for_hidden_directories_and_files(self, tmpdir): + # This regex only work for files, not paths + regex = r"^\..*$" + self.d._exclude_list.add(regex) + self.d._exclude_list.mark(regex) + p1 = Path(str(tmpdir)) + p1["foobar"].mkdir() + p1["foobar"][".hidden_file.txt"].open("w").close() + p1["foobar"][".hidden_dir"].mkdir() + p1["foobar"][".hidden_dir"]["foobar.jpg"].open("w").close() + p1["foobar"][".hidden_dir"][".hidden_subfile.png"].open("w").close() + self.d.add_path(p1["foobar"]) + # It should not inherit its parent's state originally + eq_(self.d.get_state(p1["foobar"][".hidden_dir"]), DirectoryState.Excluded) + self.d.set_state(p1["foobar"][".hidden_dir"], DirectoryState.Normal) + # The files should still be filtered + files = self.get_files_and_expect_num_result(1) + assert ".hidden_file.txt" not in files + assert ".hidden_subfile.png" not in files + assert "foobar.jpg" in files + + +class TestExcludeDict(TestExcludeList): + def setup_method(self, method): + self.d = Directories(exclude_list=ExcludeDict(combined_regex=False)) + + +class TestExcludeListCombined(TestExcludeList): + def setup_method(self, method): + self.d = Directories(exclude_list=ExcludeList(combined_regex=True)) + + +class TestExcludeDictCombined(TestExcludeList): + def setup_method(self, method): + self.d = Directories(exclude_list=ExcludeDict(combined_regex=True)) diff --git a/core/tests/exclude_test.py b/core/tests/exclude_test.py new file mode 100644 index 00000000..0dc4a033 --- /dev/null +++ 
b/core/tests/exclude_test.py @@ -0,0 +1,277 @@ +# Copyright 2016 Hardcoded Software (http://www.hardcoded.net) +# +# This software is licensed under the "GPLv3" License as described in the "LICENSE" file, +# which should be included with this package. The terms are also available at +# http://www.gnu.org/licenses/gpl-3.0.html + +import io +# import os.path as op + +from xml.etree import ElementTree as ET + +# from pytest import raises +from hscommon.testutil import eq_ + +from .base import DupeGuru +from ..exclude import ExcludeList, ExcludeDict, default_regexes, AlreadyThereException + +from re import error + + +# Two slightly different implementations here, one around a list of lists, +# and another around a dictionary. + + +class TestCaseListXMLLoading: + def setup_method(self, method): + self.exclude_list = ExcludeList() + + def test_load_non_existant_file(self): + # Loads the pre-defined regexes + self.exclude_list.load_from_xml("non_existant.xml") + eq_(len(default_regexes), len(self.exclude_list)) + # they should also be marked by default + eq_(len(default_regexes), self.exclude_list.marked_count) + + def test_save_to_xml(self): + f = io.BytesIO() + self.exclude_list.save_to_xml(f) + f.seek(0) + doc = ET.parse(f) + root = doc.getroot() + eq_("exclude_list", root.tag) + + def test_save_and_load(self, tmpdir): + e1 = ExcludeList() + e2 = ExcludeList() + eq_(len(e1), 0) + e1.add(r"one") + e1.mark(r"one") + e1.add(r"two") + tmpxml = str(tmpdir.join("exclude_testunit.xml")) + e1.save_to_xml(tmpxml) + e2.load_from_xml(tmpxml) + # We should have the default regexes + assert r"one" in e2 + assert r"two" in e2 + eq_(len(e2), 2) + eq_(e2.marked_count, 1) + + def test_load_xml_with_garbage_and_missing_elements(self): + root = ET.Element("foobar") # The root element shouldn't matter + exclude_node = ET.SubElement(root, "bogus") + exclude_node.set("regex", "None") + exclude_node.set("marked", "y") + + exclude_node = ET.SubElement(root, "exclude") + 
exclude_node.set("regex", "one") + # marked field invalid + exclude_node.set("markedddd", "y") + + exclude_node = ET.SubElement(root, "exclude") + exclude_node.set("regex", "two") + # missing marked field + + exclude_node = ET.SubElement(root, "exclude") + exclude_node.set("regex", "three") + exclude_node.set("markedddd", "pazjbjepo") + + f = io.BytesIO() + tree = ET.ElementTree(root) + tree.write(f, encoding="utf-8") + f.seek(0) + self.exclude_list.load_from_xml(f) + print(f"{[x for x in self.exclude_list]}") + # only the three "exclude" nodes should be added, + eq_(3, len(self.exclude_list)) + # None should be marked + eq_(0, self.exclude_list.marked_count) + + +class TestCaseDictXMLLoading(TestCaseListXMLLoading): + def setup_method(self, method): + self.exclude_list = ExcludeDict() + + +class TestCaseListEmpty: + def setup_method(self, method): + self.app = DupeGuru() + self.app.exclude_list = ExcludeList() + self.exclude_list = self.app.exclude_list + + def test_add_mark_and_remove_regex(self): + regex1 = r"one" + regex2 = r"two" + self.exclude_list.add(regex1) + assert(regex1 in self.exclude_list) + self.exclude_list.add(regex2) + self.exclude_list.mark(regex1) + self.exclude_list.mark(regex2) + eq_(len(self.exclude_list), 2) + eq_(len(self.exclude_list.compiled), 2) + compiled_files = [x for x in self.exclude_list.compiled_files] + eq_(len(compiled_files), 2) + self.exclude_list.remove(regex2) + assert(regex2 not in self.exclude_list) + eq_(len(self.exclude_list), 1) + + def test_add_duplicate(self): + self.exclude_list.add(r"one") + eq_(1 , len(self.exclude_list)) + try: + self.exclude_list.add(r"one") + except Exception: + pass + eq_(1 , len(self.exclude_list)) + + def test_add_not_compilable(self): + # Trying to add a non-valid regex should not work and raise exception + regex = r"one))" + try: + self.exclude_list.add(regex) + except Exception as e: + # Make sure we raise a re.error so that the interface can process it + eq_(type(e), error) + added = 
self.exclude_list.mark(regex) + eq_(added, False) + eq_(len(self.exclude_list), 0) + eq_(len(self.exclude_list.compiled), 0) + compiled_files = [x for x in self.exclude_list.compiled_files] + eq_(len(compiled_files), 0) + + def test_force_add_not_compilable(self): + """Used when loading from XML for example""" + regex = r"one))" + try: + self.exclude_list.add(regex, forced=True) + except Exception as e: + # Should not get an exception here unless it's a duplicate regex + raise e + marked = self.exclude_list.mark(regex) + eq_(marked, False) # can't be marked since not compilable + eq_(len(self.exclude_list), 1) + eq_(len(self.exclude_list.compiled), 0) + compiled_files = [x for x in self.exclude_list.compiled_files] + eq_(len(compiled_files), 0) + # adding a duplicate + regex = r"one))" + try: + self.exclude_list.add(regex, forced=True) + except Exception as e: + # we should have this exception, and it shouldn't be added + assert type(e) is AlreadyThereException + eq_(len(self.exclude_list), 1) + eq_(len(self.exclude_list.compiled), 0) + + def test_rename_regex(self): + regex = r"one" + self.exclude_list.add(regex) + self.exclude_list.mark(regex) + regex_renamed = r"one))" + # Not compilable, can't be marked + self.exclude_list.rename(regex, regex_renamed) + assert regex not in self.exclude_list + assert regex_renamed in self.exclude_list + eq_(self.exclude_list.is_marked(regex_renamed), False) + self.exclude_list.mark(regex_renamed) + eq_(self.exclude_list.is_marked(regex_renamed), False) + regex_renamed_compilable = r"two" + self.exclude_list.rename(regex_renamed, regex_renamed_compilable) + assert regex_renamed_compilable in self.exclude_list + eq_(self.exclude_list.is_marked(regex_renamed), False) + self.exclude_list.mark(regex_renamed_compilable) + eq_(self.exclude_list.is_marked(regex_renamed_compilable), True) + eq_(len(self.exclude_list), 1) + # Should still be marked after rename + regex_compilable = r"three" + 
self.exclude_list.rename(regex_renamed_compilable, regex_compilable) + eq_(self.exclude_list.is_marked(regex_compilable), True) + + def test_restore_default(self): + """Only unmark previously added regexes and mark the pre-defined ones""" + regex = r"one" + self.exclude_list.add(regex) + self.exclude_list.mark(regex) + self.exclude_list.restore_defaults() + eq_(len(default_regexes), self.exclude_list.marked_count) + # added regex shouldn't be marked + eq_(self.exclude_list.is_marked(regex), False) + # added regex shouldn't be in compiled list either + compiled = [x for x in self.exclude_list.compiled] + assert regex not in compiled + # Only default regexes marked and in compiled list + for re in default_regexes: + assert self.exclude_list.is_marked(re) + found = False + for compiled_re in compiled: + if compiled_re.pattern == re: + found = True + if not found: + raise(Exception(f"Default RE {re} not found in compiled list.")) + continue + eq_(len(default_regexes), len(self.exclude_list.compiled)) + + +class TestCaseDictEmpty(TestCaseListEmpty): + """Same, but with dictionary implementation""" + def setup_method(self, method): + self.app = DupeGuru() + self.app.exclude_list = ExcludeDict() + self.exclude_list = self.app.exclude_list + + +def split_combined(pattern_object): + """Returns list of strings for each combined pattern""" + return [x for x in pattern_object.pattern.split("|")] + + +class TestCaseCompiledList(): + """Test consistency between combined or not""" + def setup_method(self, method): + self.e_separate = ExcludeList(combined_regex=False) + self.e_separate.restore_defaults() + self.e_combined = ExcludeList(combined_regex=True) + self.e_combined.restore_defaults() + + def test_same_number_of_expressions(self): + # We only get one combined Pattern item in a tuple, which is made of however many parts + eq_(len(split_combined(self.e_combined.compiled[0])), len(default_regexes)) + # We get as many as there are marked items + 
eq_(len(self.e_separate.compiled), len(default_regexes)) + exprs = split_combined(self.e_combined.compiled[0]) + # We should have the same number and the same expressions + eq_(len(exprs), len(self.e_separate.compiled)) + for expr in self.e_separate.compiled: + assert expr.pattern in exprs + + def test_compiled_files(self): + # test if separator is indeed checked properly to yield the output + regex1 = r"test/one/sub" + self.e_separate.add(regex1) + self.e_separate.mark(regex1) + self.e_combined.add(regex1) + self.e_combined.mark(regex1) + separate_compiled_dirs = self.e_separate.compiled + separate_compiled_files = [x for x in self.e_separate.compiled_files] + # HACK we need to call compiled property FIRST to generate the cache + combined_compiled_dirs = self.e_combined.compiled + # print(f"type: {type(self.e_combined.compiled_files[0])}") + # A generator returning only one item... ugh + combined_compiled_files = [x for x in self.e_combined.compiled_files][0] + print(f"compiled files: {combined_compiled_files}") + # Separate should give several plus the one added + eq_(len(separate_compiled_dirs), len(default_regexes) + 1) + # regex1 shouldn't be in the "files" version + eq_(len(separate_compiled_files), len(default_regexes)) + # Only one Pattern returned, which when split should be however many + 1 + eq_(len(split_combined(combined_compiled_dirs[0])), len(default_regexes) + 1) + # regex1 shouldn't be here either + eq_(len(split_combined(combined_compiled_files)), len(default_regexes)) + + +class TestCaseCompiledDict(TestCaseCompiledList): + def setup_method(self, method): + self.e_separate = ExcludeDict(combined_regex=False) + self.e_separate.restore_defaults() + self.e_combined = ExcludeDict(combined_regex=True) + self.e_combined.restore_defaults() diff --git a/tox.ini b/tox.ini index 33d32846..6a8b14be 100644 --- a/tox.ini +++ b/tox.ini @@ -10,7 +10,7 @@ setenv = PYTHON="{envpython}" commands = make modules - {posargs:py.test} core hscommon + {posargs:py.test 
core hscommon} deps = -r{toxinidir}/requirements.txt -r{toxinidir}/requirements-extra.txt