From 4a1641e39d66afd1c25ef5a621278f90353a3081 Mon Sep 17 00:00:00 2001 From: glubsy Date: Sat, 29 Aug 2020 03:57:00 +0200 Subject: [PATCH] Add test suite, fix bugs --- core/app.py | 2 +- core/directories.py | 31 ++-- core/exclude.py | 197 ++++++++++++++--------- core/fs.py | 11 +- core/tests/directories_test.py | 236 ++++++++++++++++++++++------ core/tests/exclude_test.py | 277 +++++++++++++++++++++++++++++++++ tox.ini | 2 +- 7 files changed, 613 insertions(+), 143 deletions(-) create mode 100644 core/tests/exclude_test.py diff --git a/core/app.py b/core/app.py index ee31a114..3f4c3266 100644 --- a/core/app.py +++ b/core/app.py @@ -26,7 +26,7 @@ from .pe.photo import get_delta_dimensions from .util import cmp_value, fix_surrogate_encoding from . import directories, results, export, fs, prioritize from .ignore import IgnoreList -from .exclude import ExcludeList as ExcludeList +from .exclude import ExcludeDict as ExcludeList from .scanner import ScanType from .gui.deletion_options import DeletionOptions from .gui.details_panel import DetailsPanel diff --git a/core/directories.py b/core/directories.py index 781ced90..aa6298d4 100644 --- a/core/directories.py +++ b/core/directories.py @@ -80,13 +80,12 @@ class Directories: # ---Private def _default_state_for_path(self, path): # New logic with regex filters - if self._exclude_list is not None and len(self._exclude_list) > 0: + if self._exclude_list is not None and self._exclude_list.mark_count > 0: # We iterate even if we only have one item here - for denied_path_re in self._exclude_list.compiled_combined: - if denied_path_re.match(str(path)): + for denied_path_re in self._exclude_list.compiled: + if denied_path_re.match(str(path.name)): return DirectoryState.Excluded - return None - # Old default logic, still used during initialization of DirectoryTree: + # return # We still use the old logic to force state on hidden dirs # Override this in subclasses to specify the state of some special folders. 
if path.name.startswith("."): return DirectoryState.Excluded @@ -95,7 +94,7 @@ class Directories: for root, dirs, files in os.walk(str(from_path)): j.check_if_cancelled() rootPath = Path(root) - state = self.get_state(root) + state = self.get_state(rootPath) if state == DirectoryState.Excluded: # Recursively get files from folders with lots of subfolder is expensive. However, there # might be a subfolder in this path that is not excluded. What we want to do is to skim @@ -105,16 +104,22 @@ class Directories: try: if state != DirectoryState.Excluded: # Old logic - if self._exclude_list is None or not len(self._exclude_list): + if self._exclude_list is None or not self._exclude_list.mark_count: found_files = [fs.get_file(rootPath + f, fileclasses=fileclasses) for f in files] else: found_files = [] + # print(f"len of files: {len(files)} {files}") for f in files: found = False - for expr in self._exclude_list.compiled_files_combined: - found = expr.match(f) - if found: + for expr in self._exclude_list.compiled_files: + if expr.match(f): + found = True break + if not found: + for expr in self._exclude_list.compiled_paths: + if expr.match(root + os.sep + f): + found = True + break if not found: found_files.append(fs.get_file(rootPath + f, fileclasses=fileclasses)) found_files = [f for f in found_files if f is not None] @@ -215,8 +220,14 @@ class Directories: if path in self.states: return self.states[path] state = self._default_state_for_path(path) or DirectoryState.Normal + # Save non-default states in cache, necessary for _get_files() + if state != DirectoryState.Normal: + self.states[path] = state + return state + prevlen = 0 # we loop through the states to find the longest matching prefix + # if the parent has a state in cache, return that state for p, s in self.states.items(): if p.is_parent_of(path) and len(p) > prevlen: prevlen = len(p) diff --git a/core/exclude.py b/core/exclude.py index 29bab540..e0e8a901 100644 --- a/core/exclude.py +++ b/core/exclude.py @@ 
-5,7 +5,8 @@ from .markable import Markable from xml.etree import ElementTree as ET # TODO: perhaps use regex module for better Unicode support? https://pypi.org/project/regex/ -# or perhaps also https://pypi.org/project/re2/ +# also https://pypi.org/project/re2/ +# TODO update the Result list with newly added regexes if possible import re from os import sep import logging @@ -13,8 +14,14 @@ import functools from hscommon.util import FileOrPath import time -default_regexes = [r".*thumbs", r"\.DS.Store", r"\.Trash", r".*Trash-Bin"] -forbidden_regexes = [r".*", r"\/.*", r".*\/.*"] +default_regexes = [r"^thumbs\.db$", # Obsolete after Windows XP + r"^\.DS_Store$", # MacOS metadata + r"^\.Trash\-.*", # Linux trash directories + r"^\$Recycle\.Bin$", # Windows + r"^\..*" # Hidden files + ] +# These are too aggressive +forbidden_regexes = [r".*", r"\/.*", r".*\/.*", r".*\..*"] def timer(func): @@ -59,36 +66,37 @@ class ExcludeList(Markable): # ---Override def __init__(self, combined_regex=False): Markable.__init__(self) - self._combined_regex = combined_regex + self._use_combined = combined_regex self._excluded = [] - self._count = 0 self._excluded_compiled = set() self._dirty = True - def __debug_test(self): - self.test_regexes = [ - r".*Recycle\.Bin$", r"denyme.*", r".*denyme", r".*/test/denyme*", - r".*/test/*denyme", r"denyme", r".*\/\..*", r"^\..*"] - for regex in self.test_regexes: - try: - self.add(regex) - except Exception as e: - print(f"Exception loading test regex {regex}: {e}") - continue - try: - self.mark(regex) - except Exception as e: - print(f"Exception marking test regex {regex}: {e}") - def __iter__(self): """Iterate in order.""" for item in self._excluded: regex = item[0] yield self.is_marked(regex), regex + def __contains__(self, item): + return self.isExcluded(item) + def __len__(self): - """Returns the number of marked regexes.""" - return len([x for marked, x in self if marked]) + """Returns the total number of regexes regardless of mark status.""" + 
return len(self._excluded) + + def __getitem__(self, key): + for item in self._excluded: + if item[0] == key: + return item + raise KeyError(f"Key {key} is not in exclusion list.") + + def __setitem__(self, key, value): + # TODO if necessary + pass + + def __delitem__(self, key): + # TODO if necessary + pass def is_markable(self, regex): return self._is_markable(regex) @@ -98,7 +106,7 @@ class ExcludeList(Markable): for item in self._excluded: if item[0] == regex: return item[1] - return False # should not be needed + return False # should not be necessary, regex SHOULD be in there def _did_mark(self, regex): self._add_compiled(regex) @@ -107,17 +115,19 @@ class ExcludeList(Markable): self._remove_compiled(regex) def _add_compiled(self, regex): - if self._combined_regex: - self._dirty = True + self._dirty = True + if self._use_combined: return for item in self._excluded: + # FIXME probably faster to just rebuild the set from the compiled instead of comparing strings if item[0] == regex: # no need to test if already present since it's a set() self._excluded_compiled.add(item[3]) + break def _remove_compiled(self, regex): - if self._combined_regex: - self._dirty = True + self._dirty = True + if self._use_combined: return for item in self._excluded_compiled: if regex in item.pattern: @@ -148,44 +158,65 @@ class ExcludeList(Markable): if item[0] == regex: return item[2] + def build_compiled_caches(self, combined=False): + if not combined: + self._cached_compiled_files =\ + [x for x in self._excluded_compiled if sep not in x.pattern] + self._cached_compiled_paths =\ + [x for x in self._excluded_compiled if sep in x.pattern] + return + # HACK returned as a tuple to get a free iterator to keep interface the same + # regardless of whether the client asked for combined or not + marked_count = [x for marked, x in self if marked] + # If there is no item, the compiled Pattern will be '' and match everything! 
+ if not marked_count: + self._cached_compiled_combined_all = [] + self._cached_compiled_combined_files = [] + self._cached_compiled_combined_paths = [] + else: + self._cached_compiled_combined_all =\ + (re.compile('|'.join(marked_count)),) + files_marked = [x for x in marked_count if sep not in x] + if not files_marked: + self._cached_compiled_combined_files = tuple() + else: + self._cached_compiled_combined_files =\ + (re.compile('|'.join(files_marked)),) + paths_marked = [x for x in marked_count if sep in x] + if not paths_marked: + self._cached_compiled_combined_paths = tuple() + else: + self._cached_compiled_combined_paths =\ + (re.compile('|'.join(paths_marked)),) + @property def compiled(self): """Should be used by other classes to retrieve the up-to-date list of patterns.""" - if not self._combined_regex: - return self._excluded_compiled - else: - return self.compiled_combined + if self._use_combined: + if self._dirty: + self.build_compiled_caches(True) + self._dirty = False + return self._cached_compiled_combined_all + return self._excluded_compiled @property def compiled_files(self): - """Should be used by other classes to retrieve the up-to-date list of patterns - for files only.""" - if not self._combined_regex: - # Return each compiled element separately - # return [compiled_pattern for compiled_pattern in self.compiled if sep not in compiled_pattern.pattern] - for compiled in self.compiled: - if sep not in compiled.pattern: - yield compiled - else: - return self.compiled_files_combined - - @property - def compiled_combined(self): + """When matching against filenames only, we probably won't be seeing any + directory separator, so we filter out regexes with os.sep in them. 
+ The interface should be expected to be a generator, even if it returns only + one item (one Pattern in the combined case).""" if self._dirty: - self._cached_compiled_combined =\ - re.compile('|'.join(x for marked, x in self if marked)) - # Must compute the filtered out version as well - self._cached_compiled_combined_files =\ - re.compile('|'.join(x for marked, x in self - if marked and sep not in x)) + self.build_compiled_caches(True if self._use_combined else False) self._dirty = False - # returned as a tuple to get a free iterator and to avoid subclassing - return (self._cached_compiled_combined,) + return self._cached_compiled_combined_files if self._use_combined else self._cached_compiled_files @property - def compiled_files_combined(self): - # returned as a tuple to get a free iterator and to avoid subclassing - return (self._cached_compiled_combined_files,) + def compiled_paths(self): + """Returns patterns with only separators in them, for more precise filtering.""" + if self._dirty: + self.build_compiled_caches(True if self._use_combined else False) + self._dirty = False + return self._cached_compiled_combined_paths if self._use_combined else self._cached_compiled_paths # ---Public def add(self, regex, forced=False): @@ -206,7 +237,11 @@ class ExcludeList(Markable): def _do_add(self, regex, iscompilable, exception, compiled): # We need to insert at the top self._excluded.insert(0, [regex, iscompilable, exception, compiled]) - # self._count = len(self._excluded) + + @property + def marked_count(self): + """Returns the number of marked regexes only.""" + return len([x for marked, x in self if marked]) def isExcluded(self, regex): for item in self._excluded: @@ -215,6 +250,7 @@ class ExcludeList(Markable): return False def clear(self): + """Not used and needs refactoring""" self._excluded = [] def remove(self, regex): @@ -224,25 +260,24 @@ class ExcludeList(Markable): self._remove_compiled(regex) def rename(self, regex, newregex): - # if regex not in 
self._excluded or regex == newregex: - # return + # if regex not in self._excluded: return if regex == newregex: return found = False - for item in self._excluded: - if regex == item[0]: - found = True - break - if not found: - return - - was_marked = self.is_marked(regex) - is_compilable, exception, compiled = self.compile_re(newregex) + was_marked = False + is_compilable = False for item in self._excluded: if item[0] == regex: + found = True + was_marked = self.is_marked(regex) + is_compilable, exception, compiled = self.compile_re(newregex) # We overwrite the found entry self._excluded[self._excluded.index(item)] =\ [newregex, is_compilable, exception, compiled] + self._remove_compiled(regex) + break + if not found: + return if is_compilable and was_marked: # Not marked by default when added, add it back self.mark(newregex) @@ -271,7 +306,6 @@ class ExcludeList(Markable): except Exception as e: logging.warning(f"Error while loading {infile}: {e}") self.restore_defaults() - self.__debug_test() return e marked = set() @@ -291,7 +325,6 @@ class ExcludeList(Markable): for item in marked: self.mark(item) - self.__debug_test() def save_to_xml(self, outfile): """Create a XML file that can be used by load_from_xml. 
@@ -314,13 +347,14 @@ class ExcludeDict(ExcludeList): to keep the index of each string-key as its sub-element and keep it updated whenever insert/remove is done.""" - def __init__(self): + def __init__(self, combined_regex=False): Markable.__init__(self) + self._use_combined = combined_regex # { "regex": { "index": int, "compilable": bool, "error": str, "compiled": Pattern or None}} # Note: "compilable" key should only be updated on add / rename self._excluded = {} - self._count = 0 self._excluded_compiled = set() + self._dirty = True def __iter__(self): """Iterate in order.""" @@ -338,8 +372,8 @@ class ExcludeDict(ExcludeList): return False def _add_compiled(self, regex): - if self._combined_regex: - self._dirty = True + self._dirty = True + if self._use_combined: return try: self._excluded_compiled.add(self._excluded[regex]["compiled"]) @@ -360,8 +394,12 @@ class ExcludeDict(ExcludeList): # We always insert at the top, so index should be 0 and other indices should be pushed by one for value in self._excluded.values(): value["index"] += 1 - self._excluded[regex] = {"index": 0, "compilable": iscompilable, "error": exception, "compiled": compiled} - # self._count = len(self._excluded) + self._excluded[regex] = { + "index": 0, + "compilable": iscompilable, + "error": exception, + "compiled": compiled + } def isExcluded(self, regex): if regex in self._excluded.keys(): @@ -369,6 +407,7 @@ class ExcludeDict(ExcludeList): return False def clear(self): + """Not used, need refactoring""" self._excluded = {} def remove(self, regex): @@ -391,7 +430,13 @@ class ExcludeDict(ExcludeList): was_marked = self.is_marked(regex) previous = self._excluded.pop(regex) iscompilable, error, compiled = self.compile_re(newregex) - self._excluded[newregex] = {"index": previous["index"], "compilable": iscompilable, "error": error, "compiled": compiled} + self._excluded[newregex] = { + "index": previous["index"], + "compilable": iscompilable, + "error": error, + "compiled": compiled + } + 
self._remove_compiled(regex) if was_marked and iscompilable: self.mark(newregex) diff --git a/core/fs.py b/core/fs.py index 90f400d9..f18186ae 100644 --- a/core/fs.py +++ b/core/fs.py @@ -245,7 +245,7 @@ class Folder(File): return not path.islink() and path.isdir() -def get_file(path, fileclasses=[File], deny_list_re=set()): +def get_file(path, fileclasses=[File]): """Wraps ``path`` around its appropriate :class:`File` class. Whether a class is "appropriate" is decided by :meth:`File.can_handle` @@ -255,15 +255,10 @@ def get_file(path, fileclasses=[File], deny_list_re=set()): """ for fileclass in fileclasses: if fileclass.can_handle(path): - # print(f"returning {path}") - # for expr in deny_list_re: - # if expr.match(str(path.name)): - # print(f"FOUND {repr(expr)} in {str(path.name)}") - # return return fileclass(path) -def get_files(path, fileclasses=[File], deny_list_re=set()): +def get_files(path, fileclasses=[File]): """Returns a list of :class:`File` for each file contained in ``path``. 
:param Path path: path to scan @@ -273,7 +268,7 @@ def get_files(path, fileclasses=[File], deny_list_re=set()): try: result = [] for path in path.listdir(): - file = get_file(path, fileclasses=fileclasses, deny_list_re=deny_list_re) + file = get_file(path, fileclasses=fileclasses) if file is not None: result.append(file) return result diff --git a/core/tests/directories_test.py b/core/tests/directories_test.py index 7273b566..1ce84fb4 100644 --- a/core/tests/directories_test.py +++ b/core/tests/directories_test.py @@ -20,6 +20,7 @@ from ..directories import ( AlreadyThereError, InvalidPathError, ) +from ..exclude import ExcludeList, ExcludeDict def create_fake_fs(rootpath): @@ -323,7 +324,7 @@ def test_get_state_returns_excluded_by_default_for_hidden_directories(tmpdir): def test_default_path_state_override(tmpdir): # It's possible for a subclass to override the default state of a path class MyDirectories(Directories): - def _default_state_for_path(self, path, denylist): + def _default_state_for_path(self, path): if "foobar" in path: return DirectoryState.Excluded @@ -343,52 +344,193 @@ def test_default_path_state_override(tmpdir): eq_(len(list(d.get_files())), 2) -def test_exclude_list_regular_expressions(tmpdir): - d = Directories() - d.deny_list_str.clear() - d.deny_list_re.clear() - d.deny_list_re_files.clear() - # This should only exlude the directory, but not the contained files if - # its status is set to normal after loading it in the directory tree - d.deny_list_str.add(r".*Recycle\.Bin$") - d.deny_list_str.add(r"denyme.*") - # d.deny_list_str.add(r".*denymetoo") - # d.deny_list_str.add(r"denyme") - d.deny_list_str.add(r".*\/\..*") - d.deny_list_str.add(r"^\..*") - d.compile_re() - p1 = Path(str(tmpdir)) - # Should be ignored on Windows only (by default) - p1["Recycle.Bin"].mkdir() - p1["Recycle.Bin/somerecycledfile"].open("w").close() +class TestExcludeList(): + def setup_method(self, method): + self.d = 
Directories(exclude_list=ExcludeList(combined_regex=False)) - p1["denyme_blah.txt"].open("w").close() - p1["blah_denymetoo"].open("w").close() - p1["blah_denyme"].open("w").close() + def get_files_and_expect_num_result(self, num_result): + """Calls get_files(), get the filenames only, print for debugging. + num_result is how many files are expected as a result.""" + print(f"EXCLUDED REGEX: paths {self.d._exclude_list.compiled_paths} \ +files: {self.d._exclude_list.compiled_files} all: {self.d._exclude_list.compiled}") + files = list(self.d.get_files()) + files = [file.name for file in files] + print(f"FINAL FILES {files}") + eq_(len(files), num_result) + return files - p1[".hidden_file"].open("w").close() - p1[".hidden_dir"].mkdir() - p1[".hidden_dir/somenormalfile1"].open("w").close() - p1[".hidden_dir/somenormalfile2_denyme"].open("w").close() + def test_exclude_recycle_bin_by_default(self, tmpdir): + regex = r"^.*Recycle\.Bin$" + self.d._exclude_list.add(regex) + self.d._exclude_list.mark(regex) + p1 = Path(str(tmpdir)) + p1["$Recycle.Bin"].mkdir() + p1["$Recycle.Bin"]["subdir"].mkdir() + self.d.add_path(p1) + eq_(self.d.get_state(p1["$Recycle.Bin"]), DirectoryState.Excluded) + # By default, subdirs should be excluded too, but this can be overriden separately + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Excluded) + self.d.set_state(p1["$Recycle.Bin"]["subdir"], DirectoryState.Normal) + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal) - p1["foobar"].mkdir() - p1["foobar/somefile"].open("w").close() - d.add_path(p1) - eq_(d.get_state(p1["Recycle.Bin"]), DirectoryState.Excluded) - eq_(d.get_state(p1["foobar"]), DirectoryState.Normal) - files = list(d.get_files()) - files = [file.name for file in files] - print(f"first files: {files}") - assert "somerecycledfile" not in files - assert "denyme_blah.txt" not in files - assert ".hidden_file" not in files - assert "somefile1" not in files - assert "somefile2_denyme" not 
in files - # Overriding the default state from the Directory Tree - d.set_state(p1["Recycle.Bin"], DirectoryState.Normal) - d.set_state(p1[".hidden_dir"], DirectoryState.Normal) - files = list(d.get_files()) - files = [file.name for file in files] - print(f"second files: {files}") - assert "somerecycledfile" in files - assert "somenormalfile1" in files + def test_exclude_refined(self, tmpdir): + regex1 = r"^\$Recycle\.Bin$" + self.d._exclude_list.add(regex1) + self.d._exclude_list.mark(regex1) + p1 = Path(str(tmpdir)) + p1["$Recycle.Bin"].mkdir() + p1["$Recycle.Bin"]["somefile.png"].open("w").close() + p1["$Recycle.Bin"]["some_unwanted_file.jpg"].open("w").close() + p1["$Recycle.Bin"]["subdir"].mkdir() + p1["$Recycle.Bin"]["subdir"]["somesubdirfile.png"].open("w").close() + p1["$Recycle.Bin"]["subdir"]["unwanted_subdirfile.gif"].open("w").close() + p1["$Recycle.Bin"]["subdar"].mkdir() + p1["$Recycle.Bin"]["subdar"]["somesubdarfile.jpeg"].open("w").close() + p1["$Recycle.Bin"]["subdar"]["unwanted_subdarfile.png"].open("w").close() + self.d.add_path(p1["$Recycle.Bin"]) + + # Filter should set the default state to Excluded + eq_(self.d.get_state(p1["$Recycle.Bin"]), DirectoryState.Excluded) + # The subdir should inherit its parent state + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Excluded) + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdar"]), DirectoryState.Excluded) + # Override a child path's state + self.d.set_state(p1["$Recycle.Bin"]["subdir"], DirectoryState.Normal) + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal) + # Parent should keep its default state, and the other child too + eq_(self.d.get_state(p1["$Recycle.Bin"]), DirectoryState.Excluded) + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdar"]), DirectoryState.Excluded) + # print(f"get_folders(): {[x for x in self.d.get_folders()]}") + + # only the 2 files directly under the Normal directory + files = self.get_files_and_expect_num_result(2) + assert 
"somefile.png" not in files + assert "some_unwanted_file.jpg" not in files + assert "somesubdarfile.jpeg" not in files + assert "unwanted_subdarfile.png" not in files + assert "somesubdirfile.png" in files + assert "unwanted_subdirfile.gif" in files + # Overriding the parent should enable all children + self.d.set_state(p1["$Recycle.Bin"], DirectoryState.Normal) + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdar"]), DirectoryState.Normal) + # all files there + files = self.get_files_and_expect_num_result(6) + assert "somefile.png" in files + assert "some_unwanted_file.jpg" in files + + # This should still filter out files under directory, despite the Normal state + regex2 = r".*unwanted.*" + self.d._exclude_list.add(regex2) + self.d._exclude_list.mark(regex2) + files = self.get_files_and_expect_num_result(3) + assert "somefile.png" in files + assert "some_unwanted_file.jpg" not in files + assert "unwanted_subdirfile.gif" not in files + assert "unwanted_subdarfile.png" not in files + + regex3 = r".*Recycle\.Bin\/.*unwanted.*subdirfile.*" + self.d._exclude_list.rename(regex2, regex3) + assert self.d._exclude_list.error(regex3) is None + # print(f"get_folders(): {[x for x in self.d.get_folders()]}") + # Directory shouldn't change its state here, unless explicitly done by user + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal) + files = self.get_files_and_expect_num_result(5) + assert "unwanted_subdirfile.gif" not in files + assert "unwanted_subdarfile.png" in files + + # using end of line character should only filter the directory, or file ending with subdir + regex4 = r".*subdir$" + self.d._exclude_list.rename(regex3, regex4) + assert self.d._exclude_list.error(regex4) is None + p1["$Recycle.Bin"]["subdar"]["file_ending_with_subdir"].open("w").close() + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Excluded) + files = self.get_files_and_expect_num_result(4) + assert "file_ending_with_subdir" not in files + assert
"somesubdarfile.jpeg" in files + assert "somesubdirfile.png" not in files + assert "unwanted_subdirfile.gif" not in files + self.d.set_state(p1["$Recycle.Bin"]["subdir"], DirectoryState.Normal) + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal) + # print(f"get_folders(): {[x for x in self.d.get_folders()]}") + files = self.get_files_and_expect_num_result(6) + assert "file_ending_with_subdir" not in files + assert "somesubdirfile.png" in files + assert "unwanted_subdirfile.gif" in files + + regex5 = r".*subdir.*" + self.d._exclude_list.rename(regex4, regex5) + # Files containing substring should be filtered + eq_(self.d.get_state(p1["$Recycle.Bin"]["subdir"]), DirectoryState.Normal) + # The path should not match, only the filename, the "subdir" in the directory name shouldn't matter + p1["$Recycle.Bin"]["subdir"]["file_which_shouldnt_match"].open("w").close() + files = self.get_files_and_expect_num_result(5) + assert "somesubdirfile.png" not in files + assert "unwanted_subdirfile.gif" not in files + assert "file_ending_with_subdir" not in files + assert "file_which_shouldnt_match" in files + + def test_japanese_unicode(self, tmpdir): + p1 = Path(str(tmpdir)) + p1["$Recycle.Bin"].mkdir() + p1["$Recycle.Bin"]["somerecycledfile.png"].open("w").close() + p1["$Recycle.Bin"]["some_unwanted_file.jpg"].open("w").close() + p1["$Recycle.Bin"]["subdir"].mkdir() + p1["$Recycle.Bin"]["subdir"]["過去白濁物語~]_カラー.jpg"].open("w").close() + p1["$Recycle.Bin"]["思叫物語"].mkdir() + p1["$Recycle.Bin"]["思叫物語"]["なししろ会う前"].open("w").close() + p1["$Recycle.Bin"]["思叫物語"]["堂~ロ"].open("w").close() + self.d.add_path(p1["$Recycle.Bin"]) + regex3 = r".*物語.*" + self.d._exclude_list.add(regex3) + self.d._exclude_list.mark(regex3) + # print(f"get_folders(): {[x for x in self.d.get_folders()]}") + eq_(self.d.get_state(p1["$Recycle.Bin"]["思叫物語"]), DirectoryState.Excluded) + files = self.get_files_and_expect_num_result(2) + assert "過去白濁物語~]_カラー.jpg" not in files + assert "なししろ会う前" 
not in files + assert "堂~ロ" not in files + # using end of line character should only filter that directory, not affecting its files + regex4 = r".*物語$" + self.d._exclude_list.rename(regex3, regex4) + assert self.d._exclude_list.error(regex4) is None + self.d.set_state(p1["$Recycle.Bin"]["思叫物語"], DirectoryState.Normal) + files = self.get_files_and_expect_num_result(5) + assert "過去白濁物語~]_カラー.jpg" in files + assert "なししろ会う前" in files + assert "堂~ロ" in files + + def test_get_state_returns_excluded_for_hidden_directories_and_files(self, tmpdir): + # This regex only work for files, not paths + regex = r"^\..*$" + self.d._exclude_list.add(regex) + self.d._exclude_list.mark(regex) + p1 = Path(str(tmpdir)) + p1["foobar"].mkdir() + p1["foobar"][".hidden_file.txt"].open("w").close() + p1["foobar"][".hidden_dir"].mkdir() + p1["foobar"][".hidden_dir"]["foobar.jpg"].open("w").close() + p1["foobar"][".hidden_dir"][".hidden_subfile.png"].open("w").close() + self.d.add_path(p1["foobar"]) + # It should not inherit its parent's state originally + eq_(self.d.get_state(p1["foobar"][".hidden_dir"]), DirectoryState.Excluded) + self.d.set_state(p1["foobar"][".hidden_dir"], DirectoryState.Normal) + # The files should still be filtered + files = self.get_files_and_expect_num_result(1) + assert ".hidden_file.txt" not in files + assert ".hidden_subfile.png" not in files + assert "foobar.jpg" in files + + +class TestExcludeDict(TestExcludeList): + def setup_method(self, method): + self.d = Directories(exclude_list=ExcludeDict(combined_regex=False)) + + +class TestExcludeListCombined(TestExcludeList): + def setup_method(self, method): + self.d = Directories(exclude_list=ExcludeList(combined_regex=True)) + + +class TestExcludeDictCombined(TestExcludeList): + def setup_method(self, method): + self.d = Directories(exclude_list=ExcludeDict(combined_regex=True)) diff --git a/core/tests/exclude_test.py b/core/tests/exclude_test.py new file mode 100644 index 00000000..0dc4a033 --- /dev/null +++ 
b/core/tests/exclude_test.py @@ -0,0 +1,277 @@ +# Copyright 2016 Hardcoded Software (http://www.hardcoded.net) +# +# This software is licensed under the "GPLv3" License as described in the "LICENSE" file, +# which should be included with this package. The terms are also available at +# http://www.gnu.org/licenses/gpl-3.0.html + +import io +# import os.path as op + +from xml.etree import ElementTree as ET + +# from pytest import raises +from hscommon.testutil import eq_ + +from .base import DupeGuru +from ..exclude import ExcludeList, ExcludeDict, default_regexes, AlreadyThereException + +from re import error + + +# Two slightly different implementations here, one around a list of lists, +# and another around a dictionary. + + +class TestCaseListXMLLoading: + def setup_method(self, method): + self.exclude_list = ExcludeList() + + def test_load_non_existant_file(self): + # Loads the pre-defined regexes + self.exclude_list.load_from_xml("non_existant.xml") + eq_(len(default_regexes), len(self.exclude_list)) + # they should also be marked by default + eq_(len(default_regexes), self.exclude_list.marked_count) + + def test_save_to_xml(self): + f = io.BytesIO() + self.exclude_list.save_to_xml(f) + f.seek(0) + doc = ET.parse(f) + root = doc.getroot() + eq_("exclude_list", root.tag) + + def test_save_and_load(self, tmpdir): + e1 = ExcludeList() + e2 = ExcludeList() + eq_(len(e1), 0) + e1.add(r"one") + e1.mark(r"one") + e1.add(r"two") + tmpxml = str(tmpdir.join("exclude_testunit.xml")) + e1.save_to_xml(tmpxml) + e2.load_from_xml(tmpxml) + # We should have the default regexes + assert r"one" in e2 + assert r"two" in e2 + eq_(len(e2), 2) + eq_(e2.marked_count, 1) + + def test_load_xml_with_garbage_and_missing_elements(self): + root = ET.Element("foobar") # The root element shouldn't matter + exclude_node = ET.SubElement(root, "bogus") + exclude_node.set("regex", "None") + exclude_node.set("marked", "y") + + exclude_node = ET.SubElement(root, "exclude") + 
exclude_node.set("regex", "one") + # marked field invalid + exclude_node.set("markedddd", "y") + + exclude_node = ET.SubElement(root, "exclude") + exclude_node.set("regex", "two") + # missing marked field + + exclude_node = ET.SubElement(root, "exclude") + exclude_node.set("regex", "three") + exclude_node.set("markedddd", "pazjbjepo") + + f = io.BytesIO() + tree = ET.ElementTree(root) + tree.write(f, encoding="utf-8") + f.seek(0) + self.exclude_list.load_from_xml(f) + print(f"{[x for x in self.exclude_list]}") + # only the three "exclude" nodes should be added, + eq_(3, len(self.exclude_list)) + # None should be marked + eq_(0, self.exclude_list.marked_count) + + +class TestCaseDictXMLLoading(TestCaseListXMLLoading): + def setup_method(self, method): + self.exclude_list = ExcludeDict() + + +class TestCaseListEmpty: + def setup_method(self, method): + self.app = DupeGuru() + self.app.exclude_list = ExcludeList() + self.exclude_list = self.app.exclude_list + + def test_add_mark_and_remove_regex(self): + regex1 = r"one" + regex2 = r"two" + self.exclude_list.add(regex1) + assert(regex1 in self.exclude_list) + self.exclude_list.add(regex2) + self.exclude_list.mark(regex1) + self.exclude_list.mark(regex2) + eq_(len(self.exclude_list), 2) + eq_(len(self.exclude_list.compiled), 2) + compiled_files = [x for x in self.exclude_list.compiled_files] + eq_(len(compiled_files), 2) + self.exclude_list.remove(regex2) + assert(regex2 not in self.exclude_list) + eq_(len(self.exclude_list), 1) + + def test_add_duplicate(self): + self.exclude_list.add(r"one") + eq_(1 , len(self.exclude_list)) + try: + self.exclude_list.add(r"one") + except Exception: + pass + eq_(1 , len(self.exclude_list)) + + def test_add_not_compilable(self): + # Trying to add a non-valid regex should not work and raise exception + regex = r"one))" + try: + self.exclude_list.add(regex) + except Exception as e: + # Make sure we raise a re.error so that the interface can process it + eq_(type(e), error) + added = 
self.exclude_list.mark(regex) + eq_(added, False) + eq_(len(self.exclude_list), 0) + eq_(len(self.exclude_list.compiled), 0) + compiled_files = [x for x in self.exclude_list.compiled_files] + eq_(len(compiled_files), 0) + + def test_force_add_not_compilable(self): + """Used when loading from XML for example""" + regex = r"one))" + try: + self.exclude_list.add(regex, forced=True) + except Exception as e: + # Should not get an exception here unless it's a duplicate regex + raise e + marked = self.exclude_list.mark(regex) + eq_(marked, False) # can't be marked since not compilable + eq_(len(self.exclude_list), 1) + eq_(len(self.exclude_list.compiled), 0) + compiled_files = [x for x in self.exclude_list.compiled_files] + eq_(len(compiled_files), 0) + # adding a duplicate + regex = r"one))" + try: + self.exclude_list.add(regex, forced=True) + except Exception as e: + # we should have this exception, and it shouldn't be added + assert type(e) is AlreadyThereException + eq_(len(self.exclude_list), 1) + eq_(len(self.exclude_list.compiled), 0) + + def test_rename_regex(self): + regex = r"one" + self.exclude_list.add(regex) + self.exclude_list.mark(regex) + regex_renamed = r"one))" + # Not compilable, can't be marked + self.exclude_list.rename(regex, regex_renamed) + assert regex not in self.exclude_list + assert regex_renamed in self.exclude_list + eq_(self.exclude_list.is_marked(regex_renamed), False) + self.exclude_list.mark(regex_renamed) + eq_(self.exclude_list.is_marked(regex_renamed), False) + regex_renamed_compilable = r"two" + self.exclude_list.rename(regex_renamed, regex_renamed_compilable) + assert regex_renamed_compilable in self.exclude_list + eq_(self.exclude_list.is_marked(regex_renamed), False) + self.exclude_list.mark(regex_renamed_compilable) + eq_(self.exclude_list.is_marked(regex_renamed_compilable), True) + eq_(len(self.exclude_list), 1) + # Should still be marked after rename + regex_compilable = r"three" + 
self.exclude_list.rename(regex_renamed_compilable, regex_compilable) + eq_(self.exclude_list.is_marked(regex_compilable), True) + + def test_restore_default(self): + """Only unmark previously added regexes and mark the pre-defined ones""" + regex = r"one" + self.exclude_list.add(regex) + self.exclude_list.mark(regex) + self.exclude_list.restore_defaults() + eq_(len(default_regexes), self.exclude_list.marked_count) + # added regex shouldn't be marked + eq_(self.exclude_list.is_marked(regex), False) + # added regex shouldn't be in compiled list either + compiled = [x for x in self.exclude_list.compiled] + assert regex not in compiled + # Only default regexes marked and in compiled list + for re in default_regexes: + assert self.exclude_list.is_marked(re) + found = False + for compiled_re in compiled: + if compiled_re.pattern == re: + found = True + if not found: + raise(Exception(f"Default RE {re} not found in compiled list.")) + continue + eq_(len(default_regexes), len(self.exclude_list.compiled)) + + +class TestCaseDictEmpty(TestCaseListEmpty): + """Same, but with dictionary implementation""" + def setup_method(self, method): + self.app = DupeGuru() + self.app.exclude_list = ExcludeDict() + self.exclude_list = self.app.exclude_list + + +def split_combined(pattern_object): + """Returns list of strings for each combined pattern""" + return [x for x in pattern_object.pattern.split("|")] + + +class TestCaseCompiledList(): + """Test consistency between combined or not""" + def setup_method(self, method): + self.e_separate = ExcludeList(combined_regex=False) + self.e_separate.restore_defaults() + self.e_combined = ExcludeList(combined_regex=True) + self.e_combined.restore_defaults() + + def test_same_number_of_expressions(self): + # We only get one combined Pattern item in a tuple, which is made of however many parts + eq_(len(split_combined(self.e_combined.compiled[0])), len(default_regexes)) + # We get as many as there are marked items + 
eq_(len(self.e_separate.compiled), len(default_regexes)) + exprs = split_combined(self.e_combined.compiled[0]) + # We should have the same number and the same expressions + eq_(len(exprs), len(self.e_separate.compiled)) + for expr in self.e_separate.compiled: + assert expr.pattern in exprs + + def test_compiled_files(self): + # test if separator is indeed checked properly to yield the output + regex1 = r"test/one/sub" + self.e_separate.add(regex1) + self.e_separate.mark(regex1) + self.e_combined.add(regex1) + self.e_combined.mark(regex1) + separate_compiled_dirs = self.e_separate.compiled + separate_compiled_files = [x for x in self.e_separate.compiled_files] + # HACK we need to call compiled property FIRST to generate the cache + combined_compiled_dirs = self.e_combined.compiled + # print(f"type: {type(self.e_combined.compiled_files[0])}") + # A generator returning only one item... ugh + combined_compiled_files = [x for x in self.e_combined.compiled_files][0] + print(f"compiled files: {combined_compiled_files}") + # Separate should give several plus the one added + eq_(len(separate_compiled_dirs), len(default_regexes) + 1) + # regex1 shouldn't be in the "files" version + eq_(len(separate_compiled_files), len(default_regexes)) + # Only one Pattern returned, which when split should be however many + 1 + eq_(len(split_combined(combined_compiled_dirs[0])), len(default_regexes) + 1) + # regex1 shouldn't be here either + eq_(len(split_combined(combined_compiled_files)), len(default_regexes)) + + +class TestCaseCompiledDict(TestCaseCompiledList): + def setup_method(self, method): + self.e_separate = ExcludeDict(combined_regex=False) + self.e_separate.restore_defaults() + self.e_combined = ExcludeDict(combined_regex=True) + self.e_combined.restore_defaults() diff --git a/tox.ini b/tox.ini index 33d32846..6a8b14be 100644 --- a/tox.ini +++ b/tox.ini @@ -10,7 +10,7 @@ setenv = PYTHON="{envpython}" commands = make modules - {posargs:py.test} core hscommon + {posargs:py.test 
core hscommon} deps = -r{toxinidir}/requirements.txt -r{toxinidir}/requirements-extra.txt