mirror of https://github.com/arsenetar/dupeguru.git

Changed the build system (that commit is *huge*)
--HG--
rename : base/cocoa/AppDelegate.h => cocoa/base/AppDelegate.h
rename : base/cocoa/AppDelegate.m => cocoa/base/AppDelegate.m
rename : base/cocoa/Consts.h => cocoa/base/Consts.h
rename : base/cocoa/DetailsPanel.h => cocoa/base/DetailsPanel.h
rename : base/cocoa/DetailsPanel.m => cocoa/base/DetailsPanel.m
rename : base/cocoa/DirectoryPanel.h => cocoa/base/DirectoryPanel.h
rename : base/cocoa/DirectoryPanel.m => cocoa/base/DirectoryPanel.m
rename : base/cocoa/PyDupeGuru.h => cocoa/base/PyDupeGuru.h
rename : base/cocoa/ResultWindow.h => cocoa/base/ResultWindow.h
rename : base/cocoa/ResultWindow.m => cocoa/base/ResultWindow.m
rename : base/cocoa/dsa_pub.pem => cocoa/base/dsa_pub.pem
rename : base/cocoa/xib/DetailsPanel.xib => cocoa/base/xib/DetailsPanel.xib
rename : base/cocoa/xib/DirectoryPanel.xib => cocoa/base/xib/DirectoryPanel.xib
rename : base/cocoa/xib/MainMenu.xib => cocoa/base/xib/MainMenu.xib
rename : me/cocoa/AppDelegate.h => cocoa/me/AppDelegate.h
rename : me/cocoa/AppDelegate.m => cocoa/me/AppDelegate.m
rename : me/cocoa/Consts.h => cocoa/me/Consts.h
rename : me/cocoa/DetailsPanel.h => cocoa/me/DetailsPanel.h
rename : me/cocoa/DetailsPanel.m => cocoa/me/DetailsPanel.m
rename : me/cocoa/DirectoryPanel.h => cocoa/me/DirectoryPanel.h
rename : me/cocoa/DirectoryPanel.m => cocoa/me/DirectoryPanel.m
rename : me/cocoa/Info.plist => cocoa/me/Info.plist
rename : me/cocoa/PyDupeGuru.h => cocoa/me/PyDupeGuru.h
rename : me/cocoa/ResultWindow.h => cocoa/me/ResultWindow.h
rename : me/cocoa/ResultWindow.m => cocoa/me/ResultWindow.m
rename : me/cocoa/dupeguru.icns => cocoa/me/dupeguru.icns
rename : me/cocoa/dupeguru.xcodeproj/project.pbxproj => cocoa/me/dupeguru.xcodeproj/project.pbxproj
rename : me/cocoa/gen.py => cocoa/me/gen.py
rename : me/cocoa/main.m => cocoa/me/main.m
rename : me/cocoa/py/dg_cocoa.py => cocoa/me/py/dg_cocoa.py
rename : me/cocoa/py/setup.py => cocoa/me/py/setup.py
rename : me/cocoa/xib/Preferences.xib => cocoa/me/xib/Preferences.xib
rename : pe/cocoa/AppDelegate.h => cocoa/pe/AppDelegate.h
rename : pe/cocoa/AppDelegate.m => cocoa/pe/AppDelegate.m
rename : pe/cocoa/Consts.h => cocoa/pe/Consts.h
rename : pe/cocoa/DetailsPanel.h => cocoa/pe/DetailsPanel.h
rename : pe/cocoa/DetailsPanel.m => cocoa/pe/DetailsPanel.m
rename : pe/cocoa/DirectoryPanel.h => cocoa/pe/DirectoryPanel.h
rename : pe/cocoa/DirectoryPanel.m => cocoa/pe/DirectoryPanel.m
rename : pe/cocoa/Info.plist => cocoa/pe/Info.plist
rename : pe/cocoa/PictureBlocks.h => cocoa/pe/PictureBlocks.h
rename : pe/cocoa/PictureBlocks.m => cocoa/pe/PictureBlocks.m
rename : pe/cocoa/PyDupeGuru.h => cocoa/pe/PyDupeGuru.h
rename : pe/cocoa/ResultWindow.h => cocoa/pe/ResultWindow.h
rename : pe/cocoa/ResultWindow.m => cocoa/pe/ResultWindow.m
rename : pe/cocoa/dupeguru.icns => cocoa/pe/dupeguru.icns
rename : pe/cocoa/dupeguru.xcodeproj/project.pbxproj => cocoa/pe/dupeguru.xcodeproj/project.pbxproj
rename : pe/cocoa/gen.py => cocoa/pe/gen.py
rename : pe/cocoa/main.m => cocoa/pe/main.m
rename : pe/cocoa/py/dg_cocoa.py => cocoa/pe/py/dg_cocoa.py
rename : pe/cocoa/py/setup.py => cocoa/pe/py/setup.py
rename : pe/cocoa/xib/DetailsPanel.xib => cocoa/pe/xib/DetailsPanel.xib
rename : pe/cocoa/xib/Preferences.xib => cocoa/pe/xib/Preferences.xib
rename : se/cocoa/AppDelegate.h => cocoa/se/AppDelegate.h
rename : se/cocoa/AppDelegate.m => cocoa/se/AppDelegate.m
rename : se/cocoa/Consts.h => cocoa/se/Consts.h
rename : se/cocoa/DetailsPanel.h => cocoa/se/DetailsPanel.h
rename : se/cocoa/DetailsPanel.m => cocoa/se/DetailsPanel.m
rename : se/cocoa/DirectoryPanel.h => cocoa/se/DirectoryPanel.h
rename : se/cocoa/DirectoryPanel.m => cocoa/se/DirectoryPanel.m
rename : se/cocoa/Info.plist => cocoa/se/Info.plist
rename : se/cocoa/PyDupeGuru.h => cocoa/se/PyDupeGuru.h
rename : se/cocoa/ResultWindow.h => cocoa/se/ResultWindow.h
rename : se/cocoa/ResultWindow.m => cocoa/se/ResultWindow.m
rename : se/cocoa/dupeguru.icns => cocoa/se/dupeguru.icns
rename : se/cocoa/dupeguru.xcodeproj/project.pbxproj => cocoa/se/dupeguru.xcodeproj/project.pbxproj
rename : se/cocoa/gen.py => cocoa/se/gen.py
rename : se/cocoa/main.m => cocoa/se/main.m
rename : se/cocoa/py/dg_cocoa.py => cocoa/se/py/dg_cocoa.py
rename : se/cocoa/py/setup.py => cocoa/se/py/setup.py
rename : se/cocoa/xib/Preferences.xib => cocoa/se/xib/Preferences.xib
rename : base/core/LICENSE => core/LICENSE
rename : base/core/__init__.py => core/__init__.py
rename : base/core/app.py => core/app.py
rename : base/core/app_cocoa.py => core/app_cocoa.py
rename : base/core/data.py => core/data.py
rename : base/core/directories.py => core/directories.py
rename : base/core/engine.py => core/engine.py
rename : base/core/export.py => core/export.py
rename : base/core/fs.py => core/fs.py
rename : base/core/ignore.py => core/ignore.py
rename : base/core/results.py => core/results.py
rename : base/core/scanner.py => core/scanner.py
rename : base/core/tests/__init__.py => core/tests/__init__.py
rename : base/core/tests/app_cocoa_test.py => core/tests/app_cocoa_test.py
rename : base/core/tests/app_test.py => core/tests/app_test.py
rename : base/core/tests/data.py => core/tests/data.py
rename : base/core/tests/directories_test.py => core/tests/directories_test.py
rename : base/core/tests/engine_test.py => core/tests/engine_test.py
rename : base/core/tests/ignore_test.py => core/tests/ignore_test.py
rename : base/core/tests/results_test.py => core/tests/results_test.py
rename : base/core/tests/scanner_test.py => core/tests/scanner_test.py
rename : me/core/__init__.py => core_me/__init__.py
rename : me/core/app_cocoa.py => core_me/app_cocoa.py
rename : me/core/data.py => core_me/data.py
rename : me/core/fs.py => core_me/fs.py
rename : me/core/scanner.py => core_me/scanner.py
rename : me/core/tests/__init__.py => core_me/tests/__init__.py
rename : me/core/tests/scanner_test.py => core_me/tests/scanner_test.py
rename : pe/core/LICENSE => core_pe/LICENSE
rename : pe/core/__init__.py => core_pe/__init__.py
rename : pe/core/app_cocoa.py => core_pe/app_cocoa.py
rename : pe/core/block.py => core_pe/block.py
rename : pe/core/cache.py => core_pe/cache.py
rename : pe/core/data.py => core_pe/data.py
rename : pe/core/gen.py => core_pe/gen.py
rename : pe/core/matchbase.py => core_pe/matchbase.py
rename : pe/core/modules/block/block.pyx => core_pe/modules/block/block.pyx
rename : pe/core/modules/block/setup.py => core_pe/modules/block/setup.py
rename : pe/core/modules/cache/cache.pyx => core_pe/modules/cache/cache.pyx
rename : pe/core/modules/cache/setup.py => core_pe/modules/cache/setup.py
rename : pe/core/scanner.py => core_pe/scanner.py
rename : pe/core/tests/__init__.py => core_pe/tests/__init__.py
rename : pe/core/tests/block_test.py => core_pe/tests/block_test.py
rename : pe/core/tests/cache_test.py => core_pe/tests/cache_test.py
rename : se/core/LICENSE => core_se/LICENSE
rename : se/core/__init__.py => core_se/__init__.py
rename : se/core/app_cocoa.py => core_se/app_cocoa.py
rename : se/core/data.py => core_se/data.py
rename : se/core/fs.py => core_se/fs.py
rename : se/core/tests/__init__.py => core_se/tests/__init__.py
rename : se/core/tests/fs_test.py => core_se/tests/fs_test.py
rename : me/help/LICENSE => help_me/LICENSE
rename : me/help/__init__.py => help_me/__init__.py
rename : me/help/changelog.yaml => help_me/changelog.yaml
rename : me/help/gen.py => help_me/gen.py
rename : me/help/skeleton/hardcoded.css => help_me/skeleton/hardcoded.css
rename : me/help/skeleton/images/hs_title.png => help_me/skeleton/images/hs_title.png
rename : me/help/templates/base_dg.mako => help_me/templates/base_dg.mako
rename : me/help/templates/credits.mako => help_me/templates/credits.mako
rename : me/help/templates/directories.mako => help_me/templates/directories.mako
rename : me/help/templates/faq.mako => help_me/templates/faq.mako
rename : me/help/templates/intro.mako => help_me/templates/intro.mako
rename : me/help/templates/power_marker.mako => help_me/templates/power_marker.mako
rename : me/help/templates/preferences.mako => help_me/templates/preferences.mako
rename : me/help/templates/quick_start.mako => help_me/templates/quick_start.mako
rename : me/help/templates/results.mako => help_me/templates/results.mako
rename : me/help/templates/versions.mako => help_me/templates/versions.mako
rename : pe/help/LICENSE => help_pe/LICENSE
rename : pe/help/__init__.py => help_pe/__init__.py
rename : pe/help/changelog.yaml => help_pe/changelog.yaml
rename : pe/help/gen.py => help_pe/gen.py
rename : pe/help/skeleton/hardcoded.css => help_pe/skeleton/hardcoded.css
rename : pe/help/skeleton/images/hs_title.png => help_pe/skeleton/images/hs_title.png
rename : pe/help/templates/base_dg.mako => help_pe/templates/base_dg.mako
rename : pe/help/templates/credits.mako => help_pe/templates/credits.mako
rename : pe/help/templates/directories.mako => help_pe/templates/directories.mako
rename : pe/help/templates/faq.mako => help_pe/templates/faq.mako
rename : pe/help/templates/intro.mako => help_pe/templates/intro.mako
rename : pe/help/templates/power_marker.mako => help_pe/templates/power_marker.mako
rename : pe/help/templates/preferences.mako => help_pe/templates/preferences.mako
rename : pe/help/templates/quick_start.mako => help_pe/templates/quick_start.mako
rename : pe/help/templates/results.mako => help_pe/templates/results.mako
rename : pe/help/templates/versions.mako => help_pe/templates/versions.mako
rename : se/help/LICENSE => help_se/LICENSE
rename : se/help/changelog.yaml => help_se/changelog.yaml
rename : se/help/gen.py => help_se/gen.py
rename : se/help/skeleton/hardcoded.css => help_se/skeleton/hardcoded.css
rename : se/help/skeleton/images/hs_title.png => help_se/skeleton/images/hs_title.png
rename : se/help/templates/base_dg.mako => help_se/templates/base_dg.mako
rename : se/help/templates/credits.mako => help_se/templates/credits.mako
rename : se/help/templates/directories.mako => help_se/templates/directories.mako
rename : se/help/templates/faq.mako => help_se/templates/faq.mako
rename : se/help/templates/intro.mako => help_se/templates/intro.mako
rename : se/help/templates/power_marker.mako => help_se/templates/power_marker.mako
rename : se/help/templates/preferences.mako => help_se/templates/preferences.mako
rename : se/help/templates/quick_start.mako => help_se/templates/quick_start.mako
rename : se/help/templates/results.mako => help_se/templates/results.mako
rename : se/help/templates/versions.mako => help_se/templates/versions.mako
rename : base/qt/WARNING => qt/WARNING
rename : base/qt/__init__.py => qt/base/__init__.py
rename : base/qt/app.py => qt/base/app.py
rename : base/qt/details_table.py => qt/base/details_table.py
rename : base/qt/dg.qrc => qt/base/dg.qrc
rename : base/qt/directories_dialog.py => qt/base/directories_dialog.py
rename : base/qt/directories_dialog.ui => qt/base/directories_dialog.ui
rename : base/qt/directories_model.py => qt/base/directories_model.py
rename : base/qt/main_window.py => qt/base/main_window.py
rename : base/qt/main_window.ui => qt/base/main_window.ui
rename : base/qt/platform.py => qt/base/platform.py
rename : base/qt/platform_osx.py => qt/base/platform_osx.py
rename : base/qt/platform_win.py => qt/base/platform_win.py
rename : base/qt/preferences.py => qt/base/preferences.py
rename : base/qt/results_model.py => qt/base/results_model.py
rename : me/qt/app.py => qt/me/app.py
rename : me/qt/build.py => qt/me/build.py
rename : me/qt/details_dialog.py => qt/me/details_dialog.py
rename : me/qt/details_dialog.ui => qt/me/details_dialog.ui
rename : me/qt/dgme.spec => qt/me/dgme.spec
rename : me/qt/gen.py => qt/me/gen.py
rename : me/qt/installer.aip => qt/me/installer.aip
rename : me/qt/preferences.py => qt/me/preferences.py
rename : me/qt/preferences_dialog.py => qt/me/preferences_dialog.py
rename : me/qt/preferences_dialog.ui => qt/me/preferences_dialog.ui
rename : me/qt/profile.py => qt/me/profile.py
rename : me/qt/start.py => qt/me/start.py
rename : me/qt/verinfo => qt/me/verinfo
rename : pe/qt/app.py => qt/pe/app.py
rename : pe/qt/block.py => qt/pe/block.py
rename : pe/qt/build.py => qt/pe/build.py
rename : pe/qt/details_dialog.py => qt/pe/details_dialog.py
rename : pe/qt/details_dialog.ui => qt/pe/details_dialog.ui
rename : pe/qt/dgpe.spec => qt/pe/dgpe.spec
rename : pe/qt/gen.py => qt/pe/gen.py
rename : pe/qt/installer.aip => qt/pe/installer.aip
rename : pe/qt/main_window.py => qt/pe/main_window.py
rename : pe/qt/modules/block/block.pyx => qt/pe/modules/block/block.pyx
rename : pe/qt/modules/block/setup.py => qt/pe/modules/block/setup.py
rename : pe/qt/preferences.py => qt/pe/preferences.py
rename : pe/qt/preferences_dialog.py => qt/pe/preferences_dialog.py
rename : pe/qt/preferences_dialog.ui => qt/pe/preferences_dialog.ui
rename : pe/qt/profile.py => qt/pe/profile.py
rename : pe/qt/start.py => qt/pe/start.py
rename : pe/qt/verinfo => qt/pe/verinfo
rename : se/qt/app.py => qt/se/app.py
rename : se/qt/build.py => qt/se/build.py
rename : se/qt/details_dialog.py => qt/se/details_dialog.py
rename : se/qt/details_dialog.ui => qt/se/details_dialog.ui
rename : se/qt/dgse.spec => qt/se/dgse.spec
rename : se/qt/gen.py => qt/se/gen.py
rename : se/qt/installer.aip => qt/se/installer.aip
rename : se/qt/preferences.py => qt/se/preferences.py
rename : se/qt/preferences_dialog.py => qt/se/preferences_dialog.py
rename : se/qt/preferences_dialog.ui => qt/se/preferences_dialog.ui
rename : se/qt/profile.py => qt/se/profile.py
rename : se/qt/start.py => qt/se/start.py
rename : se/qt/verinfo => qt/se/verinfo
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40285

11 core/LICENSE Normal file
@@ -0,0 +1,11 @@
Copyright 2009 Hardcoded Software Inc. (http://www.hardcoded.net)
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of Hardcoded Software Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
* If the source code has been published less than two years ago, any redistribution, in whole or in part, must retain full licensing functionality, without any attempt to change, obscure or in other ways circumvent its intent.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

1 core/__init__.py Normal file
@@ -0,0 +1 @@

249 core/app.py Normal file
@@ -0,0 +1,249 @@
#!/usr/bin/env python
# Created By: Virgil Dupras
# Created On: 2006/11/11
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

from __future__ import unicode_literals

import os
import os.path as op
import logging

from hsutil import io, files
from hsutil.path import Path
from hsutil.reg import RegistrableApplication, RegistrationRequired
from hsutil.misc import flatten, first
from hsutil.str import escape

from . import directories, results, scanner, export, fs

JOB_SCAN = 'job_scan'
JOB_LOAD = 'job_load'
JOB_MOVE = 'job_move'
JOB_COPY = 'job_copy'
JOB_DELETE = 'job_delete'

class NoScannableFileError(Exception):
    pass

class AllFilesAreRefError(Exception):
    pass

class DupeGuru(RegistrableApplication):
    def __init__(self, data_module, appdata, appid):
        RegistrableApplication.__init__(self, appid)
        self.appdata = appdata
        if not op.exists(self.appdata):
            os.makedirs(self.appdata)
        self.data = data_module
        self.directories = directories.Directories()
        self.results = results.Results(data_module)
        self.scanner = scanner.Scanner()
        self.action_count = 0
        self.last_op_error_count = 0
        self.options = {
            'escape_filter_regexp': True,
            'clean_empty_dirs': False,
        }

    def _demo_check(self):
        if self.registered:
            return
        count = self.results.mark_count
        if count + self.action_count > 10:
            raise RegistrationRequired()
        else:
            self.action_count += count

    def _do_delete(self, j):
        def op(dupe):
            j.add_progress()
            return self._do_delete_dupe(dupe)

        j.start_job(self.results.mark_count)
        self.last_op_error_count = self.results.perform_on_marked(op, True)

    def _do_delete_dupe(self, dupe):
        if not io.exists(dupe.path):
            return True
        self._recycle_dupe(dupe)
        self.clean_empty_dirs(dupe.path[:-1])
        if not io.exists(dupe.path):
            return True
        logging.warning("Could not send {0} to trash.".format(unicode(dupe.path)))
        return False

    def _do_load(self, j):
        self.directories.load_from_file(op.join(self.appdata, 'last_directories.xml'))
        j = j.start_subjob([1, 9])
        self.results.load_from_xml(op.join(self.appdata, 'last_results.xml'), self._get_file, j)
        files = flatten(g[:] for g in self.results.groups)
        for file in j.iter_with_progress(files, 'Reading metadata %d/%d'):
            file._read_all_info(attrnames=self.data.METADATA_TO_READ)

    def _get_display_info(self, dupe, group, delta=False):
        if (dupe is None) or (group is None):
            return ['---'] * len(self.data.COLUMNS)
        try:
            return self.data.GetDisplayInfo(dupe, group, delta)
        except Exception as e:
            logging.warning("Exception on GetDisplayInfo for %s: %s", unicode(dupe.path), unicode(e))
            return ['---'] * len(self.data.COLUMNS)

    def _get_file(self, str_path):
        path = Path(str_path)
        return fs.get_file(path, self.directories.fileclasses)

    @staticmethod
    def _recycle_dupe(dupe):
        raise NotImplementedError()

    def _start_job(self, jobid, func):
        # func(j)
        raise NotImplementedError()

    def add_directory(self, d):
        try:
            self.directories.add_path(Path(d))
            return 0
        except directories.AlreadyThereError:
            return 1
        except directories.InvalidPathError:
            return 2

    def add_to_ignore_list(self, dupe):
        g = self.results.get_group_of_duplicate(dupe)
        for other in g:
            if other is not dupe:
                self.scanner.ignore_list.Ignore(unicode(other.path), unicode(dupe.path))

    def apply_filter(self, filter):
        self.results.apply_filter(None)
        if self.options['escape_filter_regexp']:
            filter = escape(filter, '()[]\\.|+?^')
            filter = escape(filter, '*', '.')
        self.results.apply_filter(filter)

    def clean_empty_dirs(self, path):
        if self.options['clean_empty_dirs']:
            while files.delete_if_empty(path, ['.DS_Store']):
                path = path[:-1]

    def copy_or_move(self, dupe, copy, destination, dest_type):
        """
        copy: True = Copy False = Move
        destination: string.
        dest_type: 0 = right in destination.
                   1 = relative re-creation.
                   2 = absolute re-creation.
        """
        source_path = dupe.path
        location_path = first(p for p in self.directories if dupe.path in p)
        dest_path = Path(destination)
        if dest_type == 2:
            dest_path = dest_path + source_path[1:-1] # Remove drive letter and filename
        elif dest_type == 1:
            dest_path = dest_path + source_path[location_path:-1]
        try:
            if not io.exists(dest_path):
                io.makedirs(dest_path)
            if copy:
                files.copy(source_path, dest_path)
            else:
                files.move(source_path, dest_path)
                self.clean_empty_dirs(source_path[:-1])
        except EnvironmentError as e:
            operation = 'Copy' if copy else 'Move'
            logging.warning('%s operation failed on %s. Error: %s' % (operation, unicode(dupe.path), unicode(e)))
            return False
        return True

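    # Illustrative walk-through of dest_type (not in the original file); the paths
    # are hypothetical and hsutil Path slicing is assumed to split on components.
    # Given a dupe at C:\Music\Album\song.mp3, scanned root C:\Music, destination D:\backup:
    #   dest_type 0: the file lands directly in D:\backup
    #   dest_type 1: the path relative to the scanned root is re-created -> D:\backup\Album
    #   dest_type 2: the absolute path minus drive and filename is re-created -> D:\backup\Music\Album
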
    def copy_or_move_marked(self, copy, destination, recreate_path):
        def do(j):
            def op(dupe):
                j.add_progress()
                return self.copy_or_move(dupe, copy, destination, recreate_path)

            j.start_job(self.results.mark_count)
            self.last_op_error_count = self.results.perform_on_marked(op, not copy)

        self._demo_check()
        jobid = JOB_COPY if copy else JOB_MOVE
        self._start_job(jobid, do)

    def delete_marked(self):
        self._demo_check()
        self._start_job(JOB_DELETE, self._do_delete)

    def export_to_xhtml(self, column_ids):
        column_ids = [colid for colid in column_ids if colid.isdigit()]
        column_ids = map(int, column_ids)
        column_ids.sort()
        colnames = [col['display'] for i, col in enumerate(self.data.COLUMNS) if i in column_ids]
        rows = []
        for group in self.results.groups:
            for dupe in group:
                data = self._get_display_info(dupe, group)
                row = [data[colid] for colid in column_ids]
                row.insert(0, dupe is not group.ref)
                rows.append(row)
        return export.export_to_xhtml(colnames, rows)

    def load(self):
        self._start_job(JOB_LOAD, self._do_load)
        self.load_ignore_list()

    def load_ignore_list(self):
        p = op.join(self.appdata, 'ignore_list.xml')
        self.scanner.ignore_list.load_from_xml(p)

    def make_reference(self, duplicates):
        changed_groups = set()
        for dupe in duplicates:
            g = self.results.get_group_of_duplicate(dupe)
            if g not in changed_groups:
                self.results.make_ref(dupe)
                changed_groups.add(g)

    def save(self):
        try:
            self.directories.save_to_file(op.join(self.appdata, 'last_directories.xml'))
            self.results.save_to_xml(op.join(self.appdata, 'last_results.xml'))
        except LookupError:
            # This is that weird issue from #39 that sometimes happens when auto-updating with
            # Sparkle. Just ignore it.
            pass

    def save_ignore_list(self):
        p = op.join(self.appdata, 'ignore_list.xml')
        self.scanner.ignore_list.save_to_xml(p)

    def start_scanning(self):
        def do(j):
            j.set_progress(0, 'Collecting files to scan')
            files = list(self.directories.get_files())
            logging.info('Scanning %d files' % len(files))
            self.results.groups = self.scanner.GetDupeGroups(files, j)

        files = self.directories.get_files()
        first_file = first(files)
        if first_file is None:
            raise NoScannableFileError()
        if first_file.is_ref and all(f.is_ref for f in files):
            raise AllFilesAreRefError()
        self.results.groups = []
        self._start_job(JOB_SCAN, do)

    #--- Properties
    @property
    def stat_line(self):
        result = self.results.stat_line
        if self.scanner.discarded_file_count:
            result = '%s (%d discarded)' % (result, self.scanner.discarded_file_count)
        return result

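A minimal sketch (not part of the commit) of what a platform port must supply on top of core/app.py: the two NotImplementedError hooks. It runs jobs synchronously through hsutil's job.nulljob, the same stub engine.py defaults to; the trash behavior is a stand-in, not dupeGuru's real recycling code.

import os
from hsutil import job
from core import app

class ConsoleDupeGuru(app.DupeGuru):
    @staticmethod
    def _recycle_dupe(dupe):
        # A real port would send the file to the platform trash; plain removal
        # is used here only to keep the sketch self-contained.
        os.remove(unicode(dupe.path))

    def _start_job(self, jobid, func):
        # No worker thread: run the job callback directly with a no-op job.
        func(job.nulljob)
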
307 core/app_cocoa.py Normal file
@@ -0,0 +1,307 @@
# Created By: Virgil Dupras
# Created On: 2006/11/11
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

import objc
from Foundation import *
from AppKit import *
import logging
import os.path as op

from hsutil import io, cocoa, job
from hsutil.cocoa import install_exception_hook
from hsutil.misc import stripnone
from hsutil.reg import RegistrationRequired

from . import app, fs

JOBID2TITLE = {
    app.JOB_SCAN: "Scanning for duplicates",
    app.JOB_LOAD: "Loading",
    app.JOB_MOVE: "Moving",
    app.JOB_COPY: "Copying",
    app.JOB_DELETE: "Sending to Trash",
}

def demo_method(method):
    def wrapper(self, *args, **kwargs):
        try:
            return method(self, *args, **kwargs)
        except RegistrationRequired:
            NSNotificationCenter.defaultCenter().postNotificationName_object_('RegistrationRequired', self)

    return wrapper

class DupeGuru(app.DupeGuru):
    def __init__(self, data_module, appdata_subdir, appid):
        LOGGING_LEVEL = logging.DEBUG if NSUserDefaults.standardUserDefaults().boolForKey_('debug') else logging.WARNING
        logging.basicConfig(level=LOGGING_LEVEL, format='%(levelname)s %(message)s')
        logging.debug('started in debug mode')
        install_exception_hook()
        appsupport = NSSearchPathForDirectoriesInDomains(NSApplicationSupportDirectory, NSUserDomainMask, True)[0]
        appdata = op.join(appsupport, appdata_subdir)
        app.DupeGuru.__init__(self, data_module, appdata, appid)
        self.progress = cocoa.ThreadedJobPerformer()
        self.display_delta_values = False
        self.selected_dupes = []
        self.RefreshDetailsTable(None,None)

    #--- Override
    @staticmethod
    def _recycle_dupe(dupe):
        directory = unicode(dupe.path[:-1])
        filename = dupe.name
        if objc.__version__ == '1.4': # For a while, we have to support this.
            result, tag = NSWorkspace.sharedWorkspace().performFileOperation_source_destination_files_tag_(
                NSWorkspaceRecycleOperation, directory, '', [filename])
        else:
            result, tag = NSWorkspace.sharedWorkspace().performFileOperation_source_destination_files_tag_(
                NSWorkspaceRecycleOperation, directory, '', [filename], None)

    def _start_job(self, jobid, func):
        try:
            j = self.progress.create_job()
            self.progress.run_threaded(func, args=(j, ))
        except job.JobInProgressError:
            NSNotificationCenter.defaultCenter().postNotificationName_object_('JobInProgress', self)
        else:
            ud = {'desc': JOBID2TITLE[jobid], 'jobid':jobid}
            NSNotificationCenter.defaultCenter().postNotificationName_object_userInfo_('JobStarted', self, ud)

    #---Helpers
    def GetObjects(self,node_path):
        #returns a tuple g,d
        try:
            g = self.results.groups[node_path[0]]
            if len(node_path) == 2:
                return (g,self.results.groups[node_path[0]].dupes[node_path[1]])
            else:
                return (g,None)
        except IndexError:
            return (None,None)

    def get_folder_path(self, node_path, curr_path=None):
        if not node_path:
            return curr_path
        current_index = node_path[0]
        if curr_path is None:
            curr_path = self.directories[current_index]
        else:
            curr_path = self.directories.get_subfolders(curr_path)[current_index]
        return self.get_folder_path(node_path[1:], curr_path)

    def RefreshDetailsTable(self,dupe,group):
        l1 = self._get_display_info(dupe, group, False)
        # we don't want the two sides of the table to display the stats for the same file
        ref = group.ref if group is not None and group.ref is not dupe else None
        l2 = self._get_display_info(ref, group, False)
        names = [c['display'] for c in self.data.COLUMNS]
        self.details_table = zip(names,l1,l2)

    #---Public
    def AddSelectedToIgnoreList(self):
        for dupe in self.selected_dupes:
            self.add_to_ignore_list(dupe)

    copy_or_move_marked = demo_method(app.DupeGuru.copy_or_move_marked)
    delete_marked = demo_method(app.DupeGuru.delete_marked)

    def MakeSelectedReference(self):
        self.make_reference(self.selected_dupes)

    def OpenSelected(self):
        if self.selected_dupes:
            path = unicode(self.selected_dupes[0].path)
            NSWorkspace.sharedWorkspace().openFile_(path)

    def PurgeIgnoreList(self):
        self.scanner.ignore_list.Filter(lambda f,s:op.exists(f) and op.exists(s))

    def RefreshDetailsWithSelected(self):
        if self.selected_dupes:
            self.RefreshDetailsTable(
                self.selected_dupes[0],
                self.results.get_group_of_duplicate(self.selected_dupes[0])
            )
        else:
            self.RefreshDetailsTable(None,None)

    def RemoveDirectory(self,index):
        try:
            del self.directories[index]
        except IndexError:
            pass

    def RemoveSelected(self):
        self.results.remove_duplicates(self.selected_dupes)

    def RenameSelected(self, newname):
        try:
            d = self.selected_dupes[0]
            d.rename(newname)
            return True
        except (IndexError, fs.FSError) as e:
            logging.warning("dupeGuru Warning: %s" % unicode(e))
        return False

    def RevealSelected(self):
        if self.selected_dupes:
            path = unicode(self.selected_dupes[0].path)
            NSWorkspace.sharedWorkspace().selectFile_inFileViewerRootedAtPath_(path,'')

    def start_scanning(self):
        self.RefreshDetailsTable(None, None)
        try:
            app.DupeGuru.start_scanning(self)
            return 0
        except app.NoScannableFileError:
            return 3
        except app.AllFilesAreRefError:
            return 1

    def selected_result_node_paths(self):
        def get_path(dupe):
            try:
                group = self.results.get_group_of_duplicate(dupe)
                groupindex = self.results.groups.index(group)
                if dupe is group.ref:
                    return [groupindex]
                dupeindex = group.dupes.index(dupe)
                return [groupindex, dupeindex]
            except ValueError: # dupe not in there
                return None

        dupes = self.selected_dupes
        return stripnone(get_path(dupe) for dupe in dupes)

    def selected_powermarker_node_paths(self):
        def get_path(dupe):
            try:
                dupeindex = self.results.dupes.index(dupe)
                return [dupeindex]
            except ValueError: # dupe not in there
                return None

        dupes = self.selected_dupes
        return stripnone(get_path(dupe) for dupe in dupes)

    def SelectResultNodePaths(self,node_paths):
        def extract_dupe(t):
            g,d = t
            if d is not None:
                return d
            else:
                if g is not None:
                    return g.ref

        selected = [extract_dupe(self.GetObjects(p)) for p in node_paths]
        self.selected_dupes = [dupe for dupe in selected if dupe is not None]

    def SelectPowerMarkerNodePaths(self,node_paths):
        rows = [p[0] for p in node_paths]
        self.selected_dupes = [
            self.results.dupes[row] for row in rows if row in xrange(len(self.results.dupes))
        ]

    def SetDirectoryState(self, node_path, state):
        p = self.get_folder_path(node_path)
        self.directories.set_state(p, state)

    def sort_dupes(self,key,asc):
        self.results.sort_dupes(key,asc,self.display_delta_values)

    def sort_groups(self,key,asc):
        self.results.sort_groups(key,asc)

    def ToggleSelectedMarkState(self):
        for dupe in self.selected_dupes:
            self.results.mark_toggle(dupe)

    #---Data
    def GetOutlineViewMaxLevel(self, tag):
        if tag == 0:
            return 2
        elif tag == 1:
            return 0
        elif tag == 2:
            return 1

    def GetOutlineViewChildCounts(self, tag, node_path):
        if self.progress._job_running:
            return []
        if tag == 0: #Normal results
            assert not node_path # no other value is possible
            return [len(g.dupes) for g in self.results.groups]
        elif tag == 1: #Directories
            try:
                if node_path:
                    path = self.get_folder_path(node_path)
                    subfolders = self.directories.get_subfolders(path)
                else:
                    subfolders = self.directories
                return [len(self.directories.get_subfolders(path)) for path in subfolders]
            except IndexError: # node_path out of range
                return []
        else: #Power Marker
            assert not node_path # no other value is possible
            return [0 for d in self.results.dupes]

    def GetOutlineViewValues(self, tag, node_path):
        if self.progress._job_running:
            return
        if not node_path:
            return
        if tag in (0,2): #Normal results / Power Marker
            if tag == 0:
                g, d = self.GetObjects(node_path)
                if d is None:
                    d = g.ref
            else:
                d = self.results.dupes[node_path[0]]
                g = self.results.get_group_of_duplicate(d)
            result = self._get_display_info(d, g, self.display_delta_values)
            return result
        elif tag == 1: #Directories
            try:
                path = self.get_folder_path(node_path)
                name = unicode(path) if len(node_path) == 1 else path[-1]
                return [name, self.directories.get_state(path)]
            except IndexError: # node_path out of range
                return []

    def GetOutlineViewMarked(self, tag, node_path):
        # 0=unmarked 1=marked 2=unmarkable
        if self.progress._job_running:
            return
        if not node_path:
            return 2
        if tag == 1: #Directories
            return 2
        if tag == 0: #Normal results
            g, d = self.GetObjects(node_path)
        else: #Power Marker
            d = self.results.dupes[node_path[0]]
        if (d is None) or (not self.results.is_markable(d)):
            return 2
        elif self.results.is_marked(d):
            return 1
        else:
            return 0

    def GetTableViewCount(self, tag):
        if self.progress._job_running:
            return 0
        return len(self.details_table)

    def GetTableViewMarkedIndexes(self,tag):
        return []

    def GetTableViewValues(self,tag,row):
        return self.details_table[row]

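An illustrative note on node_path addressing (not from the commit): the Cocoa outline views identify rows with lists of indexes, which GetObjects resolves against the loaded results.

# Assuming dg is a constructed DupeGuru whose results hold at least three groups:
g, d = dg.GetObjects([2])      # (groups[2], None): the group header row
g, d = dg.GetObjects([2, 0])   # (groups[2], groups[2].dupes[0]): its first duplicate
g, d = dg.GetObjects([99])     # (None, None): an out-of-range path is absorbed via IndexError
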
42 core/data.py Normal file
@@ -0,0 +1,42 @@
# Created By: Virgil Dupras
# Created On: 2006/03/15
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

from hsutil.str import format_time, FT_DECIMAL, format_size

import time

def format_path(p):
    return unicode(p[:-1])

def format_timestamp(t, delta):
    if delta:
        return format_time(t, FT_DECIMAL)
    else:
        if t > 0:
            return time.strftime('%Y/%m/%d %H:%M:%S', time.localtime(t))
        else:
            return '---'

def format_words(w):
    def do_format(w):
        if isinstance(w, list):
            return '(%s)' % ', '.join(do_format(item) for item in w)
        else:
            return w.replace('\n', ' ')

    return ', '.join(do_format(item) for item in w)

def format_perc(p):
    return "%0.0f" % p

def format_dupe_count(c):
    return str(c) if c else '---'

def cmp_value(value):
    return value.lower() if isinstance(value, basestring) else value

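A hedged usage sketch of the formatters above (outputs illustrative; format_time and FT_DECIMAL come from hsutil and are assumed to behave as their names suggest):

format_timestamp(0, False)                # -> '---'
format_timestamp(1234567890, False)       # -> '2009/02/13 18:31:30' or so, local time
format_words(['big', ['file', 'copy']])   # -> 'big, (file, copy)'
format_perc(99.6)                         # -> '100'
format_dupe_count(0)                      # -> '---'
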
173 core/directories.py Normal file
@@ -0,0 +1,173 @@
# Created By: Virgil Dupras
# Created On: 2006/02/27
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

import xml.dom.minidom

from hsutil import io
from hsutil.files import FileOrPath
from hsutil.path import Path

from . import fs

(STATE_NORMAL,
 STATE_REFERENCE,
 STATE_EXCLUDED) = range(3)

class AlreadyThereError(Exception):
    """The path being added is already in the directory list"""

class InvalidPathError(Exception):
    """The path being added is invalid"""

class Directories(object):
    #---Override
    def __init__(self, fileclasses=[fs.File]):
        self._dirs = []
        self.states = {}
        self.fileclasses = fileclasses

    def __contains__(self, path):
        for p in self._dirs:
            if path in p:
                return True
        return False

    def __delitem__(self,key):
        self._dirs.__delitem__(key)

    def __getitem__(self,key):
        return self._dirs.__getitem__(key)

    def __len__(self):
        return len(self._dirs)

    #---Private
    def _default_state_for_path(self, path):
        # Override this in subclasses to specify the state of some special folders.
        if path[-1].startswith('.'): # hidden
            return STATE_EXCLUDED

    def _get_files(self, from_path):
        state = self.get_state(from_path)
        if state == STATE_EXCLUDED:
            # Recursively getting files from folders with lots of subfolders is expensive. However,
            # there might be a subfolder in this path that is not excluded. What we want to do is to
            # skim through self.states and see if we must continue, or if we can stop right here to save time.
            if not any(p[:len(from_path)] == from_path for p in self.states):
                return
        try:
            filepaths = set()
            if state != STATE_EXCLUDED:
                for file in fs.get_files(from_path, fileclasses=self.fileclasses):
                    file.is_ref = state == STATE_REFERENCE
                    filepaths.add(file.path)
                    yield file
            subpaths = [from_path + name for name in io.listdir(from_path)]
            # it's possible that a folder (bundle) gets into the file list. in that case, we don't want to recurse into it
            subfolders = [p for p in subpaths if not io.islink(p) and io.isdir(p) and p not in filepaths]
            for subfolder in subfolders:
                for file in self._get_files(subfolder):
                    yield file
        except (EnvironmentError, fs.InvalidPath):
            pass

    #---Public
    def add_path(self, path):
        """Adds 'path' to self, if not already there.

        Raises AlreadyThereError if 'path' is already in self. If path is a directory containing
        some of the directories already present in self, 'path' will be added, but all directories
        under it will be removed. Can also raise InvalidPathError if 'path' does not exist.
        """
        if path in self:
            raise AlreadyThereError()
        if not io.exists(path):
            raise InvalidPathError()
        self._dirs = [p for p in self._dirs if p not in path]
        self._dirs.append(path)

    @staticmethod
    def get_subfolders(path):
        """returns a sorted list of paths corresponding to subfolders in `path`"""
        try:
            names = [name for name in io.listdir(path) if io.isdir(path + name)]
            names.sort(key=lambda x:x.lower())
            return [path + name for name in names]
        except EnvironmentError:
            return []

    def get_files(self):
        """Returns a list of all files that are not excluded.

        Returned files also have their 'is_ref' attr set.
        """
        for path in self._dirs:
            for file in self._get_files(path):
                yield file

    def get_state(self, path):
        """Returns the state of 'path' (One of the STATE_* const.)
        """
        if path in self.states:
            return self.states[path]
        default_state = self._default_state_for_path(path)
        if default_state is not None:
            return default_state
        parent = path[:-1]
        if parent in self:
            return self.get_state(parent)
        else:
            return STATE_NORMAL

    def load_from_file(self, infile):
        try:
            doc = xml.dom.minidom.parse(infile)
        except:
            return
        root_path_nodes = doc.getElementsByTagName('root_directory')
        for rdn in root_path_nodes:
            if not rdn.getAttributeNode('path'):
                continue
            path = rdn.getAttributeNode('path').nodeValue
            try:
                self.add_path(Path(path))
            except (AlreadyThereError, InvalidPathError):
                pass
        state_nodes = doc.getElementsByTagName('state')
        for sn in state_nodes:
            if not (sn.getAttributeNode('path') and sn.getAttributeNode('value')):
                continue
            path = sn.getAttributeNode('path').nodeValue
            state = sn.getAttributeNode('value').nodeValue
            self.set_state(Path(path), int(state))

    def save_to_file(self,outfile):
        with FileOrPath(outfile, 'wb') as fp:
            doc = xml.dom.minidom.Document()
            root = doc.appendChild(doc.createElement('directories'))
            for root_path in self:
                root_path_node = root.appendChild(doc.createElement('root_directory'))
                root_path_node.setAttribute('path', unicode(root_path).encode('utf-8'))
            for path, state in self.states.iteritems():
                state_node = root.appendChild(doc.createElement('state'))
                state_node.setAttribute('path', unicode(path).encode('utf-8'))
                state_node.setAttribute('value', str(state))
            doc.writexml(fp, '\t', '\t', '\n', encoding='utf-8')

    def set_state(self, path, state):
        if self.get_state(path) == state:
            return
        # we don't want to needlessly fill self.states. if get_state returns the same thing
        # without an explicit entry, remove that entry
        if path in self.states:
            del self.states[path]
        if self.get_state(path) == state: # no need for an entry
            return
        self.states[path] = state

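A short sketch (not from the commit) of how the state model above composes; the paths are hypothetical and Path is hsutil's:

d = Directories()
d.add_path(Path('/Users/me/Music'))
d.set_state(Path('/Users/me/Music/backup'), STATE_REFERENCE)
d.set_state(Path('/Users/me/Music/tmp'), STATE_EXCLUDED)
# get_state() falls back to the nearest ancestor with an explicit entry, so files under
# /Users/me/Music/backup/2008 come out of get_files() with is_ref set, and nothing under
# tmp is yielded at all (unless a deeper subfolder is explicitly re-included).
files = list(d.get_files())
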
389 core/engine.py Normal file
@@ -0,0 +1,389 @@
# Created By: Virgil Dupras
# Created On: 2006/01/29
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

from __future__ import division
import difflib
import itertools
import logging
import string
from collections import defaultdict, namedtuple
from unicodedata import normalize

from hsutil.misc import flatten
from hsutil.str import multi_replace
from hsutil import job

(WEIGHT_WORDS,
 MATCH_SIMILAR_WORDS,
 NO_FIELD_ORDER) = range(3)

JOB_REFRESH_RATE = 100

def getwords(s):
    if isinstance(s, unicode):
        s = normalize('NFD', s)
    s = multi_replace(s, "-_&+():;\\[]{}.,<>/?~!@#$*", ' ').lower()
    s = ''.join(c for c in s if c in string.ascii_letters + string.digits + string.whitespace)
    return filter(None, s.split(' ')) # filter() is to remove empty elements

def getfields(s):
    fields = [getwords(field) for field in s.split(' - ')]
    return filter(None, fields)

def unpack_fields(fields):
    result = []
    for field in fields:
        if isinstance(field, list):
            result += field
        else:
            result.append(field)
    return result

def compare(first, second, flags=()):
    """Returns the % of words that match between first and second

    The result is an int in the range 0..100.
    First and second can be either a string or a list.
    """
    if not (first and second):
        return 0
    if any(isinstance(element, list) for element in first):
        return compare_fields(first, second, flags)
    second = second[:] # We must use a copy of second because we remove items from it
    match_similar = MATCH_SIMILAR_WORDS in flags
    weight_words = WEIGHT_WORDS in flags
    joined = first + second
    total_count = (sum(len(word) for word in joined) if weight_words else len(joined))
    match_count = 0
    in_order = True
    for word in first:
        if match_similar and (word not in second):
            similar = difflib.get_close_matches(word, second, 1, 0.8)
            if similar:
                word = similar[0]
        if word in second:
            if second[0] != word:
                in_order = False
            second.remove(word)
            match_count += (len(word) if weight_words else 1)
    result = round(((match_count * 2) / total_count) * 100)
    if (result == 100) and (not in_order):
        result = 99 # We cannot consider a match exact unless the ordering is the same
    return result

def compare_fields(first, second, flags=()):
    """Returns the score for the lowest matching fields.

    first and second must be lists of lists of strings.
    """
    if len(first) != len(second):
        return 0
    if NO_FIELD_ORDER in flags:
        results = []
        # We don't want to remove fields directly in the list. We must work on a copy.
        second = second[:]
        for field1 in first:
            max = 0
            matched_field = None
            for field2 in second:
                r = compare(field1, field2, flags)
                if r > max:
                    max = r
                    matched_field = field2
            results.append(max)
            if matched_field:
                second.remove(matched_field)
    else:
        results = [compare(word1, word2, flags) for word1, word2 in zip(first, second)]
    return min(results) if results else 0

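# Worked example of the scoring above (not in the original file):
#   compare(['foo', 'bar'], ['bar', 'bleh'])
#     4 words joined, 1 of them matches -> round((1 * 2) / 4 * 100) = 50
#   compare(['foo', 'bar'], ['bar', 'foo'])
#     both words match, but out of order -> capped at 99 instead of 100
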
def build_word_dict(objects, j=job.nulljob):
|
||||
"""Returns a dict of objects mapped by their words.
|
||||
|
||||
objects must have a 'words' attribute being a list of strings or a list of lists of strings.
|
||||
|
||||
The result will be a dict with words as keys, lists of objects as values.
|
||||
"""
|
||||
result = defaultdict(set)
|
||||
for object in j.iter_with_progress(objects, 'Prepared %d/%d files', JOB_REFRESH_RATE):
|
||||
for word in unpack_fields(object.words):
|
||||
result[word].add(object)
|
||||
return result
|
||||
|
||||
def merge_similar_words(word_dict):
|
||||
"""Take all keys in word_dict that are similar, and merge them together.
|
||||
"""
|
||||
keys = word_dict.keys()
|
||||
keys.sort(key=len)# we want the shortest word to stay
|
||||
while keys:
|
||||
key = keys.pop(0)
|
||||
similars = difflib.get_close_matches(key, keys, 100, 0.8)
|
||||
if not similars:
|
||||
continue
|
||||
objects = word_dict[key]
|
||||
for similar in similars:
|
||||
objects |= word_dict[similar]
|
||||
del word_dict[similar]
|
||||
keys.remove(similar)
|
||||
|
||||
def reduce_common_words(word_dict, threshold):
|
||||
"""Remove all objects from word_dict values where the object count >= threshold
|
||||
|
||||
The exception to this removal are the objects where all the words of the object are common.
|
||||
Because if we remove them, we will miss some duplicates!
|
||||
"""
|
||||
uncommon_words = set(word for word, objects in word_dict.items() if len(objects) < threshold)
|
||||
for word, objects in word_dict.items():
|
||||
if len(objects) < threshold:
|
||||
continue
|
||||
reduced = set()
|
||||
for o in objects:
|
||||
if not any(w in uncommon_words for w in unpack_fields(o.words)):
|
||||
reduced.add(o)
|
||||
if reduced:
|
||||
word_dict[word] = reduced
|
||||
else:
|
||||
del word_dict[word]
|
||||
|
||||
Match = namedtuple('Match', 'first second percentage')
|
||||
def get_match(first, second, flags=()):
|
||||
#it is assumed here that first and second both have a "words" attribute
|
||||
percentage = compare(first.words, second.words, flags)
|
||||
return Match(first, second, percentage)
|
||||
|
||||
def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False,
|
||||
no_field_order=False, j=job.nulljob):
|
||||
COMMON_WORD_THRESHOLD = 50
|
||||
LIMIT = 5000000
|
||||
j = j.start_subjob(2)
|
||||
sj = j.start_subjob(2)
|
||||
for o in objects:
|
||||
if not hasattr(o, 'words'):
|
||||
o.words = getwords(o.name)
|
||||
word_dict = build_word_dict(objects, sj)
|
||||
reduce_common_words(word_dict, COMMON_WORD_THRESHOLD)
|
||||
if match_similar_words:
|
||||
merge_similar_words(word_dict)
|
||||
match_flags = []
|
||||
if weight_words:
|
||||
match_flags.append(WEIGHT_WORDS)
|
||||
if match_similar_words:
|
||||
match_flags.append(MATCH_SIMILAR_WORDS)
|
||||
if no_field_order:
|
||||
match_flags.append(NO_FIELD_ORDER)
|
||||
j.start_job(len(word_dict), '0 matches found')
|
||||
compared = defaultdict(set)
|
||||
result = []
|
||||
try:
|
||||
# This whole 'popping' thing is there to avoid taking too much memory at the same time.
|
||||
while word_dict:
|
||||
items = word_dict.popitem()[1]
|
||||
while items:
|
||||
ref = items.pop()
|
||||
compared_already = compared[ref]
|
||||
to_compare = items - compared_already
|
||||
compared_already |= to_compare
|
||||
for other in to_compare:
|
||||
m = get_match(ref, other, match_flags)
|
||||
if m.percentage >= min_match_percentage:
|
||||
result.append(m)
|
||||
if len(result) >= LIMIT:
|
||||
return result
|
||||
j.add_progress(desc='%d matches found' % len(result))
|
||||
except MemoryError:
|
||||
# This is the place where the memory usage is at its peak during the scan.
|
||||
# Just continue the process with an incomplete list of matches.
|
||||
del compared # This should give us enough room to call logging.
|
||||
logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
|
||||
return result
|
||||
return result
|
||||
|
||||
def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob):
|
||||
j = j.start_subjob([2, 8])
|
||||
size2files = defaultdict(set)
|
||||
for file in j.iter_with_progress(files, 'Read size of %d/%d files'):
|
||||
filesize = getattr(file, sizeattr)
|
||||
if filesize:
|
||||
size2files[filesize].add(file)
|
||||
possible_matches = [files for files in size2files.values() if len(files) > 1]
|
||||
del size2files
|
||||
result = []
|
||||
j.start_job(len(possible_matches), '0 matches found')
|
||||
for group in possible_matches:
|
||||
for first, second in itertools.combinations(group, 2):
|
||||
if first.md5partial == second.md5partial:
|
||||
if partial or first.md5 == second.md5:
|
||||
result.append(Match(first, second, 100))
|
||||
j.add_progress(desc='%d matches found' % len(result))
|
||||
return result
|
||||
|
||||
class Group(object):
|
||||
#---Override
|
||||
def __init__(self):
|
||||
self._clear()
|
||||
|
||||
def __contains__(self, item):
|
||||
return item in self.unordered
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.ordered.__getitem__(key)
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self.ordered)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.ordered)
|
||||
|
||||
#---Private
|
||||
def _clear(self):
|
||||
self._percentage = None
|
||||
self._matches_for_ref = None
|
||||
self.matches = set()
|
||||
self.candidates = defaultdict(set)
|
||||
self.ordered = []
|
||||
self.unordered = set()
|
||||
|
||||
def _get_matches_for_ref(self):
|
||||
if self._matches_for_ref is None:
|
||||
ref = self.ref
|
||||
self._matches_for_ref = [match for match in self.matches if ref in match]
|
||||
return self._matches_for_ref
|
||||
|
||||
#---Public
|
||||
def add_match(self, match):
|
||||
def add_candidate(item, match):
|
||||
matches = self.candidates[item]
|
||||
matches.add(match)
|
||||
if self.unordered <= matches:
|
||||
self.ordered.append(item)
|
||||
self.unordered.add(item)
|
||||
|
||||
if match in self.matches:
|
||||
return
|
||||
self.matches.add(match)
|
||||
first, second, _ = match
|
||||
if first not in self.unordered:
|
||||
add_candidate(first, second)
|
||||
if second not in self.unordered:
|
||||
add_candidate(second, first)
|
||||
self._percentage = None
|
||||
self._matches_for_ref = None
|
||||
|
||||
def discard_matches(self):
|
||||
discarded = set(m for m in self.matches if not all(obj in self.unordered for obj in [m.first, m.second]))
|
||||
self.matches -= discarded
|
||||
self.candidates = defaultdict(set)
|
||||
return discarded
|
||||
|
||||
def get_match_of(self, item):
|
||||
if item is self.ref:
|
||||
return
|
||||
for m in self._get_matches_for_ref():
|
||||
if item in m:
|
||||
return m
|
||||
|
||||
def prioritize(self, key_func, tie_breaker=None):
|
||||
# tie_breaker(ref, dupe) --> True if dupe should be ref
|
||||
self.ordered.sort(key=key_func)
|
||||
if tie_breaker is None:
|
||||
return
|
||||
ref = self.ref
|
||||
key_value = key_func(ref)
|
||||
for dupe in self.dupes:
|
||||
if key_func(dupe) != key_value:
|
||||
break
|
||||
if tie_breaker(ref, dupe):
|
||||
ref = dupe
|
||||
if ref is not self.ref:
|
||||
self.switch_ref(ref)
|
||||
|
||||
def remove_dupe(self, item, discard_matches=True):
|
||||
try:
|
||||
self.ordered.remove(item)
|
||||
self.unordered.remove(item)
|
||||
self._percentage = None
|
||||
self._matches_for_ref = None
|
||||
if (len(self) > 1) and any(not getattr(item, 'is_ref', False) for item in self):
|
||||
if discard_matches:
|
||||
self.matches = set(m for m in self.matches if item not in m)
|
||||
else:
|
||||
self._clear()
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
def switch_ref(self, with_dupe):
|
||||
try:
|
||||
self.ordered.remove(with_dupe)
|
||||
self.ordered.insert(0, with_dupe)
|
||||
self._percentage = None
|
||||
self._matches_for_ref = None
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
dupes = property(lambda self: self[1:])
|
||||
|
||||
@property
|
||||
def percentage(self):
|
||||
if self._percentage is None:
|
||||
if self.dupes:
|
||||
matches = self._get_matches_for_ref()
|
||||
self._percentage = sum(match.percentage for match in matches) // len(matches)
|
||||
else:
|
||||
self._percentage = 0
|
||||
return self._percentage
|
||||
|
||||
@property
|
||||
def ref(self):
|
||||
if self:
|
||||
return self[0]
|
||||

def get_groups(matches, j=job.nulljob):
    matches.sort(key=lambda match: -match.percentage)
    dupe2group = {}
    groups = []
    try:
        for match in j.iter_with_progress(matches, 'Grouped %d/%d matches', JOB_REFRESH_RATE):
            first, second, _ = match
            first_group = dupe2group.get(first)
            second_group = dupe2group.get(second)
            if first_group:
                if second_group:
                    if first_group is second_group:
                        target_group = first_group
                    else:
                        continue
                else:
                    target_group = first_group
                    dupe2group[second] = target_group
            else:
                if second_group:
                    target_group = second_group
                    dupe2group[first] = target_group
                else:
                    target_group = Group()
                    groups.append(target_group)
                    dupe2group[first] = target_group
                    dupe2group[second] = target_group
            target_group.add_match(match)
    except MemoryError:
        del dupe2group
        del matches
        # should free enough memory to continue
        logging.warning('Memory Overflow. Groups: {0}'.format(len(groups)))
    # Now that we have groups, we have to discard each group's matches and see if there are any
    # "orphan" matches, that is, matches that were candidates in a group but whose two files were
    # never accepted into the group. With these orphan matches, it's safe to build additional groups.
    matched_files = set(flatten(groups))
    orphan_matches = []
    for group in groups:
        orphan_matches += set(m for m in group.discard_matches() if not any(obj in matched_files for obj in [m.first, m.second]))
    if groups and orphan_matches:
        groups += get_groups(orphan_matches) # no job, as it isn't supposed to take a long time
    return groups
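
# A minimal usage sketch (file objects are hypothetical here; in dupeGuru the
# files come from core.fs below, and scanner.py drives the whole pipeline):
#
#     matches = getmatches(files)   # engine's match-building step
#     groups = get_groups(matches)
#     for group in groups:
#         print group.ref, group.dupes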

138
core/export.py
Normal file
@@ -0,0 +1,138 @@
# Created By: Virgil Dupras
# Created On: 2006/09/16
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

import tempfile
import os.path as op
from tempfile import mkdtemp

# Yes, this is a very low-tech solution, but at least it doesn't have all these annoying dependency
# and resource problems.

MAIN_TEMPLATE = u"""
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Strict//EN' 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>dupeGuru Results</title>
<style type="text/css">
BODY
{
    background-color:white;
}

BODY,A,P,UL,TABLE,TR,TD
{
    font-family:Tahoma,Arial,sans-serif;
    font-size:10pt;
    color: #4477AA;
}

TABLE
{
    background-color: #225588;
    margin-left: auto;
    margin-right: auto;
    width: 90%;
}

TR
{
    background-color: white;
}

TH
{
    font-weight: bold;
    color: black;
    background-color: #C8D6E5;
}

TH TD
{
    color:black;
}

TD
{
    padding-left: 2pt;
}

TD.rightelem
{
    text-align:right;
    /*padding-left:0pt;*/
    padding-right: 2pt;
    width: 17%;
}

TD.indented
{
    padding-left: 12pt;
}

H1
{
    font-family:"Courier New",monospace;
    color:#6699CC;
    font-size:18pt;
    color:#6da500;
    border-color: #70A0CF;
    border-width: 1pt;
    border-style: solid;
    margin-top: 16pt;
    margin-left: 5%;
    margin-right: 5%;
    padding-top: 2pt;
    padding-bottom:2pt;
    text-align: center;
}
</style>
</head>
<body>
<h1>dupeGuru Results</h1>
<table>
<tr>$colheaders</tr>
$rows
</table>
</body>
</html>
"""

COLHEADERS_TEMPLATE = u"<th>{name}</th>"

ROW_TEMPLATE = u"""
<tr>
<td class="{indented}">{filename}</td>{cells}
</tr>
"""

CELL_TEMPLATE = u"""<td>{value}</td>"""
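
# Example input shape for export_to_xhtml() (the values are hypothetical): each
# row starts with the "indented" flag, then the filename, then one value per
# remaining column; note the assert below: len(row) == len(colnames) + 1.
#
#     colnames = ['Filename', 'Size (KB)', 'Kind']
#     rows = [
#         (False, 'holiday.jpg', '42', 'jpg'),       # reference row
#         (True, 'holiday copy.jpg', '42', 'jpg'),   # its duplicate, indented
#     ]
#     destpath = export_to_xhtml(colnames, rows)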

def export_to_xhtml(colnames, rows):
    # a row is a list of values with the first value being a flag indicating if the row should be indented
    if rows:
        assert len(rows[0]) == len(colnames) + 1 # + 1 is for the "indented" flag
    colheaders = u''.join(COLHEADERS_TEMPLATE.format(name=name) for name in colnames)
    rendered_rows = []
    for row in rows:
        # [2:] is to remove the indented flag + filename
        indented = u'indented' if row[0] else u''
        filename = row[1]
        cells = u''.join(CELL_TEMPLATE.format(value=value) for value in row[2:])
        rendered_rows.append(ROW_TEMPLATE.format(indented=indented, filename=filename, cells=cells))
    rendered_rows = u''.join(rendered_rows)
    # The main template can't use format because the css code uses {}
    content = MAIN_TEMPLATE.replace('$colheaders', colheaders).replace('$rows', rendered_rows)
    folder = mkdtemp()
    destpath = op.join(folder, u'export.htm')
    fp = open(destpath, 'w')
    fp.write(content.encode('utf-8'))
    fp.close()
    return destpath

178
core/fs.py
Normal file
@@ -0,0 +1,178 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-22
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

# This is a fork from hsfs. The reason for this fork is that hsfs has been designed for musicGuru
# and was re-used for dupeGuru. The problem is that hsfs is way over-engineered for dupeGuru,
# resulting in needless complexity and memory usage. It's been a while since I wanted to do that
# fork, and I'm doing it now.

from __future__ import unicode_literals

import hashlib
import logging

from hsutil import io
from hsutil.misc import nonone, flatten
from hsutil.str import get_file_ext

class FSError(Exception):
    cls_message = "An error has occurred on '{name}' in '{parent}'"
    def __init__(self, fsobject, parent=None):
        message = self.cls_message
        if isinstance(fsobject, basestring):
            name = fsobject
        elif isinstance(fsobject, File):
            name = fsobject.name
        else:
            name = ''
        parentname = unicode(parent) if parent is not None else ''
        Exception.__init__(self, message.format(name=name, parent=parentname))


class AlreadyExistsError(FSError):
    "The directory or file name we're trying to add already exists"
    cls_message = "'{name}' already exists in '{parent}'"

class InvalidPath(FSError):
    "The path of self is invalid, and cannot be worked with."
    cls_message = "'{name}' is invalid."

class InvalidDestinationError(FSError):
    """A copy/move operation has been called, but the destination is invalid."""
    cls_message = "'{name}' is an invalid destination for this operation."

class OperationError(FSError):
    """A copy/move/delete operation has been called, but the checkup after the
    operation shows that it didn't work."""
    cls_message = "Operation on '{name}' failed."

class File(object):
    INITIAL_INFO = {
        'size': 0,
        'ctime': 0,
        'mtime': 0,
        'md5': '',
        'md5partial': '',
    }
    
    def __init__(self, path):
        self.path = path
        # This offset is where we should start reading the file to get a partial md5
        # For audio files, it should be where the audio data starts
        self._md5partial_offset = 0x4000 # 16Kb
        self._md5partial_size = 0x4000 # 16Kb
    
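    # Lazy attribute pattern: the fields listed in INITIAL_INFO are not read at
    # construction time. The first access falls through to __getattr__ below,
    # which calls _read_info() and caches the value in self.__dict__, so later
    # accesses never hit __getattr__ again.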
    def __getattr__(self, attrname):
        # Only called when attr is not there
        if attrname in self.INITIAL_INFO:
            try:
                self._read_info(attrname)
            except Exception as e:
                logging.warning("An error '%s' was raised while decoding '%s'", e, repr(self.path))
            try:
                return self.__dict__[attrname]
            except KeyError:
                return self.INITIAL_INFO[attrname]
        raise AttributeError()
    
    def _read_info(self, field):
        if field in ('size', 'ctime', 'mtime'):
            stats = io.stat(self.path)
            self.size = nonone(stats.st_size, 0)
            self.ctime = nonone(stats.st_ctime, 0)
            self.mtime = nonone(stats.st_mtime, 0)
        elif field == 'md5partial':
            try:
                fp = io.open(self.path, 'rb')
                offset = self._md5partial_offset
                size = self._md5partial_size
                fp.seek(offset)
                partialdata = fp.read(size)
                md5 = hashlib.md5(partialdata)
                self.md5partial = md5.digest()
                fp.close()
            except Exception:
                pass
        elif field == 'md5':
            try:
                fp = io.open(self.path, 'rb')
                filedata = fp.read()
                md5 = hashlib.md5(filedata)
                self.md5 = md5.digest()
                fp.close()
            except Exception:
                pass
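    
    # md5partial hashes only a 16KB slice starting at offset 16KB, which is far
    # cheaper than hashing whole files; the intent is that content scans can
    # rule out most candidates on the partial hash before falling back to the
    # full md5 (the comparison itself lives in engine.getmatches_by_contents,
    # called from scanner.py below).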
    def _read_all_info(self, attrnames=None):
        """Cache all possible info.
        
        If `attrnames` is not None, caches only attrnames.
        """
        if attrnames is None:
            attrnames = self.INITIAL_INFO.keys()
        for attrname in attrnames:
            if attrname not in self.__dict__:
                self._read_info(attrname)
    
    #--- Public
    @classmethod
    def can_handle(cls, path):
        return not io.islink(path) and io.isfile(path)
    
    def rename(self, newname):
        if newname == self.name:
            return
        destpath = self.path[:-1] + newname
        if io.exists(destpath):
            raise AlreadyExistsError(newname, self.path[:-1])
        try:
            io.rename(self.path, destpath)
        except EnvironmentError:
            raise OperationError(self)
        if not io.exists(destpath):
            raise OperationError(self)
        self.path = destpath
    
    #--- Properties
    @property
    def extension(self):
        return get_file_ext(self.name)
    
    @property
    def name(self):
        return self.path[-1]


def get_file(path, fileclasses=[File]):
    for fileclass in fileclasses:
        if fileclass.can_handle(path):
            return fileclass(path)

def get_files(path, fileclasses=[File]):
    assert all(issubclass(fileclass, File) for fileclass in fileclasses)
    try:
        paths = [path + name for name in io.listdir(path)]
        result = []
        for path in paths:
            file = get_file(path, fileclasses=fileclasses)
            if file is not None:
                result.append(file)
        return result
    except EnvironmentError:
        raise InvalidPath(path)

def get_all_files(path, fileclasses=[File]):
    files = get_files(path, fileclasses=fileclasses)
    filepaths = set(f.path for f in files)
    subpaths = [path + name for name in io.listdir(path)]
    # it's possible that a folder (bundle) gets into the file list. in that case, we don't want to recurse into it
    subfolders = [p for p in subpaths if not io.islink(p) and io.isdir(p) and p not in filepaths]
    subfiles = flatten(get_all_files(subpath, fileclasses=fileclasses) for subpath in subfolders)
    return subfiles + files
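
# Minimal usage sketch (assuming a hsutil Path to an existing folder):
#
#     from hsutil.path import Path
#     files = get_all_files(Path('/some/folder'))   # recursive collection
#     sizes = [(f.name, f.size) for f in files]     # triggers the lazy stat reads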

116
core/ignore.py
Normal file
@@ -0,0 +1,116 @@
# Created By: Virgil Dupras
# Created On: 2006/05/02
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

from hsutil.files import FileOrPath

import xml.dom.minidom

class IgnoreList(object):
    """An ignore list implementation that is iterable, filterable and exportable to XML.
    
    Call Ignore to add an ignore list entry, and AreIgnored to check if 2 items are in the list.
    When iterated, 2-sized tuples will be returned, each tuple containing 2 items ignored together.
    """
    #---Override
    def __init__(self):
        self._ignored = {}
        self._count = 0
    
    def __iter__(self):
        for first, seconds in self._ignored.iteritems():
            for second in seconds:
                yield (first, second)
    
    def __len__(self):
        return self._count
    
    #---Public
    def AreIgnored(self, first, second):
        def do_check(first, second):
            try:
                matches = self._ignored[first]
                return second in matches
            except KeyError:
                return False
        
        return do_check(first, second) or do_check(second, first)
    
    def Clear(self):
        self._ignored = {}
        self._count = 0
    
    def Filter(self, func):
        """Applies a filter on all ignored items, and removes all matches where func(first,second)
        doesn't return True.
        """
        filtered = IgnoreList()
        for first, second in self:
            if func(first, second):
                filtered.Ignore(first, second)
        self._ignored = filtered._ignored
        self._count = filtered._count
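    
    # Storage is one-directional: a pair is recorded under whichever of the two
    # items already has an entry in self._ignored (or under `first` for a brand
    # new pair); AreIgnored() compensates by checking both directions.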
    def Ignore(self, first, second):
        if self.AreIgnored(first, second):
            return
        try:
            matches = self._ignored[first]
            matches.add(second)
        except KeyError:
            try:
                matches = self._ignored[second]
                matches.add(first)
            except KeyError:
                matches = set()
                matches.add(second)
                self._ignored[first] = matches
        self._count += 1
    
    def load_from_xml(self, infile):
        """Loads the ignore list from an XML file created with save_to_xml.
        
        infile can be a file object or a filename.
        """
        try:
            doc = xml.dom.minidom.parse(infile)
        except Exception:
            return
        file_nodes = doc.getElementsByTagName('file')
        for fn in file_nodes:
            if not fn.getAttributeNode('path'):
                continue
            file_path = fn.getAttributeNode('path').nodeValue
            subfile_nodes = fn.getElementsByTagName('file')
            for sfn in subfile_nodes:
                if not sfn.getAttributeNode('path'):
                    continue
                subfile_path = sfn.getAttributeNode('path').nodeValue
                self.Ignore(file_path, subfile_path)
    
    def save_to_xml(self, outfile):
        """Create an XML file that can be used by load_from_xml.
        
        outfile can be a file object or a filename.
        """
        doc = xml.dom.minidom.Document()
        root = doc.appendChild(doc.createElement('ignore_list'))
        for file, subfiles in self._ignored.items():
            file_node = root.appendChild(doc.createElement('file'))
            if isinstance(file, unicode):
                file = file.encode('utf-8')
            file_node.setAttribute('path', file)
            for subfile in subfiles:
                subfile_node = file_node.appendChild(doc.createElement('file'))
                if isinstance(subfile, unicode):
                    subfile = subfile.encode('utf-8')
                subfile_node.setAttribute('path', subfile)
        with FileOrPath(outfile, 'wb') as fp:
            doc.writexml(fp, '\t', '\t', '\n', encoding='utf-8')

370
core/results.py
Normal file
@@ -0,0 +1,370 @@
# Created By: Virgil Dupras
# Created On: 2006/02/23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

import logging
import re
from xml.sax import handler, make_parser, SAXException
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesImpl

from . import engine
from hsutil.job import nulljob
from hsutil.markable import Markable
from hsutil.misc import flatten, cond, nonone
from hsutil.str import format_size
from hsutil.files import open_if_filename

class Results(Markable):
    #---Override
    def __init__(self, data_module):
        super(Results, self).__init__()
        self.__groups = []
        self.__group_of_duplicate = {}
        self.__groups_sort_descriptor = None # This is a tuple (key, asc)
        self.__dupes = None
        self.__dupes_sort_descriptor = None # This is a tuple (key, asc, delta)
        self.__filters = None
        self.__filtered_dupes = None
        self.__filtered_groups = None
        self.__recalculate_stats()
        self.__marked_size = 0
        self.data = data_module
    
    def _did_mark(self, dupe):
        self.__marked_size += dupe.size
    
    def _did_unmark(self, dupe):
        self.__marked_size -= dupe.size
    
    def _get_markable_count(self):
        return self.__total_count
    
    def _is_markable(self, dupe):
        if dupe.is_ref:
            return False
        g = self.get_group_of_duplicate(dupe)
        if not g:
            return False
        if dupe is g.ref:
            return False
        if self.__filtered_dupes and dupe not in self.__filtered_dupes:
            return False
        return True
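    
    # Note: the double-underscore attributes set up in __init__ are lazy caches;
    # the private helpers below recompute them on demand and reset them to None
    # whenever groups, filters or sort order change.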
    #---Private
    def __get_dupe_list(self):
        if self.__dupes is None:
            self.__dupes = flatten(group.dupes for group in self.groups)
            if None in self.__dupes:
                # This is debug logging to try to figure out #44
                logging.warning("There is a None value in the Results' dupe list. dupes: %r groups: %r", self.__dupes, self.groups)
            if self.__filtered_dupes:
                self.__dupes = [dupe for dupe in self.__dupes if dupe in self.__filtered_dupes]
            sd = self.__dupes_sort_descriptor
            if sd:
                self.sort_dupes(sd[0], sd[1], sd[2])
        return self.__dupes
    
    def __get_groups(self):
        if self.__filtered_groups is None:
            return self.__groups
        else:
            return self.__filtered_groups
    
    def __get_stat_line(self):
        if self.__filtered_dupes is None:
            mark_count = self.mark_count
            marked_size = self.__marked_size
            total_count = self.__total_count
            total_size = self.__total_size
        else:
            mark_count = len([dupe for dupe in self.__filtered_dupes if self.is_marked(dupe)])
            marked_size = sum(dupe.size for dupe in self.__filtered_dupes if self.is_marked(dupe))
            total_count = len([dupe for dupe in self.__filtered_dupes if self.is_markable(dupe)])
            total_size = sum(dupe.size for dupe in self.__filtered_dupes if self.is_markable(dupe))
        if self.mark_inverted:
            marked_size = self.__total_size - marked_size
        result = '%d / %d (%s / %s) duplicates marked.' % (
            mark_count,
            total_count,
            format_size(marked_size, 2),
            format_size(total_size, 2),
        )
        if self.__filters:
            result += ' filter: %s' % ' --> '.join(self.__filters)
        return result
    
    def __recalculate_stats(self):
        self.__total_size = 0
        self.__total_count = 0
        for group in self.groups:
            markable = [dupe for dupe in group.dupes if self._is_markable(dupe)]
            self.__total_count += len(markable)
            self.__total_size += sum(dupe.size for dupe in markable)
    
    def __set_groups(self, new_groups):
        self.mark_none()
        self.__groups = new_groups
        self.__group_of_duplicate = {}
        for g in self.__groups:
            for dupe in g:
                self.__group_of_duplicate[dupe] = g
                if not hasattr(dupe, 'is_ref'):
                    dupe.is_ref = False
        old_filters = nonone(self.__filters, [])
        self.apply_filter(None)
        for filter_str in old_filters:
            self.apply_filter(filter_str)
    
    #---Public
    def apply_filter(self, filter_str):
        ''' Applies a filter 'filter_str' to self.groups
        
        When you apply the filter, only dupes with a filename matching 'filter_str' will be in
        the results. To cancel the filter, just call apply_filter with 'filter_str' set to None,
        and the results will go back to normal.
        
        If you call apply_filter on already filtered results, the filter will be applied
        *on the filtered results*.
        
        'filter_str' is a string containing a regexp to filter dupes with.
        '''
        if not filter_str:
            self.__filtered_dupes = None
            self.__filtered_groups = None
            self.__filters = None
        else:
            if not self.__filters:
                self.__filters = []
            try:
                filter_re = re.compile(filter_str, re.IGNORECASE)
            except re.error:
                return # don't apply this filter.
            self.__filters.append(filter_str)
            if self.__filtered_dupes is None:
                self.__filtered_dupes = flatten(g[:] for g in self.groups)
            self.__filtered_dupes = set(dupe for dupe in self.__filtered_dupes if filter_re.search(dupe.name))
            filtered_groups = set()
            for dupe in self.__filtered_dupes:
                filtered_groups.add(self.get_group_of_duplicate(dupe))
            self.__filtered_groups = list(filtered_groups)
        self.__recalculate_stats()
        sd = self.__groups_sort_descriptor
        if sd:
            self.sort_groups(sd[0], sd[1])
        self.__dupes = None
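    
    # Filters stack: each call narrows the previous result, and passing None
    # resets everything. For instance (hypothetical patterns):
    #
    #     results.apply_filter(None)       # back to the full results
    #     results.apply_filter(u'foo')     # dupes whose name matches 'foo'
    #     results.apply_filter(u'[0-9]')   # ...whose name also contains a digit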
    def get_group_of_duplicate(self, dupe):
        try:
            return self.__group_of_duplicate[dupe]
        except (TypeError, KeyError):
            return None
    
    is_markable = _is_markable
    
    def load_from_xml(self, infile, get_file, j=nulljob):
        self.apply_filter(None)
        handler = _ResultsHandler(get_file)
        try:
            parser = make_parser()
        except Exception as e:
            # This special handling is to try to figure out the cause of #47
            # We don't silently return, because we want the user to send error report.
            logging.exception(e)
            try:
                import xml.parsers.expat
                logging.warning('importing xml.parsers.expat went ok, WTF?')
            except Exception as e:
                # This log should give a little more details about the cause of this all
                logging.exception(e)
                raise
            raise
        parser.setContentHandler(handler)
        try:
            infile, must_close = open_if_filename(infile)
        except IOError:
            return
        BUFSIZE = 1024 * 1024 # 1mb buffer
        infile.seek(0, 2)
        j.start_job(infile.tell() // BUFSIZE)
        infile.seek(0, 0)
        try:
            while True:
                data = infile.read(BUFSIZE)
                if not data:
                    break
                parser.feed(data)
                j.add_progress()
        except SAXException:
            return
        self.groups = handler.groups
        for dupe_file in handler.marked:
            self.mark(dupe_file)
    
    def make_ref(self, dupe):
        g = self.get_group_of_duplicate(dupe)
        r = g.ref
        self._remove_mark_flag(dupe)
        g.switch_ref(dupe)
        if not r.is_ref:
            self.__total_count += 1
            self.__total_size += r.size
        if not dupe.is_ref:
            self.__total_count -= 1
            self.__total_size -= dupe.size
        self.__dupes = None
    
    def perform_on_marked(self, func, remove_from_results):
        problems = []
        for d in self.dupes:
            if self.is_marked(d) and (not func(d)):
                problems.append(d)
        if remove_from_results:
            to_remove = [d for d in self.dupes if self.is_marked(d) and (d not in problems)]
            self.remove_duplicates(to_remove)
            self.mark_none()
            for d in problems:
                self.mark(d)
        return len(problems)
    
    def remove_duplicates(self, dupes):
        '''Remove 'dupes' from their respective groups, and remove each group if it ends up empty.
        '''
        affected_groups = set()
        for dupe in dupes:
            group = self.get_group_of_duplicate(dupe)
            if dupe not in group.dupes:
                return
            group.remove_dupe(dupe, False)
            self._remove_mark_flag(dupe)
            self.__total_count -= 1
            self.__total_size -= dupe.size
            if not group:
                self.__groups.remove(group)
                if self.__filtered_groups:
                    self.__filtered_groups.remove(group)
            else:
                affected_groups.add(group)
        for group in affected_groups:
            group.discard_matches()
        self.__dupes = None
    
    def save_to_xml(self, outfile):
        self.apply_filter(None)
        outfile, must_close = open_if_filename(outfile, 'wb')
        writer = XMLGenerator(outfile, 'utf-8')
        writer.startDocument()
        empty_attrs = AttributesImpl({})
        writer.startElement('results', empty_attrs)
        for g in self.groups:
            writer.startElement('group', empty_attrs)
            dupe2index = {}
            for index, d in enumerate(g):
                dupe2index[d] = index
                try:
                    words = engine.unpack_fields(d.words)
                except AttributeError:
                    words = ()
                attrs = AttributesImpl({
                    'path': unicode(d.path),
                    'is_ref': cond(d.is_ref, 'y', 'n'),
                    'words': ','.join(words),
                    'marked': cond(self.is_marked(d), 'y', 'n')
                })
                writer.startElement('file', attrs)
                writer.endElement('file')
            for match in g.matches:
                attrs = AttributesImpl({
                    'first': str(dupe2index[match.first]),
                    'second': str(dupe2index[match.second]),
                    'percentage': str(int(match.percentage)),
                })
                writer.startElement('match', attrs)
                writer.endElement('match')
            writer.endElement('group')
        writer.endElement('results')
        writer.endDocument()
        if must_close:
            outfile.close()
    
    def sort_dupes(self, key, asc=True, delta=False):
        if not self.__dupes:
            self.__get_dupe_list()
        self.__dupes.sort(key=lambda d: self.data.GetDupeSortKey(d, lambda: self.get_group_of_duplicate(d), key, delta))
        if not asc:
            self.__dupes.reverse()
        self.__dupes_sort_descriptor = (key, asc, delta)
    
    def sort_groups(self, key, asc=True):
        self.groups.sort(key=lambda g: self.data.GetGroupSortKey(g, key))
        if not asc:
            self.groups.reverse()
        self.__groups_sort_descriptor = (key, asc)
    
    #---Properties
    dupes = property(__get_dupe_list)
    groups = property(__get_groups, __set_groups)
    stat_line = property(__get_stat_line)
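
# The results file written by save_to_xml and parsed back by _ResultsHandler
# has this shape (the values here are hypothetical):
#
#     <results>
#         <group>
#             <file path="/foo/bar" words="foo,bar" is_ref="y" marked="n"/>
#             <file path="/foo/baz" words="foo,baz" is_ref="n" marked="y"/>
#             <match first="0" second="1" percentage="95"/>
#         </group>
#     </results>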

class _ResultsHandler(handler.ContentHandler):
    def __init__(self, get_file):
        self.group = None
        self.dupes = None
        self.marked = set()
        self.groups = []
        self.get_file = get_file
    
    def startElement(self, name, attrs):
        if name == 'group':
            self.group = engine.Group()
            self.dupes = []
            return
        if (name == 'file') and (self.group is not None):
            if not (('path' in attrs) and ('words' in attrs)):
                return
            path = attrs['path']
            file = self.get_file(path)
            if file is None:
                return
            file.words = attrs['words'].split(',')
            file.is_ref = attrs.get('is_ref') == 'y'
            self.dupes.append(file)
            if attrs.get('marked') == 'y':
                self.marked.add(file)
        if (name == 'match') and (self.group is not None):
            try:
                first_file = self.dupes[int(attrs['first'])]
                second_file = self.dupes[int(attrs['second'])]
                percentage = int(attrs['percentage'])
                self.group.add_match(engine.Match(first_file, second_file, percentage))
            except (IndexError, KeyError, ValueError): # Covers missing attrs, non-int values and indexes out of bounds
                pass
    
    def endElement(self, name):
        def do_match(ref_file, other_files, group):
            if not other_files:
                return
            for other_file in other_files:
                group.add_match(engine.get_match(ref_file, other_file))
            do_match(other_files[0], other_files[1:], group)
        
        if name == 'group':
            group = self.group
            self.group = None
            dupes = self.dupes
            self.dupes = []
            if group is None:
                return
            if len(dupes) < 2:
                return
            if not group.matches: # <match> elements not present, do it manually, without %
                do_match(dupes[0], dupes[1:], group)
            group.prioritize(lambda x: dupes.index(x))
            self.groups.append(group)

109
core/scanner.py
Normal file
@@ -0,0 +1,109 @@
# Created By: Virgil Dupras
# Created On: 2006/03/03
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

import logging


from hsutil import job, io
from hsutil.misc import dedupe
from hsutil.str import get_file_ext, rem_file_ext

from . import engine
from .ignore import IgnoreList

(SCAN_TYPE_FILENAME,
SCAN_TYPE_FIELDS,
SCAN_TYPE_FIELDS_NO_ORDER,
SCAN_TYPE_TAG,
UNUSED, # Must not be removed. These values must stay in sync with the scan_type values stored in the prefs.
SCAN_TYPE_CONTENT,
SCAN_TYPE_CONTENT_AUDIO) = range(7)

SCANNABLE_TAGS = ['track', 'artist', 'album', 'title', 'genre', 'year']

class Scanner(object):
    def __init__(self):
        self.ignore_list = IgnoreList()
        self.discarded_file_count = 0
    
    def _getmatches(self, files, j):
        if self.size_threshold:
            j = j.start_subjob([2, 8])
            for f in j.iter_with_progress(files, 'Read size of %d/%d files'):
                f.size # pre-read, makes a smoother progress if read here (especially for bundles)
            files = [f for f in files if f.size >= self.size_threshold]
        if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
            sizeattr = 'size' if self.scan_type == SCAN_TYPE_CONTENT else 'audiosize'
            return engine.getmatches_by_contents(files, sizeattr, partial=self.scan_type==SCAN_TYPE_CONTENT_AUDIO, j=j)
        else:
            j = j.start_subjob([2, 8])
            kw = {}
            kw['match_similar_words'] = self.match_similar_words
            kw['weight_words'] = self.word_weighting
            kw['min_match_percentage'] = self.min_match_percentage
            if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
                self.scan_type = SCAN_TYPE_FIELDS
                kw['no_field_order'] = True
            func = {
                SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
                SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
                SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
            }[self.scan_type]
            for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
                f.words = func(f)
            return engine.getmatches(files, j=j, **kw)
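    
    # Group prioritization heuristics used by GetDupeGroups() below: _key_func
    # puts reference files first, then bigger files; _tie_breaker(ref, dupe)
    # returns True when the dupe should become ref instead -- it demotes names
    # that contain 'copy' or end with a copy number ('foo 2'), and otherwise
    # prefers the file with the deeper path as reference.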
    @staticmethod
    def _key_func(dupe):
        return (not dupe.is_ref, -dupe.size)
    
    @staticmethod
    def _tie_breaker(ref, dupe):
        refname = rem_file_ext(ref.name).lower()
        dupename = rem_file_ext(dupe.name).lower()
        if 'copy' in refname and 'copy' not in dupename:
            return True
        if refname.startswith(dupename) and (refname[len(dupename):].strip().isdigit()):
            return True
        return len(dupe.path) > len(ref.path)
    
    def GetDupeGroups(self, files, j=job.nulljob):
        j = j.start_subjob([8, 2])
        for f in [f for f in files if not hasattr(f, 'is_ref')]:
            f.is_ref = False
        logging.info('Getting matches')
        matches = self._getmatches(files, j)
        logging.info('Found %d matches' % len(matches))
        j.set_progress(100, 'Removing false matches')
        if not self.mix_file_kind:
            matches = [m for m in matches if get_file_ext(m.first.name) == get_file_ext(m.second.name)]
        matches = [m for m in matches if io.exists(m.first.path) and io.exists(m.second.path)]
        if self.ignore_list:
            j = j.start_subjob(2)
            iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
            matches = [m for m in iter_matches
                if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
        logging.info('Grouping matches')
        groups = engine.get_groups(matches, j)
        matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
        self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
        groups = [g for g in groups if any(not f.is_ref for f in g)]
        logging.info('Created %d groups' % len(groups))
        j.set_progress(100, 'Doing group prioritization')
        for g in groups:
            g.prioritize(self._key_func, self._tie_breaker)
        return groups
    
    match_similar_words = False
    min_match_percentage = 80
    mix_file_kind = True
    scan_type = SCAN_TYPE_FILENAME
    scanned_tags = set(['artist', 'title'])
    size_threshold = 0
    word_weighting = False

0
core/tests/__init__.py
Normal file

366
core/tests/app_cocoa_test.py
Normal file
@@ -0,0 +1,366 @@
# Created By: Virgil Dupras
# Created On: 2006/11/11
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

import tempfile
import shutil
import logging
import os.path as op

from nose.tools import eq_

from hsutil.path import Path
from hsutil.testcase import TestCase
from hsutil.decorators import log_calls
from hsutil import io
from hsutil.job import nulljob # missing from the original import list, but _start_job below needs it

from . import data
from .results_test import GetTestGroups
from .. import engine, fs
try:
    from ..app_cocoa import DupeGuru as DupeGuruBase
except ImportError:
    from nose.plugins.skip import SkipTest
    raise SkipTest("These tests can only be run on OS X")

class DupeGuru(DupeGuruBase):
    def __init__(self):
        DupeGuruBase.__init__(self, data, '/tmp', appid=4)
    
    def _start_job(self, jobid, func):
        func(nulljob)

def r2np(rows):
    # Transforms a list of rows [1,2,3] into a list of node paths [[1],[2],[3]]
    return [[i] for i in rows]
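
# Node paths address the results outline: [g] selects group g's reference file
# and [g, d] selects the d-th dupe inside group g (see test_GetObjects below);
# r2np() wraps flat power-marker row indexes the same way.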

class TCDupeGuru(TestCase):
    def setUp(self):
        self.app = DupeGuru()
        self.objects,self.matches,self.groups = GetTestGroups()
        self.app.results.groups = self.groups
        tmppath = self.tmppath()
        io.mkdir(tmppath + 'foo')
        io.mkdir(tmppath + 'bar')
        self.app.directories.add_path(tmppath)
    
    def test_GetObjects(self):
        app = self.app
        objects = self.objects
        groups = self.groups
        g,d = app.GetObjects([0])
        self.assert_(g is groups[0])
        self.assert_(d is None)
        g,d = app.GetObjects([0,0])
        self.assert_(g is groups[0])
        self.assert_(d is objects[1])
        g,d = app.GetObjects([1,0])
        self.assert_(g is groups[1])
        self.assert_(d is objects[4])
    
    def test_GetObjects_after_sort(self):
        app = self.app
        objects = self.objects
        groups = self.groups[:] # To keep the old order in memory
        app.sort_groups(0,False) # 0 = Filename
        # Now, the group order is supposed to be reversed
        g,d = app.GetObjects([0,0])
        self.assert_(g is groups[1])
        self.assert_(d is objects[4])
    
    def test_GetObjects_out_of_range(self):
        app = self.app
        self.assertEqual((None,None),app.GetObjects([2]))
        self.assertEqual((None,None),app.GetObjects([]))
        self.assertEqual((None,None),app.GetObjects([1,2]))
    
    def test_selected_result_node_paths(self):
        # app.selected_dupes is correctly converted into node paths
        app = self.app
        objects = self.objects
        paths = [[0, 0], [0, 1], [1]]
        app.SelectResultNodePaths(paths)
        eq_(app.selected_result_node_paths(), paths)
    
    def test_selected_result_node_paths_after_deletion(self):
        # cases where the selected dupes aren't there are correctly handled
        app = self.app
        objects = self.objects
        paths = [[0, 0], [0, 1], [1]]
        app.SelectResultNodePaths(paths)
        app.RemoveSelected()
        # The first 2 dupes have been removed. The 3rd one is a ref. It stays there, in first pos.
        eq_(app.selected_result_node_paths(), [[0]]) # no exception
    
    def test_selectResultNodePaths(self):
        app = self.app
        objects = self.objects
        app.SelectResultNodePaths([[0,0],[0,1]])
        self.assertEqual(2,len(app.selected_dupes))
        self.assert_(app.selected_dupes[0] is objects[1])
        self.assert_(app.selected_dupes[1] is objects[2])
    
    def test_selectResultNodePaths_with_ref(self):
        app = self.app
        objects = self.objects
        app.SelectResultNodePaths([[0,0],[0,1],[1]])
        self.assertEqual(3,len(app.selected_dupes))
        self.assert_(app.selected_dupes[0] is objects[1])
        self.assert_(app.selected_dupes[1] is objects[2])
        self.assert_(app.selected_dupes[2] is self.groups[1].ref)
    
    def test_selectResultNodePaths_empty(self):
        self.app.SelectResultNodePaths([])
        self.assertEqual(0,len(self.app.selected_dupes))
    
    def test_selectResultNodePaths_after_sort(self):
        app = self.app
        objects = self.objects
        groups = self.groups[:] # To keep the old order in memory
        app.sort_groups(0,False) # 0 = Filename
        # Now, the group order is supposed to be reversed
        app.SelectResultNodePaths([[0,0],[1],[1,0]])
        self.assertEqual(3,len(app.selected_dupes))
        self.assert_(app.selected_dupes[0] is objects[4])
        self.assert_(app.selected_dupes[1] is groups[0].ref)
        self.assert_(app.selected_dupes[2] is objects[1])
    
    def test_selectResultNodePaths_out_of_range(self):
        app = self.app
        app.SelectResultNodePaths([[0,0],[0,1],[1],[1,1],[2]])
        self.assertEqual(3,len(app.selected_dupes))
    
    def test_selected_powermarker_node_paths(self):
        # app.selected_dupes is correctly converted into paths
        app = self.app
        objects = self.objects
        paths = r2np([0, 1, 2])
        app.SelectPowerMarkerNodePaths(paths)
        eq_(app.selected_powermarker_node_paths(), paths)
    
    def test_selected_powermarker_node_paths_after_deletion(self):
        # cases where the selected dupes aren't there are correctly handled
        app = self.app
        objects = self.objects
        paths = r2np([0, 1, 2])
        app.SelectPowerMarkerNodePaths(paths)
        app.RemoveSelected()
        eq_(app.selected_powermarker_node_paths(), []) # no exception
    
    def test_selectPowerMarkerRows(self):
        app = self.app
        objects = self.objects
        app.SelectPowerMarkerNodePaths(r2np([0,1,2]))
        self.assertEqual(3,len(app.selected_dupes))
        self.assert_(app.selected_dupes[0] is objects[1])
        self.assert_(app.selected_dupes[1] is objects[2])
        self.assert_(app.selected_dupes[2] is objects[4])
    
    def test_selectPowerMarkerRows_empty(self):
        self.app.SelectPowerMarkerNodePaths([])
        self.assertEqual(0,len(self.app.selected_dupes))
    
    def test_selectPowerMarkerRows_after_sort(self):
        app = self.app
        objects = self.objects
        app.sort_dupes(0,False) # 0 = Filename
        app.SelectPowerMarkerNodePaths(r2np([0,1,2]))
        self.assertEqual(3,len(app.selected_dupes))
        self.assert_(app.selected_dupes[0] is objects[4])
        self.assert_(app.selected_dupes[1] is objects[2])
        self.assert_(app.selected_dupes[2] is objects[1])
    
    def test_selectPowerMarkerRows_out_of_range(self):
        app = self.app
        app.SelectPowerMarkerNodePaths(r2np([0,1,2,3]))
        self.assertEqual(3,len(app.selected_dupes))
    
    def test_toggleSelectedMark(self):
        app = self.app
        objects = self.objects
        app.ToggleSelectedMarkState()
        self.assertEqual(0,app.results.mark_count)
        app.SelectPowerMarkerNodePaths(r2np([0,2]))
        app.ToggleSelectedMarkState()
        self.assertEqual(2,app.results.mark_count)
        self.assert_(not app.results.is_marked(objects[0]))
        self.assert_(app.results.is_marked(objects[1]))
        self.assert_(not app.results.is_marked(objects[2]))
        self.assert_(not app.results.is_marked(objects[3]))
        self.assert_(app.results.is_marked(objects[4]))
    
    def test_refreshDetailsWithSelected(self):
        def mock_refresh(dupe,group):
            self.called = True
            if self.app.selected_dupes:
                self.assert_(dupe is self.app.selected_dupes[0])
                self.assert_(group is self.app.results.get_group_of_duplicate(dupe))
            else:
                self.assert_(dupe is None)
                self.assert_(group is None)
        
        self.app.RefreshDetailsTable = mock_refresh
        self.called = False
        self.app.SelectPowerMarkerNodePaths(r2np([0,2]))
        self.app.RefreshDetailsWithSelected()
        self.assert_(self.called)
        self.called = False
        self.app.SelectPowerMarkerNodePaths([])
        self.app.RefreshDetailsWithSelected()
        self.assert_(self.called)
    
    def test_makeSelectedReference(self):
        app = self.app
        objects = self.objects
        groups = self.groups
        app.SelectPowerMarkerNodePaths(r2np([0,2]))
        app.MakeSelectedReference()
        self.assert_(groups[0].ref is objects[1])
        self.assert_(groups[1].ref is objects[4])
    
    def test_makeSelectedReference_by_selecting_two_dupes_in_the_same_group(self):
        app = self.app
        objects = self.objects
        groups = self.groups
        app.SelectPowerMarkerNodePaths(r2np([0,1,2]))
        # Only 0 and 2 must go ref, not 1, because it is a part of the same group
        app.MakeSelectedReference()
        self.assert_(groups[0].ref is objects[1])
        self.assert_(groups[1].ref is objects[4])
    
    def test_removeSelected(self):
        app = self.app
        app.SelectPowerMarkerNodePaths(r2np([0,2]))
        app.RemoveSelected()
        self.assertEqual(1,len(app.results.dupes))
        app.RemoveSelected()
        self.assertEqual(1,len(app.results.dupes))
        app.SelectPowerMarkerNodePaths(r2np([0,2]))
        app.RemoveSelected()
        self.assertEqual(0,len(app.results.dupes))
    
    def test_addDirectory_simple(self):
        # There's already a directory in self.app, so adding another one makes 2 of them
        app = self.app
        eq_(app.add_directory(self.datadirpath()), 0)
        eq_(len(app.directories), 2)
    
    def test_addDirectory_already_there(self):
        app = self.app
        self.assertEqual(0,app.add_directory(self.datadirpath()))
        self.assertEqual(1,app.add_directory(self.datadirpath()))
    
    def test_addDirectory_does_not_exist(self):
        app = self.app
        self.assertEqual(2,app.add_directory('/does_not_exist'))
    
    def test_ignore(self):
        app = self.app
        app.SelectPowerMarkerNodePaths(r2np([2])) # The dupe of the second, 2-sized group
        app.AddSelectedToIgnoreList()
        self.assertEqual(1,len(app.scanner.ignore_list))
        app.SelectPowerMarkerNodePaths(r2np([0])) # first dupe of the 3-dupes group
        app.AddSelectedToIgnoreList()
        # BOTH the ref and the other dupe should have been added
        self.assertEqual(3,len(app.scanner.ignore_list))
    
    def test_purgeIgnoreList(self):
        app = self.app
        p1 = self.filepath('zerofile')
        p2 = self.filepath('zerofill')
        dne = '/does_not_exist'
        app.scanner.ignore_list.Ignore(dne,p1)
        app.scanner.ignore_list.Ignore(p2,dne)
        app.scanner.ignore_list.Ignore(p1,p2)
        app.PurgeIgnoreList()
        self.assertEqual(1,len(app.scanner.ignore_list))
        self.assert_(app.scanner.ignore_list.AreIgnored(p1,p2))
        self.assert_(not app.scanner.ignore_list.AreIgnored(dne,p1))
    
    def test_only_unicode_is_added_to_ignore_list(self):
        def FakeIgnore(first,second):
            if not isinstance(first,unicode):
                self.fail()
            if not isinstance(second,unicode):
                self.fail()
        
        app = self.app
        app.scanner.ignore_list.Ignore = FakeIgnore
        app.SelectPowerMarkerNodePaths(r2np([2])) # The dupe of the second, 2-sized group
        app.AddSelectedToIgnoreList()
    
    def test_GetOutlineViewChildCounts_out_of_range(self):
        # Out of range requests don't crash and return an empty value
        app = self.app
        # [0, 2] is out of range
        eq_(app.GetOutlineViewChildCounts(1, [0, 2]), []) # no crash
    
    def test_GetOutlineViewValues_out_of_range(self):
        # Out of range requests don't crash and return an empty value
        app = self.app
        # [0, 2] is out of range
        eq_(app.GetOutlineViewValues(1, [0, 2]), []) # no crash


class TCDupeGuru_renameSelected(TestCase):
    def setUp(self):
        p = self.tmppath()
        fp = open(unicode(p + 'foo bar 1'),mode='w')
        fp.close()
        fp = open(unicode(p + 'foo bar 2'),mode='w')
        fp.close()
        fp = open(unicode(p + 'foo bar 3'),mode='w')
        fp.close()
        files = fs.get_files(p)
        matches = engine.getmatches(files)
        groups = engine.get_groups(matches)
        g = groups[0]
        g.prioritize(lambda x:x.name)
        app = DupeGuru()
        app.results.groups = groups
        self.app = app
        self.groups = groups
        self.p = p
        self.files = files
    
    def test_simple(self):
        app = self.app
        g = self.groups[0]
        app.SelectPowerMarkerNodePaths(r2np([0]))
        assert app.RenameSelected('renamed')
        names = io.listdir(self.p)
        assert 'renamed' in names
        assert 'foo bar 2' not in names
        eq_(g.dupes[0].name, 'renamed')
    
    def test_none_selected(self):
        app = self.app
        g = self.groups[0]
        app.SelectPowerMarkerNodePaths([])
        self.mock(logging, 'warning', log_calls(lambda msg: None))
        assert not app.RenameSelected('renamed')
        msg = logging.warning.calls[0]['msg']
        eq_('dupeGuru Warning: list index out of range', msg)
        names = io.listdir(self.p)
        assert 'renamed' not in names
        assert 'foo bar 2' in names
        eq_(g.dupes[0].name, 'foo bar 2')
    
    def test_name_already_exists(self):
        app = self.app
        g = self.groups[0]
        app.SelectPowerMarkerNodePaths(r2np([0]))
        self.mock(logging, 'warning', log_calls(lambda msg: None))
        assert not app.RenameSelected('foo bar 1')
        msg = logging.warning.calls[0]['msg']
        assert msg.startswith('dupeGuru Warning: \'foo bar 1\' already exists in')
        names = io.listdir(self.p)
        assert 'foo bar 1' in names
        assert 'foo bar 2' in names
        eq_(g.dupes[0].name, 'foo bar 2')

136
core/tests/app_test.py
Normal file
@@ -0,0 +1,136 @@
# Created By: Virgil Dupras
# Created On: 2007-06-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

import os

from hsutil.testcase import TestCase
from hsutil import io
from hsutil.path import Path
from hsutil.decorators import log_calls
import hsutil.files
from hsutil.job import nulljob

from . import data
from .. import app, fs
from ..app import DupeGuru as DupeGuruBase

class DupeGuru(DupeGuruBase):
    def __init__(self):
        DupeGuruBase.__init__(self, data, '/tmp', appid=4)
    
    def _start_job(self, jobid, func):
        func(nulljob)


class TCDupeGuru(TestCase):
    cls_tested_module = app
    def test_apply_filter_calls_results_apply_filter(self):
        app = DupeGuru()
        self.mock(app.results, 'apply_filter', log_calls(app.results.apply_filter))
        app.apply_filter('foo')
        self.assertEqual(2, len(app.results.apply_filter.calls))
        call = app.results.apply_filter.calls[0]
        self.assert_(call['filter_str'] is None)
        call = app.results.apply_filter.calls[1]
        self.assertEqual('foo', call['filter_str'])
    
    def test_apply_filter_escapes_regexp(self):
        app = DupeGuru()
        self.mock(app.results, 'apply_filter', log_calls(app.results.apply_filter))
        app.apply_filter('()[]\\.|+?^abc')
        call = app.results.apply_filter.calls[1]
        self.assertEqual('\\(\\)\\[\\]\\\\\\.\\|\\+\\?\\^abc', call['filter_str'])
        app.apply_filter('(*)') # In "simple mode", we want the * to behave as a wildcard
        call = app.results.apply_filter.calls[3]
        self.assertEqual('\(.*\)', call['filter_str'])
        app.options['escape_filter_regexp'] = False
        app.apply_filter('(abc)')
        call = app.results.apply_filter.calls[5]
        self.assertEqual('(abc)', call['filter_str'])
    
    def test_copy_or_move(self):
        # The goal here is just to have a test for a previous blowup I had. I know my test coverage
        # for this unit is pathetic. What's done is done. My approach now is to add tests for
        # every change I want to make. The blowup was caused by a missing import.
        p = self.tmppath()
        io.open(p + 'foo', 'w').close()
        self.mock(hsutil.files, 'copy', log_calls(lambda source_path, dest_path: None))
        self.mock(os, 'makedirs', lambda path: None) # We don't want the test to create that fake directory
        app = DupeGuru()
        app.directories.add_path(p)
        [f] = app.directories.get_files()
        app.copy_or_move(f, True, 'some_destination', 0)
        self.assertEqual(1, len(hsutil.files.copy.calls))
        call = hsutil.files.copy.calls[0]
        self.assertEqual('some_destination', call['dest_path'])
        self.assertEqual(f.path, call['source_path'])
    
    def test_copy_or_move_clean_empty_dirs(self):
        tmppath = Path(self.tmpdir())
        sourcepath = tmppath + 'source'
        io.mkdir(sourcepath)
        io.open(sourcepath + 'myfile', 'w')
        app = DupeGuru()
        app.directories.add_path(tmppath)
        [myfile] = app.directories.get_files()
        self.mock(app, 'clean_empty_dirs', log_calls(lambda path: None))
        app.copy_or_move(myfile, False, tmppath + 'dest', 0)
        calls = app.clean_empty_dirs.calls
        self.assertEqual(1, len(calls))
        self.assertEqual(sourcepath, calls[0]['path'])
    
    def test_Scan_with_objects_evaluating_to_false(self):
        class FakeFile(fs.File):
            def __nonzero__(self):
                return False
        
        
        # At some point, any() was used in a wrong way that made Scan() wrongly return 1
        app = DupeGuru()
        f1, f2 = [FakeFile('foo') for i in range(2)]
        f1.is_ref, f2.is_ref = (False, False)
        assert not (bool(f1) and bool(f2))
        app.directories.get_files = lambda: [f1, f2]
        app.directories._dirs.append('this is just so Scan() doesnt return 3')
        app.start_scanning() # no exception


class TCDupeGuru_clean_empty_dirs(TestCase):
    cls_tested_module = app
    def setUp(self):
        self.mock(hsutil.files, 'delete_if_empty', log_calls(lambda path, files_to_delete=[]: None))
        self.app = DupeGuru()
    
    def test_option_off(self):
        self.app.clean_empty_dirs(Path('/foo/bar'))
        self.assertEqual(0, len(hsutil.files.delete_if_empty.calls))
    
    def test_option_on(self):
        self.app.options['clean_empty_dirs'] = True
        self.app.clean_empty_dirs(Path('/foo/bar'))
        calls = hsutil.files.delete_if_empty.calls
        self.assertEqual(1, len(calls))
        self.assertEqual(Path('/foo/bar'), calls[0]['path'])
        self.assertEqual(['.DS_Store'], calls[0]['files_to_delete'])
    
    def test_recurse_up(self):
        # delete_if_empty must be recursively called up in the path until it returns False
        @log_calls
        def mock_delete_if_empty(path, files_to_delete=[]):
            return len(path) > 1
        
        self.mock(hsutil.files, 'delete_if_empty', mock_delete_if_empty)
        self.app.options['clean_empty_dirs'] = True
        self.app.clean_empty_dirs(Path('not-empty/empty/empty'))
        calls = hsutil.files.delete_if_empty.calls
        self.assertEqual(3, len(calls))
        self.assertEqual(Path('not-empty/empty/empty'), calls[0]['path'])
        self.assertEqual(Path('not-empty/empty'), calls[1]['path'])
        self.assertEqual(Path('not-empty'), calls[2]['path'])

45
core/tests/data.py
Normal file
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

# data module for tests

from hsutil.str import format_size
from ..data import format_path, cmp_value

COLUMNS = [
    {'attr':'name','display':'Filename'},
    {'attr':'path','display':'Directory'},
    {'attr':'size','display':'Size (KB)'},
    {'attr':'extension','display':'Kind'},
]

METADATA_TO_READ = ['size']

def GetDisplayInfo(dupe, group, delta):
    size = dupe.size
    m = group.get_match_of(dupe)
    if m and delta:
        r = group.ref
        size -= r.size
    return [
        dupe.name,
        format_path(dupe.path),
        format_size(size, 0, 1, False),
        dupe.extension,
    ]

def GetDupeSortKey(dupe, get_group, key, delta):
    r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
    if delta and (key == 2):
        r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
    return r

def GetGroupSortKey(group, key):
    return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))

279
core/tests/directories_test.py
Normal file
@@ -0,0 +1,279 @@
# Created By: Virgil Dupras
# Created On: 2006/02/27
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

import os.path as op
import os
import time

from nose.tools import eq_

from hsutil import io
from hsutil.path import Path
from hsutil.testcase import TestCase

from ..directories import *

testpath = Path(TestCase.datadirpath())

def create_fake_fs(rootpath):
    rootpath = rootpath + 'fs'
    io.mkdir(rootpath)
    io.mkdir(rootpath + 'dir1')
    io.mkdir(rootpath + 'dir2')
    io.mkdir(rootpath + 'dir3')
    fp = io.open(rootpath + 'file1.test', 'w')
    fp.write('1')
    fp.close()
    fp = io.open(rootpath + 'file2.test', 'w')
    fp.write('12')
    fp.close()
    fp = io.open(rootpath + 'file3.test', 'w')
    fp.write('123')
    fp.close()
    fp = io.open(rootpath + ('dir1', 'file1.test'), 'w')
    fp.write('1')
    fp.close()
    fp = io.open(rootpath + ('dir2', 'file2.test'), 'w')
    fp.write('12')
    fp.close()
    fp = io.open(rootpath + ('dir3', 'file3.test'), 'w')
    fp.write('123')
    fp.close()
    return rootpath

class TCDirectories(TestCase):
    def test_empty(self):
        d = Directories()
        self.assertEqual(0,len(d))
        self.assert_('foobar' not in d)

    def test_add_path(self):
        d = Directories()
        p = testpath + 'utils'
        d.add_path(p)
        self.assertEqual(1,len(d))
        self.assert_(p in d)
        self.assert_((p + 'foobar') in d)
        self.assert_(p[:-1] not in d)
        p = self.tmppath()
        d.add_path(p)
        self.assertEqual(2,len(d))
        self.assert_(p in d)

    def test_AddPath_when_path_is_already_there(self):
        d = Directories()
        p = testpath + 'utils'
        d.add_path(p)
        self.assertRaises(AlreadyThereError, d.add_path, p)
        self.assertRaises(AlreadyThereError, d.add_path, p + 'foobar')
        self.assertEqual(1, len(d))

    def test_add_path_containing_paths_already_there(self):
        d = Directories()
        d.add_path(testpath + 'utils')
        self.assertEqual(1, len(d))
        d.add_path(testpath)
        eq_(len(d), 1)
        eq_(d[0], testpath)

    def test_AddPath_non_latin(self):
        p = Path(self.tmpdir())
        to_add = p + u'unicode\u201a'
        os.mkdir(unicode(to_add))
        d = Directories()
        try:
            d.add_path(to_add)
        except UnicodeDecodeError:
            self.fail()

    def test_del(self):
        d = Directories()
        d.add_path(testpath + 'utils')
        try:
            del d[1]
            self.fail()
        except IndexError:
            pass
        d.add_path(self.tmppath())
        del d[1]
        self.assertEqual(1, len(d))

    def test_states(self):
        d = Directories()
        p = testpath + 'utils'
        d.add_path(p)
        self.assertEqual(STATE_NORMAL,d.get_state(p))
        d.set_state(p,STATE_REFERENCE)
        self.assertEqual(STATE_REFERENCE,d.get_state(p))
        self.assertEqual(STATE_REFERENCE,d.get_state(p + 'dir1'))
        self.assertEqual(1,len(d.states))
        self.assertEqual(p,d.states.keys()[0])
        self.assertEqual(STATE_REFERENCE,d.states[p])

    def test_get_state_with_path_not_there(self):
        # When the path's not there, just return STATE_NORMAL
        d = Directories()
        d.add_path(testpath + 'utils')
        eq_(d.get_state(testpath), STATE_NORMAL)

    def test_states_remain_when_larger_directory_eat_smaller_ones(self):
        d = Directories()
        p = testpath + 'utils'
        d.add_path(p)
        d.set_state(p,STATE_EXCLUDED)
        d.add_path(testpath)
        d.set_state(testpath,STATE_REFERENCE)
        self.assertEqual(STATE_EXCLUDED,d.get_state(p))
        self.assertEqual(STATE_EXCLUDED,d.get_state(p + 'dir1'))
        self.assertEqual(STATE_REFERENCE,d.get_state(testpath))

    def test_set_state_keep_state_dict_size_to_minimum(self):
        d = Directories()
        p = create_fake_fs(self.tmppath())
        d.add_path(p)
        d.set_state(p,STATE_REFERENCE)
        d.set_state(p + 'dir1',STATE_REFERENCE)
        self.assertEqual(1,len(d.states))
        self.assertEqual(STATE_REFERENCE,d.get_state(p + 'dir1'))
        d.set_state(p + 'dir1',STATE_NORMAL)
        self.assertEqual(2,len(d.states))
        self.assertEqual(STATE_NORMAL,d.get_state(p + 'dir1'))
        d.set_state(p + 'dir1',STATE_REFERENCE)
        self.assertEqual(1,len(d.states))
        self.assertEqual(STATE_REFERENCE,d.get_state(p + 'dir1'))

    def test_get_files(self):
        d = Directories()
        p = create_fake_fs(self.tmppath())
        d.add_path(p)
        d.set_state(p + 'dir1',STATE_REFERENCE)
        d.set_state(p + 'dir2',STATE_EXCLUDED)
        files = list(d.get_files())
        self.assertEqual(5, len(files))
        for f in files:
            if f.path[:-1] == p + 'dir1':
                assert f.is_ref
            else:
                assert not f.is_ref

    def test_get_files_with_inherited_exclusion(self):
        d = Directories()
        p = testpath + 'utils'
        d.add_path(p)
        d.set_state(p,STATE_EXCLUDED)
        self.assertEqual([], list(d.get_files()))

    def test_save_and_load(self):
        d1 = Directories()
        d2 = Directories()
        p1 = self.tmppath()
        p2 = self.tmppath()
        d1.add_path(p1)
        d1.add_path(p2)
        d1.set_state(p1, STATE_REFERENCE)
        d1.set_state(p1 + 'dir1',STATE_EXCLUDED)
        tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
        d1.save_to_file(tmpxml)
        d2.load_from_file(tmpxml)
        self.assertEqual(2, len(d2))
        self.assertEqual(STATE_REFERENCE,d2.get_state(p1))
        self.assertEqual(STATE_EXCLUDED,d2.get_state(p1 + 'dir1'))

    def test_invalid_path(self):
        d = Directories()
        p = Path('does_not_exist')
        self.assertRaises(InvalidPathError, d.add_path, p)
        self.assertEqual(0, len(d))

    def test_set_state_on_invalid_path(self):
        d = Directories()
        try:
            d.set_state(Path('foobar',),STATE_NORMAL)
        except LookupError:
            self.fail()

    def test_load_from_file_with_invalid_path(self):
        # This test simulates a load-from-file resulting in an InvalidPath
        # raise. The other directories must still be loaded.
        d1 = Directories()
        d1.add_path(testpath + 'utils')
        # Will raise InvalidPath upon loading
        p = self.tmppath()
        d1.add_path(p)
        io.rmdir(p)
        tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
        d1.save_to_file(tmpxml)
        d2 = Directories()
        d2.load_from_file(tmpxml)
        self.assertEqual(1, len(d2))

    def test_unicode_save(self):
        d = Directories()
        p1 = self.tmppath() + u'hello\xe9'
        io.mkdir(p1)
        io.mkdir(p1 + u'foo\xe9')
        d.add_path(p1)
        d.set_state(p1 + u'foo\xe9', STATE_EXCLUDED)
        tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
        try:
            d.save_to_file(tmpxml)
        except UnicodeDecodeError:
            self.fail()

    def test_get_files_refreshes_its_directories(self):
        d = Directories()
        p = create_fake_fs(self.tmppath())
        d.add_path(p)
        files = d.get_files()
        self.assertEqual(6, len(list(files)))
        time.sleep(1)
        os.remove(str(p + ('dir1','file1.test')))
        files = d.get_files()
        self.assertEqual(5, len(list(files)))

    def test_get_files_does_not_choke_on_non_existing_directories(self):
        d = Directories()
        p = Path(self.tmpdir())
        d.add_path(p)
        io.rmtree(p)
        self.assertEqual([], list(d.get_files()))

    def test_get_state_returns_excluded_by_default_for_hidden_directories(self):
        d = Directories()
        p = Path(self.tmpdir())
        hidden_dir_path = p + '.foo'
        io.mkdir(p + '.foo')
        d.add_path(p)
        self.assertEqual(d.get_state(hidden_dir_path), STATE_EXCLUDED)
        # But it can be overridden
        d.set_state(hidden_dir_path, STATE_NORMAL)
        self.assertEqual(d.get_state(hidden_dir_path), STATE_NORMAL)

    def test_default_path_state_override(self):
        # It's possible for a subclass to override the default state of a path
        class MyDirectories(Directories):
            def _default_state_for_path(self, path):
                if 'foobar' in path:
                    return STATE_EXCLUDED

        d = MyDirectories()
        p1 = self.tmppath()
        io.mkdir(p1 + 'foobar')
        io.open(p1 + 'foobar/somefile', 'w').close()
        io.mkdir(p1 + 'foobaz')
        io.open(p1 + 'foobaz/somefile', 'w').close()
        d.add_path(p1)
        eq_(d.get_state(p1 + 'foobaz'), STATE_NORMAL)
        eq_(d.get_state(p1 + 'foobar'), STATE_EXCLUDED)
        eq_(len(list(d.get_files())), 1) # only the 'foobaz' file is there
        # However, the default state can be changed
        d.set_state(p1 + 'foobar', STATE_NORMAL)
        eq_(d.get_state(p1 + 'foobar'), STATE_NORMAL)
        eq_(len(list(d.get_files())), 2)
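
Reviewer note: all of the state tests above hinge on one lookup rule. A minimal sketch of that rule follows; SketchDirectories is an invented stand-in, not the real Directories class, which additionally handles hidden-directory defaults and the subclass override tested above.

STATE_NORMAL, STATE_REFERENCE, STATE_EXCLUDED = 0, 1, 2

class SketchDirectories(object):
    def __init__(self):
        self.states = {}  # path (tuple of parts) -> explicitly set state

    def get_state(self, path):
        # A path inherits the state of its closest ancestor with an explicit
        # entry; if no ancestor has one, the state is STATE_NORMAL.
        while path:
            if path in self.states:
                return self.states[path]
            path = path[:-1]
        return STATE_NORMAL

d = SketchDirectories()
d.states[('root', 'utils')] = STATE_REFERENCE
print d.get_state(('root', 'utils', 'dir1'))  # 1: inherited from 'utils'
print d.get_state(('root', 'elsewhere'))      # 0: STATE_NORMAL by default
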
core/tests/engine_test.py (new file, 815 additions)
@@ -0,0 +1,815 @@
# Created By: Virgil Dupras
# Created On: 2006/01/29
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

import sys

from nose.tools import eq_

from hsutil import job
from hsutil.decorators import log_calls
from hsutil.testcase import TestCase

from .. import engine, fs
from ..engine import *

class NamedObject(object):
    def __init__(self, name="foobar", with_words=False, size=1):
        self.name = name
        self.size = size
        self.md5partial = name
        self.md5 = name
        if with_words:
            self.words = getwords(name)


no = NamedObject

def get_match_triangle():
    o1 = NamedObject(with_words=True)
    o2 = NamedObject(with_words=True)
    o3 = NamedObject(with_words=True)
    m1 = get_match(o1,o2)
    m2 = get_match(o1,o3)
    m3 = get_match(o2,o3)
    return [m1, m2, m3]

def get_test_group():
    m1, m2, m3 = get_match_triangle()
    result = Group()
    result.add_match(m1)
    result.add_match(m2)
    result.add_match(m3)
    return result

class TCgetwords(TestCase):
    def test_spaces(self):
        self.assertEqual(['a', 'b', 'c', 'd'], getwords("a b c d"))
        self.assertEqual(['a', 'b', 'c', 'd'], getwords(" a b c d "))

    def test_splitter_chars(self):
        self.assertEqual(
            [chr(i) for i in xrange(ord('a'),ord('z')+1)],
            getwords("a-b_c&d+e(f)g;h\\i[j]k{l}m:n.o,p<q>r/s?t~u!v@w#x$y*z")
        )

    def test_joiner_chars(self):
        self.assertEqual(["aec"], getwords(u"a'e\u0301c"))

    def test_empty(self):
        self.assertEqual([], getwords(''))

    def test_returns_lowercase(self):
        self.assertEqual(['foo', 'bar'], getwords('FOO BAR'))

    def test_decompose_unicode(self):
        self.assertEqual(getwords(u'foo\xe9bar'), ['fooebar'])


class TCgetfields(TestCase):
    def test_simple(self):
        self.assertEqual([['a', 'b'], ['c', 'd', 'e']], getfields('a b - c d e'))

    def test_empty(self):
        self.assertEqual([], getfields(''))

    def test_cleans_empty_fields(self):
        expected = [['a', 'bc', 'def']]
        actual = getfields(' - a bc def')
        self.assertEqual(expected, actual)
        expected = [['bc', 'def']]


class TCunpack_fields(TestCase):
    def test_with_fields(self):
        expected = ['a', 'b', 'c', 'd', 'e', 'f']
        actual = unpack_fields([['a'], ['b', 'c'], ['d', 'e', 'f']])
        self.assertEqual(expected, actual)

    def test_without_fields(self):
        expected = ['a', 'b', 'c', 'd', 'e', 'f']
        actual = unpack_fields(['a', 'b', 'c', 'd', 'e', 'f'])
        self.assertEqual(expected, actual)

    def test_empty(self):
        self.assertEqual([], unpack_fields([]))


class TCWordCompare(TestCase):
    def test_list(self):
        self.assertEqual(100, compare(['a', 'b', 'c', 'd'],['a', 'b', 'c', 'd']))
        self.assertEqual(86, compare(['a', 'b', 'c', 'd'],['a', 'b', 'c']))

    def test_unordered(self):
        # Sometimes, users don't want too much fuzzy matching. When they set
        # the slider to 100, they don't expect a filename with the same words,
        # but not in the same order, to match. Thus, we want to return 99 in
        # that case.
        self.assertEqual(99, compare(['a', 'b', 'c', 'd'], ['d', 'b', 'c', 'a']))

    def test_word_occurs_twice(self):
        # If a word occurs twice in first, but once in second, we want the
        # word to be counted only once.
        self.assertEqual(89, compare(['a', 'b', 'c', 'd', 'a'], ['d', 'b', 'c', 'a']))

    def test_uses_copy_of_lists(self):
        first = ['foo', 'bar']
        second = ['bar', 'bleh']
        compare(first, second)
        self.assertEqual(['foo', 'bar'], first)
        self.assertEqual(['bar', 'bleh'], second)

    def test_word_weight(self):
        self.assertEqual(int((6.0 / 13.0) * 100), compare(['foo', 'bar'], ['bar', 'bleh'], (WEIGHT_WORDS, )))

    def test_similar_words(self):
        self.assertEqual(100, compare(['the', 'white', 'stripes'],['the', 'whites', 'stripe'], (MATCH_SIMILAR_WORDS, )))

    def test_empty(self):
        self.assertEqual(0, compare([], []))

    def test_with_fields(self):
        self.assertEqual(67, compare([['a', 'b'], ['c', 'd', 'e']], [['a', 'b'], ['c', 'd', 'f']]))

    def test_propagate_flags_with_fields(self):
        def mock_compare(first, second, flags):
            self.assertEqual((0, 1, 2, 3, 5), flags)

        self.mock(engine, 'compare_fields', mock_compare)
        compare([['a']], [['a']], (0, 1, 2, 3, 5))

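Reviewer note: for readers skimming TCWordCompare, here is a back-of-the-envelope sketch of the scoring those assertions pin down. sketch_compare is an invented name, not the shipping engine.compare, which additionally implements the flag tuples and the 99-for-reordered-identical-lists special case tested above; the sketch only covers the plain word-pairing score.

def sketch_compare(first, second):
    first, second = first[:], second[:]  # compare() must not mutate its inputs
    total = len(first) + len(second)
    if not total:
        return 0
    matches = 0
    for word in first:
        if word in second:
            second.remove(word)  # a word can only pair up once per side
            matches += 1
    return int(round(matches * 2 * 100.0 / total))

print sketch_compare(['a', 'b', 'c', 'd'], ['a', 'b', 'c'])            # 86
print sketch_compare(['foo', 'bar'], ['bar', 'bleh'])                  # 50
print sketch_compare(['a', 'b', 'c', 'd', 'a'], ['d', 'b', 'c', 'a'])  # 89
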
class TCWordCompareWithFields(TestCase):
    def test_simple(self):
        self.assertEqual(67, compare_fields([['a', 'b'], ['c', 'd', 'e']], [['a', 'b'], ['c', 'd', 'f']]))

    def test_empty(self):
        self.assertEqual(0, compare_fields([], []))

    def test_different_length(self):
        self.assertEqual(0, compare_fields([['a'], ['b']], [['a'], ['b'], ['c']]))

    def test_propagates_flags(self):
        def mock_compare(first, second, flags):
            self.assertEqual((0, 1, 2, 3, 5), flags)

        self.mock(engine, 'compare_fields', mock_compare)
        compare_fields([['a']], [['a']],(0, 1, 2, 3, 5))

    def test_order(self):
        first = [['a', 'b'], ['c', 'd', 'e']]
        second = [['c', 'd', 'f'], ['a', 'b']]
        self.assertEqual(0, compare_fields(first, second))

    def test_no_order(self):
        first = [['a','b'],['c','d','e']]
        second = [['c','d','f'],['a','b']]
        self.assertEqual(67, compare_fields(first, second, (NO_FIELD_ORDER, )))
        first = [['a','b'],['a','b']] # a field can only be matched once.
        second = [['c','d','f'],['a','b']]
        self.assertEqual(0, compare_fields(first, second, (NO_FIELD_ORDER, )))
        first = [['a','b'],['a','b','c']]
        second = [['c','d','f'],['a','b']]
        self.assertEqual(33, compare_fields(first, second, (NO_FIELD_ORDER, )))

    def test_compare_fields_without_order_doesnt_alter_fields(self):
        # The NO_ORDER comp type used to alter the fields!
        first = [['a','b'],['c','d','e']]
        second = [['c','d','f'],['a','b']]
        self.assertEqual(67, compare_fields(first, second, (NO_FIELD_ORDER, )))
        self.assertEqual([['a','b'],['c','d','e']],first)
        self.assertEqual([['c','d','f'],['a','b']],second)


class TCbuild_word_dict(TestCase):
    def test_with_standard_words(self):
        l = [NamedObject('foo bar',True)]
        l.append(NamedObject('bar baz',True))
        l.append(NamedObject('baz bleh foo',True))
        d = build_word_dict(l)
        self.assertEqual(4,len(d))
        self.assertEqual(2,len(d['foo']))
        self.assert_(l[0] in d['foo'])
        self.assert_(l[2] in d['foo'])
        self.assertEqual(2,len(d['bar']))
        self.assert_(l[0] in d['bar'])
        self.assert_(l[1] in d['bar'])
        self.assertEqual(2,len(d['baz']))
        self.assert_(l[1] in d['baz'])
        self.assert_(l[2] in d['baz'])
        self.assertEqual(1,len(d['bleh']))
        self.assert_(l[2] in d['bleh'])

    def test_unpack_fields(self):
        o = NamedObject('')
        o.words = [['foo','bar'],['baz']]
        d = build_word_dict([o])
        self.assertEqual(3,len(d))
        self.assertEqual(1,len(d['foo']))

    def test_words_are_unaltered(self):
        o = NamedObject('')
        o.words = [['foo','bar'],['baz']]
        d = build_word_dict([o])
        self.assertEqual([['foo','bar'],['baz']],o.words)

    def test_object_instances_can_only_be_once_in_words_object_list(self):
        o = NamedObject('foo foo',True)
        d = build_word_dict([o])
        self.assertEqual(1,len(d['foo']))

    def test_job(self):
        def do_progress(p,d=''):
            self.log.append(p)
            return True

        j = job.Job(1,do_progress)
        self.log = []
        s = "foo bar"
        build_word_dict([NamedObject(s, True), NamedObject(s, True), NamedObject(s, True)], j)
        self.assertEqual(0,self.log[0])
        self.assertEqual(33,self.log[1])
        self.assertEqual(66,self.log[2])
        self.assertEqual(100,self.log[3])


class TCmerge_similar_words(TestCase):
    def test_some_similar_words(self):
        d = {
            'foobar':set([1]),
            'foobar1':set([2]),
            'foobar2':set([3]),
        }
        merge_similar_words(d)
        self.assertEqual(1,len(d))
        self.assertEqual(3,len(d['foobar']))

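Reviewer note: one plausible implementation of the merge the test above describes, sketched with the standard library. sketch_merge_similar_words is invented, and the difflib cutoff of 0.8 is a guess; the real engine.merge_similar_words may differ in both respects.

import difflib

def sketch_merge_similar_words(word_dict):
    # Fold keys that look alike into one entry, unioning their object sets.
    for word in sorted(word_dict.keys(), key=len):
        if word not in word_dict:
            continue  # already merged into an earlier key
        similars = difflib.get_close_matches(word, word_dict.keys(), 100, 0.8)
        for similar in similars:
            if similar != word:
                word_dict[word] |= word_dict.pop(similar)

d = {'foobar': set([1]), 'foobar1': set([2]), 'foobar2': set([3])}
sketch_merge_similar_words(d)
print len(d), len(d['foobar'])  # 1 3
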
class TCreduce_common_words(TestCase):
    def test_typical(self):
        d = {
            'foo': set([NamedObject('foo bar',True) for i in range(50)]),
            'bar': set([NamedObject('foo bar',True) for i in range(49)])
        }
        reduce_common_words(d, 50)
        self.assert_('foo' not in d)
        self.assertEqual(49,len(d['bar']))

    def test_dont_remove_objects_with_only_common_words(self):
        d = {
            'common': set([NamedObject("common uncommon",True) for i in range(50)] + [NamedObject("common",True)]),
            'uncommon': set([NamedObject("common uncommon",True)])
        }
        reduce_common_words(d, 50)
        self.assertEqual(1,len(d['common']))
        self.assertEqual(1,len(d['uncommon']))

    def test_values_still_are_set_instances(self):
        d = {
            'common': set([NamedObject("common uncommon",True) for i in range(50)] + [NamedObject("common",True)]),
            'uncommon': set([NamedObject("common uncommon",True)])
        }
        reduce_common_words(d, 50)
        self.assert_(isinstance(d['common'],set))
        self.assert_(isinstance(d['uncommon'],set))

    def test_dont_raise_KeyError_when_a_word_has_been_removed(self):
        # If a word has been removed by the reduce, an object in a subsequent
        # common word that contains the removed word would cause a KeyError.
        d = {
            'foo': set([NamedObject('foo bar baz',True) for i in range(50)]),
            'bar': set([NamedObject('foo bar baz',True) for i in range(50)]),
            'baz': set([NamedObject('foo bar baz',True) for i in range(49)])
        }
        try:
            reduce_common_words(d, 50)
        except KeyError:
            self.fail()

    def test_unpack_fields(self):
        # object.words may be fields.
        def create_it():
            o = NamedObject('')
            o.words = [['foo','bar'],['baz']]
            return o

        d = {
            'foo': set([create_it() for i in range(50)])
        }
        try:
            reduce_common_words(d, 50)
        except TypeError:
            self.fail("must support fields.")

    def test_consider_a_reduced_common_word_common_even_after_reduction(self):
        # There was a bug in the code that caused a word that has already been
        # reduced not to be counted as a common word for subsequent words. For
        # example, if 'foo' is processed as a common word, keeping a "foo bar"
        # file in it, and then 'bar' is processed, "foo bar" would not stay in
        # 'bar' because 'foo' is not a common word anymore.
        only_common = NamedObject('foo bar',True)
        d = {
            'foo': set([NamedObject('foo bar baz',True) for i in range(49)] + [only_common]),
            'bar': set([NamedObject('foo bar baz',True) for i in range(49)] + [only_common]),
            'baz': set([NamedObject('foo bar baz',True) for i in range(49)])
        }
        reduce_common_words(d, 50)
        self.assertEqual(1,len(d['foo']))
        self.assertEqual(1,len(d['bar']))
        self.assertEqual(49,len(d['baz']))


class TCget_match(TestCase):
    def test_simple(self):
        o1 = NamedObject("foo bar",True)
        o2 = NamedObject("bar bleh",True)
        m = get_match(o1,o2)
        self.assertEqual(50,m.percentage)
        self.assertEqual(['foo','bar'],m.first.words)
        self.assertEqual(['bar','bleh'],m.second.words)
        self.assert_(m.first is o1)
        self.assert_(m.second is o2)

    def test_in(self):
        o1 = NamedObject("foo",True)
        o2 = NamedObject("bar",True)
        m = get_match(o1,o2)
        self.assert_(o1 in m)
        self.assert_(o2 in m)
        self.assert_(object() not in m)

    def test_word_weight(self):
        self.assertEqual(int((6.0 / 13.0) * 100),get_match(NamedObject("foo bar",True),NamedObject("bar bleh",True),(WEIGHT_WORDS,)).percentage)


class GetMatches(TestCase):
    def test_empty(self):
        eq_(getmatches([]), [])

    def test_simple(self):
        l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
        r = getmatches(l)
        self.assertEqual(2,len(r))
        seek = [m for m in r if m.percentage == 50] #"foo bar" and "bar bleh"
        m = seek[0]
        self.assertEqual(['foo','bar'],m.first.words)
        self.assertEqual(['bar','bleh'],m.second.words)
        seek = [m for m in r if m.percentage == 33] #"foo bar" and "a b c foo"
        m = seek[0]
        self.assertEqual(['foo','bar'],m.first.words)
        self.assertEqual(['a','b','c','foo'],m.second.words)

    def test_null_and_unrelated_objects(self):
        l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject(""),NamedObject("unrelated object")]
        r = getmatches(l)
        self.assertEqual(1,len(r))
        m = r[0]
        self.assertEqual(50,m.percentage)
        self.assertEqual(['foo','bar'],m.first.words)
        self.assertEqual(['bar','bleh'],m.second.words)

    def test_twice_the_same_word(self):
        l = [NamedObject("foo foo bar"),NamedObject("bar bleh")]
        r = getmatches(l)
        self.assertEqual(1,len(r))

    def test_twice_the_same_word_when_preworded(self):
        l = [NamedObject("foo foo bar",True),NamedObject("bar bleh",True)]
        r = getmatches(l)
        self.assertEqual(1,len(r))

    def test_two_words_match(self):
        l = [NamedObject("foo bar"),NamedObject("foo bar bleh")]
        r = getmatches(l)
        self.assertEqual(1,len(r))

    def test_match_files_with_only_common_words(self):
        # If a word occurs more than 50 times, it is excluded from the matching
        # process. The problem with the common_word_threshold is that the files
        # containing only common words will never be matched together. We
        # *should* match them.
        # This test assumes that the common word threshold const is 50
        l = [NamedObject("foo") for i in range(50)]
        r = getmatches(l)
        self.assertEqual(1225,len(r))

    def test_use_words_already_there_if_there(self):
        o1 = NamedObject('foo')
        o2 = NamedObject('bar')
        o2.words = ['foo']
        eq_(1, len(getmatches([o1,o2])))

    def test_job(self):
        def do_progress(p,d=''):
            self.log.append(p)
            return True

        j = job.Job(1,do_progress)
        self.log = []
        s = "foo bar"
        getmatches([NamedObject(s), NamedObject(s), NamedObject(s)], j=j)
        self.assert_(len(self.log) > 2)
        self.assertEqual(0,self.log[0])
        self.assertEqual(100,self.log[-1])

    def test_weight_words(self):
        l = [NamedObject("foo bar"),NamedObject("bar bleh")]
        m = getmatches(l, weight_words=True)[0]
        self.assertEqual(int((6.0 / 13.0) * 100),m.percentage)

    def test_similar_word(self):
        l = [NamedObject("foobar"),NamedObject("foobars")]
        eq_(len(getmatches(l, match_similar_words=True)), 1)
        eq_(getmatches(l, match_similar_words=True)[0].percentage, 100)
        l = [NamedObject("foobar"),NamedObject("foo")]
        eq_(len(getmatches(l, match_similar_words=True)), 0) #too far
        l = [NamedObject("bizkit"),NamedObject("bizket")]
        eq_(len(getmatches(l, match_similar_words=True)), 1)
        l = [NamedObject("foobar"),NamedObject("foosbar")]
        eq_(len(getmatches(l, match_similar_words=True)), 1)

    def test_single_object_with_similar_words(self):
        l = [NamedObject("foo foos")]
        eq_(len(getmatches(l, match_similar_words=True)), 0)

    def test_double_words_get_counted_only_once(self):
        l = [NamedObject("foo bar foo bleh"),NamedObject("foo bar bleh bar")]
        m = getmatches(l)[0]
        self.assertEqual(75,m.percentage)

    def test_with_fields(self):
        o1 = NamedObject("foo bar - foo bleh")
        o2 = NamedObject("foo bar - bleh bar")
        o1.words = getfields(o1.name)
        o2.words = getfields(o2.name)
        m = getmatches([o1, o2])[0]
        self.assertEqual(50, m.percentage)

    def test_with_fields_no_order(self):
        o1 = NamedObject("foo bar - foo bleh")
        o2 = NamedObject("bleh bang - foo bar")
        o1.words = getfields(o1.name)
        o2.words = getfields(o2.name)
        m = getmatches([o1, o2], no_field_order=True)[0]
        eq_(m.percentage, 50)

    def test_only_match_similar_when_the_option_is_set(self):
        l = [NamedObject("foobar"),NamedObject("foobars")]
        eq_(len(getmatches(l, match_similar_words=False)), 0)

    def test_dont_recurse_do_match(self):
        # With nosetests, the stack is deeper. The limit has to be high enough
        # not to fail falsely.
        sys.setrecursionlimit(100)
        files = [NamedObject('foo bar') for i in range(101)]
        try:
            getmatches(files)
        except RuntimeError:
            self.fail()
        finally:
            sys.setrecursionlimit(1000)

    def test_min_match_percentage(self):
        l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
        r = getmatches(l, min_match_percentage=50)
        self.assertEqual(1,len(r)) #Only "foo bar" / "bar bleh" should match

    def test_MemoryError(self):
        @log_calls
        def mocked_match(first, second, flags):
            if len(mocked_match.calls) > 42:
                raise MemoryError()
            return Match(first, second, 0)

        objects = [NamedObject() for i in range(10)] # results in 45 matches
        self.mock(engine, 'get_match', mocked_match)
        try:
            r = getmatches(objects)
        except MemoryError:
            self.fail('MemoryError must be handled')
        self.assertEqual(42, len(r))


class GetMatchesByContents(TestCase):
    def test_dont_compare_empty_files(self):
        o1, o2 = no(size=0), no(size=0)
        assert not getmatches_by_contents([o1, o2])

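Reviewer note: a sketch of the funnel a contents scan implies, to make the empty-file rule above concrete. sketch_matches_by_contents and FileStub are invented names; the real getmatches_by_contents lives in engine.py and also takes job and flag arguments.

def sketch_matches_by_contents(files):
    matches = []
    for i, first in enumerate(files):
        for second in files[i + 1:]:
            if first.size != second.size or not first.size:
                continue  # different sizes never match; empty files are skipped
            if first.md5partial == second.md5partial and first.md5 == second.md5:
                matches.append((first, second))  # hashes confirm the contents
    return matches

class FileStub(object):  # stand-in with the three attributes the funnel reads
    def __init__(self, content, size):
        self.md5partial = self.md5 = content
        self.size = size

a, b, c = FileStub('x', 3), FileStub('x', 3), FileStub('y', 3)
print len(sketch_matches_by_contents([a, b, c]))  # 1: only a and b agree
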
class TCGroup(TestCase):
    def test_empty(self):
        g = Group()
        self.assertEqual(None,g.ref)
        self.assertEqual([],g.dupes)
        self.assertEqual(0,len(g.matches))

    def test_add_match(self):
        g = Group()
        m = get_match(NamedObject("foo",True),NamedObject("bar",True))
        g.add_match(m)
        self.assert_(g.ref is m.first)
        self.assertEqual([m.second],g.dupes)
        self.assertEqual(1,len(g.matches))
        self.assert_(m in g.matches)

    def test_multiple_add_match(self):
        g = Group()
        o1 = NamedObject("a",True)
        o2 = NamedObject("b",True)
        o3 = NamedObject("c",True)
        o4 = NamedObject("d",True)
        g.add_match(get_match(o1,o2))
        self.assert_(g.ref is o1)
        self.assertEqual([o2],g.dupes)
        self.assertEqual(1,len(g.matches))
        g.add_match(get_match(o1,o3))
        self.assertEqual([o2],g.dupes)
        self.assertEqual(2,len(g.matches))
        g.add_match(get_match(o2,o3))
        self.assertEqual([o2,o3],g.dupes)
        self.assertEqual(3,len(g.matches))
        g.add_match(get_match(o1,o4))
        self.assertEqual([o2,o3],g.dupes)
        self.assertEqual(4,len(g.matches))
        g.add_match(get_match(o2,o4))
        self.assertEqual([o2,o3],g.dupes)
        self.assertEqual(5,len(g.matches))
        g.add_match(get_match(o3,o4))
        self.assertEqual([o2,o3,o4],g.dupes)
        self.assertEqual(6,len(g.matches))

    def test_len(self):
        g = Group()
        self.assertEqual(0,len(g))
        g.add_match(get_match(NamedObject("foo",True),NamedObject("bar",True)))
        self.assertEqual(2,len(g))

    def test_add_same_match_twice(self):
        g = Group()
        m = get_match(NamedObject("foo",True),NamedObject("foo",True))
        g.add_match(m)
        self.assertEqual(2,len(g))
        self.assertEqual(1,len(g.matches))
        g.add_match(m)
        self.assertEqual(2,len(g))
        self.assertEqual(1,len(g.matches))

    def test_in(self):
        g = Group()
        o1 = NamedObject("foo",True)
        o2 = NamedObject("bar",True)
        self.assert_(o1 not in g)
        g.add_match(get_match(o1,o2))
        self.assert_(o1 in g)
        self.assert_(o2 in g)

    def test_remove(self):
        g = Group()
        o1 = NamedObject("foo",True)
        o2 = NamedObject("bar",True)
        o3 = NamedObject("bleh",True)
        g.add_match(get_match(o1,o2))
        g.add_match(get_match(o1,o3))
        g.add_match(get_match(o2,o3))
        self.assertEqual(3,len(g.matches))
        self.assertEqual(3,len(g))
        g.remove_dupe(o3)
        self.assertEqual(1,len(g.matches))
        self.assertEqual(2,len(g))
        g.remove_dupe(o1)
        self.assertEqual(0,len(g.matches))
        self.assertEqual(0,len(g))

    def test_remove_with_ref_dupes(self):
        g = Group()
        o1 = NamedObject("foo",True)
        o2 = NamedObject("bar",True)
        o3 = NamedObject("bleh",True)
        g.add_match(get_match(o1,o2))
        g.add_match(get_match(o1,o3))
        g.add_match(get_match(o2,o3))
        o1.is_ref = True
        o2.is_ref = True
        g.remove_dupe(o3)
        self.assertEqual(0,len(g))

    def test_switch_ref(self):
        o1 = NamedObject(with_words=True)
        o2 = NamedObject(with_words=True)
        g = Group()
        g.add_match(get_match(o1,o2))
        self.assert_(o1 is g.ref)
        g.switch_ref(o2)
        self.assert_(o2 is g.ref)
        self.assertEqual([o1],g.dupes)
        g.switch_ref(o2)
        self.assert_(o2 is g.ref)
        g.switch_ref(NamedObject('',True))
        self.assert_(o2 is g.ref)

    def test_get_match_of(self):
        g = Group()
        for m in get_match_triangle():
            g.add_match(m)
        o = g.dupes[0]
        m = g.get_match_of(o)
        self.assert_(g.ref in m)
        self.assert_(o in m)
        self.assert_(g.get_match_of(NamedObject('',True)) is None)
        self.assert_(g.get_match_of(g.ref) is None)

    def test_percentage(self):
        # percentage should return the average match percentage in relation to
        # the ref
        m1,m2,m3 = get_match_triangle()
        m1 = Match(m1[0], m1[1], 100)
        m2 = Match(m2[0], m2[1], 50)
        m3 = Match(m3[0], m3[1], 33)
        g = Group()
        g.add_match(m1)
        g.add_match(m2)
        g.add_match(m3)
        self.assertEqual(75,g.percentage)
        g.switch_ref(g.dupes[0])
        self.assertEqual(66,g.percentage)
        g.remove_dupe(g.dupes[0])
        self.assertEqual(33,g.percentage)
        g.add_match(m1)
        g.add_match(m2)
        self.assertEqual(66,g.percentage)

    def test_percentage_on_empty_group(self):
        g = Group()
        self.assertEqual(0,g.percentage)

    def test_prioritize(self):
        m1,m2,m3 = get_match_triangle()
        o1 = m1.first
        o2 = m1.second
        o3 = m2.second
        o1.name = 'c'
        o2.name = 'b'
        o3.name = 'a'
        g = Group()
        g.add_match(m1)
        g.add_match(m2)
        g.add_match(m3)
        self.assert_(o1 is g.ref)
        g.prioritize(lambda x:x.name)
        self.assert_(o3 is g.ref)

    def test_prioritize_with_tie_breaker(self):
        # if the ref has the same key as one or more of the dupes, run the
        # tie_breaker func among them
        g = get_test_group()
        o1, o2, o3 = g.ordered
        tie_breaker = lambda ref, dupe: dupe is o3
        g.prioritize(lambda x:0, tie_breaker)
        self.assertTrue(g.ref is o3)

    def test_prioritize_with_tie_breaker_runs_on_all_dupes(self):
        # Even if a dupe is chosen to switch with the ref by a tie breaker, we
        # still run the tie breaker with the other dupes and the newly chosen
        # ref
        g = get_test_group()
        o1, o2, o3 = g.ordered
        o1.foo = 1
        o2.foo = 2
        o3.foo = 3
        tie_breaker = lambda ref, dupe: dupe.foo > ref.foo
        g.prioritize(lambda x:0, tie_breaker)
        self.assertTrue(g.ref is o3)

    def test_prioritize_with_tie_breaker_runs_only_on_tie_dupes(self):
        # The tie breaker only runs on dupes that had the same value for the
        # key_func
        g = get_test_group()
        o1, o2, o3 = g.ordered
        o1.foo = 2
        o2.foo = 2
        o3.foo = 1
        o1.bar = 1
        o2.bar = 2
        o3.bar = 3
        key_func = lambda x: -x.foo
        tie_breaker = lambda ref, dupe: dupe.bar > ref.bar
        g.prioritize(key_func, tie_breaker)
        self.assertTrue(g.ref is o2)

    def test_list_like(self):
        g = Group()
        o1,o2 = (NamedObject("foo",True),NamedObject("bar",True))
        g.add_match(get_match(o1,o2))
        self.assert_(g[0] is o1)
        self.assert_(g[1] is o2)

    def test_discard_matches(self):
        g = Group()
        o1,o2,o3 = (NamedObject("foo",True),NamedObject("bar",True),NamedObject("baz",True))
        g.add_match(get_match(o1,o2))
        g.add_match(get_match(o1,o3))
        g.discard_matches()
        self.assertEqual(1,len(g.matches))
        self.assertEqual(0,len(g.candidates))


class TCget_groups(TestCase):
    def test_empty(self):
        r = get_groups([])
        self.assertEqual([],r)

    def test_simple(self):
        l = [NamedObject("foo bar"),NamedObject("bar bleh")]
        matches = getmatches(l)
        m = matches[0]
        r = get_groups(matches)
        self.assertEqual(1,len(r))
        g = r[0]
        self.assert_(g.ref is m.first)
        self.assertEqual([m.second],g.dupes)

    def test_group_with_multiple_matches(self):
        # This results in 3 matches
        l = [NamedObject("foo"),NamedObject("foo"),NamedObject("foo")]
        matches = getmatches(l)
        r = get_groups(matches)
        self.assertEqual(1,len(r))
        g = r[0]
        self.assertEqual(3,len(g))

    def test_must_choose_a_group(self):
        l = [NamedObject("a b"),NamedObject("a b"),NamedObject("b c"),NamedObject("c d"),NamedObject("c d")]
        # There will be 2 groups here: group "a b" and group "c d"
        # "b c" can go in either of them, but not both.
        matches = getmatches(l)
        r = get_groups(matches)
        self.assertEqual(2,len(r))
        self.assertEqual(5,len(r[0])+len(r[1]))

    def test_should_all_go_in_the_same_group(self):
        l = [NamedObject("a b"),NamedObject("a b"),NamedObject("a b"),NamedObject("a b")]
        # All four "a b" objects match each other; they must all end up in a
        # single group.
        matches = getmatches(l)
        r = get_groups(matches)
        self.assertEqual(1,len(r))

    def test_give_priority_to_matches_with_higher_percentage(self):
        o1 = NamedObject(with_words=True)
        o2 = NamedObject(with_words=True)
        o3 = NamedObject(with_words=True)
        m1 = Match(o1, o2, 1)
        m2 = Match(o2, o3, 2)
        r = get_groups([m1,m2])
        self.assertEqual(1,len(r))
        g = r[0]
        self.assertEqual(2,len(g))
        self.assert_(o1 not in g)
        self.assert_(o2 in g)
        self.assert_(o3 in g)

    def test_four_sized_group(self):
        l = [NamedObject("foobar") for i in xrange(4)]
        m = getmatches(l)
        r = get_groups(m)
        self.assertEqual(1,len(r))
        self.assertEqual(4,len(r[0]))

    def test_referenced_by_ref2(self):
        o1 = NamedObject(with_words=True)
        o2 = NamedObject(with_words=True)
        o3 = NamedObject(with_words=True)
        m1 = get_match(o1,o2)
        m2 = get_match(o3,o1)
        m3 = get_match(o3,o2)
        r = get_groups([m1,m2,m3])
        self.assertEqual(3,len(r[0]))

    def test_job(self):
        def do_progress(p,d=''):
            self.log.append(p)
            return True

        self.log = []
        j = job.Job(1,do_progress)
        m1,m2,m3 = get_match_triangle()
        # 101%: To make sure it is processed first so the job test works
        # correctly
        m4 = Match(NamedObject('a',True), NamedObject('a',True), 101)
        get_groups([m1,m2,m3,m4],j)
        self.assertEqual(0,self.log[0])
        self.assertEqual(100,self.log[-1])

    def test_group_admissible_discarded_dupes(self):
        # If, in an (A, B, C, D) set, all match with A, but C and D don't match
        # with B, and the (A, B) match is the highest (thus resulting in an
        # (A, B) group), still match C and D in a separate group instead of
        # discarding them.
        A, B, C, D = [NamedObject() for _ in range(4)]
        m1 = Match(A, B, 90) # This is the strongest "A" match
        m2 = Match(A, C, 80) # Because C doesn't match with B, it won't be in the group
        m3 = Match(A, D, 80) # Same thing for D
        m4 = Match(C, D, 70) # However, because C and D match, they should have their own group.
        groups = get_groups([m1, m2, m3, m4])
        eq_(len(groups), 2)
        g1, g2 = groups
        assert A in g1
        assert B in g1
        assert C in g2
        assert D in g2

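Reviewer note: a rough sketch of the grouping strategy the TCget_groups tests describe, reduced to its two rules: strongest matches are processed first, and a newcomer only joins a group if it matched every current member (otherwise it stays free and can seed a new group, as in test_group_admissible_discarded_dupes). sketch_get_groups and SketchMatch are invented; the real get_groups also tracks candidates inside Group and reports job progress.

from collections import namedtuple
SketchMatch = namedtuple('SketchMatch', 'first second percentage')

def sketch_get_groups(matches):
    matches = sorted(matches, key=lambda m: -m.percentage)
    matched_pairs = set(frozenset([m.first, m.second]) for m in matches)
    groups = []
    placed = {}  # object -> the group (list of members) it already belongs to
    for m in matches:
        g1, g2 = placed.get(m.first), placed.get(m.second)
        if g1 is None and g2 is None:
            group = [m.first, m.second]  # the match seeds a new group
            groups.append(group)
            placed[m.first] = placed[m.second] = group
        elif (g1 is None) != (g2 is None):
            group = g1 if g1 is not None else g2
            newcomer = m.first if g1 is None else m.second
            # clique rule: the newcomer must have matched every member
            if all(frozenset([newcomer, member]) in matched_pairs for member in group):
                group.append(newcomer)
                placed[newcomer] = group
    return groups

m1, m2 = SketchMatch('a', 'b', 90), SketchMatch('b', 'c', 80)
print sketch_get_groups([m1, m2])  # [['a', 'b']]: 'c' never matched 'a'
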
core/tests/ignore_test.py (new file, 152 additions)
@@ -0,0 +1,152 @@
# Created By: Virgil Dupras
# Created On: 2006/05/02
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

import cStringIO
import xml.dom.minidom

from nose.tools import eq_

from ..ignore import *

def test_empty():
    il = IgnoreList()
    eq_(0,len(il))
    assert not il.AreIgnored('foo','bar')

def test_simple():
    il = IgnoreList()
    il.Ignore('foo','bar')
    assert il.AreIgnored('foo','bar')
    assert il.AreIgnored('bar','foo')
    assert not il.AreIgnored('foo','bleh')
    assert not il.AreIgnored('bleh','bar')
    eq_(1,len(il))

def test_multiple():
    il = IgnoreList()
    il.Ignore('foo','bar')
    il.Ignore('foo','bleh')
    il.Ignore('bleh','bar')
    il.Ignore('aybabtu','bleh')
    assert il.AreIgnored('foo','bar')
    assert il.AreIgnored('bar','foo')
    assert il.AreIgnored('foo','bleh')
    assert il.AreIgnored('bleh','bar')
    assert not il.AreIgnored('aybabtu','bar')
    eq_(4,len(il))

def test_clear():
    il = IgnoreList()
    il.Ignore('foo','bar')
    il.Clear()
    assert not il.AreIgnored('foo','bar')
    assert not il.AreIgnored('bar','foo')
    eq_(0,len(il))

def test_add_same_twice():
    il = IgnoreList()
    il.Ignore('foo','bar')
    il.Ignore('bar','foo')
    eq_(1,len(il))

def test_save_to_xml():
    il = IgnoreList()
    il.Ignore('foo','bar')
    il.Ignore('foo','bleh')
    il.Ignore('bleh','bar')
    f = cStringIO.StringIO()
    il.save_to_xml(f)
    f.seek(0)
    doc = xml.dom.minidom.parse(f)
    root = doc.documentElement
    eq_('ignore_list',root.nodeName)
    children = [c for c in root.childNodes if c.localName]
    eq_(2,len(children))
    eq_(2,len([c for c in children if c.nodeName == 'file']))
    f1,f2 = children
    subchildren = [c for c in f1.childNodes if c.localName == 'file'] +\
                  [c for c in f2.childNodes if c.localName == 'file']
    eq_(3,len(subchildren))

def test_SaveThenLoad():
    il = IgnoreList()
    il.Ignore('foo','bar')
    il.Ignore('foo','bleh')
    il.Ignore('bleh','bar')
    il.Ignore(u'\u00e9','bar')
    f = cStringIO.StringIO()
    il.save_to_xml(f)
    f.seek(0)
    il = IgnoreList()
    il.load_from_xml(f)
    eq_(4,len(il))
    assert il.AreIgnored(u'\u00e9','bar')

def test_LoadXML_with_empty_file_tags():
    f = cStringIO.StringIO()
    f.write('<?xml version="1.0" encoding="utf-8"?><ignore_list><file><file/></file></ignore_list>')
    f.seek(0)
    il = IgnoreList()
    il.load_from_xml(f)
    eq_(0,len(il))

def test_AreIgnored_works_when_a_child_is_a_key_somewhere_else():
    il = IgnoreList()
    il.Ignore('foo','bar')
    il.Ignore('bar','baz')
    assert il.AreIgnored('bar','foo')


def test_no_dupes_when_a_child_is_a_key_somewhere_else():
    il = IgnoreList()
    il.Ignore('foo','bar')
    il.Ignore('bar','baz')
    il.Ignore('bar','foo')
    eq_(2,len(il))

def test_iterate():
    # It must be possible to iterate through the ignore list
    il = IgnoreList()
    expected = [('foo','bar'),('bar','baz'),('foo','baz')]
    for i in expected:
        il.Ignore(i[0],i[1])
    for i in il:
        expected.remove(i) # No exception should be raised
    assert not expected # expected should be empty

def test_filter():
    il = IgnoreList()
    il.Ignore('foo','bar')
    il.Ignore('bar','baz')
    il.Ignore('foo','baz')
    il.Filter(lambda f,s: f == 'bar')
    eq_(1,len(il))
    assert not il.AreIgnored('foo','bar')
    assert il.AreIgnored('bar','baz')

def test_save_with_non_ascii_non_unicode_items():
    il = IgnoreList()
    il.Ignore('\xac','\xbf')
    f = cStringIO.StringIO()
    try:
        il.save_to_xml(f)
    except Exception as e:
        raise AssertionError(unicode(e))

def test_len():
    il = IgnoreList()
    eq_(0,len(il))
    il.Ignore('foo','bar')
    eq_(1,len(il))

def test_nonzero():
    il = IgnoreList()
    assert not il
    il.Ignore('foo','bar')
    assert il
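
Reviewer note: the behavior the tests above pin down, symmetry, deduplication of reversed pairs, and iteration, amounts to a set of unordered pairs. A minimal sketch under that assumption (SketchIgnoreList is invented, not the real core IgnoreList, which also persists to XML):

class SketchIgnoreList(object):
    def __init__(self):
        self._pairs = set()

    def Ignore(self, first, second):
        # frozenset makes (a, b) and (b, a) the same entry
        self._pairs.add(frozenset([first, second]))

    def AreIgnored(self, first, second):
        return frozenset([first, second]) in self._pairs

    def __len__(self):
        return len(self._pairs)

il = SketchIgnoreList()
il.Ignore('foo', 'bar')
il.Ignore('bar', 'foo')  # same pair, stored once
print il.AreIgnored('bar', 'foo'), len(il)  # True 1
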
core/tests/results_test.py (new file, 717 additions)
@@ -0,0 +1,717 @@
# Created By: Virgil Dupras
# Created On: 2006/02/23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

import unittest
import StringIO
import xml.dom.minidom
import os.path as op

from hsutil.path import Path
from hsutil.testcase import TestCase
from hsutil.misc import first

from . import engine_test, data
from .. import engine
from ..results import *

class NamedObject(engine_test.NamedObject):
    path = property(lambda x:Path('basepath') + x.name)
    is_ref = False

    def __nonzero__(self):
        return False # Make sure that operations work correctly even when the bool value of files is false.

# Returns a group set that looks like that:
# "foo bar" (1)
# "bar bleh" (1024)
# "foo bleh" (1)
# "ibabtu" (1)
# "ibabtu" (1)
def GetTestGroups():
    objects = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("foo bleh"),NamedObject("ibabtu"),NamedObject("ibabtu")]
    objects[1].size = 1024
    matches = engine.getmatches(objects) # we should have 5 matches
    groups = engine.get_groups(matches) # we should have 2 groups
    for g in groups:
        g.prioritize(lambda x:objects.index(x)) # we want the dupes to be in the same order as the list
    groups.sort(key=len, reverse=True) # We want the group with 3 members to be first.
    return (objects,matches,groups)

class TCResultsEmpty(TestCase):
    def setUp(self):
        self.results = Results(data)

    def test_apply_invalid_filter(self):
        # If the applied filter is an invalid regexp, just ignore the filter.
        self.results.apply_filter('[') # invalid
        self.test_stat_line() # make sure that the stats line isn't saying we applied a '[' filter

    def test_stat_line(self):
        self.assertEqual("0 / 0 (0.00 B / 0.00 B) duplicates marked.",self.results.stat_line)

    def test_groups(self):
        self.assertEqual(0,len(self.results.groups))

    def test_get_group_of_duplicate(self):
        self.assert_(self.results.get_group_of_duplicate('foo') is None)

    def test_save_to_xml(self):
        f = StringIO.StringIO()
        self.results.save_to_xml(f)
        f.seek(0)
        doc = xml.dom.minidom.parse(f)
        root = doc.documentElement
        self.assertEqual('results',root.nodeName)


class TCResultsWithSomeGroups(TestCase):
    def setUp(self):
        self.results = Results(data)
        self.objects,self.matches,self.groups = GetTestGroups()
        self.results.groups = self.groups

    def test_stat_line(self):
        self.assertEqual("0 / 3 (0.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)

    def test_groups(self):
        self.assertEqual(2,len(self.results.groups))

    def test_get_group_of_duplicate(self):
        for o in self.objects:
            g = self.results.get_group_of_duplicate(o)
            self.assert_(isinstance(g, engine.Group))
            self.assert_(o in g)
        self.assert_(self.results.get_group_of_duplicate(self.groups[0]) is None)

    def test_remove_duplicates(self):
        g1,g2 = self.results.groups
        self.results.remove_duplicates([g1.dupes[0]])
        self.assertEqual(2,len(g1))
        self.assert_(g1 in self.results.groups)
        self.results.remove_duplicates([g1.ref])
        self.assertEqual(2,len(g1))
        self.assert_(g1 in self.results.groups)
        self.results.remove_duplicates([g1.dupes[0]])
        self.assertEqual(0,len(g1))
        self.assert_(g1 not in self.results.groups)
        self.results.remove_duplicates([g2.dupes[0]])
        self.assertEqual(0,len(g2))
        self.assert_(g2 not in self.results.groups)
        self.assertEqual(0,len(self.results.groups))

    def test_remove_duplicates_with_ref_files(self):
        g1,g2 = self.results.groups
        self.objects[0].is_ref = True
        self.objects[1].is_ref = True
        self.results.remove_duplicates([self.objects[2]])
        self.assertEqual(0,len(g1))
        self.assert_(g1 not in self.results.groups)

    def test_make_ref(self):
        g = self.results.groups[0]
        d = g.dupes[0]
        self.results.make_ref(d)
        self.assert_(d is g.ref)

    def test_sort_groups(self):
        self.results.make_ref(self.objects[1]) # we want the 1024-sized object to become ref
        g1,g2 = self.groups
        self.results.sort_groups(2) # 2 is the key for size
        self.assert_(self.results.groups[0] is g2)
        self.assert_(self.results.groups[1] is g1)
        self.results.sort_groups(2,False)
        self.assert_(self.results.groups[0] is g1)
        self.assert_(self.results.groups[1] is g2)

    def test_set_groups_when_sorted(self):
        self.results.make_ref(self.objects[1]) # we want the 1024-sized object to become ref
        self.results.sort_groups(2)
        objects,matches,groups = GetTestGroups()
        g1,g2 = groups
        g1.switch_ref(objects[1])
        self.results.groups = groups
        self.assert_(self.results.groups[0] is g2)
        self.assert_(self.results.groups[1] is g1)

    def test_get_dupe_list(self):
        self.assertEqual([self.objects[1],self.objects[2],self.objects[4]],self.results.dupes)

    def test_dupe_list_is_cached(self):
        self.assert_(self.results.dupes is self.results.dupes)

    def test_dupe_list_cache_is_invalidated_when_needed(self):
        o1,o2,o3,o4,o5 = self.objects
        self.assertEqual([o2,o3,o5],self.results.dupes)
        self.results.make_ref(o2)
        self.assertEqual([o1,o3,o5],self.results.dupes)
        objects,matches,groups = GetTestGroups()
        o1,o2,o3,o4,o5 = objects
        self.results.groups = groups
        self.assertEqual([o2,o3,o5],self.results.dupes)

    def test_dupe_list_sort(self):
        o1,o2,o3,o4,o5 = self.objects
        o1.size = 5
        o2.size = 4
        o3.size = 3
        o4.size = 2
        o5.size = 1
        self.results.sort_dupes(2)
        self.assertEqual([o5,o3,o2],self.results.dupes)
        self.results.sort_dupes(2,False)
        self.assertEqual([o2,o3,o5],self.results.dupes)

    def test_dupe_list_remember_sort(self):
        o1,o2,o3,o4,o5 = self.objects
        o1.size = 5
        o2.size = 4
        o3.size = 3
        o4.size = 2
        o5.size = 1
        self.results.sort_dupes(2)
        self.results.make_ref(o2)
        self.assertEqual([o5,o3,o1],self.results.dupes)

    def test_dupe_list_sort_delta_values(self):
        o1,o2,o3,o4,o5 = self.objects
        o1.size = 10
        o2.size = 2 #-8
        o3.size = 3 #-7
        o4.size = 20
        o5.size = 1 #-19
        self.results.sort_dupes(2,delta=True)
        self.assertEqual([o5,o2,o3],self.results.dupes)

    def test_sort_empty_list(self):
        # There was an infinite loop when sorting an empty list.
        r = Results(data)
        r.sort_dupes(0)
        self.assertEqual([],r.dupes)

    def test_dupe_list_update_on_remove_duplicates(self):
        o1,o2,o3,o4,o5 = self.objects
        self.assertEqual(3,len(self.results.dupes))
        self.results.remove_duplicates([o2])
        self.assertEqual(2,len(self.results.dupes))


class TCResultsMarkings(TestCase):
    def setUp(self):
        self.results = Results(data)
        self.objects,self.matches,self.groups = GetTestGroups()
        self.results.groups = self.groups

    def test_stat_line(self):
        self.assertEqual("0 / 3 (0.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
        self.results.mark(self.objects[1])
        self.assertEqual("1 / 3 (1.00 KB / 1.01 KB) duplicates marked.",self.results.stat_line)
        self.results.mark_invert()
        self.assertEqual("2 / 3 (2.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
        self.results.mark_invert()
        self.results.unmark(self.objects[1])
        self.results.mark(self.objects[2])
        self.results.mark(self.objects[4])
        self.assertEqual("2 / 3 (2.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
        self.results.mark(self.objects[0]) # this is a ref, it can't be counted
        self.assertEqual("2 / 3 (2.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)
        self.results.groups = self.groups
        self.assertEqual("0 / 3 (0.00 B / 1.01 KB) duplicates marked.",self.results.stat_line)

    def test_with_ref_duplicate(self):
        self.objects[1].is_ref = True
        self.results.groups = self.groups
        self.assert_(not self.results.mark(self.objects[1]))
        self.results.mark(self.objects[2])
        self.assertEqual("1 / 2 (1.00 B / 2.00 B) duplicates marked.",self.results.stat_line)

    def test_perform_on_marked(self):
        def log_object(o):
            log.append(o)
            return True

        log = []
        self.results.mark_all()
        self.results.perform_on_marked(log_object,False)
        self.assert_(self.objects[1] in log)
        self.assert_(self.objects[2] in log)
        self.assert_(self.objects[4] in log)
        self.assertEqual(3,len(log))
        log = []
        self.results.mark_none()
        self.results.mark(self.objects[4])
        self.results.perform_on_marked(log_object,True)
        self.assertEqual(1,len(log))
        self.assert_(self.objects[4] in log)
        self.assertEqual(1,len(self.results.groups))

    def test_perform_on_marked_with_problems(self):
        def log_object(o):
            log.append(o)
            return o is not self.objects[1]

        log = []
        self.results.mark_all()
        self.assert_(self.results.is_marked(self.objects[1]))
        self.assertEqual(1,self.results.perform_on_marked(log_object, True))
        self.assertEqual(3,len(log))
        self.assertEqual(1,len(self.results.groups))
        self.assertEqual(2,len(self.results.groups[0]))
        self.assert_(self.objects[1] in self.results.groups[0])
        self.assert_(not self.results.is_marked(self.objects[2]))
        self.assert_(self.results.is_marked(self.objects[1]))

    def test_perform_on_marked_with_ref(self):
        def log_object(o):
            log.append(o)
            return True

        log = []
        self.objects[0].is_ref = True
        self.objects[1].is_ref = True
        self.results.mark_all()
        self.results.perform_on_marked(log_object,True)
        self.assert_(self.objects[1] not in log)
        self.assert_(self.objects[2] in log)
        self.assert_(self.objects[4] in log)
        self.assertEqual(2,len(log))
        self.assertEqual(0,len(self.results.groups))

    def test_perform_on_marked_remove_objects_only_at_the_end(self):
        def check_groups(o):
            self.assertEqual(3,len(g1))
            self.assertEqual(2,len(g2))
            return True

        g1,g2 = self.results.groups
        self.results.mark_all()
        self.results.perform_on_marked(check_groups,True)
        self.assertEqual(0,len(g1))
        self.assertEqual(0,len(g2))
        self.assertEqual(0,len(self.results.groups))

    def test_remove_duplicates(self):
        g1 = self.results.groups[0]
        g2 = self.results.groups[1]
        self.results.mark(g1.dupes[0])
        self.assertEqual("1 / 3 (1.00 KB / 1.01 KB) duplicates marked.",self.results.stat_line)
        self.results.remove_duplicates([g1.dupes[1]])
        self.assertEqual("1 / 2 (1.00 KB / 1.01 KB) duplicates marked.",self.results.stat_line)
        self.results.remove_duplicates([g1.dupes[0]])
        self.assertEqual("0 / 1 (0.00 B / 1.00 B) duplicates marked.",self.results.stat_line)

    def test_make_ref(self):
        g = self.results.groups[0]
        d = g.dupes[0]
        self.results.mark(d)
        self.assertEqual("1 / 3 (1.00 KB / 1.01 KB) duplicates marked.",self.results.stat_line)
        self.results.make_ref(d)
        self.assertEqual("0 / 3 (0.00 B / 3.00 B) duplicates marked.",self.results.stat_line)
        self.results.make_ref(d)
        self.assertEqual("0 / 3 (0.00 B / 3.00 B) duplicates marked.",self.results.stat_line)

    def test_SaveXML(self):
        self.results.mark(self.objects[1])
        self.results.mark_invert()
        f = StringIO.StringIO()
        self.results.save_to_xml(f)
        f.seek(0)
        doc = xml.dom.minidom.parse(f)
        root = doc.documentElement
        g1,g2 = root.getElementsByTagName('group')
        d1,d2,d3 = g1.getElementsByTagName('file')
        self.assertEqual('n',d1.getAttributeNode('marked').nodeValue)
        self.assertEqual('n',d2.getAttributeNode('marked').nodeValue)
        self.assertEqual('y',d3.getAttributeNode('marked').nodeValue)
        d1,d2 = g2.getElementsByTagName('file')
        self.assertEqual('n',d1.getAttributeNode('marked').nodeValue)
        self.assertEqual('y',d2.getAttributeNode('marked').nodeValue)

    def test_LoadXML(self):
        def get_file(path):
            return [f for f in self.objects if str(f.path) == path][0]

        self.objects[4].name = 'ibabtu 2' # we can't have 2 files with the same path
        self.results.mark(self.objects[1])
        self.results.mark_invert()
        f = StringIO.StringIO()
        self.results.save_to_xml(f)
        f.seek(0)
        r = Results(data)
        r.load_from_xml(f,get_file)
        self.assert_(not r.is_marked(self.objects[0]))
        self.assert_(not r.is_marked(self.objects[1]))
        self.assert_(r.is_marked(self.objects[2]))
        self.assert_(not r.is_marked(self.objects[3]))
        self.assert_(r.is_marked(self.objects[4]))

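Reviewer note: the stat_line strings asserted throughout this file follow one template. A rough sketch of how such a line can be produced; sketch_stat_line, fmt_size and D are invented, and the real Results uses hsutil's format_size, which apparently rounds up, judging by the 1026 B -> "1.01 KB" assertions above.

import math

def fmt_size(n):
    # toy formatter: hsutil's format_size rounds *up*, hence 1026 B -> 1.01 KB
    if n < 1024:
        return '%.2f B' % n
    return '%.2f KB' % (math.ceil(n * 100.0 / 1024) / 100)

def sketch_stat_line(marked, dupes):
    return "%d / %d (%s / %s) duplicates marked." % (
        len(marked), len(dupes),
        fmt_size(sum(d.size for d in marked)),
        fmt_size(sum(d.size for d in dupes)))

class D(object):
    def __init__(self, size):
        self.size = size

dupes = [D(1024), D(1), D(1)]
print sketch_stat_line(dupes[:1], dupes)  # 1 / 3 (1.00 KB / 1.01 KB) duplicates marked.
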
class TCResultsXML(TestCase):
    def setUp(self):
        self.results = Results(data)
        self.objects, self.matches, self.groups = GetTestGroups()
        self.results.groups = self.groups

    def get_file(self, path): # use this as a callback for load_from_xml
        return [o for o in self.objects if o.path == path][0]

    def test_save_to_xml(self):
        self.objects[0].is_ref = True
        self.objects[0].words = [['foo','bar']]
        f = StringIO.StringIO()
        self.results.save_to_xml(f)
        f.seek(0)
        doc = xml.dom.minidom.parse(f)
        root = doc.documentElement
        self.assertEqual('results',root.nodeName)
        children = [c for c in root.childNodes if c.localName]
        self.assertEqual(2,len(children))
        self.assertEqual(2,len([c for c in children if c.nodeName == 'group']))
        g1,g2 = children
        children = [c for c in g1.childNodes if c.localName]
        self.assertEqual(6,len(children))
        self.assertEqual(3,len([c for c in children if c.nodeName == 'file']))
        self.assertEqual(3,len([c for c in children if c.nodeName == 'match']))
        d1,d2,d3 = [c for c in children if c.nodeName == 'file']
        self.assertEqual(op.join('basepath','foo bar'),d1.getAttributeNode('path').nodeValue)
        self.assertEqual(op.join('basepath','bar bleh'),d2.getAttributeNode('path').nodeValue)
        self.assertEqual(op.join('basepath','foo bleh'),d3.getAttributeNode('path').nodeValue)
        self.assertEqual('y',d1.getAttributeNode('is_ref').nodeValue)
        self.assertEqual('n',d2.getAttributeNode('is_ref').nodeValue)
        self.assertEqual('n',d3.getAttributeNode('is_ref').nodeValue)
        self.assertEqual('foo,bar',d1.getAttributeNode('words').nodeValue)
        self.assertEqual('bar,bleh',d2.getAttributeNode('words').nodeValue)
        self.assertEqual('foo,bleh',d3.getAttributeNode('words').nodeValue)
        children = [c for c in g2.childNodes if c.localName]
        self.assertEqual(3,len(children))
        self.assertEqual(2,len([c for c in children if c.nodeName == 'file']))
        self.assertEqual(1,len([c for c in children if c.nodeName == 'match']))
        d1,d2 = [c for c in children if c.nodeName == 'file']
        self.assertEqual(op.join('basepath','ibabtu'),d1.getAttributeNode('path').nodeValue)
        self.assertEqual(op.join('basepath','ibabtu'),d2.getAttributeNode('path').nodeValue)
        self.assertEqual('n',d1.getAttributeNode('is_ref').nodeValue)
        self.assertEqual('n',d2.getAttributeNode('is_ref').nodeValue)
        self.assertEqual('ibabtu',d1.getAttributeNode('words').nodeValue)
        self.assertEqual('ibabtu',d2.getAttributeNode('words').nodeValue)

    def test_LoadXML(self):
        def get_file(path):
            return [f for f in self.objects if str(f.path) == path][0]

        self.objects[0].is_ref = True
        self.objects[4].name = 'ibabtu 2' #we can't have 2 files with the same path
        f = StringIO.StringIO()
        self.results.save_to_xml(f)
        f.seek(0)
        r = Results(data)
        r.load_from_xml(f,get_file)
        self.assertEqual(2,len(r.groups))
        g1,g2 = r.groups
        self.assertEqual(3,len(g1))
        self.assert_(g1[0].is_ref)
        self.assert_(not g1[1].is_ref)
        self.assert_(not g1[2].is_ref)
        self.assert_(g1[0] is self.objects[0])
        self.assert_(g1[1] is self.objects[1])
        self.assert_(g1[2] is self.objects[2])
        self.assertEqual(['foo','bar'],g1[0].words)
        self.assertEqual(['bar','bleh'],g1[1].words)
        self.assertEqual(['foo','bleh'],g1[2].words)
        self.assertEqual(2,len(g2))
        self.assert_(not g2[0].is_ref)
        self.assert_(not g2[1].is_ref)
        self.assert_(g2[0] is self.objects[3])
        self.assert_(g2[1] is self.objects[4])
        self.assertEqual(['ibabtu'],g2[0].words)
        self.assertEqual(['ibabtu'],g2[1].words)

    def test_LoadXML_with_filename(self):
        def get_file(path):
            return [f for f in self.objects if str(f.path) == path][0]

        filename = op.join(self.tmpdir(), 'dupeguru_results.xml')
        self.objects[4].name = 'ibabtu 2' #we can't have 2 files with the same path
        self.results.save_to_xml(filename)
        r = Results(data)
        r.load_from_xml(filename,get_file)
        self.assertEqual(2,len(r.groups))

    def test_LoadXML_with_some_files_that_dont_exist_anymore(self):
        def get_file(path):
            if path.endswith('ibabtu 2'):
                return None
            return [f for f in self.objects if str(f.path) == path][0]

        self.objects[4].name = 'ibabtu 2' #we can't have 2 files with the same path
        f = StringIO.StringIO()
        self.results.save_to_xml(f)
        f.seek(0)
        r = Results(data)
        r.load_from_xml(f,get_file)
        self.assertEqual(1,len(r.groups))
        self.assertEqual(3,len(r.groups[0]))

    def test_LoadXML_missing_attributes_and_bogus_elements(self):
        def get_file(path):
            return [f for f in self.objects if str(f.path) == path][0]

        doc = xml.dom.minidom.Document()
        root = doc.appendChild(doc.createElement('foobar')) #The root element's name shouldn't matter, really.
        group_node = root.appendChild(doc.createElement('group'))
        dupe_node = group_node.appendChild(doc.createElement('file')) #Perfectly correct file
        dupe_node.setAttribute('path',op.join('basepath','foo bar'))
        dupe_node.setAttribute('is_ref','y')
        dupe_node.setAttribute('words','foo,bar')
        dupe_node = group_node.appendChild(doc.createElement('file')) #is_ref missing, defaults to 'n'
        dupe_node.setAttribute('path',op.join('basepath','foo bleh'))
        dupe_node.setAttribute('words','foo,bleh')
        dupe_node = group_node.appendChild(doc.createElement('file')) #words are missing, invalid.
        dupe_node.setAttribute('path',op.join('basepath','bar bleh'))
        dupe_node = group_node.appendChild(doc.createElement('file')) #path is missing, invalid.
        dupe_node.setAttribute('words','foo,bleh')
        dupe_node = group_node.appendChild(doc.createElement('foobar')) #Invalid element name
        dupe_node.setAttribute('path',op.join('basepath','bar bleh'))
        dupe_node.setAttribute('is_ref','y')
        dupe_node.setAttribute('words','bar,bleh')
        match_node = group_node.appendChild(doc.createElement('match')) # match pointing to a bad index
        match_node.setAttribute('first', '42')
        match_node.setAttribute('second', '45')
        match_node = group_node.appendChild(doc.createElement('match')) # match with missing attrs
        match_node = group_node.appendChild(doc.createElement('match')) # match with non-int values
        match_node.setAttribute('first', 'foo')
        match_node.setAttribute('second', 'bar')
        match_node.setAttribute('percentage', 'baz')
        group_node = root.appendChild(doc.createElement('foobar')) #invalid group
        group_node = root.appendChild(doc.createElement('group')) #empty group
        f = StringIO.StringIO()
        doc.writexml(f,'\t','\t','\n',encoding='utf-8')
        f.seek(0)
        r = Results(data)
        r.load_from_xml(f,get_file)
        self.assertEqual(1,len(r.groups))
        self.assertEqual(2,len(r.groups[0]))

    def test_xml_non_ascii(self):
        def get_file(path):
            if path == op.join('basepath',u'\xe9foo bar'):
                return objects[0]
            if path == op.join('basepath',u'bar bleh'):
                return objects[1]

        objects = [NamedObject(u"\xe9foo bar",True),NamedObject("bar bleh",True)]
        matches = engine.getmatches(objects) #with 2 objects, we should have 1 match
        groups = engine.get_groups(matches) #We should have 1 group
        for g in groups:
            g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as in the list
        results = Results(data)
        results.groups = groups
        f = StringIO.StringIO()
        results.save_to_xml(f)
        f.seek(0)
        r = Results(data)
        r.load_from_xml(f,get_file)
        g = r.groups[0]
        self.assertEqual(u"\xe9foo bar",g[0].name)
        self.assertEqual(['efoo','bar'],g[0].words)

    def test_load_invalid_xml(self):
        f = StringIO.StringIO()
        f.write('<this is invalid')
        f.seek(0)
        r = Results(data)
        r.load_from_xml(f,None)
        self.assertEqual(0,len(r.groups))

    def test_load_non_existant_xml(self):
        r = Results(data)
        try:
            r.load_from_xml('does_not_exist.xml', None)
        except IOError:
            self.fail()
        self.assertEqual(0,len(r.groups))

    def test_remember_match_percentage(self):
        group = self.groups[0]
        d1, d2, d3 = group
        fake_matches = set()
        fake_matches.add(engine.Match(d1, d2, 42))
        fake_matches.add(engine.Match(d1, d3, 43))
        fake_matches.add(engine.Match(d2, d3, 46))
        group.matches = fake_matches
        f = StringIO.StringIO()
        results = self.results
        results.save_to_xml(f)
        f.seek(0)
        results = Results(data)
        results.load_from_xml(f, self.get_file)
        group = results.groups[0]
        d1, d2, d3 = group
        match = group.get_match_of(d2) #d1 - d2
        self.assertEqual(42, match[2])
        match = group.get_match_of(d3) #d1 - d3
        self.assertEqual(43, match[2])
        group.switch_ref(d2)
        match = group.get_match_of(d3) #d2 - d3
        self.assertEqual(46, match[2])

    def test_save_and_load(self):
        # Previously, when reloading matches, they wouldn't be reloaded as namedtuples.
        f = StringIO.StringIO()
        self.results.save_to_xml(f)
        f.seek(0)
        self.results.load_from_xml(f, self.get_file)
        first(self.results.groups[0].matches).percentage
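# The two tests above lean on engine.Match behaving like a namedtuple, so that
# match[2] and match.percentage are the same value even after a save/load
# round-trip. A minimal sketch of that shape (illustrative; the real
# definition lives in the engine module):
#
#   from collections import namedtuple
#   Match = namedtuple('Match', 'first second percentage')
#   m = Match(d1, d2, 42)
#   assert m[2] == m.percentage == 42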
class TCResultsFilter(TestCase):
    def setUp(self):
        self.results = Results(data)
        self.objects, self.matches, self.groups = GetTestGroups()
        self.results.groups = self.groups
        self.results.apply_filter(r'foo')

    def test_groups(self):
        self.assertEqual(1, len(self.results.groups))
        self.assert_(self.results.groups[0] is self.groups[0])

    def test_dupes(self):
        # There are 2 objects matching the filter. The first one is the ref. Only the 3rd one is supposed to be in dupes.
        self.assertEqual(1, len(self.results.dupes))
        self.assert_(self.results.dupes[0] is self.objects[2])

    def test_cancel_filter(self):
        self.results.apply_filter(None)
        self.assertEqual(3, len(self.results.dupes))
        self.assertEqual(2, len(self.results.groups))

    def test_dupes_reconstructed_filtered(self):
        # make_ref resets self.__dupes to None. When it's reconstructed, we want it filtered.
        dupe = self.results.dupes[0] #3rd object
        self.results.make_ref(dupe)
        self.assertEqual(1, len(self.results.dupes))
        self.assert_(self.results.dupes[0] is self.objects[0])

    def test_include_ref_dupes_in_filter(self):
        # When only the ref of a group matches the filter, include it in the group.
        self.results.apply_filter(None)
        self.results.apply_filter(r'foo bar')
        self.assertEqual(1, len(self.results.groups))
        self.assertEqual(0, len(self.results.dupes))

    def test_filters_build_on_one_another(self):
        self.results.apply_filter(r'bar')
        self.assertEqual(1, len(self.results.groups))
        self.assertEqual(0, len(self.results.dupes))

    def test_stat_line(self):
        expected = '0 / 1 (0.00 B / 1.00 B) duplicates marked. filter: foo'
        self.assertEqual(expected, self.results.stat_line)
        self.results.apply_filter(r'bar')
        expected = '0 / 0 (0.00 B / 0.00 B) duplicates marked. filter: foo --> bar'
        self.assertEqual(expected, self.results.stat_line)
        self.results.apply_filter(None)
        expected = '0 / 3 (0.00 B / 1.01 KB) duplicates marked.'
        self.assertEqual(expected, self.results.stat_line)

    def test_mark_count_is_filtered_as_well(self):
        self.results.apply_filter(None)
        # We don't want to perform mark_all() because we want the mark list to contain objects
        for dupe in self.results.dupes:
            self.results.mark(dupe)
        self.results.apply_filter(r'foo')
        expected = '1 / 1 (1.00 B / 1.00 B) duplicates marked. filter: foo'
        self.assertEqual(expected, self.results.stat_line)

    def test_sort_groups(self):
        self.results.apply_filter(None)
        self.results.make_ref(self.objects[1]) # to have the 1024-byte object as ref
        g1,g2 = self.groups
        self.results.apply_filter('a') # Matches both groups
        self.results.sort_groups(2) #2 is the key for size
        self.assert_(self.results.groups[0] is g2)
        self.assert_(self.results.groups[1] is g1)
        self.results.apply_filter(None)
        self.assert_(self.results.groups[0] is g2)
        self.assert_(self.results.groups[1] is g1)
        self.results.sort_groups(2, False)
        self.results.apply_filter('a')
        self.assert_(self.results.groups[1] is g2)
        self.assert_(self.results.groups[0] is g1)

    def test_set_group(self):
        #We want the new group to be filtered
        self.objects, self.matches, self.groups = GetTestGroups()
        self.results.groups = self.groups
        self.assertEqual(1, len(self.results.groups))
        self.assert_(self.results.groups[0] is self.groups[0])

    def test_load_cancels_filter(self):
        def get_file(path):
            return [f for f in self.objects if str(f.path) == path][0]

        filename = op.join(self.tmpdir(), 'dupeguru_results.xml')
        self.objects[4].name = 'ibabtu 2' #we can't have 2 files with the same path
        self.results.save_to_xml(filename)
        r = Results(data)
        r.apply_filter('foo')
        r.load_from_xml(filename,get_file)
        self.assertEqual(2,len(r.groups))

    def test_remove_dupe(self):
        self.results.remove_duplicates([self.results.dupes[0]])
        self.results.apply_filter(None)
        self.assertEqual(2,len(self.results.groups))
        self.assertEqual(2,len(self.results.dupes))
        self.results.apply_filter('ibabtu')
        self.results.remove_duplicates([self.results.dupes[0]])
        self.results.apply_filter(None)
        self.assertEqual(1,len(self.results.groups))
        self.assertEqual(1,len(self.results.dupes))

    def test_filter_is_case_insensitive(self):
        self.results.apply_filter(None)
        self.results.apply_filter('FOO')
        self.assertEqual(1, len(self.results.dupes))
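    # Taken together, the tests above pin down apply_filter()'s contract:
    # filters are case-insensitive regexes, successive calls stack on one
    # another, and apply_filter(None) clears the stack. A standalone sketch of
    # that behavior (illustrative, not the actual Results implementation):
    #
    #   import re
    #
    #   class FilterStack(object):
    #       def __init__(self):
    #           self.filters = []
    #
    #       def apply_filter(self, filter_str):
    #           if filter_str is None:
    #               self.filters = []
    #           else:
    #               self.filters.append(re.compile(filter_str, re.IGNORECASE))
    #
    #       def matches(self, name):
    #           return all(f.search(name) for f in self.filters)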
    def test_make_ref_on_filtered_out_doesnt_mess_stats(self):
        # When filtered, a group containing filtered out dupes will display them as being reference.
        # When calling make_ref on such a dupe, the total size and dupecount stats get messed up
        # because they are *not* counted in the stats in the first place.
        g1, g2 = self.groups
        bar_bleh = g1[1] # The "bar bleh" dupe is filtered out
        self.results.make_ref(bar_bleh)
        # Now the stats should display *2* markable dupes (instead of 1)
        expected = '0 / 2 (0.00 B / 2.00 B) duplicates marked. filter: foo'
        self.assertEqual(expected, self.results.stat_line)
        self.results.apply_filter(None) # Now let's make sure our unfiltered results aren't messed up
        expected = '0 / 3 (0.00 B / 3.00 B) duplicates marked.'
        self.assertEqual(expected, self.results.stat_line)


class TCResultsRefFile(TestCase):
    def setUp(self):
        self.results = Results(data)
        self.objects, self.matches, self.groups = GetTestGroups()
        self.objects[0].is_ref = True
        self.objects[1].is_ref = True
        self.results.groups = self.groups

    def test_stat_line(self):
        expected = '0 / 2 (0.00 B / 2.00 B) duplicates marked.'
        self.assertEqual(expected, self.results.stat_line)

    def test_make_ref(self):
        d = self.results.groups[0].dupes[1] #non-ref
        r = self.results.groups[0].ref
        self.results.make_ref(d)
        expected = '0 / 1 (0.00 B / 1.00 B) duplicates marked.'
        self.assertEqual(expected, self.results.stat_line)
        self.results.make_ref(r)
        expected = '0 / 2 (0.00 B / 2.00 B) duplicates marked.'
        self.assertEqual(expected, self.results.stat_line)

core/tests/scanner_test.py (new file, 467 lines)
@@ -0,0 +1,467 @@
# Created By: Virgil Dupras
# Created On: 2006/03/03
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

from nose.tools import eq_

from hsutil import job, io
from hsutil.path import Path
from hsutil.testcase import TestCase

from .. import fs
from ..engine import getwords, Match
from ..ignore import IgnoreList
from ..scanner import *

class NamedObject(object):
    def __init__(self, name="foobar", size=1):
        self.name = name
        self.size = size
        self.path = Path('')
        self.words = getwords(name)


no = NamedObject

#--- Scanner
class ScannerTestFakeFiles(TestCase):
    def setUp(self):
        # This is a hack to avoid invalidating all previous tests since the scanner started to test
        # for file existence before doing the match grouping.
        self.mock(io, 'exists', lambda _: True)

    def test_empty(self):
        s = Scanner()
        r = s.GetDupeGroups([])
        eq_(r, [])

    def test_default_settings(self):
        s = Scanner()
        eq_(s.min_match_percentage, 80)
        eq_(s.scan_type, SCAN_TYPE_FILENAME)
        eq_(s.mix_file_kind, True)
        eq_(s.word_weighting, False)
        eq_(s.match_similar_words, False)
        assert isinstance(s.ignore_list, IgnoreList)

    def test_simple_with_default_settings(self):
        s = Scanner()
        f = [no('foo bar'), no('foo bar'), no('foo bleh')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        g = r[0]
        #'foo bleh' cannot be in the group because the default min match % is 80
        eq_(len(g), 2)
        assert g.ref in f[:2]
        assert g.dupes[0] in f[:2]

    def test_simple_with_lower_min_match(self):
        s = Scanner()
        s.min_match_percentage = 50
        f = [no('foo bar'), no('foo bar'), no('foo bleh')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        g = r[0]
        eq_(len(g), 3)

    def test_trim_all_ref_groups(self):
        # When all files of a group are ref, don't include that group in the results, but also don't
        # count the files from that group as discarded.
        s = Scanner()
        f = [no('foo'), no('foo'), no('bar'), no('bar')]
        f[2].is_ref = True
        f[3].is_ref = True
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        eq_(s.discarded_file_count, 0)

    def test_priorize(self):
        s = Scanner()
        f = [no('foo'), no('foo'), no('bar'), no('bar')]
        f[1].size = 2
        f[2].size = 3
        f[3].is_ref = True
        r = s.GetDupeGroups(f)
        g1, g2 = r
        assert f[1] in (g1.ref,g2.ref)
        assert f[0] in (g1.dupes[0],g2.dupes[0])
        assert f[3] in (g1.ref,g2.ref)
        assert f[2] in (g1.dupes[0],g2.dupes[0])

    def test_content_scan(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT
        f = [no('foo'), no('bar'), no('bleh')]
        f[0].md5 = f[0].md5partial = 'foobar'
        f[1].md5 = f[1].md5partial = 'foobar'
        f[2].md5 = f[2].md5partial = 'bleh'
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        eq_(len(r[0]), 2)
        eq_(s.discarded_file_count, 0) # don't count the different md5 as discarded!

    def test_content_scan_compare_sizes_first(self):
        class MyFile(no):
            @property
            def md5(file):
                raise AssertionError()

        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT
        f = [MyFile('foo', 1), MyFile('bar', 2)]
        eq_(len(s.GetDupeGroups(f)), 0)
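    # test_content_scan_compare_sizes_first documents that a content scan never
    # computes a hash for files whose sizes differ (MyFile.md5 raises if it is
    # ever read). A sketch of that kind of size prefilter (illustrative, not
    # the actual Scanner code):
    #
    #   from collections import defaultdict
    #
    #   def size_buckets(files):
    #       buckets = defaultdict(list)
    #       for f in files:
    #           buckets[f.size].append(f)
    #       # only buckets with 2+ files can contain duplicates, so only
    #       # those files ever get their md5 computed
    #       return [group for group in buckets.values() if len(group) > 1]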
    def test_min_match_perc_doesnt_matter_for_content_scan(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT
        f = [no('foo'), no('bar'), no('bleh')]
        f[0].md5 = f[0].md5partial = 'foobar'
        f[1].md5 = f[1].md5partial = 'foobar'
        f[2].md5 = f[2].md5partial = 'bleh'
        s.min_match_percentage = 101
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        eq_(len(r[0]), 2)
        s.min_match_percentage = 0
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        eq_(len(r[0]), 2)

    def test_content_scan_doesnt_put_md5_in_words_at_the_end(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT
        f = [no('foo'),no('bar')]
        f[0].md5 = f[0].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
        f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
        r = s.GetDupeGroups(f)
        g = r[0]

    def test_extension_is_not_counted_in_filename_scan(self):
        s = Scanner()
        s.min_match_percentage = 100
        f = [no('foo.bar'), no('foo.bleh')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        eq_(len(r[0]), 2)

    def test_job(self):
        def do_progress(progress, desc=''):
            log.append(progress)
            return True

        s = Scanner()
        log = []
        f = [no('foo bar'), no('foo bar'), no('foo bleh')]
        r = s.GetDupeGroups(f, job.Job(1, do_progress))
        eq_(log[0], 0)
        eq_(log[-1], 100)

    def test_mix_file_kind(self):
        s = Scanner()
        s.mix_file_kind = False
        f = [no('foo.1'), no('foo.2')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 0)

    def test_word_weighting(self):
        s = Scanner()
        s.min_match_percentage = 75
        s.word_weighting = True
        f = [no('foo bar'), no('foo bar bleh')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        g = r[0]
        m = g.get_match_of(g.dupes[0])
        eq_(m.percentage, 75) # 16 letters, 12 matching
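    # A quick check of the 75% above (illustrative arithmetic, not part of the
    # original file): with word weighting, each word counts for its letter
    # count. 'foo bar' weighs 3+3=6 and 'foo bar bleh' weighs 3+3+4=10, for a
    # total of 16; the shared words 'foo' and 'bar' weigh 6 on each side, so
    # 12 of the 16 weighted letters match: 12/16 = 75%.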
    def test_similar_words(self):
        s = Scanner()
        s.match_similar_words = True
        f = [no('The White Stripes'), no('The Whites Stripe'), no('Limp Bizkit'), no('Limp Bizkitt')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 2)

    def test_fields(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_FIELDS
        f = [no('The White Stripes - Little Ghost'), no('The White Stripes - Little Acorn')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 0)

    def test_fields_no_order(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_FIELDS_NO_ORDER
        f = [no('The White Stripes - Little Ghost'), no('Little Ghost - The White Stripes')]
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)

    def test_tag_scan(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_TAG
        o1 = no('foo')
        o2 = no('bar')
        o1.artist = 'The White Stripes'
        o1.title = 'The Air Near My Fingers'
        o2.artist = 'The White Stripes'
        o2.title = 'The Air Near My Fingers'
        r = s.GetDupeGroups([o1,o2])
        eq_(len(r), 1)

    def test_tag_with_album_scan(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_TAG
        s.scanned_tags = set(['artist', 'album', 'title'])
        o1 = no('foo')
        o2 = no('bar')
        o3 = no('bleh')
        o1.artist = 'The White Stripes'
        o1.title = 'The Air Near My Fingers'
        o1.album = 'Elephant'
        o2.artist = 'The White Stripes'
        o2.title = 'The Air Near My Fingers'
        o2.album = 'Elephant'
        o3.artist = 'The White Stripes'
        o3.title = 'The Air Near My Fingers'
        o3.album = 'foobar'
        r = s.GetDupeGroups([o1,o2,o3])
        eq_(len(r), 1)

    def test_that_dash_in_tags_dont_create_new_fields(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_TAG
        s.scanned_tags = set(['artist', 'album', 'title'])
        s.min_match_percentage = 50
        o1 = no('foo')
        o2 = no('bar')
        o1.artist = 'The White Stripes - a'
        o1.title = 'The Air Near My Fingers - a'
        o1.album = 'Elephant - a'
        o2.artist = 'The White Stripes - b'
        o2.title = 'The Air Near My Fingers - b'
        o2.album = 'Elephant - b'
        r = s.GetDupeGroups([o1,o2])
        eq_(len(r), 1)

    def test_tag_scan_with_different_scanned(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_TAG
        s.scanned_tags = set(['track', 'year'])
        o1 = no('foo')
        o2 = no('bar')
        o1.artist = 'The White Stripes'
        o1.title = 'some title'
        o1.track = 'foo'
        o1.year = 'bar'
        o2.artist = 'The White Stripes'
        o2.title = 'another title'
        o2.track = 'foo'
        o2.year = 'bar'
        r = s.GetDupeGroups([o1, o2])
        eq_(len(r), 1)

    def test_tag_scan_only_scans_existing_tags(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_TAG
        s.scanned_tags = set(['artist', 'foo'])
        o1 = no('foo')
        o2 = no('bar')
        o1.artist = 'The White Stripes'
        o1.foo = 'foo'
        o2.artist = 'The White Stripes'
        o2.foo = 'bar'
        r = s.GetDupeGroups([o1, o2])
        eq_(len(r), 1) # Because 'foo' is not a real tag, it isn't scanned, and the files match

    def test_tag_scan_converts_to_str(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_TAG
        s.scanned_tags = set(['track'])
        o1 = no('foo')
        o2 = no('bar')
        o1.track = 42
        o2.track = 42
        try:
            r = s.GetDupeGroups([o1, o2])
        except TypeError:
            raise AssertionError()
        eq_(len(r), 1)

    def test_tag_scan_non_ascii(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_TAG
        s.scanned_tags = set(['title'])
        o1 = no('foo')
        o2 = no('bar')
        o1.title = u'foobar\u00e9'
        o2.title = u'foobar\u00e9'
        try:
            r = s.GetDupeGroups([o1, o2])
        except UnicodeEncodeError:
            raise AssertionError()
        eq_(len(r), 1)

    def test_audio_content_scan(self):
        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT_AUDIO
        f = [no('foo'), no('bar'), no('bleh')]
        f[0].md5 = 'foo'
        f[1].md5 = 'bar'
        f[2].md5 = 'bleh'
        f[0].md5partial = 'foo'
        f[1].md5partial = 'foo'
        f[2].md5partial = 'bleh'
        f[0].audiosize = 1
        f[1].audiosize = 1
        f[2].audiosize = 1
        r = s.GetDupeGroups(f)
        eq_(len(r), 1)
        eq_(len(r[0]), 2)

    def test_audio_content_scan_compare_sizes_first(self):
        class MyFile(no):
            @property
            def md5partial(file):
                raise AssertionError()

        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT_AUDIO
        f = [MyFile('foo'), MyFile('bar')]
        f[0].audiosize = 1
        f[1].audiosize = 2
        eq_(len(s.GetDupeGroups(f)), 0)

    def test_ignore_list(self):
        s = Scanner()
        f1 = no('foobar')
        f2 = no('foobar')
        f3 = no('foobar')
        f1.path = Path('dir1/foobar')
        f2.path = Path('dir2/foobar')
        f3.path = Path('dir3/foobar')
        s.ignore_list.Ignore(str(f1.path),str(f2.path))
        s.ignore_list.Ignore(str(f1.path),str(f3.path))
        r = s.GetDupeGroups([f1,f2,f3])
        eq_(len(r), 1)
        g = r[0]
        eq_(len(g.dupes), 1)
        assert f1 not in g
        assert f2 in g
        assert f3 in g
        # Ignored matches are not counted as discarded
        eq_(s.discarded_file_count, 0)
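    # test_ignore_list above and test_ignore_list_checks_for_unicode below use
    # IgnoreList.Ignore(path1, path2) to suppress specific file pairs. A sketch
    # of the symmetric pair-set behavior they rely on (illustrative;
    # AreIgnored is a hypothetical name, not necessarily the real API):
    #
    #   class PairIgnoreList(object):
    #       def __init__(self):
    #           self._pairs = set()
    #
    #       def Ignore(self, first, second):
    #           # frozenset makes the pair order-insensitive
    #           self._pairs.add(frozenset([first, second]))
    #
    #       def AreIgnored(self, first, second):
    #           return frozenset([first, second]) in self._pairs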
    def test_ignore_list_checks_for_unicode(self):
        #scanner was calling path_str for ignore list checks. Since the Path changes, it must
        #be unicode(path)
        s = Scanner()
        f1 = no('foobar')
        f2 = no('foobar')
        f3 = no('foobar')
        f1.path = Path(u'foo1\u00e9')
        f2.path = Path(u'foo2\u00e9')
        f3.path = Path(u'foo3\u00e9')
        s.ignore_list.Ignore(unicode(f1.path),unicode(f2.path))
        s.ignore_list.Ignore(unicode(f1.path),unicode(f3.path))
        r = s.GetDupeGroups([f1,f2,f3])
        eq_(len(r), 1)
        g = r[0]
        eq_(len(g.dupes), 1)
        assert f1 not in g
        assert f2 in g
        assert f3 in g

    def test_file_evaluates_to_false(self):
        # A very wrong way to use any() was added at some point, causing the resulting group list
        # to be empty.
        class FalseNamedObject(NamedObject):
            def __nonzero__(self):
                return False

        s = Scanner()
        f1 = FalseNamedObject('foobar')
        f2 = FalseNamedObject('foobar')
        r = s.GetDupeGroups([f1, f2])
        eq_(len(r), 1)

    def test_size_threshold(self):
        # Only files with a size equal to or higher than size_threshold are scanned
        s = Scanner()
        f1 = no('foo', 1)
        f2 = no('foo', 2)
        f3 = no('foo', 3)
        s.size_threshold = 2
        groups = s.GetDupeGroups([f1,f2,f3])
        eq_(len(groups), 1)
        [group] = groups
        eq_(len(group), 2)
        assert f1 not in group
        assert f2 in group
        assert f3 in group

    def test_tie_breaker_path_deepness(self):
        # If there is a tie in prioritization, path deepness is used as a tie breaker
        s = Scanner()
        o1, o2 = no('foo'), no('foo')
        o1.path = Path('foo')
        o2.path = Path('foo/bar')
        [group] = s.GetDupeGroups([o1, o2])
        assert group.ref is o2

    def test_tie_breaker_copy(self):
        # if "copy" is in the words used (even if it has a deeper path), it becomes a dupe
        s = Scanner()
        o1, o2 = no('foo bar Copy'), no('foo bar')
        o1.path = Path('deeper/path')
        o2.path = Path('foo')
        [group] = s.GetDupeGroups([o1, o2])
        assert group.ref is o2

    def test_tie_breaker_same_name_plus_digit(self):
        # if ref has the same words as the dupe plus just one extra word which is a digit, it
        # becomes a dupe
        s = Scanner()
        o1, o2 = no('foo bar 42'), no('foo bar')
        o1.path = Path('deeper/path')
        o2.path = Path('foo')
        [group] = s.GetDupeGroups([o1, o2])
        assert group.ref is o2

    def test_partial_group_match(self):
        # Count the number of discarded matches (when a file doesn't match all other dupes of the
        # group) in Scanner.discarded_file_count
        s = Scanner()
        o1, o2, o3 = no('a b'), no('a'), no('b')
        s.min_match_percentage = 50
        [group] = s.GetDupeGroups([o1, o2, o3])
        eq_(len(group), 2)
        assert o1 in group
        assert o2 in group
        assert o3 not in group
        eq_(s.discarded_file_count, 1)


class ScannerTest(TestCase):
    def test_dont_group_files_that_dont_exist(self):
        # When creating groups, check that files exist first. It's possible that these files have
        # been moved during the scan by the user.
        # In this test, we have to delete one of the files between the get_matches() part and the
        # get_groups() part.
        s = Scanner()
        s.scan_type = SCAN_TYPE_CONTENT
        p = self.tmppath()
        io.open(p + 'file1', 'w').write('foo')
        io.open(p + 'file2', 'w').write('foo')
        file1, file2 = fs.get_files(p)
        def getmatches(*args, **kw):
            io.remove(file2.path)
            return [Match(file1, file2, 100)]
        s._getmatches = getmatches

        assert not s.GetDupeGroups([file1, file2])