Mirror of https://github.com/arsenetar/dupeguru.git, synced 2026-01-25 16:11:39 +00:00

Compare commits: me5.6.6...before-tig (22 commits)

911521d8e0, b25c1c3a3b, 37a40040b3, 25dadc83eb, b8c11b5aae, a3ab314378,
794192835d, 385768a69b, a281931b16, 085311d559, 4d7f032889, cf44c93013,
787cbcd01f, b2b316b642, 49165125e4, 54ac0fd19e, 0aff7f16e5, f9abc3b35d,
b167a51243, 371cdda911, 11977c6533, 7228adf433
```diff
@@ -14,13 +14,13 @@ import os
 import os.path as op
 import logging
 
-from hsutil import job, io, files
+from hsutil import io, files
 from hsutil.path import Path
 from hsutil.reg import RegistrableApplication, RegistrationRequired
 from hsutil.misc import flatten, first
 from hsutil.str import escape
 
-from . import directories, results, scanner, export
+from . import directories, results, scanner, export, fs
 
 JOB_SCAN = 'job_scan'
 JOB_LOAD = 'job_load'
@@ -98,13 +98,8 @@ class DupeGuru(RegistrableApplication):
         return ['---'] * len(self.data.COLUMNS)
 
     def _get_file(self, str_path):
-        p = Path(str_path)
-        for d in self.directories:
-            if p not in d.path:
-                continue
-            result = d.find_path(p[d.path:])
-            if result is not None:
-                return result
+        path = Path(str_path)
+        return fs.get_file(path, self.directories.fileclasses)
 
     @staticmethod
     def _recycle_dupe(dupe):
@@ -150,7 +145,7 @@ class DupeGuru(RegistrableApplication):
         2 = absolute re-creation.
         """
         source_path = dupe.path
-        location_path = dupe.root.path
+        location_path = first(p for p in self.directories if dupe.path in p)
         dest_path = Path(destination)
         if dest_type == 2:
             dest_path = dest_path + source_path[1:-1] #Remove drive letter and filename
```
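The `_get_file` hunk above is the core move of this changeset: string paths now resolve directly to `File` objects through the new `fs` module instead of being looked up in an hsfs `Directory` tree. A minimal usage sketch of the new helper; the sample path is hypothetical and the `dupeguru` package name is taken from the test imports further down:

```python
from hsutil.path import Path
from dupeguru import fs

# get_file() asks each class in `fileclasses`, in order, whether it
# can_handle() the path, and instantiates the first one that accepts it;
# it returns None when nothing matches (e.g. a directory or a symlink).
f = fs.get_file(Path('/tmp/example.txt'), fileclasses=[fs.File])
if f is not None:
    print f.name, f.size  # metadata is read lazily on first attribute access
```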
```diff
@@ -12,13 +12,12 @@ from AppKit import *
 import logging
 import os.path as op
 
-import hsfs as fs
 from hsutil import io, cocoa, job
 from hsutil.cocoa import install_exception_hook
 from hsutil.misc import stripnone
 from hsutil.reg import RegistrationRequired
 
-import app, data
+from . import app, fs
 
 JOBID2TITLE = {
     app.JOB_SCAN: "Scanning for duplicates",
@@ -43,8 +42,6 @@ class DupeGuru(app.DupeGuru):
         logging.basicConfig(level=LOGGING_LEVEL, format='%(levelname)s %(message)s')
         logging.debug('started in debug mode')
         install_exception_hook()
-        if data_module is None:
-            data_module = data
         appsupport = NSSearchPathForDirectoriesInDomains(NSApplicationSupportDirectory, NSUserDomainMask, True)[0]
         appdata = op.join(appsupport, appdata_subdir)
         app.DupeGuru.__init__(self, data_module, appdata, appid)
@@ -91,15 +88,15 @@ class DupeGuru(app.DupeGuru):
         except IndexError:
             return (None,None)
 
-    def GetDirectory(self,node_path,curr_dir=None):
+    def get_folder_path(self, node_path, curr_path=None):
         if not node_path:
-            return curr_dir
-        if curr_dir is not None:
-            l = curr_dir.dirs
+            return curr_path
+        current_index = node_path[0]
+        if curr_path is None:
+            curr_path = self.directories[current_index]
         else:
-            l = self.directories
-        d = l[node_path[0]]
-        return self.GetDirectory(node_path[1:],d)
+            curr_path = self.directories.get_subfolders(curr_path)[current_index]
+        return self.get_folder_path(node_path[1:], curr_path)
 
     def RefreshDetailsTable(self,dupe,group):
         l1 = self._get_display_info(dupe, group, False)
@@ -146,13 +143,13 @@ class DupeGuru(app.DupeGuru):
     def RemoveSelected(self):
        self.results.remove_duplicates(self.selected_dupes)
 
-    def RenameSelected(self,newname):
+    def RenameSelected(self, newname):
        try:
            d = self.selected_dupes[0]
-            d = d.move(d.parent,newname)
+            d.rename(newname)
            return True
-        except (IndexError,fs.FSError),e:
-            logging.warning("dupeGuru Warning: %s" % str(e))
+        except (IndexError, fs.FSError) as e:
+            logging.warning("dupeGuru Warning: %s" % unicode(e))
            return False
 
     def RevealSelected(self):
@@ -214,9 +211,9 @@ class DupeGuru(app.DupeGuru):
             self.results.dupes[row] for row in rows if row in xrange(len(self.results.dupes))
         ]
 
-    def SetDirectoryState(self,node_path,state):
-        d = self.GetDirectory(node_path)
-        self.directories.set_state(d.path,state)
+    def SetDirectoryState(self, node_path, state):
+        p = self.get_folder_path(node_path)
+        self.directories.set_state(p, state)
 
     def sort_dupes(self,key,asc):
         self.results.sort_dupes(key,asc,self.display_delta_values)
@@ -245,8 +242,12 @@ class DupeGuru(app.DupeGuru):
             return [len(g.dupes) for g in self.results.groups]
         elif tag == 1: #Directories
             try:
-                dirs = self.GetDirectory(node_path).dirs if node_path else self.directories
-                return [d.dircount for d in dirs]
+                if node_path:
+                    path = self.get_folder_path(node_path)
+                    subfolders = self.directories.get_subfolders(path)
+                else:
+                    subfolders = self.directories
+                return [len(self.directories.get_subfolders(path)) for path in subfolders]
             except IndexError: # node_path out of range
                 return []
         else: #Power Marker
@@ -270,8 +271,9 @@ class DupeGuru(app.DupeGuru):
             return result
         elif tag == 1: #Directories
             try:
-                d = self.GetDirectory(node_path)
-                return [d.name, self.directories.get_state(d.path)]
+                path = self.get_folder_path(node_path)
+                name = unicode(path) if len(node_path) == 1 else path[-1]
+                return [name, self.directories.get_state(path)]
             except IndexError: # node_path out of range
                 return []
```
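`GetDirectory` becomes `get_folder_path`: a tree node path (a list of integer indexes) is resolved recursively, indexing the root list first and the sorted subfolder list at each deeper level, and the result is a plain `Path` rather than an hsfs node. A hypothetical walk-through, assuming two root folders were already added to the app:

```python
# node_path [1, 0] means: second root folder, then its first subfolder
# (Directories.get_subfolders() sorts names case-insensitively).
dirs = app.directories                 # the app's Directories instance
root = dirs[1]                         # Path of the second root
child = dirs.get_subfolders(root)[0]   # its first sorted subfolder
assert app.get_folder_path([1, 0]) == child
```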
```diff
@@ -40,63 +40,3 @@ def format_dupe_count(c):
 
 def cmp_value(value):
     return value.lower() if isinstance(value, basestring) else value
-
-COLUMNS = [
-    {'attr':'name','display':'Filename'},
-    {'attr':'path','display':'Directory'},
-    {'attr':'size','display':'Size (KB)'},
-    {'attr':'extension','display':'Kind'},
-    {'attr':'ctime','display':'Creation'},
-    {'attr':'mtime','display':'Modification'},
-    {'attr':'percentage','display':'Match %'},
-    {'attr':'words','display':'Words Used'},
-    {'attr':'dupe_count','display':'Dupe Count'},
-]
-
-METADATA_TO_READ = ['size', 'ctime', 'mtime']
-
-def GetDisplayInfo(dupe, group, delta):
-    size = dupe.size
-    ctime = dupe.ctime
-    mtime = dupe.mtime
-    m = group.get_match_of(dupe)
-    if m:
-        percentage = m.percentage
-        dupe_count = 0
-        if delta:
-            r = group.ref
-            size -= r.size
-            ctime -= r.ctime
-            mtime -= r.mtime
-    else:
-        percentage = group.percentage
-        dupe_count = len(group.dupes)
-    return [
-        dupe.name,
-        format_path(dupe.path),
-        format_size(size, 0, 1, False),
-        dupe.extension,
-        format_timestamp(ctime, delta and m),
-        format_timestamp(mtime, delta and m),
-        format_perc(percentage),
-        format_words(dupe.words),
-        format_dupe_count(dupe_count)
-    ]
-
-def GetDupeSortKey(dupe, get_group, key, delta):
-    if key == 6:
-        m = get_group().get_match_of(dupe)
-        return m.percentage
-    if key == 8:
-        return 0
-    r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
-    if delta and (key in (2, 4, 5)):
-        r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
-    return r
-
-def GetGroupSortKey(group, key):
-    if key == 6:
-        return group.percentage
-    if key == 8:
-        return len(group)
-    return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))
```
```diff
@@ -9,11 +9,12 @@
 
 import xml.dom.minidom
 
-from hsfs import phys
-import hsfs as fs
+from hsutil import io
 from hsutil.files import FileOrPath
 from hsutil.path import Path
 
+from . import fs
+
 (STATE_NORMAL,
 STATE_REFERENCE,
 STATE_EXCLUDED) = range(3)
@@ -26,15 +27,14 @@ class InvalidPathError(Exception):
 
 class Directories(object):
     #---Override
-    def __init__(self):
+    def __init__(self, fileclasses=[fs.File]):
         self._dirs = []
         self.states = {}
-        self.dirclass = phys.Directory
-        self.special_dirclasses = {}
+        self.fileclasses = fileclasses
 
-    def __contains__(self,path):
-        for d in self._dirs:
-            if path in d.path:
+    def __contains__(self, path):
+        for p in self._dirs:
+            if path in p:
                 return True
         return False
 
@@ -53,8 +53,7 @@ class Directories(object):
         if path[-1].startswith('.'): # hidden
             return STATE_EXCLUDED
 
-    def _get_files(self, from_dir):
-        from_path = from_dir.path
+    def _get_files(self, from_path):
         state = self.get_state(from_path)
         if state == STATE_EXCLUDED:
             # Recursively get files from folders with lots of subfolder is expensive. However, there
@@ -62,14 +61,21 @@ class Directories(object):
             # through self.states and see if we must continue, or we can stop right here to save time
             if not any(p[:len(from_path)] == from_path for p in self.states):
                 return
-        result = []
-        for subdir in from_dir.dirs:
-            for file in self._get_files(subdir):
-                yield file
-        if state != STATE_EXCLUDED:
-            for file in from_dir.files:
-                file.is_ref = state == STATE_REFERENCE
-                yield file
+        try:
+            filepaths = set()
+            if state != STATE_EXCLUDED:
+                for file in fs.get_files(from_path, fileclasses=self.fileclasses):
+                    file.is_ref = state == STATE_REFERENCE
+                    filepaths.add(file.path)
+                    yield file
+            subpaths = [from_path + name for name in io.listdir(from_path)]
+            # it's possible that a folder (bundle) gets into the file list. in that case, we don't want to recurse into it
+            subfolders = [p for p in subpaths if not io.islink(p) and io.isdir(p) and p not in filepaths]
+            for subfolder in subfolders:
+                for file in self._get_files(subfolder):
+                    yield file
+        except (EnvironmentError, fs.InvalidPath):
+            pass
 
     #---Public
     def add_path(self, path):
@@ -80,29 +86,30 @@ class Directories(object):
         under it will be removed. Can also raise InvalidPathError if 'path' does not exist.
         """
         if path in self:
-            raise AlreadyThereError
-        self._dirs = [d for d in self._dirs if d.path not in path]
-        try:
-            dirclass = self.special_dirclasses.get(path, self.dirclass)
-            d = dirclass(None, unicode(path))
-            d[:] #If an InvalidPath exception has to be raised, it will be raised here
-            self._dirs.append(d)
-            return d
-        except fs.InvalidPath:
+            raise AlreadyThereError()
+        if not io.exists(path):
             raise InvalidPathError()
+        self._dirs = [p for p in self._dirs if p not in path]
+        self._dirs.append(path)
+
+    @staticmethod
+    def get_subfolders(path):
+        """returns a sorted list of paths corresponding to subfolders in `path`"""
+        try:
+            names = [name for name in io.listdir(path) if io.isdir(path + name)]
+            names.sort(key=lambda x:x.lower())
+            return [path + name for name in names]
+        except EnvironmentError:
+            return []
 
     def get_files(self):
         """Returns a list of all files that are not excluded.
 
         Returned files also have their 'is_ref' attr set.
         """
-        for d in self._dirs:
-            d.force_update()
-            try:
-                for file in self._get_files(d):
-                    yield file
-            except fs.InvalidPath:
-                pass
+        for path in self._dirs:
+            for file in self._get_files(path):
+                yield file
 
     def get_state(self, path):
         """Returns the state of 'path' (One of the STATE_* const.)
@@ -123,8 +130,8 @@ class Directories(object):
             doc = xml.dom.minidom.parse(infile)
         except:
             return
-        root_dir_nodes = doc.getElementsByTagName('root_directory')
-        for rdn in root_dir_nodes:
+        root_path_nodes = doc.getElementsByTagName('root_directory')
+        for rdn in root_path_nodes:
             if not rdn.getAttributeNode('path'):
                 continue
             path = rdn.getAttributeNode('path').nodeValue
@@ -144,9 +151,9 @@ class Directories(object):
         with FileOrPath(outfile, 'wb') as fp:
             doc = xml.dom.minidom.Document()
             root = doc.appendChild(doc.createElement('directories'))
-            for root_dir in self:
-                root_dir_node = root.appendChild(doc.createElement('root_directory'))
-                root_dir_node.setAttribute('path', unicode(root_dir.path).encode('utf-8'))
+            for root_path in self:
+                root_path_node = root.appendChild(doc.createElement('root_directory'))
+                root_path_node.setAttribute('path', unicode(root_path).encode('utf-8'))
             for path, state in self.states.iteritems():
                 state_node = root.appendChild(doc.createElement('state'))
                 state_node.setAttribute('path', unicode(path).encode('utf-8'))
```
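With the hunks above, `Directories` stores bare `hsutil` `Path` objects, validates them with `io.exists()` at `add_path()` time, and walks the filesystem through the new `fs` module on every `get_files()` call. A hedged usage sketch; the paths are hypothetical:

```python
from hsutil.path import Path
from dupeguru import directories, fs

d = directories.Directories(fileclasses=[fs.File])
d.add_path(Path('/Users/me/Music'))   # raises InvalidPathError if missing
d.set_state(Path('/Users/me/Music/backup'), directories.STATE_EXCLUDED)
for f in d.get_files():               # fs.File objects with is_ref already set
    print f.path, f.is_ref
print d.get_subfolders(Path('/Users/me/Music'))  # sorted list of Path objects
```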
```diff
@@ -9,6 +9,7 @@
 
 from __future__ import division
 import difflib
+import itertools
 import logging
 import string
 from collections import defaultdict, namedtuple
@@ -156,58 +157,69 @@ def get_match(first, second, flags=()):
     percentage = compare(first.words, second.words, flags)
     return Match(first, second, percentage)
 
-class MatchFactory(object):
-    common_word_threshold = 50
-    match_similar_words = False
-    min_match_percentage = 0
-    weight_words = False
-    no_field_order = False
-    limit = 5000000
-
-    def getmatches(self, objects, j=job.nulljob):
-        j = j.start_subjob(2)
-        sj = j.start_subjob(2)
-        for o in objects:
-            if not hasattr(o, 'words'):
-                o.words = getwords(o.name)
-        word_dict = build_word_dict(objects, sj)
-        reduce_common_words(word_dict, self.common_word_threshold)
-        if self.match_similar_words:
-            merge_similar_words(word_dict)
-        match_flags = []
-        if self.weight_words:
-            match_flags.append(WEIGHT_WORDS)
-        if self.match_similar_words:
-            match_flags.append(MATCH_SIMILAR_WORDS)
-        if self.no_field_order:
-            match_flags.append(NO_FIELD_ORDER)
-        j.start_job(len(word_dict), '0 matches found')
-        compared = defaultdict(set)
-        result = []
-        try:
-            # This whole 'popping' thing is there to avoid taking too much memory at the same time.
-            while word_dict:
-                items = word_dict.popitem()[1]
-                while items:
-                    ref = items.pop()
-                    compared_already = compared[ref]
-                    to_compare = items - compared_already
-                    compared_already |= to_compare
-                    for other in to_compare:
-                        m = get_match(ref, other, match_flags)
-                        if m.percentage >= self.min_match_percentage:
-                            result.append(m)
-                            if len(result) >= self.limit:
-                                return result
-                j.add_progress(desc='%d matches found' % len(result))
-        except MemoryError:
-            # This is the place where the memory usage is at its peak during the scan.
-            # Just continue the process with an incomplete list of matches.
-            del compared # This should give us enough room to call logging.
-            logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
-            return result
-        return result
+def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False,
+        no_field_order=False, j=job.nulljob):
+    COMMON_WORD_THRESHOLD = 50
+    LIMIT = 5000000
+    j = j.start_subjob(2)
+    sj = j.start_subjob(2)
+    for o in objects:
+        if not hasattr(o, 'words'):
+            o.words = getwords(o.name)
+    word_dict = build_word_dict(objects, sj)
+    reduce_common_words(word_dict, COMMON_WORD_THRESHOLD)
+    if match_similar_words:
+        merge_similar_words(word_dict)
+    match_flags = []
+    if weight_words:
+        match_flags.append(WEIGHT_WORDS)
+    if match_similar_words:
+        match_flags.append(MATCH_SIMILAR_WORDS)
+    if no_field_order:
+        match_flags.append(NO_FIELD_ORDER)
+    j.start_job(len(word_dict), '0 matches found')
+    compared = defaultdict(set)
+    result = []
+    try:
+        # This whole 'popping' thing is there to avoid taking too much memory at the same time.
+        while word_dict:
+            items = word_dict.popitem()[1]
+            while items:
+                ref = items.pop()
+                compared_already = compared[ref]
+                to_compare = items - compared_already
+                compared_already |= to_compare
+                for other in to_compare:
+                    m = get_match(ref, other, match_flags)
+                    if m.percentage >= min_match_percentage:
+                        result.append(m)
+                        if len(result) >= LIMIT:
+                            return result
+            j.add_progress(desc='%d matches found' % len(result))
+    except MemoryError:
+        # This is the place where the memory usage is at its peak during the scan.
+        # Just continue the process with an incomplete list of matches.
+        del compared # This should give us enough room to call logging.
+        logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
+        return result
+    return result
+
+def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob):
+    j = j.start_subjob([2, 8])
+    size2files = defaultdict(set)
+    for file in j.iter_with_progress(files, 'Read size of %d/%d files'):
+        size2files[getattr(file, sizeattr)].add(file)
+    possible_matches = [files for files in size2files.values() if len(files) > 1]
+    del size2files
+    result = []
+    j.start_job(len(possible_matches), '0 matches found')
+    for group in possible_matches:
+        for first, second in itertools.combinations(group, 2):
+            if first.md5partial == second.md5partial:
+                if partial or first.md5 == second.md5:
+                    result.append(Match(first, second, 100))
+        j.add_progress(desc='%d matches found' % len(result))
+    return result
 
 class Group(object):
     #---Override
```
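The `MatchFactory` class dissolves into a module-level `getmatches()` whose keyword arguments replace the old instance attributes, and content scans gain a dedicated `getmatches_by_contents()` that buckets files by size before comparing digests. A sketch of calling both, assuming `files` holds objects with the attributes `fs.File` provides (`name`, `size`, `md5partial`, `md5`):

```python
from dupeguru import engine

# Word scan: what used to be MatchFactory attributes are now plain kwargs.
matches = engine.getmatches(files, min_match_percentage=80, weight_words=True)

# Content scan: only same-sized files are compared; md5partial must match,
# and unless partial=True the full md5 must match too.
content_matches = engine.getmatches_by_contents(files, sizeattr='size', partial=False)
```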
base/py/fs.py (new file, 178 lines)

```python
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-22
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

# This is a fork from hsfs. The reason for this fork is that hsfs has been designed for musicGuru
# and was re-used for dupeGuru. The problem is that hsfs is way over-engineered for dupeGuru,
# resulting needless complexity and memory usage. It's been a while since I wanted to do that fork,
# and I'm doing it now.

from __future__ import unicode_literals

import hashlib
import logging

from hsutil import io
from hsutil.misc import nonone, flatten
from hsutil.str import get_file_ext

class FSError(Exception):
    cls_message = "An error has occured on '{name}' in '{parent}'"
    def __init__(self, fsobject, parent=None):
        message = self.cls_message
        if isinstance(fsobject, basestring):
            name = fsobject
        elif isinstance(fsobject, File):
            name = fsobject.name
        else:
            name = ''
        parentname = unicode(parent) if parent is not None else ''
        Exception.__init__(self, message.format(name=name, parent=parentname))


class AlreadyExistsError(FSError):
    "The directory or file name we're trying to add already exists"
    cls_message = "'{name}' already exists in '{parent}'"

class InvalidPath(FSError):
    "The path of self is invalid, and cannot be worked with."
    cls_message = "'{name}' is invalid."

class InvalidDestinationError(FSError):
    """A copy/move operation has been called, but the destination is invalid."""
    cls_message = "'{name}' is an invalid destination for this operation."

class OperationError(FSError):
    """A copy/move/delete operation has been called, but the checkup after the
    operation shows that it didn't work."""
    cls_message = "Operation on '{name}' failed."

class File(object):
    INITIAL_INFO = {
        'size': 0,
        'ctime': 0,
        'mtime': 0,
        'md5': '',
        'md5partial': '',
    }

    def __init__(self, path):
        self.path = path
        #This offset is where we should start reading the file to get a partial md5
        #For audio file, it should be where audio data starts
        self._md5partial_offset = 0x4000 #16Kb
        self._md5partial_size = 0x4000 #16Kb

    def __getattr__(self, attrname):
        # Only called when attr is not there
        if attrname in self.INITIAL_INFO:
            try:
                self._read_info(attrname)
            except Exception as e:
                logging.warning("An error '%s' was raised while decoding '%s'", e, repr(self.path))
            try:
                return self.__dict__[attrname]
            except KeyError:
                return self.INITIAL_INFO[attrname]
        raise AttributeError()

    def _read_info(self, field):
        if field in ('size', 'ctime', 'mtime'):
            stats = io.stat(self.path)
            self.size = nonone(stats.st_size, 0)
            self.ctime = nonone(stats.st_ctime, 0)
            self.mtime = nonone(stats.st_mtime, 0)
        elif field == 'md5partial':
            try:
                fp = io.open(self.path, 'rb')
                offset = self._md5partial_offset
                size = self._md5partial_size
                fp.seek(offset)
                partialdata = fp.read(size)
                md5 = hashlib.md5(partialdata)
                self.md5partial = md5.digest()
                fp.close()
            except Exception:
                pass
        elif field == 'md5':
            try:
                fp = io.open(self.path, 'rb')
                filedata = fp.read()
                md5 = hashlib.md5(filedata)
                self.md5 = md5.digest()
                fp.close()
            except Exception:
                pass

    def _read_all_info(self, attrnames=None):
        """Cache all possible info.

        If `attrnames` is not None, caches only attrnames.
        """
        if attrnames is None:
            attrnames = self.INITIAL_INFO.keys()
        for attrname in attrnames:
            if attrname not in self.__dict__:
                self._read_info(attrname)

    #--- Public
    @classmethod
    def can_handle(cls, path):
        return not io.islink(path) and io.isfile(path)

    def rename(self, newname):
        if newname == self.name:
            return
        destpath = self.path[:-1] + newname
        if io.exists(destpath):
            raise AlreadyExistsError(newname, self.path[:-1])
        try:
            io.rename(self.path, destpath)
        except EnvironmentError:
            raise OperationError(self)
        if not io.exists(destpath):
            raise OperationError(self)
        self.path = destpath

    #--- Properties
    @property
    def extension(self):
        return get_file_ext(self.name)

    @property
    def name(self):
        return self.path[-1]


def get_file(path, fileclasses=[File]):
    for fileclass in fileclasses:
        if fileclass.can_handle(path):
            return fileclass(path)

def get_files(path, fileclasses=[File]):
    assert all(issubclass(fileclass, File) for fileclass in fileclasses)
    try:
        paths = [path + name for name in io.listdir(path)]
        result = []
        for path in paths:
            file = get_file(path, fileclasses=fileclasses)
            if file is not None:
                result.append(file)
        return result
    except EnvironmentError:
        raise InvalidPath(path)

def get_all_files(path, fileclasses=[File]):
    files = get_files(path, fileclasses=fileclasses)
    filepaths = set(f.path for f in files)
    subpaths = [path + name for name in io.listdir(path)]
    # it's possible that a folder (bundle) gets into the file list. in that case, we don't want to recurse into it
    subfolders = [p for p in subpaths if not io.islink(p) and io.isdir(p) and p not in filepaths]
    subfiles = flatten(get_all_files(subpath, fileclasses=fileclasses) for subpath in subfolders)
    return subfiles + files
```
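`File` reads its metadata lazily: any attribute named in `INITIAL_INFO` is computed on first access through `__getattr__`, stored in the instance `__dict__`, and served as a plain attribute from then on. A short sketch of that protocol, with a hypothetical path:

```python
from hsutil.path import Path
from dupeguru import fs

f = fs.File(Path('/tmp/song.mp3'))
f.size        # first access: one io.stat() call fills size, ctime and mtime
f.md5partial  # hashes 16KB starting at offset 0x4000 (audio-friendly default)
f.md5         # hashes the whole file; both digests are cached afterwards
```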
```diff
@@ -32,40 +32,32 @@ class Scanner(object):
         self.ignore_list = IgnoreList()
         self.discarded_file_count = 0
 
-    @staticmethod
-    def _filter_matches_by_content(matches, partial, j):
-        matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
-        md5attrname = 'md5partial' if partial else 'md5'
-        md5 = lambda f: getattr(f, md5attrname)
-        for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'):
-            md5(matched_file)
-        j.set_progress(100, 'Removing false matches')
-        return [m for m in matches if md5(m.first) == md5(m.second)]
-
     def _getmatches(self, files, j):
-        j = j.start_subjob(2)
-        mf = engine.MatchFactory()
-        if self.scan_type != SCAN_TYPE_CONTENT:
-            mf.match_similar_words = self.match_similar_words
-            mf.weight_words = self.word_weighting
-            mf.min_match_percentage = self.min_match_percentage
-        if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
-            self.scan_type = SCAN_TYPE_FIELDS
-            mf.no_field_order = True
-        func = {
-            SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
-            SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
-            SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
-            SCAN_TYPE_CONTENT: lambda f: [str(f.size)],
-            SCAN_TYPE_CONTENT_AUDIO: lambda f: [str(f.audiosize)]
-        }[self.scan_type]
-        for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
-            if self.size_threshold:
-                f.size # pre-read, makes a smoother progress if read here (especially for bundles)
-            f.words = func(f)
         if self.size_threshold:
+            j = j.start_subjob([2, 8])
+            for f in j.iter_with_progress(files, 'Read size of %d/%d files'):
+                f.size # pre-read, makes a smoother progress if read here (especially for bundles)
             files = [f for f in files if f.size >= self.size_threshold]
-        return mf.getmatches(files, j)
+        if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
+            sizeattr = 'size' if self.scan_type == SCAN_TYPE_CONTENT else 'audiosize'
+            return engine.getmatches_by_contents(files, sizeattr, partial=self.scan_type==SCAN_TYPE_CONTENT_AUDIO, j=j)
+        else:
+            j = j.start_subjob([2, 8])
+            kw = {}
+            kw['match_similar_words'] = self.match_similar_words
+            kw['weight_words'] = self.word_weighting
+            kw['min_match_percentage'] = self.min_match_percentage
+            if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
+                self.scan_type = SCAN_TYPE_FIELDS
+                kw['no_field_order'] = True
+            func = {
+                SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
+                SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
+                SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
+            }[self.scan_type]
+            for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
+                f.words = func(f)
+            return engine.getmatches(files, j=j, **kw)
 
     @staticmethod
     def _key_func(dupe):
@@ -86,10 +78,7 @@ class Scanner(object):
         for f in [f for f in files if not hasattr(f, 'is_ref')]:
             f.is_ref = False
         logging.info('Getting matches')
-        if self.match_factory is None:
-            matches = self._getmatches(files, j)
-        else:
-            matches = self.match_factory.getmatches(files, j)
+        matches = self._getmatches(files, j)
         logging.info('Found %d matches' % len(matches))
         if not self.mix_file_kind:
             j.set_progress(100, 'Removing false matches')
@@ -99,14 +88,6 @@ class Scanner(object):
         iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
         matches = [m for m in iter_matches
             if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
-        if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
-            j = j.start_subjob(3 if self.scan_type == SCAN_TYPE_CONTENT else 2)
-            matches = self._filter_matches_by_content(matches, partial=True, j=j)
-            if self.scan_type == SCAN_TYPE_CONTENT:
-                matches = self._filter_matches_by_content(matches, partial=False, j=j)
-            # We compared md5. No words were involved.
-            for m in matches:
-                m.first.words = m.second.words = ['--']
         logging.info('Grouping matches')
         groups = engine.get_groups(matches, j)
         matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
@@ -118,7 +99,6 @@ class Scanner(object):
         g.prioritize(self._key_func, self._tie_breaker)
         return groups
 
-    match_factory = None
     match_similar_words = False
     min_match_percentage = 80
     mix_file_kind = True
@@ -126,9 +106,3 @@ class Scanner(object):
     scanned_tags = set(['artist', 'title'])
     size_threshold = 0
     word_weighting = False
-
-class ScannerME(Scanner): # Scanner for Music Edition
-    @staticmethod
-    def _key_func(dupe):
-        return (not dupe.is_ref, -dupe.bitrate, -dupe.size)
-
```
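With `_filter_matches_by_content()` gone, `_getmatches()` is now a two-way dispatch: content scan types go straight to digest comparison, while everything else builds word lists and calls `engine.getmatches()`. The same dispatch, flattened into a standalone sketch with the names used in the diff above:

```python
# Sketch only: mirrors Scanner._getmatches() without the job-progress
# plumbing; scan_type and the option values come from the Scanner instance.
if scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
    sizeattr = 'size' if scan_type == SCAN_TYPE_CONTENT else 'audiosize'
    matches = engine.getmatches_by_contents(files, sizeattr,
        partial=(scan_type == SCAN_TYPE_CONTENT_AUDIO))
else:
    matches = engine.getmatches(files,
        min_match_percentage=min_match_percentage,
        match_similar_words=match_similar_words,
        weight_words=word_weighting,
        no_field_order=(scan_type == SCAN_TYPE_FIELDS_NO_ORDER))
```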
```diff
@@ -18,10 +18,10 @@ from hsutil.path import Path
 from hsutil.testcase import TestCase
 from hsutil.decorators import log_calls
 from hsutil import io
-import hsfs.phys
 
+from . import data
 from .results_test import GetTestGroups
-from .. import engine, data
+from .. import engine, fs
 try:
     from ..app_cocoa import DupeGuru as DupeGuruBase
 except ImportError:
@@ -35,7 +35,6 @@ class DupeGuru(DupeGuruBase):
     def _start_job(self, jobid, func):
         func(nulljob)
 
-
 def r2np(rows):
     #Transforms a list of rows [1,2,3] into a list of node paths [[1],[2],[3]]
     return [[i] for i in rows]
@@ -310,15 +309,15 @@ class TCDupeGuru(TestCase):
 
 class TCDupeGuru_renameSelected(TestCase):
     def setUp(self):
-        p = Path(tempfile.mkdtemp())
-        fp = open(str(p + 'foo bar 1'),mode='w')
+        p = self.tmppath()
+        fp = open(unicode(p + 'foo bar 1'),mode='w')
         fp.close()
-        fp = open(str(p + 'foo bar 2'),mode='w')
+        fp = open(unicode(p + 'foo bar 2'),mode='w')
         fp.close()
-        fp = open(str(p + 'foo bar 3'),mode='w')
+        fp = open(unicode(p + 'foo bar 3'),mode='w')
         fp.close()
-        refdir = hsfs.phys.Directory(None,str(p))
-        matches = engine.MatchFactory().getmatches(refdir.files)
+        files = fs.get_files(p)
+        matches = engine.getmatches(files)
         groups = engine.get_groups(matches)
         g = groups[0]
         g.prioritize(lambda x:x.name)
@@ -327,45 +326,41 @@ class TCDupeGuru_renameSelected(TestCase):
         self.app = app
         self.groups = groups
         self.p = p
-        self.refdir = refdir
+        self.files = files
 
-    def tearDown(self):
-        shutil.rmtree(str(self.p))
-
     def test_simple(self):
         app = self.app
-        refdir = self.refdir
         g = self.groups[0]
         app.SelectPowerMarkerNodePaths(r2np([0]))
-        self.assert_(app.RenameSelected('renamed'))
-        self.assert_('renamed' in refdir)
-        self.assert_('foo bar 2' not in refdir)
-        self.assert_(g.dupes[0] is refdir['renamed'])
-        self.assert_(g.dupes[0] in refdir)
+        assert app.RenameSelected('renamed')
+        names = io.listdir(self.p)
+        assert 'renamed' in names
+        assert 'foo bar 2' not in names
+        eq_(g.dupes[0].name, 'renamed')
 
     def test_none_selected(self):
         app = self.app
-        refdir = self.refdir
         g = self.groups[0]
         app.SelectPowerMarkerNodePaths([])
         self.mock(logging, 'warning', log_calls(lambda msg: None))
-        self.assert_(not app.RenameSelected('renamed'))
+        assert not app.RenameSelected('renamed')
         msg = logging.warning.calls[0]['msg']
-        self.assertEqual('dupeGuru Warning: list index out of range', msg)
-        self.assert_('renamed' not in refdir)
-        self.assert_('foo bar 2' in refdir)
-        self.assert_(g.dupes[0] is refdir['foo bar 2'])
+        eq_('dupeGuru Warning: list index out of range', msg)
+        names = io.listdir(self.p)
+        assert 'renamed' not in names
+        assert 'foo bar 2' in names
+        eq_(g.dupes[0].name, 'foo bar 2')
 
     def test_name_already_exists(self):
         app = self.app
-        refdir = self.refdir
         g = self.groups[0]
         app.SelectPowerMarkerNodePaths(r2np([0]))
         self.mock(logging, 'warning', log_calls(lambda msg: None))
-        self.assert_(not app.RenameSelected('foo bar 1'))
+        assert not app.RenameSelected('foo bar 1')
         msg = logging.warning.calls[0]['msg']
-        self.assert_(msg.startswith('dupeGuru Warning: \'foo bar 2\' already exists in'))
-        self.assert_('foo bar 1' in refdir)
-        self.assert_('foo bar 2' in refdir)
-        self.assert_(g.dupes[0] is refdir['foo bar 2'])
+        assert msg.startswith('dupeGuru Warning: \'foo bar 1\' already exists in')
+        names = io.listdir(self.p)
+        assert 'foo bar 1' in names
+        assert 'foo bar 2' in names
+        eq_(g.dupes[0].name, 'foo bar 2')
```
```diff
@@ -13,12 +13,11 @@ from hsutil.testcase import TestCase
 from hsutil import io
 from hsutil.path import Path
 from hsutil.decorators import log_calls
-import hsfs as fs
-import hsfs.phys
 import hsutil.files
 from hsutil.job import nulljob
 
-from .. import data, app
+from . import data
+from .. import app, fs
 from ..app import DupeGuru as DupeGuruBase
 
 class DupeGuru(DupeGuruBase):
@@ -59,27 +58,27 @@ class TCDupeGuru(TestCase):
         # The goal here is just to have a test for a previous blowup I had. I know my test coverage
         # for this unit is pathetic. What's done is done. My approach now is to add tests for
         # every change I want to make. The blowup was caused by a missing import.
-        dupe_parent = fs.Directory(None, 'foo')
-        dupe = fs.File(dupe_parent, 'bar')
-        dupe.copy = log_calls(lambda dest, newname: None)
+        p = self.tmppath()
+        io.open(p + 'foo', 'w').close()
         self.mock(hsutil.files, 'copy', log_calls(lambda source_path, dest_path: None))
         self.mock(os, 'makedirs', lambda path: None) # We don't want the test to create that fake directory
-        self.mock(fs.phys, 'Directory', fs.Directory) # We don't want an error because makedirs didn't work
         app = DupeGuru()
-        app.copy_or_move(dupe, True, 'some_destination', 0)
+        app.directories.add_path(p)
+        [f] = app.directories.get_files()
+        app.copy_or_move(f, True, 'some_destination', 0)
         self.assertEqual(1, len(hsutil.files.copy.calls))
         call = hsutil.files.copy.calls[0]
         self.assertEqual('some_destination', call['dest_path'])
-        self.assertEqual(dupe.path, call['source_path'])
+        self.assertEqual(f.path, call['source_path'])
 
     def test_copy_or_move_clean_empty_dirs(self):
         tmppath = Path(self.tmpdir())
         sourcepath = tmppath + 'source'
         io.mkdir(sourcepath)
         io.open(sourcepath + 'myfile', 'w')
-        tmpdir = hsfs.phys.Directory(None, unicode(tmppath))
-        myfile = tmpdir['source']['myfile']
         app = DupeGuru()
+        app.directories.add_path(tmppath)
+        [myfile] = app.directories.get_files()
         self.mock(app, 'clean_empty_dirs', log_calls(lambda path: None))
         app.copy_or_move(myfile, False, tmppath + 'dest', 0)
         calls = app.clean_empty_dirs.calls
@@ -87,9 +86,14 @@ class TCDupeGuru(TestCase):
         self.assertEqual(sourcepath, calls[0]['path'])
 
     def test_Scan_with_objects_evaluating_to_false(self):
+        class FakeFile(fs.File):
+            def __nonzero__(self):
+                return False
+
+
         # At some point, any() was used in a wrong way that made Scan() wrongly return 1
         app = DupeGuru()
-        f1, f2 = [fs.File(None, 'foo') for i in range(2)]
+        f1, f2 = [FakeFile('foo') for i in range(2)]
         f1.is_ref, f2.is_ref = (False, False)
         assert not (bool(f1) and bool(f2))
         app.directories.get_files = lambda: [f1, f2]
```
base/py/tests/data.py (new file, 45 lines)

```python
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license

# data module for tests

from hsutil.str import format_size
from dupeguru.data import format_path, cmp_value

COLUMNS = [
    {'attr':'name','display':'Filename'},
    {'attr':'path','display':'Directory'},
    {'attr':'size','display':'Size (KB)'},
    {'attr':'extension','display':'Kind'},
]

METADATA_TO_READ = ['size']

def GetDisplayInfo(dupe, group, delta):
    size = dupe.size
    m = group.get_match_of(dupe)
    if m and delta:
        r = group.ref
        size -= r.size
    return [
        dupe.name,
        format_path(dupe.path),
        format_size(size, 0, 1, False),
        dupe.extension,
    ]

def GetDupeSortKey(dupe, get_group, key, delta):
    r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
    if delta and (key == 2):
        r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
    return r

def GetGroupSortKey(group, key):
    return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))
```
```diff
@@ -10,20 +10,43 @@
 import os.path as op
 import os
 import time
-import shutil
 
 from nose.tools import eq_
 
-from hsutil import job, io
+from hsutil import io
 from hsutil.path import Path
 from hsutil.testcase import TestCase
-import hsfs.phys
-from hsfs.tests import phys_test
 
 from ..directories import *
 
 testpath = Path(TestCase.datadirpath())
 
+def create_fake_fs(rootpath):
+    rootpath = rootpath + 'fs'
+    io.mkdir(rootpath)
+    io.mkdir(rootpath + 'dir1')
+    io.mkdir(rootpath + 'dir2')
+    io.mkdir(rootpath + 'dir3')
+    fp = io.open(rootpath + 'file1.test', 'w')
+    fp.write('1')
+    fp.close()
+    fp = io.open(rootpath + 'file2.test', 'w')
+    fp.write('12')
+    fp.close()
+    fp = io.open(rootpath + 'file3.test', 'w')
+    fp.write('123')
+    fp.close()
+    fp = io.open(rootpath + ('dir1', 'file1.test'), 'w')
+    fp.write('1')
+    fp.close()
+    fp = io.open(rootpath + ('dir2', 'file2.test'), 'w')
+    fp.write('12')
+    fp.close()
+    fp = io.open(rootpath + ('dir3', 'file3.test'), 'w')
+    fp.write('123')
+    fp.close()
+    return rootpath
+
 class TCDirectories(TestCase):
     def test_empty(self):
         d = Directories()
@@ -33,13 +56,11 @@ class TCDirectories(TestCase):
     def test_add_path(self):
         d = Directories()
         p = testpath + 'utils'
-        added = d.add_path(p)
+        d.add_path(p)
         self.assertEqual(1,len(d))
         self.assert_(p in d)
         self.assert_((p + 'foobar') in d)
         self.assert_(p[:-1] not in d)
-        self.assertEqual(p,added.path)
-        self.assert_(d[0] is added)
         p = self.tmppath()
         d.add_path(p)
         self.assertEqual(2,len(d))
@@ -53,13 +74,13 @@ class TCDirectories(TestCase):
         self.assertRaises(AlreadyThereError, d.add_path, p + 'foobar')
         self.assertEqual(1, len(d))
 
-    def test_AddPath_containing_paths_already_there(self):
+    def test_add_path_containing_paths_already_there(self):
         d = Directories()
         d.add_path(testpath + 'utils')
         self.assertEqual(1, len(d))
-        added = d.add_path(testpath)
-        self.assertEqual(1, len(d))
-        self.assert_(added is d[0])
+        d.add_path(testpath)
+        eq_(len(d), 1)
+        eq_(d[0], testpath)
 
     def test_AddPath_non_latin(self):
         p = Path(self.tmpdir())
@@ -114,7 +135,7 @@ class TCDirectories(TestCase):
 
     def test_set_state_keep_state_dict_size_to_minimum(self):
         d = Directories()
-        p = Path(phys_test.create_fake_fs(self.tmpdir()))
+        p = create_fake_fs(self.tmppath())
         d.add_path(p)
         d.set_state(p,STATE_REFERENCE)
         d.set_state(p + 'dir1',STATE_REFERENCE)
@@ -129,7 +150,7 @@ class TCDirectories(TestCase):
 
     def test_get_files(self):
         d = Directories()
-        p = Path(phys_test.create_fake_fs(self.tmpdir()))
+        p = create_fake_fs(self.tmppath())
         d.add_path(p)
         d.set_state(p + 'dir1',STATE_REFERENCE)
         d.set_state(p + 'dir2',STATE_EXCLUDED)
@@ -177,52 +198,28 @@ class TCDirectories(TestCase):
         except LookupError:
             self.fail()
 
-    def test_default_dirclass(self):
-        self.assert_(Directories().dirclass is hsfs.phys.Directory)
-
-    def test_dirclass(self):
-        class MySpecialDirclass(hsfs.phys.Directory): pass
-        d = Directories()
-        d.dirclass = MySpecialDirclass
-        d.add_path(testpath)
-        self.assert_(isinstance(d[0], MySpecialDirclass))
-
     def test_load_from_file_with_invalid_path(self):
         #This test simulates a load from file resulting in a
         #InvalidPath raise. Other directories must be loaded.
         d1 = Directories()
         d1.add_path(testpath + 'utils')
         #Will raise InvalidPath upon loading
-        d1.add_path(self.tmppath()).name = 'does_not_exist'
+        p = self.tmppath()
+        d1.add_path(p)
+        io.rmdir(p)
         tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
         d1.save_to_file(tmpxml)
         d2 = Directories()
         d2.load_from_file(tmpxml)
         self.assertEqual(1, len(d2))
 
-    def test_load_from_file_with_same_paths(self):
-        #This test simulates a load from file resulting in a
-        #AlreadyExists raise. Other directories must be loaded.
-        d1 = Directories()
-        p1 = self.tmppath()
-        p2 = self.tmppath()
-        d1.add_path(p1)
-        d1.add_path(p2)
-        #Will raise AlreadyExists upon loading
-        d1.add_path(self.tmppath()).name = unicode(p1)
-        tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
-        d1.save_to_file(tmpxml)
-        d2 = Directories()
-        d2.load_from_file(tmpxml)
-        self.assertEqual(2, len(d2))
-
     def test_unicode_save(self):
         d = Directories()
         p1 = self.tmppath() + u'hello\xe9'
         io.mkdir(p1)
         io.mkdir(p1 + u'foo\xe9')
         d.add_path(p1)
-        d.set_state(d[0][0].path, STATE_EXCLUDED)
+        d.set_state(p1 + u'foo\xe9', STATE_EXCLUDED)
         tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
         try:
             d.save_to_file(tmpxml)
@@ -231,7 +228,7 @@ class TCDirectories(TestCase):
 
     def test_get_files_refreshes_its_directories(self):
         d = Directories()
-        p = Path(phys_test.create_fake_fs(self.tmpdir()))
+        p = create_fake_fs(self.tmppath())
         d.add_path(p)
         files = d.get_files()
         self.assertEqual(6, len(list(files)))
@@ -258,16 +255,6 @@ class TCDirectories(TestCase):
         d.set_state(hidden_dir_path, STATE_NORMAL)
         self.assertEqual(d.get_state(hidden_dir_path), STATE_NORMAL)
 
-    def test_special_dirclasses(self):
-        # if a path is in special_dirclasses, use this class instead
-        class MySpecialDirclass(hsfs.phys.Directory): pass
-        d = Directories()
-        p1 = self.tmppath()
-        p2 = self.tmppath()
-        d.special_dirclasses[p1] = MySpecialDirclass
-        self.assert_(isinstance(d.add_path(p2), hsfs.phys.Directory))
-        self.assert_(isinstance(d.add_path(p1), MySpecialDirclass))
-
     def test_default_path_state_override(self):
         # It's possible for a subclass to override the default state of a path
         class MyDirectories(Directories):
```
@@ -340,21 +340,13 @@ class TCget_match(TestCase):
|
|||||||
self.assertEqual(int((6.0 / 13.0) * 100),get_match(NamedObject("foo bar",True),NamedObject("bar bleh",True),(WEIGHT_WORDS,)).percentage)
|
self.assertEqual(int((6.0 / 13.0) * 100),get_match(NamedObject("foo bar",True),NamedObject("bar bleh",True),(WEIGHT_WORDS,)).percentage)
|
||||||
|
|
||||||
|
|
||||||
class TCMatchFactory(TestCase):
|
class GetMatches(TestCase):
|
||||||
def test_empty(self):
|
def test_empty(self):
|
||||||
self.assertEqual([],MatchFactory().getmatches([]))
|
eq_(getmatches([]), [])
|
||||||
|
|
||||||
def test_defaults(self):
|
|
||||||
mf = MatchFactory()
|
|
||||||
self.assertEqual(50,mf.common_word_threshold)
|
|
||||||
self.assertEqual(False,mf.weight_words)
|
|
||||||
self.assertEqual(False,mf.match_similar_words)
|
|
||||||
self.assertEqual(False,mf.no_field_order)
|
|
||||||
self.assertEqual(0,mf.min_match_percentage)
|
|
||||||
|
|
||||||
def test_simple(self):
|
def test_simple(self):
|
||||||
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
|
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
|
||||||
r = MatchFactory().getmatches(l)
|
r = getmatches(l)
|
||||||
self.assertEqual(2,len(r))
|
self.assertEqual(2,len(r))
|
||||||
seek = [m for m in r if m.percentage == 50] #"foo bar" and "bar bleh"
|
seek = [m for m in r if m.percentage == 50] #"foo bar" and "bar bleh"
|
||||||
m = seek[0]
|
m = seek[0]
|
||||||
@@ -367,7 +359,7 @@ class TCMatchFactory(TestCase):
|
|||||||
|
|
||||||
def test_null_and_unrelated_objects(self):
|
def test_null_and_unrelated_objects(self):
|
||||||
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject(""),NamedObject("unrelated object")]
|
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject(""),NamedObject("unrelated object")]
|
||||||
r = MatchFactory().getmatches(l)
|
r = getmatches(l)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
m = r[0]
|
m = r[0]
|
||||||
self.assertEqual(50,m.percentage)
|
self.assertEqual(50,m.percentage)
|
||||||
@@ -376,34 +368,33 @@ class TCMatchFactory(TestCase):
|
|||||||
|
|
||||||
def test_twice_the_same_word(self):
|
def test_twice_the_same_word(self):
|
||||||
l = [NamedObject("foo foo bar"),NamedObject("bar bleh")]
|
l = [NamedObject("foo foo bar"),NamedObject("bar bleh")]
|
||||||
r = MatchFactory().getmatches(l)
|
r = getmatches(l)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
|
|
||||||
def test_twice_the_same_word_when_preworded(self):
|
def test_twice_the_same_word_when_preworded(self):
|
||||||
l = [NamedObject("foo foo bar",True),NamedObject("bar bleh",True)]
|
l = [NamedObject("foo foo bar",True),NamedObject("bar bleh",True)]
|
||||||
r = MatchFactory().getmatches(l)
|
r = getmatches(l)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
|
|
||||||
def test_two_words_match(self):
|
def test_two_words_match(self):
|
||||||
l = [NamedObject("foo bar"),NamedObject("foo bar bleh")]
|
l = [NamedObject("foo bar"),NamedObject("foo bar bleh")]
|
||||||
r = MatchFactory().getmatches(l)
|
r = getmatches(l)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
|
|
||||||
def test_match_files_with_only_common_words(self):
|
def test_match_files_with_only_common_words(self):
|
||||||
#If a word occurs more than 50 times, it is excluded from the matching process
|
#If a word occurs more than 50 times, it is excluded from the matching process
|
||||||
#The problem with the common_word_threshold is that the files containing only common
|
#The problem with the common_word_threshold is that the files containing only common
|
||||||
#words will never be matched together. We *should* match them.
|
#words will never be matched together. We *should* match them.
|
||||||
mf = MatchFactory()
|
# This test assumes that the common word threshold constant is 50
|
||||||
mf.common_word_threshold = 50
|
|
||||||
l = [NamedObject("foo") for i in range(50)]
|
l = [NamedObject("foo") for i in range(50)]
|
||||||
r = mf.getmatches(l)
|
r = getmatches(l)
|
||||||
self.assertEqual(1225,len(r))
|
self.assertEqual(1225,len(r))
|
||||||
|
|
||||||
def test_use_words_already_there_if_there(self):
|
def test_use_words_already_there_if_there(self):
|
||||||
o1 = NamedObject('foo')
|
o1 = NamedObject('foo')
|
||||||
o2 = NamedObject('bar')
|
o2 = NamedObject('bar')
|
||||||
o2.words = ['foo']
|
o2.words = ['foo']
|
||||||
self.assertEqual(1,len(MatchFactory().getmatches([o1,o2])))
|
eq_(1, len(getmatches([o1,o2])))
|
||||||
|
|
||||||
def test_job(self):
|
def test_job(self):
|
||||||
def do_progress(p,d=''):
|
def do_progress(p,d=''):
|
||||||
@@ -413,75 +404,62 @@ class TCMatchFactory(TestCase):
|
|||||||
j = job.Job(1,do_progress)
|
j = job.Job(1,do_progress)
|
||||||
self.log = []
|
self.log = []
|
||||||
s = "foo bar"
|
s = "foo bar"
|
||||||
MatchFactory().getmatches([NamedObject(s),NamedObject(s),NamedObject(s)],j)
|
getmatches([NamedObject(s), NamedObject(s), NamedObject(s)], j=j)
|
||||||
self.assert_(len(self.log) > 2)
|
self.assert_(len(self.log) > 2)
|
||||||
self.assertEqual(0,self.log[0])
|
self.assertEqual(0,self.log[0])
|
||||||
self.assertEqual(100,self.log[-1])
|
self.assertEqual(100,self.log[-1])
|
||||||
|
|
||||||
def test_weight_words(self):
|
def test_weight_words(self):
|
||||||
mf = MatchFactory()
|
|
||||||
mf.weight_words = True
|
|
||||||
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
|
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
|
||||||
m = mf.getmatches(l)[0]
|
m = getmatches(l, weight_words=True)[0]
|
||||||
self.assertEqual(int((6.0 / 13.0) * 100),m.percentage)
|
self.assertEqual(int((6.0 / 13.0) * 100),m.percentage)
|
||||||
|
|
||||||
def test_similar_word(self):
|
def test_similar_word(self):
|
||||||
mf = MatchFactory()
|
|
||||||
mf.match_similar_words = True
|
|
||||||
l = [NamedObject("foobar"),NamedObject("foobars")]
|
l = [NamedObject("foobar"),NamedObject("foobars")]
|
||||||
self.assertEqual(1,len(mf.getmatches(l)))
|
eq_(len(getmatches(l, match_similar_words=True)), 1)
|
||||||
self.assertEqual(100,mf.getmatches(l)[0].percentage)
|
eq_(getmatches(l, match_similar_words=True)[0].percentage, 100)
|
||||||
l = [NamedObject("foobar"),NamedObject("foo")]
|
l = [NamedObject("foobar"),NamedObject("foo")]
|
||||||
self.assertEqual(0,len(mf.getmatches(l))) #too far
|
eq_(len(getmatches(l, match_similar_words=True)), 0) #too far
|
||||||
l = [NamedObject("bizkit"),NamedObject("bizket")]
|
l = [NamedObject("bizkit"),NamedObject("bizket")]
|
||||||
self.assertEqual(1,len(mf.getmatches(l)))
|
eq_(len(getmatches(l, match_similar_words=True)), 1)
|
||||||
l = [NamedObject("foobar"),NamedObject("foosbar")]
|
l = [NamedObject("foobar"),NamedObject("foosbar")]
|
||||||
self.assertEqual(1,len(mf.getmatches(l)))
|
eq_(len(getmatches(l, match_similar_words=True)), 1)
|
||||||
|
|
||||||
def test_single_object_with_similar_words(self):
|
def test_single_object_with_similar_words(self):
|
||||||
mf = MatchFactory()
|
|
||||||
mf.match_similar_words = True
|
|
||||||
l = [NamedObject("foo foos")]
|
l = [NamedObject("foo foos")]
|
||||||
self.assertEqual(0,len(mf.getmatches(l)))
|
eq_(len(getmatches(l, match_similar_words=True)), 0)
|
||||||
|
|
||||||
def test_double_words_get_counted_only_once(self):
|
def test_double_words_get_counted_only_once(self):
|
||||||
mf = MatchFactory()
|
|
||||||
l = [NamedObject("foo bar foo bleh"),NamedObject("foo bar bleh bar")]
|
l = [NamedObject("foo bar foo bleh"),NamedObject("foo bar bleh bar")]
|
||||||
m = mf.getmatches(l)[0]
|
m = getmatches(l)[0]
|
||||||
self.assertEqual(75,m.percentage)
|
self.assertEqual(75,m.percentage)
|
||||||
|
|
||||||
def test_with_fields(self):
|
def test_with_fields(self):
|
||||||
mf = MatchFactory()
|
|
||||||
o1 = NamedObject("foo bar - foo bleh")
|
o1 = NamedObject("foo bar - foo bleh")
|
||||||
o2 = NamedObject("foo bar - bleh bar")
|
o2 = NamedObject("foo bar - bleh bar")
|
||||||
o1.words = getfields(o1.name)
|
o1.words = getfields(o1.name)
|
||||||
o2.words = getfields(o2.name)
|
o2.words = getfields(o2.name)
|
||||||
m = mf.getmatches([o1, o2])[0]
|
m = getmatches([o1, o2])[0]
|
||||||
self.assertEqual(50, m.percentage)
|
self.assertEqual(50, m.percentage)
|
||||||
|
|
||||||
def test_with_fields_no_order(self):
|
def test_with_fields_no_order(self):
|
||||||
mf = MatchFactory()
|
|
||||||
mf.no_field_order = True
|
|
||||||
o1 = NamedObject("foo bar - foo bleh")
|
o1 = NamedObject("foo bar - foo bleh")
|
||||||
o2 = NamedObject("bleh bang - foo bar")
|
o2 = NamedObject("bleh bang - foo bar")
|
||||||
o1.words = getfields(o1.name)
|
o1.words = getfields(o1.name)
|
||||||
o2.words = getfields(o2.name)
|
o2.words = getfields(o2.name)
|
||||||
m = mf.getmatches([o1, o2])[0]
|
m = getmatches([o1, o2], no_field_order=True)[0]
|
||||||
self.assertEqual(50 ,m.percentage)
|
eq_(m.percentage, 50)
|
||||||
|
|
||||||
def test_only_match_similar_when_the_option_is_set(self):
|
def test_only_match_similar_when_the_option_is_set(self):
|
||||||
mf = MatchFactory()
|
|
||||||
mf.match_similar_words = False
|
|
||||||
l = [NamedObject("foobar"),NamedObject("foobars")]
|
l = [NamedObject("foobar"),NamedObject("foobars")]
|
||||||
self.assertEqual(0,len(mf.getmatches(l)))
|
eq_(len(getmatches(l, match_similar_words=False)), 0)
|
||||||
|
|
||||||
def test_dont_recurse_do_match(self):
|
def test_dont_recurse_do_match(self):
|
||||||
# with nosetests, the stack is increased. The number has to be high enough not to fail falsely
|
# with nosetests, the stack is increased. The number has to be high enough not to fail falsely
|
||||||
sys.setrecursionlimit(100)
|
sys.setrecursionlimit(100)
|
||||||
mf = MatchFactory()
|
|
||||||
files = [NamedObject('foo bar') for i in range(101)]
|
files = [NamedObject('foo bar') for i in range(101)]
|
||||||
try:
|
try:
|
||||||
mf.getmatches(files)
|
getmatches(files)
|
||||||
except RuntimeError:
|
except RuntimeError:
|
||||||
self.fail()
|
self.fail()
|
||||||
finally:
|
finally:
|
||||||
@@ -489,18 +467,9 @@ class TCMatchFactory(TestCase):
|
|||||||
|
|
||||||
def test_min_match_percentage(self):
|
def test_min_match_percentage(self):
|
||||||
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
|
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
|
||||||
mf = MatchFactory()
|
r = getmatches(l, min_match_percentage=50)
|
||||||
mf.min_match_percentage = 50
|
|
||||||
r = mf.getmatches(l)
|
|
||||||
self.assertEqual(1,len(r)) #Only "foo bar" / "bar bleh" should match
|
self.assertEqual(1,len(r)) #Only "foo bar" / "bar bleh" should match
|
||||||
|
|
||||||
def test_limit(self):
|
|
||||||
l = [NamedObject(),NamedObject(),NamedObject()]
|
|
||||||
mf = MatchFactory()
|
|
||||||
mf.limit = 2
|
|
||||||
r = mf.getmatches(l)
|
|
||||||
self.assertEqual(2,len(r))
|
|
||||||
|
|
||||||
def test_MemoryError(self):
|
def test_MemoryError(self):
|
||||||
@log_calls
|
@log_calls
|
||||||
def mocked_match(first, second, flags):
|
def mocked_match(first, second, flags):
|
||||||
@@ -510,9 +479,8 @@ class TCMatchFactory(TestCase):
|
|||||||
|
|
||||||
objects = [NamedObject() for i in range(10)] # results in 45 matches
|
objects = [NamedObject() for i in range(10)] # results in 45 matches
|
||||||
self.mock(engine, 'get_match', mocked_match)
|
self.mock(engine, 'get_match', mocked_match)
|
||||||
mf = MatchFactory()
|
|
||||||
try:
|
try:
|
||||||
r = mf.getmatches(objects)
|
r = getmatches(objects)
|
||||||
except MemoryError:
|
except MemoryError:
|
||||||
self.fail('MemoryError must be handled')
|
self.fail('MemoryError must be handled')
|
||||||
self.assertEqual(42, len(r))
|
self.assertEqual(42, len(r))
|
||||||
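The hunks above all make the same move: the MatchFactory class disappears and its attributes become keyword arguments of a module-level getmatches() function. A minimal migration sketch, assuming only the keyword options exercised by the tests above (the Song stand-in is hypothetical; any object with a name attribute works):

    from dupeguru import engine

    class Song(object):            # hypothetical stand-in for scanned objects
        def __init__(self, name):
            self.name = name

    objects = [Song("foo bar"), Song("bar bleh")]

    # Old API (removed by this changeset):
    #   mf = MatchFactory(); mf.min_match_percentage = 50; matches = mf.getmatches(objects)
    # New API -- the factory's attributes become keyword arguments:
    matches = engine.getmatches(
        objects,
        min_match_percentage=50,    # drop matches scoring below 50%
        weight_words=True,          # weight the match score by word length
        match_similar_words=False,  # no fuzzy word matching
        no_field_order=False,       # field order stays significant for field-based names
    )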
@@ -738,7 +706,7 @@ class TCget_groups(TestCase):
|
|||||||
|
|
||||||
def test_simple(self):
|
def test_simple(self):
|
||||||
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
|
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
|
||||||
matches = MatchFactory().getmatches(l)
|
matches = getmatches(l)
|
||||||
m = matches[0]
|
m = matches[0]
|
||||||
r = get_groups(matches)
|
r = get_groups(matches)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
@@ -749,7 +717,7 @@ class TCget_groups(TestCase):
|
|||||||
def test_group_with_multiple_matches(self):
|
def test_group_with_multiple_matches(self):
|
||||||
#This results in 3 matches
|
#This results in 3 matches
|
||||||
l = [NamedObject("foo"),NamedObject("foo"),NamedObject("foo")]
|
l = [NamedObject("foo"),NamedObject("foo"),NamedObject("foo")]
|
||||||
matches = MatchFactory().getmatches(l)
|
matches = getmatches(l)
|
||||||
r = get_groups(matches)
|
r = get_groups(matches)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
g = r[0]
|
g = r[0]
|
||||||
@@ -759,7 +727,7 @@ class TCget_groups(TestCase):
|
|||||||
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("b c"),NamedObject("c d"),NamedObject("c d")]
|
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("b c"),NamedObject("c d"),NamedObject("c d")]
|
||||||
#There will be 2 groups here: group "a b" and group "c d"
|
#There will be 2 groups here: group "a b" and group "c d"
|
||||||
#"b c" can go either of them, but not both.
|
#"b c" can go either of them, but not both.
|
||||||
matches = MatchFactory().getmatches(l)
|
matches = getmatches(l)
|
||||||
r = get_groups(matches)
|
r = get_groups(matches)
|
||||||
self.assertEqual(2,len(r))
|
self.assertEqual(2,len(r))
|
||||||
self.assertEqual(5,len(r[0])+len(r[1]))
|
self.assertEqual(5,len(r[0])+len(r[1]))
|
||||||
@@ -768,7 +736,7 @@ class TCget_groups(TestCase):
|
|||||||
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("a b"),NamedObject("a b")]
|
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("a b"),NamedObject("a b")]
|
||||||
#All four objects here are "a b", so every pair matches
|
#All four objects here are "a b", so every pair matches
|
||||||
#They must all be merged into a single group
|
#They must all be merged into a single group
|
||||||
matches = MatchFactory().getmatches(l)
|
matches = getmatches(l)
|
||||||
r = get_groups(matches)
|
r = get_groups(matches)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
|
|
||||||
@@ -788,7 +756,7 @@ class TCget_groups(TestCase):
|
|||||||
|
|
||||||
def test_four_sized_group(self):
|
def test_four_sized_group(self):
|
||||||
l = [NamedObject("foobar") for i in xrange(4)]
|
l = [NamedObject("foobar") for i in xrange(4)]
|
||||||
m = MatchFactory().getmatches(l)
|
m = getmatches(l)
|
||||||
r = get_groups(m)
|
r = get_groups(m)
|
||||||
self.assertEqual(1,len(r))
|
self.assertEqual(1,len(r))
|
||||||
self.assertEqual(4,len(r[0]))
|
self.assertEqual(4,len(r[0]))
|
||||||
|
|||||||
@@ -16,8 +16,8 @@ from hsutil.path import Path
|
|||||||
from hsutil.testcase import TestCase
|
from hsutil.testcase import TestCase
|
||||||
from hsutil.misc import first
|
from hsutil.misc import first
|
||||||
|
|
||||||
from . import engine_test
|
from . import engine_test, data
|
||||||
from .. import data, engine
|
from .. import engine
|
||||||
from ..results import *
|
from ..results import *
|
||||||
|
|
||||||
class NamedObject(engine_test.NamedObject):
|
class NamedObject(engine_test.NamedObject):
|
||||||
@@ -37,7 +37,7 @@ class NamedObject(engine_test.NamedObject):
|
|||||||
def GetTestGroups():
|
def GetTestGroups():
|
||||||
objects = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("foo bleh"),NamedObject("ibabtu"),NamedObject("ibabtu")]
|
objects = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("foo bleh"),NamedObject("ibabtu"),NamedObject("ibabtu")]
|
||||||
objects[1].size = 1024
|
objects[1].size = 1024
|
||||||
matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
|
matches = engine.getmatches(objects) #we should have 5 matches
|
||||||
groups = engine.get_groups(matches) #We should have 2 groups
|
groups = engine.get_groups(matches) #We should have 2 groups
|
||||||
for g in groups:
|
for g in groups:
|
||||||
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
|
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
|
||||||
@@ -505,7 +505,7 @@ class TCResultsXML(TestCase):
|
|||||||
return objects[1]
|
return objects[1]
|
||||||
|
|
||||||
objects = [NamedObject(u"\xe9foo bar",True),NamedObject("bar bleh",True)]
|
objects = [NamedObject(u"\xe9foo bar",True),NamedObject("bar bleh",True)]
|
||||||
matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
|
matches = engine.getmatches(objects) #we should have 5 matches
|
||||||
groups = engine.get_groups(matches) #We should have 2 groups
|
groups = engine.get_groups(matches) #We should have 2 groups
|
||||||
for g in groups:
|
for g in groups:
|
||||||
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
|
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
|
||||||
|
|||||||
@@ -132,8 +132,6 @@ def test_content_scan_doesnt_put_md5_in_words_at_the_end():
|
|||||||
f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
|
f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
|
||||||
r = s.GetDupeGroups(f)
|
r = s.GetDupeGroups(f)
|
||||||
g = r[0]
|
g = r[0]
|
||||||
eq_(g.ref.words, ['--'])
|
|
||||||
eq_(g.dupes[0].words, ['--'])
|
|
||||||
|
|
||||||
def test_extension_is_not_counted_in_filename_scan():
|
def test_extension_is_not_counted_in_filename_scan():
|
||||||
s = Scanner()
|
s = Scanner()
|
||||||
@@ -369,23 +367,6 @@ def test_ignore_list_checks_for_unicode():
|
|||||||
assert f2 in g
|
assert f2 in g
|
||||||
assert f3 in g
|
assert f3 in g
|
||||||
|
|
||||||
def test_custom_match_factory():
|
|
||||||
class MatchFactory(object):
|
|
||||||
def getmatches(self, objects, j=None):
|
|
||||||
return [Match(objects[0], objects[1], 420)]
|
|
||||||
|
|
||||||
|
|
||||||
s = Scanner()
|
|
||||||
s.match_factory = MatchFactory()
|
|
||||||
o1, o2 = no('foo'), no('bar')
|
|
||||||
groups = s.GetDupeGroups([o1, o2])
|
|
||||||
eq_(len(groups), 1)
|
|
||||||
g = groups[0]
|
|
||||||
eq_(len(g), 2)
|
|
||||||
g.switch_ref(o1)
|
|
||||||
m = g.get_match_of(o2)
|
|
||||||
eq_(m, (o1, o2, 420))
|
|
||||||
|
|
||||||
def test_file_evaluates_to_false():
|
def test_file_evaluates_to_false():
|
||||||
# A very wrong way to use any() was added at some point, causing resulting group list
|
# A very wrong way to use any() was added at some point, causing resulting group list
|
||||||
# to be empty.
|
# to be empty.
|
||||||
@@ -455,15 +436,3 @@ def test_partial_group_match():
|
|||||||
assert o2 in group
|
assert o2 in group
|
||||||
assert o3 not in group
|
assert o3 not in group
|
||||||
eq_(s.discarded_file_count, 1)
|
eq_(s.discarded_file_count, 1)
|
||||||
|
|
||||||
|
|
||||||
#--- Scanner ME
|
|
||||||
def test_priorize_me():
|
|
||||||
# in ScannerME, bitrate goes first (right after is_ref) in prioritization
|
|
||||||
s = ScannerME()
|
|
||||||
o1, o2 = no('foo'), no('foo')
|
|
||||||
o1.bitrate = 1
|
|
||||||
o2.bitrate = 2
|
|
||||||
[group] = s.GetDupeGroups([o1, o2])
|
|
||||||
assert group.ref is o2
|
|
||||||
|
|
||||||
|
|||||||
@@ -16,10 +16,10 @@ import os.path as op
|
|||||||
from PyQt4.QtCore import Qt, QTimer, QObject, QCoreApplication, QUrl, SIGNAL
|
from PyQt4.QtCore import Qt, QTimer, QObject, QCoreApplication, QUrl, SIGNAL
|
||||||
from PyQt4.QtGui import QProgressDialog, QDesktopServices, QFileDialog, QDialog, QMessageBox
|
from PyQt4.QtGui import QProgressDialog, QDesktopServices, QFileDialog, QDialog, QMessageBox
|
||||||
|
|
||||||
import hsfs as fs
|
|
||||||
from hsutil import job
|
from hsutil import job
|
||||||
from hsutil.reg import RegistrationRequired
|
from hsutil.reg import RegistrationRequired
|
||||||
|
|
||||||
|
from dupeguru import fs
|
||||||
from dupeguru.app import (DupeGuru as DupeGuruBase, JOB_SCAN, JOB_LOAD, JOB_MOVE, JOB_COPY,
|
from dupeguru.app import (DupeGuru as DupeGuruBase, JOB_SCAN, JOB_LOAD, JOB_MOVE, JOB_COPY,
|
||||||
JOB_DELETE)
|
JOB_DELETE)
|
||||||
|
|
||||||
@@ -145,6 +145,7 @@ class DupeGuru(DupeGuruBase, QObject):
|
|||||||
|
|
||||||
def ask_for_reg_code(self):
|
def ask_for_reg_code(self):
|
||||||
if self.reg.ask_for_code():
|
if self.reg.ask_for_code():
|
||||||
|
#XXX bug???
|
||||||
self._setup_ui_as_registered()
|
self._setup_ui_as_registered()
|
||||||
|
|
||||||
@demo_method
|
@demo_method
|
||||||
|
|||||||
@@ -47,7 +47,14 @@ class DirectoryNode(TreeNode):
|
|||||||
return DirectoryNode(self.model, self, ref, row)
|
return DirectoryNode(self.model, self, ref, row)
|
||||||
|
|
||||||
def _getChildren(self):
|
def _getChildren(self):
|
||||||
return self.ref.dirs
|
return self.model._dirs.get_subfolders(self.ref)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def name(self):
|
||||||
|
if self.parent is not None:
|
||||||
|
return self.ref[-1]
|
||||||
|
else:
|
||||||
|
return unicode(self.ref)
|
||||||
|
|
||||||
|
|
||||||
class DirectoriesModel(TreeModel):
|
class DirectoriesModel(TreeModel):
|
||||||
@@ -70,13 +77,13 @@ class DirectoriesModel(TreeModel):
|
|||||||
node = index.internalPointer()
|
node = index.internalPointer()
|
||||||
if role == Qt.DisplayRole:
|
if role == Qt.DisplayRole:
|
||||||
if index.column() == 0:
|
if index.column() == 0:
|
||||||
return node.ref.name
|
return node.name
|
||||||
else:
|
else:
|
||||||
return STATES[self._dirs.get_state(node.ref.path)]
|
return STATES[self._dirs.get_state(node.ref)]
|
||||||
elif role == Qt.EditRole and index.column() == 1:
|
elif role == Qt.EditRole and index.column() == 1:
|
||||||
return self._dirs.get_state(node.ref.path)
|
return self._dirs.get_state(node.ref)
|
||||||
elif role == Qt.ForegroundRole:
|
elif role == Qt.ForegroundRole:
|
||||||
state = self._dirs.get_state(node.ref.path)
|
state = self._dirs.get_state(node.ref)
|
||||||
if state == 1:
|
if state == 1:
|
||||||
return QBrush(Qt.blue)
|
return QBrush(Qt.blue)
|
||||||
elif state == 2:
|
elif state == 2:
|
||||||
@@ -101,6 +108,6 @@ class DirectoriesModel(TreeModel):
|
|||||||
if not index.isValid() or role != Qt.EditRole or index.column() != 1:
|
if not index.isValid() or role != Qt.EditRole or index.column() != 1:
|
||||||
return False
|
return False
|
||||||
node = index.internalPointer()
|
node = index.internalPointer()
|
||||||
self._dirs.set_state(node.ref.path, value)
|
self._dirs.set_state(node.ref, value)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
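Note the shape of this change: DirectoryNode.ref now holds a plain hsutil Path instead of an hsfs node, so the model asks the Directories instance for states and subfolders rather than asking the node. A small sketch of the labeling rule the new name property encodes (the path value is hypothetical; Path indexing behavior assumed from this hunk):

    from hsutil.path import Path

    ref = Path('/Users/me/Music/Albums')
    ref[-1]        # 'Albums' -- the label shown for child nodes
    unicode(ref)   # u'/Users/me/Music/Albums' -- the label shown for root nodes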
|
|||||||
@@ -8,12 +8,13 @@
|
|||||||
import objc
|
import objc
|
||||||
from AppKit import *
|
from AppKit import *
|
||||||
|
|
||||||
from dupeguru import app_me_cocoa, scanner
|
from dupeguru_me.app_cocoa import DupeGuruME
|
||||||
|
from dupeguru.scanner import (SCAN_TYPE_FILENAME, SCAN_TYPE_FIELDS, SCAN_TYPE_FIELDS_NO_ORDER,
|
||||||
|
SCAN_TYPE_TAG, SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO)
|
||||||
|
|
||||||
# Fix py2app imports; py2app chokes on relative imports
|
# Fix py2app imports; py2app chokes on relative imports
|
||||||
from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, scanner
|
from dupeguru_me import app_cocoa, data, fs, scanner
|
||||||
from hsfs import auto, stats, tree, music
|
from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, scanner, fs
|
||||||
from hsfs.phys import music
|
|
||||||
from hsmedia import aiff, flac, genres, id3v1, id3v2, mp4, mpeg, ogg, wma
|
from hsmedia import aiff, flac, genres, id3v1, id3v2, mp4, mpeg, ogg, wma
|
||||||
from hsutil import conflict
|
from hsutil import conflict
|
||||||
|
|
||||||
@@ -23,7 +24,7 @@ class PyApp(NSObject):
|
|||||||
class PyDupeGuru(PyApp):
|
class PyDupeGuru(PyApp):
|
||||||
def init(self):
|
def init(self):
|
||||||
self = super(PyDupeGuru,self).init()
|
self = super(PyDupeGuru,self).init()
|
||||||
self.app = app_me_cocoa.DupeGuruME()
|
self.app = DupeGuruME()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
#---Directories
|
#---Directories
|
||||||
@@ -180,12 +181,12 @@ class PyDupeGuru(PyApp):
|
|||||||
def setScanType_(self, scan_type):
|
def setScanType_(self, scan_type):
|
||||||
try:
|
try:
|
||||||
self.app.scanner.scan_type = [
|
self.app.scanner.scan_type = [
|
||||||
scanner.SCAN_TYPE_FILENAME,
|
SCAN_TYPE_FILENAME,
|
||||||
scanner.SCAN_TYPE_FIELDS,
|
SCAN_TYPE_FIELDS,
|
||||||
scanner.SCAN_TYPE_FIELDS_NO_ORDER,
|
SCAN_TYPE_FIELDS_NO_ORDER,
|
||||||
scanner.SCAN_TYPE_TAG,
|
SCAN_TYPE_TAG,
|
||||||
scanner.SCAN_TYPE_CONTENT,
|
SCAN_TYPE_CONTENT,
|
||||||
scanner.SCAN_TYPE_CONTENT_AUDIO
|
SCAN_TYPE_CONTENT_AUDIO
|
||||||
][scan_type]
|
][scan_type]
|
||||||
except IndexError:
|
except IndexError:
|
||||||
pass
|
pass
|
||||||
|
|||||||
0
me/py/__init__.py
Normal file
@@ -7,29 +7,29 @@
|
|||||||
# which should be included with this package. The terms are also available at
|
# which should be included with this package. The terms are also available at
|
||||||
# http://www.hardcoded.net/licenses/hs_license
|
# http://www.hardcoded.net/licenses/hs_license
|
||||||
|
|
||||||
import os.path as op
|
|
||||||
import logging
|
import logging
|
||||||
from appscript import app, k, CommandError
|
from appscript import app, k, CommandError
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from hsutil.cocoa import as_fetch
|
from hsutil.cocoa import as_fetch
|
||||||
import hsfs.phys.music
|
|
||||||
|
|
||||||
import app_cocoa, data_me, scanner
|
from dupeguru.app_cocoa import JOBID2TITLE, DupeGuru as DupeGuruBase
|
||||||
|
|
||||||
|
from . import data, scanner, fs
|
||||||
|
|
||||||
JOB_REMOVE_DEAD_TRACKS = 'jobRemoveDeadTracks'
|
JOB_REMOVE_DEAD_TRACKS = 'jobRemoveDeadTracks'
|
||||||
JOB_SCAN_DEAD_TRACKS = 'jobScanDeadTracks'
|
JOB_SCAN_DEAD_TRACKS = 'jobScanDeadTracks'
|
||||||
|
|
||||||
app_cocoa.JOBID2TITLE.update({
|
JOBID2TITLE.update({
|
||||||
JOB_REMOVE_DEAD_TRACKS: "Removing dead tracks from your iTunes Library",
|
JOB_REMOVE_DEAD_TRACKS: "Removing dead tracks from your iTunes Library",
|
||||||
JOB_SCAN_DEAD_TRACKS: "Scanning the iTunes Library",
|
JOB_SCAN_DEAD_TRACKS: "Scanning the iTunes Library",
|
||||||
})
|
})
|
||||||
|
|
||||||
class DupeGuruME(app_cocoa.DupeGuru):
|
class DupeGuruME(DupeGuruBase):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
app_cocoa.DupeGuru.__init__(self, data_me, 'dupeGuru Music Edition', appid=1)
|
DupeGuruBase.__init__(self, data, 'dupeGuru Music Edition', appid=1)
|
||||||
self.scanner = scanner.ScannerME()
|
self.scanner = scanner.ScannerME()
|
||||||
self.directories.dirclass = hsfs.phys.music.Directory
|
self.directories.fileclasses = [fs.Mp3File, fs.Mp4File, fs.WmaFile, fs.OggFile, fs.FlacFile, fs.AiffFile]
|
||||||
self.dead_tracks = []
|
self.dead_tracks = []
|
||||||
|
|
||||||
def remove_dead_tracks(self):
|
def remove_dead_tracks(self):
|
||||||
@@ -8,7 +8,7 @@
|
|||||||
# http://www.hardcoded.net/licenses/hs_license
|
# http://www.hardcoded.net/licenses/hs_license
|
||||||
|
|
||||||
from hsutil.str import format_time, FT_MINUTES, format_size
|
from hsutil.str import format_time, FT_MINUTES, format_size
|
||||||
from .data import (format_path, format_timestamp, format_words, format_perc,
|
from dupeguru.data import (format_path, format_timestamp, format_words, format_perc,
|
||||||
format_dupe_count, cmp_value)
|
format_dupe_count, cmp_value)
|
||||||
|
|
||||||
COLUMNS = [
|
COLUMNS = [
|
||||||
@@ -76,7 +76,7 @@ def GetDisplayInfo(dupe, group, delta):
|
|||||||
str(dupe.track),
|
str(dupe.track),
|
||||||
dupe.comment,
|
dupe.comment,
|
||||||
format_perc(percentage),
|
format_perc(percentage),
|
||||||
format_words(dupe.words),
|
format_words(dupe.words) if hasattr(dupe, 'words') else '',
|
||||||
format_dupe_count(dupe_count)
|
format_dupe_count(dupe_count)
|
||||||
]
|
]
|
||||||
|
|
||||||
183
me/py/fs.py
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Created By: Virgil Dupras
|
||||||
|
# Created On: 2009-10-23
|
||||||
|
# $Id$
|
||||||
|
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
|
||||||
|
#
|
||||||
|
# This software is licensed under the "HS" License as described in the "LICENSE" file,
|
||||||
|
# which should be included with this package. The terms are also available at
|
||||||
|
# http://www.hardcoded.net/licenses/hs_license
|
||||||
|
|
||||||
|
from hsmedia import mpeg, wma, mp4, ogg, flac, aiff
|
||||||
|
from hsutil.str import get_file_ext
|
||||||
|
from dupeguru import fs
|
||||||
|
|
||||||
|
TAG_FIELDS = ['audiosize', 'duration', 'bitrate', 'samplerate', 'title', 'artist',
|
||||||
|
'album', 'genre', 'year', 'track', 'comment']
|
||||||
|
|
||||||
|
class MusicFile(fs.File):
|
||||||
|
INITIAL_INFO = fs.File.INITIAL_INFO.copy()
|
||||||
|
INITIAL_INFO.update({
|
||||||
|
'audiosize': 0,
|
||||||
|
'bitrate' : 0,
|
||||||
|
'duration' : 0,
|
||||||
|
'samplerate' : 0,
|
||||||
|
'artist' : '',
|
||||||
|
'album' : '',
|
||||||
|
'title' : '',
|
||||||
|
'genre' : '',
|
||||||
|
'comment' : '',
|
||||||
|
'year' : '',
|
||||||
|
'track' : 0,
|
||||||
|
})
|
||||||
|
HANDLED_EXTS = set()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def can_handle(cls, path):
|
||||||
|
if not fs.File.can_handle(path):
|
||||||
|
return False
|
||||||
|
return get_file_ext(path[-1]) in cls.HANDLED_EXTS
|
||||||
|
|
||||||
|
|
||||||
|
class Mp3File(MusicFile):
|
||||||
|
HANDLED_EXTS = set(['mp3'])
|
||||||
|
def _read_info(self, field):
|
||||||
|
if field == 'md5partial':
|
||||||
|
fileinfo = mpeg.Mpeg(unicode(self.path))
|
||||||
|
self._md5partial_offset = fileinfo.audio_offset
|
||||||
|
self._md5partial_size = fileinfo.audio_size
|
||||||
|
MusicFile._read_info(self, field)
|
||||||
|
if field in TAG_FIELDS:
|
||||||
|
fileinfo = mpeg.Mpeg(unicode(self.path))
|
||||||
|
self.audiosize = fileinfo.audio_size
|
||||||
|
self.bitrate = fileinfo.bitrate
|
||||||
|
self.duration = fileinfo.duration
|
||||||
|
self.samplerate = fileinfo.sample_rate
|
||||||
|
i1 = fileinfo.id3v1
|
||||||
|
# id3v1, even when non-existent, gives empty values; id3v2 does not. If id3v2 doesn't exist,
|
||||||
|
# just replace it with id3v1
|
||||||
|
i2 = fileinfo.id3v2
|
||||||
|
if not i2.exists:
|
||||||
|
i2 = i1
|
||||||
|
self.artist = i2.artist or i1.artist
|
||||||
|
self.album = i2.album or i1.album
|
||||||
|
self.title = i2.title or i1.title
|
||||||
|
self.genre = i2.genre or i1.genre
|
||||||
|
self.comment = i2.comment or i1.comment
|
||||||
|
self.year = i2.year or i1.year
|
||||||
|
self.track = i2.track or i1.track
|
||||||
|
|
||||||
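A note on the per-field `or` above: substituting i2 = i1 covers a completely missing id3v2 tag, but an id3v2 tag that exists with some empty fields still falls back to id3v1 field by field. A tiny illustration with hypothetical values:

    v2_artist = ''                    # id3v2 tag present, artist field left empty
    v1_artist = 'Some Artist'         # id3v1 value
    artist = v2_artist or v1_artist   # -> 'Some Artist'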
|
class WmaFile(MusicFile):
|
||||||
|
HANDLED_EXTS = set(['wma'])
|
||||||
|
def _read_info(self, field):
|
||||||
|
if field == 'md5partial':
|
||||||
|
dec = wma.WMADecoder(unicode(self.path))
|
||||||
|
self._md5partial_offset = dec.audio_offset
|
||||||
|
self._md5partial_size = dec.audio_size
|
||||||
|
MusicFile._read_info(self, field)
|
||||||
|
if field in TAG_FIELDS:
|
||||||
|
dec = wma.WMADecoder(unicode(self.path))
|
||||||
|
self.audiosize = dec.audio_size
|
||||||
|
self.bitrate = dec.bitrate
|
||||||
|
self.duration = dec.duration
|
||||||
|
self.samplerate = dec.sample_rate
|
||||||
|
self.artist = dec.artist
|
||||||
|
self.album = dec.album
|
||||||
|
self.title = dec.title
|
||||||
|
self.genre = dec.genre
|
||||||
|
self.comment = dec.comment
|
||||||
|
self.year = dec.year
|
||||||
|
self.track = dec.track
|
||||||
|
|
||||||
|
class Mp4File(MusicFile):
|
||||||
|
HANDLED_EXTS = set(['m4a', 'm4p'])
|
||||||
|
def _read_info(self, field):
|
||||||
|
if field == 'md5partial':
|
||||||
|
dec = mp4.File(unicode(self.path))
|
||||||
|
self._md5partial_offset = dec.audio_offset
|
||||||
|
self._md5partial_size = dec.audio_size
|
||||||
|
dec.close()
|
||||||
|
MusicFile._read_info(self, field)
|
||||||
|
if field in TAG_FIELDS:
|
||||||
|
dec = mp4.File(unicode(self.path))
|
||||||
|
self.audiosize = dec.audio_size
|
||||||
|
self.bitrate = dec.bitrate
|
||||||
|
self.duration = dec.duration
|
||||||
|
self.samplerate = dec.sample_rate
|
||||||
|
self.artist = dec.artist
|
||||||
|
self.album = dec.album
|
||||||
|
self.title = dec.title
|
||||||
|
self.genre = dec.genre
|
||||||
|
self.comment = dec.comment
|
||||||
|
self.year = dec.year
|
||||||
|
self.track = dec.track
|
||||||
|
dec.close()
|
||||||
|
|
||||||
|
class OggFile(MusicFile):
|
||||||
|
HANDLED_EXTS = set(['ogg'])
|
||||||
|
def _read_info(self, field):
|
||||||
|
if field == 'md5partial':
|
||||||
|
dec = ogg.Vorbis(unicode(self.path))
|
||||||
|
self._md5partial_offset = dec.audio_offset
|
||||||
|
self._md5partial_size = dec.audio_size
|
||||||
|
MusicFile._read_info(self, field)
|
||||||
|
if field in TAG_FIELDS:
|
||||||
|
dec = ogg.Vorbis(unicode(self.path))
|
||||||
|
self.audiosize = dec.audio_size
|
||||||
|
self.bitrate = dec.bitrate
|
||||||
|
self.duration = dec.duration
|
||||||
|
self.samplerate = dec.sample_rate
|
||||||
|
self.artist = dec.artist
|
||||||
|
self.album = dec.album
|
||||||
|
self.title = dec.title
|
||||||
|
self.genre = dec.genre
|
||||||
|
self.comment = dec.comment
|
||||||
|
self.year = dec.year
|
||||||
|
self.track = dec.track
|
||||||
|
|
||||||
|
class FlacFile(MusicFile):
|
||||||
|
HANDLED_EXTS = set(['flac'])
|
||||||
|
def _read_info(self, field):
|
||||||
|
if field == 'md5partial':
|
||||||
|
dec = flac.FLAC(unicode(self.path))
|
||||||
|
self._md5partial_offset = dec.audio_offset
|
||||||
|
self._md5partial_size = dec.audio_size
|
||||||
|
MusicFile._read_info(self, field)
|
||||||
|
if field in TAG_FIELDS:
|
||||||
|
dec = flac.FLAC(unicode(self.path))
|
||||||
|
self.audiosize = dec.audio_size
|
||||||
|
self.bitrate = dec.bitrate
|
||||||
|
self.duration = dec.duration
|
||||||
|
self.samplerate = dec.sample_rate
|
||||||
|
self.artist = dec.artist
|
||||||
|
self.album = dec.album
|
||||||
|
self.title = dec.title
|
||||||
|
self.genre = dec.genre
|
||||||
|
self.comment = dec.comment
|
||||||
|
self.year = dec.year
|
||||||
|
self.track = dec.track
|
||||||
|
|
||||||
|
class AiffFile(MusicFile):
|
||||||
|
HANDLED_EXTS = set(['aif', 'aiff', 'aifc'])
|
||||||
|
def _read_info(self, field):
|
||||||
|
if field == 'md5partial':
|
||||||
|
dec = aiff.File(unicode(self.path))
|
||||||
|
self._md5partial_offset = dec.audio_offset
|
||||||
|
self._md5partial_size = dec.audio_size
|
||||||
|
MusicFile._read_info(self, field)
|
||||||
|
if field in TAG_FIELDS:
|
||||||
|
dec = aiff.File(unicode(self.path))
|
||||||
|
self.audiosize = dec.audio_size
|
||||||
|
self.bitrate = dec.bitrate
|
||||||
|
self.duration = dec.duration
|
||||||
|
self.samplerate = dec.sample_rate
|
||||||
|
tag = dec.tag
|
||||||
|
if tag is not None:
|
||||||
|
self.artist = tag.artist
|
||||||
|
self.album = tag.album
|
||||||
|
self.title = tag.title
|
||||||
|
self.genre = tag.genre
|
||||||
|
self.comment = tag.comment
|
||||||
|
self.year = tag.year
|
||||||
|
self.track = tag.track
|
||||||
|
|
||||||
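The classes above are meant to be plugged into Directories.fileclasses, as the ME app setup later in this changeset does. The dispatch helper itself is not shown in this diff; a plausible sketch, assuming dupeguru.fs.get_file simply asks each class in turn (an assumption, not the verified implementation):

    def get_file(path, fileclasses):
        # Return an instance of the first file class that can handle `path`,
        # or None when no class claims it (e.g. an unknown extension).
        for fileclass in fileclasses:
            if fileclass.can_handle(path):   # MusicFile checks HANDLED_EXTS
                return fileclass(path)
        return None

    # Hypothetical usage mirroring the ME setup:
    # f = get_file(Path('/music/song.mp3'),
    #              [Mp3File, Mp4File, WmaFile, OggFile, FlacFile, AiffFile])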
16
me/py/scanner.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# Created By: Virgil Dupras
|
||||||
|
# Created On: 2006/03/03
|
||||||
|
# $Id$
|
||||||
|
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
|
||||||
|
#
|
||||||
|
# This software is licensed under the "HS" License as described in the "LICENSE" file,
|
||||||
|
# which should be included with this package. The terms are also available at
|
||||||
|
# http://www.hardcoded.net/licenses/hs_license
|
||||||
|
|
||||||
|
from dupeguru.scanner import Scanner as ScannerBase
|
||||||
|
|
||||||
|
class ScannerME(ScannerBase):
|
||||||
|
@staticmethod
|
||||||
|
def _key_func(dupe):
|
||||||
|
return (not dupe.is_ref, -dupe.bitrate, -dupe.size)
|
||||||
|
|
||||||
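The sort key above is easy to misread: since False sorts before True and the numeric fields are negated, reference files come first, then higher bitrate, then larger size. A quick sanity check consistent with the test_priorize_me test below (the Dupe stand-in is hypothetical):

    class Dupe(object):
        def __init__(self, is_ref, bitrate, size):
            self.is_ref, self.bitrate, self.size = is_ref, bitrate, size

    dupes = [Dupe(False, 128, 5), Dupe(False, 320, 5), Dupe(True, 96, 5)]
    dupes.sort(key=ScannerME._key_func)
    # the is_ref file sorts first, then the 320 kbps file
    assert dupes[0].is_ref and dupes[1].bitrate == 320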
0
me/py/tests/__init__.py
Normal file
33
me/py/tests/scanner_test.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Created By: Virgil Dupras
|
||||||
|
# Created On: 2009-10-23
|
||||||
|
# $Id$
|
||||||
|
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
|
||||||
|
#
|
||||||
|
# This software is licensed under the "HS" License as described in the "LICENSE" file,
|
||||||
|
# which should be included with this package. The terms are also available at
|
||||||
|
# http://www.hardcoded.net/licenses/hs_license
|
||||||
|
|
||||||
|
from hsutil.path import Path
|
||||||
|
|
||||||
|
from dupeguru.engine import getwords
|
||||||
|
from ..scanner import *
|
||||||
|
|
||||||
|
class NamedObject(object):
|
||||||
|
def __init__(self, name="foobar", size=1):
|
||||||
|
self.name = name
|
||||||
|
self.size = size
|
||||||
|
self.path = Path('')
|
||||||
|
self.words = getwords(name)
|
||||||
|
|
||||||
|
|
||||||
|
no = NamedObject
|
||||||
|
|
||||||
|
def test_priorize_me():
|
||||||
|
# in ScannerME, bitrate goes first (right after is_ref) in prioritization
|
||||||
|
s = ScannerME()
|
||||||
|
o1, o2 = no('foo'), no('foo')
|
||||||
|
o1.bitrate = 1
|
||||||
|
o2.bitrate = 2
|
||||||
|
[group] = s.GetDupeGroups([o1, o2])
|
||||||
|
assert group.ref is o2
|
||||||
@@ -7,9 +7,7 @@
|
|||||||
# which should be included with this package. The terms are also available at
|
# which should be included with this package. The terms are also available at
|
||||||
# http://www.hardcoded.net/licenses/hs_license
|
# http://www.hardcoded.net/licenses/hs_license
|
||||||
|
|
||||||
import hsfs.phys.music
|
from dupeguru_me import data, scanner, fs
|
||||||
|
|
||||||
from dupeguru import data_me, scanner
|
|
||||||
|
|
||||||
from base.app import DupeGuru as DupeGuruBase
|
from base.app import DupeGuru as DupeGuruBase
|
||||||
from details_dialog import DetailsDialog
|
from details_dialog import DetailsDialog
|
||||||
@@ -23,11 +21,11 @@ class DupeGuru(DupeGuruBase):
|
|||||||
DELTA_COLUMNS = frozenset([2, 3, 4, 5, 7, 8])
|
DELTA_COLUMNS = frozenset([2, 3, 4, 5, 7, 8])
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
DupeGuruBase.__init__(self, data_me, appid=1)
|
DupeGuruBase.__init__(self, data, appid=1)
|
||||||
|
|
||||||
def _setup(self):
|
def _setup(self):
|
||||||
self.scanner = scanner.ScannerME()
|
self.scanner = scanner.ScannerME()
|
||||||
self.directories.dirclass = hsfs.phys.music.Directory
|
self.directories.fileclasses = [fs.Mp3File, fs.Mp4File, fs.WmaFile, fs.OggFile, fs.FlacFile, fs.AiffFile]
|
||||||
DupeGuruBase._setup(self)
|
DupeGuruBase._setup(self)
|
||||||
|
|
||||||
def _update_options(self):
|
def _update_options(self):
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ from dupeguru_pe import app_cocoa as app_pe_cocoa
|
|||||||
# Fix py2app imports; py2app chokes on relative imports
|
# Fix py2app imports; py2app chokes on relative imports
|
||||||
from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, scanner
|
from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, scanner
|
||||||
from dupeguru_pe import block, cache, matchbase, data
|
from dupeguru_pe import block, cache, matchbase, data
|
||||||
from hsfs import auto, stats, tree
|
|
||||||
from hsutil import conflict
|
from hsutil import conflict
|
||||||
|
|
||||||
class PyApp(NSObject):
|
class PyApp(NSObject):
|
||||||
@@ -39,7 +38,7 @@ class PyDupeGuru(PyApp):
|
|||||||
self.app.scanner.ignore_list.Clear()
|
self.app.scanner.ignore_list.Clear()
|
||||||
|
|
||||||
def clearPictureCache(self):
|
def clearPictureCache(self):
|
||||||
self.app.scanner.match_factory.cached_blocks.clear()
|
self.app.scanner.cached_blocks.clear()
|
||||||
|
|
||||||
def doScan(self):
|
def doScan(self):
|
||||||
return self.app.start_scanning()
|
return self.app.start_scanning()
|
||||||
@@ -172,10 +171,10 @@ class PyDupeGuru(PyApp):
|
|||||||
|
|
||||||
#---Properties
|
#---Properties
|
||||||
def setMatchScaled_(self,match_scaled):
|
def setMatchScaled_(self,match_scaled):
|
||||||
self.app.scanner.match_factory.match_scaled = match_scaled
|
self.app.scanner.match_scaled = match_scaled
|
||||||
|
|
||||||
def setMinMatchPercentage_(self,percentage):
|
def setMinMatchPercentage_(self,percentage):
|
||||||
self.app.scanner.match_factory.threshold = int(percentage)
|
self.app.scanner.threshold = int(percentage)
|
||||||
|
|
||||||
def setMixFileKind_(self,mix_file_kind):
|
def setMixFileKind_(self,mix_file_kind):
|
||||||
self.app.scanner.mix_file_kind = mix_file_kind
|
self.app.scanner.mix_file_kind = mix_file_kind
|
||||||
|
|||||||
@@ -1,3 +1,7 @@
|
|||||||
|
- date: 2009-10-24
|
||||||
|
version: 1.7.8
|
||||||
|
description: |
|
||||||
|
* Fixed a bug that sometimes caused duplicates to be ignored during scans. (#73)
|
||||||
- date: 2009-10-14
|
- date: 2009-10-14
|
||||||
version: 1.7.7
|
version: 1.7.7
|
||||||
description: |
|
description: |
|
||||||
|
|||||||
@@ -7,41 +7,43 @@
|
|||||||
# which should be included with this package. The terms are also available at
|
# which should be included with this package. The terms are also available at
|
||||||
# http://www.hardcoded.net/licenses/hs_license
|
# http://www.hardcoded.net/licenses/hs_license
|
||||||
|
|
||||||
import os
|
|
||||||
import os.path as op
|
import os.path as op
|
||||||
import logging
|
import logging
|
||||||
import plistlib
|
import plistlib
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import objc
|
|
||||||
from Foundation import *
|
from Foundation import *
|
||||||
from AppKit import *
|
from AppKit import *
|
||||||
from appscript import app, k
|
from appscript import app, k
|
||||||
|
|
||||||
from hsutil import job, io
|
from hsutil import io
|
||||||
import hsfs as fs
|
|
||||||
from hsfs import phys, InvalidPath
|
|
||||||
from hsutil import files
|
|
||||||
from hsutil.str import get_file_ext
|
from hsutil.str import get_file_ext
|
||||||
from hsutil.path import Path
|
from hsutil.path import Path
|
||||||
from hsutil.cocoa import as_fetch
|
from hsutil.cocoa import as_fetch
|
||||||
|
|
||||||
|
from dupeguru import fs
|
||||||
from dupeguru import app_cocoa, directories
|
from dupeguru import app_cocoa, directories
|
||||||
from . import data, matchbase
|
from . import data
|
||||||
from .cache import string_to_colors, Cache
|
from .cache import string_to_colors, Cache
|
||||||
|
from .scanner import ScannerPE
|
||||||
|
|
||||||
mainBundle = NSBundle.mainBundle()
|
mainBundle = NSBundle.mainBundle()
|
||||||
PictureBlocks = mainBundle.classNamed_('PictureBlocks')
|
PictureBlocks = mainBundle.classNamed_('PictureBlocks')
|
||||||
assert PictureBlocks is not None
|
assert PictureBlocks is not None
|
||||||
|
|
||||||
class Photo(phys.File):
|
class Photo(fs.File):
|
||||||
INITIAL_INFO = phys.File.INITIAL_INFO.copy()
|
INITIAL_INFO = fs.File.INITIAL_INFO.copy()
|
||||||
INITIAL_INFO.update({
|
INITIAL_INFO.update({
|
||||||
'dimensions': (0,0),
|
'dimensions': (0,0),
|
||||||
})
|
})
|
||||||
|
HANDLED_EXTS = set(['png', 'jpg', 'jpeg', 'gif', 'psd', 'bmp', 'tiff', 'tif', 'nef', 'cr2'])
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def can_handle(cls, path):
|
||||||
|
return fs.File.can_handle(path) and get_file_ext(path[-1]) in cls.HANDLED_EXTS
|
||||||
|
|
||||||
def _read_info(self, field):
|
def _read_info(self, field):
|
||||||
super(Photo, self)._read_info(field)
|
fs.File._read_info(self, field)
|
||||||
if field == 'dimensions':
|
if field == 'dimensions':
|
||||||
size = PictureBlocks.getImageSize_(unicode(self.path))
|
size = PictureBlocks.getImageSize_(unicode(self.path))
|
||||||
self.dimensions = (size.width, size.height)
|
self.dimensions = (size.width, size.height)
|
||||||
@@ -49,7 +51,7 @@ class Photo(phys.File):
|
|||||||
def get_blocks(self, block_count_per_side):
|
def get_blocks(self, block_count_per_side):
|
||||||
try:
|
try:
|
||||||
blocks = PictureBlocks.getBlocksFromImagePath_blockCount_(unicode(self.path), block_count_per_side)
|
blocks = PictureBlocks.getBlocksFromImagePath_blockCount_(unicode(self.path), block_count_per_side)
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
raise IOError('The reading of "%s" failed with "%s"' % (unicode(self.path), unicode(e)))
|
raise IOError('The reading of "%s" failed with "%s"' % (unicode(self.path), unicode(e)))
|
||||||
if not blocks:
|
if not blocks:
|
||||||
raise IOError('The picture %s could not be read' % unicode(self.path))
|
raise IOError('The picture %s could not be read' % unicode(self.path))
|
||||||
@@ -57,89 +59,79 @@ class Photo(phys.File):
|
|||||||
|
|
||||||
|
|
||||||
class IPhoto(Photo):
|
class IPhoto(Photo):
|
||||||
def __init__(self, parent, whole_path):
|
|
||||||
super(IPhoto, self).__init__(parent, whole_path[-1])
|
|
||||||
self.whole_path = whole_path
|
|
||||||
|
|
||||||
def _build_path(self):
|
|
||||||
return self.whole_path
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def display_path(self):
|
def display_path(self):
|
||||||
return super(IPhoto, self)._build_path()
|
return Path(('iPhoto Library', self.name))
|
||||||
|
|
||||||
|
def get_iphoto_database_path():
|
||||||
|
ud = NSUserDefaults.standardUserDefaults()
|
||||||
|
prefs = ud.persistentDomainForName_('com.apple.iApps')
|
||||||
|
if 'iPhotoRecentDatabases' not in prefs:
|
||||||
|
raise directories.InvalidPathError()
|
||||||
|
plisturl = NSURL.URLWithString_(prefs['iPhotoRecentDatabases'][0])
|
||||||
|
return Path(plisturl.path())
|
||||||
|
|
||||||
class Directory(phys.Directory):
|
def get_iphoto_pictures(plistpath):
|
||||||
cls_file_class = Photo
|
if not io.exists(plistpath):
|
||||||
cls_supported_exts = ('png', 'jpg', 'jpeg', 'gif', 'psd', 'bmp', 'tiff', 'nef', 'cr2')
|
raise directories.InvalidPathError()
|
||||||
|
s = io.open(plistpath).read()
|
||||||
def _fetch_subitems(self):
|
# There was a case where a guy had 0x10 chars in his plist, causing expat errors on loading
|
||||||
subdirs, subfiles = super(Directory,self)._fetch_subitems()
|
s = s.replace('\x10', '')
|
||||||
return subdirs, [name for name in subfiles if get_file_ext(name) in self.cls_supported_exts]
|
# It seems that iPhoto sometimes doesn't properly escape & chars. The regexp below is to find
|
||||||
|
# any & char that is not a &-based entity (&, ", etc.). based on TextMate's XML
|
||||||
|
# bundle's regexp
|
||||||
class IPhotoLibrary(fs.Directory):
|
s, count = re.subn(r'&(?![a-zA-Z0-9_-]+|#[0-9]+|#x[0-9a-fA-F]+;)', '', s)
|
||||||
def __init__(self, plistpath):
|
if count:
|
||||||
self.plistpath = plistpath
|
logging.warning("%d invalid XML entities replacement made", count)
|
||||||
self.refpath = plistpath[:-1]
|
plist = plistlib.readPlistFromString(s)
|
||||||
# the AlbumData.xml file lives right in the library path
|
result = []
|
||||||
super(IPhotoLibrary, self).__init__(None, 'iPhoto Library')
|
for photo_data in plist['Master Image List'].values():
|
||||||
if not io.exists(plistpath):
|
|
||||||
raise InvalidPath(self)
|
|
||||||
|
|
||||||
def _update_photo(self, photo_data):
|
|
||||||
if photo_data['MediaType'] != 'Image':
|
if photo_data['MediaType'] != 'Image':
|
||||||
return
|
continue
|
||||||
photo_path = Path(photo_data['ImagePath'])
|
photo_path = Path(photo_data['ImagePath'])
|
||||||
subpath = photo_path[len(self.refpath):-1]
|
photo = IPhoto(photo_path)
|
||||||
subdir = self
|
result.append(photo)
|
||||||
for element in subpath:
|
return result
|
||||||
try:
|
|
||||||
subdir = subdir[element]
|
class Directories(directories.Directories):
|
||||||
except KeyError:
|
def __init__(self):
|
||||||
subdir = fs.Directory(subdir, element)
|
directories.Directories.__init__(self, fileclasses=[Photo])
|
||||||
try:
|
self.iphoto_libpath = get_iphoto_database_path()
|
||||||
IPhoto(subdir, photo_path)
|
self.set_state(self.iphoto_libpath[:-1], directories.STATE_EXCLUDED)
|
||||||
except fs.AlreadyExistsError:
|
|
||||||
# it's possible for 2 entries in the plist to point to the same path. Ignore one of them.
|
|
||||||
pass
|
|
||||||
|
|
||||||
def update(self):
|
def _get_files(self, from_path):
|
||||||
self.clear()
|
if from_path == Path('iPhoto Library'):
|
||||||
s = open(unicode(self.plistpath)).read()
|
is_ref = self.get_state(from_path) == directories.STATE_REFERENCE
|
||||||
# There was a case where a guy had 0x10 chars in his plist, causing expat errors on loading
|
photos = get_iphoto_pictures(self.iphoto_libpath)
|
||||||
s = s.replace('\x10', '')
|
for photo in photos:
|
||||||
# It seems that iPhoto sometimes doesn't properly escape & chars. The regexp below is to find
|
photo.is_ref = is_ref
|
||||||
# any & char that is not a &-based entity (&, ", etc.). based on TextMate's XML
|
return photos
|
||||||
# bundle's regexp
|
else:
|
||||||
s, count = re.subn(r'&(?![a-zA-Z0-9_-]+|#[0-9]+|#x[0-9a-fA-F]+;)', '', s)
|
return directories.Directories._get_files(self, from_path)
|
||||||
if count:
|
|
||||||
logging.warning("%d invalid XML entities replacement made", count)
|
|
||||||
plist = plistlib.readPlistFromString(s)
|
|
||||||
for photo_data in plist['Master Image List'].values():
|
|
||||||
self._update_photo(photo_data)
|
|
||||||
|
|
||||||
def force_update(self): # Don't update
|
@staticmethod
|
||||||
pass
|
def get_subfolders(path):
|
||||||
|
if path == Path('iPhoto Library'):
|
||||||
|
return []
|
||||||
|
else:
|
||||||
|
return directories.Directories.get_subfolders(path)
|
||||||
|
|
||||||
|
def add_path(self, path):
|
||||||
|
if path == Path('iPhoto Library'):
|
||||||
|
if path in self:
|
||||||
|
raise AlreadyThereError()
|
||||||
|
self._dirs.append(path)
|
||||||
|
else:
|
||||||
|
directories.Directories.add_path(self, path)
|
||||||
|
|
||||||
|
|
||||||
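To summarize the new PE Directories behavior in one place, here is a hedged usage sketch; the method names come from this hunk, but the flow is inferred rather than taken from a test:

    dirs = Directories()                    # fileclasses=[Photo]; physical iPhoto path excluded
    dirs.add_path(Path('iPhoto Library'))   # magic entry accepted by the add_path override
    for f in dirs.get_files():              # the library entry yields IPhoto objects built
        f.path                              # from AlbumData.xml; regular paths yield Photos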
class DupeGuruPE(app_cocoa.DupeGuru):
|
class DupeGuruPE(app_cocoa.DupeGuru):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
app_cocoa.DupeGuru.__init__(self, data, 'dupeGuru Picture Edition', appid=5)
|
app_cocoa.DupeGuru.__init__(self, data, 'dupeGuru Picture Edition', appid=5)
|
||||||
self.scanner.match_factory = matchbase.AsyncMatchFactory()
|
self.scanner = ScannerPE()
|
||||||
self.directories.dirclass = Directory
|
self.directories = Directories()
|
||||||
self.directories.special_dirclasses[Path('iPhoto Library')] = lambda _, __: self._create_iphoto_library()
|
|
||||||
p = op.join(self.appdata, 'cached_pictures.db')
|
p = op.join(self.appdata, 'cached_pictures.db')
|
||||||
self.scanner.match_factory.cached_blocks = Cache(p)
|
self.scanner.cached_blocks = Cache(p)
|
||||||
|
|
||||||
def _create_iphoto_library(self):
|
|
||||||
ud = NSUserDefaults.standardUserDefaults()
|
|
||||||
prefs = ud.persistentDomainForName_('com.apple.iApps')
|
|
||||||
if 'iPhotoRecentDatabases' not in prefs:
|
|
||||||
raise directories.InvalidPathError
|
|
||||||
plisturl = NSURL.URLWithString_(prefs['iPhotoRecentDatabases'][0])
|
|
||||||
plistpath = Path(plisturl.path())
|
|
||||||
return IPhotoLibrary(plistpath)
|
|
||||||
|
|
||||||
def _do_delete(self, j):
|
def _do_delete(self, j):
|
||||||
def op(dupe):
|
def op(dupe):
|
||||||
@@ -174,40 +166,19 @@ class DupeGuruPE(app_cocoa.DupeGuru):
|
|||||||
|
|
||||||
def _do_load(self, j):
|
def _do_load(self, j):
|
||||||
self.directories.load_from_file(op.join(self.appdata, 'last_directories.xml'))
|
self.directories.load_from_file(op.join(self.appdata, 'last_directories.xml'))
|
||||||
for d in self.directories:
|
|
||||||
if isinstance(d, IPhotoLibrary):
|
|
||||||
d.update()
|
|
||||||
self.results.load_from_xml(op.join(self.appdata, 'last_results.xml'), self._get_file, j)
|
self.results.load_from_xml(op.join(self.appdata, 'last_results.xml'), self._get_file, j)
|
||||||
|
|
||||||
def _get_file(self, str_path):
|
def _get_file(self, str_path):
|
||||||
p = Path(str_path)
|
p = Path(str_path)
|
||||||
for d in self.directories:
|
if p in self.directories.iphoto_libpath[:-1]:
|
||||||
result = None
|
return IPhoto(p)
|
||||||
if p in d.path:
|
return app_cocoa.DupeGuru._get_file(self, str_path)
|
||||||
result = d.find_path(p[d.path:])
|
|
||||||
if isinstance(d, IPhotoLibrary) and p in d.refpath:
|
|
||||||
result = d.find_path(p[d.refpath:])
|
|
||||||
if result is not None:
|
|
||||||
return result
|
|
||||||
|
|
||||||
def add_directory(self, d):
|
|
||||||
result = app_cocoa.DupeGuru.add_directory(self, d)
|
|
||||||
if (result == 0) and (d == 'iPhoto Library'):
|
|
||||||
[iphotolib] = [dir for dir in self.directories if dir.path == d]
|
|
||||||
iphotolib.update()
|
|
||||||
return result
|
|
||||||
|
|
||||||
def copy_or_move(self, dupe, copy, destination, dest_type):
|
def copy_or_move(self, dupe, copy, destination, dest_type):
|
||||||
if isinstance(dupe, IPhoto):
|
if isinstance(dupe, IPhoto):
|
||||||
copy = True
|
copy = True
|
||||||
return app_cocoa.DupeGuru.copy_or_move(self, dupe, copy, destination, dest_type)
|
return app_cocoa.DupeGuru.copy_or_move(self, dupe, copy, destination, dest_type)
|
||||||
|
|
||||||
def start_scanning(self):
|
|
||||||
for directory in self.directories:
|
|
||||||
if isinstance(directory, IPhotoLibrary):
|
|
||||||
self.directories.set_state(directory.refpath, directories.STATE_EXCLUDED)
|
|
||||||
return app_cocoa.DupeGuru.start_scanning(self)
|
|
||||||
|
|
||||||
def selected_dupe_path(self):
|
def selected_dupe_path(self):
|
||||||
if not self.selected_dupes:
|
if not self.selected_dupes:
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -20,58 +20,42 @@ from .block import avgdiff, DifferentBlockCountError, NoBlocksError
 from .cache import Cache
 
 MIN_ITERATIONS = 3
+BLOCK_COUNT_PER_SIDE = 15
 
 # Enough so that we're sure that the main thread will not wait after a result.get() call
 # cpucount*2 should be enough to be sure that the spawned process will not wait after the results
 # collection made by the main process.
 RESULTS_QUEUE_LIMIT = multiprocessing.cpu_count() * 2
 
-def get_match(first,second,percentage):
+def prepare_pictures(pictures, cached_blocks, j=job.nulljob):
+    # The MemoryError handlers in there use logging without first caring about whether or not
+    # there is enough memory left to carry on the operation because it is assumed that the
+    # MemoryError happens when trying to read an image file, which is freed from memory by the
+    # time that MemoryError is raised.
+    prepared = [] # only pictures for which there was no error getting blocks
+    try:
+        for picture in j.iter_with_progress(pictures, 'Analyzed %d/%d pictures'):
+            picture.dimensions
+            picture.unicode_path = unicode(picture.path)
+            try:
+                if picture.unicode_path not in cached_blocks:
+                    blocks = picture.get_blocks(BLOCK_COUNT_PER_SIDE)
+                    cached_blocks[picture.unicode_path] = blocks
+                prepared.append(picture)
+            except IOError as e:
+                logging.warning(unicode(e))
+            except MemoryError:
+                logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
+                if picture.size < 10 * 1024 * 1024: # We're really running out of memory
+                    raise
+    except MemoryError:
+        logging.warning('Ran out of memory while preparing pictures')
+    return prepared
 
+def get_match(first, second, percentage):
     if percentage < 0:
         percentage = 0
-    return Match(first,second,percentage)
+    return Match(first, second, percentage)
 
-class MatchFactory(object):
-    cached_blocks = None
-    block_count_per_side = 15
-    threshold = 75
-    match_scaled = False
-    
-    def _do_getmatches(self, files, j):
-        raise NotImplementedError()
-    
-    def getmatches(self, files, j=job.nulljob):
-        # The MemoryError handlers in there use logging without first caring about whether or not
-        # there is enough memory left to carry on the operation because it is assumed that the
-        # MemoryError happens when trying to read an image file, which is freed from memory by the
-        # time that MemoryError is raised.
-        j = j.start_subjob([3, 7])
-        logging.info('Preparing %d files' % len(files))
-        prepared = self.prepare_files(files, j)
-        logging.info('Finished preparing %d files' % len(prepared))
-        return self._do_getmatches(prepared, j)
-    
-    def prepare_files(self, files, j=job.nulljob):
-        prepared = [] # only files for which there was no error getting blocks
-        try:
-            for picture in j.iter_with_progress(files, 'Analyzed %d/%d pictures'):
-                picture.dimensions
-                picture.unicode_path = unicode(picture.path)
-                try:
-                    if picture.unicode_path not in self.cached_blocks:
-                        blocks = picture.get_blocks(self.block_count_per_side)
-                        self.cached_blocks[picture.unicode_path] = blocks
-                    prepared.append(picture)
-                except IOError as e:
-                    logging.warning(unicode(e))
-                except MemoryError:
-                    logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
-                    if picture.size < 10 * 1024 * 1024: # We're really running out of memory
-                        raise
-        except MemoryError:
-            logging.warning('Ran out of memory while preparing files')
-        return prepared
-
 def async_compare(ref_id, other_ids, dbname, threshold):
     cache = Cache(dbname, threaded=False)
@@ -89,53 +73,55 @@ def async_compare(ref_id, other_ids, dbname, threshold):
     results.append((ref_id, other_id, percentage))
     cache.con.close()
     return results
 
-class AsyncMatchFactory(MatchFactory):
-    def _do_getmatches(self, pictures, j):
-        def empty_out_queue(queue, into):
-            try:
-                while True:
-                    into.append(queue.get(block=False))
-            except Empty:
-                pass
-        
-        j = j.start_subjob([9, 1], 'Preparing for matching')
-        cache = self.cached_blocks
-        id2picture = {}
-        dimensions2pictures = defaultdict(set)
-        for picture in pictures:
-            try:
-                picture.cache_id = cache.get_id(picture.unicode_path)
-                id2picture[picture.cache_id] = picture
-                if not self.match_scaled:
-                    dimensions2pictures[picture.dimensions].add(picture)
-            except ValueError:
-                pass
-        pictures = [p for p in pictures if hasattr(p, 'cache_id')]
-        pool = multiprocessing.Pool()
-        async_results = []
-        matches = []
-        pictures_copy = set(pictures)
-        for ref in j.iter_with_progress(pictures, 'Matched %d/%d pictures'):
-            others = pictures_copy if self.match_scaled else dimensions2pictures[ref.dimensions]
-            others.remove(ref)
-            if others:
-                cache_ids = [f.cache_id for f in others]
-                args = (ref.cache_id, cache_ids, self.cached_blocks.dbname, self.threshold)
-                async_results.append(pool.apply_async(async_compare, args))
-                if len(async_results) > RESULTS_QUEUE_LIMIT:
-                    result = async_results.pop(0)
-                    matches.extend(result.get())
-        
-        result = []
-        for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
-            ref = id2picture[ref_id]
-            other = id2picture[other_id]
-            if percentage == 100 and ref.md5 != other.md5:
-                percentage = 99
-            if percentage >= self.threshold:
-                result.append(get_match(ref, other, percentage))
-        return result
+def getmatches(pictures, cached_blocks, threshold=75, match_scaled=False, j=job.nulljob):
+    def empty_out_queue(queue, into):
+        try:
+            while True:
+                into.append(queue.get(block=False))
+        except Empty:
+            pass
+    
+    j = j.start_subjob([3, 7])
+    pictures = prepare_pictures(pictures, cached_blocks, j)
+    j = j.start_subjob([9, 1], 'Preparing for matching')
+    cache = cached_blocks
+    id2picture = {}
+    dimensions2pictures = defaultdict(set)
+    for picture in pictures:
+        try:
+            picture.cache_id = cache.get_id(picture.unicode_path)
+            id2picture[picture.cache_id] = picture
+            if not match_scaled:
+                dimensions2pictures[picture.dimensions].add(picture)
+        except ValueError:
+            pass
+    pictures = [p for p in pictures if hasattr(p, 'cache_id')]
+    pool = multiprocessing.Pool()
+    async_results = []
+    matches = []
+    pictures_copy = set(pictures)
+    for ref in j.iter_with_progress(pictures, 'Matched %d/%d pictures'):
+        others = pictures_copy if match_scaled else dimensions2pictures[ref.dimensions]
+        others.remove(ref)
+        if others:
+            cache_ids = [f.cache_id for f in others]
+            args = (ref.cache_id, cache_ids, cached_blocks.dbname, threshold)
+            async_results.append(pool.apply_async(async_compare, args))
+            if len(async_results) > RESULTS_QUEUE_LIMIT:
+                result = async_results.pop(0)
+                matches.extend(result.get())
+    for result in async_results: # process the rest of the results
+        matches.extend(result.get())
+    
+    result = []
+    for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
+        ref = id2picture[ref_id]
+        other = id2picture[other_id]
+        if percentage == 100 and ref.md5 != other.md5:
+            percentage = 99
+        if percentage >= threshold:
+            result.append(get_match(ref, other, percentage))
+    return result
 
 multiprocessing.freeze_support()
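
The `RESULTS_QUEUE_LIMIT` logic above caps how many `apply_async` results can pile up before the main process blocks on the oldest one; note also that, unlike the removed `_do_getmatches`, the new module-level `getmatches` drains the results still in flight after the loop, so the last batch of comparisons is no longer lost. Below is a minimal, standalone sketch of the same back-pressure idea; `slow_square` and `bounded_map` are illustrative names, not part of dupeGuru:

    import multiprocessing
    
    def slow_square(n):
        return n * n
    
    def bounded_map(items, limit):
        # Keep at most `limit` results in flight; block on the oldest before
        # submitting more, exactly like the matches loop above does.
        pool = multiprocessing.Pool()
        pending = []
        collected = []
        for item in items:
            pending.append(pool.apply_async(slow_square, (item,)))
            if len(pending) > limit:
                collected.append(pending.pop(0).get())
        for res in pending: # drain whatever is still in flight
            collected.append(res.get())
        pool.close()
        pool.join()
        return collected
    
    if __name__ == '__main__':
        print(bounded_map(range(10), multiprocessing.cpu_count() * 2))
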
pe/py/scanner.py | 22 (new file)
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+# Created By: Virgil Dupras
+# Created On: 2009-10-18
+# $Id$
+# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
+#
+# This software is licensed under the "HS" License as described in the "LICENSE" file,
+# which should be included with this package. The terms are also available at
+# http://www.hardcoded.net/licenses/hs_license
+
+from dupeguru.scanner import Scanner
+
+from . import matchbase
+
+class ScannerPE(Scanner):
+    cached_blocks = None
+    match_scaled = False
+    threshold = 75
+    
+    def _getmatches(self, files, j):
+        return matchbase.getmatches(files, self.cached_blocks, self.threshold, self.match_scaled, j)
+
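
For context, this is roughly how the new scanner gets wired up; the Qt app diff below does the same thing in `_setup`, and the cache path here is an illustrative placeholder, not a path the app actually uses:

    from dupeguru_pe.cache import Cache
    from dupeguru_pe.scanner import ScannerPE
    
    scanner = ScannerPE()
    scanner.cached_blocks = Cache('/tmp/cached_pictures.db') # placeholder path
    scanner.threshold = 75        # minimum match percentage to keep
    scanner.match_scaled = False  # only compare pictures of identical dimensions
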
pe/qt/app.py | 34
@@ -12,12 +12,12 @@ import os.path as op
 from PyQt4.QtGui import QImage
 import PIL.Image
 
-from hsfs import phys
 from hsutil.str import get_file_ext
 
+from dupeguru import fs
 from dupeguru_pe import data as data_pe
 from dupeguru_pe.cache import Cache
-from dupeguru_pe.matchbase import AsyncMatchFactory
+from dupeguru_pe.scanner import ScannerPE
 
 from block import getblocks
 from base.app import DupeGuru as DupeGuruBase
@@ -26,14 +26,19 @@ from main_window import MainWindow
 from preferences import Preferences
 from preferences_dialog import PreferencesDialog
 
-class File(phys.File):
-    INITIAL_INFO = phys.File.INITIAL_INFO.copy()
+class File(fs.File):
+    INITIAL_INFO = fs.File.INITIAL_INFO.copy()
     INITIAL_INFO.update({
         'dimensions': (0,0),
     })
+    HANDLED_EXTS = set(['png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff', 'tif'])
+    
+    @classmethod
+    def can_handle(cls, path):
+        return fs.File.can_handle(path) and get_file_ext(path[-1]) in cls.HANDLED_EXTS
+    
     def _read_info(self, field):
-        super(File, self)._read_info(field)
+        fs.File._read_info(self, field)
         if field == 'dimensions':
             im = PIL.Image.open(unicode(self.path))
             self.dimensions = im.size
@@ -44,15 +49,6 @@ class File(phys.File):
         return getblocks(image, block_count_per_side)
 
-
-class Directory(phys.Directory):
-    cls_file_class = File
-    cls_supported_exts = ('png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff')
-    
-    def _fetch_subitems(self):
-        subdirs, subfiles = super(Directory, self)._fetch_subitems()
-        return subdirs, [name for name in subfiles if get_file_ext(name) in self.cls_supported_exts]
-
 
 class DupeGuru(DupeGuruBase):
     LOGO_NAME = 'logo_pe'
     NAME = 'dupeGuru Picture Edition'
@@ -63,15 +59,15 @@ class DupeGuru(DupeGuruBase):
         DupeGuruBase.__init__(self, data_pe, appid=5)
     
     def _setup(self):
-        self.scanner.match_factory = AsyncMatchFactory()
-        self.directories.dirclass = Directory
-        self.scanner.match_factory.cached_blocks = Cache(op.join(self.appdata, 'cached_pictures.db'))
+        self.scanner = ScannerPE()
+        self.directories.fileclasses = [File]
+        self.scanner.cached_blocks = Cache(op.join(self.appdata, 'cached_pictures.db'))
        DupeGuruBase._setup(self)
     
     def _update_options(self):
         DupeGuruBase._update_options(self)
-        self.scanner.match_factory.match_scaled = self.prefs.match_scaled
-        self.scanner.match_factory.threshold = self.prefs.filter_hardness
+        self.scanner.match_scaled = self.prefs.match_scaled
+        self.scanner.threshold = self.prefs.filter_hardness
     
     def _create_details_dialog(self, parent):
         return DetailsDialog(parent, self)
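
Both editions now hand the directories a `fileclasses` list instead of a custom `Directory` subclass, with each class claiming paths through a `can_handle` classmethod. The diff implies a first-match dispatch over that list; a minimal sketch of the idea follows, under that assumption. `get_file`, `PlainFile`, and `PngFile` here are illustrative stand-ins, not dupeGuru's real implementations:

    import os.path
    
    class PlainFile(object):
        def __init__(self, path):
            self.path = path
        
        @classmethod
        def can_handle(cls, path):
            return os.path.isfile(path)
    
    class PngFile(PlainFile):
        @classmethod
        def can_handle(cls, path):
            return PlainFile.can_handle(path) and path.lower().endswith('.png')
    
    def get_file(path, fileclasses):
        # First class to claim the path wins, so order most-specific first,
        # mirroring fileclasses=[Bundle, fs.File] in the SE diff below.
        for fileclass in fileclasses:
            if fileclass.can_handle(path):
                return fileclass(path)
        return None # no class claimed the path
    
    f = get_file('/tmp/photo.png', [PngFile, PlainFile])
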
@@ -1,6 +1,6 @@
 # -*- mode: python -*-
 a = Analysis([os.path.join(HOMEPATH,'support\\_mountzlib.py'), os.path.join(HOMEPATH,'support\\useUnicode.py'), 'start.py'],
-             pathex=['C:\\src\\dupeguru\\pe\\qt'])
+             pathex=[])
 pyz = PYZ(a.pure)
 exe = EXE(pyz,
           a.scripts,
@@ -16,6 +16,7 @@ from hsutil.build import print_and_do, build_all_qt_ui
 build_all_qt_ui(op.join('qtlib', 'ui'))
 build_all_qt_ui('base')
 build_all_qt_ui('.')
+print_and_do("pyrcc4 base\\dg.qrc > base\\dg_rc.py")
 
 def move(src, dst):
     if not op.exists(src):
@@ -23,6 +23,6 @@ class MainWindow(MainWindowBase):
         title = "Clear Picture Cache"
         msg = "Do you really want to remove all your cached picture analysis?"
         if self._confirm(title, msg, QMessageBox.No):
-            self.app.scanner.match_factory.cached_blocks.clear()
+            self.app.scanner.cached_blocks.clear()
             QMessageBox.information(self, title, "Picture cache cleared.")
 
@@ -14,6 +14,9 @@ import base.dg_rc
 
 from app import DupeGuru
 
+# This is a workaround for a pyinstaller problem where compiled dupeguru can't read tiff files
+from PIL import TiffImagePlugin, TiffTags
+
 if __name__ == "__main__":
     app = QApplication(sys.argv)
     app.setWindowIcon(QIcon(QPixmap(":/logo_pe")))
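
The forced `TiffImagePlugin` import exists because PyInstaller's static analysis misses PIL's dynamically loaded format plugins, so they never end up in the frozen build. An alternative, assuming the PyInstaller version in use supports it, would be to declare them in the spec file's `hiddenimports` instead of importing them in `start.py`:

    a = Analysis([os.path.join(HOMEPATH, 'support\\_mountzlib.py'),
                  os.path.join(HOMEPATH, 'support\\useUnicode.py'), 'start.py'],
                 pathex=[],
                 hiddenimports=['PIL.TiffImagePlugin', 'PIL.TiffTags'])
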
@@ -8,12 +8,12 @@
 import objc
 from AppKit import *
 
-from dupeguru import app_se_cocoa, scanner
+from dupeguru_se.app_cocoa import DupeGuru
+from dupeguru import scanner
 
 # Fix py2app imports with chokes on relative imports
-from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, scanner
-from hsfs import auto, stats, tree
-from hsfs.phys import bundle
+from dupeguru_se import fs, data
+from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, fs
 from hsutil import conflict
 
 class PyApp(NSObject):
@@ -22,7 +22,7 @@ class PyApp(NSObject):
 class PyDupeGuru(PyApp):
     def init(self):
         self = super(PyDupeGuru,self).init()
-        self.app = app_se_cocoa.DupeGuru()
+        self.app = DupeGuru()
         return self
     
     #---Directories
se/py/LICENSE | 11 (new file)
@@ -0,0 +1,11 @@
+Copyright 2009 Hardcoded Software Inc. (http://www.hardcoded.net)
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+* Neither the name of Hardcoded Software Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+* If the source code has been published less than two years ago, any redistribution, in whole or in part, must retain full licensing functionality, without any attempt to change, obscure or in other ways circumvent its intent.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
se/py/__init__.py | 1 (new file)
@@ -0,0 +1 @@
+
@@ -11,14 +11,15 @@ import logging
 
 from AppKit import *
 
-from hsfs.phys import Directory as DirectoryBase
-from hsfs.phys.bundle import Bundle
+from hsutil import io
 from hsutil.path import Path
-from hsutil.misc import extract
 from hsutil.str import get_file_ext
 
-from . import app_cocoa, data
-from .directories import Directories as DirectoriesBase, STATE_EXCLUDED
+from dupeguru import fs
+from dupeguru.app_cocoa import DupeGuru as DupeGuruBase
+from dupeguru.directories import Directories as DirectoriesBase, STATE_EXCLUDED
+from . import data
+from .fs import Bundle as BundleBase
 
 if NSWorkspace.sharedWorkspace().respondsToSelector_('typeOfFile:error:'): # Only from 10.5
     def is_bundle(str_path):
@@ -31,27 +32,17 @@ else: # Tiger
     def is_bundle(str_path): # just return a list of a few known bundle extensions.
         return get_file_ext(str_path) in ('app', 'pages', 'numbers')
 
-class DGDirectory(DirectoryBase):
-    def _create_sub_file(self, name, with_parent=True):
-        if is_bundle(unicode(self.path + name)):
-            parent = self if with_parent else None
-            return Bundle(parent, name)
-        else:
-            return super(DGDirectory, self)._create_sub_file(name, with_parent)
-    
-    def _fetch_subitems(self):
-        subdirs, subfiles = super(DGDirectory, self)._fetch_subitems()
-        apps, normal_dirs = extract(lambda name: is_bundle(unicode(self.path + name)), subdirs)
-        subfiles += apps
-        return normal_dirs, subfiles
-
+class Bundle(BundleBase):
+    @classmethod
+    def can_handle(cls, path):
+        return not io.islink(path) and io.isdir(path) and is_bundle(unicode(path))
 
 class Directories(DirectoriesBase):
     ROOT_PATH_TO_EXCLUDE = map(Path, ['/Library', '/Volumes', '/System', '/bin', '/sbin', '/opt', '/private', '/dev'])
     HOME_PATH_TO_EXCLUDE = [Path('Library')]
     def __init__(self):
-        DirectoriesBase.__init__(self)
-        self.dirclass = DGDirectory
+        DirectoriesBase.__init__(self, fileclasses=[Bundle, fs.File])
     
     def _default_state_for_path(self, path):
         result = DirectoriesBase._default_state_for_path(self, path)
@@ -63,8 +54,8 @@ class Directories(DirectoriesBase):
         return STATE_EXCLUDED
 
 
-class DupeGuru(app_cocoa.DupeGuru):
+class DupeGuru(DupeGuruBase):
     def __init__(self):
-        app_cocoa.DupeGuru.__init__(self, data, 'dupeGuru', appid=4)
+        DupeGuruBase.__init__(self, data, 'dupeGuru', appid=4)
         self.directories = Directories()
 
se/py/data.py | 72 (new file)
@@ -0,0 +1,72 @@
+# Created By: Virgil Dupras
+# Created On: 2006/03/15
+# $Id$
+# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
+#
+# This software is licensed under the "HS" License as described in the "LICENSE" file,
+# which should be included with this package. The terms are also available at
+# http://www.hardcoded.net/licenses/hs_license
+
+from hsutil.str import format_size
+from dupeguru.data import (format_path, format_timestamp, format_words, format_perc,
+    format_dupe_count, cmp_value)
+
+COLUMNS = [
+    {'attr':'name','display':'Filename'},
+    {'attr':'path','display':'Directory'},
+    {'attr':'size','display':'Size (KB)'},
+    {'attr':'extension','display':'Kind'},
+    {'attr':'ctime','display':'Creation'},
+    {'attr':'mtime','display':'Modification'},
+    {'attr':'percentage','display':'Match %'},
+    {'attr':'words','display':'Words Used'},
+    {'attr':'dupe_count','display':'Dupe Count'},
+]
+
+METADATA_TO_READ = ['size', 'ctime', 'mtime']
+
+def GetDisplayInfo(dupe, group, delta):
+    size = dupe.size
+    ctime = dupe.ctime
+    mtime = dupe.mtime
+    m = group.get_match_of(dupe)
+    if m:
+        percentage = m.percentage
+        dupe_count = 0
+        if delta:
+            r = group.ref
+            size -= r.size
+            ctime -= r.ctime
+            mtime -= r.mtime
+    else:
+        percentage = group.percentage
+        dupe_count = len(group.dupes)
+    return [
+        dupe.name,
+        format_path(dupe.path),
+        format_size(size, 0, 1, False),
+        dupe.extension,
+        format_timestamp(ctime, delta and m),
+        format_timestamp(mtime, delta and m),
+        format_perc(percentage),
+        format_words(dupe.words) if hasattr(dupe, 'words') else '',
+        format_dupe_count(dupe_count)
+    ]
+
+def GetDupeSortKey(dupe, get_group, key, delta):
+    if key == 6:
+        m = get_group().get_match_of(dupe)
+        return m.percentage
+    if key == 8:
+        return 0
+    r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
+    if delta and (key in (2, 4, 5)):
+        r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
+    return r
+
+def GetGroupSortKey(group, key):
+    if key == 6:
+        return group.percentage
+    if key == 8:
+        return len(group)
+    return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))
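
The sort-key helpers index into `COLUMNS` by position: key 6 (match percentage) and key 8 (dupe count) have no per-file attribute and are special-cased, while for size, ctime, and mtime (keys 2, 4, 5) delta mode sorts by the difference from the group's reference file. A worked example with made-up values:

    # Hypothetical values, for illustration only:
    # dupe.size == 15, group.ref.size == 12, key == 2 (the 'size' column)
    # GetDupeSortKey(dupe, get_group, 2, delta=True)
    #     -> cmp_value(15) - cmp_value(12) == 3
    # i.e. in delta mode, dupes sort by how much they differ from the reference.
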
se/py/fs.py | 43 (new file)
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+# Created By: Virgil Dupras
+# Created On: 2009-10-23
+# $Id$
+# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
+#
+# This software is licensed under the "HS" License as described in the "LICENSE" file,
+# which should be included with this package. The terms are also available at
+# http://www.hardcoded.net/licenses/hs_license
+
+import hashlib
+
+from hsutil import io
+from hsutil.misc import nonone
+
+from dupeguru import fs
+
+class Bundle(fs.File):
+    """This class is for Mac OSX bundles (.app). Bundles are seen by the OS as
+    normal directories, but I don't want that in dupeGuru. I want dupeGuru
+    to see them as files.
+    """
+    def _read_info(self, field):
+        if field in ('size', 'ctime', 'mtime'):
+            files = fs.get_all_files(self.path)
+            size = sum((file.size for file in files), 0)
+            self.size = size
+            stats = io.stat(self.path)
+            self.ctime = nonone(stats.st_ctime, 0)
+            self.mtime = nonone(stats.st_mtime, 0)
+        elif field in ('md5', 'md5partial'):
+            # What's sensitive here is that we must make sure that subfiles'
+            # md5 are always added up in the same order, but we also want a
+            # different md5 if a file gets moved in a different subdirectory.
+            def get_dir_md5_concat():
+                files = fs.get_all_files(self.path)
+                files.sort(key=lambda f:f.path)
+                md5s = [getattr(f, field) for f in files]
+                return ''.join(md5s)
+            
+            md5 = hashlib.md5(get_dir_md5_concat())
+            digest = md5.digest()
+            setattr(self, field, digest)
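
The sort in `get_dir_md5_concat` is what makes a bundle's digest deterministic: concatenating the same sub-digests in a different order hashes to a different value, so sorting by path pins the order down while still letting the digest change when a file moves to another subdirectory. A toy illustration, with made-up stand-in digests:

    import hashlib
    
    sub_md5s = {'/b/file2': 'bbbb', '/a/file1': 'aaaa'} # made-up sub-file digests
    ordered = ''.join(sub_md5s[p] for p in sorted(sub_md5s))
    # Stable no matter what order the filesystem returned the files in:
    print(hashlib.md5(ordered.encode('ascii')).hexdigest())
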
se/py/tests/__init__.py | 0 (new file)

se/py/tests/fs_test.py | 48 (new file)
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+# Created By: Virgil Dupras
+# Created On: 2009-10-23
+# $Id$
+# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
+#
+# This software is licensed under the "HS" License as described in the "LICENSE" file,
+# which should be included with this package. The terms are also available at
+# http://www.hardcoded.net/licenses/hs_license
+
+import hashlib
+
+from nose.tools import eq_
+
+from hsutil.testcase import TestCase
+from dupeguru.fs import File
+from dupeguru.tests.directories_test import create_fake_fs
+
+from .. import fs
+
+class TCBundle(TestCase):
+    def test_size_aggregates_subfiles(self):
+        p = create_fake_fs(self.tmppath())
+        b = fs.Bundle(p)
+        eq_(b.size, 12)
+    
+    def test_md5_aggregate_subfiles_sorted(self):
+        #dir.allfiles can return child in any order. Thus, bundle.md5 must aggregate
+        #all files' md5 it contains, but it must make sure that it does so in the
+        #same order everytime.
+        p = create_fake_fs(self.tmppath())
+        b = fs.Bundle(p)
+        md5s = File(p + ('dir1', 'file1.test')).md5
+        md5s += File(p + ('dir2', 'file2.test')).md5
+        md5s += File(p + ('dir3', 'file3.test')).md5
+        md5s += File(p + 'file1.test').md5
+        md5s += File(p + 'file2.test').md5
+        md5s += File(p + 'file3.test').md5
+        md5 = hashlib.md5(md5s)
+        eq_(b.md5, md5.digest())
+    
+    def test_has_file_attrs(self):
+        #a Bundle must behave like a file, so it must have ctime and mtime attributes
+        b = fs.Bundle(self.tmppath())
+        assert b.mtime > 0
+        assert b.ctime > 0
+        eq_(b.extension, '')
@@ -7,7 +7,7 @@
 # which should be included with this package. The terms are also available at
 # http://www.hardcoded.net/licenses/hs_license
 
-from dupeguru import data
+from dupeguru_se import data
 from dupeguru.directories import Directories as DirectoriesBase, STATE_EXCLUDED
 
 from base.app import DupeGuru as DupeGuruBase