mirror of https://github.com/arsenetar/dupeguru.git synced 2026-01-25 16:11:39 +00:00

Compare commits


22 Commits

Author SHA1 Message Date
hsoft
911521d8e0 dgpe qt: build related fixes.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40217
2009-10-24 16:30:37 +00:00
hsoft
b25c1c3a3b Added dgpe 1.7.8 to the changelog.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40215
2009-10-24 14:18:36 +00:00
hsoft
37a40040b3 [#73 state:port] Fixed a bug causing some matches to be ignored in the new pe match algo.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40212
2009-10-24 13:54:57 +00:00
hsoft
25dadc83eb dgpe cocoa: adjusted to hsfs removal.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40210
2009-10-24 12:21:39 +00:00
hsoft
b8c11b5aae dgpe cocoa: removed hsfs from externals.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40209
2009-10-24 12:21:09 +00:00
hsoft
a3ab314378 dgpe qt: adjusted to the hsfs move.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40208
2009-10-23 15:04:37 +00:00
hsoft
794192835d dgme cocoa: added dupeguru_me external and removed the hsfs one.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40207
2009-10-23 14:46:00 +00:00
hsoft
385768a69b dgme qt: adjusted code to the hsfs move.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40206
2009-10-23 14:35:51 +00:00
hsoft
a281931b16 dgme qt: added the dupeguru_me external and removed the hsfs one.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40205
2009-10-23 14:34:59 +00:00
hsoft
085311d559 Added the folder me/py
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40204
2009-10-23 14:05:06 +00:00
hsoft
4d7f032889 dgse cocoa: fixed quirks created by the hsfs move.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40203
2009-10-23 13:46:18 +00:00
hsoft
cf44c93013 dgse cocoa: added the dupeguru_se external and removed the hsfs one.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40202
2009-10-23 13:45:15 +00:00
hsoft
787cbcd01f dgse qt: removed hsfs external
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40201
2009-10-23 12:59:29 +00:00
hsoft
b2b316b642 dgse qt: removed all hsfs usages.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40200
2009-10-23 12:56:52 +00:00
hsoft
49165125e4 dg se: Moved se-specific code from dupeguru to dupeguru_se.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40199
2009-10-23 08:19:48 +00:00
hsoft
54ac0fd19e dg qt: oops, *now* I added the external ref.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40198
2009-10-23 08:19:02 +00:00
hsoft
0aff7f16e5 dg qt: Added the dupeguru_se external.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40197
2009-10-23 08:17:35 +00:00
hsoft
f9abc3b35d Added a dupeguru_se sub-package.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40196
2009-10-23 08:02:43 +00:00
hsoft
b167a51243 Added dupeguru.fs, which is a simpler fork of hsfs and aims to replace it in the dupeguru project.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40195
2009-10-22 15:23:32 +00:00
hsoft
371cdda911 dgpe cocoa: adjusted to MatchFactory removal.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40194
2009-10-18 09:29:33 +00:00
hsoft
11977c6533 dgpe: adjusted to the MatchFactory removal.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40193
2009-10-18 09:26:04 +00:00
hsoft
7228adf433 Changed the MatchFactory into a simple getmatch method, and added a separate getmatches_by_contents() method for contents scan, which results in faster and less memory hungry scans.
--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40192
2009-10-18 08:46:00 +00:00
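
The last commit above (r192) replaces the MatchFactory class with plain module-level functions. A minimal sketch of the call-site change, using only the signatures that appear in the engine.py diff below; the DummyFile stand-in is hypothetical, for illustration:

from dupeguru import engine

class DummyFile(object):
    def __init__(self, name):
        self.name = name  # getmatches() derives .words from .name when missing

files = [DummyFile("foo bar"), DummyFile("bar bleh")]

# Before r192: configure a factory instance, then call its method.
#   mf = engine.MatchFactory()
#   mf.min_match_percentage = 50
#   matches = mf.getmatches(files)

# After r192: the options become keyword arguments of a module-level function.
matches = engine.getmatches(files, min_match_percentage=50)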
44 changed files with 1168 additions and 705 deletions

View File

@@ -14,13 +14,13 @@ import os
import os.path as op
import logging
from hsutil import job, io, files
from hsutil import io, files
from hsutil.path import Path
from hsutil.reg import RegistrableApplication, RegistrationRequired
from hsutil.misc import flatten, first
from hsutil.str import escape
from . import directories, results, scanner, export
from . import directories, results, scanner, export, fs
JOB_SCAN = 'job_scan'
JOB_LOAD = 'job_load'
@@ -98,13 +98,8 @@ class DupeGuru(RegistrableApplication):
return ['---'] * len(self.data.COLUMNS)
def _get_file(self, str_path):
p = Path(str_path)
for d in self.directories:
if p not in d.path:
continue
result = d.find_path(p[d.path:])
if result is not None:
return result
path = Path(str_path)
return fs.get_file(path, self.directories.fileclasses)
@staticmethod
def _recycle_dupe(dupe):
@@ -150,7 +145,7 @@ class DupeGuru(RegistrableApplication):
2 = absolute re-creation.
"""
source_path = dupe.path
location_path = dupe.root.path
location_path = first(p for p in self.directories if dupe.path in p)
dest_path = Path(destination)
if dest_type == 2:
dest_path = dest_path + source_path[1:-1] #Remove drive letter and filename

View File

@@ -12,13 +12,12 @@ from AppKit import *
import logging
import os.path as op
import hsfs as fs
from hsutil import io, cocoa, job
from hsutil.cocoa import install_exception_hook
from hsutil.misc import stripnone
from hsutil.reg import RegistrationRequired
import app, data
from . import app, fs
JOBID2TITLE = {
app.JOB_SCAN: "Scanning for duplicates",
@@ -43,8 +42,6 @@ class DupeGuru(app.DupeGuru):
logging.basicConfig(level=LOGGING_LEVEL, format='%(levelname)s %(message)s')
logging.debug('started in debug mode')
install_exception_hook()
if data_module is None:
data_module = data
appsupport = NSSearchPathForDirectoriesInDomains(NSApplicationSupportDirectory, NSUserDomainMask, True)[0]
appdata = op.join(appsupport, appdata_subdir)
app.DupeGuru.__init__(self, data_module, appdata, appid)
@@ -91,15 +88,15 @@ class DupeGuru(app.DupeGuru):
except IndexError:
return (None,None)
def GetDirectory(self,node_path,curr_dir=None):
def get_folder_path(self, node_path, curr_path=None):
if not node_path:
return curr_dir
if curr_dir is not None:
l = curr_dir.dirs
return curr_path
current_index = node_path[0]
if curr_path is None:
curr_path = self.directories[current_index]
else:
l = self.directories
d = l[node_path[0]]
return self.GetDirectory(node_path[1:],d)
curr_path = self.directories.get_subfolders(curr_path)[current_index]
return self.get_folder_path(node_path[1:], curr_path)
def RefreshDetailsTable(self,dupe,group):
l1 = self._get_display_info(dupe, group, False)
@@ -146,13 +143,13 @@ class DupeGuru(app.DupeGuru):
def RemoveSelected(self):
self.results.remove_duplicates(self.selected_dupes)
def RenameSelected(self,newname):
def RenameSelected(self, newname):
try:
d = self.selected_dupes[0]
d = d.move(d.parent,newname)
d.rename(newname)
return True
except (IndexError,fs.FSError),e:
logging.warning("dupeGuru Warning: %s" % str(e))
except (IndexError, fs.FSError) as e:
logging.warning("dupeGuru Warning: %s" % unicode(e))
return False
def RevealSelected(self):
@@ -214,9 +211,9 @@ class DupeGuru(app.DupeGuru):
self.results.dupes[row] for row in rows if row in xrange(len(self.results.dupes))
]
def SetDirectoryState(self,node_path,state):
d = self.GetDirectory(node_path)
self.directories.set_state(d.path,state)
def SetDirectoryState(self, node_path, state):
p = self.get_folder_path(node_path)
self.directories.set_state(p, state)
def sort_dupes(self,key,asc):
self.results.sort_dupes(key,asc,self.display_delta_values)
@@ -245,8 +242,12 @@ class DupeGuru(app.DupeGuru):
return [len(g.dupes) for g in self.results.groups]
elif tag == 1: #Directories
try:
dirs = self.GetDirectory(node_path).dirs if node_path else self.directories
return [d.dircount for d in dirs]
if node_path:
path = self.get_folder_path(node_path)
subfolders = self.directories.get_subfolders(path)
else:
subfolders = self.directories
return [len(self.directories.get_subfolders(path)) for path in subfolders]
except IndexError: # node_path out of range
return []
else: #Power Marker
@@ -270,8 +271,9 @@ class DupeGuru(app.DupeGuru):
return result
elif tag == 1: #Directories
try:
d = self.GetDirectory(node_path)
return [d.name, self.directories.get_state(d.path)]
path = self.get_folder_path(node_path)
name = unicode(path) if len(node_path) == 1 else path[-1]
return [name, self.directories.get_state(path)]
except IndexError: # node_path out of range
return []

View File

@@ -40,63 +40,3 @@ def format_dupe_count(c):
def cmp_value(value):
return value.lower() if isinstance(value, basestring) else value
COLUMNS = [
{'attr':'name','display':'Filename'},
{'attr':'path','display':'Directory'},
{'attr':'size','display':'Size (KB)'},
{'attr':'extension','display':'Kind'},
{'attr':'ctime','display':'Creation'},
{'attr':'mtime','display':'Modification'},
{'attr':'percentage','display':'Match %'},
{'attr':'words','display':'Words Used'},
{'attr':'dupe_count','display':'Dupe Count'},
]
METADATA_TO_READ = ['size', 'ctime', 'mtime']
def GetDisplayInfo(dupe, group, delta):
size = dupe.size
ctime = dupe.ctime
mtime = dupe.mtime
m = group.get_match_of(dupe)
if m:
percentage = m.percentage
dupe_count = 0
if delta:
r = group.ref
size -= r.size
ctime -= r.ctime
mtime -= r.mtime
else:
percentage = group.percentage
dupe_count = len(group.dupes)
return [
dupe.name,
format_path(dupe.path),
format_size(size, 0, 1, False),
dupe.extension,
format_timestamp(ctime, delta and m),
format_timestamp(mtime, delta and m),
format_perc(percentage),
format_words(dupe.words),
format_dupe_count(dupe_count)
]
def GetDupeSortKey(dupe, get_group, key, delta):
if key == 6:
m = get_group().get_match_of(dupe)
return m.percentage
if key == 8:
return 0
r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
if delta and (key in (2, 4, 5)):
r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
return r
def GetGroupSortKey(group, key):
if key == 6:
return group.percentage
if key == 8:
return len(group)
return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))

View File

@@ -9,11 +9,12 @@
import xml.dom.minidom
from hsfs import phys
import hsfs as fs
from hsutil import io
from hsutil.files import FileOrPath
from hsutil.path import Path
from . import fs
(STATE_NORMAL,
STATE_REFERENCE,
STATE_EXCLUDED) = range(3)
@@ -26,15 +27,14 @@ class InvalidPathError(Exception):
class Directories(object):
#---Override
def __init__(self):
def __init__(self, fileclasses=[fs.File]):
self._dirs = []
self.states = {}
self.dirclass = phys.Directory
self.special_dirclasses = {}
self.fileclasses = fileclasses
def __contains__(self,path):
for d in self._dirs:
if path in d.path:
def __contains__(self, path):
for p in self._dirs:
if path in p:
return True
return False
@@ -53,8 +53,7 @@ class Directories(object):
if path[-1].startswith('.'): # hidden
return STATE_EXCLUDED
def _get_files(self, from_dir):
from_path = from_dir.path
def _get_files(self, from_path):
state = self.get_state(from_path)
if state == STATE_EXCLUDED:
# Recursively getting files from folders with lots of subfolders is expensive. However, there
@@ -62,14 +61,21 @@ class Directories(object):
# through self.states and see if we must continue, or we can stop right here to save time
if not any(p[:len(from_path)] == from_path for p in self.states):
return
result = []
for subdir in from_dir.dirs:
for file in self._get_files(subdir):
yield file
if state != STATE_EXCLUDED:
for file in from_dir.files:
file.is_ref = state == STATE_REFERENCE
yield file
try:
filepaths = set()
if state != STATE_EXCLUDED:
for file in fs.get_files(from_path, fileclasses=self.fileclasses):
file.is_ref = state == STATE_REFERENCE
filepaths.add(file.path)
yield file
subpaths = [from_path + name for name in io.listdir(from_path)]
# it's possible that a folder (bundle) gets into the file list. in that case, we don't want to recurse into it
subfolders = [p for p in subpaths if not io.islink(p) and io.isdir(p) and p not in filepaths]
for subfolder in subfolders:
for file in self._get_files(subfolder):
yield file
except (EnvironmentError, fs.InvalidPath):
pass
#---Public
def add_path(self, path):
@@ -80,29 +86,30 @@ class Directories(object):
under it will be removed. Can also raise InvalidPathError if 'path' does not exist.
"""
if path in self:
raise AlreadyThereError
self._dirs = [d for d in self._dirs if d.path not in path]
try:
dirclass = self.special_dirclasses.get(path, self.dirclass)
d = dirclass(None, unicode(path))
d[:] #If an InvalidPath exception has to be raised, it will be raised here
self._dirs.append(d)
return d
except fs.InvalidPath:
raise AlreadyThereError()
if not io.exists(path):
raise InvalidPathError()
self._dirs = [p for p in self._dirs if p not in path]
self._dirs.append(path)
@staticmethod
def get_subfolders(path):
"""returns a sorted list of paths corresponding to subfolders in `path`"""
try:
names = [name for name in io.listdir(path) if io.isdir(path + name)]
names.sort(key=lambda x:x.lower())
return [path + name for name in names]
except EnvironmentError:
return []
def get_files(self):
"""Returns a list of all files that are not excluded.
Returned files also have their 'is_ref' attr set.
"""
for d in self._dirs:
d.force_update()
try:
for file in self._get_files(d):
yield file
except fs.InvalidPath:
pass
for path in self._dirs:
for file in self._get_files(path):
yield file
def get_state(self, path):
"""Returns the state of 'path' (One of the STATE_* const.)
@@ -123,8 +130,8 @@ class Directories(object):
doc = xml.dom.minidom.parse(infile)
except:
return
root_dir_nodes = doc.getElementsByTagName('root_directory')
for rdn in root_dir_nodes:
root_path_nodes = doc.getElementsByTagName('root_directory')
for rdn in root_path_nodes:
if not rdn.getAttributeNode('path'):
continue
path = rdn.getAttributeNode('path').nodeValue
@@ -144,9 +151,9 @@ class Directories(object):
with FileOrPath(outfile, 'wb') as fp:
doc = xml.dom.minidom.Document()
root = doc.appendChild(doc.createElement('directories'))
for root_dir in self:
root_dir_node = root.appendChild(doc.createElement('root_directory'))
root_dir_node.setAttribute('path', unicode(root_dir.path).encode('utf-8'))
for root_path in self:
root_path_node = root.appendChild(doc.createElement('root_directory'))
root_path_node.setAttribute('path', unicode(root_path).encode('utf-8'))
for path, state in self.states.iteritems():
state_node = root.appendChild(doc.createElement('state'))
state_node.setAttribute('path', unicode(path).encode('utf-8'))
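
Put together, the directories.py changes above turn Directories into a collection of plain Path objects that is walked lazily at scan time, instead of a tree of hsfs directory wrappers. A minimal usage sketch, assuming only the methods shown in this diff; the '/photos' paths are hypothetical:

from hsutil.path import Path
from dupeguru.directories import Directories, STATE_REFERENCE, STATE_EXCLUDED

d = Directories()                                 # fileclasses defaults to [fs.File]
d.add_path(Path('/photos'))                       # raises InvalidPathError if it doesn't exist
d.set_state(Path('/photos/originals'), STATE_REFERENCE)
d.set_state(Path('/photos/cache'), STATE_EXCLUDED)
files = list(d.get_files())                       # recurses, skipping excluded subtrees
# every yielded file has its is_ref attribute set from its folder's state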

View File

@@ -9,6 +9,7 @@
from __future__ import division
import difflib
import itertools
import logging
import string
from collections import defaultdict, namedtuple
@@ -156,58 +157,69 @@ def get_match(first, second, flags=()):
percentage = compare(first.words, second.words, flags)
return Match(first, second, percentage)
class MatchFactory(object):
common_word_threshold = 50
match_similar_words = False
min_match_percentage = 0
weight_words = False
no_field_order = False
limit = 5000000
def getmatches(self, objects, j=job.nulljob):
j = j.start_subjob(2)
sj = j.start_subjob(2)
for o in objects:
if not hasattr(o, 'words'):
o.words = getwords(o.name)
word_dict = build_word_dict(objects, sj)
reduce_common_words(word_dict, self.common_word_threshold)
if self.match_similar_words:
merge_similar_words(word_dict)
match_flags = []
if self.weight_words:
match_flags.append(WEIGHT_WORDS)
if self.match_similar_words:
match_flags.append(MATCH_SIMILAR_WORDS)
if self.no_field_order:
match_flags.append(NO_FIELD_ORDER)
j.start_job(len(word_dict), '0 matches found')
compared = defaultdict(set)
result = []
try:
# This whole 'popping' thing is there to avoid taking too much memory at the same time.
while word_dict:
items = word_dict.popitem()[1]
while items:
ref = items.pop()
compared_already = compared[ref]
to_compare = items - compared_already
compared_already |= to_compare
for other in to_compare:
m = get_match(ref, other, match_flags)
if m.percentage >= self.min_match_percentage:
result.append(m)
if len(result) >= self.limit:
return result
j.add_progress(desc='%d matches found' % len(result))
except MemoryError:
# This is the place where the memory usage is at its peak during the scan.
# Just continue the process with an incomplete list of matches.
del compared # This should give us enough room to call logging.
logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
return result
def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False,
no_field_order=False, j=job.nulljob):
COMMON_WORD_THRESHOLD = 50
LIMIT = 5000000
j = j.start_subjob(2)
sj = j.start_subjob(2)
for o in objects:
if not hasattr(o, 'words'):
o.words = getwords(o.name)
word_dict = build_word_dict(objects, sj)
reduce_common_words(word_dict, COMMON_WORD_THRESHOLD)
if match_similar_words:
merge_similar_words(word_dict)
match_flags = []
if weight_words:
match_flags.append(WEIGHT_WORDS)
if match_similar_words:
match_flags.append(MATCH_SIMILAR_WORDS)
if no_field_order:
match_flags.append(NO_FIELD_ORDER)
j.start_job(len(word_dict), '0 matches found')
compared = defaultdict(set)
result = []
try:
# This whole 'popping' thing is there to avoid taking too much memory at the same time.
while word_dict:
items = word_dict.popitem()[1]
while items:
ref = items.pop()
compared_already = compared[ref]
to_compare = items - compared_already
compared_already |= to_compare
for other in to_compare:
m = get_match(ref, other, match_flags)
if m.percentage >= min_match_percentage:
result.append(m)
if len(result) >= LIMIT:
return result
j.add_progress(desc='%d matches found' % len(result))
except MemoryError:
# This is the place where the memory usage is at its peak during the scan.
# Just continue the process with an incomplete list of matches.
del compared # This should give us enough room to call logging.
logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
return result
return result
def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob):
j = j.start_subjob([2, 8])
size2files = defaultdict(set)
for file in j.iter_with_progress(files, 'Read size of %d/%d files'):
size2files[getattr(file, sizeattr)].add(file)
possible_matches = [files for files in size2files.values() if len(files) > 1]
del size2files
result = []
j.start_job(len(possible_matches), '0 matches found')
for group in possible_matches:
for first, second in itertools.combinations(group, 2):
if first.md5partial == second.md5partial:
if partial or first.md5 == second.md5:
result.append(Match(first, second, 100))
j.add_progress(desc='%d matches found' % len(result))
return result
class Group(object):
#---Override
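
The new getmatches_by_contents() above replaces the word-based pre-pass for content scans: files are bucketed by size, and digests are compared only within same-sized buckets. A brief sketch of a direct call, assuming fs.File objects from the new module below; the paths are hypothetical:

from hsutil.path import Path
from dupeguru import engine, fs

f1 = fs.get_file(Path('/photos/a.jpg'))
f2 = fs.get_file(Path('/photos/b.jpg'))
# md5partial (a 16KB slice of each file) is checked first; the full md5
# is computed only when partial=False and the partial digests agree.
matches = engine.getmatches_by_contents([f1, f2])
assert all(m.percentage == 100 for m in matches)  # content matches are always 100%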

base/py/fs.py (new file, 178 lines)
View File

@@ -0,0 +1,178 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-22
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
# This is a fork from hsfs. The reason for this fork is that hsfs has been designed for musicGuru
# and was re-used for dupeGuru. The problem is that hsfs is way over-engineered for dupeGuru,
# resulting in needless complexity and memory usage. It's been a while since I wanted to do that fork,
# and I'm doing it now.
from __future__ import unicode_literals
import hashlib
import logging
from hsutil import io
from hsutil.misc import nonone, flatten
from hsutil.str import get_file_ext
class FSError(Exception):
cls_message = "An error has occured on '{name}' in '{parent}'"
def __init__(self, fsobject, parent=None):
message = self.cls_message
if isinstance(fsobject, basestring):
name = fsobject
elif isinstance(fsobject, File):
name = fsobject.name
else:
name = ''
parentname = unicode(parent) if parent is not None else ''
Exception.__init__(self, message.format(name=name, parent=parentname))
class AlreadyExistsError(FSError):
"The directory or file name we're trying to add already exists"
cls_message = "'{name}' already exists in '{parent}'"
class InvalidPath(FSError):
"The path of self is invalid, and cannot be worked with."
cls_message = "'{name}' is invalid."
class InvalidDestinationError(FSError):
"""A copy/move operation has been called, but the destination is invalid."""
cls_message = "'{name}' is an invalid destination for this operation."
class OperationError(FSError):
"""A copy/move/delete operation has been called, but the checkup after the
operation shows that it didn't work."""
cls_message = "Operation on '{name}' failed."
class File(object):
INITIAL_INFO = {
'size': 0,
'ctime': 0,
'mtime': 0,
'md5': '',
'md5partial': '',
}
def __init__(self, path):
self.path = path
#This offset is where we should start reading the file to get a partial md5
#For audio files, it should be where the audio data starts
self._md5partial_offset = 0x4000 #16KB
self._md5partial_size = 0x4000 #16KB
def __getattr__(self, attrname):
# Only called when attr is not there
if attrname in self.INITIAL_INFO:
try:
self._read_info(attrname)
except Exception as e:
logging.warning("An error '%s' was raised while decoding '%s'", e, repr(self.path))
try:
return self.__dict__[attrname]
except KeyError:
return self.INITIAL_INFO[attrname]
raise AttributeError()
def _read_info(self, field):
if field in ('size', 'ctime', 'mtime'):
stats = io.stat(self.path)
self.size = nonone(stats.st_size, 0)
self.ctime = nonone(stats.st_ctime, 0)
self.mtime = nonone(stats.st_mtime, 0)
elif field == 'md5partial':
try:
fp = io.open(self.path, 'rb')
offset = self._md5partial_offset
size = self._md5partial_size
fp.seek(offset)
partialdata = fp.read(size)
md5 = hashlib.md5(partialdata)
self.md5partial = md5.digest()
fp.close()
except Exception:
pass
elif field == 'md5':
try:
fp = io.open(self.path, 'rb')
filedata = fp.read()
md5 = hashlib.md5(filedata)
self.md5 = md5.digest()
fp.close()
except Exception:
pass
def _read_all_info(self, attrnames=None):
"""Cache all possible info.
If `attrnames` is not None, caches only attrnames.
"""
if attrnames is None:
attrnames = self.INITIAL_INFO.keys()
for attrname in attrnames:
if attrname not in self.__dict__:
self._read_info(attrname)
#--- Public
@classmethod
def can_handle(cls, path):
return not io.islink(path) and io.isfile(path)
def rename(self, newname):
if newname == self.name:
return
destpath = self.path[:-1] + newname
if io.exists(destpath):
raise AlreadyExistsError(newname, self.path[:-1])
try:
io.rename(self.path, destpath)
except EnvironmentError:
raise OperationError(self)
if not io.exists(destpath):
raise OperationError(self)
self.path = destpath
#--- Properties
@property
def extension(self):
return get_file_ext(self.name)
@property
def name(self):
return self.path[-1]
def get_file(path, fileclasses=[File]):
for fileclass in fileclasses:
if fileclass.can_handle(path):
return fileclass(path)
def get_files(path, fileclasses=[File]):
assert all(issubclass(fileclass, File) for fileclass in fileclasses)
try:
paths = [path + name for name in io.listdir(path)]
result = []
for path in paths:
file = get_file(path, fileclasses=fileclasses)
if file is not None:
result.append(file)
return result
except EnvironmentError:
raise InvalidPath(path)
def get_all_files(path, fileclasses=[File]):
files = get_files(path, fileclasses=fileclasses)
filepaths = set(f.path for f in files)
subpaths = [path + name for name in io.listdir(path)]
# it's possible that a folder (bundle) gets into the file list. in that case, we don't want to recurse into it
subfolders = [p for p in subpaths if not io.islink(p) and io.isdir(p) and p not in filepaths]
subfiles = flatten(get_all_files(subpath, fileclasses=fileclasses) for subpath in subfolders)
return subfiles + files
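
A note on the File class above: metadata is read lazily via __getattr__, so constructing a File does no I/O, and each INITIAL_INFO attribute is computed and cached on first access. A minimal sketch under that assumption (path hypothetical):

from hsutil.path import Path
from dupeguru import fs

f = fs.File(Path('/photos/a.jpg'))  # no disk access yet
f.size                              # first access stats the file (size, ctime, mtime)
f.md5partial                        # hashes a 16KB slice starting at offset 16KB
f.md5                               # full-content digest, computed only on demand
f.rename('b.jpg')                   # AlreadyExistsError if 'b.jpg' is already there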

View File

@@ -32,40 +32,32 @@ class Scanner(object):
self.ignore_list = IgnoreList()
self.discarded_file_count = 0
@staticmethod
def _filter_matches_by_content(matches, partial, j):
matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
md5attrname = 'md5partial' if partial else 'md5'
md5 = lambda f: getattr(f, md5attrname)
for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'):
md5(matched_file)
j.set_progress(100, 'Removing false matches')
return [m for m in matches if md5(m.first) == md5(m.second)]
def _getmatches(self, files, j):
j = j.start_subjob(2)
mf = engine.MatchFactory()
if self.scan_type != SCAN_TYPE_CONTENT:
mf.match_similar_words = self.match_similar_words
mf.weight_words = self.word_weighting
mf.min_match_percentage = self.min_match_percentage
if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
self.scan_type = SCAN_TYPE_FIELDS
mf.no_field_order = True
func = {
SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
SCAN_TYPE_CONTENT: lambda f: [str(f.size)],
SCAN_TYPE_CONTENT_AUDIO: lambda f: [str(f.audiosize)]
}[self.scan_type]
for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
if self.size_threshold:
f.size # pre-read, makes a smoother progress if read here (especially for bundles)
f.words = func(f)
if self.size_threshold:
j = j.start_subjob([2, 8])
for f in j.iter_with_progress(files, 'Read size of %d/%d files'):
f.size # pre-read, makes a smoother progress if read here (especially for bundles)
files = [f for f in files if f.size >= self.size_threshold]
return mf.getmatches(files, j)
if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
sizeattr = 'size' if self.scan_type == SCAN_TYPE_CONTENT else 'audiosize'
return engine.getmatches_by_contents(files, sizeattr, partial=self.scan_type==SCAN_TYPE_CONTENT_AUDIO, j=j)
else:
j = j.start_subjob([2, 8])
kw = {}
kw['match_similar_words'] = self.match_similar_words
kw['weight_words'] = self.word_weighting
kw['min_match_percentage'] = self.min_match_percentage
if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
self.scan_type = SCAN_TYPE_FIELDS
kw['no_field_order'] = True
func = {
SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
}[self.scan_type]
for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
f.words = func(f)
return engine.getmatches(files, j=j, **kw)
@staticmethod
def _key_func(dupe):
@@ -86,10 +78,7 @@ class Scanner(object):
for f in [f for f in files if not hasattr(f, 'is_ref')]:
f.is_ref = False
logging.info('Getting matches')
if self.match_factory is None:
matches = self._getmatches(files, j)
else:
matches = self.match_factory.getmatches(files, j)
matches = self._getmatches(files, j)
logging.info('Found %d matches' % len(matches))
if not self.mix_file_kind:
j.set_progress(100, 'Removing false matches')
@@ -99,14 +88,6 @@ class Scanner(object):
iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
matches = [m for m in iter_matches
if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
j = j.start_subjob(3 if self.scan_type == SCAN_TYPE_CONTENT else 2)
matches = self._filter_matches_by_content(matches, partial=True, j=j)
if self.scan_type == SCAN_TYPE_CONTENT:
matches = self._filter_matches_by_content(matches, partial=False, j=j)
# We compared md5. No words were involved.
for m in matches:
m.first.words = m.second.words = ['--']
logging.info('Grouping matches')
groups = engine.get_groups(matches, j)
matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
@@ -118,7 +99,6 @@ class Scanner(object):
g.prioritize(self._key_func, self._tie_breaker)
return groups
match_factory = None
match_similar_words = False
min_match_percentage = 80
mix_file_kind = True
@@ -126,9 +106,3 @@ class Scanner(object):
scanned_tags = set(['artist', 'title'])
size_threshold = 0
word_weighting = False
class ScannerME(Scanner): # Scanner for Music Edition
@staticmethod
def _key_func(dupe):
return (not dupe.is_ref, -dupe.bitrate, -dupe.size)
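
With the dispatch above, content and audio-content scans route straight to engine.getmatches_by_contents(), while word scans assemble keyword arguments for engine.getmatches(). A minimal sketch of driving a scan end to end, assuming the Scanner attributes shown in this diff and the GetDupeGroups() entry point exercised by the tests further down:

from dupeguru.scanner import Scanner, SCAN_TYPE_CONTENT

s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
groups = s.GetDupeGroups(files)    # `files` from Directories.get_files(), as sketched earlier
for g in groups:
    ref, dupes = g.ref, g.dupes    # the group's reference file and its duplicates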

View File

@@ -18,10 +18,10 @@ from hsutil.path import Path
from hsutil.testcase import TestCase
from hsutil.decorators import log_calls
from hsutil import io
import hsfs.phys
from . import data
from .results_test import GetTestGroups
from .. import engine, data
from .. import engine, fs
try:
from ..app_cocoa import DupeGuru as DupeGuruBase
except ImportError:
@@ -35,7 +35,6 @@ class DupeGuru(DupeGuruBase):
def _start_job(self, jobid, func):
func(nulljob)
def r2np(rows):
#Transforms a list of rows [1,2,3] into a list of node paths [[1],[2],[3]]
return [[i] for i in rows]
@@ -310,15 +309,15 @@ class TCDupeGuru(TestCase):
class TCDupeGuru_renameSelected(TestCase):
def setUp(self):
p = Path(tempfile.mkdtemp())
fp = open(str(p + 'foo bar 1'),mode='w')
p = self.tmppath()
fp = open(unicode(p + 'foo bar 1'),mode='w')
fp.close()
fp = open(str(p + 'foo bar 2'),mode='w')
fp = open(unicode(p + 'foo bar 2'),mode='w')
fp.close()
fp = open(str(p + 'foo bar 3'),mode='w')
fp = open(unicode(p + 'foo bar 3'),mode='w')
fp.close()
refdir = hsfs.phys.Directory(None,str(p))
matches = engine.MatchFactory().getmatches(refdir.files)
files = fs.get_files(p)
matches = engine.getmatches(files)
groups = engine.get_groups(matches)
g = groups[0]
g.prioritize(lambda x:x.name)
@@ -327,45 +326,41 @@ class TCDupeGuru_renameSelected(TestCase):
self.app = app
self.groups = groups
self.p = p
self.refdir = refdir
def tearDown(self):
shutil.rmtree(str(self.p))
self.files = files
def test_simple(self):
app = self.app
refdir = self.refdir
g = self.groups[0]
app.SelectPowerMarkerNodePaths(r2np([0]))
self.assert_(app.RenameSelected('renamed'))
self.assert_('renamed' in refdir)
self.assert_('foo bar 2' not in refdir)
self.assert_(g.dupes[0] is refdir['renamed'])
self.assert_(g.dupes[0] in refdir)
assert app.RenameSelected('renamed')
names = io.listdir(self.p)
assert 'renamed' in names
assert 'foo bar 2' not in names
eq_(g.dupes[0].name, 'renamed')
def test_none_selected(self):
app = self.app
refdir = self.refdir
g = self.groups[0]
app.SelectPowerMarkerNodePaths([])
self.mock(logging, 'warning', log_calls(lambda msg: None))
self.assert_(not app.RenameSelected('renamed'))
assert not app.RenameSelected('renamed')
msg = logging.warning.calls[0]['msg']
self.assertEqual('dupeGuru Warning: list index out of range', msg)
self.assert_('renamed' not in refdir)
self.assert_('foo bar 2' in refdir)
self.assert_(g.dupes[0] is refdir['foo bar 2'])
eq_('dupeGuru Warning: list index out of range', msg)
names = io.listdir(self.p)
assert 'renamed' not in names
assert 'foo bar 2' in names
eq_(g.dupes[0].name, 'foo bar 2')
def test_name_already_exists(self):
app = self.app
refdir = self.refdir
g = self.groups[0]
app.SelectPowerMarkerNodePaths(r2np([0]))
self.mock(logging, 'warning', log_calls(lambda msg: None))
self.assert_(not app.RenameSelected('foo bar 1'))
assert not app.RenameSelected('foo bar 1')
msg = logging.warning.calls[0]['msg']
self.assert_(msg.startswith('dupeGuru Warning: \'foo bar 2\' already exists in'))
self.assert_('foo bar 1' in refdir)
self.assert_('foo bar 2' in refdir)
self.assert_(g.dupes[0] is refdir['foo bar 2'])
assert msg.startswith('dupeGuru Warning: \'foo bar 1\' already exists in')
names = io.listdir(self.p)
assert 'foo bar 1' in names
assert 'foo bar 2' in names
eq_(g.dupes[0].name, 'foo bar 2')

View File

@@ -13,12 +13,11 @@ from hsutil.testcase import TestCase
from hsutil import io
from hsutil.path import Path
from hsutil.decorators import log_calls
import hsfs as fs
import hsfs.phys
import hsutil.files
from hsutil.job import nulljob
from .. import data, app
from . import data
from .. import app, fs
from ..app import DupeGuru as DupeGuruBase
class DupeGuru(DupeGuruBase):
@@ -59,27 +58,27 @@ class TCDupeGuru(TestCase):
# The goal here is just to have a test for a previous blowup I had. I know my test coverage
# for this unit is pathetic. What's done is done. My approach now is to add tests for
# every change I want to make. The blowup was caused by a missing import.
dupe_parent = fs.Directory(None, 'foo')
dupe = fs.File(dupe_parent, 'bar')
dupe.copy = log_calls(lambda dest, newname: None)
p = self.tmppath()
io.open(p + 'foo', 'w').close()
self.mock(hsutil.files, 'copy', log_calls(lambda source_path, dest_path: None))
self.mock(os, 'makedirs', lambda path: None) # We don't want the test to create that fake directory
self.mock(fs.phys, 'Directory', fs.Directory) # We don't want an error because makedirs didn't work
app = DupeGuru()
app.copy_or_move(dupe, True, 'some_destination', 0)
app.directories.add_path(p)
[f] = app.directories.get_files()
app.copy_or_move(f, True, 'some_destination', 0)
self.assertEqual(1, len(hsutil.files.copy.calls))
call = hsutil.files.copy.calls[0]
self.assertEqual('some_destination', call['dest_path'])
self.assertEqual(dupe.path, call['source_path'])
self.assertEqual(f.path, call['source_path'])
def test_copy_or_move_clean_empty_dirs(self):
tmppath = Path(self.tmpdir())
sourcepath = tmppath + 'source'
io.mkdir(sourcepath)
io.open(sourcepath + 'myfile', 'w')
tmpdir = hsfs.phys.Directory(None, unicode(tmppath))
myfile = tmpdir['source']['myfile']
app = DupeGuru()
app.directories.add_path(tmppath)
[myfile] = app.directories.get_files()
self.mock(app, 'clean_empty_dirs', log_calls(lambda path: None))
app.copy_or_move(myfile, False, tmppath + 'dest', 0)
calls = app.clean_empty_dirs.calls
@@ -87,9 +86,14 @@ class TCDupeGuru(TestCase):
self.assertEqual(sourcepath, calls[0]['path'])
def test_Scan_with_objects_evaluating_to_false(self):
class FakeFile(fs.File):
def __nonzero__(self):
return False
# At some point, any() was used in a wrong way that made Scan() wrongly return 1
app = DupeGuru()
f1, f2 = [fs.File(None, 'foo') for i in range(2)]
f1, f2 = [FakeFile('foo') for i in range(2)]
f1.is_ref, f2.is_ref = (False, False)
assert not (bool(f1) and bool(f2))
app.directories.get_files = lambda: [f1, f2]

base/py/tests/data.py (new file, 45 lines)
View File

@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
# data module for tests
from hsutil.str import format_size
from dupeguru.data import format_path, cmp_value
COLUMNS = [
{'attr':'name','display':'Filename'},
{'attr':'path','display':'Directory'},
{'attr':'size','display':'Size (KB)'},
{'attr':'extension','display':'Kind'},
]
METADATA_TO_READ = ['size']
def GetDisplayInfo(dupe, group, delta):
size = dupe.size
m = group.get_match_of(dupe)
if m and delta:
r = group.ref
size -= r.size
return [
dupe.name,
format_path(dupe.path),
format_size(size, 0, 1, False),
dupe.extension,
]
def GetDupeSortKey(dupe, get_group, key, delta):
r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
if delta and (key == 2):
r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
return r
def GetGroupSortKey(group, key):
return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))

View File

@@ -10,20 +10,43 @@
import os.path as op
import os
import time
import shutil
from nose.tools import eq_
from hsutil import job, io
from hsutil import io
from hsutil.path import Path
from hsutil.testcase import TestCase
import hsfs.phys
from hsfs.tests import phys_test
from ..directories import *
testpath = Path(TestCase.datadirpath())
def create_fake_fs(rootpath):
rootpath = rootpath + 'fs'
io.mkdir(rootpath)
io.mkdir(rootpath + 'dir1')
io.mkdir(rootpath + 'dir2')
io.mkdir(rootpath + 'dir3')
fp = io.open(rootpath + 'file1.test', 'w')
fp.write('1')
fp.close()
fp = io.open(rootpath + 'file2.test', 'w')
fp.write('12')
fp.close()
fp = io.open(rootpath + 'file3.test', 'w')
fp.write('123')
fp.close()
fp = io.open(rootpath + ('dir1', 'file1.test'), 'w')
fp.write('1')
fp.close()
fp = io.open(rootpath + ('dir2', 'file2.test'), 'w')
fp.write('12')
fp.close()
fp = io.open(rootpath + ('dir3', 'file3.test'), 'w')
fp.write('123')
fp.close()
return rootpath
class TCDirectories(TestCase):
def test_empty(self):
d = Directories()
@@ -33,13 +56,11 @@ class TCDirectories(TestCase):
def test_add_path(self):
d = Directories()
p = testpath + 'utils'
added = d.add_path(p)
d.add_path(p)
self.assertEqual(1,len(d))
self.assert_(p in d)
self.assert_((p + 'foobar') in d)
self.assert_(p[:-1] not in d)
self.assertEqual(p,added.path)
self.assert_(d[0] is added)
p = self.tmppath()
d.add_path(p)
self.assertEqual(2,len(d))
@@ -53,13 +74,13 @@ class TCDirectories(TestCase):
self.assertRaises(AlreadyThereError, d.add_path, p + 'foobar')
self.assertEqual(1, len(d))
def test_AddPath_containing_paths_already_there(self):
def test_add_path_containing_paths_already_there(self):
d = Directories()
d.add_path(testpath + 'utils')
self.assertEqual(1, len(d))
added = d.add_path(testpath)
self.assertEqual(1, len(d))
self.assert_(added is d[0])
d.add_path(testpath)
eq_(len(d), 1)
eq_(d[0], testpath)
def test_AddPath_non_latin(self):
p = Path(self.tmpdir())
@@ -114,7 +135,7 @@ class TCDirectories(TestCase):
def test_set_state_keep_state_dict_size_to_minimum(self):
d = Directories()
p = Path(phys_test.create_fake_fs(self.tmpdir()))
p = create_fake_fs(self.tmppath())
d.add_path(p)
d.set_state(p,STATE_REFERENCE)
d.set_state(p + 'dir1',STATE_REFERENCE)
@@ -129,7 +150,7 @@ class TCDirectories(TestCase):
def test_get_files(self):
d = Directories()
p = Path(phys_test.create_fake_fs(self.tmpdir()))
p = create_fake_fs(self.tmppath())
d.add_path(p)
d.set_state(p + 'dir1',STATE_REFERENCE)
d.set_state(p + 'dir2',STATE_EXCLUDED)
@@ -177,52 +198,28 @@ class TCDirectories(TestCase):
except LookupError:
self.fail()
def test_default_dirclass(self):
self.assert_(Directories().dirclass is hsfs.phys.Directory)
def test_dirclass(self):
class MySpecialDirclass(hsfs.phys.Directory): pass
d = Directories()
d.dirclass = MySpecialDirclass
d.add_path(testpath)
self.assert_(isinstance(d[0], MySpecialDirclass))
def test_load_from_file_with_invalid_path(self):
#This test simulates a load from file resulting in a
#InvalidPath raise. Other directories must be loaded.
d1 = Directories()
d1.add_path(testpath + 'utils')
#Will raise InvalidPath upon loading
d1.add_path(self.tmppath()).name = 'does_not_exist'
p = self.tmppath()
d1.add_path(p)
io.rmdir(p)
tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
d1.save_to_file(tmpxml)
d2 = Directories()
d2.load_from_file(tmpxml)
self.assertEqual(1, len(d2))
def test_load_from_file_with_same_paths(self):
#This test simulates a load from file resulting in a
#AlreadyExists raise. Other directories must be loaded.
d1 = Directories()
p1 = self.tmppath()
p2 = self.tmppath()
d1.add_path(p1)
d1.add_path(p2)
#Will raise AlreadyExists upon loading
d1.add_path(self.tmppath()).name = unicode(p1)
tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
d1.save_to_file(tmpxml)
d2 = Directories()
d2.load_from_file(tmpxml)
self.assertEqual(2, len(d2))
def test_unicode_save(self):
d = Directories()
p1 = self.tmppath() + u'hello\xe9'
io.mkdir(p1)
io.mkdir(p1 + u'foo\xe9')
d.add_path(p1)
d.set_state(d[0][0].path, STATE_EXCLUDED)
d.set_state(p1 + u'foo\xe9', STATE_EXCLUDED)
tmpxml = op.join(self.tmpdir(), 'directories_testunit.xml')
try:
d.save_to_file(tmpxml)
@@ -231,7 +228,7 @@ class TCDirectories(TestCase):
def test_get_files_refreshes_its_directories(self):
d = Directories()
p = Path(phys_test.create_fake_fs(self.tmpdir()))
p = create_fake_fs(self.tmppath())
d.add_path(p)
files = d.get_files()
self.assertEqual(6, len(list(files)))
@@ -258,16 +255,6 @@ class TCDirectories(TestCase):
d.set_state(hidden_dir_path, STATE_NORMAL)
self.assertEqual(d.get_state(hidden_dir_path), STATE_NORMAL)
def test_special_dirclasses(self):
# if a path is in special_dirclasses, use this class instead
class MySpecialDirclass(hsfs.phys.Directory): pass
d = Directories()
p1 = self.tmppath()
p2 = self.tmppath()
d.special_dirclasses[p1] = MySpecialDirclass
self.assert_(isinstance(d.add_path(p2), hsfs.phys.Directory))
self.assert_(isinstance(d.add_path(p1), MySpecialDirclass))
def test_default_path_state_override(self):
# It's possible for a subclass to override the default state of a path
class MyDirectories(Directories):

View File

@@ -340,21 +340,13 @@ class TCget_match(TestCase):
self.assertEqual(int((6.0 / 13.0) * 100),get_match(NamedObject("foo bar",True),NamedObject("bar bleh",True),(WEIGHT_WORDS,)).percentage)
class TCMatchFactory(TestCase):
class GetMatches(TestCase):
def test_empty(self):
self.assertEqual([],MatchFactory().getmatches([]))
def test_defaults(self):
mf = MatchFactory()
self.assertEqual(50,mf.common_word_threshold)
self.assertEqual(False,mf.weight_words)
self.assertEqual(False,mf.match_similar_words)
self.assertEqual(False,mf.no_field_order)
self.assertEqual(0,mf.min_match_percentage)
eq_(getmatches([]), [])
def test_simple(self):
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
r = MatchFactory().getmatches(l)
r = getmatches(l)
self.assertEqual(2,len(r))
seek = [m for m in r if m.percentage == 50] #"foo bar" and "bar bleh"
m = seek[0]
@@ -367,7 +359,7 @@ class TCMatchFactory(TestCase):
def test_null_and_unrelated_objects(self):
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject(""),NamedObject("unrelated object")]
r = MatchFactory().getmatches(l)
r = getmatches(l)
self.assertEqual(1,len(r))
m = r[0]
self.assertEqual(50,m.percentage)
@@ -376,34 +368,33 @@ class TCMatchFactory(TestCase):
def test_twice_the_same_word(self):
l = [NamedObject("foo foo bar"),NamedObject("bar bleh")]
r = MatchFactory().getmatches(l)
r = getmatches(l)
self.assertEqual(1,len(r))
def test_twice_the_same_word_when_preworded(self):
l = [NamedObject("foo foo bar",True),NamedObject("bar bleh",True)]
r = MatchFactory().getmatches(l)
r = getmatches(l)
self.assertEqual(1,len(r))
def test_two_words_match(self):
l = [NamedObject("foo bar"),NamedObject("foo bar bleh")]
r = MatchFactory().getmatches(l)
r = getmatches(l)
self.assertEqual(1,len(r))
def test_match_files_with_only_common_words(self):
#If a word occurs more than 50 times, it is excluded from the matching process
#The problem with the common_word_threshold is that the files containing only common
#words will never be matched together. We *should* match them.
mf = MatchFactory()
mf.common_word_threshold = 50
# This test assumes that the common word threshold const is 50
l = [NamedObject("foo") for i in range(50)]
r = mf.getmatches(l)
r = getmatches(l)
self.assertEqual(1225,len(r))
def test_use_words_already_there_if_there(self):
o1 = NamedObject('foo')
o2 = NamedObject('bar')
o2.words = ['foo']
self.assertEqual(1,len(MatchFactory().getmatches([o1,o2])))
eq_(1, len(getmatches([o1,o2])))
def test_job(self):
def do_progress(p,d=''):
@@ -413,75 +404,62 @@ class TCMatchFactory(TestCase):
j = job.Job(1,do_progress)
self.log = []
s = "foo bar"
MatchFactory().getmatches([NamedObject(s),NamedObject(s),NamedObject(s)],j)
getmatches([NamedObject(s), NamedObject(s), NamedObject(s)], j=j)
self.assert_(len(self.log) > 2)
self.assertEqual(0,self.log[0])
self.assertEqual(100,self.log[-1])
def test_weight_words(self):
mf = MatchFactory()
mf.weight_words = True
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
m = mf.getmatches(l)[0]
m = getmatches(l, weight_words=True)[0]
self.assertEqual(int((6.0 / 13.0) * 100),m.percentage)
def test_similar_word(self):
mf = MatchFactory()
mf.match_similar_words = True
l = [NamedObject("foobar"),NamedObject("foobars")]
self.assertEqual(1,len(mf.getmatches(l)))
self.assertEqual(100,mf.getmatches(l)[0].percentage)
eq_(len(getmatches(l, match_similar_words=True)), 1)
eq_(getmatches(l, match_similar_words=True)[0].percentage, 100)
l = [NamedObject("foobar"),NamedObject("foo")]
self.assertEqual(0,len(mf.getmatches(l))) #too far
eq_(len(getmatches(l, match_similar_words=True)), 0) #too far
l = [NamedObject("bizkit"),NamedObject("bizket")]
self.assertEqual(1,len(mf.getmatches(l)))
eq_(len(getmatches(l, match_similar_words=True)), 1)
l = [NamedObject("foobar"),NamedObject("foosbar")]
self.assertEqual(1,len(mf.getmatches(l)))
eq_(len(getmatches(l, match_similar_words=True)), 1)
def test_single_object_with_similar_words(self):
mf = MatchFactory()
mf.match_similar_words = True
l = [NamedObject("foo foos")]
self.assertEqual(0,len(mf.getmatches(l)))
eq_(len(getmatches(l, match_similar_words=True)), 0)
def test_double_words_get_counted_only_once(self):
mf = MatchFactory()
l = [NamedObject("foo bar foo bleh"),NamedObject("foo bar bleh bar")]
m = mf.getmatches(l)[0]
m = getmatches(l)[0]
self.assertEqual(75,m.percentage)
def test_with_fields(self):
mf = MatchFactory()
o1 = NamedObject("foo bar - foo bleh")
o2 = NamedObject("foo bar - bleh bar")
o1.words = getfields(o1.name)
o2.words = getfields(o2.name)
m = mf.getmatches([o1, o2])[0]
m = getmatches([o1, o2])[0]
self.assertEqual(50, m.percentage)
def test_with_fields_no_order(self):
mf = MatchFactory()
mf.no_field_order = True
o1 = NamedObject("foo bar - foo bleh")
o2 = NamedObject("bleh bang - foo bar")
o1.words = getfields(o1.name)
o2.words = getfields(o2.name)
m = mf.getmatches([o1, o2])[0]
self.assertEqual(50 ,m.percentage)
m = getmatches([o1, o2], no_field_order=True)[0]
eq_(m.percentage, 50)
def test_only_match_similar_when_the_option_is_set(self):
mf = MatchFactory()
mf.match_similar_words = False
l = [NamedObject("foobar"),NamedObject("foobars")]
self.assertEqual(0,len(mf.getmatches(l)))
eq_(len(getmatches(l, match_similar_words=False)), 0)
def test_dont_recurse_do_match(self):
# with nosetests, the stack is increased. The number has to be high enough not to fail falsely
sys.setrecursionlimit(100)
mf = MatchFactory()
files = [NamedObject('foo bar') for i in range(101)]
try:
mf.getmatches(files)
getmatches(files)
except RuntimeError:
self.fail()
finally:
@@ -489,18 +467,9 @@ class TCMatchFactory(TestCase):
def test_min_match_percentage(self):
l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
mf = MatchFactory()
mf.min_match_percentage = 50
r = mf.getmatches(l)
r = getmatches(l, min_match_percentage=50)
self.assertEqual(1,len(r)) #Only "foo bar" / "bar bleh" should match
def test_limit(self):
l = [NamedObject(),NamedObject(),NamedObject()]
mf = MatchFactory()
mf.limit = 2
r = mf.getmatches(l)
self.assertEqual(2,len(r))
def test_MemoryError(self):
@log_calls
def mocked_match(first, second, flags):
@@ -510,9 +479,8 @@ class TCMatchFactory(TestCase):
objects = [NamedObject() for i in range(10)] # results in 45 matches
self.mock(engine, 'get_match', mocked_match)
mf = MatchFactory()
try:
r = mf.getmatches(objects)
r = getmatches(objects)
except MemoryError:
self.fail('MemoryError must be handled')
self.assertEqual(42, len(r))
@@ -738,7 +706,7 @@ class TCget_groups(TestCase):
def test_simple(self):
l = [NamedObject("foo bar"),NamedObject("bar bleh")]
matches = MatchFactory().getmatches(l)
matches = getmatches(l)
m = matches[0]
r = get_groups(matches)
self.assertEqual(1,len(r))
@@ -749,7 +717,7 @@ class TCget_groups(TestCase):
def test_group_with_multiple_matches(self):
#This results in 3 matches
l = [NamedObject("foo"),NamedObject("foo"),NamedObject("foo")]
matches = MatchFactory().getmatches(l)
matches = getmatches(l)
r = get_groups(matches)
self.assertEqual(1,len(r))
g = r[0]
@@ -759,7 +727,7 @@ class TCget_groups(TestCase):
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("b c"),NamedObject("c d"),NamedObject("c d")]
#There will be 2 groups here: group "a b" and group "c d"
#"b c" can go either of them, but not both.
matches = MatchFactory().getmatches(l)
matches = getmatches(l)
r = get_groups(matches)
self.assertEqual(2,len(r))
self.assertEqual(5,len(r[0])+len(r[1]))
@@ -768,7 +736,7 @@ class TCget_groups(TestCase):
l = [NamedObject("a b"),NamedObject("a b"),NamedObject("a b"),NamedObject("a b")]
#There will be 2 groups here: group "a b" and group "c d"
#"b c" can fit in both, but it must be in only one of them
matches = MatchFactory().getmatches(l)
matches = getmatches(l)
r = get_groups(matches)
self.assertEqual(1,len(r))
@@ -788,7 +756,7 @@ class TCget_groups(TestCase):
def test_four_sized_group(self):
l = [NamedObject("foobar") for i in xrange(4)]
m = MatchFactory().getmatches(l)
m = getmatches(l)
r = get_groups(m)
self.assertEqual(1,len(r))
self.assertEqual(4,len(r[0]))

View File

@@ -16,8 +16,8 @@ from hsutil.path import Path
from hsutil.testcase import TestCase
from hsutil.misc import first
from . import engine_test
from .. import data, engine
from . import engine_test, data
from .. import engine
from ..results import *
class NamedObject(engine_test.NamedObject):
@@ -37,7 +37,7 @@ class NamedObject(engine_test.NamedObject):
def GetTestGroups():
objects = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("foo bleh"),NamedObject("ibabtu"),NamedObject("ibabtu")]
objects[1].size = 1024
matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
matches = engine.getmatches(objects) #we should have 5 matches
groups = engine.get_groups(matches) #We should have 2 groups
for g in groups:
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
@@ -505,7 +505,7 @@ class TCResultsXML(TestCase):
return objects[1]
objects = [NamedObject(u"\xe9foo bar",True),NamedObject("bar bleh",True)]
matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
matches = engine.getmatches(objects) #we should have 5 matches
groups = engine.get_groups(matches) #We should have 2 groups
for g in groups:
g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is

View File

@@ -132,8 +132,6 @@ def test_content_scan_doesnt_put_md5_in_words_at_the_end():
f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
r = s.GetDupeGroups(f)
g = r[0]
eq_(g.ref.words, ['--'])
eq_(g.dupes[0].words, ['--'])
def test_extension_is_not_counted_in_filename_scan():
s = Scanner()
@@ -369,23 +367,6 @@ def test_ignore_list_checks_for_unicode():
assert f2 in g
assert f3 in g
def test_custom_match_factory():
class MatchFactory(object):
def getmatches(self, objects, j=None):
return [Match(objects[0], objects[1], 420)]
s = Scanner()
s.match_factory = MatchFactory()
o1, o2 = no('foo'), no('bar')
groups = s.GetDupeGroups([o1, o2])
eq_(len(groups), 1)
g = groups[0]
eq_(len(g), 2)
g.switch_ref(o1)
m = g.get_match_of(o2)
eq_(m, (o1, o2, 420))
def test_file_evaluates_to_false():
# A very wrong way to use any() was added at some point, causing resulting group list
# to be empty.
@@ -455,15 +436,3 @@ def test_partial_group_match():
assert o2 in group
assert o3 not in group
eq_(s.discarded_file_count, 1)
#--- Scanner ME
def test_priorize_me():
# in ScannerME, bitrate goes first (right after is_ref) in priorization
s = ScannerME()
o1, o2 = no('foo'), no('foo')
o1.bitrate = 1
o2.bitrate = 2
[group] = s.GetDupeGroups([o1, o2])
assert group.ref is o2

View File

@@ -16,10 +16,10 @@ import os.path as op
from PyQt4.QtCore import Qt, QTimer, QObject, QCoreApplication, QUrl, SIGNAL
from PyQt4.QtGui import QProgressDialog, QDesktopServices, QFileDialog, QDialog, QMessageBox
import hsfs as fs
from hsutil import job
from hsutil.reg import RegistrationRequired
from dupeguru import fs
from dupeguru.app import (DupeGuru as DupeGuruBase, JOB_SCAN, JOB_LOAD, JOB_MOVE, JOB_COPY,
JOB_DELETE)
@@ -145,6 +145,7 @@ class DupeGuru(DupeGuruBase, QObject):
def ask_for_reg_code(self):
if self.reg.ask_for_code():
#XXX bug???
self._setup_ui_as_registered()
@demo_method

View File

@@ -47,7 +47,14 @@ class DirectoryNode(TreeNode):
return DirectoryNode(self.model, self, ref, row)
def _getChildren(self):
return self.ref.dirs
return self.model._dirs.get_subfolders(self.ref)
@property
def name(self):
if self.parent is not None:
return self.ref[-1]
else:
return unicode(self.ref)
class DirectoriesModel(TreeModel):
@@ -70,13 +77,13 @@ class DirectoriesModel(TreeModel):
node = index.internalPointer()
if role == Qt.DisplayRole:
if index.column() == 0:
return node.ref.name
return node.name
else:
return STATES[self._dirs.get_state(node.ref.path)]
return STATES[self._dirs.get_state(node.ref)]
elif role == Qt.EditRole and index.column() == 1:
return self._dirs.get_state(node.ref.path)
return self._dirs.get_state(node.ref)
elif role == Qt.ForegroundRole:
state = self._dirs.get_state(node.ref.path)
state = self._dirs.get_state(node.ref)
if state == 1:
return QBrush(Qt.blue)
elif state == 2:
@@ -101,6 +108,6 @@ class DirectoriesModel(TreeModel):
if not index.isValid() or role != Qt.EditRole or index.column() != 1:
return False
node = index.internalPointer()
self._dirs.set_state(node.ref.path, value)
self._dirs.set_state(node.ref, value)
return True

View File

@@ -8,12 +8,13 @@
import objc
from AppKit import *
from dupeguru import app_me_cocoa, scanner
from dupeguru_me.app_cocoa import DupeGuruME
from dupeguru.scanner import (SCAN_TYPE_FILENAME, SCAN_TYPE_FIELDS, SCAN_TYPE_FIELDS_NO_ORDER,
SCAN_TYPE_TAG, SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO)
# Fix py2app imports which chokes on relative imports
from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, scanner
from hsfs import auto, stats, tree, music
from hsfs.phys import music
from dupeguru_me import app_cocoa, data, fs, scanner
from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, scanner, fs
from hsmedia import aiff, flac, genres, id3v1, id3v2, mp4, mpeg, ogg, wma
from hsutil import conflict
@@ -23,7 +24,7 @@ class PyApp(NSObject):
class PyDupeGuru(PyApp):
def init(self):
self = super(PyDupeGuru,self).init()
self.app = app_me_cocoa.DupeGuruME()
self.app = DupeGuruME()
return self
#---Directories
@@ -180,12 +181,12 @@ class PyDupeGuru(PyApp):
def setScanType_(self, scan_type):
try:
self.app.scanner.scan_type = [
scanner.SCAN_TYPE_FILENAME,
scanner.SCAN_TYPE_FIELDS,
scanner.SCAN_TYPE_FIELDS_NO_ORDER,
scanner.SCAN_TYPE_TAG,
scanner.SCAN_TYPE_CONTENT,
scanner.SCAN_TYPE_CONTENT_AUDIO
SCAN_TYPE_FILENAME,
SCAN_TYPE_FIELDS,
SCAN_TYPE_FIELDS_NO_ORDER,
SCAN_TYPE_TAG,
SCAN_TYPE_CONTENT,
SCAN_TYPE_CONTENT_AUDIO
][scan_type]
except IndexError:
pass

me/py/__init__.py (new empty file)
View File

View File

@@ -7,29 +7,29 @@
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
import os.path as op
import logging
from appscript import app, k, CommandError
import time
from hsutil.cocoa import as_fetch
import hsfs.phys.music
import app_cocoa, data_me, scanner
from dupeguru.app_cocoa import JOBID2TITLE, DupeGuru as DupeGuruBase
from . import data, scanner, fs
JOB_REMOVE_DEAD_TRACKS = 'jobRemoveDeadTracks'
JOB_SCAN_DEAD_TRACKS = 'jobScanDeadTracks'
app_cocoa.JOBID2TITLE.update({
JOBID2TITLE.update({
JOB_REMOVE_DEAD_TRACKS: "Removing dead tracks from your iTunes Library",
JOB_SCAN_DEAD_TRACKS: "Scanning the iTunes Library",
})
class DupeGuruME(app_cocoa.DupeGuru):
class DupeGuruME(DupeGuruBase):
def __init__(self):
app_cocoa.DupeGuru.__init__(self, data_me, 'dupeGuru Music Edition', appid=1)
DupeGuruBase.__init__(self, data, 'dupeGuru Music Edition', appid=1)
self.scanner = scanner.ScannerME()
self.directories.dirclass = hsfs.phys.music.Directory
self.directories.fileclasses = [fs.Mp3File, fs.Mp4File, fs.WmaFile, fs.OggFile, fs.FlacFile, fs.AiffFile]
self.dead_tracks = []
def remove_dead_tracks(self):

View File

@@ -8,7 +8,7 @@
# http://www.hardcoded.net/licenses/hs_license
from hsutil.str import format_time, FT_MINUTES, format_size
from .data import (format_path, format_timestamp, format_words, format_perc,
from dupeguru.data import (format_path, format_timestamp, format_words, format_perc,
format_dupe_count, cmp_value)
COLUMNS = [
@@ -76,7 +76,7 @@ def GetDisplayInfo(dupe, group, delta):
str(dupe.track),
dupe.comment,
format_perc(percentage),
format_words(dupe.words),
format_words(dupe.words) if hasattr(dupe, 'words') else '',
format_dupe_count(dupe_count)
]

me/py/fs.py (new file, 183 lines)
View File

@@ -0,0 +1,183 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
from hsmedia import mpeg, wma, mp4, ogg, flac, aiff
from hsutil.str import get_file_ext
from dupeguru import fs
TAG_FIELDS = ['audiosize', 'duration', 'bitrate', 'samplerate', 'title', 'artist',
'album', 'genre', 'year', 'track', 'comment']
class MusicFile(fs.File):
INITIAL_INFO = fs.File.INITIAL_INFO.copy()
INITIAL_INFO.update({
'audiosize': 0,
'bitrate' : 0,
'duration' : 0,
'samplerate':0,
'artist' : '',
'album' : '',
'title' : '',
'genre' : '',
'comment' : '',
'year' : '',
'track' : 0,
})
HANDLED_EXTS = set()
@classmethod
def can_handle(cls, path):
if not fs.File.can_handle(path):
return False
return get_file_ext(path[-1]) in cls.HANDLED_EXTS
class Mp3File(MusicFile):
HANDLED_EXTS = set(['mp3'])
def _read_info(self, field):
if field == 'md5partial':
fileinfo = mpeg.Mpeg(unicode(self.path))
self._md5partial_offset = fileinfo.audio_offset
self._md5partial_size = fileinfo.audio_size
MusicFile._read_info(self, field)
if field in TAG_FIELDS:
fileinfo = mpeg.Mpeg(unicode(self.path))
self.audiosize = fileinfo.audio_size
self.bitrate = fileinfo.bitrate
self.duration = fileinfo.duration
self.samplerate = fileinfo.sample_rate
i1 = fileinfo.id3v1
# id3v1, even when nonexistent, gives empty values, but id3v2 doesn't. If id3v2
# doesn't exist, just replace it with id3v1
i2 = fileinfo.id3v2
if not i2.exists:
i2 = i1
self.artist = i2.artist or i1.artist
self.album = i2.album or i1.album
self.title = i2.title or i1.title
self.genre = i2.genre or i1.genre
self.comment = i2.comment or i1.comment
self.year = i2.year or i1.year
self.track = i2.track or i1.track
class WmaFile(MusicFile):
HANDLED_EXTS = set(['wma'])
def _read_info(self, field):
if field == 'md5partial':
dec = wma.WMADecoder(unicode(self.path))
self._md5partial_offset = dec.audio_offset
self._md5partial_size = dec.audio_size
MusicFile._read_info(self, field)
if field in TAG_FIELDS:
dec = wma.WMADecoder(unicode(self.path))
self.audiosize = dec.audio_size
self.bitrate = dec.bitrate
self.duration = dec.duration
self.samplerate = dec.sample_rate
self.artist = dec.artist
self.album = dec.album
self.title = dec.title
self.genre = dec.genre
self.comment = dec.comment
self.year = dec.year
self.track = dec.track
class Mp4File(MusicFile):
HANDLED_EXTS = set(['m4a', 'm4p'])
def _read_info(self, field):
if field == 'md5partial':
dec = mp4.File(unicode(self.path))
self._md5partial_offset = dec.audio_offset
self._md5partial_size = dec.audio_size
dec.close()
MusicFile._read_info(self, field)
if field in TAG_FIELDS:
dec = mp4.File(unicode(self.path))
self.audiosize = dec.audio_size
self.bitrate = dec.bitrate
self.duration = dec.duration
self.samplerate = dec.sample_rate
self.artist = dec.artist
self.album = dec.album
self.title = dec.title
self.genre = dec.genre
self.comment = dec.comment
self.year = dec.year
self.track = dec.track
dec.close()
class OggFile(MusicFile):
HANDLED_EXTS = set(['ogg'])
def _read_info(self, field):
if field == 'md5partial':
dec = ogg.Vorbis(unicode(self.path))
self._md5partial_offset = dec.audio_offset
self._md5partial_size = dec.audio_size
MusicFile._read_info(self, field)
if field in TAG_FIELDS:
dec = ogg.Vorbis(unicode(self.path))
self.audiosize = dec.audio_size
self.bitrate = dec.bitrate
self.duration = dec.duration
self.samplerate = dec.sample_rate
self.artist = dec.artist
self.album = dec.album
self.title = dec.title
self.genre = dec.genre
self.comment = dec.comment
self.year = dec.year
self.track = dec.track
class FlacFile(MusicFile):
HANDLED_EXTS = set(['flac'])
def _read_info(self, field):
if field == 'md5partial':
dec = flac.FLAC(unicode(self.path))
self._md5partial_offset = dec.audio_offset
self._md5partial_size = dec.audio_size
MusicFile._read_info(self, field)
if field in TAG_FIELDS:
dec = flac.FLAC(unicode(self.path))
self.audiosize = dec.audio_size
self.bitrate = dec.bitrate
self.duration = dec.duration
self.samplerate = dec.sample_rate
self.artist = dec.artist
self.album = dec.album
self.title = dec.title
self.genre = dec.genre
self.comment = dec.comment
self.year = dec.year
self.track = dec.track
class AiffFile(MusicFile):
HANDLED_EXTS = set(['aif', 'aiff', 'aifc'])
def _read_info(self, field):
if field == 'md5partial':
dec = aiff.File(unicode(self.path))
self._md5partial_offset = dec.audio_offset
self._md5partial_size = dec.audio_size
MusicFile._read_info(self, field)
if field in TAG_FIELDS:
dec = aiff.File(unicode(self.path))
self.audiosize = dec.audio_size
self.bitrate = dec.bitrate
self.duration = dec.duration
self.samplerate = dec.sample_rate
tag = dec.tag
if tag is not None:
self.artist = tag.artist
self.album = tag.album
self.title = tag.title
self.genre = tag.genre
self.comment = tag.comment
self.year = tag.year
self.track = tag.track

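Every class in me/py/fs.py follows the same two-part contract: HANDLED_EXTS routes paths to the class via can_handle, and _read_info(field) lazily fills either the md5partial offsets or the TAG_FIELDS. Assuming that contract holds, supporting another format would look roughly like the sketch below; the ape decoder module is hypothetical, hsmedia ships no such module.

# Hypothetical ApeFile, only to illustrate the subclassing pattern used by
# Mp3File, WmaFile, etc. above; 'ape' is not a real hsmedia module.
class ApeFile(MusicFile):
    HANDLED_EXTS = set(['ape'])
    def _read_info(self, field):
        if field == 'md5partial':
            dec = ape.File(unicode(self.path)) # hypothetical decoder
            self._md5partial_offset = dec.audio_offset
            self._md5partial_size = dec.audio_size
        MusicFile._read_info(self, field)
        if field in TAG_FIELDS:
            dec = ape.File(unicode(self.path))
            self.audiosize = dec.audio_size
            self.bitrate = dec.bitrate
            self.duration = dec.duration
            self.samplerate = dec.sample_rate
            for tag in ('artist', 'album', 'title', 'genre', 'comment', 'year', 'track'):
                setattr(self, tag, getattr(dec, tag))
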
me/py/scanner.py (new file, 16 lines)

@@ -0,0 +1,16 @@
# Created By: Virgil Dupras
# Created On: 2006/03/03
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
from dupeguru.scanner import Scanner as ScannerBase
class ScannerME(ScannerBase):
@staticmethod
def _key_func(dupe):
return (not dupe.is_ref, -dupe.bitrate, -dupe.size)

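ScannerME._key_func above decides which file becomes a group's reference: Python compares tuples left to right, so not dupe.is_ref puts reference files first (False sorts before True), then -dupe.bitrate prefers higher bitrates, then -dupe.size prefers larger files. A self-contained demonstration:

# Tuple sort keys compare left to right: is_ref dominates, then bitrate,
# then size (both negated so that bigger wins).
class Dupe(object):
    def __init__(self, name, is_ref, bitrate, size):
        self.name, self.is_ref, self.bitrate, self.size = name, is_ref, bitrate, size

dupes = [Dupe('a.mp3', False, 128, 5), Dupe('b.mp3', False, 320, 4), Dupe('c.mp3', True, 128, 3)]
dupes.sort(key=lambda d: (not d.is_ref, -d.bitrate, -d.size))
assert [d.name for d in dupes] == ['c.mp3', 'b.mp3', 'a.mp3']
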
me/py/tests/__init__.py (new file, empty)


@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
from hsutil.path import Path
from dupeguru.engine import getwords
from ..scanner import *
class NamedObject(object):
def __init__(self, name="foobar", size=1):
self.name = name
self.size = size
self.path = Path('')
self.words = getwords(name)
no = NamedObject
def test_priorize_me():
# in ScannerME, bitrate goes first (right after is_ref) in prioritization
s = ScannerME()
o1, o2 = no('foo'), no('foo')
o1.bitrate = 1
o2.bitrate = 2
[group] = s.GetDupeGroups([o1, o2])
assert group.ref is o2


@@ -7,9 +7,7 @@
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
import hsfs.phys.music
from dupeguru import data_me, scanner
from dupeguru_me import data, scanner, fs
from base.app import DupeGuru as DupeGuruBase
from details_dialog import DetailsDialog
@@ -23,11 +21,11 @@ class DupeGuru(DupeGuruBase):
DELTA_COLUMNS = frozenset([2, 3, 4, 5, 7, 8])
def __init__(self):
DupeGuruBase.__init__(self, data_me, appid=1)
DupeGuruBase.__init__(self, data, appid=1)
def _setup(self):
self.scanner = scanner.ScannerME()
self.directories.dirclass = hsfs.phys.music.Directory
self.directories.fileclasses = [fs.Mp3File, fs.Mp4File, fs.WmaFile, fs.OggFile, fs.FlacFile, fs.AiffFile]
DupeGuruBase._setup(self)
def _update_options(self):


@@ -12,7 +12,6 @@ from dupeguru_pe import app_cocoa as app_pe_cocoa
# Fix py2app imports, which choke on relative imports
from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, scanner
from dupeguru_pe import block, cache, matchbase, data
from hsfs import auto, stats, tree
from hsutil import conflict
class PyApp(NSObject):
@@ -39,7 +38,7 @@ class PyDupeGuru(PyApp):
self.app.scanner.ignore_list.Clear()
def clearPictureCache(self):
self.app.scanner.match_factory.cached_blocks.clear()
self.app.scanner.cached_blocks.clear()
def doScan(self):
return self.app.start_scanning()
@@ -172,10 +171,10 @@ class PyDupeGuru(PyApp):
#---Properties
def setMatchScaled_(self,match_scaled):
self.app.scanner.match_factory.match_scaled = match_scaled
self.app.scanner.match_scaled = match_scaled
def setMinMatchPercentage_(self,percentage):
self.app.scanner.match_factory.threshold = int(percentage)
self.app.scanner.threshold = int(percentage)
def setMixFileKind_(self,mix_file_kind):
self.app.scanner.mix_file_kind = mix_file_kind


@@ -1,3 +1,7 @@
- date: 2009-10-24
version: 1.7.8
description: |
* Fixed a bug sometimes causing some duplicates to be ignored during the scans. (#73)
- date: 2009-10-14
version: 1.7.7
description: |


@@ -7,41 +7,43 @@
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
import os
import os.path as op
import logging
import plistlib
import re
import objc
from Foundation import *
from AppKit import *
from appscript import app, k
from hsutil import job, io
import hsfs as fs
from hsfs import phys, InvalidPath
from hsutil import files
from hsutil import io
from hsutil.str import get_file_ext
from hsutil.path import Path
from hsutil.cocoa import as_fetch
from dupeguru import fs
from dupeguru import app_cocoa, directories
from . import data, matchbase
from . import data
from .cache import string_to_colors, Cache
from .scanner import ScannerPE
mainBundle = NSBundle.mainBundle()
PictureBlocks = mainBundle.classNamed_('PictureBlocks')
assert PictureBlocks is not None
class Photo(phys.File):
INITIAL_INFO = phys.File.INITIAL_INFO.copy()
class Photo(fs.File):
INITIAL_INFO = fs.File.INITIAL_INFO.copy()
INITIAL_INFO.update({
'dimensions': (0,0),
})
HANDLED_EXTS = set(['png', 'jpg', 'jpeg', 'gif', 'psd', 'bmp', 'tiff', 'tif', 'nef', 'cr2'])
@classmethod
def can_handle(cls, path):
return fs.File.can_handle(path) and get_file_ext(path[-1]) in cls.HANDLED_EXTS
def _read_info(self, field):
super(Photo, self)._read_info(field)
fs.File._read_info(self, field)
if field == 'dimensions':
size = PictureBlocks.getImageSize_(unicode(self.path))
self.dimensions = (size.width, size.height)
@@ -49,7 +51,7 @@ class Photo(phys.File):
def get_blocks(self, block_count_per_side):
try:
blocks = PictureBlocks.getBlocksFromImagePath_blockCount_(unicode(self.path), block_count_per_side)
except Exception, e:
except Exception as e:
raise IOError('The reading of "%s" failed with "%s"' % (unicode(self.path), unicode(e)))
if not blocks:
raise IOError('The picture %s could not be read' % unicode(self.path))
@@ -57,89 +59,79 @@ class Photo(phys.File):
class IPhoto(Photo):
def __init__(self, parent, whole_path):
super(IPhoto, self).__init__(parent, whole_path[-1])
self.whole_path = whole_path
def _build_path(self):
return self.whole_path
@property
def display_path(self):
return super(IPhoto, self)._build_path()
return Path(('iPhoto Library', self.name))
def get_iphoto_database_path():
ud = NSUserDefaults.standardUserDefaults()
prefs = ud.persistentDomainForName_('com.apple.iApps')
if 'iPhotoRecentDatabases' not in prefs:
raise directories.InvalidPathError()
plisturl = NSURL.URLWithString_(prefs['iPhotoRecentDatabases'][0])
return Path(plisturl.path())
class Directory(phys.Directory):
cls_file_class = Photo
cls_supported_exts = ('png', 'jpg', 'jpeg', 'gif', 'psd', 'bmp', 'tiff', 'nef', 'cr2')
def _fetch_subitems(self):
subdirs, subfiles = super(Directory,self)._fetch_subitems()
return subdirs, [name for name in subfiles if get_file_ext(name) in self.cls_supported_exts]
class IPhotoLibrary(fs.Directory):
def __init__(self, plistpath):
self.plistpath = plistpath
self.refpath = plistpath[:-1]
# the AlbumData.xml file lives right in the library path
super(IPhotoLibrary, self).__init__(None, 'iPhoto Library')
if not io.exists(plistpath):
raise InvalidPath(self)
def _update_photo(self, photo_data):
def get_iphoto_pictures(plistpath):
if not io.exists(plistpath):
raise InvalidPath(self)
s = io.open(plistpath).read()
# There was a case where a guy had 0x10 chars in his plist, causing expat errors on loading
s = s.replace('\x10', '')
# It seems that iPhoto sometimes doesn't properly escape & chars. The regexp below is to find
# any & char that is not a &-based entity (&amp;, &quot;, etc.). based on TextMate's XML
# bundle's regexp
s, count = re.subn(r'&(?![a-zA-Z0-9_-]+|#[0-9]+|#x[0-9a-fA-F]+;)', '', s)
if count:
logging.warning("%d invalid XML entities replacement made", count)
plist = plistlib.readPlistFromString(s)
result = []
for photo_data in plist['Master Image List'].values():
if photo_data['MediaType'] != 'Image':
return
continue
photo_path = Path(photo_data['ImagePath'])
subpath = photo_path[len(self.refpath):-1]
subdir = self
for element in subpath:
try:
subdir = subdir[element]
except KeyError:
subdir = fs.Directory(subdir, element)
try:
IPhoto(subdir, photo_path)
except fs.AlreadyExistsError:
# it's possible for 2 entries in the plist to point to the same path. Ignore one of them.
pass
photo = IPhoto(photo_path)
result.append(photo)
return result
class Directories(directories.Directories):
def __init__(self):
directories.Directories.__init__(self, fileclasses=[Photo])
self.iphoto_libpath = get_iphoto_database_path()
self.set_state(self.iphoto_libpath[:-1], directories.STATE_EXCLUDED)
def update(self):
self.clear()
s = open(unicode(self.plistpath)).read()
# There was a case where a guy had 0x10 chars in his plist, causing expat errors on loading
s = s.replace('\x10', '')
# It seems that iPhoto sometimes doesn't properly escape & chars. The regexp below is to find
# any & char that is not a &-based entity (&amp;, &quot;, etc.). based on TextMate's XML
# bundle's regexp
s, count = re.subn(r'&(?![a-zA-Z0-9_-]+|#[0-9]+|#x[0-9a-fA-F]+;)', '', s)
if count:
logging.warning("%d invalid XML entities replacement made", count)
plist = plistlib.readPlistFromString(s)
for photo_data in plist['Master Image List'].values():
self._update_photo(photo_data)
def _get_files(self, from_path):
if from_path == Path('iPhoto Library'):
is_ref = self.get_state(from_path) == directories.STATE_REFERENCE
photos = get_iphoto_pictures(self.iphoto_libpath)
for photo in photos:
photo.is_ref = is_ref
return photos
else:
return directories.Directories._get_files(self, from_path)
def force_update(self): # Don't update
pass
@staticmethod
def get_subfolders(path):
if path == Path('iPhoto Library'):
return []
else:
return directories.Directories.get_subfolders(path)
def add_path(self, path):
if path == Path('iPhoto Library'):
if path in self:
raise AlreadyThereError()
self._dirs.append(path)
else:
directories.Directories.add_path(self, path)
class DupeGuruPE(app_cocoa.DupeGuru):
def __init__(self):
app_cocoa.DupeGuru.__init__(self, data, 'dupeGuru Picture Edition', appid=5)
self.scanner.match_factory = matchbase.AsyncMatchFactory()
self.directories.dirclass = Directory
self.directories.special_dirclasses[Path('iPhoto Library')] = lambda _, __: self._create_iphoto_library()
self.scanner = ScannerPE()
self.directories = Directories()
p = op.join(self.appdata, 'cached_pictures.db')
self.scanner.match_factory.cached_blocks = Cache(p)
def _create_iphoto_library(self):
ud = NSUserDefaults.standardUserDefaults()
prefs = ud.persistentDomainForName_('com.apple.iApps')
if 'iPhotoRecentDatabases' not in prefs:
raise directories.InvalidPathError
plisturl = NSURL.URLWithString_(prefs['iPhotoRecentDatabases'][0])
plistpath = Path(plisturl.path())
return IPhotoLibrary(plistpath)
self.scanner.cached_blocks = Cache(p)
def _do_delete(self, j):
def op(dupe):
@@ -174,40 +166,19 @@ class DupeGuruPE(app_cocoa.DupeGuru):
def _do_load(self, j):
self.directories.load_from_file(op.join(self.appdata, 'last_directories.xml'))
for d in self.directories:
if isinstance(d, IPhotoLibrary):
d.update()
self.results.load_from_xml(op.join(self.appdata, 'last_results.xml'), self._get_file, j)
def _get_file(self, str_path):
p = Path(str_path)
for d in self.directories:
result = None
if p in d.path:
result = d.find_path(p[d.path:])
if isinstance(d, IPhotoLibrary) and p in d.refpath:
result = d.find_path(p[d.refpath:])
if result is not None:
return result
def add_directory(self, d):
result = app_cocoa.DupeGuru.add_directory(self, d)
if (result == 0) and (d == 'iPhoto Library'):
[iphotolib] = [dir for dir in self.directories if dir.path == d]
iphotolib.update()
return result
if p in self.directories.iphoto_libpath[:-1]:
return IPhoto(p)
return app_cocoa.DupeGuru._get_file(self, str_path)
def copy_or_move(self, dupe, copy, destination, dest_type):
if isinstance(dupe, IPhoto):
copy = True
return app_cocoa.DupeGuru.copy_or_move(self, dupe, copy, destination, dest_type)
def start_scanning(self):
for directory in self.directories:
if isinstance(directory, IPhotoLibrary):
self.directories.set_state(directory.refpath, directories.STATE_EXCLUDED)
return app_cocoa.DupeGuru.start_scanning(self)
def selected_dupe_path(self):
if not self.selected_dupes:
return None

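The piece of get_iphoto_pictures worth keeping in mind is the sanitizing pass that runs before plistlib ever sees the AlbumData.xml contents. Pulled out as a standalone helper, reusing the exact regexp and log message from above, it amounts to:

import logging
import plistlib
import re

def read_sanitized_plist(s):
    # Strip stray 0x10 control chars that make expat choke.
    s = s.replace('\x10', '')
    # Drop any '&' that is not part of an entity; iPhoto sometimes leaves
    # them unescaped (same regexp as in app_cocoa.py above).
    s, count = re.subn(r'&(?![a-zA-Z0-9_-]+|#[0-9]+|#x[0-9a-fA-F]+;)', '', s)
    if count:
        logging.warning("%d invalid XML entities replacement made", count)
    return plistlib.readPlistFromString(s)
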

@@ -20,58 +20,42 @@ from .block import avgdiff, DifferentBlockCountError, NoBlocksError
from .cache import Cache
MIN_ITERATIONS = 3
BLOCK_COUNT_PER_SIDE = 15
# Enough so that we're sure that the main thread will not wait after a result.get() call
# cpucount*2 should be enough to ensure that the spawned processes never sit idle
# waiting for the main process to collect their results.
RESULTS_QUEUE_LIMIT = multiprocessing.cpu_count() * 2
def get_match(first,second,percentage):
def prepare_pictures(pictures, cached_blocks, j=job.nulljob):
# The MemoryError handlers in there use logging without first caring about whether or not
# there is enough memory left to carry on the operation because it is assumed that the
# MemoryError happens when trying to read an image file, which is freed from memory by the
# time that MemoryError is raised.
prepared = [] # only pictures for which there was no error getting blocks
try:
for picture in j.iter_with_progress(pictures, 'Analyzed %d/%d pictures'):
picture.dimensions
picture.unicode_path = unicode(picture.path)
try:
if picture.unicode_path not in cached_blocks:
blocks = picture.get_blocks(BLOCK_COUNT_PER_SIDE)
cached_blocks[picture.unicode_path] = blocks
prepared.append(picture)
except IOError as e:
logging.warning(unicode(e))
except MemoryError:
logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
if picture.size < 10 * 1024 * 1024: # We're really running out of memory
raise
except MemoryError:
logging.warning('Ran out of memory while preparing pictures')
return prepared
def get_match(first, second, percentage):
if percentage < 0:
percentage = 0
return Match(first,second,percentage)
class MatchFactory(object):
cached_blocks = None
block_count_per_side = 15
threshold = 75
match_scaled = False
def _do_getmatches(self, files, j):
raise NotImplementedError()
def getmatches(self, files, j=job.nulljob):
# The MemoryError handlers in there use logging without first caring about whether or not
# there is enough memory left to carry on the operation because it is assumed that the
# MemoryError happens when trying to read an image file, which is freed from memory by the
# time that MemoryError is raised.
j = j.start_subjob([3, 7])
logging.info('Preparing %d files' % len(files))
prepared = self.prepare_files(files, j)
logging.info('Finished preparing %d files' % len(prepared))
return self._do_getmatches(prepared, j)
def prepare_files(self, files, j=job.nulljob):
prepared = [] # only files for which there was no error getting blocks
try:
for picture in j.iter_with_progress(files, 'Analyzed %d/%d pictures'):
picture.dimensions
picture.unicode_path = unicode(picture.path)
try:
if picture.unicode_path not in self.cached_blocks:
blocks = picture.get_blocks(self.block_count_per_side)
self.cached_blocks[picture.unicode_path] = blocks
prepared.append(picture)
except IOError as e:
logging.warning(unicode(e))
except MemoryError:
logging.warning(u'Ran out of memory while reading %s of size %d' % (picture.unicode_path, picture.size))
if picture.size < 10 * 1024 * 1024: # We're really running out of memory
raise
except MemoryError:
logging.warning('Ran out of memory while preparing files')
return prepared
return Match(first, second, percentage)
def async_compare(ref_id, other_ids, dbname, threshold):
cache = Cache(dbname, threaded=False)
@@ -89,53 +73,55 @@ def async_compare(ref_id, other_ids, dbname, threshold):
results.append((ref_id, other_id, percentage))
cache.con.close()
return results
class AsyncMatchFactory(MatchFactory):
def _do_getmatches(self, pictures, j):
def empty_out_queue(queue, into):
try:
while True:
into.append(queue.get(block=False))
except Empty:
pass
j = j.start_subjob([9, 1], 'Preparing for matching')
cache = self.cached_blocks
id2picture = {}
dimensions2pictures = defaultdict(set)
for picture in pictures:
try:
picture.cache_id = cache.get_id(picture.unicode_path)
id2picture[picture.cache_id] = picture
if not self.match_scaled:
dimensions2pictures[picture.dimensions].add(picture)
except ValueError:
pass
pictures = [p for p in pictures if hasattr(p, 'cache_id')]
pool = multiprocessing.Pool()
async_results = []
matches = []
pictures_copy = set(pictures)
for ref in j.iter_with_progress(pictures, 'Matched %d/%d pictures'):
others = pictures_copy if self.match_scaled else dimensions2pictures[ref.dimensions]
others.remove(ref)
if others:
cache_ids = [f.cache_id for f in others]
args = (ref.cache_id, cache_ids, self.cached_blocks.dbname, self.threshold)
async_results.append(pool.apply_async(async_compare, args))
if len(async_results) > RESULTS_QUEUE_LIMIT:
result = async_results.pop(0)
matches.extend(result.get())
result = []
for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
ref = id2picture[ref_id]
other = id2picture[other_id]
if percentage == 100 and ref.md5 != other.md5:
percentage = 99
if percentage >= self.threshold:
result.append(get_match(ref, other, percentage))
return result
def getmatches(pictures, cached_blocks, threshold=75, match_scaled=False, j=job.nulljob):
def empty_out_queue(queue, into):
try:
while True:
into.append(queue.get(block=False))
except Empty:
pass
j = j.start_subjob([3, 7])
pictures = prepare_pictures(pictures, cached_blocks, j)
j = j.start_subjob([9, 1], 'Preparing for matching')
cache = cached_blocks
id2picture = {}
dimensions2pictures = defaultdict(set)
for picture in pictures:
try:
picture.cache_id = cache.get_id(picture.unicode_path)
id2picture[picture.cache_id] = picture
if not match_scaled:
dimensions2pictures[picture.dimensions].add(picture)
except ValueError:
pass
pictures = [p for p in pictures if hasattr(p, 'cache_id')]
pool = multiprocessing.Pool()
async_results = []
matches = []
pictures_copy = set(pictures)
for ref in j.iter_with_progress(pictures, 'Matched %d/%d pictures'):
others = pictures_copy if match_scaled else dimensions2pictures[ref.dimensions]
others.remove(ref)
if others:
cache_ids = [f.cache_id for f in others]
args = (ref.cache_id, cache_ids, cached_blocks.dbname, threshold)
async_results.append(pool.apply_async(async_compare, args))
if len(async_results) > RESULTS_QUEUE_LIMIT:
result = async_results.pop(0)
matches.extend(result.get())
for result in async_results: # process the rest of the results
matches.extend(result.get())
result = []
for ref_id, other_id, percentage in j.iter_with_progress(matches, 'Verified %d/%d matches', every=10):
ref = id2picture[ref_id]
other = id2picture[other_id]
if percentage == 100 and ref.md5 != other.md5:
percentage = 99
if percentage >= threshold:
result.append(get_match(ref, other, percentage))
return result
multiprocessing.freeze_support()

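The rewritten getmatches keeps its multiprocessing.Pool saturated without unbounded buffering: once more than RESULTS_QUEUE_LIMIT AsyncResults are in flight, the oldest one is popped and collected, and a final loop drains whatever remains. That final drain is what the old AsyncMatchFactory version lacked, which is plausibly the #73 "ignored matches" fix noted in the changelog. The pattern in isolation:

import multiprocessing

def collect_bounded(pool, func, jobs, limit):
    # Keep at most `limit` results outstanding so workers never stall on a
    # full results queue; func is assumed to return a list per job.
    async_results, collected = [], []
    for args in jobs:
        async_results.append(pool.apply_async(func, args))
        if len(async_results) > limit:
            collected.extend(async_results.pop(0).get())
    for result in async_results: # drain the rest; skipping this loop
        collected.extend(result.get()) # silently loses the last batch
    return collected
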
pe/py/scanner.py (new file, 22 lines)

@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-18
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
from dupeguru.scanner import Scanner
from . import matchbase
class ScannerPE(Scanner):
cached_blocks = None
match_scaled = False
threshold = 75
def _getmatches(self, files, j):
return matchbase.getmatches(files, self.cached_blocks, self.threshold, self.match_scaled, j)

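ScannerPE keeps its knobs as plain class attributes, so hooking it up, as both the Cocoa and Qt setup code above does, is simple assignment; the cache path below is illustrative:

# Illustrative wiring, mirroring DupeGuruPE.__init__ and the Qt _setup above.
from dupeguru_pe.cache import Cache
from dupeguru_pe.scanner import ScannerPE

scanner = ScannerPE()
scanner.cached_blocks = Cache('/tmp/cached_pictures.db') # path is illustrative
scanner.threshold = 75 # minimum match percentage to keep
scanner.match_scaled = False # only compare same-dimension pictures
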

@@ -12,12 +12,12 @@ import os.path as op
from PyQt4.QtGui import QImage
import PIL.Image
from hsfs import phys
from hsutil.str import get_file_ext
from dupeguru import fs
from dupeguru_pe import data as data_pe
from dupeguru_pe.cache import Cache
from dupeguru_pe.matchbase import AsyncMatchFactory
from dupeguru_pe.scanner import ScannerPE
from block import getblocks
from base.app import DupeGuru as DupeGuruBase
@@ -26,14 +26,19 @@ from main_window import MainWindow
from preferences import Preferences
from preferences_dialog import PreferencesDialog
class File(phys.File):
INITIAL_INFO = phys.File.INITIAL_INFO.copy()
class File(fs.File):
INITIAL_INFO = fs.File.INITIAL_INFO.copy()
INITIAL_INFO.update({
'dimensions': (0,0),
})
HANDLED_EXTS = set(['png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff', 'tif'])
@classmethod
def can_handle(cls, path):
return fs.File.can_handle(path) and get_file_ext(path[-1]) in cls.HANDLED_EXTS
def _read_info(self, field):
super(File, self)._read_info(field)
fs.File._read_info(self, field)
if field == 'dimensions':
im = PIL.Image.open(unicode(self.path))
self.dimensions = im.size
@@ -44,15 +49,6 @@ class File(phys.File):
return getblocks(image, block_count_per_side)
class Directory(phys.Directory):
cls_file_class = File
cls_supported_exts = ('png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff')
def _fetch_subitems(self):
subdirs, subfiles = super(Directory, self)._fetch_subitems()
return subdirs, [name for name in subfiles if get_file_ext(name) in self.cls_supported_exts]
class DupeGuru(DupeGuruBase):
LOGO_NAME = 'logo_pe'
NAME = 'dupeGuru Picture Edition'
@@ -63,15 +59,15 @@ class DupeGuru(DupeGuruBase):
DupeGuruBase.__init__(self, data_pe, appid=5)
def _setup(self):
self.scanner.match_factory = AsyncMatchFactory()
self.directories.dirclass = Directory
self.scanner.match_factory.cached_blocks = Cache(op.join(self.appdata, 'cached_pictures.db'))
self.scanner = ScannerPE()
self.directories.fileclasses = [File]
self.scanner.cached_blocks = Cache(op.join(self.appdata, 'cached_pictures.db'))
DupeGuruBase._setup(self)
def _update_options(self):
DupeGuruBase._update_options(self)
self.scanner.match_factory.match_scaled = self.prefs.match_scaled
self.scanner.match_factory.threshold = self.prefs.filter_hardness
self.scanner.match_scaled = self.prefs.match_scaled
self.scanner.threshold = self.prefs.filter_hardness
def _create_details_dialog(self, parent):
return DetailsDialog(parent, self)


@@ -1,6 +1,6 @@
# -*- mode: python -*-
a = Analysis([os.path.join(HOMEPATH,'support\\_mountzlib.py'), os.path.join(HOMEPATH,'support\\useUnicode.py'), 'start.py'],
pathex=['C:\\src\\dupeguru\\pe\\qt'])
pathex=[])
pyz = PYZ(a.pure)
exe = EXE(pyz,
a.scripts,


@@ -16,6 +16,7 @@ from hsutil.build import print_and_do, build_all_qt_ui
build_all_qt_ui(op.join('qtlib', 'ui'))
build_all_qt_ui('base')
build_all_qt_ui('.')
print_and_do("pyrcc4 base\\dg.qrc > base\\dg_rc.py")
def move(src, dst):
if not op.exists(src):


@@ -23,6 +23,6 @@ class MainWindow(MainWindowBase):
title = "Clear Picture Cache"
msg = "Do you really want to remove all your cached picture analysis?"
if self._confirm(title, msg, QMessageBox.No):
self.app.scanner.match_factory.cached_blocks.clear()
self.app.scanner.cached_blocks.clear()
QMessageBox.information(self, title, "Picture cache cleared.")


@@ -14,6 +14,9 @@ import base.dg_rc
from app import DupeGuru
# This is a workaround for a pyinstaller problem where compiled dupeguru can't read tiff files
from PIL import TiffImagePlugin, TiffTags
if __name__ == "__main__":
app = QApplication(sys.argv)
app.setWindowIcon(QIcon(QPixmap(":/logo_pe")))


@@ -8,12 +8,12 @@
import objc
from AppKit import *
from dupeguru import app_se_cocoa, scanner
from dupeguru_se.app_cocoa import DupeGuru
from dupeguru import scanner
# Fix py2app imports, which choke on relative imports
from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, scanner
from hsfs import auto, stats, tree
from hsfs.phys import bundle
from dupeguru_se import fs, data
from dupeguru import app, app_cocoa, data, directories, engine, export, ignore, results, fs
from hsutil import conflict
class PyApp(NSObject):
@@ -22,7 +22,7 @@ class PyApp(NSObject):
class PyDupeGuru(PyApp):
def init(self):
self = super(PyDupeGuru,self).init()
self.app = app_se_cocoa.DupeGuru()
self.app = DupeGuru()
return self
#---Directories

se/py/LICENSE (new file, 11 lines)

@@ -0,0 +1,11 @@
Copyright 2009 Hardcoded Software Inc. (http://www.hardcoded.net)
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of Hardcoded Software Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
* If the source code has been published less than two years ago, any redistribution, in whole or in part, must retain full licensing functionality, without any attempt to change, obscure or in other ways circumvent its intent.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

se/py/__init__.py (new file, 1 line)

@@ -0,0 +1 @@


@@ -11,14 +11,15 @@ import logging
from AppKit import *
from hsfs.phys import Directory as DirectoryBase
from hsfs.phys.bundle import Bundle
from hsutil import io
from hsutil.path import Path
from hsutil.misc import extract
from hsutil.str import get_file_ext
from . import app_cocoa, data
from .directories import Directories as DirectoriesBase, STATE_EXCLUDED
from dupeguru import fs
from dupeguru.app_cocoa import DupeGuru as DupeGuruBase
from dupeguru.directories import Directories as DirectoriesBase, STATE_EXCLUDED
from . import data
from .fs import Bundle as BundleBase
if NSWorkspace.sharedWorkspace().respondsToSelector_('typeOfFile:error:'): # Only from 10.5
def is_bundle(str_path):
@@ -31,27 +32,17 @@ else: # Tiger
def is_bundle(str_path): # just return a list of a few known bundle extensions.
return get_file_ext(str_path) in ('app', 'pages', 'numbers')
class DGDirectory(DirectoryBase):
def _create_sub_file(self, name, with_parent=True):
if is_bundle(unicode(self.path + name)):
parent = self if with_parent else None
return Bundle(parent, name)
else:
return super(DGDirectory, self)._create_sub_file(name, with_parent)
def _fetch_subitems(self):
subdirs, subfiles = super(DGDirectory, self)._fetch_subitems()
apps, normal_dirs = extract(lambda name: is_bundle(unicode(self.path + name)), subdirs)
subfiles += apps
return normal_dirs, subfiles
class Bundle(BundleBase):
@classmethod
def can_handle(cls, path):
return not io.islink(path) and io.isdir(path) and is_bundle(unicode(path))
class Directories(DirectoriesBase):
ROOT_PATH_TO_EXCLUDE = map(Path, ['/Library', '/Volumes', '/System', '/bin', '/sbin', '/opt', '/private', '/dev'])
HOME_PATH_TO_EXCLUDE = [Path('Library')]
def __init__(self):
DirectoriesBase.__init__(self)
self.dirclass = DGDirectory
DirectoriesBase.__init__(self, fileclasses=[Bundle, fs.File])
def _default_state_for_path(self, path):
result = DirectoriesBase._default_state_for_path(self, path)
@@ -63,8 +54,8 @@ class Directories(DirectoriesBase):
return STATE_EXCLUDED
class DupeGuru(app_cocoa.DupeGuru):
class DupeGuru(DupeGuruBase):
def __init__(self):
app_cocoa.DupeGuru.__init__(self, data, 'dupeGuru', appid=4)
DupeGuruBase.__init__(self, data, 'dupeGuru', appid=4)
self.directories = Directories()

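With dirclass gone, class selection now happens per file through can_handle, and the order of the fileclasses list matters: Bundle is listed before fs.File, so bundle directories are claimed before the generic file class sees them. A plausible sketch of that dispatch follows; the helper name is assumed, not dupeGuru's actual code:

def create_file(path, fileclasses):
    # First class whose can_handle() accepts the path wins, which is why
    # Bundle must precede fs.File in the list above.
    for fileclass in fileclasses:
        if fileclass.can_handle(path):
            return fileclass(path)
    return None
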
se/py/data.py (new file, 72 lines)

@@ -0,0 +1,72 @@
# Created By: Virgil Dupras
# Created On: 2006/03/15
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
from hsutil.str import format_size
from dupeguru.data import (format_path, format_timestamp, format_words, format_perc,
format_dupe_count, cmp_value)
COLUMNS = [
{'attr':'name','display':'Filename'},
{'attr':'path','display':'Directory'},
{'attr':'size','display':'Size (KB)'},
{'attr':'extension','display':'Kind'},
{'attr':'ctime','display':'Creation'},
{'attr':'mtime','display':'Modification'},
{'attr':'percentage','display':'Match %'},
{'attr':'words','display':'Words Used'},
{'attr':'dupe_count','display':'Dupe Count'},
]
METADATA_TO_READ = ['size', 'ctime', 'mtime']
def GetDisplayInfo(dupe, group, delta):
size = dupe.size
ctime = dupe.ctime
mtime = dupe.mtime
m = group.get_match_of(dupe)
if m:
percentage = m.percentage
dupe_count = 0
if delta:
r = group.ref
size -= r.size
ctime -= r.ctime
mtime -= r.mtime
else:
percentage = group.percentage
dupe_count = len(group.dupes)
return [
dupe.name,
format_path(dupe.path),
format_size(size, 0, 1, False),
dupe.extension,
format_timestamp(ctime, delta and m),
format_timestamp(mtime, delta and m),
format_perc(percentage),
format_words(dupe.words) if hasattr(dupe, 'words') else '',
format_dupe_count(dupe_count)
]
def GetDupeSortKey(dupe, get_group, key, delta):
if key == 6:
m = get_group().get_match_of(dupe)
return m.percentage
if key == 8:
return 0
r = cmp_value(getattr(dupe, COLUMNS[key]['attr']))
if delta and (key in (2, 4, 5)):
r -= cmp_value(getattr(get_group().ref, COLUMNS[key]['attr']))
return r
def GetGroupSortKey(group, key):
if key == 6:
return group.percentage
if key == 8:
return len(group)
return cmp_value(getattr(group.ref, COLUMNS[key]['attr']))

se/py/fs.py (new file, 43 lines)

@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
import hashlib
from hsutil import io
from hsutil.misc import nonone
from dupeguru import fs
class Bundle(fs.File):
"""This class is for Mac OSX bundles (.app). Bundles are seen by the OS as
normal directories, but I don't want that in dupeGuru. I want dupeGuru
to see them as files.
"""
def _read_info(self, field):
if field in ('size', 'ctime', 'mtime'):
files = fs.get_all_files(self.path)
size = sum((file.size for file in files), 0)
self.size = size
stats = io.stat(self.path)
self.ctime = nonone(stats.st_ctime, 0)
self.mtime = nonone(stats.st_mtime, 0)
elif field in ('md5', 'md5partial'):
# What's sensitive here is that we must make sure that subfiles'
# md5 are always added up in the same order, but we also want a
# different md5 if a file gets moved in a different subdirectory.
def get_dir_md5_concat():
files = fs.get_all_files(self.path)
files.sort(key=lambda f:f.path)
md5s = [getattr(f, field) for f in files]
return ''.join(md5s)
md5 = hashlib.md5(get_dir_md5_concat())
digest = md5.digest()
setattr(self, field, digest)

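The sort inside get_dir_md5_concat is the load-bearing line: md5 over a concatenation is order-sensitive, so without a stable file ordering the same bundle could hash differently between runs. Two lines make the point:

import hashlib
# Same bytes, different order: different digest.
assert hashlib.md5('AB').digest() != hashlib.md5('BA').digest()
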
se/py/tests/__init__.py (new file, empty)

se/py/tests/fs_test.py (new file, 48 lines)

@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
# Created By: Virgil Dupras
# Created On: 2009-10-23
# $Id$
# Copyright 2009 Hardcoded Software (http://www.hardcoded.net)
#
# This software is licensed under the "HS" License as described in the "LICENSE" file,
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
import hashlib
from nose.tools import eq_
from hsutil.testcase import TestCase
from dupeguru.fs import File
from dupeguru.tests.directories_test import create_fake_fs
from .. import fs
class TCBundle(TestCase):
def test_size_aggregates_subfiles(self):
p = create_fake_fs(self.tmppath())
b = fs.Bundle(p)
eq_(b.size, 12)
def test_md5_aggregate_subfiles_sorted(self):
#dir.allfiles can return children in any order. Thus, bundle.md5 must aggregate
#the md5 of all the files it contains, but it must make sure that it does so in
#the same order every time.
p = create_fake_fs(self.tmppath())
b = fs.Bundle(p)
md5s = File(p + ('dir1', 'file1.test')).md5
md5s += File(p + ('dir2', 'file2.test')).md5
md5s += File(p + ('dir3', 'file3.test')).md5
md5s += File(p + 'file1.test').md5
md5s += File(p + 'file2.test').md5
md5s += File(p + 'file3.test').md5
md5 = hashlib.md5(md5s)
eq_(b.md5, md5.digest())
def test_has_file_attrs(self):
#a Bundle must behave like a file, so it must have ctime and mtime attributes
b = fs.Bundle(self.tmppath())
assert b.mtime > 0
assert b.ctime > 0
eq_(b.extension, '')


@@ -7,7 +7,7 @@
# which should be included with this package. The terms are also available at
# http://www.hardcoded.net/licenses/hs_license
from dupeguru import data
from dupeguru_se import data
from dupeguru.directories import Directories as DirectoriesBase, STATE_EXCLUDED
from base.app import DupeGuru as DupeGuruBase