Changed MatchFactory into a simple getmatches() function, and added a separate getmatches_by_contents() function for contents scans, which makes those scans faster and less memory-hungry (see the usage sketch below).

--HG--
extra : convert_revision : svn:c306627e-7827-47d3-bdf0-9a457c9553a1/trunk@192
hsoft 2009-10-18 08:46:00 +00:00
parent 3f34dab881
commit 7228adf433
8 changed files with 123 additions and 179 deletions
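For reviewers, a minimal before/after sketch of the API change (the files list and the option values here are made up for illustration; engine is the module modified in this diff):

    # Before this commit: options were attributes on a factory object.
    mf = engine.MatchFactory()
    mf.min_match_percentage = 50
    mf.weight_words = True
    matches = mf.getmatches(files)

    # After this commit: a plain function taking keyword arguments...
    matches = engine.getmatches(files, min_match_percentage=50, weight_words=True)
    # ...and a dedicated entry point for contents scans, which compares
    # file sizes first, then md5partial, then full md5.
    matches = engine.getmatches_by_contents(files, sizeattr='size', partial=False)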

View File

@@ -79,7 +79,7 @@ def GetDisplayInfo(dupe, group, delta):
         format_timestamp(ctime, delta and m),
         format_timestamp(mtime, delta and m),
         format_perc(percentage),
-        format_words(dupe.words),
+        format_words(dupe.words) if hasattr(dupe, 'words') else '',
         format_dupe_count(dupe_count)
     ]

View File

@@ -76,7 +76,7 @@ def GetDisplayInfo(dupe, group, delta):
         str(dupe.track),
         dupe.comment,
         format_perc(percentage),
-        format_words(dupe.words),
+        format_words(dupe.words) if hasattr(dupe, 'words') else '',
         format_dupe_count(dupe_count)
     ]

View File

@@ -9,6 +9,7 @@
 from __future__ import division
 import difflib
+import itertools
 import logging
 import string
 from collections import defaultdict, namedtuple
@@ -156,58 +157,69 @@ def get_match(first, second, flags=()):
     percentage = compare(first.words, second.words, flags)
     return Match(first, second, percentage)

-class MatchFactory(object):
-    common_word_threshold = 50
-    match_similar_words = False
-    min_match_percentage = 0
-    weight_words = False
-    no_field_order = False
-    limit = 5000000
-
-    def getmatches(self, objects, j=job.nulljob):
-        j = j.start_subjob(2)
-        sj = j.start_subjob(2)
-        for o in objects:
-            if not hasattr(o, 'words'):
-                o.words = getwords(o.name)
-        word_dict = build_word_dict(objects, sj)
-        reduce_common_words(word_dict, self.common_word_threshold)
-        if self.match_similar_words:
-            merge_similar_words(word_dict)
-        match_flags = []
-        if self.weight_words:
-            match_flags.append(WEIGHT_WORDS)
-        if self.match_similar_words:
-            match_flags.append(MATCH_SIMILAR_WORDS)
-        if self.no_field_order:
-            match_flags.append(NO_FIELD_ORDER)
-        j.start_job(len(word_dict), '0 matches found')
-        compared = defaultdict(set)
-        result = []
-        try:
-            # This whole 'popping' thing is there to avoid taking too much memory at the same time.
-            while word_dict:
-                items = word_dict.popitem()[1]
-                while items:
-                    ref = items.pop()
-                    compared_already = compared[ref]
-                    to_compare = items - compared_already
-                    compared_already |= to_compare
-                    for other in to_compare:
-                        m = get_match(ref, other, match_flags)
-                        if m.percentage >= self.min_match_percentage:
-                            result.append(m)
-                            if len(result) >= self.limit:
-                                return result
-                j.add_progress(desc='%d matches found' % len(result))
-        except MemoryError:
-            # This is the place where the memory usage is at its peak during the scan.
-            # Just continue the process with an incomplete list of matches.
-            del compared # This should give us enough room to call logging.
-            logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
-        return result
+def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False,
+        no_field_order=False, j=job.nulljob):
+    COMMON_WORD_THRESHOLD = 50
+    LIMIT = 5000000
+    j = j.start_subjob(2)
+    sj = j.start_subjob(2)
+    for o in objects:
+        if not hasattr(o, 'words'):
+            o.words = getwords(o.name)
+    word_dict = build_word_dict(objects, sj)
+    reduce_common_words(word_dict, COMMON_WORD_THRESHOLD)
+    if match_similar_words:
+        merge_similar_words(word_dict)
+    match_flags = []
+    if weight_words:
+        match_flags.append(WEIGHT_WORDS)
+    if match_similar_words:
+        match_flags.append(MATCH_SIMILAR_WORDS)
+    if no_field_order:
+        match_flags.append(NO_FIELD_ORDER)
+    j.start_job(len(word_dict), '0 matches found')
+    compared = defaultdict(set)
+    result = []
+    try:
+        # This whole 'popping' thing is there to avoid taking too much memory at the same time.
+        while word_dict:
+            items = word_dict.popitem()[1]
+            while items:
+                ref = items.pop()
+                compared_already = compared[ref]
+                to_compare = items - compared_already
+                compared_already |= to_compare
+                for other in to_compare:
+                    m = get_match(ref, other, match_flags)
+                    if m.percentage >= min_match_percentage:
+                        result.append(m)
+                        if len(result) >= LIMIT:
+                            return result
+            j.add_progress(desc='%d matches found' % len(result))
+    except MemoryError:
+        # This is the place where the memory usage is at its peak during the scan.
+        # Just continue the process with an incomplete list of matches.
+        del compared # This should give us enough room to call logging.
+        logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
+        return result
+    return result
+
+def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob):
+    j = j.start_subjob([2, 8])
+    size2files = defaultdict(set)
+    for file in j.iter_with_progress(files, 'Read size of %d/%d files'):
+        size2files[getattr(file, sizeattr)].add(file)
+    possible_matches = [files for files in size2files.values() if len(files) > 1]
+    del size2files
+    result = []
+    j.start_job(len(possible_matches), '0 matches found')
+    for group in possible_matches:
+        for first, second in itertools.combinations(group, 2):
+            if first.md5partial == second.md5partial:
+                if partial or first.md5 == second.md5:
+                    result.append(Match(first, second, 100))
+        j.add_progress(desc='%d matches found' % len(result))
+    return result

 class Group(object):
     #---Override
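A small self-contained sketch of what the new getmatches_by_contents() does with the size/md5partial/md5 cascade above (FakeFile is a hypothetical stand-in for dupeGuru file objects, which expose size, md5partial and md5):

    class FakeFile(object):
        # Hypothetical stand-in: only the attributes the function reads.
        def __init__(self, size, md5partial, md5):
            self.size = size
            self.md5partial = md5partial
            self.md5 = md5

    a = FakeFile(1024, 'p1', 'h1')
    b = FakeFile(1024, 'p1', 'h1')  # same size and hashes as a --> match at 100
    c = FakeFile(1024, 'p1', 'h2')  # same partial hash, different full md5 --> no match
    matches = getmatches_by_contents([a, b, c])
    assert len(matches) == 1 and matches[0].percentage == 100

With partial=True (used for the audio contents scan), only md5partial is compared, so c would match a and b as well. Files with a unique size are never hashed at all, which is where the speed and memory win comes from.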

View File

@@ -32,40 +32,32 @@ class Scanner(object):
         self.ignore_list = IgnoreList()
         self.discarded_file_count = 0

-    @staticmethod
-    def _filter_matches_by_content(matches, partial, j):
-        matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
-        md5attrname = 'md5partial' if partial else 'md5'
-        md5 = lambda f: getattr(f, md5attrname)
-        for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'):
-            md5(matched_file)
-        j.set_progress(100, 'Removing false matches')
-        return [m for m in matches if md5(m.first) == md5(m.second)]
-
     def _getmatches(self, files, j):
-        j = j.start_subjob(2)
-        mf = engine.MatchFactory()
-        if self.scan_type != SCAN_TYPE_CONTENT:
-            mf.match_similar_words = self.match_similar_words
-            mf.weight_words = self.word_weighting
-            mf.min_match_percentage = self.min_match_percentage
-            if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
-                self.scan_type = SCAN_TYPE_FIELDS
-                mf.no_field_order = True
-        func = {
-            SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
-            SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
-            SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
-            SCAN_TYPE_CONTENT: lambda f: [str(f.size)],
-            SCAN_TYPE_CONTENT_AUDIO: lambda f: [str(f.audiosize)]
-        }[self.scan_type]
-        for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
-            if self.size_threshold:
+        if self.size_threshold:
+            j = j.start_subjob([2, 8])
+            for f in j.iter_with_progress(files, 'Read size of %d/%d files'):
                 f.size # pre-read, makes a smoother progress if read here (especially for bundles)
-            f.words = func(f)
-        if self.size_threshold:
             files = [f for f in files if f.size >= self.size_threshold]
-        return mf.getmatches(files, j)
+        if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
+            sizeattr = 'size' if self.scan_type == SCAN_TYPE_CONTENT else 'audiosize'
+            return engine.getmatches_by_contents(files, sizeattr, partial=self.scan_type==SCAN_TYPE_CONTENT_AUDIO, j=j)
+        else:
+            j = j.start_subjob([2, 8])
+            kw = {}
+            kw['match_similar_words'] = self.match_similar_words
+            kw['weight_words'] = self.word_weighting
+            kw['min_match_percentage'] = self.min_match_percentage
+            if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
+                self.scan_type = SCAN_TYPE_FIELDS
+                kw['no_field_order'] = True
+            func = {
+                SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
+                SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
+                SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
+            }[self.scan_type]
+            for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
+                f.words = func(f)
+            return engine.getmatches(files, j=j, **kw)

     @staticmethod
     def _key_func(dupe):
@@ -86,10 +78,7 @@ class Scanner(object):
         for f in [f for f in files if not hasattr(f, 'is_ref')]:
             f.is_ref = False
         logging.info('Getting matches')
-        if self.match_factory is None:
-            matches = self._getmatches(files, j)
-        else:
-            matches = self.match_factory.getmatches(files, j)
+        matches = self._getmatches(files, j)
         logging.info('Found %d matches' % len(matches))
         if not self.mix_file_kind:
             j.set_progress(100, 'Removing false matches')
@@ -99,14 +88,6 @@ class Scanner(object):
             iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
             matches = [m for m in iter_matches
                 if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
-        if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
-            j = j.start_subjob(3 if self.scan_type == SCAN_TYPE_CONTENT else 2)
-            matches = self._filter_matches_by_content(matches, partial=True, j=j)
-            if self.scan_type == SCAN_TYPE_CONTENT:
-                matches = self._filter_matches_by_content(matches, partial=False, j=j)
-            # We compared md5. No words were involved.
-            for m in matches:
-                m.first.words = m.second.words = ['--']
         logging.info('Grouping matches')
         groups = engine.get_groups(matches, j)
         matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
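In short, the rewritten _getmatches() routes by scan type. A rough, hypothetical equivalent of the dispatch (route_scan is illustrative only; the constants and scanner attributes are as named in this diff, with the job/progress plumbing left out):

    def route_scan(scanner, files):
        # Content scans bypass word matching entirely.
        if scanner.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
            sizeattr = 'size' if scanner.scan_type == SCAN_TYPE_CONTENT else 'audiosize'
            return engine.getmatches_by_contents(files, sizeattr,
                partial=(scanner.scan_type == SCAN_TYPE_CONTENT_AUDIO))
        # Word-based scans compute f.words first, then match on words.
        return engine.getmatches(files,
            min_match_percentage=scanner.min_match_percentage,
            match_similar_words=scanner.match_similar_words,
            weight_words=scanner.word_weighting)

Because content matches are now confirmed against md5 before a Match is ever created, the old _filter_matches_by_content() post-pass (removed above) is no longer needed.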

View File

@@ -318,7 +318,7 @@ class TCDupeGuru_renameSelected(TestCase):
         fp = open(str(p + 'foo bar 3'),mode='w')
         fp.close()
         refdir = hsfs.phys.Directory(None,str(p))
-        matches = engine.MatchFactory().getmatches(refdir.files)
+        matches = engine.getmatches(refdir.files)
         groups = engine.get_groups(matches)
         g = groups[0]
         g.prioritize(lambda x:x.name)

View File

@@ -340,21 +340,13 @@ class TCget_match(TestCase):
         self.assertEqual(int((6.0 / 13.0) * 100),get_match(NamedObject("foo bar",True),NamedObject("bar bleh",True),(WEIGHT_WORDS,)).percentage)

-class TCMatchFactory(TestCase):
+class GetMatches(TestCase):
     def test_empty(self):
-        self.assertEqual([],MatchFactory().getmatches([]))
-
-    def test_defaults(self):
-        mf = MatchFactory()
-        self.assertEqual(50,mf.common_word_threshold)
-        self.assertEqual(False,mf.weight_words)
-        self.assertEqual(False,mf.match_similar_words)
-        self.assertEqual(False,mf.no_field_order)
-        self.assertEqual(0,mf.min_match_percentage)
+        eq_(getmatches([]), [])

     def test_simple(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(2,len(r))
         seek = [m for m in r if m.percentage == 50] #"foo bar" and "bar bleh"
         m = seek[0]
@@ -367,7 +359,7 @@ class TCMatchFactory(TestCase):
     def test_null_and_unrelated_objects(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject(""),NamedObject("unrelated object")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))
         m = r[0]
         self.assertEqual(50,m.percentage)
@@ -376,34 +368,33 @@ class TCMatchFactory(TestCase):
     def test_twice_the_same_word(self):
         l = [NamedObject("foo foo bar"),NamedObject("bar bleh")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))

     def test_twice_the_same_word_when_preworded(self):
         l = [NamedObject("foo foo bar",True),NamedObject("bar bleh",True)]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))

     def test_two_words_match(self):
         l = [NamedObject("foo bar"),NamedObject("foo bar bleh")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))

     def test_match_files_with_only_common_words(self):
         #If a word occurs more than 50 times, it is excluded from the matching process
         #The problem with the common_word_threshold is that the files containing only common
         #words will never be matched together. We *should* match them.
-        mf = MatchFactory()
-        mf.common_word_threshold = 50
+        # This test assumes that the common word threshold const is 50
         l = [NamedObject("foo") for i in range(50)]
-        r = mf.getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1225,len(r))

     def test_use_words_already_there_if_there(self):
         o1 = NamedObject('foo')
         o2 = NamedObject('bar')
         o2.words = ['foo']
-        self.assertEqual(1,len(MatchFactory().getmatches([o1,o2])))
+        eq_(1, len(getmatches([o1,o2])))

     def test_job(self):
         def do_progress(p,d=''):
@@ -413,75 +404,62 @@ class TCMatchFactory(TestCase):
         j = job.Job(1,do_progress)
         self.log = []
         s = "foo bar"
-        MatchFactory().getmatches([NamedObject(s),NamedObject(s),NamedObject(s)],j)
+        getmatches([NamedObject(s), NamedObject(s), NamedObject(s)], j=j)
         self.assert_(len(self.log) > 2)
         self.assertEqual(0,self.log[0])
         self.assertEqual(100,self.log[-1])

     def test_weight_words(self):
-        mf = MatchFactory()
-        mf.weight_words = True
         l = [NamedObject("foo bar"),NamedObject("bar bleh")]
-        m = mf.getmatches(l)[0]
+        m = getmatches(l, weight_words=True)[0]
         self.assertEqual(int((6.0 / 13.0) * 100),m.percentage)

     def test_similar_word(self):
-        mf = MatchFactory()
-        mf.match_similar_words = True
         l = [NamedObject("foobar"),NamedObject("foobars")]
-        self.assertEqual(1,len(mf.getmatches(l)))
-        self.assertEqual(100,mf.getmatches(l)[0].percentage)
+        eq_(len(getmatches(l, match_similar_words=True)), 1)
+        eq_(getmatches(l, match_similar_words=True)[0].percentage, 100)
         l = [NamedObject("foobar"),NamedObject("foo")]
-        self.assertEqual(0,len(mf.getmatches(l))) #too far
+        eq_(len(getmatches(l, match_similar_words=True)), 0) #too far
         l = [NamedObject("bizkit"),NamedObject("bizket")]
-        self.assertEqual(1,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=True)), 1)
         l = [NamedObject("foobar"),NamedObject("foosbar")]
-        self.assertEqual(1,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=True)), 1)

     def test_single_object_with_similar_words(self):
-        mf = MatchFactory()
-        mf.match_similar_words = True
         l = [NamedObject("foo foos")]
-        self.assertEqual(0,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=True)), 0)

     def test_double_words_get_counted_only_once(self):
-        mf = MatchFactory()
         l = [NamedObject("foo bar foo bleh"),NamedObject("foo bar bleh bar")]
-        m = mf.getmatches(l)[0]
+        m = getmatches(l)[0]
         self.assertEqual(75,m.percentage)

     def test_with_fields(self):
-        mf = MatchFactory()
         o1 = NamedObject("foo bar - foo bleh")
         o2 = NamedObject("foo bar - bleh bar")
         o1.words = getfields(o1.name)
         o2.words = getfields(o2.name)
-        m = mf.getmatches([o1, o2])[0]
+        m = getmatches([o1, o2])[0]
         self.assertEqual(50, m.percentage)

     def test_with_fields_no_order(self):
-        mf = MatchFactory()
-        mf.no_field_order = True
         o1 = NamedObject("foo bar - foo bleh")
         o2 = NamedObject("bleh bang - foo bar")
         o1.words = getfields(o1.name)
         o2.words = getfields(o2.name)
-        m = mf.getmatches([o1, o2])[0]
-        self.assertEqual(50 ,m.percentage)
+        m = getmatches([o1, o2], no_field_order=True)[0]
+        eq_(m.percentage, 50)

     def test_only_match_similar_when_the_option_is_set(self):
-        mf = MatchFactory()
-        mf.match_similar_words = False
         l = [NamedObject("foobar"),NamedObject("foobars")]
-        self.assertEqual(0,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=False)), 0)

     def test_dont_recurse_do_match(self):
         # with nosetests, the stack is increased. The number has to be high enough not to be failing falsely
         sys.setrecursionlimit(100)
-        mf = MatchFactory()
         files = [NamedObject('foo bar') for i in range(101)]
         try:
-            mf.getmatches(files)
+            getmatches(files)
         except RuntimeError:
             self.fail()
         finally:
@@ -489,18 +467,9 @@ class TCMatchFactory(TestCase):
     def test_min_match_percentage(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
-        mf = MatchFactory()
-        mf.min_match_percentage = 50
-        r = mf.getmatches(l)
+        r = getmatches(l, min_match_percentage=50)
         self.assertEqual(1,len(r)) #Only "foo bar" / "bar bleh" should match

-    def test_limit(self):
-        l = [NamedObject(),NamedObject(),NamedObject()]
-        mf = MatchFactory()
-        mf.limit = 2
-        r = mf.getmatches(l)
-        self.assertEqual(2,len(r))
-
     def test_MemoryError(self):
         @log_calls
         def mocked_match(first, second, flags):
@@ -510,9 +479,8 @@ class TCMatchFactory(TestCase):
         objects = [NamedObject() for i in range(10)] # results in 45 matches
         self.mock(engine, 'get_match', mocked_match)
-        mf = MatchFactory()
         try:
-            r = mf.getmatches(objects)
+            r = getmatches(objects)
         except MemoryError:
             self.fail('MemoryError must be handled')
         self.assertEqual(42, len(r))
@@ -738,7 +706,7 @@ class TCget_groups(TestCase):
     def test_simple(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh")]
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         m = matches[0]
         r = get_groups(matches)
         self.assertEqual(1,len(r))
@@ -749,7 +717,7 @@ class TCget_groups(TestCase):
     def test_group_with_multiple_matches(self):
         #This results in 3 matches
         l = [NamedObject("foo"),NamedObject("foo"),NamedObject("foo")]
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         r = get_groups(matches)
         self.assertEqual(1,len(r))
         g = r[0]
@@ -759,7 +727,7 @@ class TCget_groups(TestCase):
         l = [NamedObject("a b"),NamedObject("a b"),NamedObject("b c"),NamedObject("c d"),NamedObject("c d")]
         #There will be 2 groups here: group "a b" and group "c d"
         #"b c" can go in either of them, but not both.
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         r = get_groups(matches)
         self.assertEqual(2,len(r))
         self.assertEqual(5,len(r[0])+len(r[1]))
@@ -768,7 +736,7 @@ class TCget_groups(TestCase):
         l = [NamedObject("a b"),NamedObject("a b"),NamedObject("a b"),NamedObject("a b")]
         #All four objects match each other
         #They must all end up in the same group
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         r = get_groups(matches)
         self.assertEqual(1,len(r))
@@ -788,7 +756,7 @@ class TCget_groups(TestCase):
     def test_four_sized_group(self):
         l = [NamedObject("foobar") for i in xrange(4)]
-        m = MatchFactory().getmatches(l)
+        m = getmatches(l)
         r = get_groups(m)
         self.assertEqual(1,len(r))
         self.assertEqual(4,len(r[0]))

View File

@@ -37,7 +37,7 @@ class NamedObject(engine_test.NamedObject):
 def GetTestGroups():
     objects = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("foo bleh"),NamedObject("ibabtu"),NamedObject("ibabtu")]
     objects[1].size = 1024
-    matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
+    matches = engine.getmatches(objects) #we should have 5 matches
     groups = engine.get_groups(matches) #We should have 2 groups
     for g in groups:
         g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
@@ -505,7 +505,7 @@ class TCResultsXML(TestCase):
             return objects[1]

         objects = [NamedObject(u"\xe9foo bar",True),NamedObject("bar bleh",True)]
-        matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
+        matches = engine.getmatches(objects) #we should have 1 match
         groups = engine.get_groups(matches) #We should have 1 group
         for g in groups:
             g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is

View File

@@ -369,23 +369,6 @@ def test_ignore_list_checks_for_unicode():
     assert f2 in g
     assert f3 in g

-def test_custom_match_factory():
-    class MatchFactory(object):
-        def getmatches(self, objects, j=None):
-            return [Match(objects[0], objects[1], 420)]
-
-    s = Scanner()
-    s.match_factory = MatchFactory()
-    o1, o2 = no('foo'), no('bar')
-    groups = s.GetDupeGroups([o1, o2])
-    eq_(len(groups), 1)
-    g = groups[0]
-    eq_(len(g), 2)
-    g.switch_ref(o1)
-    m = g.get_match_of(o2)
-    eq_(m, (o1, o2, 420))
-
 def test_file_evaluates_to_false():
     # A very wrong way to use any() was added at some point, causing resulting group list
     # to be empty.