[#33 state:fixed] md5partial is now used before md5 in Contents scans.

--HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40118
2025-07-25 22:43:20 +00:00 · 2009-09-05 16:27:50 +00:00 · 2009-09-05 16:27:50 +00:00 · 65944ef813
commit 65944ef813
parent 42ebef15dd
2 changed files with 25 additions and 20 deletions
--- a/base/py/scanner.py
+++ b/base/py/scanner.py
@@ -31,6 +31,16 @@ class Scanner(object):
        self.ignore_list = IgnoreList()
        self.discarded_file_count = 0
    
+    @staticmethod
+    def _filter_matches_by_content(matches, partial, j):
+        matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
+        md5attrname = 'md5partial' if partial else 'md5'
+        md5 = lambda f: getattr(f, md5attrname)
+        for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'):
+            md5(matched_file)
+        j.set_progress(100, 'Removing false matches')
+        return [m for m in matches if md5(m.first) == md5(m.second)]
+    
    def _getmatches(self, files, j):
        j = j.start_subjob(2)
        mf = engine.MatchFactory()
@@ -88,19 +98,14 @@ class Scanner(object):
            iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
            matches = [m for m in iter_matches 
                if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
-        matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
        if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
-            md5attrname = 'md5partial' if self.scan_type == SCAN_TYPE_CONTENT_AUDIO else 'md5'
-            md5 = lambda f: getattr(f, md5attrname)
-            j = j.start_subjob(2)
-            for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'):
-                md5(matched_file)
-            j.set_progress(100, 'Removing false matches')
-            matches = [m for m in matches if md5(m.first) == md5(m.second)]
-            words_for_content = ['--'] # We compared md5. No words were involved.
+            j = j.start_subjob(3 if self.scan_type == SCAN_TYPE_CONTENT else 2)
+            matches = self._filter_matches_by_content(matches, partial=True, j=j)
+            if self.scan_type == SCAN_TYPE_CONTENT:
+                matches = self._filter_matches_by_content(matches, partial=False, j=j)
+            # We compared md5. No words were involved.
            for m in matches:
-                m.first.words = words_for_content
-                m.second.words = words_for_content
+                m.first.words = m.second.words = ['--']
        logging.info('Grouping matches')
        groups = engine.get_groups(matches, j)
        groups = [g for g in groups if any(not f.is_ref for f in g)]
--- a/base/py/tests/scanner_test.py
+++ b/base/py/tests/scanner_test.py
@@ -86,9 +86,9 @@ def test_content_scan():
    s = Scanner()
    s.scan_type = SCAN_TYPE_CONTENT
    f = [no('foo'), no('bar'), no('bleh')]
-    f[0].md5 = 'foobar'
-    f[1].md5 = 'foobar'
-    f[2].md5 = 'bleh'
+    f[0].md5 = f[0].md5partial = 'foobar'
+    f[1].md5 = f[1].md5partial = 'foobar'
+    f[2].md5 = f[2].md5partial = 'bleh'
    r = s.GetDupeGroups(f)
    eq_(len(r), 1)
    eq_(len(r[0]), 2)
@@ -109,9 +109,9 @@ def test_min_match_perc_doesnt_matter_for_content_scan():
    s = Scanner()
    s.scan_type = SCAN_TYPE_CONTENT
    f = [no('foo'), no('bar'), no('bleh')]
-    f[0].md5 = 'foobar'
-    f[1].md5 = 'foobar'
-    f[2].md5 = 'bleh'
+    f[0].md5 = f[0].md5partial = 'foobar'
+    f[1].md5 = f[1].md5partial = 'foobar'
+    f[2].md5 = f[2].md5partial = 'bleh'
    s.min_match_percentage = 101
    r = s.GetDupeGroups(f)
    eq_(len(r), 1)
@@ -121,12 +121,12 @@ def test_min_match_perc_doesnt_matter_for_content_scan():
    eq_(len(r), 1)
    eq_(len(r[0]), 2)

-def test_content_scan_puts_md5_in_words_at_the_end():
+def test_content_scan_doesnt_put_md5_in_words_at_the_end():
    s = Scanner()
    s.scan_type = SCAN_TYPE_CONTENT
    f = [no('foo'),no('bar')]
-    f[0].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
-    f[1].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
+    f[0].md5 = f[0].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
+    f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
    r = s.GetDupeGroups(f)
    g = r[0]
    eq_(g.ref.words, ['--'])