[#33 state:fixed] md5partial is now used before md5 in Contents scans.

--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40118
This commit is contained in:
hsoft 2009-09-05 16:27:50 +00:00
parent 42ebef15dd
commit 65944ef813
2 changed files with 25 additions and 20 deletions

View File

@ -31,6 +31,16 @@ class Scanner(object):
self.ignore_list = IgnoreList()
self.discarded_file_count = 0
@staticmethod
def _filter_matches_by_content(matches, partial, j):
    """Return only the matches whose two files share the same digest.

    ``matches`` is an iterable of objects exposing ``first`` and ``second``
    files. When ``partial`` is true the ``md5partial`` attribute of each file
    is compared; otherwise the ``md5`` attribute is. ``j`` is the job object
    used to report progress (``iter_with_progress`` / ``set_progress``).
    """
    firsts = [match.first for match in matches]
    seconds = [match.second for match in matches]
    unique_files = dedupe(firsts + seconds)

    if partial:
        digest_attr = 'md5partial'
    else:
        digest_attr = 'md5'

    def digest_of(candidate):
        return getattr(candidate, digest_attr)

    # Read the digest of every involved file once while reporting progress.
    # NOTE(review): presumably the attribute is computed lazily and cached on
    # the file object, so the expensive work happens here -- confirm.
    for candidate in j.iter_with_progress(unique_files, 'Analyzed %d/%d matching files'):
        digest_of(candidate)

    j.set_progress(100, 'Removing false matches')
    confirmed = []
    for match in matches:
        if digest_of(match.first) == digest_of(match.second):
            confirmed.append(match)
    return confirmed
def _getmatches(self, files, j):
j = j.start_subjob(2)
mf = engine.MatchFactory()
@ -88,19 +98,14 @@ class Scanner(object):
iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
matches = [m for m in iter_matches
if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
md5attrname = 'md5partial' if self.scan_type == SCAN_TYPE_CONTENT_AUDIO else 'md5'
md5 = lambda f: getattr(f, md5attrname)
j = j.start_subjob(2)
for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'):
md5(matched_file)
j.set_progress(100, 'Removing false matches')
matches = [m for m in matches if md5(m.first) == md5(m.second)]
words_for_content = ['--'] # We compared md5. No words were involved.
j = j.start_subjob(3 if self.scan_type == SCAN_TYPE_CONTENT else 2)
matches = self._filter_matches_by_content(matches, partial=True, j=j)
if self.scan_type == SCAN_TYPE_CONTENT:
matches = self._filter_matches_by_content(matches, partial=False, j=j)
# We compared md5. No words were involved.
for m in matches:
m.first.words = words_for_content
m.second.words = words_for_content
m.first.words = m.second.words = ['--']
logging.info('Grouping matches')
groups = engine.get_groups(matches, j)
groups = [g for g in groups if any(not f.is_ref for f in g)]

View File

@ -86,9 +86,9 @@ def test_content_scan():
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'), no('bar'), no('bleh')]
f[0].md5 = 'foobar'
f[1].md5 = 'foobar'
f[2].md5 = 'bleh'
f[0].md5 = f[0].md5partial = 'foobar'
f[1].md5 = f[1].md5partial = 'foobar'
f[2].md5 = f[2].md5partial = 'bleh'
r = s.GetDupeGroups(f)
eq_(len(r), 1)
eq_(len(r[0]), 2)
@ -109,9 +109,9 @@ def test_min_match_perc_doesnt_matter_for_content_scan():
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'), no('bar'), no('bleh')]
f[0].md5 = 'foobar'
f[1].md5 = 'foobar'
f[2].md5 = 'bleh'
f[0].md5 = f[0].md5partial = 'foobar'
f[1].md5 = f[1].md5partial = 'foobar'
f[2].md5 = f[2].md5partial = 'bleh'
s.min_match_percentage = 101
r = s.GetDupeGroups(f)
eq_(len(r), 1)
@ -121,12 +121,12 @@ def test_min_match_perc_doesnt_matter_for_content_scan():
eq_(len(r), 1)
eq_(len(r[0]), 2)
def test_content_scan_puts_md5_in_words_at_the_end():
def test_content_scan_doesnt_put_md5_in_words_at_the_end():
s = Scanner()
s.scan_type = SCAN_TYPE_CONTENT
f = [no('foo'),no('bar')]
f[0].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
f[1].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
f[0].md5 = f[0].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
r = s.GetDupeGroups(f)
g = r[0]
eq_(g.ref.words, ['--'])