mirror of
https://github.com/arsenetar/dupeguru.git
synced 2025-03-10 05:34:36 +00:00
[#33 state:fixed] md5partial is now used before md5 in Contents scans.
--HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40118
This commit is contained in:
parent
42ebef15dd
commit
65944ef813
@ -31,6 +31,16 @@ class Scanner(object):
|
||||
self.ignore_list = IgnoreList()
|
||||
self.discarded_file_count = 0
|
||||
|
||||
@staticmethod
def _filter_matches_by_content(matches, partial, j):
    """Return the matches whose two files carry equal content digests.

    ``partial`` chooses which attribute is compared on each file:
    ``md5partial`` when true, ``md5`` otherwise. ``j`` is the job object
    through which progress is reported.
    """
    firsts = [pair.first for pair in matches]
    seconds = [pair.second for pair in matches]
    unique_files = dedupe(firsts + seconds)
    digest_attr = 'md5' if not partial else 'md5partial'

    def digest_of(candidate):
        return getattr(candidate, digest_attr)

    # Read the digest of every matched file inside the progress loop; the
    # result is discarded here (presumably the attribute is computed lazily,
    # so the work is done while progress is displayed -- confirm).
    for unique_file in j.iter_with_progress(unique_files, 'Analyzed %d/%d matching files'):
        digest_of(unique_file)
    j.set_progress(100, 'Removing false matches')
    surviving = []
    for pair in matches:
        if digest_of(pair.first) == digest_of(pair.second):
            surviving.append(pair)
    return surviving
|
||||
|
||||
def _getmatches(self, files, j):
|
||||
j = j.start_subjob(2)
|
||||
mf = engine.MatchFactory()
|
||||
@ -88,19 +98,14 @@ class Scanner(object):
|
||||
iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
|
||||
matches = [m for m in iter_matches
|
||||
if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
|
||||
matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
|
||||
if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
|
||||
md5attrname = 'md5partial' if self.scan_type == SCAN_TYPE_CONTENT_AUDIO else 'md5'
|
||||
md5 = lambda f: getattr(f, md5attrname)
|
||||
j = j.start_subjob(2)
|
||||
for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'):
|
||||
md5(matched_file)
|
||||
j.set_progress(100, 'Removing false matches')
|
||||
matches = [m for m in matches if md5(m.first) == md5(m.second)]
|
||||
words_for_content = ['--'] # We compared md5. No words were involved.
|
||||
j = j.start_subjob(3 if self.scan_type == SCAN_TYPE_CONTENT else 2)
|
||||
matches = self._filter_matches_by_content(matches, partial=True, j=j)
|
||||
if self.scan_type == SCAN_TYPE_CONTENT:
|
||||
matches = self._filter_matches_by_content(matches, partial=False, j=j)
|
||||
# We compared md5. No words were involved.
|
||||
for m in matches:
|
||||
m.first.words = words_for_content
|
||||
m.second.words = words_for_content
|
||||
m.first.words = m.second.words = ['--']
|
||||
logging.info('Grouping matches')
|
||||
groups = engine.get_groups(matches, j)
|
||||
groups = [g for g in groups if any(not f.is_ref for f in g)]
|
||||
|
@ -86,9 +86,9 @@ def test_content_scan():
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_CONTENT
|
||||
f = [no('foo'), no('bar'), no('bleh')]
|
||||
f[0].md5 = 'foobar'
|
||||
f[1].md5 = 'foobar'
|
||||
f[2].md5 = 'bleh'
|
||||
f[0].md5 = f[0].md5partial = 'foobar'
|
||||
f[1].md5 = f[1].md5partial = 'foobar'
|
||||
f[2].md5 = f[2].md5partial = 'bleh'
|
||||
r = s.GetDupeGroups(f)
|
||||
eq_(len(r), 1)
|
||||
eq_(len(r[0]), 2)
|
||||
@ -109,9 +109,9 @@ def test_min_match_perc_doesnt_matter_for_content_scan():
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_CONTENT
|
||||
f = [no('foo'), no('bar'), no('bleh')]
|
||||
f[0].md5 = 'foobar'
|
||||
f[1].md5 = 'foobar'
|
||||
f[2].md5 = 'bleh'
|
||||
f[0].md5 = f[0].md5partial = 'foobar'
|
||||
f[1].md5 = f[1].md5partial = 'foobar'
|
||||
f[2].md5 = f[2].md5partial = 'bleh'
|
||||
s.min_match_percentage = 101
|
||||
r = s.GetDupeGroups(f)
|
||||
eq_(len(r), 1)
|
||||
@ -121,12 +121,12 @@ def test_min_match_perc_doesnt_matter_for_content_scan():
|
||||
eq_(len(r), 1)
|
||||
eq_(len(r[0]), 2)
|
||||
|
||||
def test_content_scan_puts_md5_in_words_at_the_end():
|
||||
def test_content_scan_doesnt_put_md5_in_words_at_the_end():
|
||||
s = Scanner()
|
||||
s.scan_type = SCAN_TYPE_CONTENT
|
||||
f = [no('foo'),no('bar')]
|
||||
f[0].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
|
||||
f[1].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
|
||||
f[0].md5 = f[0].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
|
||||
f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
|
||||
r = s.GetDupeGroups(f)
|
||||
g = r[0]
|
||||
eq_(g.ref.words, ['--'])
|
||||
|
Loading…
x
Reference in New Issue
Block a user