diff --git a/base/py/scanner.py b/base/py/scanner.py index 320151c2..a4a8891f 100644 --- a/base/py/scanner.py +++ b/base/py/scanner.py @@ -31,6 +31,16 @@ class Scanner(object): self.ignore_list = IgnoreList() self.discarded_file_count = 0 + @staticmethod + def _filter_matches_by_content(matches, partial, j): + matched_files = dedupe([m.first for m in matches] + [m.second for m in matches]) + md5attrname = 'md5partial' if partial else 'md5' + md5 = lambda f: getattr(f, md5attrname) + for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'): + md5(matched_file) + j.set_progress(100, 'Removing false matches') + return [m for m in matches if md5(m.first) == md5(m.second)] + def _getmatches(self, files, j): j = j.start_subjob(2) mf = engine.MatchFactory() @@ -88,19 +98,14 @@ class Scanner(object): iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list') matches = [m for m in iter_matches if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))] - matched_files = dedupe([m.first for m in matches] + [m.second for m in matches]) if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO): - md5attrname = 'md5partial' if self.scan_type == SCAN_TYPE_CONTENT_AUDIO else 'md5' - md5 = lambda f: getattr(f, md5attrname) - j = j.start_subjob(2) - for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'): - md5(matched_file) - j.set_progress(100, 'Removing false matches') - matches = [m for m in matches if md5(m.first) == md5(m.second)] - words_for_content = ['--'] # We compared md5. No words were involved. + j = j.start_subjob(3 if self.scan_type == SCAN_TYPE_CONTENT else 2) + matches = self._filter_matches_by_content(matches, partial=True, j=j) + if self.scan_type == SCAN_TYPE_CONTENT: + matches = self._filter_matches_by_content(matches, partial=False, j=j) + # We compared md5. No words were involved. for m in matches: - m.first.words = words_for_content - m.second.words = words_for_content + m.first.words = m.second.words = ['--'] logging.info('Grouping matches') groups = engine.get_groups(matches, j) groups = [g for g in groups if any(not f.is_ref for f in g)] diff --git a/base/py/tests/scanner_test.py b/base/py/tests/scanner_test.py index 5356d030..d683e405 100644 --- a/base/py/tests/scanner_test.py +++ b/base/py/tests/scanner_test.py @@ -86,9 +86,9 @@ def test_content_scan(): s = Scanner() s.scan_type = SCAN_TYPE_CONTENT f = [no('foo'), no('bar'), no('bleh')] - f[0].md5 = 'foobar' - f[1].md5 = 'foobar' - f[2].md5 = 'bleh' + f[0].md5 = f[0].md5partial = 'foobar' + f[1].md5 = f[1].md5partial = 'foobar' + f[2].md5 = f[2].md5partial = 'bleh' r = s.GetDupeGroups(f) eq_(len(r), 1) eq_(len(r[0]), 2) @@ -109,9 +109,9 @@ def test_min_match_perc_doesnt_matter_for_content_scan(): s = Scanner() s.scan_type = SCAN_TYPE_CONTENT f = [no('foo'), no('bar'), no('bleh')] - f[0].md5 = 'foobar' - f[1].md5 = 'foobar' - f[2].md5 = 'bleh' + f[0].md5 = f[0].md5partial = 'foobar' + f[1].md5 = f[1].md5partial = 'foobar' + f[2].md5 = f[2].md5partial = 'bleh' s.min_match_percentage = 101 r = s.GetDupeGroups(f) eq_(len(r), 1) @@ -121,12 +121,12 @@ def test_min_match_perc_doesnt_matter_for_content_scan(): eq_(len(r), 1) eq_(len(r[0]), 2) -def test_content_scan_puts_md5_in_words_at_the_end(): +def test_content_scan_doesnt_put_md5_in_words_at_the_end(): s = Scanner() s.scan_type = SCAN_TYPE_CONTENT f = [no('foo'),no('bar')] - f[0].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f' - f[1].md5 = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f' + f[0].md5 = f[0].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f' + f[1].md5 = f[1].md5partial = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f' r = s.GetDupeGroups(f) g = r[0] eq_(g.ref.words, ['--'])