# Patch summary: in the Scanner hunk, the computation of `matched_files` and
# `self.discarded_file_count` is moved from after the prioritization loop to
# right after `engine.get_groups(...)`, i.e. before the list comprehension that
# drops groups made only of `is_ref` files. The count is therefore taken
# against all groups returned by the engine, so files belonging to a dropped
# all-ref group are no longer counted as discarded.
# The test hunk documents this and asserts `s.discarded_file_count == 0` in
# `test_trim_all_ref_groups`.
# NOTE(review): line breaks were restored from the hunk line counts; the
# leading indentation of the hunk lines is reconstructed -- confirm it against
# the target files before applying.
diff --git a/base/py/scanner.py b/base/py/scanner.py
index 18b83444..ff59d523 100644
--- a/base/py/scanner.py
+++ b/base/py/scanner.py
@@ -109,13 +109,13 @@ class Scanner(object):
                 m.first.words = m.second.words = ['--']
         logging.info('Grouping matches')
         groups = engine.get_groups(matches, j)
+        matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
+        self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
         groups = [g for g in groups if any(not f.is_ref for f in g)]
         logging.info('Created %d groups' % len(groups))
         j.set_progress(100, 'Doing group prioritization')
         for g in groups:
             g.prioritize(self._key_func, self._tie_breaker)
-        matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
-        self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
         return groups
 
     match_factory = None
diff --git a/base/py/tests/scanner_test.py b/base/py/tests/scanner_test.py
index d683e405..7356d658 100644
--- a/base/py/tests/scanner_test.py
+++ b/base/py/tests/scanner_test.py
@@ -62,12 +62,15 @@ def test_simple_with_lower_min_match():
     eq_(len(g), 3)
 
 def test_trim_all_ref_groups():
+    # When all files of a group are ref, don't include that group in the results, but also don't
+    # count the files from that group as discarded.
     s = Scanner()
     f = [no('foo'), no('foo'), no('bar'), no('bar')]
     f[2].is_ref = True
     f[3].is_ref = True
     r = s.GetDupeGroups(f)
     eq_(len(r), 1)
+    eq_(s.discarded_file_count, 0)
 
 def test_priorize():
     s = Scanner()