diff --git a/core/scanner.py b/core/scanner.py
index fc1194b4..8418b8df 100644
--- a/core/scanner.py
+++ b/core/scanner.py
@@ -152,7 +152,20 @@ class Scanner:
         logging.info('Grouping matches')
         groups = engine.get_groups(matches, j)
         matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
-        self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
+        if self.scan_type in {ScanType.Filename, ScanType.Fields, ScanType.FieldsNoOrder, ScanType.Tag}:
+            self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
+        else:
+            # Ticket #195
+            # To speed up the scan, we don't bother comparing contents of files that are both ref
+            # files. However, this messes up "discarded" counting because there's a missing match
+            # in cases where we end up with a dupe group anyway (with a non-ref file). Because it's
+            # impossible to have discarded matches in exact dupe scans, we simply set it to 0, thus
+            # bypassing our tricky problem.
+            # Also, although ScanType.FuzzyBlock doesn't always do exact comparisons, we also
+            # bypass ref comparison there, thus messing up our "discarded" count. So we're
+            # effectively disabling the "discarded" feature in PE, but it's better than falsely
+            # reporting discarded matches.
+            self.discarded_file_count = 0
         groups = [g for g in groups if any(not f.is_ref for f in g)]
         logging.info('Created %d groups' % len(groups))
         j.set_progress(100, tr("Doing group prioritization"))
diff --git a/core/tests/scanner_test.py b/core/tests/scanner_test.py
index e65cc72c..e129ebde 100644
--- a/core/tests/scanner_test.py
+++ b/core/tests/scanner_test.py
@@ -28,7 +28,7 @@ class NamedObject:
         self.words = getwords(name)
 
     def __repr__(self):
-        return '<NamedObject %r>' % self.name
+        return '<NamedObject %r %r>' % (self.name, self.path)
 
 no = NamedObject
 
@@ -507,3 +507,20 @@ def test_ignore_files_with_same_path(fake_fileexists):
     f1 = no('foobar', path='path1/foobar')
     f2 = no('foobar', path='path1/foobar')
     eq_(s.get_dupe_groups([f1, f2]), [])
+
+def test_dont_count_ref_files_as_discarded(fake_fileexists):
+    # To speed up the scan, we don't bother comparing contents of files that are both ref files.
+    # However, this causes problems with "discarded" counting, so we make sure here that we don't
+    # report discarded matches in exact duplicate scans.
+    s = Scanner()
+    s.scan_type = ScanType.Contents
+    o1 = no("foo", path="p1")
+    o2 = no("foo", path="p2")
+    o3 = no("foo", path="p3")
+    o1.md5 = o1.md5partial = 'foobar'
+    o2.md5 = o2.md5partial = 'foobar'
+    o3.md5 = o3.md5partial = 'foobar'
+    o1.is_ref = True
+    o2.is_ref = True
+    eq_(len(s.get_dupe_groups([o1, o2, o3])), 1)
+    eq_(s.discarded_file_count, 0)
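
For context, here is a minimal, self-contained sketch of why skipping ref-vs-ref comparisons throws off the discarded count. It is not dupeGuru's actual engine; toy_groups() and discarded() are hypothetical stand-ins for engine.get_groups() and the Scanner subtraction. With two ref files and one normal file that are all identical, the missing ref-ref match leaves one match unable to complete a group, so the naive subtraction reports a phantom discarded file.

    # Toy illustration only -- these helpers are made up for this sketch,
    # not taken from dupeGuru's codebase.
    from itertools import combinations

    def toy_groups(matches):
        # Greedy grouping: a file may join a group only if it matches every
        # file already in the group; matches that can't be used are discarded.
        matchset = {frozenset(m) for m in matches}
        groups = []
        for a, b in matches:
            for g in groups:
                if a in g or b in g:
                    other = b if a in g else a
                    if other in g or all(frozenset((other, f)) in matchset for f in g):
                        g.add(other)
                    break
            else:
                groups.append({a, b})
        return groups

    def discarded(matches):
        # The subtraction the old code performed unconditionally.
        matched = {f for m in matches for f in m}
        return len(matched) - sum(len(g) for g in toy_groups(matches))

    refs = {'r1', 'r2'}                  # two "reference" files
    files = ['r1', 'r2', 'n']            # plus one normal file, all identical

    all_pairs = list(combinations(files, 2))
    print(discarded(all_pairs))          # 0 -- one complete group of 3

    fast = [(a, b) for a, b in all_pairs if not (a in refs and b in refs)]
    print(discarded(fast))               # 1 -- ('r2', 'n') can't join {'r1', 'n'},
                                         # a phantom "discarded" file; the patch
                                         # pins the count to 0 instead

This is exactly the situation the new test constructs with o1/o2 as ref files and o3 as the normal file: one group still comes out, but the leftover match must not be reported as a discard.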