Mirror of https://github.com/arsenetar/dupeguru.git

[#195 state:fixed] Fixed a bug where discarded matches were falsely reported.

commit 93781a0f35
parent 7dfb42fb41

@@ -152,7 +152,20 @@ class Scanner:
         logging.info('Grouping matches')
         groups = engine.get_groups(matches, j)
         matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
-        self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
+        if self.scan_type in {ScanType.Filename, ScanType.Fields, ScanType.FieldsNoOrder, ScanType.Tag}:
+            self.discarded_file_count = len(matched_files) - sum(len(g) for g in groups)
+        else:
+            # Ticket #195
+            # To speed up the scan, we don't bother comparing contents of files that are both ref
+            # files. However, this messes up "discarded" counting because there's a missing match
+            # in cases where we end up with a dupe group anyway (with a non-ref file). Because it's
+            # impossible to have discarded matches in exact dupe scans, we simply set it to 0, thus
+            # bypassing our tricky problem.
+            # Also, although ScanType.FuzzyBlock doesn't always do exact comparisons, it also
+            # bypasses ref comparisons, messing up our "discarded" count. So we're
+            # effectively disabling the "discarded" feature in PE, but that's better than falsely
+            # reporting discarded matches.
+            self.discarded_file_count = 0
         groups = [g for g in groups if any(not f.is_ref for f in g)]
         logging.info('Created %d groups' % len(groups))
         j.set_progress(100, tr("Doing group prioritization"))

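For context, here is a minimal, self-contained sketch of how the old formula could falsely report a discard. The file names and the two-tuple match representation are illustrative stand-ins, not code from the repo:

# Hypothetical illustration of the miscount fixed above (stand-in data).
# In a contents scan, "a" and "b" are ref files whose mutual comparison is
# skipped, so only two of the three possible matches between the identical
# files a, b and c are ever produced.
matches = [("a", "c"), ("b", "c")]   # the ("a", "b") match is skipped on purpose

# Grouping only admits a file that matches the group's existing members
# (roughly how engine.get_groups behaves), so with ("a", "b") missing,
# the ("b", "c") match is dropped and "b" stays out of the group:
groups = [{"a", "c"}]

matched_files = {f for pair in matches for f in pair}          # {"a", "b", "c"}
discarded = len(matched_files) - sum(len(g) for g in groups)   # 3 - 2 = 1
print(discarded)  # 1: a discard is reported although no real match was discarded

After this change, the formula is only trusted for the word-based scan types; exact and fuzzy-block scans report 0, trading an unreliable number for no false positives.
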
@@ -28,7 +28,7 @@ class NamedObject:
         self.words = getwords(name)
 
     def __repr__(self):
-        return '<NamedObject %r>' % self.name
+        return '<NamedObject %r %r>' % (self.name, self.path)
 
 
 no = NamedObject

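The widened __repr__ makes same-named test doubles distinguishable in failing assertions, which matters for the new test below where all three objects are named "foo". A minimal hypothetical stand-in showing the new output:

class NamedObject:
    # Hypothetical stripped-down stand-in for the test double; the real class
    # in the test module has more attributes (words, md5, is_ref, ...).
    def __init__(self, name, path):
        self.name = name
        self.path = path

    def __repr__(self):
        return '<NamedObject %r %r>' % (self.name, self.path)

print(NamedObject('foo', 'p1'))  # <NamedObject 'foo' 'p1'>
print(NamedObject('foo', 'p2'))  # <NamedObject 'foo' 'p2'>, same name, now distinguishable
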
@@ -507,3 +507,20 @@ def test_ignore_files_with_same_path(fake_fileexists):
     f1 = no('foobar', path='path1/foobar')
     f2 = no('foobar', path='path1/foobar')
     eq_(s.get_dupe_groups([f1, f2]), [])
+
+def test_dont_count_ref_files_as_discarded(fake_fileexists):
+    # To speed up the scan, we don't bother comparing contents of files that are both ref files.
+    # However, this causes problems in "discarded" counting, so we make sure here that we don't
+    # report discarded matches in exact duplicate scans.
+    s = Scanner()
+    s.scan_type = ScanType.Contents
+    o1 = no("foo", path="p1")
+    o2 = no("foo", path="p2")
+    o3 = no("foo", path="p3")
+    o1.md5 = o1.md5partial = 'foobar'
+    o2.md5 = o2.md5partial = 'foobar'
+    o3.md5 = o3.md5partial = 'foobar'
+    o1.is_ref = True
+    o2.is_ref = True
+    eq_(len(s.get_dupe_groups([o1, o2, o3])), 1)
+    eq_(s.discarded_file_count, 0)

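Assuming the suite is pytest-based (the fake_fileexists fixture and the hscommon-style eq_ helper suggest so) and that the module lives at core/tests/scanner_test.py (also an assumption), the new test can be run on its own with something like:

# Hypothetical invocation sketch; the module path is an assumption about the layout.
import pytest

pytest.main(["core/tests/scanner_test.py", "-k", "dont_count_ref_files_as_discarded"])
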