mirror of
https://github.com/arsenetar/dupeguru.git
synced 2025-03-10 05:34:36 +00:00
[#51 state:fixed] Improved the grouping algorithm to reduce the number of discarded matches in a scan.
--HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40116
This commit is contained in:
parent
536c43006d
commit
6d5ae99509
@ -14,6 +14,7 @@ import string
|
|||||||
from collections import defaultdict, namedtuple
|
from collections import defaultdict, namedtuple
|
||||||
from unicodedata import normalize
|
from unicodedata import normalize
|
||||||
|
|
||||||
|
from hsutil.misc import flatten
|
||||||
from hsutil.str import multi_replace
|
from hsutil.str import multi_replace
|
||||||
from hsutil import job
|
from hsutil import job
|
||||||
|
|
||||||
@ -260,9 +261,11 @@ class Group(object):
|
|||||||
self._percentage = None
|
self._percentage = None
|
||||||
self._matches_for_ref = None
|
self._matches_for_ref = None
|
||||||
|
|
||||||
def clean_matches(self):
|
def discard_matches(self):
|
||||||
self.matches = set(m for m in self.matches if (m.first in self.unordered) and (m.second in self.unordered))
|
discarded = set(m for m in self.matches if not all(obj in self.unordered for obj in [m.first, m.second]))
|
||||||
|
self.matches -= discarded
|
||||||
self.candidates = defaultdict(set)
|
self.candidates = defaultdict(set)
|
||||||
|
return discarded
|
||||||
|
|
||||||
def get_match_of(self, item):
|
def get_match_of(self, item):
|
||||||
if item is self.ref:
|
if item is self.ref:
|
||||||
@ -286,14 +289,14 @@ class Group(object):
|
|||||||
if ref is not self.ref:
|
if ref is not self.ref:
|
||||||
self.switch_ref(ref)
|
self.switch_ref(ref)
|
||||||
|
|
||||||
def remove_dupe(self, item, clean_matches=True):
|
def remove_dupe(self, item, discard_matches=True):
|
||||||
try:
|
try:
|
||||||
self.ordered.remove(item)
|
self.ordered.remove(item)
|
||||||
self.unordered.remove(item)
|
self.unordered.remove(item)
|
||||||
self._percentage = None
|
self._percentage = None
|
||||||
self._matches_for_ref = None
|
self._matches_for_ref = None
|
||||||
if (len(self) > 1) and any(not getattr(item, 'is_ref', False) for item in self):
|
if (len(self) > 1) and any(not getattr(item, 'is_ref', False) for item in self):
|
||||||
if clean_matches:
|
if discard_matches:
|
||||||
self.matches = set(m for m in self.matches if item not in m)
|
self.matches = set(m for m in self.matches if item not in m)
|
||||||
else:
|
else:
|
||||||
self._clear()
|
self._clear()
|
||||||
@ -354,6 +357,13 @@ def get_groups(matches, j=job.nulljob):
|
|||||||
dupe2group[first] = target_group
|
dupe2group[first] = target_group
|
||||||
dupe2group[second] = target_group
|
dupe2group[second] = target_group
|
||||||
target_group.add_match(match)
|
target_group.add_match(match)
|
||||||
|
# Now that we have our groups, we have to discard each group's matches and see if there are any "orphan"
|
||||||
|
# matches, that is, matches that were candidates in a group but had neither of their 2 files
|
||||||
|
# accepted in the group. With these orphan matches, it's safe to build additional groups.
|
||||||
|
matched_files = set(flatten(groups))
|
||||||
|
orphan_matches = []
|
||||||
for group in groups:
|
for group in groups:
|
||||||
group.clean_matches()
|
orphan_matches += set(m for m in group.discard_matches() if not any(obj in matched_files for obj in [m.first, m.second]))
|
||||||
|
if groups and orphan_matches:
|
||||||
|
groups += get_groups(orphan_matches) # no job, as it isn't supposed to take a long time
|
||||||
return groups
|
return groups
|
||||||
|
@ -249,7 +249,7 @@ class Results(Markable):
|
|||||||
else:
|
else:
|
||||||
affected_groups.add(group)
|
affected_groups.add(group)
|
||||||
for group in affected_groups:
|
for group in affected_groups:
|
||||||
group.clean_matches()
|
group.discard_matches()
|
||||||
self.__dupes = None
|
self.__dupes = None
|
||||||
|
|
||||||
def save_to_xml(self, outfile):
|
def save_to_xml(self, outfile):
|
||||||
|
@ -9,6 +9,8 @@
|
|||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
from nose.tools import eq_
|
||||||
|
|
||||||
from hsutil import job
|
from hsutil import job
|
||||||
from hsutil.decorators import log_calls
|
from hsutil.decorators import log_calls
|
||||||
from hsutil.testcase import TestCase
|
from hsutil.testcase import TestCase
|
||||||
@ -719,12 +721,12 @@ class TCGroup(TestCase):
|
|||||||
self.assert_(g[0] is o1)
|
self.assert_(g[0] is o1)
|
||||||
self.assert_(g[1] is o2)
|
self.assert_(g[1] is o2)
|
||||||
|
|
||||||
def test_clean_matches(self):
|
def test_discard_matches(self):
|
||||||
g = Group()
|
g = Group()
|
||||||
o1,o2,o3 = (NamedObject("foo",True),NamedObject("bar",True),NamedObject("baz",True))
|
o1,o2,o3 = (NamedObject("foo",True),NamedObject("bar",True),NamedObject("baz",True))
|
||||||
g.add_match(get_match(o1,o2))
|
g.add_match(get_match(o1,o2))
|
||||||
g.add_match(get_match(o1,o3))
|
g.add_match(get_match(o1,o3))
|
||||||
g.clean_matches()
|
g.discard_matches()
|
||||||
self.assertEqual(1,len(g.matches))
|
self.assertEqual(1,len(g.matches))
|
||||||
self.assertEqual(0,len(g.candidates))
|
self.assertEqual(0,len(g.candidates))
|
||||||
|
|
||||||
@ -815,3 +817,20 @@ class TCget_groups(TestCase):
|
|||||||
self.assertEqual(0,self.log[0])
|
self.assertEqual(0,self.log[0])
|
||||||
self.assertEqual(100,self.log[-1])
|
self.assertEqual(100,self.log[-1])
|
||||||
|
|
||||||
|
def test_group_admissible_discarded_dupes(self):
|
||||||
|
# If, in an (A, B, C, D) set, all files match with A, but C and D don't match with B, and the
|
||||||
|
# (A, B) match is the highest (thus resulting in an (A, B) group), still match C and D
|
||||||
|
# in a separate group instead of discarding them.
|
||||||
|
A, B, C, D = [NamedObject() for _ in range(4)]
|
||||||
|
m1 = Match(A, B, 90) # This is the strongest "A" match
|
||||||
|
m2 = Match(A, C, 80) # Because C doesn't match with B, it won't be in the group
|
||||||
|
m3 = Match(A, D, 80) # Same thing for D
|
||||||
|
m4 = Match(C, D, 70) # However, because C and D match, they should have their own group.
|
||||||
|
groups = get_groups([m1, m2, m3, m4])
|
||||||
|
eq_(len(groups), 2)
|
||||||
|
g1, g2 = groups
|
||||||
|
assert A in g1
|
||||||
|
assert B in g1
|
||||||
|
assert C in g2
|
||||||
|
assert D in g2
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user