diff --git a/base/py/engine.py b/base/py/engine.py index 796c8c3d..5f5bc07c 100644 --- a/base/py/engine.py +++ b/base/py/engine.py @@ -14,6 +14,7 @@ import string from collections import defaultdict, namedtuple from unicodedata import normalize +from hsutil.misc import flatten from hsutil.str import multi_replace from hsutil import job @@ -260,9 +261,11 @@ class Group(object): self._percentage = None self._matches_for_ref = None - def clean_matches(self): - self.matches = set(m for m in self.matches if (m.first in self.unordered) and (m.second in self.unordered)) + def discard_matches(self): + discarded = set(m for m in self.matches if not all(obj in self.unordered for obj in [m.first, m.second])) + self.matches -= discarded self.candidates = defaultdict(set) + return discarded def get_match_of(self, item): if item is self.ref: @@ -286,14 +289,14 @@ class Group(object): if ref is not self.ref: self.switch_ref(ref) - def remove_dupe(self, item, clean_matches=True): + def remove_dupe(self, item, discard_matches=True): try: self.ordered.remove(item) self.unordered.remove(item) self._percentage = None self._matches_for_ref = None if (len(self) > 1) and any(not getattr(item, 'is_ref', False) for item in self): - if clean_matches: + if discard_matches: self.matches = set(m for m in self.matches if item not in m) else: self._clear() @@ -354,6 +357,13 @@ def get_groups(matches, j=job.nulljob): dupe2group[first] = target_group dupe2group[second] = target_group target_group.add_match(match) + # Now that we have a group, we have to discard groups' matches and see if there're any "orphan" + # matches, that is, matches that were candidate in a group but that none of their 2 files were + # accepted in the group. 
With these orphan matches, it's safe to build additional groups + matched_files = set(flatten(groups)) + orphan_matches = [] for group in groups: - group.clean_matches() + orphan_matches += set(m for m in group.discard_matches() if not any(obj in matched_files for obj in [m.first, m.second])) + if groups and orphan_matches: + groups += get_groups(orphan_matches) # no job, as it isn't supposed to take a long time return groups diff --git a/base/py/results.py b/base/py/results.py index 15debc0e..dee24920 100644 --- a/base/py/results.py +++ b/base/py/results.py @@ -249,7 +249,7 @@ class Results(Markable): else: affected_groups.add(group) for group in affected_groups: - group.clean_matches() + group.discard_matches() self.__dupes = None def save_to_xml(self, outfile): diff --git a/base/py/tests/engine_test.py b/base/py/tests/engine_test.py index a9ff33b1..2111618f 100644 --- a/base/py/tests/engine_test.py +++ b/base/py/tests/engine_test.py @@ -9,6 +9,8 @@ import sys +from nose.tools import eq_ + from hsutil import job from hsutil.decorators import log_calls from hsutil.testcase import TestCase @@ -719,12 +721,12 @@ class TCGroup(TestCase): self.assert_(g[0] is o1) self.assert_(g[1] is o2) - def test_clean_matches(self): + def test_discard_matches(self): g = Group() o1,o2,o3 = (NamedObject("foo",True),NamedObject("bar",True),NamedObject("baz",True)) g.add_match(get_match(o1,o2)) g.add_match(get_match(o1,o3)) - g.clean_matches() + g.discard_matches() self.assertEqual(1,len(g.matches)) self.assertEqual(0,len(g.candidates)) @@ -815,3 +817,20 @@ class TCget_groups(TestCase): self.assertEqual(0,self.log[0]) self.assertEqual(100,self.log[-1]) + def test_group_admissible_discarded_dupes(self): + # If, in an (A, B, C, D) set, all match with A, but C and D don't match with B, and the + # (A, B) match is the highest (thus resulting in an (A, B) group), still match C and D + # in a separate group instead of discarding them. 
+ A, B, C, D = [NamedObject() for _ in range(4)] + m1 = Match(A, B, 90) # This is the strongest "A" match + m2 = Match(A, C, 80) # Because C doesn't match with B, it won't be in the group + m3 = Match(A, D, 80) # Same thing for D + m4 = Match(C, D, 70) # However, because C and D match, they should have their own group. + groups = get_groups([m1, m2, m3, m4]) + eq_(len(groups), 2) + g1, g2 = groups + assert A in g1 + assert B in g1 + assert C in g2 + assert D in g2 +