mirror of
				https://github.com/arsenetar/dupeguru.git
				synced 2025-09-11 17:58:17 +00:00 
			
		
		
		
	[#51 state:fixed] Improved the grouping algorithm to reduce the number of discarded matches in a scan.
--HG-- extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40116
This commit is contained in:
		
							parent
							
								
									536c43006d
								
							
						
					
					
						commit
						6d5ae99509
					
				| @ -14,6 +14,7 @@ import string | ||||
| from collections import defaultdict, namedtuple | ||||
| from unicodedata import normalize | ||||
| 
 | ||||
| from hsutil.misc import flatten | ||||
| from hsutil.str import multi_replace | ||||
| from hsutil import job | ||||
| 
 | ||||
| @ -260,9 +261,11 @@ class Group(object): | ||||
|         self._percentage = None | ||||
|         self._matches_for_ref = None | ||||
|      | ||||
def discard_matches(self):
    """Drop matches that reference files no longer part of this group.

    A match is kept only when both of its files are still in
    ``self.unordered``. The candidate tracking is reset because it may
    no longer be valid once matches have been removed.

    Returns the set of matches that were dropped, so the caller can
    decide what to do with them (e.g. re-group orphan matches).
    """
    kept = set()
    dropped = set()
    for match in self.matches:
        if (match.first in self.unordered) and (match.second in self.unordered):
            kept.add(match)
        else:
            dropped.add(match)
    self.matches = kept
    self.candidates = defaultdict(set)
    return dropped
|      | ||||
|     def get_match_of(self, item): | ||||
|         if item is self.ref: | ||||
| @ -286,14 +289,14 @@ class Group(object): | ||||
|         if ref is not self.ref: | ||||
|             self.switch_ref(ref) | ||||
|      | ||||
|     def remove_dupe(self, item, clean_matches=True): | ||||
|     def remove_dupe(self, item, discard_matches=True): | ||||
|         try: | ||||
|             self.ordered.remove(item) | ||||
|             self.unordered.remove(item) | ||||
|             self._percentage = None | ||||
|             self._matches_for_ref = None | ||||
|             if (len(self) > 1) and any(not getattr(item, 'is_ref', False) for item in self): | ||||
|                 if clean_matches: | ||||
|                 if discard_matches: | ||||
|                     self.matches = set(m for m in self.matches if item not in m) | ||||
|             else: | ||||
|                 self._clear() | ||||
| @ -354,6 +357,13 @@ def get_groups(matches, j=job.nulljob): | ||||
|                 dupe2group[first] = target_group | ||||
|                 dupe2group[second] = target_group | ||||
|         target_group.add_match(match) | ||||
|     # Now that we have groups, we have to discard each group's stale matches and see if there are | ||||
|     # any "orphan" matches, that is, matches that were candidates in a group but neither of their | ||||
|     # two files was accepted in the group. With these orphan matches, it's safe to build additional groups | ||||
|     matched_files = set(flatten(groups)) | ||||
|     orphan_matches = [] | ||||
|     for group in groups: | ||||
|         group.clean_matches() | ||||
|         orphan_matches += set(m for m in group.discard_matches() if not any(obj in matched_files for obj in [m.first, m.second])) | ||||
|     if groups and orphan_matches: | ||||
|         groups += get_groups(orphan_matches) # no job, as it isn't supposed to take a long time | ||||
|     return groups | ||||
|  | ||||
| @ -249,7 +249,7 @@ class Results(Markable): | ||||
|             else: | ||||
|                 affected_groups.add(group) | ||||
|         for group in affected_groups: | ||||
|             group.clean_matches() | ||||
|             group.discard_matches() | ||||
|         self.__dupes = None | ||||
|      | ||||
|     def save_to_xml(self, outfile): | ||||
|  | ||||
| @ -9,6 +9,8 @@ | ||||
| 
 | ||||
| import sys | ||||
| 
 | ||||
| from nose.tools import eq_ | ||||
| 
 | ||||
| from hsutil import job | ||||
| from hsutil.decorators import log_calls | ||||
| from hsutil.testcase import TestCase | ||||
| @ -719,12 +721,12 @@ class TCGroup(TestCase): | ||||
|         self.assert_(g[0] is o1) | ||||
|         self.assert_(g[1] is o2) | ||||
|      | ||||
def test_discard_matches(self):
    # After two matches sharing o1 are added, only one pair can actually
    # be in the group; discard_matches() must drop the other match and
    # clear the candidate tracking.
    group = Group()
    first = NamedObject("foo", True)
    second = NamedObject("bar", True)
    third = NamedObject("baz", True)
    group.add_match(get_match(first, second))
    group.add_match(get_match(first, third))
    group.discard_matches()
    self.assertEqual(len(group.matches), 1)
    self.assertEqual(len(group.candidates), 0)
|      | ||||
| @ -815,3 +817,20 @@ class TCget_groups(TestCase): | ||||
|         self.assertEqual(0,self.log[0]) | ||||
|         self.assertEqual(100,self.log[-1]) | ||||
|      | ||||
def test_group_admissible_discarded_dupes(self):
    # With files (A, B, C, D) all matching A, but C and D not matching B,
    # and (A, B) being the strongest match, the resulting group is (A, B).
    # C and D must then end up in a group of their own instead of being
    # discarded outright.
    A, B, C, D = (NamedObject(), NamedObject(), NamedObject(), NamedObject())
    matches = [
        Match(A, B, 90),  # the strongest match involving A
        Match(A, C, 80),  # C won't make it into the (A, B) group
        Match(A, D, 80),  # neither will D
        Match(C, D, 70),  # but C and D match, so they get their own group
    ]
    groups = get_groups(matches)
    eq_(len(groups), 2)
    first_group, second_group = groups
    assert A in first_group
    assert B in first_group
    assert C in second_group
    assert D in second_group
|      | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user