diff --git a/base/py/data.py b/base/py/data.py
index 3b1d1017..2f81084e 100644
--- a/base/py/data.py
+++ b/base/py/data.py
@@ -79,7 +79,7 @@ def GetDisplayInfo(dupe, group, delta):
         format_timestamp(ctime, delta and m),
         format_timestamp(mtime, delta and m),
         format_perc(percentage),
-        format_words(dupe.words),
+        format_words(dupe.words) if hasattr(dupe, 'words') else '',
         format_dupe_count(dupe_count)
     ]
 
diff --git a/base/py/data_me.py b/base/py/data_me.py
index 41ce0f85..4fc74069 100644
--- a/base/py/data_me.py
+++ b/base/py/data_me.py
@@ -76,7 +76,7 @@ def GetDisplayInfo(dupe, group, delta):
         str(dupe.track),
         dupe.comment,
         format_perc(percentage),
-        format_words(dupe.words),
+        format_words(dupe.words) if hasattr(dupe, 'words') else '',
         format_dupe_count(dupe_count)
     ]
 
diff --git a/base/py/engine.py b/base/py/engine.py
index 19626f55..b34f2edd 100644
--- a/base/py/engine.py
+++ b/base/py/engine.py
@@ -9,6 +9,7 @@
 from __future__ import division
 
 import difflib
+import itertools
 import logging
 import string
 from collections import defaultdict, namedtuple
@@ -156,58 +157,69 @@ def get_match(first, second, flags=()):
     percentage = compare(first.words, second.words, flags)
     return Match(first, second, percentage)
 
-class MatchFactory(object):
-    common_word_threshold = 50
-    match_similar_words = False
-    min_match_percentage = 0
-    weight_words = False
-    no_field_order = False
-    limit = 5000000
-
-    def getmatches(self, objects, j=job.nulljob):
-        j = j.start_subjob(2)
-        sj = j.start_subjob(2)
-        for o in objects:
-            if not hasattr(o, 'words'):
-                o.words = getwords(o.name)
-        word_dict = build_word_dict(objects, sj)
-        reduce_common_words(word_dict, self.common_word_threshold)
-        if self.match_similar_words:
-            merge_similar_words(word_dict)
-        match_flags = []
-        if self.weight_words:
-            match_flags.append(WEIGHT_WORDS)
-        if self.match_similar_words:
-            match_flags.append(MATCH_SIMILAR_WORDS)
-        if self.no_field_order:
-            match_flags.append(NO_FIELD_ORDER)
-        j.start_job(len(word_dict), '0 matches found')
-        compared = defaultdict(set)
-        result = []
-        try:
-            # This whole 'popping' thing is there to avoid taking too much memory at the same time.
-            while word_dict:
-                items = word_dict.popitem()[1]
-                while items:
-                    ref = items.pop()
-                    compared_already = compared[ref]
-                    to_compare = items - compared_already
-                    compared_already |= to_compare
-                    for other in to_compare:
-                        m = get_match(ref, other, match_flags)
-                        if m.percentage >= self.min_match_percentage:
-                            result.append(m)
-                            if len(result) >= self.limit:
-                                return result
-                j.add_progress(desc='%d matches found' % len(result))
-        except MemoryError:
-            # This is the place where the memory usage is at its peak during the scan.
-            # Just continue the process with an incomplete list of matches.
-            del compared # This should give us enough room to call logging.
-            logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
-            return result
+def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False,
+    no_field_order=False, j=job.nulljob):
+    COMMON_WORD_THRESHOLD = 50
+    LIMIT = 5000000
+    j = j.start_subjob(2)
+    sj = j.start_subjob(2)
+    for o in objects:
+        if not hasattr(o, 'words'):
+            o.words = getwords(o.name)
+    word_dict = build_word_dict(objects, sj)
+    reduce_common_words(word_dict, COMMON_WORD_THRESHOLD)
+    if match_similar_words:
+        merge_similar_words(word_dict)
+    match_flags = []
+    if weight_words:
+        match_flags.append(WEIGHT_WORDS)
+    if match_similar_words:
+        match_flags.append(MATCH_SIMILAR_WORDS)
+    if no_field_order:
+        match_flags.append(NO_FIELD_ORDER)
+    j.start_job(len(word_dict), '0 matches found')
+    compared = defaultdict(set)
+    result = []
+    try:
+        # This whole 'popping' thing is there to avoid taking too much memory at the same time.
+        while word_dict:
+            items = word_dict.popitem()[1]
+            while items:
+                ref = items.pop()
+                compared_already = compared[ref]
+                to_compare = items - compared_already
+                compared_already |= to_compare
+                for other in to_compare:
+                    m = get_match(ref, other, match_flags)
+                    if m.percentage >= min_match_percentage:
+                        result.append(m)
+                        if len(result) >= LIMIT:
+                            return result
+            j.add_progress(desc='%d matches found' % len(result))
+    except MemoryError:
+        # This is the place where the memory usage is at its peak during the scan.
+        # Just continue the process with an incomplete list of matches.
+        del compared # This should give us enough room to call logging.
+        logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
+        return result
-        return result
+    return result
+
+def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob):
+    j = j.start_subjob([2, 8])
+    size2files = defaultdict(set)
+    for file in j.iter_with_progress(files, 'Read size of %d/%d files'):
+        size2files[getattr(file, sizeattr)].add(file)
+    possible_matches = [files for files in size2files.values() if len(files) > 1]
+    del size2files
+    result = []
+    j.start_job(len(possible_matches), '0 matches found')
+    for group in possible_matches:
+        for first, second in itertools.combinations(group, 2):
+            if first.md5partial == second.md5partial:
+                if partial or first.md5 == second.md5:
+                    result.append(Match(first, second, 100))
+        j.add_progress(desc='%d matches found' % len(result))
+    return result
 
 class Group(object):
     #---Override
diff --git a/base/py/scanner.py b/base/py/scanner.py
index ff59d523..0ac41d23 100644
--- a/base/py/scanner.py
+++ b/base/py/scanner.py
@@ -32,40 +32,32 @@ class Scanner(object):
         self.ignore_list = IgnoreList()
         self.discarded_file_count = 0
 
-    @staticmethod
-    def _filter_matches_by_content(matches, partial, j):
-        matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
-        md5attrname = 'md5partial' if partial else 'md5'
-        md5 = lambda f: getattr(f, md5attrname)
-        for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'):
-            md5(matched_file)
-        j.set_progress(100, 'Removing false matches')
-        return [m for m in matches if md5(m.first) == md5(m.second)]
-
     def _getmatches(self, files, j):
-        j = j.start_subjob(2)
-        mf = engine.MatchFactory()
-        if self.scan_type != SCAN_TYPE_CONTENT:
-            mf.match_similar_words = self.match_similar_words
-            mf.weight_words = self.word_weighting
-            mf.min_match_percentage = self.min_match_percentage
-        if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
-            self.scan_type = SCAN_TYPE_FIELDS
-            mf.no_field_order = True
-        func = {
-            SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
-            SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
-            SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
-            SCAN_TYPE_CONTENT: lambda f: [str(f.size)],
-            SCAN_TYPE_CONTENT_AUDIO: lambda f: [str(f.audiosize)]
-        }[self.scan_type]
-        for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
-            if self.size_threshold:
+        if self.size_threshold:
+            j = j.start_subjob([2, 8])
+            for f in j.iter_with_progress(files, 'Read size of %d/%d files'):
                 f.size # pre-read, makes a smoother progress if read here (especially for bundles)
-            f.words = func(f)
-        if self.size_threshold:
             files = [f for f in files if f.size >= self.size_threshold]
-        return mf.getmatches(files, j)
+        if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
+            sizeattr = 'size' if self.scan_type == SCAN_TYPE_CONTENT else 'audiosize'
+            return engine.getmatches_by_contents(files, sizeattr, partial=self.scan_type==SCAN_TYPE_CONTENT_AUDIO, j=j)
+        else:
+            j = j.start_subjob([2, 8])
+            kw = {}
+            kw['match_similar_words'] = self.match_similar_words
+            kw['weight_words'] = self.word_weighting
+            kw['min_match_percentage'] = self.min_match_percentage
+            if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
+                self.scan_type = SCAN_TYPE_FIELDS
+                kw['no_field_order'] = True
+            func = {
+                SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
+                SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
+                SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
+            }[self.scan_type]
+            for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
+                f.words = func(f)
+            return engine.getmatches(files, j=j, **kw)
 
     @staticmethod
     def _key_func(dupe):
@@ -86,10 +78,7 @@ class Scanner(object):
         for f in [f for f in files if not hasattr(f, 'is_ref')]:
             f.is_ref = False
         logging.info('Getting matches')
-        if self.match_factory is None:
-            matches = self._getmatches(files, j)
-        else:
-            matches = self.match_factory.getmatches(files, j)
+        matches = self._getmatches(files, j)
         logging.info('Found %d matches' % len(matches))
         if not self.mix_file_kind:
             j.set_progress(100, 'Removing false matches')
@@ -99,14 +88,6 @@
             iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
             matches = [m for m in iter_matches if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
-        if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
-            j = j.start_subjob(3 if self.scan_type == SCAN_TYPE_CONTENT else 2)
-            matches = self._filter_matches_by_content(matches, partial=True, j=j)
-            if self.scan_type == SCAN_TYPE_CONTENT:
-                matches = self._filter_matches_by_content(matches, partial=False, j=j)
-            # We compared md5. No words were involved.
-            for m in matches:
-                m.first.words = m.second.words = ['--']
         logging.info('Grouping matches')
         groups = engine.get_groups(matches, j)
         matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
diff --git a/base/py/tests/app_cocoa_test.py b/base/py/tests/app_cocoa_test.py
index 4a89b356..4f8ca34e 100644
--- a/base/py/tests/app_cocoa_test.py
+++ b/base/py/tests/app_cocoa_test.py
@@ -318,7 +318,7 @@ class TCDupeGuru_renameSelected(TestCase):
         fp = open(str(p + 'foo bar 3'),mode='w')
         fp.close()
         refdir = hsfs.phys.Directory(None,str(p))
-        matches = engine.MatchFactory().getmatches(refdir.files)
+        matches = engine.getmatches(refdir.files)
         groups = engine.get_groups(matches)
         g = groups[0]
         g.prioritize(lambda x:x.name)
diff --git a/base/py/tests/engine_test.py b/base/py/tests/engine_test.py
index 2111618f..1c3366bc 100644
--- a/base/py/tests/engine_test.py
+++ b/base/py/tests/engine_test.py
@@ -340,21 +340,13 @@ class TCget_match(TestCase):
         self.assertEqual(int((6.0 / 13.0) * 100),get_match(NamedObject("foo bar",True),NamedObject("bar bleh",True),(WEIGHT_WORDS,)).percentage)
 
-class TCMatchFactory(TestCase):
+class GetMatches(TestCase):
     def test_empty(self):
-        self.assertEqual([],MatchFactory().getmatches([]))
-
-    def test_defaults(self):
-        mf = MatchFactory()
-        self.assertEqual(50,mf.common_word_threshold)
-        self.assertEqual(False,mf.weight_words)
-        self.assertEqual(False,mf.match_similar_words)
-        self.assertEqual(False,mf.no_field_order)
-        self.assertEqual(0,mf.min_match_percentage)
+        eq_(getmatches([]), [])
 
     def test_simple(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(2,len(r))
         seek = [m for m in r if m.percentage == 50] #"foo bar" and "bar bleh"
         m = seek[0]
@@ -367,7 +359,7 @@ def test_null_and_unrelated_objects(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject(""),NamedObject("unrelated object")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))
         m = r[0]
         self.assertEqual(50,m.percentage)
@@ -376,34 +368,33 @@ def test_twice_the_same_word(self):
         l = [NamedObject("foo foo bar"),NamedObject("bar bleh")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))
 
     def test_twice_the_same_word_when_preworded(self):
         l = [NamedObject("foo foo bar",True),NamedObject("bar bleh",True)]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))
 
     def test_two_words_match(self):
         l = [NamedObject("foo bar"),NamedObject("foo bar bleh")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))
 
     def test_match_files_with_only_common_words(self):
         #If a word occurs more than 50 times, it is excluded from the matching process
         #The problem with the common_word_threshold is that the files containing only common
         #words will never be matched together. We *should* match them.
-        mf = MatchFactory()
-        mf.common_word_threshold = 50
+        # This test assumes that the common word threshold const is 50
         l = [NamedObject("foo") for i in range(50)]
-        r = mf.getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1225,len(r))
 
     def test_use_words_already_there_if_there(self):
         o1 = NamedObject('foo')
         o2 = NamedObject('bar')
         o2.words = ['foo']
-        self.assertEqual(1,len(MatchFactory().getmatches([o1,o2])))
+        eq_(1, len(getmatches([o1,o2])))
 
     def test_job(self):
         def do_progress(p,d=''):
@@ -413,75 +404,62 @@
         j = job.Job(1,do_progress)
         self.log = []
         s = "foo bar"
-        MatchFactory().getmatches([NamedObject(s),NamedObject(s),NamedObject(s)],j)
+        getmatches([NamedObject(s), NamedObject(s), NamedObject(s)], j=j)
         self.assert_(len(self.log) > 2)
         self.assertEqual(0,self.log[0])
         self.assertEqual(100,self.log[-1])
 
     def test_weight_words(self):
-        mf = MatchFactory()
-        mf.weight_words = True
         l = [NamedObject("foo bar"),NamedObject("bar bleh")]
-        m = mf.getmatches(l)[0]
+        m = getmatches(l, weight_words=True)[0]
         self.assertEqual(int((6.0 / 13.0) * 100),m.percentage)
 
     def test_similar_word(self):
-        mf = MatchFactory()
-        mf.match_similar_words = True
         l = [NamedObject("foobar"),NamedObject("foobars")]
-        self.assertEqual(1,len(mf.getmatches(l)))
-        self.assertEqual(100,mf.getmatches(l)[0].percentage)
+        eq_(len(getmatches(l, match_similar_words=True)), 1)
+        eq_(getmatches(l, match_similar_words=True)[0].percentage, 100)
         l = [NamedObject("foobar"),NamedObject("foo")]
-        self.assertEqual(0,len(mf.getmatches(l))) #too far
+        eq_(len(getmatches(l, match_similar_words=True)), 0) #too far
         l = [NamedObject("bizkit"),NamedObject("bizket")]
-        self.assertEqual(1,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=True)), 1)
         l = [NamedObject("foobar"),NamedObject("foosbar")]
-        self.assertEqual(1,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=True)), 1)
 
     def test_single_object_with_similar_words(self):
-        mf = MatchFactory()
-        mf.match_similar_words = True
         l = [NamedObject("foo foos")]
-        self.assertEqual(0,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=True)), 0)
 
     def test_double_words_get_counted_only_once(self):
-        mf = MatchFactory()
         l = [NamedObject("foo bar foo bleh"),NamedObject("foo bar bleh bar")]
-        m = mf.getmatches(l)[0]
+        m = getmatches(l)[0]
         self.assertEqual(75,m.percentage)
 
     def test_with_fields(self):
-        mf = MatchFactory()
        o1 = NamedObject("foo bar - foo bleh")
         o2 = NamedObject("foo bar - bleh bar")
         o1.words = getfields(o1.name)
         o2.words = getfields(o2.name)
-        m = mf.getmatches([o1, o2])[0]
+        m = getmatches([o1, o2])[0]
         self.assertEqual(50, m.percentage)
 
     def test_with_fields_no_order(self):
-        mf = MatchFactory()
-        mf.no_field_order = True
         o1 = NamedObject("foo bar - foo bleh")
         o2 = NamedObject("bleh bang - foo bar")
         o1.words = getfields(o1.name)
         o2.words = getfields(o2.name)
-        m = mf.getmatches([o1, o2])[0]
-        self.assertEqual(50 ,m.percentage)
+        m = getmatches([o1, o2], no_field_order=True)[0]
+        eq_(m.percentage, 50)
 
     def test_only_match_similar_when_the_option_is_set(self):
-        mf = MatchFactory()
-        mf.match_similar_words = False
         l = [NamedObject("foobar"),NamedObject("foobars")]
-        self.assertEqual(0,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=False)), 0)
 
     def test_dont_recurse_do_match(self):
         # with nosetests, the stack is increased. The number has to be high enough not to be failing falsely
         sys.setrecursionlimit(100)
-        mf = MatchFactory()
         files = [NamedObject('foo bar') for i in range(101)]
         try:
-            mf.getmatches(files)
+            getmatches(files)
         except RuntimeError:
             self.fail()
         finally:
@@ -489,18 +467,9 @@
     def test_min_match_percentage(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
-        mf = MatchFactory()
-        mf.min_match_percentage = 50
-        r = mf.getmatches(l)
+        r = getmatches(l, min_match_percentage=50)
         self.assertEqual(1,len(r)) #Only "foo bar" / "bar bleh" should match
 
-    def test_limit(self):
-        l = [NamedObject(),NamedObject(),NamedObject()]
-        mf = MatchFactory()
-        mf.limit = 2
-        r = mf.getmatches(l)
-        self.assertEqual(2,len(r))
-
     def test_MemoryError(self):
         @log_calls
         def mocked_match(first, second, flags):
@@ -510,9 +479,8 @@
         objects = [NamedObject() for i in range(10)] # results in 45 matches
         self.mock(engine, 'get_match', mocked_match)
-        mf = MatchFactory()
         try:
-            r = mf.getmatches(objects)
+            r = getmatches(objects)
         except MemoryError:
             self.fail('MemoryError must be handled')
         self.assertEqual(42, len(r))
@@ -738,7 +706,7 @@ class TCget_groups(TestCase):
     def test_simple(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh")]
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         m = matches[0]
         r = get_groups(matches)
         self.assertEqual(1,len(r))
@@ -749,7 +717,7 @@
     def test_group_with_multiple_matches(self):
         #This results in 3 matches
         l = [NamedObject("foo"),NamedObject("foo"),NamedObject("foo")]
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         r = get_groups(matches)
         self.assertEqual(1,len(r))
         g = r[0]
@@ -759,7 +727,7 @@
         l = [NamedObject("a b"),NamedObject("a b"),NamedObject("b c"),NamedObject("c d"),NamedObject("c d")]
         #There will be 2 groups here: group "a b" and group "c d"
         #"b c" can go either of them, but not both.
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         r = get_groups(matches)
         self.assertEqual(2,len(r))
         self.assertEqual(5,len(r[0])+len(r[1]))
@@ -768,7 +736,7 @@
         l = [NamedObject("a b"),NamedObject("a b"),NamedObject("a b"),NamedObject("a b")]
         #There will be 2 groups here: group "a b" and group "c d"
         #"b c" can fit in both, but it must be in only one of them
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         r = get_groups(matches)
         self.assertEqual(1,len(r))
@@ -788,7 +756,7 @@
 
     def test_four_sized_group(self):
         l = [NamedObject("foobar") for i in xrange(4)]
-        m = MatchFactory().getmatches(l)
+        m = getmatches(l)
         r = get_groups(m)
         self.assertEqual(1,len(r))
         self.assertEqual(4,len(r[0]))
diff --git a/base/py/tests/results_test.py b/base/py/tests/results_test.py
index b49303a9..ef24a81a 100644
--- a/base/py/tests/results_test.py
+++ b/base/py/tests/results_test.py
@@ -37,7 +37,7 @@ class NamedObject(engine_test.NamedObject):
 def GetTestGroups():
     objects = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("foo bleh"),NamedObject("ibabtu"),NamedObject("ibabtu")]
     objects[1].size = 1024
-    matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
+    matches = engine.getmatches(objects) #we should have 5 matches
     groups = engine.get_groups(matches) #We should have 2 groups
     for g in groups:
         g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
@@ -505,7 +505,7 @@ class TCResultsXML(TestCase):
             return objects[1]
 
         objects = [NamedObject(u"\xe9foo bar",True),NamedObject("bar bleh",True)]
-        matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
+        matches = engine.getmatches(objects) #we should have 5 matches
         groups = engine.get_groups(matches) #We should have 2 groups
         for g in groups:
             g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
diff --git a/base/py/tests/scanner_test.py b/base/py/tests/scanner_test.py
index 7356d658..39d5eaf4 100644
--- a/base/py/tests/scanner_test.py
+++ b/base/py/tests/scanner_test.py
@@ -369,23 +369,6 @@ def test_ignore_list_checks_for_unicode():
     assert f2 in g
     assert f3 in g
 
-def test_custom_match_factory():
-    class MatchFactory(object):
-        def getmatches(self, objects, j=None):
-            return [Match(objects[0], objects[1], 420)]
-
-
-    s = Scanner()
-    s.match_factory = MatchFactory()
-    o1, o2 = no('foo'), no('bar')
-    groups = s.GetDupeGroups([o1, o2])
-    eq_(len(groups), 1)
-    g = groups[0]
-    eq_(len(g), 2)
-    g.switch_ref(o1)
-    m = g.get_match_of(o2)
-    eq_(m, (o1, o2, 420))
-
 def test_file_evaluates_to_false():
     # A very wrong way to use any() was added at some point, causing resulting group list
     # to be empty.
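
Usage sketch (illustrative, not part of the patch above): the options that used to be mutable attributes on a MatchFactory instance become keyword arguments to a single, stateless engine.getmatches() call. The sketch assumes only what the patch shows: objects need a `name` attribute (getmatches() derives `words` from `name` when no `words` attribute is set), Match is a (first, second, percentage) namedtuple, and get_groups() consumes the match list. NamedFile and the import line are hypothetical stand-ins; the real import depends on how base/py is packaged.

    import engine  # hypothetical import path; adjust to the project layout

    class NamedFile(object):
        # Minimal stand-in: getmatches() only needs a 'name' attribute here.
        def __init__(self, name):
            self.name = name

    files = [NamedFile('foo bar'), NamedFile('bar bleh'), NamedFile('a b c foo')]

    # Before: mf = engine.MatchFactory(); mf.min_match_percentage = 50
    #         matches = mf.getmatches(files)
    # After: one call, options as keyword arguments, no factory state.
    matches = engine.getmatches(files, min_match_percentage=50)
    for m in matches:
        # Only 'foo bar' / 'bar bleh' reach 50%; 'a b c foo' shares just one word.
        print m.first.name, m.second.name, m.percentage
    groups = engine.get_groups(matches)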
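The content scan path changes shape too: instead of matching on [str(f.size)] as words and post-filtering with _filter_matches_by_content(), engine.getmatches_by_contents() now runs the whole funnel itself: bucket files by size (or audiosize), drop singleton buckets, then compare md5partial and, unless partial=True, the full md5. A sketch of what the file objects must expose; FakeFile is hypothetical and hashes eagerly, whereas the real fs layer computes size, md5partial, and md5 lazily on attribute access.

    import hashlib

    class FakeFile(object):
        # Stand-in exposing the three attributes getmatches_by_contents() reads.
        def __init__(self, data):
            self.size = len(data)
            self.md5partial = hashlib.md5(data[:1024]).digest()  # stand-in for a partial hash
            self.md5 = hashlib.md5(data).digest()

    a, b = FakeFile('x' * 2048), FakeFile('x' * 2048)
    c = FakeFile('y' * 100)  # different size: its bucket is a singleton, so it is never hashed
    matches = engine.getmatches_by_contents([a, b, c])
    # -> [Match(a, b, 100)]; content matches always carry percentage 100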