Changed the MatchFactory class into a simple getmatches() function, and added a separate getmatches_by_contents() function for content scans, which results in faster and less memory-hungry scans.

--HG--
extra : convert_revision : svn%3Ac306627e-7827-47d3-bdf0-9a457c9553a1/trunk%40192
Author: hsoft
Date:   2009-10-18 08:46:00 +00:00
Parent: 3f34dab881
Commit: 7228adf433

8 changed files with 123 additions and 179 deletions
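In short: callers stop configuring a factory object and instead pass options as keyword arguments, and content scans get their own entry point. A minimal before/after sketch using the names from the diffs below (files stands in for whatever list of objects the caller has):

    # Before: build a MatchFactory, set attributes, then call its method.
    mf = engine.MatchFactory()
    mf.min_match_percentage = 50
    matches = mf.getmatches(files)

    # After: one module-level function taking keyword arguments.
    matches = engine.getmatches(files, min_match_percentage=50)

    # Content scans now bypass word matching entirely.
    matches = engine.getmatches_by_contents(files, sizeattr='size', partial=False)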


@@ -79,7 +79,7 @@ def GetDisplayInfo(dupe, group, delta):
         format_timestamp(ctime, delta and m),
         format_timestamp(mtime, delta and m),
         format_perc(percentage),
-        format_words(dupe.words),
+        format_words(dupe.words) if hasattr(dupe, 'words') else '',
         format_dupe_count(dupe_count)
     ]


@@ -76,7 +76,7 @@ def GetDisplayInfo(dupe, group, delta):
         str(dupe.track),
         dupe.comment,
         format_perc(percentage),
-        format_words(dupe.words),
+        format_words(dupe.words) if hasattr(dupe, 'words') else '',
         format_dupe_count(dupe_count)
     ]
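Both hunks above add the same guard because, after this commit, files coming out of a content scan no longer carry a words attribute (the old scanner stuffed placeholder words, ['--'], into them; that code is removed in the scanner diff below). A near-equivalent spelling of the guard, shown only as an illustration, not as what the commit uses:

    # getattr with a default collapses the hasattr test; content-scanned
    # dupes simply get an empty Words column in the results.
    words = getattr(dupe, 'words', None)
    words_display = format_words(words) if words is not None else ''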


@@ -9,6 +9,7 @@
 from __future__ import division
 import difflib
+import itertools
 import logging
 import string
 from collections import defaultdict, namedtuple
@@ -156,58 +157,69 @@ def get_match(first, second, flags=()):
     percentage = compare(first.words, second.words, flags)
     return Match(first, second, percentage)
 
-class MatchFactory(object):
-    common_word_threshold = 50
-    match_similar_words = False
-    min_match_percentage = 0
-    weight_words = False
-    no_field_order = False
-    limit = 5000000
-
-    def getmatches(self, objects, j=job.nulljob):
-        j = j.start_subjob(2)
-        sj = j.start_subjob(2)
-        for o in objects:
-            if not hasattr(o, 'words'):
-                o.words = getwords(o.name)
-        word_dict = build_word_dict(objects, sj)
-        reduce_common_words(word_dict, self.common_word_threshold)
-        if self.match_similar_words:
-            merge_similar_words(word_dict)
-        match_flags = []
-        if self.weight_words:
-            match_flags.append(WEIGHT_WORDS)
-        if self.match_similar_words:
-            match_flags.append(MATCH_SIMILAR_WORDS)
-        if self.no_field_order:
-            match_flags.append(NO_FIELD_ORDER)
-        j.start_job(len(word_dict), '0 matches found')
-        compared = defaultdict(set)
-        result = []
-        try:
-            # This whole 'popping' thing is there to avoid taking too much memory at the same time.
-            while word_dict:
-                items = word_dict.popitem()[1]
-                while items:
-                    ref = items.pop()
-                    compared_already = compared[ref]
-                    to_compare = items - compared_already
-                    compared_already |= to_compare
-                    for other in to_compare:
-                        m = get_match(ref, other, match_flags)
-                        if m.percentage >= self.min_match_percentage:
-                            result.append(m)
-                            if len(result) >= self.limit:
-                                return result
-                j.add_progress(desc='%d matches found' % len(result))
-        except MemoryError:
-            # This is the place where the memory usage is at its peak during the scan.
-            # Just continue the process with an incomplete list of matches.
-            del compared # This should give us enough room to call logging.
-            logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
-            return result
-        return result
+def getmatches(objects, min_match_percentage=0, match_similar_words=False, weight_words=False,
+               no_field_order=False, j=job.nulljob):
+    COMMON_WORD_THRESHOLD = 50
+    LIMIT = 5000000
+    j = j.start_subjob(2)
+    sj = j.start_subjob(2)
+    for o in objects:
+        if not hasattr(o, 'words'):
+            o.words = getwords(o.name)
+    word_dict = build_word_dict(objects, sj)
+    reduce_common_words(word_dict, COMMON_WORD_THRESHOLD)
+    if match_similar_words:
+        merge_similar_words(word_dict)
+    match_flags = []
+    if weight_words:
+        match_flags.append(WEIGHT_WORDS)
+    if match_similar_words:
+        match_flags.append(MATCH_SIMILAR_WORDS)
+    if no_field_order:
+        match_flags.append(NO_FIELD_ORDER)
+    j.start_job(len(word_dict), '0 matches found')
+    compared = defaultdict(set)
+    result = []
+    try:
+        # This whole 'popping' thing is there to avoid taking too much memory at the same time.
+        while word_dict:
+            items = word_dict.popitem()[1]
+            while items:
+                ref = items.pop()
+                compared_already = compared[ref]
+                to_compare = items - compared_already
+                compared_already |= to_compare
+                for other in to_compare:
+                    m = get_match(ref, other, match_flags)
+                    if m.percentage >= min_match_percentage:
+                        result.append(m)
+                        if len(result) >= LIMIT:
+                            return result
+            j.add_progress(desc='%d matches found' % len(result))
+    except MemoryError:
+        # This is the place where the memory usage is at its peak during the scan.
+        # Just continue the process with an incomplete list of matches.
+        del compared # This should give us enough room to call logging.
+        logging.warning('Memory Overflow. Matches: %d. Word dict: %d' % (len(result), len(word_dict)))
+        return result
+    return result
+
+def getmatches_by_contents(files, sizeattr='size', partial=False, j=job.nulljob):
+    j = j.start_subjob([2, 8])
+    size2files = defaultdict(set)
+    for file in j.iter_with_progress(files, 'Read size of %d/%d files'):
+        size2files[getattr(file, sizeattr)].add(file)
+    possible_matches = [files for files in size2files.values() if len(files) > 1]
+    del size2files
+    result = []
+    j.start_job(len(possible_matches), '0 matches found')
+    for group in possible_matches:
+        for first, second in itertools.combinations(group, 2):
+            if first.md5partial == second.md5partial:
+                if partial or first.md5 == second.md5:
+                    result.append(Match(first, second, 100))
+        j.add_progress(desc='%d matches found' % len(result))
+    return result
 
 class Group(object):
     #---Override
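The new getmatches_by_contents() is what backs the speed and memory claims in the commit message: instead of pushing str(f.size) through the word-matching machinery and weeding out false positives with md5 afterwards, it buckets files by size up front and hashes only within same-size buckets, comparing md5partial first and full md5 only when partial is False. A small usage sketch; FakeFile is a hypothetical stand-in for any object exposing the size/md5partial/md5 attributes the function expects:

    class FakeFile(object):
        def __init__(self, size, md5partial, md5):
            self.size = size
            self.md5partial = md5partial
            self.md5 = md5

    f1 = FakeFile(1024, 'p1', 'h1')
    f2 = FakeFile(1024, 'p1', 'h1')
    f3 = FakeFile(1024, 'p2', 'h2')  # same size, different partial hash
    f4 = FakeFile(2048, 'p1', 'h1')  # different size: never compared at all

    matches = getmatches_by_contents([f1, f2, f3, f4])
    # -> one Match at 100% pairing f1 and f2; f3 is rejected on md5partial,
    #    and f4 is filtered out by the size bucket before any comparison.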


@@ -32,40 +32,32 @@ class Scanner(object):
         self.ignore_list = IgnoreList()
         self.discarded_file_count = 0
 
-    @staticmethod
-    def _filter_matches_by_content(matches, partial, j):
-        matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
-        md5attrname = 'md5partial' if partial else 'md5'
-        md5 = lambda f: getattr(f, md5attrname)
-        for matched_file in j.iter_with_progress(matched_files, 'Analyzed %d/%d matching files'):
-            md5(matched_file)
-        j.set_progress(100, 'Removing false matches')
-        return [m for m in matches if md5(m.first) == md5(m.second)]
-
     def _getmatches(self, files, j):
-        j = j.start_subjob(2)
-        mf = engine.MatchFactory()
-        if self.scan_type != SCAN_TYPE_CONTENT:
-            mf.match_similar_words = self.match_similar_words
-            mf.weight_words = self.word_weighting
-            mf.min_match_percentage = self.min_match_percentage
-        if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
-            self.scan_type = SCAN_TYPE_FIELDS
-            mf.no_field_order = True
-        func = {
-            SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
-            SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
-            SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
-            SCAN_TYPE_CONTENT: lambda f: [str(f.size)],
-            SCAN_TYPE_CONTENT_AUDIO: lambda f: [str(f.audiosize)]
-        }[self.scan_type]
-        for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
-            f.size # pre-read, makes a smoother progress if read here (especially for bundles)
-            f.words = func(f)
-        if self.size_threshold:
-            files = [f for f in files if f.size >= self.size_threshold]
-        return mf.getmatches(files, j)
+        if self.size_threshold:
+            j = j.start_subjob([2, 8])
+            for f in j.iter_with_progress(files, 'Read size of %d/%d files'):
+                f.size # pre-read, makes a smoother progress if read here (especially for bundles)
+            files = [f for f in files if f.size >= self.size_threshold]
+        if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
+            sizeattr = 'size' if self.scan_type == SCAN_TYPE_CONTENT else 'audiosize'
+            return engine.getmatches_by_contents(files, sizeattr, partial=self.scan_type==SCAN_TYPE_CONTENT_AUDIO, j=j)
+        else:
+            j = j.start_subjob([2, 8])
+            kw = {}
+            kw['match_similar_words'] = self.match_similar_words
+            kw['weight_words'] = self.word_weighting
+            kw['min_match_percentage'] = self.min_match_percentage
+            if self.scan_type == SCAN_TYPE_FIELDS_NO_ORDER:
+                self.scan_type = SCAN_TYPE_FIELDS
+                kw['no_field_order'] = True
+            func = {
+                SCAN_TYPE_FILENAME: lambda f: engine.getwords(rem_file_ext(f.name)),
+                SCAN_TYPE_FIELDS: lambda f: engine.getfields(rem_file_ext(f.name)),
+                SCAN_TYPE_TAG: lambda f: [engine.getwords(unicode(getattr(f, attrname))) for attrname in SCANNABLE_TAGS if attrname in self.scanned_tags],
+            }[self.scan_type]
+            for f in j.iter_with_progress(files, 'Read metadata of %d/%d files'):
+                f.words = func(f)
+            return engine.getmatches(files, j=j, **kw)
 
     @staticmethod
     def _key_func(dupe):
@@ -86,10 +78,7 @@ class Scanner(object):
         for f in [f for f in files if not hasattr(f, 'is_ref')]:
             f.is_ref = False
         logging.info('Getting matches')
-        if self.match_factory is None:
-            matches = self._getmatches(files, j)
-        else:
-            matches = self.match_factory.getmatches(files, j)
+        matches = self._getmatches(files, j)
         logging.info('Found %d matches' % len(matches))
         if not self.mix_file_kind:
             j.set_progress(100, 'Removing false matches')
@@ -99,14 +88,6 @@ class Scanner(object):
         iter_matches = j.iter_with_progress(matches, 'Processed %d/%d matches against the ignore list')
         matches = [m for m in iter_matches
             if not self.ignore_list.AreIgnored(unicode(m.first.path), unicode(m.second.path))]
-        if self.scan_type in (SCAN_TYPE_CONTENT, SCAN_TYPE_CONTENT_AUDIO):
-            j = j.start_subjob(3 if self.scan_type == SCAN_TYPE_CONTENT else 2)
-            matches = self._filter_matches_by_content(matches, partial=True, j=j)
-            if self.scan_type == SCAN_TYPE_CONTENT:
-                matches = self._filter_matches_by_content(matches, partial=False, j=j)
-            # We compared md5. No words were involved.
-            for m in matches:
-                m.first.words = m.second.words = ['--']
         logging.info('Grouping matches')
         groups = engine.get_groups(matches, j)
         matched_files = dedupe([m.first for m in matches] + [m.second for m in matches])
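A side note on the progress arithmetic used in these hunks: j.start_subjob([2, 8]) is the hscommon job API's weighted split (the same API the tests below drive with job.Job(1, do_progress)). A list argument divides the job's progress range into slices, and each subsequent start_job/iter_with_progress call consumes the next slice, so the size pre-read accounts for roughly 2/10 of the range and the matching pass for the remaining 8/10. A rough sketch of how a caller sees it, assuming those hscommon semantics (do_progress and files are illustrative, not part of the commit):

    def do_progress(progress, desc=''):
        print '%d%% %s' % (progress, desc)  # 0..100 over the whole job

    j = job.Job(1, do_progress)
    sj = j.start_subjob([2, 8])
    for f in sj.iter_with_progress(files, 'Read size of %d/%d files'):
        f.size  # pre-read; fills the first ~20% of the range
    sj.start_job(10, '0 matches found')  # matching maps to the final ~80%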


@@ -318,7 +318,7 @@ class TCDupeGuru_renameSelected(TestCase):
         fp = open(str(p + 'foo bar 3'),mode='w')
         fp.close()
         refdir = hsfs.phys.Directory(None,str(p))
-        matches = engine.MatchFactory().getmatches(refdir.files)
+        matches = engine.getmatches(refdir.files)
         groups = engine.get_groups(matches)
         g = groups[0]
         g.prioritize(lambda x:x.name)


@@ -340,21 +340,13 @@ class TCget_match(TestCase):
         self.assertEqual(int((6.0 / 13.0) * 100),get_match(NamedObject("foo bar",True),NamedObject("bar bleh",True),(WEIGHT_WORDS,)).percentage)
 
-class TCMatchFactory(TestCase):
+class GetMatches(TestCase):
     def test_empty(self):
-        self.assertEqual([],MatchFactory().getmatches([]))
-
-    def test_defaults(self):
-        mf = MatchFactory()
-        self.assertEqual(50,mf.common_word_threshold)
-        self.assertEqual(False,mf.weight_words)
-        self.assertEqual(False,mf.match_similar_words)
-        self.assertEqual(False,mf.no_field_order)
-        self.assertEqual(0,mf.min_match_percentage)
+        eq_(getmatches([]), [])
 
     def test_simple(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(2,len(r))
         seek = [m for m in r if m.percentage == 50] #"foo bar" and "bar bleh"
         m = seek[0]
@@ -367,7 +359,7 @@ class TCMatchFactory(TestCase):
     def test_null_and_unrelated_objects(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject(""),NamedObject("unrelated object")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))
         m = r[0]
         self.assertEqual(50,m.percentage)
@@ -376,34 +368,33 @@ class TCMatchFactory(TestCase):
     def test_twice_the_same_word(self):
         l = [NamedObject("foo foo bar"),NamedObject("bar bleh")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))
 
     def test_twice_the_same_word_when_preworded(self):
         l = [NamedObject("foo foo bar",True),NamedObject("bar bleh",True)]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))
 
     def test_two_words_match(self):
         l = [NamedObject("foo bar"),NamedObject("foo bar bleh")]
-        r = MatchFactory().getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1,len(r))
 
     def test_match_files_with_only_common_words(self):
         #If a word occurs more than 50 times, it is excluded from the matching process
         #The problem with the common_word_threshold is that the files containing only common
         #words will never be matched together. We *should* match them.
-        mf = MatchFactory()
-        mf.common_word_threshold = 50
+        # This test assumes that the common word threshold const is 50
         l = [NamedObject("foo") for i in range(50)]
-        r = mf.getmatches(l)
+        r = getmatches(l)
         self.assertEqual(1225,len(r))
 
     def test_use_words_already_there_if_there(self):
         o1 = NamedObject('foo')
         o2 = NamedObject('bar')
         o2.words = ['foo']
-        self.assertEqual(1,len(MatchFactory().getmatches([o1,o2])))
+        eq_(1, len(getmatches([o1,o2])))
 
     def test_job(self):
         def do_progress(p,d=''):
@@ -413,75 +404,62 @@ class TCMatchFactory(TestCase):
         j = job.Job(1,do_progress)
         self.log = []
         s = "foo bar"
-        MatchFactory().getmatches([NamedObject(s),NamedObject(s),NamedObject(s)],j)
+        getmatches([NamedObject(s), NamedObject(s), NamedObject(s)], j=j)
         self.assert_(len(self.log) > 2)
         self.assertEqual(0,self.log[0])
         self.assertEqual(100,self.log[-1])
 
     def test_weight_words(self):
-        mf = MatchFactory()
-        mf.weight_words = True
         l = [NamedObject("foo bar"),NamedObject("bar bleh")]
-        m = mf.getmatches(l)[0]
+        m = getmatches(l, weight_words=True)[0]
         self.assertEqual(int((6.0 / 13.0) * 100),m.percentage)
 
     def test_similar_word(self):
-        mf = MatchFactory()
-        mf.match_similar_words = True
         l = [NamedObject("foobar"),NamedObject("foobars")]
-        self.assertEqual(1,len(mf.getmatches(l)))
-        self.assertEqual(100,mf.getmatches(l)[0].percentage)
+        eq_(len(getmatches(l, match_similar_words=True)), 1)
+        eq_(getmatches(l, match_similar_words=True)[0].percentage, 100)
         l = [NamedObject("foobar"),NamedObject("foo")]
-        self.assertEqual(0,len(mf.getmatches(l))) #too far
+        eq_(len(getmatches(l, match_similar_words=True)), 0) #too far
         l = [NamedObject("bizkit"),NamedObject("bizket")]
-        self.assertEqual(1,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=True)), 1)
         l = [NamedObject("foobar"),NamedObject("foosbar")]
-        self.assertEqual(1,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=True)), 1)
 
     def test_single_object_with_similar_words(self):
-        mf = MatchFactory()
-        mf.match_similar_words = True
         l = [NamedObject("foo foos")]
-        self.assertEqual(0,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=True)), 0)
 
     def test_double_words_get_counted_only_once(self):
-        mf = MatchFactory()
         l = [NamedObject("foo bar foo bleh"),NamedObject("foo bar bleh bar")]
-        m = mf.getmatches(l)[0]
+        m = getmatches(l)[0]
         self.assertEqual(75,m.percentage)
 
     def test_with_fields(self):
-        mf = MatchFactory()
         o1 = NamedObject("foo bar - foo bleh")
         o2 = NamedObject("foo bar - bleh bar")
         o1.words = getfields(o1.name)
         o2.words = getfields(o2.name)
-        m = mf.getmatches([o1, o2])[0]
+        m = getmatches([o1, o2])[0]
         self.assertEqual(50, m.percentage)
 
     def test_with_fields_no_order(self):
-        mf = MatchFactory()
-        mf.no_field_order = True
         o1 = NamedObject("foo bar - foo bleh")
         o2 = NamedObject("bleh bang - foo bar")
         o1.words = getfields(o1.name)
         o2.words = getfields(o2.name)
-        m = mf.getmatches([o1, o2])[0]
-        self.assertEqual(50 ,m.percentage)
+        m = getmatches([o1, o2], no_field_order=True)[0]
+        eq_(m.percentage, 50)
 
     def test_only_match_similar_when_the_option_is_set(self):
-        mf = MatchFactory()
-        mf.match_similar_words = False
         l = [NamedObject("foobar"),NamedObject("foobars")]
-        self.assertEqual(0,len(mf.getmatches(l)))
+        eq_(len(getmatches(l, match_similar_words=False)), 0)
 
     def test_dont_recurse_do_match(self):
         # with nosetests, the stack is increased. The number has to be high enough not to be failing falsely
         sys.setrecursionlimit(100)
-        mf = MatchFactory()
         files = [NamedObject('foo bar') for i in range(101)]
         try:
-            mf.getmatches(files)
+            getmatches(files)
         except RuntimeError:
             self.fail()
         finally:
@@ -489,18 +467,9 @@ class TCMatchFactory(TestCase):
     def test_min_match_percentage(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("a b c foo")]
-        mf = MatchFactory()
-        mf.min_match_percentage = 50
-        r = mf.getmatches(l)
+        r = getmatches(l, min_match_percentage=50)
         self.assertEqual(1,len(r)) #Only "foo bar" / "bar bleh" should match
 
-    def test_limit(self):
-        l = [NamedObject(),NamedObject(),NamedObject()]
-        mf = MatchFactory()
-        mf.limit = 2
-        r = mf.getmatches(l)
-        self.assertEqual(2,len(r))
-
     def test_MemoryError(self):
         @log_calls
         def mocked_match(first, second, flags):
@@ -510,9 +479,8 @@ class TCMatchFactory(TestCase):
         objects = [NamedObject() for i in range(10)] # results in 45 matches
         self.mock(engine, 'get_match', mocked_match)
-        mf = MatchFactory()
         try:
-            r = mf.getmatches(objects)
+            r = getmatches(objects)
         except MemoryError:
             self.fail('MemoryError must be handled')
         self.assertEqual(42, len(r))
@@ -738,7 +706,7 @@ class TCget_groups(TestCase):
     def test_simple(self):
         l = [NamedObject("foo bar"),NamedObject("bar bleh")]
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         m = matches[0]
         r = get_groups(matches)
         self.assertEqual(1,len(r))
@@ -749,7 +717,7 @@ class TCget_groups(TestCase):
     def test_group_with_multiple_matches(self):
         #This results in 3 matches
         l = [NamedObject("foo"),NamedObject("foo"),NamedObject("foo")]
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         r = get_groups(matches)
         self.assertEqual(1,len(r))
         g = r[0]
@@ -759,7 +727,7 @@ class TCget_groups(TestCase):
         l = [NamedObject("a b"),NamedObject("a b"),NamedObject("b c"),NamedObject("c d"),NamedObject("c d")]
         #There will be 2 groups here: group "a b" and group "c d"
         #"b c" can go either of them, but not both.
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         r = get_groups(matches)
         self.assertEqual(2,len(r))
         self.assertEqual(5,len(r[0])+len(r[1]))
@@ -768,7 +736,7 @@ class TCget_groups(TestCase):
         l = [NamedObject("a b"),NamedObject("a b"),NamedObject("a b"),NamedObject("a b")]
         #There will be 2 groups here: group "a b" and group "c d"
         #"b c" can fit in both, but it must be in only one of them
-        matches = MatchFactory().getmatches(l)
+        matches = getmatches(l)
         r = get_groups(matches)
         self.assertEqual(1,len(r))
@@ -788,7 +756,7 @@ class TCget_groups(TestCase):
     def test_four_sized_group(self):
         l = [NamedObject("foobar") for i in xrange(4)]
-        m = MatchFactory().getmatches(l)
+        m = getmatches(l)
         r = get_groups(m)
         self.assertEqual(1,len(r))
         self.assertEqual(4,len(r[0]))


@@ -37,7 +37,7 @@ class NamedObject(engine_test.NamedObject):
 def GetTestGroups():
     objects = [NamedObject("foo bar"),NamedObject("bar bleh"),NamedObject("foo bleh"),NamedObject("ibabtu"),NamedObject("ibabtu")]
     objects[1].size = 1024
-    matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
+    matches = engine.getmatches(objects) #we should have 5 matches
     groups = engine.get_groups(matches) #We should have 2 groups
     for g in groups:
         g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is
@@ -505,7 +505,7 @@ class TCResultsXML(TestCase):
         return objects[1]
 
     objects = [NamedObject(u"\xe9foo bar",True),NamedObject("bar bleh",True)]
-    matches = engine.MatchFactory().getmatches(objects) #we should have 5 matches
+    matches = engine.getmatches(objects) #we should have 5 matches
     groups = engine.get_groups(matches) #We should have 2 groups
     for g in groups:
         g.prioritize(lambda x:objects.index(x)) #We want the dupes to be in the same order as the list is


@@ -369,23 +369,6 @@ def test_ignore_list_checks_for_unicode():
     assert f2 in g
     assert f3 in g
 
-def test_custom_match_factory():
-    class MatchFactory(object):
-        def getmatches(self, objects, j=None):
-            return [Match(objects[0], objects[1], 420)]
-
-    s = Scanner()
-    s.match_factory = MatchFactory()
-    o1, o2 = no('foo'), no('bar')
-    groups = s.GetDupeGroups([o1, o2])
-    eq_(len(groups), 1)
-    g = groups[0]
-    eq_(len(g), 2)
-    g.switch_ref(o1)
-    m = g.get_match_of(o2)
-    eq_(m, (o1, o2, 420))
-
 def test_file_evaluates_to_false():
     # A very wrong way to use any() was added at some point, causing resulting group list
     # to be empty.